summary | refs | log | tree | commit | diff | stats
path: root/freebsd/sys/netinet
diff options
context:
space:
mode:
author: Sebastian Huber <sebastian.huber@embedded-brains.de> 2016-10-07 15:10:20 +0200
committer: Sebastian Huber <sebastian.huber@embedded-brains.de> 2017-01-10 09:53:31 +0100
commit: c40e45b75eb76d79a05c7fa85c1fa9b5c728a12f (patch)
tree: ad4f2519067709f00ab98b3c591186c26dc3a21f /freebsd/sys/netinet
parent: userspace-header-gen.py: Simplify program ports (diff)
download: rtems-libbsd-c40e45b75eb76d79a05c7fa85c1fa9b5c728a12f.tar.bz2
Update to FreeBSD head 2016-08-23
Git mirror commit 9fe7c416e6abb28b1398fd3e5687099846800cfd.
Diffstat (limited to 'freebsd/sys/netinet')
-rw-r--r--freebsd/sys/netinet/accf_dns.c6
-rw-r--r--freebsd/sys/netinet/accf_http.c11
-rw-r--r--freebsd/sys/netinet/cc/cc.c15
-rw-r--r--freebsd/sys/netinet/cc/cc.h (renamed from freebsd/sys/netinet/cc.h)21
-rw-r--r--freebsd/sys/netinet/cc/cc_newreno.c18
-rw-r--r--freebsd/sys/netinet/icmp6.h91
-rw-r--r--freebsd/sys/netinet/icmp_var.h21
-rw-r--r--freebsd/sys/netinet/if_atm.c6
-rw-r--r--freebsd/sys/netinet/if_atm.h2
-rw-r--r--freebsd/sys/netinet/if_ether.c1122
-rw-r--r--freebsd/sys/netinet/if_ether.h32
-rw-r--r--freebsd/sys/netinet/igmp.c462
-rw-r--r--freebsd/sys/netinet/igmp_var.h88
-rw-r--r--freebsd/sys/netinet/in.c1782
-rw-r--r--freebsd/sys/netinet/in.h211
-rw-r--r--freebsd/sys/netinet/in_fib.c235
-rw-r--r--freebsd/sys/netinet/in_fib.h61
-rw-r--r--freebsd/sys/netinet/in_gif.c421
-rw-r--r--freebsd/sys/netinet/in_kdtrace.h72
-rw-r--r--freebsd/sys/netinet/in_mcast.c149
-rw-r--r--freebsd/sys/netinet/in_pcb.c386
-rw-r--r--freebsd/sys/netinet/in_pcb.h137
-rw-r--r--freebsd/sys/netinet/in_proto.c37
-rw-r--r--freebsd/sys/netinet/in_rmx.c377
-rw-r--r--freebsd/sys/netinet/in_rss.h57
-rw-r--r--freebsd/sys/netinet/in_systm.h16
-rw-r--r--freebsd/sys/netinet/in_var.h155
-rw-r--r--freebsd/sys/netinet/ip.h25
-rw-r--r--freebsd/sys/netinet/ip6.h6
-rw-r--r--freebsd/sys/netinet/ip_carp.c2681
-rw-r--r--freebsd/sys/netinet/ip_carp.h62
-rw-r--r--freebsd/sys/netinet/ip_divert.c59
-rw-r--r--freebsd/sys/netinet/ip_dummynet.h32
-rw-r--r--freebsd/sys/netinet/ip_ecn.h4
-rw-r--r--freebsd/sys/netinet/ip_encap.c68
-rw-r--r--freebsd/sys/netinet/ip_encap.h2
-rw-r--r--freebsd/sys/netinet/ip_fastfwd.c181
-rw-r--r--freebsd/sys/netinet/ip_fw.h469
-rw-r--r--freebsd/sys/netinet/ip_gre.c354
-rw-r--r--freebsd/sys/netinet/ip_icmp.c248
-rw-r--r--freebsd/sys/netinet/ip_icmp.h12
-rw-r--r--freebsd/sys/netinet/ip_id.c260
-rw-r--r--freebsd/sys/netinet/ip_input.c1046
-rw-r--r--freebsd/sys/netinet/ip_ipsec.h4
-rw-r--r--freebsd/sys/netinet/ip_mroute.c136
-rw-r--r--freebsd/sys/netinet/ip_mroute.h29
-rw-r--r--freebsd/sys/netinet/ip_options.c116
-rw-r--r--freebsd/sys/netinet/ip_options.h5
-rw-r--r--freebsd/sys/netinet/ip_output.c614
-rw-r--r--freebsd/sys/netinet/ip_reass.c660
-rw-r--r--freebsd/sys/netinet/ip_var.h116
-rw-r--r--freebsd/sys/netinet/libalias/alias.c32
-rw-r--r--freebsd/sys/netinet/libalias/alias_cuseeme.c36
-rw-r--r--freebsd/sys/netinet/libalias/alias_db.c21
-rw-r--r--freebsd/sys/netinet/libalias/alias_dummy.c42
-rw-r--r--freebsd/sys/netinet/libalias/alias_irc.c24
-rw-r--r--freebsd/sys/netinet/libalias/alias_local.h2
-rw-r--r--freebsd/sys/netinet/libalias/alias_mod.c192
-rw-r--r--freebsd/sys/netinet/libalias/alias_mod.h138
-rw-r--r--freebsd/sys/netinet/libalias/alias_nbt.c54
-rw-r--r--freebsd/sys/netinet/libalias/alias_pptp.c60
-rw-r--r--freebsd/sys/netinet/libalias/alias_sctp.h1
-rw-r--r--freebsd/sys/netinet/libalias/alias_skinny.c20
-rw-r--r--freebsd/sys/netinet/libalias/alias_smedia.c22
-rw-r--r--freebsd/sys/netinet/pim_var.h35
-rw-r--r--freebsd/sys/netinet/raw_ip.c106
-rw-r--r--freebsd/sys/netinet/sctp.h36
-rw-r--r--freebsd/sys/netinet/sctp_asconf.c221
-rw-r--r--freebsd/sys/netinet/sctp_auth.c70
-rw-r--r--freebsd/sys/netinet/sctp_auth.h3
-rw-r--r--freebsd/sys/netinet/sctp_bsd_addr.c21
-rw-r--r--freebsd/sys/netinet/sctp_cc_functions.c145
-rw-r--r--freebsd/sys/netinet/sctp_constants.h83
-rw-r--r--freebsd/sys/netinet/sctp_dtrace_declare.h1
-rw-r--r--freebsd/sys/netinet/sctp_dtrace_define.h177
-rw-r--r--freebsd/sys/netinet/sctp_header.h86
-rw-r--r--freebsd/sys/netinet/sctp_indata.c3483
-rw-r--r--freebsd/sys/netinet/sctp_indata.h26
-rw-r--r--freebsd/sys/netinet/sctp_input.c956
-rw-r--r--freebsd/sys/netinet/sctp_input.h4
-rw-r--r--freebsd/sys/netinet/sctp_lock_bsd.h2
-rw-r--r--freebsd/sys/netinet/sctp_os_bsd.h84
-rw-r--r--freebsd/sys/netinet/sctp_output.c2326
-rw-r--r--freebsd/sys/netinet/sctp_output.h22
-rw-r--r--freebsd/sys/netinet/sctp_pcb.c653
-rw-r--r--freebsd/sys/netinet/sctp_pcb.h33
-rw-r--r--freebsd/sys/netinet/sctp_peeloff.c9
-rw-r--r--freebsd/sys/netinet/sctp_structs.h144
-rw-r--r--freebsd/sys/netinet/sctp_sysctl.c692
-rw-r--r--freebsd/sys/netinet/sctp_sysctl.h90
-rw-r--r--freebsd/sys/netinet/sctp_timer.c72
-rw-r--r--freebsd/sys/netinet/sctp_timer.h4
-rw-r--r--freebsd/sys/netinet/sctp_uio.h76
-rw-r--r--freebsd/sys/netinet/sctp_usrreq.c2041
-rw-r--r--freebsd/sys/netinet/sctp_var.h93
-rw-r--r--freebsd/sys/netinet/sctputil.c1339
-rw-r--r--freebsd/sys/netinet/sctputil.h59
-rw-r--r--freebsd/sys/netinet/tcp.h16
-rw-r--r--freebsd/sys/netinet/tcp_debug.c3
-rw-r--r--freebsd/sys/netinet/tcp_hostcache.c81
-rw-r--r--freebsd/sys/netinet/tcp_hostcache.h2
-rw-r--r--freebsd/sys/netinet/tcp_input.c1134
-rw-r--r--freebsd/sys/netinet/tcp_lro.c458
-rw-r--r--freebsd/sys/netinet/tcp_lro.h45
-rw-r--r--freebsd/sys/netinet/tcp_offload.c3
-rw-r--r--freebsd/sys/netinet/tcp_output.c445
-rw-r--r--freebsd/sys/netinet/tcp_reass.c80
-rw-r--r--freebsd/sys/netinet/tcp_sack.c39
-rw-r--r--freebsd/sys/netinet/tcp_subr.c1387
-rw-r--r--freebsd/sys/netinet/tcp_syncache.c995
-rw-r--r--freebsd/sys/netinet/tcp_syncache.h47
-rw-r--r--freebsd/sys/netinet/tcp_timer.c583
-rw-r--r--freebsd/sys/netinet/tcp_timer.h39
-rw-r--r--freebsd/sys/netinet/tcp_timewait.c225
-rw-r--r--freebsd/sys/netinet/tcp_usrreq.c568
-rw-r--r--freebsd/sys/netinet/tcp_var.h490
-rw-r--r--freebsd/sys/netinet/udp_usrreq.c584
-rw-r--r--freebsd/sys/netinet/udp_var.h109
-rw-r--r--freebsd/sys/netinet/udplite.h38
119 files changed, 19900 insertions, 14487 deletions
diff --git a/freebsd/sys/netinet/accf_dns.c b/freebsd/sys/netinet/accf_dns.c
index 9858db4e..b6d2ff63 100644
--- a/freebsd/sys/netinet/accf_dns.c
+++ b/freebsd/sys/netinet/accf_dns.c
@@ -77,7 +77,7 @@ sohasdns(struct socket *so, void *arg, int waitflag)
struct sockbuf *sb = &so->so_rcv;
/* If the socket is full, we're ready. */
- if (sb->sb_cc >= sb->sb_hiwat || sb->sb_mbcnt >= sb->sb_mbmax)
+ if (sbused(sb) >= sb->sb_hiwat || sb->sb_mbcnt >= sb->sb_mbmax)
goto ready;
/* Check to see if we have a request. */
@@ -117,14 +117,14 @@ skippacket(struct sockbuf *sb) {
unsigned long packlen;
struct packet q, *p = &q;
- if (sb->sb_cc < 2)
+ if (sbavail(sb) < 2)
return DNS_WAIT;
q.m = sb->sb_mb;
q.n = q.m->m_nextpkt;
q.moff = 0;
q.offset = 0;
- q.len = sb->sb_cc;
+ q.len = sbavail(sb);
GET16(p, packlen);
if (packlen + 2 > q.len)
diff --git a/freebsd/sys/netinet/accf_http.c b/freebsd/sys/netinet/accf_http.c
index 3af867b0..83093db3 100644
--- a/freebsd/sys/netinet/accf_http.c
+++ b/freebsd/sys/netinet/accf_http.c
@@ -94,7 +94,7 @@ sbfull(struct sockbuf *sb)
"mbcnt(%ld) >= mbmax(%ld): %d",
sb->sb_cc, sb->sb_hiwat, sb->sb_cc >= sb->sb_hiwat,
sb->sb_mbcnt, sb->sb_mbmax, sb->sb_mbcnt >= sb->sb_mbmax);
- return (sb->sb_cc >= sb->sb_hiwat || sb->sb_mbcnt >= sb->sb_mbmax);
+ return (sbused(sb) >= sb->sb_hiwat || sb->sb_mbcnt >= sb->sb_mbmax);
}
/*
@@ -164,13 +164,14 @@ static int
sohashttpget(struct socket *so, void *arg, int waitflag)
{
- if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0 && !sbfull(&so->so_rcv)) {
+ if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0 &&
+ !sbfull(&so->so_rcv)) {
struct mbuf *m;
char *cmp;
int cmplen, cc;
m = so->so_rcv.sb_mb;
- cc = so->so_rcv.sb_cc - 1;
+ cc = sbavail(&so->so_rcv) - 1;
if (cc < 1)
return (SU_OK);
switch (*mtod(m, char *)) {
@@ -217,7 +218,7 @@ soparsehttpvers(struct socket *so, void *arg, int waitflag)
goto fallout;
m = so->so_rcv.sb_mb;
- cc = so->so_rcv.sb_cc;
+ cc = sbavail(&so->so_rcv);
inspaces = spaces = 0;
for (m = so->so_rcv.sb_mb; m; m = n) {
n = m->m_nextpkt;
@@ -306,7 +307,7 @@ soishttpconnected(struct socket *so, void *arg, int waitflag)
* have NCHRS left
*/
copied = 0;
- ccleft = so->so_rcv.sb_cc;
+ ccleft = sbavail(&so->so_rcv);
if (ccleft < NCHRS)
goto readmore;
a = b = c = '\0';
diff --git a/freebsd/sys/netinet/cc/cc.c b/freebsd/sys/netinet/cc/cc.c
index 4be9a63b..ab3e831e 100644
--- a/freebsd/sys/netinet/cc/cc.c
+++ b/freebsd/sys/netinet/cc/cc.c
@@ -65,13 +65,13 @@ __FBSDID("$FreeBSD$");
#include <sys/socketvar.h>
#include <sys/sysctl.h>
-#include <net/if.h>
-#include <net/if_var.h>
+#include <net/vnet.h>
-#include <netinet/cc.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
+#include <netinet/tcp.h>
#include <netinet/tcp_var.h>
+#include <netinet/cc/cc.h>
#include <netinet/cc/cc_module.h>
@@ -320,13 +320,14 @@ SYSINIT(cc, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST, cc_init, NULL);
/* Declare sysctl tree and populate it. */
SYSCTL_NODE(_net_inet_tcp, OID_AUTO, cc, CTLFLAG_RW, NULL,
- "congestion control related settings");
+ "Congestion control related settings");
-SYSCTL_VNET_PROC(_net_inet_tcp_cc, OID_AUTO, algorithm, CTLTYPE_STRING|CTLFLAG_RW,
- NULL, 0, cc_default_algo, "A", "default congestion control algorithm");
+SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, algorithm,
+ CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW,
+ NULL, 0, cc_default_algo, "A", "Default congestion control algorithm");
#ifndef __rtems__
SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, available, CTLTYPE_STRING|CTLFLAG_RD,
NULL, 0, cc_list_available, "A",
- "list available congestion control algorithms");
+ "List available congestion control algorithms");
#endif /* __rtems__ */
diff --git a/freebsd/sys/netinet/cc.h b/freebsd/sys/netinet/cc/cc.h
index 14b4a9de..1da6f620 100644
--- a/freebsd/sys/netinet/cc.h
+++ b/freebsd/sys/netinet/cc/cc.h
@@ -48,11 +48,12 @@
* http://caia.swin.edu.au/urp/newtcp/
*/
-#ifndef _NETINET_CC_H_
-#define _NETINET_CC_H_
+#ifndef _NETINET_CC_CC_H_
+#define _NETINET_CC_CC_H_
-/* XXX: TCP_CA_NAME_MAX define lives in tcp.h for compat reasons. */
-#include <netinet/tcp.h>
+#if !defined(_KERNEL)
+#error "no user-servicable parts inside"
+#endif
/* Global CC vars. */
extern STAILQ_HEAD(cc_head, cc_algo) cc_list;
@@ -90,6 +91,10 @@ struct cc_var {
/* cc_var flags. */
#define CCF_ABC_SENTAWND 0x0001 /* ABC counted cwnd worth of bytes? */
#define CCF_CWND_LIMITED 0x0002 /* Are we currently cwnd limited? */
+#define CCF_DELACK 0x0004 /* Is this ack delayed? */
+#define CCF_ACKNOW 0x0008 /* Will this ack be sent now? */
+#define CCF_IPHDR_CE 0x0010 /* Does this packet set CE bit? */
+#define CCF_TCPHDR_CWR 0x0020 /* Does this packet set CWR bit? */
/* ACK types passed to the ack_received() hook. */
#define CC_ACK 0x0001 /* Regular in sequence ACK. */
@@ -143,6 +148,12 @@ struct cc_algo {
/* Called when data transfer resumes after an idle period. */
void (*after_idle)(struct cc_var *ccv);
+ /* Called for additional ECN processing beyond RFC3168. */
+ void (*ecnpkt_handler)(struct cc_var *ccv);
+
+ /* Called for {get|set}sockopt() on a TCP socket with TCP_CCALGOOPT. */
+ int (*ctl_output)(struct cc_var *, struct sockopt *, void *);
+
STAILQ_ENTRY (cc_algo) entries;
};
@@ -164,4 +175,4 @@ extern struct rwlock cc_list_lock;
#define CC_LIST_WUNLOCK() rw_wunlock(&cc_list_lock)
#define CC_LIST_LOCK_ASSERT() rw_assert(&cc_list_lock, RA_LOCKED)
-#endif /* _NETINET_CC_H_ */
+#endif /* _NETINET_CC_CC_H_ */
diff --git a/freebsd/sys/netinet/cc/cc_newreno.c b/freebsd/sys/netinet/cc/cc_newreno.c
index c0f0cfc5..8077bb22 100644
--- a/freebsd/sys/netinet/cc/cc_newreno.c
+++ b/freebsd/sys/netinet/cc/cc_newreno.c
@@ -64,10 +64,10 @@ __FBSDID("$FreeBSD$");
#include <net/vnet.h>
-#include <netinet/cc.h>
+#include <netinet/tcp.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_var.h>
-
+#include <netinet/cc/cc.h>
#include <netinet/cc/cc_module.h>
static void newreno_ack_received(struct cc_var *ccv, uint16_t type);
@@ -216,6 +216,9 @@ newreno_cong_signal(struct cc_var *ccv, uint32_t type)
static void
newreno_post_recovery(struct cc_var *ccv)
{
+ int pipe;
+ pipe = 0;
+
if (IN_FASTRECOVERY(CCV(ccv, t_flags))) {
/*
* Fast recovery will conclude after returning from this
@@ -226,10 +229,13 @@ newreno_post_recovery(struct cc_var *ccv)
*
* XXXLAS: Find a way to do this without needing curack
*/
- if (SEQ_GT(ccv->curack + CCV(ccv, snd_ssthresh),
- CCV(ccv, snd_max)))
- CCV(ccv, snd_cwnd) = CCV(ccv, snd_max) -
- ccv->curack + CCV(ccv, t_maxseg);
+ if (V_tcp_do_rfc6675_pipe)
+ pipe = tcp_compute_pipe(ccv->ccvc.tcp);
+ else
+ pipe = CCV(ccv, snd_max) - ccv->curack;
+
+ if (pipe < CCV(ccv, snd_ssthresh))
+ CCV(ccv, snd_cwnd) = pipe + CCV(ccv, t_maxseg);
else
CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh);
}
diff --git a/freebsd/sys/netinet/icmp6.h b/freebsd/sys/netinet/icmp6.h
index 5483721d..af35c847 100644
--- a/freebsd/sys/netinet/icmp6.h
+++ b/freebsd/sys/netinet/icmp6.h
@@ -144,6 +144,9 @@ struct icmp6_hdr {
#define ICMP6_DST_UNREACH_BEYONDSCOPE 2 /* beyond scope of source address */
#define ICMP6_DST_UNREACH_ADDR 3 /* address unreachable */
#define ICMP6_DST_UNREACH_NOPORT 4 /* port unreachable */
+#define ICMP6_DST_UNREACH_POLICY 5 /* failed ingress/egress policy */
+#define ICMP6_DST_UNREACH_REJECT 6 /* Reject route to destination */
+#define ICMP6_DST_UNREACH_SRCROUTE 7 /* Error in source routing header */
#define ICMP6_TIME_EXCEED_TRANSIT 0 /* ttl==0 in transit */
#define ICMP6_TIME_EXCEED_REASSEMBLY 1 /* ttl==0 in reass */
@@ -297,9 +300,11 @@ struct nd_opt_hdr { /* Neighbor discovery option header */
#define ND_OPT_PREFIX_INFORMATION 3
#define ND_OPT_REDIRECTED_HEADER 4
#define ND_OPT_MTU 5
+#define ND_OPT_NONCE 14 /* RFC 3971 */
#define ND_OPT_ROUTE_INFO 24 /* RFC 4191 */
#define ND_OPT_RDNSS 25 /* RFC 6106 */
#define ND_OPT_DNSSL 31 /* RFC 6106 */
+#define ND_OPT_MAX 31
struct nd_opt_prefix_info { /* prefix information */
u_int8_t nd_opt_pi_type;
@@ -330,6 +335,16 @@ struct nd_opt_mtu { /* MTU option */
u_int32_t nd_opt_mtu_mtu;
} __packed;
+#define ND_OPT_NONCE_LEN ((1 * 8) - 2)
+#if ((ND_OPT_NONCE_LEN + 2) % 8) != 0
+#error "(ND_OPT_NONCE_LEN + 2) must be a multiple of 8."
+#endif
+struct nd_opt_nonce { /* nonce option */
+ u_int8_t nd_opt_nonce_type;
+ u_int8_t nd_opt_nonce_len;
+ u_int8_t nd_opt_nonce[ND_OPT_NONCE_LEN];
+} __packed;
+
struct nd_opt_route_info { /* route info */
u_int8_t nd_opt_rti_type;
u_int8_t nd_opt_rti_len;
@@ -555,39 +570,39 @@ do { \
* of the internet control message protocol version 6.
*/
struct icmp6errstat {
- u_quad_t icp6errs_dst_unreach_noroute;
- u_quad_t icp6errs_dst_unreach_admin;
- u_quad_t icp6errs_dst_unreach_beyondscope;
- u_quad_t icp6errs_dst_unreach_addr;
- u_quad_t icp6errs_dst_unreach_noport;
- u_quad_t icp6errs_packet_too_big;
- u_quad_t icp6errs_time_exceed_transit;
- u_quad_t icp6errs_time_exceed_reassembly;
- u_quad_t icp6errs_paramprob_header;
- u_quad_t icp6errs_paramprob_nextheader;
- u_quad_t icp6errs_paramprob_option;
- u_quad_t icp6errs_redirect; /* we regard redirect as an error here */
- u_quad_t icp6errs_unknown;
+ uint64_t icp6errs_dst_unreach_noroute;
+ uint64_t icp6errs_dst_unreach_admin;
+ uint64_t icp6errs_dst_unreach_beyondscope;
+ uint64_t icp6errs_dst_unreach_addr;
+ uint64_t icp6errs_dst_unreach_noport;
+ uint64_t icp6errs_packet_too_big;
+ uint64_t icp6errs_time_exceed_transit;
+ uint64_t icp6errs_time_exceed_reassembly;
+ uint64_t icp6errs_paramprob_header;
+ uint64_t icp6errs_paramprob_nextheader;
+ uint64_t icp6errs_paramprob_option;
+ uint64_t icp6errs_redirect; /* we regard redirect as an error here */
+ uint64_t icp6errs_unknown;
};
struct icmp6stat {
/* statistics related to icmp6 packets generated */
- u_quad_t icp6s_error; /* # of calls to icmp6_error */
- u_quad_t icp6s_canterror; /* no error 'cuz old was icmp */
- u_quad_t icp6s_toofreq; /* no error 'cuz rate limitation */
- u_quad_t icp6s_outhist[256];
+ uint64_t icp6s_error; /* # of calls to icmp6_error */
+ uint64_t icp6s_canterror; /* no error 'cuz old was icmp */
+ uint64_t icp6s_toofreq; /* no error 'cuz rate limitation */
+ uint64_t icp6s_outhist[256];
/* statistics related to input message processed */
- u_quad_t icp6s_badcode; /* icmp6_code out of range */
- u_quad_t icp6s_tooshort; /* packet < sizeof(struct icmp6_hdr) */
- u_quad_t icp6s_checksum; /* bad checksum */
- u_quad_t icp6s_badlen; /* calculated bound mismatch */
+ uint64_t icp6s_badcode; /* icmp6_code out of range */
+ uint64_t icp6s_tooshort; /* packet < sizeof(struct icmp6_hdr) */
+ uint64_t icp6s_checksum; /* bad checksum */
+ uint64_t icp6s_badlen; /* calculated bound mismatch */
/*
* number of responses: this member is inherited from netinet code, but
* for netinet6 code, it is already available in icp6s_outhist[].
*/
- u_quad_t icp6s_reflect;
- u_quad_t icp6s_inhist[256];
- u_quad_t icp6s_nd_toomanyopt; /* too many ND options */
+ uint64_t icp6s_reflect;
+ uint64_t icp6s_inhist[256];
+ uint64_t icp6s_nd_toomanyopt; /* too many ND options */
struct icmp6errstat icp6s_outerrhist;
#define icp6s_odst_unreach_noroute \
icp6s_outerrhist.icp6errs_dst_unreach_noroute
@@ -607,29 +622,33 @@ struct icmp6stat {
#define icp6s_oparamprob_option icp6s_outerrhist.icp6errs_paramprob_option
#define icp6s_oredirect icp6s_outerrhist.icp6errs_redirect
#define icp6s_ounknown icp6s_outerrhist.icp6errs_unknown
- u_quad_t icp6s_pmtuchg; /* path MTU changes */
- u_quad_t icp6s_nd_badopt; /* bad ND options */
- u_quad_t icp6s_badns; /* bad neighbor solicitation */
- u_quad_t icp6s_badna; /* bad neighbor advertisement */
- u_quad_t icp6s_badrs; /* bad router advertisement */
- u_quad_t icp6s_badra; /* bad router advertisement */
- u_quad_t icp6s_badredirect; /* bad redirect message */
+ uint64_t icp6s_pmtuchg; /* path MTU changes */
+ uint64_t icp6s_nd_badopt; /* bad ND options */
+ uint64_t icp6s_badns; /* bad neighbor solicitation */
+ uint64_t icp6s_badna; /* bad neighbor advertisement */
+ uint64_t icp6s_badrs; /* bad router solicitation */
+ uint64_t icp6s_badra; /* bad router advertisement */
+ uint64_t icp6s_badredirect; /* bad redirect message */
};
#ifdef _KERNEL
+#include <sys/counter.h>
+
+VNET_PCPUSTAT_DECLARE(struct icmp6stat, icmp6stat);
/*
* In-kernel consumers can use these accessor macros directly to update
* stats.
*/
-#define ICMP6STAT_ADD(name, val) V_icmp6stat.name += (val)
+#define ICMP6STAT_ADD(name, val) \
+ VNET_PCPUSTAT_ADD(struct icmp6stat, icmp6stat, name, (val))
#define ICMP6STAT_INC(name) ICMP6STAT_ADD(name, 1)
/*
* Kernel module consumers must use this accessor macro.
*/
void kmod_icmp6stat_inc(int statnum);
-#define KMOD_ICMP6STAT_INC(name) \
- kmod_icmp6stat_inc(offsetof(struct icmp6stat, name) / sizeof(u_quad_t))
+#define KMOD_ICMP6STAT_INC(name) \
+ kmod_icmp6stat_inc(offsetof(struct icmp6stat, name) / sizeof(uint64_t))
#endif
/*
@@ -688,7 +707,9 @@ void icmp6_mtudisc_update(struct ip6ctlparam *, int);
#define icmp6_ifstat_inc(ifp, tag) \
do { \
if (ifp) \
- ((struct in6_ifextra *)((ifp)->if_afdata[AF_INET6]))->icmp6_ifstat->tag++; \
+ counter_u64_add(((struct in6_ifextra *) \
+ ((ifp)->if_afdata[AF_INET6]))->icmp6_ifstat[\
+ offsetof(struct icmp6_ifstat, tag) / sizeof(uint64_t)], 1);\
} while (/*CONSTCOND*/ 0)
#define icmp6_ifoutstat_inc(ifp, type, code) \
diff --git a/freebsd/sys/netinet/icmp_var.h b/freebsd/sys/netinet/icmp_var.h
index d939cc2e..d3e72bc2 100644
--- a/freebsd/sys/netinet/icmp_var.h
+++ b/freebsd/sys/netinet/icmp_var.h
@@ -58,11 +58,15 @@ struct icmpstat {
};
#ifdef _KERNEL
+#include <sys/counter.h>
+
+VNET_PCPUSTAT_DECLARE(struct icmpstat, icmpstat);
/*
* In-kernel consumers can use these accessor macros directly to update
* stats.
*/
-#define ICMPSTAT_ADD(name, val) V_icmpstat.name += (val)
+#define ICMPSTAT_ADD(name, val) \
+ VNET_PCPUSTAT_ADD(struct icmpstat, icmpstat, name, (val))
#define ICMPSTAT_INC(name) ICMPSTAT_ADD(name, 1)
/*
@@ -70,30 +74,19 @@ struct icmpstat {
*/
void kmod_icmpstat_inc(int statnum);
#define KMOD_ICMPSTAT_INC(name) \
- kmod_icmpstat_inc(offsetof(struct icmpstat, name) / sizeof(u_long))
+ kmod_icmpstat_inc(offsetof(struct icmpstat, name) / sizeof(uint64_t))
#endif
/*
- * Names for ICMP sysctl objects
+ * Identifiers for ICMP sysctl nodes
*/
#define ICMPCTL_MASKREPL 1 /* allow replies to netmask requests */
#define ICMPCTL_STATS 2 /* statistics (read-only) */
#define ICMPCTL_ICMPLIM 3
-#define ICMPCTL_MAXID 4
-
-#define ICMPCTL_NAMES { \
- { 0, 0 }, \
- { "maskrepl", CTLTYPE_INT }, \
- { "stats", CTLTYPE_STRUCT }, \
- { "icmplim", CTLTYPE_INT }, \
-}
#ifdef _KERNEL
SYSCTL_DECL(_net_inet_icmp);
-VNET_DECLARE(struct icmpstat, icmpstat); /* icmp statistics. */
-#define V_icmpstat VNET(icmpstat)
-
extern int badport_bandlim(int);
#define BANDLIM_UNLIMITED -1
#define BANDLIM_ICMP_UNREACH 0
diff --git a/freebsd/sys/netinet/if_atm.c b/freebsd/sys/netinet/if_atm.c
index e26d0c7c..cb0317fb 100644
--- a/freebsd/sys/netinet/if_atm.c
+++ b/freebsd/sys/netinet/if_atm.c
@@ -54,6 +54,7 @@ __FBSDID("$FreeBSD$");
#include <sys/syslog.h>
#include <net/if.h>
+#include <net/if_var.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <net/if_atm.h>
@@ -319,7 +320,7 @@ failed:
* but this is enough for PVCs entered via the "route" command.
*/
int
-atmresolve(struct rtentry *rt, struct mbuf *m, struct sockaddr *dst,
+atmresolve(struct rtentry *rt, struct mbuf *m, const struct sockaddr *dst,
struct atm_pseudohdr *desten)
{
struct sockaddr_dl *sdl;
@@ -331,7 +332,8 @@ atmresolve(struct rtentry *rt, struct mbuf *m, struct sockaddr *dst,
}
if (rt == NULL) {
- rt = RTALLOC1(dst, 0); /* link level on table 0 XXX MRT */
+ /* link level on table 0 XXX MRT */
+ rt = RTALLOC1(__DECONST(struct sockaddr *, dst), 0);
if (rt == NULL)
goto bad; /* failed */
RT_REMREF(rt); /* don't keep LL references */
diff --git a/freebsd/sys/netinet/if_atm.h b/freebsd/sys/netinet/if_atm.h
index bd8b5143..04ad218d 100644
--- a/freebsd/sys/netinet/if_atm.h
+++ b/freebsd/sys/netinet/if_atm.h
@@ -43,5 +43,5 @@ struct rtentry;
struct sockaddr;
void atm_rtrequest(int, struct rtentry *, struct rt_addrinfo *);
-int atmresolve(struct rtentry *, struct mbuf *, struct sockaddr *,
+int atmresolve(struct rtentry *, struct mbuf *, const struct sockaddr *,
struct atm_pseudohdr *);
diff --git a/freebsd/sys/netinet/if_ether.c b/freebsd/sys/netinet/if_ether.c
index eec06dd8..0a8b101e 100644
--- a/freebsd/sys/netinet/if_ether.c
+++ b/freebsd/sys/netinet/if_ether.c
@@ -44,39 +44,50 @@ __FBSDID("$FreeBSD$");
#include <rtems/bsd/sys/param.h>
#include <sys/kernel.h>
+#include <rtems/bsd/sys/lock.h>
#include <sys/queue.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/proc.h>
+#include <sys/rmlock.h>
#include <sys/socket.h>
#include <sys/syslog.h>
#include <net/if.h>
+#include <net/if_var.h>
#include <net/if_dl.h>
#include <net/if_types.h>
#include <net/netisr.h>
-#include <net/if_llc.h>
#include <net/ethernet.h>
#include <net/route.h>
#include <net/vnet.h>
#include <netinet/in.h>
+#include <netinet/in_fib.h>
#include <netinet/in_var.h>
#include <net/if_llatbl.h>
#include <netinet/if_ether.h>
-#if defined(INET)
+#ifdef INET
#include <netinet/ip_carp.h>
#endif
-#include <net/if_arc.h>
-#include <net/iso88025.h>
-
#include <security/mac/mac_framework.h>
-#define SIN(s) ((struct sockaddr_in *)s)
-#define SDL(s) ((struct sockaddr_dl *)s)
+#define SIN(s) ((const struct sockaddr_in *)(s))
+
+static struct timeval arp_lastlog;
+static int arp_curpps;
+static int arp_maxpps = 1;
+
+/* Simple ARP state machine */
+enum arp_llinfo_state {
+ ARP_LLINFO_INCOMPLETE = 0, /* No LLE data */
+ ARP_LLINFO_REACHABLE, /* LLE is valid */
+ ARP_LLINFO_VERIFY, /* LLE is valid, need refresh */
+ ARP_LLINFO_DELETED, /* LLE is deleted */
+};
SYSCTL_DECL(_net_link_ether);
static SYSCTL_NODE(_net_link_ether, PF_INET, inet, CTLFLAG_RW, 0, "");
@@ -86,53 +97,67 @@ static SYSCTL_NODE(_net_link_ether, PF_ARP, arp, CTLFLAG_RW, 0, "");
static VNET_DEFINE(int, arpt_keep) = (20*60); /* once resolved, good for 20
* minutes */
static VNET_DEFINE(int, arp_maxtries) = 5;
-VNET_DEFINE(int, useloopback) = 1; /* use loopback interface for
- * local traffic */
static VNET_DEFINE(int, arp_proxyall) = 0;
static VNET_DEFINE(int, arpt_down) = 20; /* keep incomplete entries for
* 20 seconds */
-VNET_DEFINE(struct arpstat, arpstat); /* ARP statistics, see if_arp.h */
+static VNET_DEFINE(int, arpt_rexmit) = 1; /* retransmit arp entries, sec*/
+VNET_PCPUSTAT_DEFINE(struct arpstat, arpstat); /* ARP statistics, see if_arp.h */
+VNET_PCPUSTAT_SYSINIT(arpstat);
+
+#ifdef VIMAGE
+VNET_PCPUSTAT_SYSUNINIT(arpstat);
+#endif /* VIMAGE */
static VNET_DEFINE(int, arp_maxhold) = 1;
#define V_arpt_keep VNET(arpt_keep)
#define V_arpt_down VNET(arpt_down)
+#define V_arpt_rexmit VNET(arpt_rexmit)
#define V_arp_maxtries VNET(arp_maxtries)
#define V_arp_proxyall VNET(arp_proxyall)
-#define V_arpstat VNET(arpstat)
#define V_arp_maxhold VNET(arp_maxhold)
-SYSCTL_VNET_INT(_net_link_ether_inet, OID_AUTO, max_age, CTLFLAG_RW,
+SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_age, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(arpt_keep), 0,
"ARP entry lifetime in seconds");
-SYSCTL_VNET_INT(_net_link_ether_inet, OID_AUTO, maxtries, CTLFLAG_RW,
+SYSCTL_INT(_net_link_ether_inet, OID_AUTO, maxtries, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(arp_maxtries), 0,
"ARP resolution attempts before returning error");
-SYSCTL_VNET_INT(_net_link_ether_inet, OID_AUTO, useloopback, CTLFLAG_RW,
- &VNET_NAME(useloopback), 0,
- "Use the loopback interface for local traffic");
-SYSCTL_VNET_INT(_net_link_ether_inet, OID_AUTO, proxyall, CTLFLAG_RW,
+SYSCTL_INT(_net_link_ether_inet, OID_AUTO, proxyall, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(arp_proxyall), 0,
"Enable proxy ARP for all suitable requests");
-SYSCTL_VNET_INT(_net_link_ether_inet, OID_AUTO, wait, CTLFLAG_RW,
+SYSCTL_INT(_net_link_ether_inet, OID_AUTO, wait, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(arpt_down), 0,
"Incomplete ARP entry lifetime in seconds");
-SYSCTL_VNET_STRUCT(_net_link_ether_arp, OID_AUTO, stats, CTLFLAG_RW,
- &VNET_NAME(arpstat), arpstat,
- "ARP statistics (struct arpstat, net/if_arp.h)");
-SYSCTL_VNET_INT(_net_link_ether_inet, OID_AUTO, maxhold, CTLFLAG_RW,
+SYSCTL_VNET_PCPUSTAT(_net_link_ether_arp, OID_AUTO, stats, struct arpstat,
+ arpstat, "ARP statistics (struct arpstat, net/if_arp.h)");
+SYSCTL_INT(_net_link_ether_inet, OID_AUTO, maxhold, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(arp_maxhold), 0,
"Number of packets to hold per ARP entry");
+SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_log_per_second,
+ CTLFLAG_RW, &arp_maxpps, 0,
+ "Maximum number of remotely triggered ARP messages that can be "
+ "logged per second");
+
+#define ARP_LOG(pri, ...) do { \
+ if (ppsratecheck(&arp_lastlog, &arp_curpps, arp_maxpps)) \
+ log((pri), "arp: " __VA_ARGS__); \
+} while (0)
+
-static void arp_init(void);
-void arprequest(struct ifnet *,
- struct in_addr *, struct in_addr *, u_char *);
static void arpintr(struct mbuf *);
static void arptimer(void *);
#ifdef INET
static void in_arpinput(struct mbuf *);
#endif
+static void arp_check_update_lle(struct arphdr *ah, struct in_addr isaddr,
+ struct ifnet *ifp, int bridged, struct llentry *la);
+static void arp_mark_lle_reachable(struct llentry *la);
+static void arp_iflladdr(void *arg __unused, struct ifnet *ifp);
+
+static eventhandler_tag iflladdr_tag;
+
static const struct netisr_handler arp_nh = {
.nh_name = "arp",
.nh_handler = arpintr,
@@ -140,29 +165,6 @@ static const struct netisr_handler arp_nh = {
.nh_policy = NETISR_POLICY_SOURCE,
};
-#ifdef AF_INET
-void arp_ifscrub(struct ifnet *ifp, uint32_t addr);
-
-/*
- * called by in_ifscrub to remove entry from the table when
- * the interface goes away
- */
-void
-arp_ifscrub(struct ifnet *ifp, uint32_t addr)
-{
- struct sockaddr_in addr4;
-
- bzero((void *)&addr4, sizeof(addr4));
- addr4.sin_len = sizeof(addr4);
- addr4.sin_family = AF_INET;
- addr4.sin_addr.s_addr = addr;
- IF_AFDATA_WLOCK(ifp);
- lla_lookup(LLTABLE(ifp), (LLE_DELETE | LLE_IFADDR),
- (struct sockaddr *)&addr4);
- IF_AFDATA_WUNLOCK(ifp);
-}
-#endif
-
/*
* Timeout routine. Age arp_tab entries periodically.
*/
@@ -171,15 +173,83 @@ arptimer(void *arg)
{
struct llentry *lle = (struct llentry *)arg;
struct ifnet *ifp;
+ int r_skip_req;
if (lle->la_flags & LLE_STATIC) {
- LLE_WUNLOCK(lle);
return;
}
-
+ LLE_WLOCK(lle);
+ if (callout_pending(&lle->lle_timer)) {
+ /*
+ * Here we are a bit odd here in the treatment of
+ * active/pending. If the pending bit is set, it got
+ * rescheduled before I ran. The active
+ * bit we ignore, since if it was stopped
+ * in ll_tablefree() and was currently running
+ * it would have returned 0 so the code would
+ * not have deleted it since the callout could
+ * not be stopped so we want to go through
+ * with the delete here now. If the callout
+ * was restarted, the pending bit will be back on and
+ * we just want to bail since the callout_reset would
+ * return 1 and our reference would have been removed
+ * by arpresolve() below.
+ */
+ LLE_WUNLOCK(lle);
+ return;
+ }
ifp = lle->lle_tbl->llt_ifp;
CURVNET_SET(ifp->if_vnet);
+ switch (lle->ln_state) {
+ case ARP_LLINFO_REACHABLE:
+
+ /*
+ * Expiration time is approaching.
+ * Let's try to refresh entry if it is still
+ * in use.
+ *
+ * Set r_skip_req to get feedback from
+ * fast path. Change state and re-schedule
+ * ourselves.
+ */
+ LLE_REQ_LOCK(lle);
+ lle->r_skip_req = 1;
+ LLE_REQ_UNLOCK(lle);
+ lle->ln_state = ARP_LLINFO_VERIFY;
+ callout_schedule(&lle->lle_timer, hz * V_arpt_rexmit);
+ LLE_WUNLOCK(lle);
+ CURVNET_RESTORE();
+ return;
+ case ARP_LLINFO_VERIFY:
+ LLE_REQ_LOCK(lle);
+ r_skip_req = lle->r_skip_req;
+ LLE_REQ_UNLOCK(lle);
+
+ if (r_skip_req == 0 && lle->la_preempt > 0) {
+ /* Entry was used, issue refresh request */
+ struct in_addr dst;
+ dst = lle->r_l3addr.addr4;
+ lle->la_preempt--;
+ callout_schedule(&lle->lle_timer, hz * V_arpt_rexmit);
+ LLE_WUNLOCK(lle);
+ arprequest(ifp, NULL, &dst, NULL);
+ CURVNET_RESTORE();
+ return;
+ }
+ /* Nothing happened. Reschedule if not too late */
+ if (lle->la_expire > time_uptime) {
+ callout_schedule(&lle->lle_timer, hz * V_arpt_rexmit);
+ LLE_WUNLOCK(lle);
+ CURVNET_RESTORE();
+ return;
+ }
+ break;
+ case ARP_LLINFO_INCOMPLETE:
+ case ARP_LLINFO_DELETED:
+ break;
+ }
+
if ((lle->la_flags & LLE_DELETED) == 0) {
int evt;
@@ -190,7 +260,7 @@ arptimer(void *arg)
EVENTHANDLER_INVOKE(lle_event, lle, evt);
}
- callout_stop(&lle->la_timer);
+ callout_stop(&lle->lle_timer);
/* XXX: LOR avoidance. We still have ref on lle. */
LLE_WUNLOCK(lle);
@@ -199,64 +269,109 @@ arptimer(void *arg)
/* Guard against race with other llentry_free(). */
if (lle->la_flags & LLE_LINKED) {
- size_t pkts_dropped;
-
LLE_REMREF(lle);
- pkts_dropped = llentry_free(lle);
- ARPSTAT_ADD(dropped, pkts_dropped);
- } else
- LLE_FREE_LOCKED(lle);
-
+ lltable_unlink_entry(lle->lle_tbl, lle);
+ }
IF_AFDATA_UNLOCK(ifp);
+ size_t pkts_dropped = llentry_free(lle);
+
+ ARPSTAT_ADD(dropped, pkts_dropped);
ARPSTAT_INC(timeouts);
CURVNET_RESTORE();
}
/*
+ * Stores link-layer header for @ifp in format suitable for if_output()
+ * into buffer @buf. Resulting header length is stored in @bufsize.
+ *
+ * Returns 0 on success.
+ */
+static int
+arp_fillheader(struct ifnet *ifp, struct arphdr *ah, int bcast, u_char *buf,
+ size_t *bufsize)
+{
+ struct if_encap_req ereq;
+ int error;
+
+ bzero(buf, *bufsize);
+ bzero(&ereq, sizeof(ereq));
+ ereq.buf = buf;
+ ereq.bufsize = *bufsize;
+ ereq.rtype = IFENCAP_LL;
+ ereq.family = AF_ARP;
+ ereq.lladdr = ar_tha(ah);
+ ereq.hdata = (u_char *)ah;
+ if (bcast)
+ ereq.flags = IFENCAP_FLAG_BROADCAST;
+ error = ifp->if_requestencap(ifp, &ereq);
+ if (error == 0)
+ *bufsize = ereq.bufsize;
+
+ return (error);
+}
+
+
+/*
* Broadcast an ARP request. Caller specifies:
* - arp header source ip address
* - arp header target ip address
* - arp header source ethernet address
*/
void
-arprequest(struct ifnet *ifp, struct in_addr *sip, struct in_addr *tip,
- u_char *enaddr)
+arprequest(struct ifnet *ifp, const struct in_addr *sip,
+ const struct in_addr *tip, u_char *enaddr)
{
struct mbuf *m;
struct arphdr *ah;
struct sockaddr sa;
+ u_char *carpaddr = NULL;
+ uint8_t linkhdr[LLE_MAX_LINKHDR];
+ size_t linkhdrsize;
+ struct route ro;
+ int error;
if (sip == NULL) {
- /* XXX don't believe this can happen (or explain why) */
/*
* The caller did not supply a source address, try to find
* a compatible one among those assigned to this interface.
*/
struct ifaddr *ifa;
+ IF_ADDR_RLOCK(ifp);
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
- if (!ifa->ifa_addr ||
- ifa->ifa_addr->sa_family != AF_INET)
+ if (ifa->ifa_addr->sa_family != AF_INET)
continue;
- sip = &SIN(ifa->ifa_addr)->sin_addr;
+
+ if (ifa->ifa_carp) {
+ if ((*carp_iamatch_p)(ifa, &carpaddr) == 0)
+ continue;
+ sip = &IA_SIN(ifa)->sin_addr;
+ } else {
+ carpaddr = NULL;
+ sip = &IA_SIN(ifa)->sin_addr;
+ }
+
if (0 == ((sip->s_addr ^ tip->s_addr) &
- SIN(ifa->ifa_netmask)->sin_addr.s_addr) )
+ IA_MASKSIN(ifa)->sin_addr.s_addr))
break; /* found it. */
}
+ IF_ADDR_RUNLOCK(ifp);
if (sip == NULL) {
printf("%s: cannot find matching address\n", __func__);
return;
}
}
+ if (enaddr == NULL)
+ enaddr = carpaddr ? carpaddr : (u_char *)IF_LLADDR(ifp);
- if ((m = m_gethdr(M_DONTWAIT, MT_DATA)) == NULL)
+ if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL)
return;
- m->m_len = sizeof(*ah) + 2*sizeof(struct in_addr) +
- 2*ifp->if_data.ifi_addrlen;
+ m->m_len = sizeof(*ah) + 2 * sizeof(struct in_addr) +
+ 2 * ifp->if_addrlen;
m->m_pkthdr.len = m->m_len;
- MH_ALIGN(m, m->m_len);
+ M_ALIGN(m, m->m_len);
ah = mtod(m, struct arphdr *);
bzero((caddr_t)ah, m->m_len);
#ifdef MAC
@@ -266,109 +381,121 @@ arprequest(struct ifnet *ifp, struct in_addr *sip, struct in_addr *tip,
ah->ar_hln = ifp->if_addrlen; /* hardware address length */
ah->ar_pln = sizeof(struct in_addr); /* protocol address length */
ah->ar_op = htons(ARPOP_REQUEST);
- bcopy((caddr_t)enaddr, (caddr_t)ar_sha(ah), ah->ar_hln);
- bcopy((caddr_t)sip, (caddr_t)ar_spa(ah), ah->ar_pln);
- bcopy((caddr_t)tip, (caddr_t)ar_tpa(ah), ah->ar_pln);
+ bcopy(enaddr, ar_sha(ah), ah->ar_hln);
+ bcopy(sip, ar_spa(ah), ah->ar_pln);
+ bcopy(tip, ar_tpa(ah), ah->ar_pln);
sa.sa_family = AF_ARP;
sa.sa_len = 2;
+
+ /* Calculate link header for sending frame */
+ bzero(&ro, sizeof(ro));
+ linkhdrsize = sizeof(linkhdr);
+ error = arp_fillheader(ifp, ah, 1, linkhdr, &linkhdrsize);
+ if (error != 0 && error != EAFNOSUPPORT) {
+ ARP_LOG(LOG_ERR, "Failed to calculate ARP header on %s: %d\n",
+ if_name(ifp), error);
+ return;
+ }
+
+ ro.ro_prepend = linkhdr;
+ ro.ro_plen = linkhdrsize;
+ ro.ro_flags = 0;
+
m->m_flags |= M_BCAST;
- (*ifp->if_output)(ifp, m, &sa, NULL);
+ m_clrprotoflags(m); /* Avoid confusing lower layers. */
+ (*ifp->if_output)(ifp, m, &sa, &ro);
ARPSTAT_INC(txrequests);
}
+
/*
- * Resolve an IP address into an ethernet address.
- * On input:
- * ifp is the interface we use
- * rt0 is the route to the final destination (possibly useless)
- * m is the mbuf. May be NULL if we don't have a packet.
- * dst is the next hop,
- * desten is where we want the address.
+ * Resolve an IP address into an ethernet address - heavy version.
+ * Used internally by arpresolve().
+ * We have already checked that we can't use existing lle without
+ * modification so we have to acquire LLE_EXCLUSIVE lle lock.
*
- * On success, desten is filled in and the function returns 0;
+ * On success, desten and flags are filled in and the function returns 0;
* If the packet must be held pending resolution, we return EWOULDBLOCK
* On other errors, we return the corresponding error code.
* Note that m_freem() handles NULL.
*/
-int
-arpresolve(struct ifnet *ifp, struct rtentry *rt0, struct mbuf *m,
- struct sockaddr *dst, u_char *desten, struct llentry **lle)
+static int
+arpresolve_full(struct ifnet *ifp, int is_gw, int flags, struct mbuf *m,
+ const struct sockaddr *dst, u_char *desten, uint32_t *pflags,
+ struct llentry **plle)
{
- struct llentry *la = 0;
- u_int flags = 0;
+ struct llentry *la = NULL, *la_tmp;
struct mbuf *curr = NULL;
struct mbuf *next = NULL;
int error, renew;
+ char *lladdr;
+ int ll_len;
- *lle = NULL;
- if (m != NULL) {
- if (m->m_flags & M_BCAST) {
- /* broadcast */
- (void)memcpy(desten,
- ifp->if_broadcastaddr, ifp->if_addrlen);
- return (0);
- }
- if (m->m_flags & M_MCAST && ifp->if_type != IFT_ARCNET) {
- /* multicast */
- ETHER_MAP_IP_MULTICAST(&SIN(dst)->sin_addr, desten);
- return (0);
- }
+ if (pflags != NULL)
+ *pflags = 0;
+ if (plle != NULL)
+ *plle = NULL;
+
+ if ((flags & LLE_CREATE) == 0) {
+ IF_AFDATA_RLOCK(ifp);
+ la = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst);
+ IF_AFDATA_RUNLOCK(ifp);
}
-retry:
- IF_AFDATA_RLOCK(ifp);
- la = lla_lookup(LLTABLE(ifp), flags, dst);
- IF_AFDATA_RUNLOCK(ifp);
- if ((la == NULL) && ((flags & LLE_EXCLUSIVE) == 0)
- && ((ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) == 0)) {
- flags |= (LLE_CREATE | LLE_EXCLUSIVE);
+ if (la == NULL && (ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) == 0) {
+ la = lltable_alloc_entry(LLTABLE(ifp), 0, dst);
+ if (la == NULL) {
+ log(LOG_DEBUG,
+ "arpresolve: can't allocate llinfo for %s on %s\n",
+ inet_ntoa(SIN(dst)->sin_addr), if_name(ifp));
+ m_freem(m);
+ return (EINVAL);
+ }
+
IF_AFDATA_WLOCK(ifp);
- la = lla_lookup(LLTABLE(ifp), flags, dst);
+ LLE_WLOCK(la);
+ la_tmp = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst);
+ /* Prefer ANY existing lle over newly-created one */
+ if (la_tmp == NULL)
+ lltable_link_entry(LLTABLE(ifp), la);
IF_AFDATA_WUNLOCK(ifp);
+ if (la_tmp != NULL) {
+ lltable_free_entry(LLTABLE(ifp), la);
+ la = la_tmp;
+ }
}
if (la == NULL) {
- if (flags & LLE_CREATE)
- log(LOG_DEBUG,
- "arpresolve: can't allocate llinfo for %s on %s\n",
- inet_ntoa(SIN(dst)->sin_addr), ifp->if_xname);
m_freem(m);
return (EINVAL);
}
if ((la->la_flags & LLE_VALID) &&
((la->la_flags & LLE_STATIC) || la->la_expire > time_uptime)) {
- bcopy(&la->ll_addr, desten, ifp->if_addrlen);
- /*
- * If entry has an expiry time and it is approaching,
- * see if we need to send an ARP request within this
- * arpt_down interval.
- */
- if (!(la->la_flags & LLE_STATIC) &&
- time_uptime + la->la_preempt > la->la_expire) {
- arprequest(ifp, NULL,
- &SIN(dst)->sin_addr, IF_LLADDR(ifp));
-
- la->la_preempt--;
+ if (flags & LLE_ADDRONLY) {
+ lladdr = la->ll_addr;
+ ll_len = ifp->if_addrlen;
+ } else {
+ lladdr = la->r_linkdata;
+ ll_len = la->r_hdrlen;
}
+ bcopy(lladdr, desten, ll_len);
- *lle = la;
- error = 0;
- goto done;
- }
-
- if (la->la_flags & LLE_STATIC) { /* should not happen! */
- log(LOG_DEBUG, "arpresolve: ouch, empty static llinfo for %s\n",
- inet_ntoa(SIN(dst)->sin_addr));
- m_freem(m);
- error = EINVAL;
- goto done;
+ /* Check if we have feedback request from arptimer() */
+ if (la->r_skip_req != 0) {
+ LLE_REQ_LOCK(la);
+ la->r_skip_req = 0; /* Notify that entry was used */
+ LLE_REQ_UNLOCK(la);
+ }
+ if (pflags != NULL)
+ *pflags = la->la_flags & (LLE_VALID|LLE_IFADDR);
+ if (plle) {
+ LLE_ADDREF(la);
+ *plle = la;
+ }
+ LLE_WUNLOCK(la);
+ return (0);
}
renew = (la->la_asked == 0 || la->la_expire != time_uptime);
- if ((renew || m != NULL) && (flags & LLE_EXCLUSIVE) == 0) {
- flags |= LLE_EXCLUSIVE;
- LLE_RUNLOCK(la);
- goto retry;
- }
/*
* There is an arptab entry, but no ethernet address
* response yet. Add the mbuf to the list, dropping
@@ -393,11 +520,6 @@ retry:
} else
la->la_hold = m;
la->la_numheld++;
- if (renew == 0 && (flags & LLE_EXCLUSIVE)) {
- flags &= ~LLE_EXCLUSIVE;
- LLE_DOWNGRADE(la);
- }
-
}
/*
* Return EWOULDBLOCK if we have tried less than arp_maxtries. It
@@ -408,32 +530,113 @@ retry:
if (la->la_asked < V_arp_maxtries)
error = EWOULDBLOCK; /* First request. */
else
- error = rt0 != NULL && (rt0->rt_flags & RTF_GATEWAY) ?
- EHOSTUNREACH : EHOSTDOWN;
+ error = is_gw != 0 ? EHOSTUNREACH : EHOSTDOWN;
if (renew) {
int canceled;
LLE_ADDREF(la);
la->la_expire = time_uptime;
- canceled = callout_reset(&la->la_timer, hz * V_arpt_down,
+ canceled = callout_reset(&la->lle_timer, hz * V_arpt_down,
arptimer, la);
if (canceled)
LLE_REMREF(la);
la->la_asked++;
LLE_WUNLOCK(la);
- arprequest(ifp, NULL, &SIN(dst)->sin_addr,
- IF_LLADDR(ifp));
+ arprequest(ifp, NULL, &SIN(dst)->sin_addr, NULL);
return (error);
}
-done:
- if (flags & LLE_EXCLUSIVE)
- LLE_WUNLOCK(la);
- else
- LLE_RUNLOCK(la);
+
+ LLE_WUNLOCK(la);
+ return (error);
+}
+
+/*
+ * Resolve an IP address into an ethernet address.
+ */
+int
+arpresolve_addr(struct ifnet *ifp, int flags, const struct sockaddr *dst,
+ char *desten, uint32_t *pflags, struct llentry **plle)
+{
+ int error;
+
+ flags |= LLE_ADDRONLY;
+ error = arpresolve_full(ifp, 0, flags, NULL, dst, desten, pflags, plle);
return (error);
}
+
+/*
+ * Looks up link header based on an IP address.
+ * On input:
+ * ifp is the interface we use
+ * is_gw != 0 if @dst represents gateway to some destination
+ * m is the mbuf. May be NULL if we don't have a packet.
+ * dst is the next hop,
+ * desten is the storage to put LL header.
+ * flags returns subset of lle flags: LLE_VALID | LLE_IFADDR
+ *
+ * On success, full/partial link header and flags are filled in and
+ * the function returns 0.
+ * If the packet must be held pending resolution, we return EWOULDBLOCK
+ * On other errors, we return the corresponding error code.
+ * Note that m_freem() handles NULL.
+ */
+int
+arpresolve(struct ifnet *ifp, int is_gw, struct mbuf *m,
+	const struct sockaddr *dst, u_char *desten, uint32_t *pflags,
+	struct llentry **plle)
+{
+	struct llentry *lle = NULL;
+
+	/* Give the optional out-parameters a defined value up front. */
+	if (pflags != NULL)
+		*pflags = 0;
+	if (plle != NULL)
+		*plle = NULL;
+
+	/* Broadcast and multicast packets are answered without a lookup. */
+	if (m != NULL && (m->m_flags & M_BCAST) != 0) {
+		/* broadcast */
+		(void)memcpy(desten,
+		    ifp->if_broadcastaddr, ifp->if_addrlen);
+		return (0);
+	}
+	if (m != NULL && (m->m_flags & M_MCAST) != 0) {
+		/* multicast */
+		ETHER_MAP_IP_MULTICAST(&SIN(dst)->sin_addr, desten);
+		return (0);
+	}
+
+	/*
+	 * Fast path lookup.  The entry is only requested locked when the
+	 * caller wants it handed back through @plle.
+	 */
+	IF_AFDATA_RLOCK(ifp);
+	lle = lla_lookup(LLTABLE(ifp),
+	    plle != NULL ? LLE_EXCLUSIVE : LLE_UNLOCKED, dst);
+
+	if (lle == NULL || (lle->r_flags & RLLE_VALID) == 0) {
+		/*
+		 * Nothing usable: drop the lock taken by the lookup (if any)
+		 * and hand over to arpresolve_full(), asking it to create
+		 * the entry when the lookup found none.
+		 */
+		if (plle != NULL && lle != NULL)
+			LLE_WUNLOCK(lle);
+		IF_AFDATA_RUNLOCK(ifp);
+
+		return (arpresolve_full(ifp, is_gw,
+		    lle == NULL ? LLE_CREATE : 0, m, dst, desten, pflags,
+		    plle));
+	}
+
+	/* Entry found, let's copy lle info */
+	bcopy(lle->r_linkdata, desten, lle->r_hdrlen);
+	if (pflags != NULL)
+		*pflags = LLE_VALID | (lle->r_flags & RLLE_IFADDR);
+
+	/* Check if we have feedback request from arptimer() */
+	if (lle->r_skip_req != 0) {
+		LLE_REQ_LOCK(lle);
+		lle->r_skip_req = 0; /* Notify that entry was used */
+		LLE_REQ_UNLOCK(lle);
+	}
+
+	/* Hand a referenced entry back to the caller when requested. */
+	if (plle != NULL) {
+		LLE_ADDREF(lle);
+		*plle = lle;
+		LLE_WUNLOCK(lle);
+	}
+	IF_AFDATA_RUNLOCK(ifp);
+	return (0);
+}
+
/*
* Common length and type checks are done here,
* then the protocol-specific routine is called.
@@ -442,34 +645,76 @@ static void
arpintr(struct mbuf *m)
{
struct arphdr *ar;
+ struct ifnet *ifp;
+ char *layer;
+ int hlen;
+
+ ifp = m->m_pkthdr.rcvif;
if (m->m_len < sizeof(struct arphdr) &&
((m = m_pullup(m, sizeof(struct arphdr))) == NULL)) {
- log(LOG_NOTICE, "arp: runt packet -- m_pullup failed\n");
+ ARP_LOG(LOG_NOTICE, "packet with short header received on %s\n",
+ if_name(ifp));
return;
}
ar = mtod(m, struct arphdr *);
- if (ntohs(ar->ar_hrd) != ARPHRD_ETHER &&
- ntohs(ar->ar_hrd) != ARPHRD_IEEE802 &&
- ntohs(ar->ar_hrd) != ARPHRD_ARCNET &&
- ntohs(ar->ar_hrd) != ARPHRD_IEEE1394 &&
- ntohs(ar->ar_hrd) != ARPHRD_INFINIBAND) {
- log(LOG_NOTICE, "arp: unknown hardware address format (0x%2D)"
- " (from %*D to %*D)\n", (unsigned char *)&ar->ar_hrd, "",
- ETHER_ADDR_LEN, (u_char *)ar_sha(ar), ":",
- ETHER_ADDR_LEN, (u_char *)ar_tha(ar), ":");
+ /* Check if length is sufficient */
+ if (m->m_len < arphdr_len(ar)) {
+ m = m_pullup(m, arphdr_len(ar));
+ if (m == NULL) {
+ ARP_LOG(LOG_NOTICE, "short packet received on %s\n",
+ if_name(ifp));
+ return;
+ }
+ ar = mtod(m, struct arphdr *);
+ }
+
+ hlen = 0;
+ layer = "";
+ switch (ntohs(ar->ar_hrd)) {
+ case ARPHRD_ETHER:
+ hlen = ETHER_ADDR_LEN; /* RFC 826 */
+ layer = "ethernet";
+ break;
+ case ARPHRD_IEEE802:
+ hlen = 6; /* RFC 1390, FDDI_ADDR_LEN */
+ layer = "fddi";
+ break;
+ case ARPHRD_ARCNET:
+ hlen = 1; /* RFC 1201, ARC_ADDR_LEN */
+ layer = "arcnet";
+ break;
+ case ARPHRD_INFINIBAND:
+ hlen = 20; /* RFC 4391, INFINIBAND_ALEN */
+ layer = "infiniband";
+ break;
+ case ARPHRD_IEEE1394:
+ hlen = 0; /* SHALL be 16 */ /* RFC 2734 */
+ layer = "firewire";
+
+ /*
+ * Restrict too long hardware addresses.
+ * Currently we are capable of handling 20-byte
+ * addresses ( sizeof(lle->ll_addr) )
+ */
+ if (ar->ar_hln >= 20)
+ hlen = 16;
+ break;
+ default:
+ ARP_LOG(LOG_NOTICE,
+ "packet with unknown hardware format 0x%02d received on "
+ "%s\n", ntohs(ar->ar_hrd), if_name(ifp));
m_freem(m);
return;
}
- if (m->m_len < arphdr_len(ar)) {
- if ((m = m_pullup(m, arphdr_len(ar))) == NULL) {
- log(LOG_NOTICE, "arp: runt packet\n");
- m_freem(m);
- return;
- }
- ar = mtod(m, struct arphdr *);
+ if (hlen != 0 && hlen != ar->ar_hln) {
+ ARP_LOG(LOG_NOTICE,
+ "packet with invalid %s address length %d received on %s\n",
+ layer, ar->ar_hln, if_name(ifp));
+ m_freem(m);
+ return;
}
ARPSTAT_INC(received);
@@ -518,20 +763,27 @@ SYSCTL_INT(_net_link_ether_inet, OID_AUTO, allow_multicast, CTLFLAG_RW,
static void
in_arpinput(struct mbuf *m)
{
+ struct rm_priotracker in_ifa_tracker;
struct arphdr *ah;
struct ifnet *ifp = m->m_pkthdr.rcvif;
- struct llentry *la = NULL;
- struct rtentry *rt;
+ struct llentry *la = NULL, *la_tmp;
struct ifaddr *ifa;
struct in_ifaddr *ia;
struct sockaddr sa;
struct in_addr isaddr, itaddr, myaddr;
u_int8_t *enaddr = NULL;
- int op, flags;
- int req_len;
+ int op;
int bridged = 0, is_bridge = 0;
- int carp_match = 0;
+ int carped;
struct sockaddr_in sin;
+ struct sockaddr *dst;
+ struct nhop4_basic nh4;
+ uint8_t linkhdr[LLE_MAX_LINKHDR];
+ struct route ro;
+ size_t linkhdrsize;
+ int lladdr_off;
+ int error;
+
sin.sin_len = sizeof(struct sockaddr_in);
sin.sin_family = AF_INET;
sin.sin_addr.s_addr = 0;
@@ -541,25 +793,24 @@ in_arpinput(struct mbuf *m)
if (ifp->if_type == IFT_BRIDGE)
is_bridge = 1;
- req_len = arphdr_len2(ifp->if_addrlen, sizeof(struct in_addr));
- if (m->m_len < req_len && (m = m_pullup(m, req_len)) == NULL) {
- log(LOG_NOTICE, "in_arp: runt packet -- m_pullup failed\n");
- return;
- }
-
+ /*
+ * We already have checked that mbuf contains enough contiguous data
+ * to hold entire arp message according to the arp header.
+ */
ah = mtod(m, struct arphdr *);
+
/*
* ARP is only for IPv4 so we can reject packets with
* a protocol length not equal to an IPv4 address.
*/
if (ah->ar_pln != sizeof(struct in_addr)) {
- log(LOG_NOTICE, "in_arp: requested protocol length != %zu\n",
+ ARP_LOG(LOG_NOTICE, "requested protocol length != %zu\n",
sizeof(struct in_addr));
goto drop;
}
if (allow_multicast == 0 && ETHER_IS_MULTICAST(ar_sha(ah))) {
- log(LOG_NOTICE, "arp: %*D is multicast\n",
+ ARP_LOG(LOG_NOTICE, "%*D is multicast\n",
ifp->if_addrlen, (u_char *)ar_sha(ah), ":");
goto drop;
}
@@ -575,26 +826,16 @@ in_arpinput(struct mbuf *m)
* For a bridge, we want to check the address irrespective
* of the receive interface. (This will change slightly
* when we have clusters of interfaces).
- * If the interface does not match, but the recieving interface
- * is part of carp, we call carp_iamatch to see if this is a
- * request for the virtual host ip.
- * XXX: This is really ugly!
*/
- IN_IFADDR_RLOCK();
+ IN_IFADDR_RLOCK(&in_ifa_tracker);
LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash) {
if (((bridged && ia->ia_ifp->if_bridge == ifp->if_bridge) ||
ia->ia_ifp == ifp) &&
- itaddr.s_addr == ia->ia_addr.sin_addr.s_addr) {
+ itaddr.s_addr == ia->ia_addr.sin_addr.s_addr &&
+ (ia->ia_ifa.ifa_carp == NULL ||
+ (*carp_iamatch_p)(&ia->ia_ifa, &enaddr))) {
ifa_ref(&ia->ia_ifa);
- IN_IFADDR_RUNLOCK();
- goto match;
- }
- if (ifp->if_carp != NULL &&
- (*carp_iamatch_p)(ifp, ia, &isaddr, &enaddr) &&
- itaddr.s_addr == ia->ia_addr.sin_addr.s_addr) {
- carp_match = 1;
- ifa_ref(&ia->ia_ifa);
- IN_IFADDR_RUNLOCK();
+ IN_IFADDR_RUNLOCK(&in_ifa_tracker);
goto match;
}
}
@@ -603,7 +844,7 @@ in_arpinput(struct mbuf *m)
ia->ia_ifp == ifp) &&
isaddr.s_addr == ia->ia_addr.sin_addr.s_addr) {
ifa_ref(&ia->ia_ifa);
- IN_IFADDR_RUNLOCK();
+ IN_IFADDR_RUNLOCK(&in_ifa_tracker);
goto match;
}
@@ -622,13 +863,13 @@ in_arpinput(struct mbuf *m)
if (BDG_MEMBER_MATCHES_ARP(itaddr.s_addr, ifp, ia)) {
ifa_ref(&ia->ia_ifa);
ifp = ia->ia_ifp;
- IN_IFADDR_RUNLOCK();
+ IN_IFADDR_RUNLOCK(&in_ifa_tracker);
goto match;
}
}
}
#undef BDG_MEMBER_MATCHES_ARP
- IN_IFADDR_RUNLOCK();
+ IN_IFADDR_RUNLOCK(&in_ifa_tracker);
/*
* No match, use the first inet address on the receive interface
@@ -636,7 +877,9 @@ in_arpinput(struct mbuf *m)
*/
IF_ADDR_RLOCK(ifp);
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
- if (ifa->ifa_addr->sa_family == AF_INET) {
+ if (ifa->ifa_addr->sa_family == AF_INET &&
+ (ifa->ifa_carp == NULL ||
+ (*carp_iamatch_p)(ifa, &enaddr))) {
ia = ifatoia(ifa);
ifa_ref(ifa);
IF_ADDR_RUNLOCK(ifp);
@@ -647,35 +890,44 @@ in_arpinput(struct mbuf *m)
/*
* If bridging, fall back to using any inet address.
*/
- IN_IFADDR_RLOCK();
+ IN_IFADDR_RLOCK(&in_ifa_tracker);
if (!bridged || (ia = TAILQ_FIRST(&V_in_ifaddrhead)) == NULL) {
- IN_IFADDR_RUNLOCK();
+ IN_IFADDR_RUNLOCK(&in_ifa_tracker);
goto drop;
}
ifa_ref(&ia->ia_ifa);
- IN_IFADDR_RUNLOCK();
+ IN_IFADDR_RUNLOCK(&in_ifa_tracker);
match:
if (!enaddr)
enaddr = (u_int8_t *)IF_LLADDR(ifp);
+ carped = (ia->ia_ifa.ifa_carp != NULL);
myaddr = ia->ia_addr.sin_addr;
ifa_free(&ia->ia_ifa);
if (!bcmp(ar_sha(ah), enaddr, ifp->if_addrlen))
goto drop; /* it's from me, ignore it. */
if (!bcmp(ar_sha(ah), ifp->if_broadcastaddr, ifp->if_addrlen)) {
- log(LOG_NOTICE,
- "arp: link address is broadcast for IP address %s!\n",
- inet_ntoa(isaddr));
+ ARP_LOG(LOG_NOTICE, "link address is broadcast for IP address "
+ "%s!\n", inet_ntoa(isaddr));
+ goto drop;
+ }
+
+ if (ifp->if_addrlen != ah->ar_hln) {
+ ARP_LOG(LOG_WARNING, "from %*D: addr len: new %d, "
+ "i/f %d (ignored)\n", ifp->if_addrlen,
+ (u_char *) ar_sha(ah), ":", ah->ar_hln,
+ ifp->if_addrlen);
goto drop;
}
+
/*
* Warn if another host is using the same IP address, but only if the
* IP address isn't 0.0.0.0, which is used for DHCP only, in which
* case we suppress the warning to avoid false positive complaints of
* potential misconfiguration.
*/
- if (!bridged && isaddr.s_addr == myaddr.s_addr && myaddr.s_addr != 0) {
- log(LOG_ERR,
- "arp: %*D is using my IP address %s on %s!\n",
+ if (!bridged && !carped && isaddr.s_addr == myaddr.s_addr &&
+ myaddr.s_addr != 0) {
+ ARP_LOG(LOG_ERR, "%*D is using my IP address %s on %s!\n",
ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
inet_ntoa(isaddr), ifp->if_xname);
itaddr = myaddr;
@@ -689,95 +941,73 @@ match:
sin.sin_len = sizeof(struct sockaddr_in);
sin.sin_family = AF_INET;
sin.sin_addr = isaddr;
- flags = (itaddr.s_addr == myaddr.s_addr) ? LLE_CREATE : 0;
- flags |= LLE_EXCLUSIVE;
- IF_AFDATA_LOCK(ifp);
- la = lla_lookup(LLTABLE(ifp), flags, (struct sockaddr *)&sin);
- IF_AFDATA_UNLOCK(ifp);
- if (la != NULL) {
- /* the following is not an error when doing bridging */
- if (!bridged && la->lle_tbl->llt_ifp != ifp && !carp_match) {
- if (log_arp_wrong_iface)
- log(LOG_WARNING, "arp: %s is on %s "
- "but got reply from %*D on %s\n",
- inet_ntoa(isaddr),
- la->lle_tbl->llt_ifp->if_xname,
- ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
- ifp->if_xname);
- LLE_WUNLOCK(la);
+ dst = (struct sockaddr *)&sin;
+ IF_AFDATA_RLOCK(ifp);
+ la = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst);
+ IF_AFDATA_RUNLOCK(ifp);
+ if (la != NULL)
+ arp_check_update_lle(ah, isaddr, ifp, bridged, la);
+ else if (itaddr.s_addr == myaddr.s_addr) {
+ /*
+ * Request/reply to our address, but no lle exists yet.
+ * Calculate full link prepend to use in lle.
+ */
+ linkhdrsize = sizeof(linkhdr);
+ if (lltable_calc_llheader(ifp, AF_INET, ar_sha(ah), linkhdr,
+ &linkhdrsize, &lladdr_off) != 0)
goto reply;
- }
- if ((la->la_flags & LLE_VALID) &&
- bcmp(ar_sha(ah), &la->ll_addr, ifp->if_addrlen)) {
- if (la->la_flags & LLE_STATIC) {
- LLE_WUNLOCK(la);
- if (log_arp_permanent_modify)
- log(LOG_ERR,
- "arp: %*D attempts to modify "
- "permanent entry for %s on %s\n",
- ifp->if_addrlen,
- (u_char *)ar_sha(ah), ":",
- inet_ntoa(isaddr), ifp->if_xname);
- goto reply;
- }
- if (log_arp_movements) {
- log(LOG_INFO, "arp: %s moved from %*D "
- "to %*D on %s\n",
- inet_ntoa(isaddr),
- ifp->if_addrlen,
- (u_char *)&la->ll_addr, ":",
- ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
- ifp->if_xname);
- }
- }
- if (ifp->if_addrlen != ah->ar_hln) {
- LLE_WUNLOCK(la);
- log(LOG_WARNING, "arp from %*D: addr len: new %d, "
- "i/f %d (ignored)\n", ifp->if_addrlen,
- (u_char *) ar_sha(ah), ":", ah->ar_hln,
- ifp->if_addrlen);
- goto drop;
- }
- (void)memcpy(&la->ll_addr, ar_sha(ah), ifp->if_addrlen);
- la->la_flags |= LLE_VALID;
+ /* Allocate new entry */
+ la = lltable_alloc_entry(LLTABLE(ifp), 0, dst);
+ if (la == NULL) {
- EVENTHANDLER_INVOKE(lle_event, la, LLENTRY_RESOLVED);
+ /*
+ * lle creation may fail if source address belongs
+ * to non-directly connected subnet. However, we
+ * will try to answer the request instead of dropping
+ * frame.
+ */
+ goto reply;
+ }
+ lltable_set_entry_addr(ifp, la, linkhdr, linkhdrsize,
+ lladdr_off);
- if (!(la->la_flags & LLE_STATIC)) {
- int canceled;
+ IF_AFDATA_WLOCK(ifp);
+ LLE_WLOCK(la);
+ la_tmp = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst);
- LLE_ADDREF(la);
- la->la_expire = time_uptime + V_arpt_keep;
- canceled = callout_reset(&la->la_timer,
- hz * V_arpt_keep, arptimer, la);
- if (canceled)
- LLE_REMREF(la);
- }
- la->la_asked = 0;
- la->la_preempt = V_arp_maxtries;
/*
- * The packets are all freed within the call to the output
- * routine.
+ * Check if lle still does not exist.
+ * If it does, that means that we either
+ * 1) have configured it explicitly, via
+ * 1a) 'arp -s' static entry or
+ * 1b) interface address static record
+ * or
+ * 2) it was the result of sending first packet to-host
+ * or
+ * 3) it was another arp reply packet we handled in
+ * different thread.
*
- * NB: The lock MUST be released before the call to the
- * output routine.
+ * In all cases except 3) we definitely need to prefer
+ * existing lle. For the sake of simplicity, prefer any
+ * existing lle over newly-created one.
*/
- if (la->la_hold != NULL) {
- struct mbuf *m_hold, *m_hold_next;
+ if (la_tmp == NULL)
+ lltable_link_entry(LLTABLE(ifp), la);
+ IF_AFDATA_WUNLOCK(ifp);
- m_hold = la->la_hold;
- la->la_hold = NULL;
- la->la_numheld = 0;
- memcpy(&sa, L3_ADDR(la), sizeof(sa));
- LLE_WUNLOCK(la);
- for (; m_hold != NULL; m_hold = m_hold_next) {
- m_hold_next = m_hold->m_nextpkt;
- m_hold->m_nextpkt = NULL;
- (*ifp->if_output)(ifp, m_hold, &sa, NULL);
- }
- } else
+ if (la_tmp == NULL) {
+ arp_mark_lle_reachable(la);
LLE_WUNLOCK(la);
+ } else {
+ /* Free newly-create entry and handle packet */
+ lltable_free_entry(LLTABLE(ifp), la);
+ la = la_tmp;
+ la_tmp = NULL;
+ arp_check_update_lle(ah, isaddr, ifp, bridged, la);
+ /* arp_check_update_lle() returns @la unlocked */
+ }
+ la = NULL;
}
reply:
if (op != ARPOP_REQUEST)
@@ -798,7 +1028,7 @@ reply:
if ((lle != NULL) && (lle->la_flags & LLE_PUB)) {
(void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln);
- (void)memcpy(ar_sha(ah), &lle->ll_addr, ah->ar_hln);
+ (void)memcpy(ar_sha(ah), lle->ll_addr, ah->ar_hln);
LLE_RUNLOCK(lle);
} else {
@@ -808,10 +1038,8 @@ reply:
if (!V_arp_proxyall)
goto drop;
- sin.sin_addr = itaddr;
/* XXX MRT use table 0 for arp reply */
- rt = in_rtalloc1((struct sockaddr *)&sin, 0, 0UL, 0);
- if (!rt)
+ if (fib4_lookup_nh_basic(0, itaddr, 0, 0, &nh4) != 0)
goto drop;
/*
@@ -819,11 +1047,8 @@ reply:
* as this one came out of, or we'll get into a fight
* over who claims what Ether address.
*/
- if (!rt->rt_ifp || rt->rt_ifp == ifp) {
- RTFREE_LOCKED(rt);
+ if (nh4.nh_ifp == ifp)
goto drop;
- }
- RTFREE_LOCKED(rt);
(void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln);
(void)memcpy(ar_sha(ah), enaddr, ah->ar_hln);
@@ -834,21 +1059,16 @@ reply:
* avoids ARP chaos if an interface is connected to the
* wrong network.
*/
- sin.sin_addr = isaddr;
/* XXX MRT use table 0 for arp checks */
- rt = in_rtalloc1((struct sockaddr *)&sin, 0, 0UL, 0);
- if (!rt)
+ if (fib4_lookup_nh_basic(0, isaddr, 0, 0, &nh4) != 0)
goto drop;
- if (rt->rt_ifp != ifp) {
- log(LOG_INFO, "arp_proxy: ignoring request"
- " from %s via %s, expecting %s\n",
- inet_ntoa(isaddr), ifp->if_xname,
- rt->rt_ifp->if_xname);
- RTFREE_LOCKED(rt);
+ if (nh4.nh_ifp != ifp) {
+ ARP_LOG(LOG_INFO, "proxy: ignoring request"
+ " from %s via %s\n",
+ inet_ntoa(isaddr), ifp->if_xname);
goto drop;
}
- RTFREE_LOCKED(rt);
#ifdef DEBUG_PROXY
printf("arp: proxying for %s\n", inet_ntoa(itaddr));
@@ -878,7 +1098,29 @@ reply:
m->m_pkthdr.rcvif = NULL;
sa.sa_family = AF_ARP;
sa.sa_len = 2;
- (*ifp->if_output)(ifp, m, &sa, NULL);
+
+ /* Calculate link header for sending frame */
+ bzero(&ro, sizeof(ro));
+ linkhdrsize = sizeof(linkhdr);
+ error = arp_fillheader(ifp, ah, 0, linkhdr, &linkhdrsize);
+
+ /*
+ * arp_fillheader() may fail due to lack of support inside encap request
+ * routing. This is not necessarily an error, AF_ARP can/should be handled
+ * by if_output().
+ */
+ if (error != 0 && error != EAFNOSUPPORT) {
+ ARP_LOG(LOG_ERR, "Failed to calculate ARP header on %s: %d\n",
+ if_name(ifp), error);
+ return;
+ }
+
+ ro.ro_prepend = linkhdr;
+ ro.ro_plen = linkhdrsize;
+ ro.ro_flags = 0;
+
+ m_clrprotoflags(m); /* Avoid confusing lower layers. */
+ (*ifp->if_output)(ifp, m, &sa, &ro);
ARPSTAT_INC(txreplies);
return;
@@ -887,45 +1129,249 @@ drop:
}
#endif
+/*
+ * Checks received arp data against existing @la.
+ * Updates lle state/performs notification if necessary.
+ *
+ * @ah is the received ARP header and @isaddr its sender protocol address
+ * (only used in log messages here).  @bridged != 0 disables the
+ * "wrong interface" check.
+ *
+ * Must be entered with @la write-locked (asserted below) and never
+ * returns with that lock still held by this function.  On
+ * lltable_try_set_entry_addr() failure the callee is assumed to have
+ * released the lock already -- NOTE(review): confirm against if_llatbl.c.
+ */
+static void
+arp_check_update_lle(struct arphdr *ah, struct in_addr isaddr, struct ifnet *ifp,
+	int bridged, struct llentry *la)
+{
+	struct sockaddr sa;
+	struct mbuf *m_hold, *m_hold_next;
+	uint8_t linkhdr[LLE_MAX_LINKHDR];
+	size_t linkhdrsize;
+	int lladdr_off;
+
+	LLE_WLOCK_ASSERT(la);
+
+	/* the following is not an error when doing bridging */
+	if (!bridged && la->lle_tbl->llt_ifp != ifp) {
+		if (log_arp_wrong_iface)
+			ARP_LOG(LOG_WARNING, "%s is on %s "
+			    "but got reply from %*D on %s\n",
+			    inet_ntoa(isaddr),
+			    la->lle_tbl->llt_ifp->if_xname,
+			    ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
+			    ifp->if_xname);
+		LLE_WUNLOCK(la);
+		return;
+	}
+	if ((la->la_flags & LLE_VALID) &&
+	    bcmp(ar_sha(ah), la->ll_addr, ifp->if_addrlen)) {
+		/* Static entries are never rewritten by received ARP data. */
+		if (la->la_flags & LLE_STATIC) {
+			LLE_WUNLOCK(la);
+			if (log_arp_permanent_modify)
+				ARP_LOG(LOG_ERR,
+				    "%*D attempts to modify "
+				    "permanent entry for %s on %s\n",
+				    ifp->if_addrlen,
+				    (u_char *)ar_sha(ah), ":",
+				    inet_ntoa(isaddr), ifp->if_xname);
+			return;
+		}
+		if (log_arp_movements) {
+			/*
+			 * Pass ll_addr itself, as the bcmp() above does;
+			 * taking its address would print the wrong bytes
+			 * if ll_addr is a pointer.
+			 */
+			ARP_LOG(LOG_INFO, "%s moved from %*D "
+			    "to %*D on %s\n",
+			    inet_ntoa(isaddr),
+			    ifp->if_addrlen,
+			    (u_char *)la->ll_addr, ":",
+			    ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
+			    ifp->if_xname);
+		}
+	}
+
+	/* Calculate full link prepend to use in lle */
+	linkhdrsize = sizeof(linkhdr);
+	if (lltable_calc_llheader(ifp, AF_INET, ar_sha(ah), linkhdr,
+	    &linkhdrsize, &lladdr_off) != 0) {
+		/* Do not leak the write lock handed in by the caller. */
+		LLE_WUNLOCK(la);
+		return;
+	}
+
+	/* Check if something has changed */
+	if (memcmp(la->r_linkdata, linkhdr, linkhdrsize) != 0 ||
+	    (la->la_flags & LLE_VALID) == 0) {
+		/* Try to perform LLE update */
+		if (lltable_try_set_entry_addr(ifp, la, linkhdr, linkhdrsize,
+		    lladdr_off) == 0)
+			return;
+
+		/* Clear fast path feedback request if set */
+		la->r_skip_req = 0;
+	}
+
+	arp_mark_lle_reachable(la);
+
+	/*
+	 * The packets are all freed within the call to the output
+	 * routine.
+	 *
+	 * NB: The lock MUST be released before the call to the
+	 * output routine.
+	 */
+	if (la->la_hold != NULL) {
+		/* Detach the held chain while still holding the lock. */
+		m_hold = la->la_hold;
+		la->la_hold = NULL;
+		la->la_numheld = 0;
+		lltable_fill_sa_entry(la, &sa);
+		LLE_WUNLOCK(la);
+		for (; m_hold != NULL; m_hold = m_hold_next) {
+			m_hold_next = m_hold->m_nextpkt;
+			m_hold->m_nextpkt = NULL;
+			/* Avoid confusing lower layers. */
+			m_clrprotoflags(m_hold);
+			(*ifp->if_output)(ifp, m_hold, &sa, NULL);
+		}
+	} else
+		LLE_WUNLOCK(la);
+}
+
+static void
+arp_mark_lle_reachable(struct llentry *la)
+{
+	int delay;
+
+	LLE_WLOCK_ASSERT(la);
+
+	/* Record the new state and notify lle_event listeners. */
+	la->ln_state = ARP_LLINFO_REACHABLE;
+	EVENTHANDLER_INVOKE(lle_event, la, LLENTRY_RESOLVED);
+
+	/* Only non-static entries get an expiry time and a timer. */
+	if ((la->la_flags & LLE_STATIC) == 0) {
+		/* Reference taken on behalf of the timer armed below. */
+		LLE_ADDREF(la);
+		la->la_expire = time_uptime + V_arpt_keep;
+
+		/*
+		 * Fire the timer early enough to leave room for
+		 * V_arp_maxtries retransmits before la_expire; if that
+		 * would be negative, use the whole keep interval.
+		 */
+		delay = V_arpt_keep - V_arp_maxtries * V_arpt_rexmit;
+		if (delay < 0)
+			delay = V_arpt_keep;
+
+		/* A non-zero return drops the reference taken above. */
+		if (callout_reset(&la->lle_timer, hz * delay, arptimer,
+		    la) != 0)
+			LLE_REMREF(la);
+	}
+
+	/* Restart the request accounting for this entry. */
+	la->la_asked = 0;
+	la->la_preempt = V_arp_maxtries;
+}
+
+/*
+ * Install a permanent link-layer record for the interface address @dst
+ * on @ifp, replacing any entry already present for that address.
+ */
+static __noinline void
+arp_add_ifa_lle(struct ifnet *ifp, const struct sockaddr *dst)
+{
+	struct llentry *new_lle, *old_lle;
+
+	/*
+	 * The record is flagged LLE_STATIC (as well as LLE_IFADDR) because
+	 * kernel code checks LLE_STATIC to decide whether an entry may be
+	 * rewritten by arp updates.
+	 */
+	new_lle = lltable_alloc_entry(LLTABLE(ifp), LLE_IFADDR | LLE_STATIC,
+	    dst);
+	if (new_lle == NULL) {
+		log(LOG_INFO, "arp_ifinit: cannot create arp "
+		    "entry for interface address\n");
+		return;
+	}
+
+	IF_AFDATA_WLOCK(ifp);
+	LLE_WLOCK(new_lle);
+
+	/* Take any existing entry for @dst out of the table first. */
+	old_lle = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst);
+	if (old_lle != NULL)
+		lltable_unlink_entry(LLTABLE(ifp), old_lle);
+
+	lltable_link_entry(LLTABLE(ifp), new_lle);
+	IF_AFDATA_WUNLOCK(ifp);
+
+	/* Announce the replaced entry (if any) before the new one. */
+	if (old_lle != NULL)
+		EVENTHANDLER_INVOKE(lle_event, old_lle, LLENTRY_EXPIRED);
+
+	EVENTHANDLER_INVOKE(lle_event, new_lle, LLENTRY_RESOLVED);
+	LLE_WUNLOCK(new_lle);
+
+	/* The replaced entry is released only after the new one is unlocked. */
+	if (old_lle != NULL)
+		lltable_free_entry(LLTABLE(ifp), old_lle);
+}
+
void
arp_ifinit(struct ifnet *ifp, struct ifaddr *ifa)
{
- struct llentry *lle;
+ const struct sockaddr_in *dst_in;
+ const struct sockaddr *dst;
- if (ntohl(IA_SIN(ifa)->sin_addr.s_addr) != INADDR_ANY) {
- arprequest(ifp, &IA_SIN(ifa)->sin_addr,
- &IA_SIN(ifa)->sin_addr, IF_LLADDR(ifp));
- /*
- * interface address is considered static entry
- * because the output of the arp utility shows
- * that L2 entry as permanent
- */
- IF_AFDATA_LOCK(ifp);
- lle = lla_lookup(LLTABLE(ifp), (LLE_CREATE | LLE_IFADDR | LLE_STATIC),
- (struct sockaddr *)IA_SIN(ifa));
- IF_AFDATA_UNLOCK(ifp);
- if (lle == NULL)
- log(LOG_INFO, "arp_ifinit: cannot create arp "
- "entry for interface address\n");
- else
- LLE_RUNLOCK(lle);
- }
- ifa->ifa_rtrequest = NULL;
+ if (ifa->ifa_carp != NULL)
+ return;
+
+ dst = ifa->ifa_addr;
+ dst_in = (const struct sockaddr_in *)dst;
+
+ if (ntohl(dst_in->sin_addr.s_addr) == INADDR_ANY)
+ return;
+ arp_announce_ifaddr(ifp, dst_in->sin_addr, IF_LLADDR(ifp));
+
+ arp_add_ifa_lle(ifp, dst);
}
void
-arp_ifinit2(struct ifnet *ifp, struct ifaddr *ifa, u_char *enaddr)
+arp_announce_ifaddr(struct ifnet *ifp, struct in_addr addr, u_char *enaddr)
+{
+
+ if (ntohl(addr.s_addr) != INADDR_ANY)
+ arprequest(ifp, &addr, &addr, enaddr);
+}
+
+/*
+ * Runs arp_ifinit() for every AF_INET address configured on @ifp so
+ * that other nodes are notified about the address change via
+ * gratuitous ARPs.
+ */
+static __noinline void
+arp_handle_ifllchange(struct ifnet *ifp)
+{
+	struct ifaddr *cur;
+
+	TAILQ_FOREACH(cur, &ifp->if_addrhead, ifa_link) {
+		/* Only IPv4 addresses take part in ARP. */
+		if (cur->ifa_addr->sa_family != AF_INET)
+			continue;
+		arp_ifinit(ifp, cur);
+	}
+}
+
+/*
+ * Handler for the interface link-layer address change event.
+ * Always refreshes the interface's ARP table entries; re-announces the
+ * interface addresses only while the interface is up.
+ */
+static void
+arp_iflladdr(void *arg __unused, struct ifnet *ifp)
+{
+
+	lltable_update_ifaddr(LLTABLE(ifp));
+
+	/* Nothing to announce on an interface that is not up. */
+	if ((ifp->if_flags & IFF_UP) == 0)
+		return;
+
+	arp_handle_ifllchange(ifp);
+}
+
+static void
+vnet_arp_init(void)
{
- if (ntohl(IA_SIN(ifa)->sin_addr.s_addr) != INADDR_ANY)
- arprequest(ifp, &IA_SIN(ifa)->sin_addr,
- &IA_SIN(ifa)->sin_addr, enaddr);
- ifa->ifa_rtrequest = NULL;
+
+ if (IS_DEFAULT_VNET(curvnet)) {
+ netisr_register(&arp_nh);
+ iflladdr_tag = EVENTHANDLER_REGISTER(iflladdr_event,
+ arp_iflladdr, NULL, EVENTHANDLER_PRI_ANY);
+ }
+#ifdef VIMAGE
+ else
+ netisr_register_vnet(&arp_nh);
+#endif
}
+VNET_SYSINIT(vnet_arp_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_SECOND,
+ vnet_arp_init, 0);
+#ifdef VIMAGE
+/*
+ * We have to unregister ARP along with IP otherwise we risk doing INADDR_HASH
+ * lookups after destroying the hash. Ideally this would go on SI_ORDER_3.5.
+ */
static void
-arp_init(void)
+vnet_arp_destroy(__unused void *arg)
{
- netisr_register(&arp_nh);
+ netisr_unregister_vnet(&arp_nh);
}
-SYSINIT(arp, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY, arp_init, 0);
+VNET_SYSUNINIT(vnet_arp_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD,
+ vnet_arp_destroy, NULL);
+#endif
diff --git a/freebsd/sys/netinet/if_ether.h b/freebsd/sys/netinet/if_ether.h
index ce63d8db..27e51f78 100644
--- a/freebsd/sys/netinet/if_ether.h
+++ b/freebsd/sys/netinet/if_ether.h
@@ -48,9 +48,9 @@
(enaddr)[0] = 0x01; \
(enaddr)[1] = 0x00; \
(enaddr)[2] = 0x5e; \
- (enaddr)[3] = ((u_char *)ipaddr)[1] & 0x7f; \
- (enaddr)[4] = ((u_char *)ipaddr)[2]; \
- (enaddr)[5] = ((u_char *)ipaddr)[3]; \
+ (enaddr)[3] = ((const u_char *)ipaddr)[1] & 0x7f; \
+ (enaddr)[4] = ((const u_char *)ipaddr)[2]; \
+ (enaddr)[5] = ((const u_char *)ipaddr)[3]; \
}
/*
* Macro to map an IP6 multicast address to an Ethernet multicast address.
@@ -63,10 +63,10 @@
{ \
(enaddr)[0] = 0x33; \
(enaddr)[1] = 0x33; \
- (enaddr)[2] = ((u_char *)ip6addr)[12]; \
- (enaddr)[3] = ((u_char *)ip6addr)[13]; \
- (enaddr)[4] = ((u_char *)ip6addr)[14]; \
- (enaddr)[5] = ((u_char *)ip6addr)[15]; \
+ (enaddr)[2] = ((const u_char *)ip6addr)[12]; \
+ (enaddr)[3] = ((const u_char *)ip6addr)[13]; \
+ (enaddr)[4] = ((const u_char *)ip6addr)[14]; \
+ (enaddr)[5] = ((const u_char *)ip6addr)[15]; \
}
/*
@@ -89,6 +89,7 @@ struct ether_arp {
#define arp_pln ea_hdr.ar_pln
#define arp_op ea_hdr.ar_op
+#ifndef BURN_BRIDGES /* Can be used by third party software. */
struct sockaddr_inarp {
u_char sin_len;
u_char sin_family;
@@ -99,6 +100,8 @@ struct sockaddr_inarp {
u_short sin_other;
#define SIN_PROXY 1
};
+#endif /* !BURN_BRIDGES */
+
/*
* IP and ethernet specific routing flags
*/
@@ -109,14 +112,19 @@ struct sockaddr_inarp {
extern u_char ether_ipmulticast_min[ETHER_ADDR_LEN];
extern u_char ether_ipmulticast_max[ETHER_ADDR_LEN];
-struct llentry;
struct ifaddr;
+struct llentry;
-int arpresolve(struct ifnet *ifp, struct rtentry *rt,
- struct mbuf *m, struct sockaddr *dst, u_char *desten,
- struct llentry **lle);
+int arpresolve_addr(struct ifnet *ifp, int flags,
+ const struct sockaddr *dst, char *desten, uint32_t *pflags,
+ struct llentry **plle);
+int arpresolve(struct ifnet *ifp, int is_gw, struct mbuf *m,
+ const struct sockaddr *dst, u_char *desten, uint32_t *pflags,
+ struct llentry **plle);
+void arprequest(struct ifnet *, const struct in_addr *,
+ const struct in_addr *, u_char *);
void arp_ifinit(struct ifnet *, struct ifaddr *);
-void arp_ifinit2(struct ifnet *, struct ifaddr *, u_char *);
+void arp_announce_ifaddr(struct ifnet *, struct in_addr addr, u_char *);
#endif
#endif
diff --git a/freebsd/sys/netinet/igmp.c b/freebsd/sys/netinet/igmp.c
index 78d9685b..cd57e426 100644
--- a/freebsd/sys/netinet/igmp.c
+++ b/freebsd/sys/netinet/igmp.c
@@ -52,6 +52,8 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
+#include <rtems/bsd/local/opt_ddb.h>
+
#include <rtems/bsd/sys/param.h>
#include <sys/systm.h>
#include <sys/module.h>
@@ -60,11 +62,18 @@ __FBSDID("$FreeBSD$");
#include <sys/socket.h>
#include <sys/protosw.h>
#include <sys/kernel.h>
+#include <rtems/bsd/sys/lock.h>
+#include <sys/rmlock.h>
#include <sys/sysctl.h>
#include <sys/ktr.h>
#include <sys/condvar.h>
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
#include <net/if.h>
+#include <net/if_var.h>
#include <net/netisr.h>
#include <net/vnet.h>
@@ -85,15 +94,15 @@ __FBSDID("$FreeBSD$");
#define KTR_IGMPV3 KTR_INET
#endif
-static struct igmp_ifinfo *
+static struct igmp_ifsoftc *
igi_alloc_locked(struct ifnet *);
static void igi_delete_locked(const struct ifnet *);
-static void igmp_dispatch_queue(struct ifqueue *, int, const int);
+static void igmp_dispatch_queue(struct mbufq *, int, const int);
static void igmp_fasttimo_vnet(void);
-static void igmp_final_leave(struct in_multi *, struct igmp_ifinfo *);
+static void igmp_final_leave(struct in_multi *, struct igmp_ifsoftc *);
static int igmp_handle_state_change(struct in_multi *,
- struct igmp_ifinfo *);
-static int igmp_initial_join(struct in_multi *, struct igmp_ifinfo *);
+ struct igmp_ifsoftc *);
+static int igmp_initial_join(struct in_multi *, struct igmp_ifsoftc *);
static int igmp_input_v1_query(struct ifnet *, const struct ip *,
const struct igmp *);
static int igmp_input_v2_query(struct ifnet *, const struct ip *,
@@ -101,7 +110,7 @@ static int igmp_input_v2_query(struct ifnet *, const struct ip *,
static int igmp_input_v3_query(struct ifnet *, const struct ip *,
/*const*/ struct igmpv3 *);
static int igmp_input_v3_group_query(struct in_multi *,
- struct igmp_ifinfo *, int, /*const*/ struct igmpv3 *);
+ struct igmp_ifsoftc *, int, /*const*/ struct igmpv3 *);
static int igmp_input_v1_report(struct ifnet *, /*const*/ struct ip *,
/*const*/ struct igmp *);
static int igmp_input_v2_report(struct ifnet *, /*const*/ struct ip *,
@@ -113,25 +122,25 @@ static struct mbuf *
#ifdef KTR
static char * igmp_rec_type_to_str(const int);
#endif
-static void igmp_set_version(struct igmp_ifinfo *, const int);
+static void igmp_set_version(struct igmp_ifsoftc *, const int);
static void igmp_slowtimo_vnet(void);
static int igmp_v1v2_queue_report(struct in_multi *, const int);
static void igmp_v1v2_process_group_timer(struct in_multi *, const int);
-static void igmp_v1v2_process_querier_timers(struct igmp_ifinfo *);
+static void igmp_v1v2_process_querier_timers(struct igmp_ifsoftc *);
static void igmp_v2_update_group(struct in_multi *, const int);
-static void igmp_v3_cancel_link_timers(struct igmp_ifinfo *);
-static void igmp_v3_dispatch_general_query(struct igmp_ifinfo *);
+static void igmp_v3_cancel_link_timers(struct igmp_ifsoftc *);
+static void igmp_v3_dispatch_general_query(struct igmp_ifsoftc *);
static struct mbuf *
igmp_v3_encap_report(struct ifnet *, struct mbuf *);
-static int igmp_v3_enqueue_group_record(struct ifqueue *,
+static int igmp_v3_enqueue_group_record(struct mbufq *,
struct in_multi *, const int, const int, const int);
-static int igmp_v3_enqueue_filter_change(struct ifqueue *,
+static int igmp_v3_enqueue_filter_change(struct mbufq *,
struct in_multi *);
-static void igmp_v3_process_group_timers(struct igmp_ifinfo *,
- struct ifqueue *, struct ifqueue *, struct in_multi *,
+static void igmp_v3_process_group_timers(struct igmp_ifsoftc *,
+ struct mbufq *, struct mbufq *, struct in_multi *,
const int);
static int igmp_v3_merge_state_changes(struct in_multi *,
- struct ifqueue *);
+ struct mbufq *);
static void igmp_v3_suppress_group_record(struct in_multi *);
static int sysctl_igmp_default_version(SYSCTL_HANDLER_ARGS);
static int sysctl_igmp_gsr(SYSCTL_HANDLER_ARGS);
@@ -159,13 +168,13 @@ static const struct netisr_handler igmp_nh = {
* * All output is delegated to the netisr.
* Now that Giant has been eliminated, the netisr may be inlined.
* * IN_MULTI_LOCK covers in_multi.
- * * IGMP_LOCK covers igmp_ifinfo and any global variables in this file,
+ * * IGMP_LOCK covers igmp_ifsoftc and any global variables in this file,
* including the output queue.
* * IF_ADDR_LOCK covers if_multiaddrs, which is used for a variety of
* per-link state iterators.
- * * igmp_ifinfo is valid as long as PF_INET is attached to the interface,
+ * * igmp_ifsoftc is valid as long as PF_INET is attached to the interface,
* therefore it is not refcounted.
- * We allow unlocked reads of igmp_ifinfo when accessed via in_multi.
+ * We allow unlocked reads of igmp_ifsoftc when accessed via in_multi.
*
* Reference counting
* * IGMP acquires its own reference every time an in_multi is passed to
@@ -220,7 +229,8 @@ static VNET_DEFINE(int, current_state_timers_running); /* IGMPv1/v2 host
#define V_state_change_timers_running VNET(state_change_timers_running)
#define V_current_state_timers_running VNET(current_state_timers_running)
-static VNET_DEFINE(LIST_HEAD(, igmp_ifinfo), igi_head);
+static VNET_DEFINE(LIST_HEAD(, igmp_ifsoftc), igi_head) =
+ LIST_HEAD_INITIALIZER(igi_head);
static VNET_DEFINE(struct igmpstat, igmpstat) = {
.igps_version = IGPS_VERSION_3,
.igps_len = sizeof(struct igmpstat),
@@ -250,32 +260,32 @@ static VNET_DEFINE(int, igmp_default_version) = IGMP_VERSION_3;
/*
* Virtualized sysctls.
*/
-SYSCTL_VNET_STRUCT(_net_inet_igmp, IGMPCTL_STATS, stats, CTLFLAG_RW,
+SYSCTL_STRUCT(_net_inet_igmp, IGMPCTL_STATS, stats, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(igmpstat), igmpstat, "");
-SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, recvifkludge, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_igmp, OID_AUTO, recvifkludge, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(igmp_recvifkludge), 0,
"Rewrite IGMPv1/v2 reports from 0.0.0.0 to contain subnet address");
-SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, sendra, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_igmp, OID_AUTO, sendra, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(igmp_sendra), 0,
"Send IP Router Alert option in IGMPv2/v3 messages");
-SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, sendlocal, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_igmp, OID_AUTO, sendlocal, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(igmp_sendlocal), 0,
"Send IGMP membership reports for 224.0.0.0/24 groups");
-SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, v1enable, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_igmp, OID_AUTO, v1enable, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(igmp_v1enable), 0,
"Enable backwards compatibility with IGMPv1");
-SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, v2enable, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_igmp, OID_AUTO, v2enable, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(igmp_v2enable), 0,
"Enable backwards compatibility with IGMPv2");
-SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, legacysupp, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_igmp, OID_AUTO, legacysupp, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(igmp_legacysupp), 0,
"Allow v1/v2 reports to suppress v3 group responses");
-SYSCTL_VNET_PROC(_net_inet_igmp, OID_AUTO, default_version,
- CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+SYSCTL_PROC(_net_inet_igmp, OID_AUTO, default_version,
+ CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
&VNET_NAME(igmp_default_version), 0, sysctl_igmp_default_version, "I",
"Default version of IGMP to run on each interface");
-SYSCTL_VNET_PROC(_net_inet_igmp, OID_AUTO, gsrdelay,
- CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+SYSCTL_PROC(_net_inet_igmp, OID_AUTO, gsrdelay,
+ CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
&VNET_NAME(igmp_gsrdelay.tv_sec), 0, sysctl_igmp_gsr, "I",
"Rate limit for IGMPv3 Group-and-Source queries in seconds");
@@ -291,7 +301,7 @@ igmp_save_context(struct mbuf *m, struct ifnet *ifp)
{
#ifdef VIMAGE
- m->m_pkthdr.header = ifp->if_vnet;
+ m->m_pkthdr.PH_loc.ptr = ifp->if_vnet;
#endif /* VIMAGE */
m->m_pkthdr.flowid = ifp->if_index;
}
@@ -300,7 +310,7 @@ static __inline void
igmp_scrub_context(struct mbuf *m)
{
- m->m_pkthdr.header = NULL;
+ m->m_pkthdr.PH_loc.ptr = NULL;
m->m_pkthdr.flowid = 0;
}
@@ -328,7 +338,7 @@ igmp_restore_context(struct mbuf *m)
#ifdef notyet
#if defined(VIMAGE) && defined(INVARIANTS)
- KASSERT(curvnet == (m->m_pkthdr.header),
+ KASSERT(curvnet == (m->m_pkthdr.PH_loc.ptr),
("%s: called when curvnet was not restored", __func__));
#endif
#endif
@@ -413,7 +423,7 @@ out_locked:
}
/*
- * Expose struct igmp_ifinfo to userland, keyed by ifindex.
+ * Expose struct igmp_ifsoftc to userland, keyed by ifindex.
* For use by ifmcstat(8).
*
* SMPng: NOTE: Does an unlocked ifindex space read.
@@ -427,7 +437,7 @@ sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS)
int error;
u_int namelen;
struct ifnet *ifp;
- struct igmp_ifinfo *igi;
+ struct igmp_ifsoftc *igi;
name = (int *)arg1;
namelen = arg2;
@@ -458,8 +468,18 @@ sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS)
LIST_FOREACH(igi, &V_igi_head, igi_link) {
if (ifp == igi->igi_ifp) {
- error = SYSCTL_OUT(req, igi,
- sizeof(struct igmp_ifinfo));
+ struct igmp_ifinfo info;
+
+ info.igi_version = igi->igi_version;
+ info.igi_v1_timer = igi->igi_v1_timer;
+ info.igi_v2_timer = igi->igi_v2_timer;
+ info.igi_v3_timer = igi->igi_v3_timer;
+ info.igi_flags = igi->igi_flags;
+ info.igi_rv = igi->igi_rv;
+ info.igi_qi = igi->igi_qi;
+ info.igi_qri = igi->igi_qri;
+ info.igi_uri = igi->igi_uri;
+ error = SYSCTL_OUT(req, &info, sizeof(info));
break;
}
}
@@ -476,15 +496,12 @@ out_locked:
* VIMAGE: Assumes the vnet pointer has been set.
*/
static void
-igmp_dispatch_queue(struct ifqueue *ifq, int limit, const int loop)
+igmp_dispatch_queue(struct mbufq *mq, int limit, const int loop)
{
struct mbuf *m;
- for (;;) {
- _IF_DEQUEUE(ifq, m);
- if (m == NULL)
- break;
- CTR3(KTR_IGMPV3, "%s: dispatch %p from %p", __func__, ifq, m);
+ while ((m = mbufq_dequeue(mq)) != NULL) {
+ CTR3(KTR_IGMPV3, "%s: dispatch %p from %p", __func__, mq, m);
if (loop)
m->m_flags |= M_IGMP_LOOP;
netisr_dispatch(NETISR_IGMP, m);
@@ -525,7 +542,7 @@ igmp_ra_alloc(void)
struct mbuf *m;
struct ipoption *p;
- MGET(m, M_DONTWAIT, MT_DATA);
+ m = m_get(M_WAITOK, MT_DATA);
p = mtod(m, struct ipoption *);
p->ipopt_dst.s_addr = INADDR_ANY;
p->ipopt_list[0] = IPOPT_RA; /* Router Alert Option */
@@ -540,10 +557,10 @@ igmp_ra_alloc(void)
/*
* Attach IGMP when PF_INET is attached to an interface.
*/
-struct igmp_ifinfo *
+struct igmp_ifsoftc *
igmp_domifattach(struct ifnet *ifp)
{
- struct igmp_ifinfo *igi;
+ struct igmp_ifsoftc *igi;
CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)",
__func__, ifp, ifp->if_xname);
@@ -562,14 +579,14 @@ igmp_domifattach(struct ifnet *ifp)
/*
* VIMAGE: assume curvnet set by caller.
*/
-static struct igmp_ifinfo *
+static struct igmp_ifsoftc *
igi_alloc_locked(/*const*/ struct ifnet *ifp)
{
- struct igmp_ifinfo *igi;
+ struct igmp_ifsoftc *igi;
IGMP_LOCK_ASSERT();
- igi = malloc(sizeof(struct igmp_ifinfo), M_IGMP, M_NOWAIT|M_ZERO);
+ igi = malloc(sizeof(struct igmp_ifsoftc), M_IGMP, M_NOWAIT|M_ZERO);
if (igi == NULL)
goto out;
@@ -580,17 +597,12 @@ igi_alloc_locked(/*const*/ struct ifnet *ifp)
igi->igi_qi = IGMP_QI_INIT;
igi->igi_qri = IGMP_QRI_INIT;
igi->igi_uri = IGMP_URI_INIT;
-
SLIST_INIT(&igi->igi_relinmhead);
-
- /*
- * Responses to general queries are subject to bounds.
- */
- IFQ_SET_MAXLEN(&igi->igi_gq, IGMP_MAX_RESPONSE_PACKETS);
+ mbufq_init(&igi->igi_gq, IGMP_MAX_RESPONSE_PACKETS);
LIST_INSERT_HEAD(&V_igi_head, igi, igi_link);
- CTR2(KTR_IGMPV3, "allocate igmp_ifinfo for ifp %p(%s)",
+ CTR2(KTR_IGMPV3, "allocate igmp_ifsoftc for ifp %p(%s)",
ifp, ifp->if_xname);
out:
@@ -609,7 +621,7 @@ out:
void
igmp_ifdetach(struct ifnet *ifp)
{
- struct igmp_ifinfo *igi;
+ struct igmp_ifsoftc *igi;
struct ifmultiaddr *ifma;
struct in_multi *inm, *tinm;
@@ -656,25 +668,21 @@ igmp_ifdetach(struct ifnet *ifp)
void
igmp_domifdetach(struct ifnet *ifp)
{
- struct igmp_ifinfo *igi;
CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)",
__func__, ifp, ifp->if_xname);
IGMP_LOCK();
-
- igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
igi_delete_locked(ifp);
-
IGMP_UNLOCK();
}
static void
igi_delete_locked(const struct ifnet *ifp)
{
- struct igmp_ifinfo *igi, *tigi;
+ struct igmp_ifsoftc *igi, *tigi;
- CTR3(KTR_IGMPV3, "%s: freeing igmp_ifinfo for ifp %p(%s)",
+ CTR3(KTR_IGMPV3, "%s: freeing igmp_ifsoftc for ifp %p(%s)",
__func__, ifp, ifp->if_xname);
IGMP_LOCK_ASSERT();
@@ -684,7 +692,7 @@ igi_delete_locked(const struct ifnet *ifp)
/*
* Free deferred General Query responses.
*/
- _IF_DRAIN(&igi->igi_gq);
+ mbufq_drain(&igi->igi_gq);
LIST_REMOVE(igi, igi_link);
@@ -696,10 +704,6 @@ igi_delete_locked(const struct ifnet *ifp)
return;
}
}
-
-#ifdef INVARIANTS
- panic("%s: igmp_ifinfo not found for ifp %p\n", __func__, ifp);
-#endif
}
/*
@@ -713,7 +717,7 @@ igmp_input_v1_query(struct ifnet *ifp, const struct ip *ip,
const struct igmp *igmp)
{
struct ifmultiaddr *ifma;
- struct igmp_ifinfo *igi;
+ struct igmp_ifsoftc *igi;
struct in_multi *inm;
/*
@@ -733,7 +737,7 @@ igmp_input_v1_query(struct ifnet *ifp, const struct ip *ip,
IGMP_LOCK();
igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
- KASSERT(igi != NULL, ("%s: no igmp_ifinfo for ifp %p", __func__, ifp));
+ KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp));
if (igi->igi_flags & IGIF_LOOPBACK) {
CTR2(KTR_IGMPV3, "ignore v1 query on IGIF_LOOPBACK ifp %p(%s)",
@@ -798,7 +802,7 @@ igmp_input_v2_query(struct ifnet *ifp, const struct ip *ip,
const struct igmp *igmp)
{
struct ifmultiaddr *ifma;
- struct igmp_ifinfo *igi;
+ struct igmp_ifsoftc *igi;
struct in_multi *inm;
int is_general_query;
uint16_t timer;
@@ -827,7 +831,7 @@ igmp_input_v2_query(struct ifnet *ifp, const struct ip *ip,
IGMP_LOCK();
igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
- KASSERT(igi != NULL, ("%s: no igmp_ifinfo for ifp %p", __func__, ifp));
+ KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp));
if (igi->igi_flags & IGIF_LOOPBACK) {
CTR2(KTR_IGMPV3, "ignore v2 query on IGIF_LOOPBACK ifp %p(%s)",
@@ -948,7 +952,7 @@ static int
igmp_input_v3_query(struct ifnet *ifp, const struct ip *ip,
/*const*/ struct igmpv3 *igmpv3)
{
- struct igmp_ifinfo *igi;
+ struct igmp_ifsoftc *igi;
struct in_multi *inm;
int is_general_query;
uint32_t maxresp, nsrc, qqi;
@@ -1021,7 +1025,7 @@ igmp_input_v3_query(struct ifnet *ifp, const struct ip *ip,
IGMP_LOCK();
igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
- KASSERT(igi != NULL, ("%s: no igmp_ifinfo for ifp %p", __func__, ifp));
+ KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp));
if (igi->igi_flags & IGIF_LOOPBACK) {
CTR2(KTR_IGMPV3, "ignore v3 query on IGIF_LOOPBACK ifp %p(%s)",
@@ -1104,12 +1108,12 @@ out_locked:
}
/*
- * Process a recieved IGMPv3 group-specific or group-and-source-specific
+ * Process a received IGMPv3 group-specific or group-and-source-specific
* query.
- * Return <0 if any error occured. Currently this is ignored.
+ * Return <0 if any error occurred. Currently this is ignored.
*/
static int
-igmp_input_v3_group_query(struct in_multi *inm, struct igmp_ifinfo *igi,
+igmp_input_v3_group_query(struct in_multi *inm, struct igmp_ifsoftc *igi,
int timer, /*const*/ struct igmpv3 *igmpv3)
{
int retval;
@@ -1214,6 +1218,7 @@ static int
igmp_input_v1_report(struct ifnet *ifp, /*const*/ struct ip *ip,
/*const*/ struct igmp *igmp)
{
+ struct rm_priotracker in_ifa_tracker;
struct in_ifaddr *ia;
struct in_multi *inm;
@@ -1236,7 +1241,7 @@ igmp_input_v1_report(struct ifnet *ifp, /*const*/ struct ip *ip,
* Replace 0.0.0.0 with the subnet address if told to do so.
*/
if (V_igmp_recvifkludge && in_nullhost(ip->ip_src)) {
- IFP_TO_IA(ifp, ia);
+ IFP_TO_IA(ifp, ia, &in_ifa_tracker);
if (ia != NULL) {
ip->ip_src.s_addr = htonl(ia->ia_subnet);
ifa_free(&ia->ia_ifa);
@@ -1254,7 +1259,7 @@ igmp_input_v1_report(struct ifnet *ifp, /*const*/ struct ip *ip,
IN_MULTI_LOCK();
inm = inm_lookup(ifp, igmp->igmp_group);
if (inm != NULL) {
- struct igmp_ifinfo *igi;
+ struct igmp_ifsoftc *igi;
igi = inm->inm_igi;
if (igi == NULL) {
@@ -1322,6 +1327,7 @@ static int
igmp_input_v2_report(struct ifnet *ifp, /*const*/ struct ip *ip,
/*const*/ struct igmp *igmp)
{
+ struct rm_priotracker in_ifa_tracker;
struct in_ifaddr *ia;
struct in_multi *inm;
@@ -1330,7 +1336,7 @@ igmp_input_v2_report(struct ifnet *ifp, /*const*/ struct ip *ip,
* leave requires knowing that we are the only member of a
* group.
*/
- IFP_TO_IA(ifp, ia);
+ IFP_TO_IA(ifp, ia, &in_ifa_tracker);
if (ia != NULL && in_hosteq(ip->ip_src, IA_SIN(ia)->sin_addr)) {
ifa_free(&ia->ia_ifa);
return (0);
@@ -1378,7 +1384,7 @@ igmp_input_v2_report(struct ifnet *ifp, /*const*/ struct ip *ip,
IN_MULTI_LOCK();
inm = inm_lookup(ifp, igmp->igmp_group);
if (inm != NULL) {
- struct igmp_ifinfo *igi;
+ struct igmp_ifsoftc *igi;
igi = inm->inm_igi;
KASSERT(igi != NULL, ("%s: no igi for ifp %p", __func__, ifp));
@@ -1425,26 +1431,29 @@ out_locked:
return (0);
}
-void
-igmp_input(struct mbuf *m, int off)
+int
+igmp_input(struct mbuf **mp, int *offp, int proto)
{
int iphlen;
struct ifnet *ifp;
struct igmp *igmp;
struct ip *ip;
+ struct mbuf *m;
int igmplen;
int minlen;
int queryver;
- CTR3(KTR_IGMPV3, "%s: called w/mbuf (%p,%d)", __func__, m, off);
+ CTR3(KTR_IGMPV3, "%s: called w/mbuf (%p,%d)", __func__, *mp, *offp);
+ m = *mp;
ifp = m->m_pkthdr.rcvif;
+ *mp = NULL;
IGMPSTAT_INC(igps_rcv_total);
ip = mtod(m, struct ip *);
- iphlen = off;
- igmplen = ip->ip_len;
+ iphlen = *offp;
+ igmplen = ntohs(ip->ip_len) - iphlen;
/*
* Validate lengths.
@@ -1452,7 +1461,7 @@ igmp_input(struct mbuf *m, int off)
if (igmplen < IGMP_MINLEN) {
IGMPSTAT_INC(igps_rcv_tooshort);
m_freem(m);
- return;
+ return (IPPROTO_DONE);
}
/*
@@ -1464,10 +1473,10 @@ igmp_input(struct mbuf *m, int off)
minlen += IGMP_V3_QUERY_MINLEN;
else
minlen += IGMP_MINLEN;
- if ((m->m_flags & M_EXT || m->m_len < minlen) &&
- (m = m_pullup(m, minlen)) == 0) {
+ if ((!M_WRITABLE(m) || m->m_len < minlen) &&
+ (m = m_pullup(m, minlen)) == NULL) {
IGMPSTAT_INC(igps_rcv_tooshort);
- return;
+ return (IPPROTO_DONE);
}
ip = mtod(m, struct ip *);
@@ -1480,7 +1489,7 @@ igmp_input(struct mbuf *m, int off)
if (in_cksum(m, igmplen)) {
IGMPSTAT_INC(igps_rcv_badsum);
m_freem(m);
- return;
+ return (IPPROTO_DONE);
}
m->m_data -= iphlen;
m->m_len += iphlen;
@@ -1493,7 +1502,7 @@ igmp_input(struct mbuf *m, int off)
if (igmp->igmp_type != IGMP_DVMRP && ip->ip_ttl != 1) {
IGMPSTAT_INC(igps_rcv_badttl);
m_freem(m);
- return;
+ return (IPPROTO_DONE);
}
switch (igmp->igmp_type) {
@@ -1508,7 +1517,7 @@ igmp_input(struct mbuf *m, int off)
} else {
IGMPSTAT_INC(igps_rcv_tooshort);
m_freem(m);
- return;
+ return (IPPROTO_DONE);
}
switch (queryver) {
@@ -1518,7 +1527,7 @@ igmp_input(struct mbuf *m, int off)
break;
if (igmp_input_v1_query(ifp, ip, igmp) != 0) {
m_freem(m);
- return;
+ return (IPPROTO_DONE);
}
break;
@@ -1528,7 +1537,7 @@ igmp_input(struct mbuf *m, int off)
break;
if (igmp_input_v2_query(ifp, ip, igmp) != 0) {
m_freem(m);
- return;
+ return (IPPROTO_DONE);
}
break;
@@ -1546,25 +1555,25 @@ igmp_input(struct mbuf *m, int off)
if (nsrc * sizeof(in_addr_t) >
UINT16_MAX - iphlen - IGMP_V3_QUERY_MINLEN) {
IGMPSTAT_INC(igps_rcv_tooshort);
- return;
+ return (IPPROTO_DONE);
}
/*
* m_pullup() may modify m, so pullup in
* this scope.
*/
igmpv3len = iphlen + IGMP_V3_QUERY_MINLEN +
- sizeof(struct in_addr) * nsrc;
- if ((m->m_flags & M_EXT ||
+ sizeof(struct in_addr) * nsrc;
+ if ((!M_WRITABLE(m) ||
m->m_len < igmpv3len) &&
(m = m_pullup(m, igmpv3len)) == NULL) {
IGMPSTAT_INC(igps_rcv_tooshort);
- return;
+ return (IPPROTO_DONE);
}
igmpv3 = (struct igmpv3 *)(mtod(m, uint8_t *)
+ iphlen);
if (igmp_input_v3_query(ifp, ip, igmpv3) != 0) {
m_freem(m);
- return;
+ return (IPPROTO_DONE);
}
}
break;
@@ -1576,7 +1585,7 @@ igmp_input(struct mbuf *m, int off)
break;
if (igmp_input_v1_report(ifp, ip, igmp) != 0) {
m_freem(m);
- return;
+ return (IPPROTO_DONE);
}
break;
@@ -1587,7 +1596,7 @@ igmp_input(struct mbuf *m, int off)
IGMPSTAT_INC(igps_rcv_nora);
if (igmp_input_v2_report(ifp, ip, igmp) != 0) {
m_freem(m);
- return;
+ return (IPPROTO_DONE);
}
break;
@@ -1608,7 +1617,8 @@ igmp_input(struct mbuf *m, int off)
* Pass all valid IGMP packets up to any process(es) listening on a
* raw IGMP socket.
*/
- rip_input(m, off);
+ *mp = m;
+ return (rip_input(mp, offp, proto));
}
@@ -1639,10 +1649,10 @@ igmp_fasttimo(void)
static void
igmp_fasttimo_vnet(void)
{
- struct ifqueue scq; /* State-change packets */
- struct ifqueue qrq; /* Query response packets */
+ struct mbufq scq; /* State-change packets */
+ struct mbufq qrq; /* Query response packets */
struct ifnet *ifp;
- struct igmp_ifinfo *igi;
+ struct igmp_ifsoftc *igi;
struct ifmultiaddr *ifma;
struct in_multi *inm;
int loop, uri_fasthz;
@@ -1701,12 +1711,8 @@ igmp_fasttimo_vnet(void)
loop = (igi->igi_flags & IGIF_LOOPBACK) ? 1 : 0;
uri_fasthz = IGMP_RANDOM_DELAY(igi->igi_uri *
PR_FASTHZ);
-
- memset(&qrq, 0, sizeof(struct ifqueue));
- IFQ_SET_MAXLEN(&qrq, IGMP_MAX_G_GS_PACKETS);
-
- memset(&scq, 0, sizeof(struct ifqueue));
- IFQ_SET_MAXLEN(&scq, IGMP_MAX_STATE_CHANGE_PACKETS);
+ mbufq_init(&qrq, IGMP_MAX_G_GS_PACKETS);
+ mbufq_init(&scq, IGMP_MAX_STATE_CHANGE_PACKETS);
}
IF_ADDR_RLOCK(ifp);
@@ -1804,8 +1810,8 @@ igmp_v1v2_process_group_timer(struct in_multi *inm, const int version)
* Note: Unlocked read from igi.
*/
static void
-igmp_v3_process_group_timers(struct igmp_ifinfo *igi,
- struct ifqueue *qrq, struct ifqueue *scq,
+igmp_v3_process_group_timers(struct igmp_ifsoftc *igi,
+ struct mbufq *qrq, struct mbufq *scq,
struct in_multi *inm, const int uri_fasthz)
{
int query_response_timer_expired;
@@ -1951,7 +1957,7 @@ igmp_v3_suppress_group_record(struct in_multi *inm)
* as per Section 7.2.1.
*/
static void
-igmp_set_version(struct igmp_ifinfo *igi, const int version)
+igmp_set_version(struct igmp_ifsoftc *igi, const int version)
{
int old_version_timer;
@@ -2000,7 +2006,7 @@ igmp_set_version(struct igmp_ifinfo *igi, const int version)
* query processing.
*/
static void
-igmp_v3_cancel_link_timers(struct igmp_ifinfo *igi)
+igmp_v3_cancel_link_timers(struct igmp_ifsoftc *igi)
{
struct ifmultiaddr *ifma;
struct ifnet *ifp;
@@ -2067,7 +2073,7 @@ igmp_v3_cancel_link_timers(struct igmp_ifinfo *igi)
*/
inm->inm_sctimer = 0;
inm->inm_timer = 0;
- _IF_DRAIN(&inm->inm_scq);
+ mbufq_drain(&inm->inm_scq);
}
IF_ADDR_RUNLOCK(ifp);
SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead, inm_nrele, tinm) {
@@ -2081,7 +2087,7 @@ igmp_v3_cancel_link_timers(struct igmp_ifinfo *igi)
* See Section 7.2.1 of RFC 3376.
*/
static void
-igmp_v1v2_process_querier_timers(struct igmp_ifinfo *igi)
+igmp_v1v2_process_querier_timers(struct igmp_ifsoftc *igi)
{
IGMP_LOCK_ASSERT();
@@ -2122,6 +2128,7 @@ igmp_v1v2_process_querier_timers(struct igmp_ifinfo *igi)
__func__, igi->igi_version, IGMP_VERSION_2,
igi->igi_ifp, igi->igi_ifp->if_xname);
igi->igi_version = IGMP_VERSION_2;
+ igmp_v3_cancel_link_timers(igi);
}
}
} else if (igi->igi_v1_timer > 0) {
@@ -2176,7 +2183,7 @@ igmp_slowtimo(void)
static void
igmp_slowtimo_vnet(void)
{
- struct igmp_ifinfo *igi;
+ struct igmp_ifsoftc *igi;
IGMP_LOCK();
@@ -2204,10 +2211,10 @@ igmp_v1v2_queue_report(struct in_multi *inm, const int type)
ifp = inm->inm_ifp;
- MGETHDR(m, M_DONTWAIT, MT_DATA);
+ m = m_gethdr(M_NOWAIT, MT_DATA);
if (m == NULL)
return (ENOMEM);
- MH_ALIGN(m, sizeof(struct ip) + sizeof(struct igmp));
+ M_ALIGN(m, sizeof(struct ip) + sizeof(struct igmp));
m->m_pkthdr.len = sizeof(struct ip) + sizeof(struct igmp);
@@ -2226,7 +2233,7 @@ igmp_v1v2_queue_report(struct in_multi *inm, const int type)
ip = mtod(m, struct ip *);
ip->ip_tos = 0;
- ip->ip_len = sizeof(struct ip) + sizeof(struct igmp);
+ ip->ip_len = htons(sizeof(struct ip) + sizeof(struct igmp));
ip->ip_off = 0;
ip->ip_p = IPPROTO_IGMP;
ip->ip_src.s_addr = INADDR_ANY;
@@ -2272,7 +2279,7 @@ igmp_v1v2_queue_report(struct in_multi *inm, const int type)
int
igmp_change_state(struct in_multi *inm)
{
- struct igmp_ifinfo *igi;
+ struct igmp_ifsoftc *igi;
struct ifnet *ifp;
int error;
@@ -2295,7 +2302,7 @@ igmp_change_state(struct in_multi *inm)
IGMP_LOCK();
igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
- KASSERT(igi != NULL, ("%s: no igmp_ifinfo for ifp %p", __func__, ifp));
+ KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp));
/*
* If we detect a state transition to or from MCAST_UNDEFINED
@@ -2336,10 +2343,10 @@ out_locked:
* initial state of the membership.
*/
static int
-igmp_initial_join(struct in_multi *inm, struct igmp_ifinfo *igi)
+igmp_initial_join(struct in_multi *inm, struct igmp_ifsoftc *igi)
{
struct ifnet *ifp;
- struct ifqueue *ifq;
+ struct mbufq *mq;
int error, retval, syncstates;
CTR4(KTR_IGMPV3, "%s: initial join %s on ifp %p(%s)",
@@ -2413,9 +2420,9 @@ igmp_initial_join(struct in_multi *inm, struct igmp_ifinfo *igi)
* Don't kick the timers if there is nothing to do,
* or if an error occurred.
*/
- ifq = &inm->inm_scq;
- _IF_DRAIN(ifq);
- retval = igmp_v3_enqueue_group_record(ifq, inm, 1,
+ mq = &inm->inm_scq;
+ mbufq_drain(mq);
+ retval = igmp_v3_enqueue_group_record(mq, inm, 1,
0, 0);
CTR2(KTR_IGMPV3, "%s: enqueue record = %d",
__func__, retval);
@@ -2464,7 +2471,7 @@ igmp_initial_join(struct in_multi *inm, struct igmp_ifinfo *igi)
* Issue an intermediate state change during the IGMP life-cycle.
*/
static int
-igmp_handle_state_change(struct in_multi *inm, struct igmp_ifinfo *igi)
+igmp_handle_state_change(struct in_multi *inm, struct igmp_ifsoftc *igi)
{
struct ifnet *ifp;
int retval;
@@ -2495,7 +2502,7 @@ igmp_handle_state_change(struct in_multi *inm, struct igmp_ifinfo *igi)
return (0);
}
- _IF_DRAIN(&inm->inm_scq);
+ mbufq_drain(&inm->inm_scq);
retval = igmp_v3_enqueue_group_record(&inm->inm_scq, inm, 1, 0, 0);
CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, retval);
@@ -2523,7 +2530,7 @@ igmp_handle_state_change(struct in_multi *inm, struct igmp_ifinfo *igi)
* to INCLUDE {} for immediate transmission.
*/
static void
-igmp_final_leave(struct in_multi *inm, struct igmp_ifinfo *igi)
+igmp_final_leave(struct in_multi *inm, struct igmp_ifsoftc *igi)
{
int syncstates;
@@ -2564,7 +2571,7 @@ igmp_final_leave(struct in_multi *inm, struct igmp_ifinfo *igi)
* TO_IN {} to be sent on the next fast timeout,
* giving us an opportunity to merge reports.
*/
- _IF_DRAIN(&inm->inm_scq);
+ mbufq_drain(&inm->inm_scq);
inm->inm_timer = 0;
if (igi->igi_flags & IGIF_LOOPBACK) {
inm->inm_scrv = 1;
@@ -2642,7 +2649,7 @@ igmp_final_leave(struct in_multi *inm, struct igmp_ifinfo *igi)
* no record(s) were appended.
*/
static int
-igmp_v3_enqueue_group_record(struct ifqueue *ifq, struct in_multi *inm,
+igmp_v3_enqueue_group_record(struct mbufq *mq, struct in_multi *inm,
const int is_state_change, const int is_group_query,
const int is_source_query)
{
@@ -2732,7 +2739,7 @@ igmp_v3_enqueue_group_record(struct ifqueue *ifq, struct in_multi *inm,
* Generate the filter list changes using a separate function.
*/
if (is_filter_list_change)
- return (igmp_v3_enqueue_filter_change(ifq, inm));
+ return (igmp_v3_enqueue_filter_change(mq, inm));
if (type == IGMP_DO_NOTHING) {
CTR3(KTR_IGMPV3, "%s: nothing to do for %s/%s",
@@ -2762,7 +2769,7 @@ igmp_v3_enqueue_group_record(struct ifqueue *ifq, struct in_multi *inm,
* Note: Group records for G/GSR query responses MUST be sent
* in their own packet.
*/
- m0 = ifq->ifq_tail;
+ m0 = mbufq_last(mq);
if (!is_group_query &&
m0 != NULL &&
(m0->m_pkthdr.PH_vt.vt_nrecs + 1 <= IGMP_V3_REPORT_MAXRECS) &&
@@ -2773,7 +2780,7 @@ igmp_v3_enqueue_group_record(struct ifqueue *ifq, struct in_multi *inm,
m = m0;
CTR1(KTR_IGMPV3, "%s: use existing packet", __func__);
} else {
- if (_IF_QFULL(ifq)) {
+ if (mbufq_full(mq)) {
CTR1(KTR_IGMPV3, "%s: outbound queue full", __func__);
return (-ENOMEM);
}
@@ -2781,14 +2788,14 @@ igmp_v3_enqueue_group_record(struct ifqueue *ifq, struct in_multi *inm,
m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE -
sizeof(struct igmp_grouprec)) / sizeof(in_addr_t);
if (!is_state_change && !is_group_query) {
- m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
+ m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
if (m)
m->m_data += IGMP_LEADINGSPACE;
}
if (m == NULL) {
- m = m_gethdr(M_DONTWAIT, MT_DATA);
+ m = m_gethdr(M_NOWAIT, MT_DATA);
if (m)
- MH_ALIGN(m, IGMP_LEADINGSPACE);
+ M_ALIGN(m, IGMP_LEADINGSPACE);
}
if (m == NULL)
return (-ENOMEM);
@@ -2886,7 +2893,7 @@ igmp_v3_enqueue_group_record(struct ifqueue *ifq, struct in_multi *inm,
if (m != m0) {
CTR1(KTR_IGMPV3, "%s: enqueueing first packet", __func__);
m->m_pkthdr.PH_vt.vt_nrecs = 1;
- _IF_ENQUEUE(ifq, m);
+ mbufq_enqueue(mq, m);
} else
m->m_pkthdr.PH_vt.vt_nrecs++;
@@ -2902,17 +2909,17 @@ igmp_v3_enqueue_group_record(struct ifqueue *ifq, struct in_multi *inm,
* Always try for a cluster first.
*/
while (nims != NULL) {
- if (_IF_QFULL(ifq)) {
+ if (mbufq_full(mq)) {
CTR1(KTR_IGMPV3, "%s: outbound queue full", __func__);
return (-ENOMEM);
}
- m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
+ m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
if (m)
m->m_data += IGMP_LEADINGSPACE;
if (m == NULL) {
- m = m_gethdr(M_DONTWAIT, MT_DATA);
+ m = m_gethdr(M_NOWAIT, MT_DATA);
if (m)
- MH_ALIGN(m, IGMP_LEADINGSPACE);
+ M_ALIGN(m, IGMP_LEADINGSPACE);
}
if (m == NULL)
return (-ENOMEM);
@@ -2965,7 +2972,7 @@ igmp_v3_enqueue_group_record(struct ifqueue *ifq, struct in_multi *inm,
nbytes += (msrcs * sizeof(in_addr_t));
CTR1(KTR_IGMPV3, "%s: enqueueing next packet", __func__);
- _IF_ENQUEUE(ifq, m);
+ mbufq_enqueue(mq, m);
}
return (nbytes);
@@ -3005,7 +3012,7 @@ typedef enum {
* no record(s) were appended.
*/
static int
-igmp_v3_enqueue_filter_change(struct ifqueue *ifq, struct in_multi *inm)
+igmp_v3_enqueue_filter_change(struct mbufq *mq, struct in_multi *inm)
{
static const int MINRECLEN =
sizeof(struct igmp_grouprec) + sizeof(in_addr_t);
@@ -3049,7 +3056,7 @@ igmp_v3_enqueue_filter_change(struct ifqueue *ifq, struct in_multi *inm)
*/
while (drt != REC_FULL) {
do {
- m0 = ifq->ifq_tail;
+ m0 = mbufq_last(mq);
if (m0 != NULL &&
(m0->m_pkthdr.PH_vt.vt_nrecs + 1 <=
IGMP_V3_REPORT_MAXRECS) &&
@@ -3062,13 +3069,13 @@ igmp_v3_enqueue_filter_change(struct ifqueue *ifq, struct in_multi *inm)
CTR1(KTR_IGMPV3,
"%s: use previous packet", __func__);
} else {
- m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
+ m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
if (m)
m->m_data += IGMP_LEADINGSPACE;
if (m == NULL) {
- m = m_gethdr(M_DONTWAIT, MT_DATA);
+ m = m_gethdr(M_NOWAIT, MT_DATA);
if (m)
- MH_ALIGN(m, IGMP_LEADINGSPACE);
+ M_ALIGN(m, IGMP_LEADINGSPACE);
}
if (m == NULL) {
CTR1(KTR_IGMPV3,
@@ -3196,7 +3203,7 @@ igmp_v3_enqueue_filter_change(struct ifqueue *ifq, struct in_multi *inm)
*/
m->m_pkthdr.PH_vt.vt_nrecs++;
if (m != m0)
- _IF_ENQUEUE(ifq, m);
+ mbufq_enqueue(mq, m);
nbytes += npbytes;
} while (nims != NULL);
drt |= crt;
@@ -3210,9 +3217,9 @@ igmp_v3_enqueue_filter_change(struct ifqueue *ifq, struct in_multi *inm)
}
static int
-igmp_v3_merge_state_changes(struct in_multi *inm, struct ifqueue *ifscq)
+igmp_v3_merge_state_changes(struct in_multi *inm, struct mbufq *scq)
{
- struct ifqueue *gq;
+ struct mbufq *gq;
struct mbuf *m; /* pending state-change */
struct mbuf *m0; /* copy of pending state-change */
struct mbuf *mt; /* last state-change in packet */
@@ -3235,13 +3242,13 @@ igmp_v3_merge_state_changes(struct in_multi *inm, struct ifqueue *ifscq)
gq = &inm->inm_scq;
#ifdef KTR
- if (gq->ifq_head == NULL) {
+ if (mbufq_first(gq) == NULL) {
CTR2(KTR_IGMPV3, "%s: WARNING: queue for inm %p is empty",
__func__, inm);
}
#endif
- m = gq->ifq_head;
+ m = mbufq_first(gq);
while (m != NULL) {
/*
* Only merge the report into the current packet if
@@ -3252,7 +3259,7 @@ igmp_v3_merge_state_changes(struct in_multi *inm, struct ifqueue *ifscq)
* allocated clusters.
*/
domerge = 0;
- mt = ifscq->ifq_tail;
+ mt = mbufq_last(scq);
if (mt != NULL) {
recslen = m_length(m, NULL);
@@ -3264,7 +3271,7 @@ igmp_v3_merge_state_changes(struct in_multi *inm, struct ifqueue *ifscq)
domerge = 1;
}
- if (!domerge && _IF_QFULL(gq)) {
+ if (!domerge && mbufq_full(gq)) {
CTR2(KTR_IGMPV3,
"%s: outbound queue full, skipping whole packet %p",
__func__, m);
@@ -3277,7 +3284,7 @@ igmp_v3_merge_state_changes(struct in_multi *inm, struct ifqueue *ifscq)
if (!docopy) {
CTR2(KTR_IGMPV3, "%s: dequeueing %p", __func__, m);
- _IF_DEQUEUE(gq, m0);
+ m0 = mbufq_dequeue(gq);
m = m0->m_nextpkt;
} else {
CTR2(KTR_IGMPV3, "%s: copying %p", __func__, m);
@@ -3289,13 +3296,13 @@ igmp_v3_merge_state_changes(struct in_multi *inm, struct ifqueue *ifscq)
}
if (!domerge) {
- CTR3(KTR_IGMPV3, "%s: queueing %p to ifscq %p)",
- __func__, m0, ifscq);
- _IF_ENQUEUE(ifscq, m0);
+ CTR3(KTR_IGMPV3, "%s: queueing %p to scq %p)",
+ __func__, m0, scq);
+ mbufq_enqueue(scq, m0);
} else {
struct mbuf *mtl; /* last mbuf of packet mt */
- CTR3(KTR_IGMPV3, "%s: merging %p with ifscq tail %p)",
+ CTR3(KTR_IGMPV3, "%s: merging %p with scq tail %p)",
__func__, m0, mt);
mtl = m_last(mt);
@@ -3315,7 +3322,7 @@ igmp_v3_merge_state_changes(struct in_multi *inm, struct ifqueue *ifscq)
* Respond to a pending IGMPv3 General Query.
*/
static void
-igmp_v3_dispatch_general_query(struct igmp_ifinfo *igi)
+igmp_v3_dispatch_general_query(struct igmp_ifsoftc *igi)
{
struct ifmultiaddr *ifma;
struct ifnet *ifp;
@@ -3328,6 +3335,15 @@ igmp_v3_dispatch_general_query(struct igmp_ifinfo *igi)
KASSERT(igi->igi_version == IGMP_VERSION_3,
("%s: called when version %d", __func__, igi->igi_version));
+ /*
+ * Check that there are some packets queued. If so, send them first.
+ * For large number of groups the reply to general query can take
+ * many packets, we should finish sending them before starting of
+ * queuing the new reply.
+ */
+ if (mbufq_len(&igi->igi_gq) != 0)
+ goto send;
+
ifp = igi->igi_ifp;
IF_ADDR_RLOCK(ifp);
@@ -3363,13 +3379,14 @@ igmp_v3_dispatch_general_query(struct igmp_ifinfo *igi)
}
IF_ADDR_RUNLOCK(ifp);
+send:
loop = (igi->igi_flags & IGIF_LOOPBACK) ? 1 : 0;
igmp_dispatch_queue(&igi->igi_gq, IGMP_MAX_RESPONSE_BURST, loop);
/*
* Slew transmission of bursts over 500ms intervals.
*/
- if (igi->igi_gq.ifq_head != NULL) {
+ if (mbufq_first(&igi->igi_gq) != NULL) {
igi->igi_v3_timer = 1 + IGMP_RANDOM_DELAY(
IGMP_RESPONSE_BURST_INTERVAL);
V_interface_timers_running = 1;
@@ -3403,7 +3420,7 @@ igmp_intr(struct mbuf *m)
* indexes to guard against interface detach, they are
* unique to each VIMAGE and must be retrieved.
*/
- CURVNET_SET((struct vnet *)(m->m_pkthdr.header));
+ CURVNET_SET((struct vnet *)(m->m_pkthdr.PH_loc.ptr));
ifindex = igmp_restore_context(m);
/*
@@ -3450,7 +3467,7 @@ igmp_intr(struct mbuf *m)
}
igmp_scrub_context(m0);
- m->m_flags &= ~(M_PROTOFLAGS);
+ m_clrprotoflags(m);
m0->m_pkthdr.rcvif = V_loif;
#ifdef MAC
mac_netinet_igmp_send(ifp, m0);
@@ -3485,6 +3502,7 @@ out:
static struct mbuf *
igmp_v3_encap_report(struct ifnet *ifp, struct mbuf *m)
{
+ struct rm_priotracker in_ifa_tracker;
struct igmp_report *igmp;
struct ip *ip;
int hdrlen, igmpreclen;
@@ -3498,7 +3516,7 @@ igmp_v3_encap_report(struct ifnet *ifp, struct mbuf *m)
if (m->m_flags & M_IGMPV3_HDR) {
igmpreclen -= hdrlen;
} else {
- M_PREPEND(m, hdrlen, M_DONTWAIT);
+ M_PREPEND(m, hdrlen, M_NOWAIT);
if (m == NULL)
return (NULL);
m->m_flags |= M_IGMPV3_HDR;
@@ -3523,8 +3541,8 @@ igmp_v3_encap_report(struct ifnet *ifp, struct mbuf *m)
ip = mtod(m, struct ip *);
ip->ip_tos = IPTOS_PREC_INTERNETCONTROL;
- ip->ip_len = hdrlen + igmpreclen;
- ip->ip_off = IP_DF;
+ ip->ip_len = htons(hdrlen + igmpreclen);
+ ip->ip_off = htons(IP_DF);
ip->ip_p = IPPROTO_IGMP;
ip->ip_sum = 0;
@@ -3533,7 +3551,7 @@ igmp_v3_encap_report(struct ifnet *ifp, struct mbuf *m)
if (m->m_flags & M_IGMP_LOOP) {
struct in_ifaddr *ia;
- IFP_TO_IA(ifp, ia);
+ IFP_TO_IA(ifp, ia, &in_ifa_tracker);
if (ia != NULL) {
ip->ip_src = ia->ia_addr.sin_addr;
ifa_free(&ia->ia_ifa);
@@ -3576,70 +3594,82 @@ igmp_rec_type_to_str(const int type)
}
#endif
+#ifdef VIMAGE
static void
-igmp_init(void *unused __unused)
+vnet_igmp_init(const void *unused __unused)
{
- CTR1(KTR_IGMPV3, "%s: initializing", __func__);
-
- IGMP_LOCK_INIT();
-
- m_raopt = igmp_ra_alloc();
-
- netisr_register(&igmp_nh);
+ netisr_register_vnet(&igmp_nh);
}
-SYSINIT(igmp_init, SI_SUB_PSEUDO, SI_ORDER_MIDDLE, igmp_init, NULL);
+VNET_SYSINIT(vnet_igmp_init, SI_SUB_PROTO_MC, SI_ORDER_ANY,
+ vnet_igmp_init, NULL);
static void
-igmp_uninit(void *unused __unused)
+vnet_igmp_uninit(const void *unused __unused)
{
+ /* This can happen when we shutdown the entire network stack. */
CTR1(KTR_IGMPV3, "%s: tearing down", __func__);
- netisr_unregister(&igmp_nh);
-
- m_free(m_raopt);
- m_raopt = NULL;
-
- IGMP_LOCK_DESTROY();
+ netisr_unregister_vnet(&igmp_nh);
}
-SYSUNINIT(igmp_uninit, SI_SUB_PSEUDO, SI_ORDER_MIDDLE, igmp_uninit, NULL);
+VNET_SYSUNINIT(vnet_igmp_uninit, SI_SUB_PROTO_MC, SI_ORDER_ANY,
+ vnet_igmp_uninit, NULL);
+#endif
-static void
-vnet_igmp_init(const void *unused __unused)
+#ifdef DDB
+DB_SHOW_COMMAND(igi_list, db_show_igi_list)
{
+ struct igmp_ifsoftc *igi, *tigi;
+ LIST_HEAD(_igi_list, igmp_ifsoftc) *igi_head;
- CTR1(KTR_IGMPV3, "%s: initializing", __func__);
-
- LIST_INIT(&V_igi_head);
-}
-VNET_SYSINIT(vnet_igmp_init, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_igmp_init,
- NULL);
-
-static void
-vnet_igmp_uninit(const void *unused __unused)
-{
-
- CTR1(KTR_IGMPV3, "%s: tearing down", __func__);
-
- KASSERT(LIST_EMPTY(&V_igi_head),
- ("%s: igi list not empty; ifnets not detached?", __func__));
+ if (!have_addr) {
+ db_printf("usage: show igi_list <addr>\n");
+ return;
+ }
+ igi_head = (struct _igi_list *)addr;
+
+ LIST_FOREACH_SAFE(igi, igi_head, igi_link, tigi) {
+ db_printf("igmp_ifsoftc %p:\n", igi);
+ db_printf(" ifp %p\n", igi->igi_ifp);
+ db_printf(" version %u\n", igi->igi_version);
+ db_printf(" v1_timer %u\n", igi->igi_v1_timer);
+ db_printf(" v2_timer %u\n", igi->igi_v2_timer);
+ db_printf(" v3_timer %u\n", igi->igi_v3_timer);
+ db_printf(" flags %#x\n", igi->igi_flags);
+ db_printf(" rv %u\n", igi->igi_rv);
+ db_printf(" qi %u\n", igi->igi_qi);
+ db_printf(" qri %u\n", igi->igi_qri);
+ db_printf(" uri %u\n", igi->igi_uri);
+ /* SLIST_HEAD(,in_multi) igi_relinmhead */
+ /* struct mbufq igi_gq; */
+ db_printf("\n");
+ }
}
-VNET_SYSUNINIT(vnet_igmp_uninit, SI_SUB_PSEUDO, SI_ORDER_ANY,
- vnet_igmp_uninit, NULL);
+#endif
static int
igmp_modevent(module_t mod, int type, void *unused __unused)
{
- switch (type) {
- case MOD_LOAD:
- case MOD_UNLOAD:
- break;
- default:
- return (EOPNOTSUPP);
- }
- return (0);
+ switch (type) {
+ case MOD_LOAD:
+ CTR1(KTR_IGMPV3, "%s: initializing", __func__);
+ IGMP_LOCK_INIT();
+ m_raopt = igmp_ra_alloc();
+ netisr_register(&igmp_nh);
+ break;
+ case MOD_UNLOAD:
+ CTR1(KTR_IGMPV3, "%s: tearing down", __func__);
+ netisr_unregister(&igmp_nh);
+ m_free(m_raopt);
+ m_raopt = NULL;
+ IGMP_LOCK_DESTROY();
+ break;
+ default:
+ return (EOPNOTSUPP);
+ }
+ return (0);
}
static moduledata_t igmp_mod = {
@@ -3647,4 +3677,4 @@ static moduledata_t igmp_mod = {
igmp_modevent,
0
};
-DECLARE_MODULE(igmp, igmp_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
+DECLARE_MODULE(igmp, igmp_mod, SI_SUB_PROTO_MC, SI_ORDER_MIDDLE);
diff --git a/freebsd/sys/netinet/igmp_var.h b/freebsd/sys/netinet/igmp_var.h
index ca17158f..5242d07d 100644
--- a/freebsd/sys/netinet/igmp_var.h
+++ b/freebsd/sys/netinet/igmp_var.h
@@ -46,24 +46,6 @@
* MULTICAST Revision: 3.5.1.3
*/
-#ifndef BURN_BRIDGES
-/*
- * Pre-IGMPV3 igmpstat structure.
- */
-struct oigmpstat {
- u_int igps_rcv_total; /* total IGMP messages received */
- u_int igps_rcv_tooshort; /* received with too few bytes */
- u_int igps_rcv_badsum; /* received with bad checksum */
- u_int igps_rcv_queries; /* received membership queries */
- u_int igps_rcv_badqueries; /* received invalid queries */
- u_int igps_rcv_reports; /* received membership reports */
- u_int igps_rcv_badreports; /* received invalid reports */
- u_int igps_rcv_ourreports; /* received reports for our groups */
- u_int igps_snd_reports; /* sent membership reports */
- u_int igps_rcv_toolong; /* received with too many bytes */
-};
-#endif
-
/*
* IGMPv3 protocol statistics.
*/
@@ -105,19 +87,16 @@ struct igmpstat {
};
#define IGPS_VERSION_3 3 /* as of FreeBSD 8.x */
#define IGPS_VERSION3_LEN 168
-
-#ifdef _KERNEL
-#define IGMPSTAT_ADD(name, val) V_igmpstat.name += (val)
-#define IGMPSTAT_INC(name) IGMPSTAT_ADD(name, 1)
-#endif
-
#ifdef CTASSERT
-CTASSERT(sizeof(struct igmpstat) == 168);
+CTASSERT(sizeof(struct igmpstat) == IGPS_VERSION3_LEN);
#endif
-#ifdef _KERNEL
-#define IGMP_RANDOM_DELAY(X) (random() % (X) + 1)
+/*
+ * Identifiers for IGMP sysctl nodes
+ */
+#define IGMPCTL_STATS 1 /* statistics (read-only) */
+#define IGMP_RANDOM_DELAY(X) (random() % (X) + 1)
#define IGMP_MAX_STATE_CHANGES 24 /* Max pending changes per group */
/*
@@ -186,6 +165,27 @@ CTASSERT(sizeof(struct igmpstat) == 168);
(sizeof(struct ip) + RAOPT_LEN + sizeof(struct igmp_report))
/*
+ * Structure returned by net.inet.igmp.ifinfo sysctl.
+ */
+struct igmp_ifinfo {
+ uint32_t igi_version; /* IGMPv3 Host Compatibility Mode */
+ uint32_t igi_v1_timer; /* IGMPv1 Querier Present timer (s) */
+ uint32_t igi_v2_timer; /* IGMPv2 Querier Present timer (s) */
+ uint32_t igi_v3_timer; /* IGMPv3 General Query (interface) timer (s)*/
+ uint32_t igi_flags; /* IGMP per-interface flags */
+#define IGIF_SILENT 0x00000001 /* Do not use IGMP on this ifp */
+#define IGIF_LOOPBACK 0x00000002 /* Send IGMP reports to loopback */
+ uint32_t igi_rv; /* IGMPv3 Robustness Variable */
+ uint32_t igi_qi; /* IGMPv3 Query Interval (s) */
+ uint32_t igi_qri; /* IGMPv3 Query Response Interval (s) */
+ uint32_t igi_uri; /* IGMPv3 Unsolicited Report Interval (s) */
+};
+
+#ifdef _KERNEL
+#define IGMPSTAT_ADD(name, val) V_igmpstat.name += (val)
+#define IGMPSTAT_INC(name) IGMPSTAT_ADD(name, 1)
+
+/*
* Subsystem lock macros.
* The IGMP lock is only taken with IGMP. Currently it is system-wide.
* VIMAGE: The lock could be pushed to per-VIMAGE granularity in future.
@@ -197,29 +197,35 @@ CTASSERT(sizeof(struct igmpstat) == 168);
#define IGMP_UNLOCK() mtx_unlock(&igmp_mtx)
#define IGMP_UNLOCK_ASSERT() mtx_assert(&igmp_mtx, MA_NOTOWNED)
-struct igmp_ifinfo;
+/*
+ * Per-interface IGMP router version information.
+ */
+struct igmp_ifsoftc {
+ LIST_ENTRY(igmp_ifsoftc) igi_link;
+ struct ifnet *igi_ifp; /* pointer back to interface */
+ uint32_t igi_version; /* IGMPv3 Host Compatibility Mode */
+ uint32_t igi_v1_timer; /* IGMPv1 Querier Present timer (s) */
+ uint32_t igi_v2_timer; /* IGMPv2 Querier Present timer (s) */
+ uint32_t igi_v3_timer; /* IGMPv3 General Query (interface) timer (s)*/
+ uint32_t igi_flags; /* IGMP per-interface flags */
+ uint32_t igi_rv; /* IGMPv3 Robustness Variable */
+ uint32_t igi_qi; /* IGMPv3 Query Interval (s) */
+ uint32_t igi_qri; /* IGMPv3 Query Response Interval (s) */
+ uint32_t igi_uri; /* IGMPv3 Unsolicited Report Interval (s) */
+ SLIST_HEAD(,in_multi) igi_relinmhead; /* released groups */
+ struct mbufq igi_gq; /* general query responses queue */
+};
int igmp_change_state(struct in_multi *);
void igmp_fasttimo(void);
-struct igmp_ifinfo *
+struct igmp_ifsoftc *
igmp_domifattach(struct ifnet *);
void igmp_domifdetach(struct ifnet *);
void igmp_ifdetach(struct ifnet *);
-void igmp_input(struct mbuf *, int);
+int igmp_input(struct mbuf **, int *, int);
void igmp_slowtimo(void);
SYSCTL_DECL(_net_inet_igmp);
#endif /* _KERNEL */
-
-/*
- * Names for IGMP sysctl objects
- */
-#define IGMPCTL_STATS 1 /* statistics (read-only) */
-#define IGMPCTL_MAXID 2
-
-#define IGMPCTL_NAMES { \
- { 0, 0 }, \
- { "stats", CTLTYPE_STRUCT } \
-}
#endif
diff --git a/freebsd/sys/netinet/in.c b/freebsd/sys/netinet/in.c
index 653580c7..06b23973 100644
--- a/freebsd/sys/netinet/in.c
+++ b/freebsd/sys/netinet/in.c
@@ -38,6 +38,7 @@ __FBSDID("$FreeBSD$");
#include <rtems/bsd/local/opt_mpath.h>
#include <rtems/bsd/sys/param.h>
+#include <sys/eventhandler.h>
#include <sys/systm.h>
#include <sys/sockio.h>
#include <sys/malloc.h>
@@ -45,9 +46,12 @@ __FBSDID("$FreeBSD$");
#include <sys/socket.h>
#include <sys/jail.h>
#include <sys/kernel.h>
+#include <rtems/bsd/sys/lock.h>
#include <sys/proc.h>
+#include <sys/rmlock.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
+#include <sys/sx.h>
#include <net/if.h>
#include <net/if_var.h>
@@ -58,37 +62,33 @@ __FBSDID("$FreeBSD$");
#include <net/route.h>
#include <net/vnet.h>
+#include <netinet/if_ether.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
+#include <netinet/ip_carp.h>
#include <netinet/igmp_var.h>
#include <netinet/udp.h>
#include <netinet/udp_var.h>
-static int in_mask2len(struct in_addr *);
-static void in_len2mask(struct in_addr *, int);
-static int in_lifaddr_ioctl(struct socket *, u_long, caddr_t,
- struct ifnet *, struct thread *);
+static int in_aifaddr_ioctl(u_long, caddr_t, struct ifnet *, struct thread *);
+static int in_difaddr_ioctl(caddr_t, struct ifnet *, struct thread *);
-static int in_addprefix(struct in_ifaddr *, int);
-static int in_scrubprefix(struct in_ifaddr *, u_int);
static void in_socktrim(struct sockaddr_in *);
-static int in_ifinit(struct ifnet *,
- struct in_ifaddr *, struct sockaddr_in *, int);
static void in_purgemaddrs(struct ifnet *);
-static VNET_DEFINE(int, sameprefixcarponly);
-#define V_sameprefixcarponly VNET(sameprefixcarponly)
-SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, same_prefix_carp_only, CTLFLAG_RW,
- &VNET_NAME(sameprefixcarponly), 0,
+static VNET_DEFINE(int, nosameprefix);
+#define V_nosameprefix VNET(nosameprefix)
+SYSCTL_INT(_net_inet_ip, OID_AUTO, no_same_prefix, CTLFLAG_VNET | CTLFLAG_RW,
+ &VNET_NAME(nosameprefix), 0,
"Refuse to create same prefixes on different interfaces");
VNET_DECLARE(struct inpcbinfo, ripcbinfo);
#define V_ripcbinfo VNET(ripcbinfo)
-VNET_DECLARE(struct arpstat, arpstat); /* ARP statistics, see if_arp.h */
-#define V_arpstat VNET(arpstat)
+static struct sx in_control_sx;
+SX_SYSINIT(in_control_sx, &in_control_sx, "in_control");
/*
* Return 1 if an internet address is for a ``local'' host
@@ -97,17 +97,18 @@ VNET_DECLARE(struct arpstat, arpstat); /* ARP statistics, see if_arp.h */
int
in_localaddr(struct in_addr in)
{
+ struct rm_priotracker in_ifa_tracker;
register u_long i = ntohl(in.s_addr);
register struct in_ifaddr *ia;
- IN_IFADDR_RLOCK();
+ IN_IFADDR_RLOCK(&in_ifa_tracker);
TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
if ((i & ia->ia_subnetmask) == ia->ia_subnet) {
- IN_IFADDR_RUNLOCK();
+ IN_IFADDR_RUNLOCK(&in_ifa_tracker);
return (1);
}
}
- IN_IFADDR_RUNLOCK();
+ IN_IFADDR_RUNLOCK(&in_ifa_tracker);
return (0);
}
@@ -118,20 +119,69 @@ in_localaddr(struct in_addr in)
int
in_localip(struct in_addr in)
{
+ struct rm_priotracker in_ifa_tracker;
struct in_ifaddr *ia;
- IN_IFADDR_RLOCK();
+ IN_IFADDR_RLOCK(&in_ifa_tracker);
LIST_FOREACH(ia, INADDR_HASH(in.s_addr), ia_hash) {
if (IA_SIN(ia)->sin_addr.s_addr == in.s_addr) {
- IN_IFADDR_RUNLOCK();
+ IN_IFADDR_RUNLOCK(&in_ifa_tracker);
return (1);
}
}
- IN_IFADDR_RUNLOCK();
+ IN_IFADDR_RUNLOCK(&in_ifa_tracker);
return (0);
}
/*
+ * Return 1 if an internet address is configured on an interface.
+ */
+int
+in_ifhasaddr(struct ifnet *ifp, struct in_addr in)
+{
+ struct ifaddr *ifa;
+ struct in_ifaddr *ia;
+
+ IF_ADDR_RLOCK(ifp);
+ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+ if (ifa->ifa_addr->sa_family != AF_INET)
+ continue;
+ ia = (struct in_ifaddr *)ifa;
+ if (ia->ia_addr.sin_addr.s_addr == in.s_addr) {
+ IF_ADDR_RUNLOCK(ifp);
+ return (1);
+ }
+ }
+ IF_ADDR_RUNLOCK(ifp);
+
+ return (0);
+}
+
+/*
+ * Return a reference to the interface address which is different to
+ * the supplied one but with same IP address value.
+ */
+static struct in_ifaddr *
+in_localip_more(struct in_ifaddr *ia)
+{
+ struct rm_priotracker in_ifa_tracker;
+ in_addr_t in = IA_SIN(ia)->sin_addr.s_addr;
+ struct in_ifaddr *it;
+
+ IN_IFADDR_RLOCK(&in_ifa_tracker);
+ LIST_FOREACH(it, INADDR_HASH(in), ia_hash) {
+ if (it != ia && IA_SIN(it)->sin_addr.s_addr == in) {
+ ifa_ref(&it->ia_ifa);
+ IN_IFADDR_RUNLOCK(&in_ifa_tracker);
+ return (it);
+ }
+ }
+ IN_IFADDR_RUNLOCK(&in_ifa_tracker);
+
+ return (NULL);
+}
+
+/*
* Determine whether an IP address is in a reserved set of addresses
* that may not be forwarded, or whether datagrams to that destination
* may be forwarded.
@@ -169,793 +219,430 @@ in_socktrim(struct sockaddr_in *ap)
}
}
-static int
-in_mask2len(mask)
- struct in_addr *mask;
-{
- int x, y;
- u_char *p;
-
- p = (u_char *)mask;
- for (x = 0; x < sizeof(*mask); x++) {
- if (p[x] != 0xff)
- break;
- }
- y = 0;
- if (x < sizeof(*mask)) {
- for (y = 0; y < 8; y++) {
- if ((p[x] & (0x80 >> y)) == 0)
- break;
- }
- }
- return (x * 8 + y);
-}
-
-static void
-in_len2mask(struct in_addr *mask, int len)
-{
- int i;
- u_char *p;
-
- p = (u_char *)mask;
- bzero(mask, sizeof(*mask));
- for (i = 0; i < len / 8; i++)
- p[i] = 0xff;
- if (len % 8)
- p[i] = (0xff00 >> (len % 8)) & 0xff;
-}
-
/*
* Generic internet control operations (ioctl's).
- *
- * ifp is NULL if not an interface-specific ioctl.
*/
-/* ARGSUSED */
int
in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp,
struct thread *td)
{
- register struct ifreq *ifr = (struct ifreq *)data;
- register struct in_ifaddr *ia, *iap;
- register struct ifaddr *ifa;
- struct in_addr allhosts_addr;
- struct in_addr dst;
- struct in_ifinfo *ii;
- struct in_aliasreq *ifra = (struct in_aliasreq *)data;
- struct sockaddr_in oldaddr;
- int error, hostIsNew, iaIsNew, maskIsNew;
- int iaIsFirst;
+ struct ifreq *ifr = (struct ifreq *)data;
+ struct sockaddr_in *addr = (struct sockaddr_in *)&ifr->ifr_addr;
+ struct ifaddr *ifa;
+ struct in_ifaddr *ia;
+ int error;
- ia = NULL;
- iaIsFirst = 0;
- iaIsNew = 0;
- allhosts_addr.s_addr = htonl(INADDR_ALLHOSTS_GROUP);
+ if (ifp == NULL)
+ return (EADDRNOTAVAIL);
/*
- * Filter out ioctls we implement directly; forward the rest on to
- * in_lifaddr_ioctl() and ifp->if_ioctl().
+ * Filter out 4 ioctls we implement directly. Forward the rest
+ * to specific functions and ifp->if_ioctl().
*/
switch (cmd) {
- case SIOCAIFADDR:
- case SIOCDIFADDR:
case SIOCGIFADDR:
case SIOCGIFBRDADDR:
case SIOCGIFDSTADDR:
case SIOCGIFNETMASK:
+ break;
+ case SIOCDIFADDR:
+ sx_xlock(&in_control_sx);
+ error = in_difaddr_ioctl(data, ifp, td);
+ sx_xunlock(&in_control_sx);
+ return (error);
+#ifndef __rtems__
+ case OSIOCAIFADDR: /* 9.x compat */
+#endif /* __rtems__ */
+ case SIOCAIFADDR:
+ sx_xlock(&in_control_sx);
+ error = in_aifaddr_ioctl(cmd, data, ifp, td);
+ sx_xunlock(&in_control_sx);
+ return (error);
case SIOCSIFADDR:
case SIOCSIFBRDADDR:
case SIOCSIFDSTADDR:
case SIOCSIFNETMASK:
- break;
-
- case SIOCALIFADDR:
- if (td != NULL) {
- error = priv_check(td, PRIV_NET_ADDIFADDR);
- if (error)
- return (error);
- }
- if (ifp == NULL)
- return (EINVAL);
- return in_lifaddr_ioctl(so, cmd, data, ifp, td);
-
- case SIOCDLIFADDR:
- if (td != NULL) {
- error = priv_check(td, PRIV_NET_DELIFADDR);
- if (error)
- return (error);
- }
- if (ifp == NULL)
- return (EINVAL);
- return in_lifaddr_ioctl(so, cmd, data, ifp, td);
-
- case SIOCGLIFADDR:
- if (ifp == NULL)
- return (EINVAL);
- return in_lifaddr_ioctl(so, cmd, data, ifp, td);
-
+ /* We no longer support that old commands. */
+ return (EINVAL);
default:
- if (ifp == NULL || ifp->if_ioctl == NULL)
+ if (ifp->if_ioctl == NULL)
return (EOPNOTSUPP);
return ((*ifp->if_ioctl)(ifp, cmd, data));
}
- if (ifp == NULL)
+ if (addr->sin_addr.s_addr != INADDR_ANY &&
+ prison_check_ip4(td->td_ucred, &addr->sin_addr) != 0)
return (EADDRNOTAVAIL);
/*
- * Security checks before we get involved in any work.
- */
- switch (cmd) {
- case SIOCAIFADDR:
- case SIOCSIFADDR:
- case SIOCSIFBRDADDR:
- case SIOCSIFNETMASK:
- case SIOCSIFDSTADDR:
- if (td != NULL) {
- error = priv_check(td, PRIV_NET_ADDIFADDR);
- if (error)
- return (error);
- }
- break;
-
- case SIOCDIFADDR:
- if (td != NULL) {
- error = priv_check(td, PRIV_NET_DELIFADDR);
- if (error)
- return (error);
- }
- break;
- }
-
- /*
- * Find address for this interface, if it exists.
- *
- * If an alias address was specified, find that one instead of the
+ * Find address for this interface, if it exists. If an
+ * address was specified, find that one instead of the
* first one on the interface, if possible.
*/
- dst = ((struct sockaddr_in *)&ifr->ifr_addr)->sin_addr;
- IN_IFADDR_RLOCK();
- LIST_FOREACH(iap, INADDR_HASH(dst.s_addr), ia_hash) {
- if (iap->ia_ifp == ifp &&
- iap->ia_addr.sin_addr.s_addr == dst.s_addr) {
- if (td == NULL || prison_check_ip4(td->td_ucred,
- &dst) == 0)
- ia = iap;
+ IF_ADDR_RLOCK(ifp);
+ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+ if (ifa->ifa_addr->sa_family != AF_INET)
+ continue;
+ ia = (struct in_ifaddr *)ifa;
+ if (ia->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr)
break;
- }
}
- if (ia != NULL)
- ifa_ref(&ia->ia_ifa);
- IN_IFADDR_RUNLOCK();
- if (ia == NULL) {
- IF_ADDR_RLOCK(ifp);
- TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
- iap = ifatoia(ifa);
- if (iap->ia_addr.sin_family == AF_INET) {
- if (td != NULL &&
- prison_check_ip4(td->td_ucred,
- &iap->ia_addr.sin_addr) != 0)
- continue;
- ia = iap;
- break;
+ if (ifa == NULL)
+ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
+ if (ifa->ifa_addr->sa_family == AF_INET) {
+ ia = (struct in_ifaddr *)ifa;
+ if (prison_check_ip4(td->td_ucred,
+ &ia->ia_addr.sin_addr) == 0)
+ break;
}
- }
- if (ia != NULL)
- ifa_ref(&ia->ia_ifa);
+
+ if (ifa == NULL) {
IF_ADDR_RUNLOCK(ifp);
+ return (EADDRNOTAVAIL);
}
- if (ia == NULL)
- iaIsFirst = 1;
error = 0;
switch (cmd) {
- case SIOCAIFADDR:
- case SIOCDIFADDR:
- if (ifra->ifra_addr.sin_family == AF_INET) {
- struct in_ifaddr *oia;
-
- IN_IFADDR_RLOCK();
- for (oia = ia; ia; ia = TAILQ_NEXT(ia, ia_link)) {
- if (ia->ia_ifp == ifp &&
- ia->ia_addr.sin_addr.s_addr ==
- ifra->ifra_addr.sin_addr.s_addr)
- break;
- }
- if (ia != NULL && ia != oia)
- ifa_ref(&ia->ia_ifa);
- if (oia != NULL && ia != oia)
- ifa_free(&oia->ia_ifa);
- IN_IFADDR_RUNLOCK();
- if ((ifp->if_flags & IFF_POINTOPOINT)
- && (cmd == SIOCAIFADDR)
- && (ifra->ifra_dstaddr.sin_addr.s_addr
- == INADDR_ANY)) {
- error = EDESTADDRREQ;
- goto out;
- }
- }
- if (cmd == SIOCDIFADDR && ia == NULL) {
- error = EADDRNOTAVAIL;
- goto out;
- }
- /* FALLTHROUGH */
- case SIOCSIFADDR:
- case SIOCSIFNETMASK:
- case SIOCSIFDSTADDR:
- if (ia == NULL) {
- ia = (struct in_ifaddr *)
- malloc(sizeof *ia, M_IFADDR, M_NOWAIT |
- M_ZERO);
- if (ia == NULL) {
- error = ENOBUFS;
- goto out;
- }
-
- ifa = &ia->ia_ifa;
- ifa_init(ifa);
- ifa->ifa_addr = (struct sockaddr *)&ia->ia_addr;
- ifa->ifa_dstaddr = (struct sockaddr *)&ia->ia_dstaddr;
- ifa->ifa_netmask = (struct sockaddr *)&ia->ia_sockmask;
-
- ia->ia_sockmask.sin_len = 8;
- ia->ia_sockmask.sin_family = AF_INET;
- if (ifp->if_flags & IFF_BROADCAST) {
- ia->ia_broadaddr.sin_len = sizeof(ia->ia_addr);
- ia->ia_broadaddr.sin_family = AF_INET;
- }
- ia->ia_ifp = ifp;
-
- ifa_ref(ifa); /* if_addrhead */
- IF_ADDR_WLOCK(ifp);
- TAILQ_INSERT_TAIL(&ifp->if_addrhead, ifa, ifa_link);
- IF_ADDR_WUNLOCK(ifp);
- ifa_ref(ifa); /* in_ifaddrhead */
- IN_IFADDR_WLOCK();
- TAILQ_INSERT_TAIL(&V_in_ifaddrhead, ia, ia_link);
- IN_IFADDR_WUNLOCK();
- iaIsNew = 1;
- }
- break;
-
- case SIOCSIFBRDADDR:
case SIOCGIFADDR:
- case SIOCGIFNETMASK:
- case SIOCGIFDSTADDR:
- case SIOCGIFBRDADDR:
- if (ia == NULL) {
- error = EADDRNOTAVAIL;
- goto out;
- }
+ *addr = ia->ia_addr;
break;
- }
-
- /*
- * Most paths in this switch return directly or via out. Only paths
- * that remove the address break in order to hit common removal code.
- */
- switch (cmd) {
- case SIOCGIFADDR:
- *((struct sockaddr_in *)&ifr->ifr_addr) = ia->ia_addr;
- goto out;
case SIOCGIFBRDADDR:
if ((ifp->if_flags & IFF_BROADCAST) == 0) {
error = EINVAL;
- goto out;
+ break;
}
- *((struct sockaddr_in *)&ifr->ifr_dstaddr) = ia->ia_broadaddr;
- goto out;
+ *addr = ia->ia_broadaddr;
+ break;
case SIOCGIFDSTADDR:
if ((ifp->if_flags & IFF_POINTOPOINT) == 0) {
error = EINVAL;
- goto out;
+ break;
}
- *((struct sockaddr_in *)&ifr->ifr_dstaddr) = ia->ia_dstaddr;
- goto out;
+ *addr = ia->ia_dstaddr;
+ break;
case SIOCGIFNETMASK:
- *((struct sockaddr_in *)&ifr->ifr_addr) = ia->ia_sockmask;
- goto out;
-
- case SIOCSIFDSTADDR:
- if ((ifp->if_flags & IFF_POINTOPOINT) == 0) {
- error = EINVAL;
- goto out;
- }
- oldaddr = ia->ia_dstaddr;
- ia->ia_dstaddr = *(struct sockaddr_in *)&ifr->ifr_dstaddr;
- if (ifp->if_ioctl != NULL) {
- error = (*ifp->if_ioctl)(ifp, SIOCSIFDSTADDR,
- (caddr_t)ia);
- if (error) {
- ia->ia_dstaddr = oldaddr;
- goto out;
- }
- }
- if (ia->ia_flags & IFA_ROUTE) {
- ia->ia_ifa.ifa_dstaddr = (struct sockaddr *)&oldaddr;
- rtinit(&(ia->ia_ifa), (int)RTM_DELETE, RTF_HOST);
- ia->ia_ifa.ifa_dstaddr =
- (struct sockaddr *)&ia->ia_dstaddr;
- rtinit(&(ia->ia_ifa), (int)RTM_ADD, RTF_HOST|RTF_UP);
- }
- goto out;
+ *addr = ia->ia_sockmask;
+ break;
+ }
- case SIOCSIFBRDADDR:
- if ((ifp->if_flags & IFF_BROADCAST) == 0) {
- error = EINVAL;
- goto out;
- }
- ia->ia_broadaddr = *(struct sockaddr_in *)&ifr->ifr_broadaddr;
- goto out;
+ IF_ADDR_RUNLOCK(ifp);
- case SIOCSIFADDR:
- error = in_ifinit(ifp, ia,
- (struct sockaddr_in *) &ifr->ifr_addr, 1);
- if (error != 0 && iaIsNew)
- break;
- if (error == 0) {
- ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]);
- if (iaIsFirst &&
- (ifp->if_flags & IFF_MULTICAST) != 0) {
- error = in_joingroup(ifp, &allhosts_addr,
- NULL, &ii->ii_allhosts);
- }
- EVENTHANDLER_INVOKE(ifaddr_event, ifp);
- }
- error = 0;
- goto out;
+ return (error);
+}
- case SIOCSIFNETMASK:
- ia->ia_sockmask.sin_addr = ifra->ifra_addr.sin_addr;
- ia->ia_subnetmask = ntohl(ia->ia_sockmask.sin_addr.s_addr);
- goto out;
+static int
+in_aifaddr_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td)
+{
+ const struct in_aliasreq *ifra = (struct in_aliasreq *)data;
+ const struct sockaddr_in *addr = &ifra->ifra_addr;
+ const struct sockaddr_in *broadaddr = &ifra->ifra_broadaddr;
+ const struct sockaddr_in *mask = &ifra->ifra_mask;
+ const struct sockaddr_in *dstaddr = &ifra->ifra_dstaddr;
+ const int vhid = (cmd == SIOCAIFADDR) ? ifra->ifra_vhid : 0;
+ struct ifaddr *ifa;
+ struct in_ifaddr *ia;
+ bool iaIsFirst;
+ int error = 0;
- case SIOCAIFADDR:
- maskIsNew = 0;
- hostIsNew = 1;
- error = 0;
- if (ia->ia_addr.sin_family == AF_INET) {
- if (ifra->ifra_addr.sin_len == 0) {
- ifra->ifra_addr = ia->ia_addr;
- hostIsNew = 0;
- } else if (ifra->ifra_addr.sin_addr.s_addr ==
- ia->ia_addr.sin_addr.s_addr)
- hostIsNew = 0;
- }
- if (ifra->ifra_mask.sin_len) {
- /*
- * QL: XXX
- * Need to scrub the prefix here in case
- * the issued command is SIOCAIFADDR with
- * the same address, but with a different
- * prefix length. And if the prefix length
- * is the same as before, then the call is
- * un-necessarily executed here.
- */
- in_ifscrub(ifp, ia, LLE_STATIC);
- ia->ia_sockmask = ifra->ifra_mask;
- ia->ia_sockmask.sin_family = AF_INET;
- ia->ia_subnetmask =
- ntohl(ia->ia_sockmask.sin_addr.s_addr);
- maskIsNew = 1;
- }
- if ((ifp->if_flags & IFF_POINTOPOINT) &&
- (ifra->ifra_dstaddr.sin_family == AF_INET)) {
- in_ifscrub(ifp, ia, LLE_STATIC);
- ia->ia_dstaddr = ifra->ifra_dstaddr;
- maskIsNew = 1; /* We lie; but the effect's the same */
- }
- if (ifra->ifra_addr.sin_family == AF_INET &&
- (hostIsNew || maskIsNew))
- error = in_ifinit(ifp, ia, &ifra->ifra_addr, 0);
- if (error != 0 && iaIsNew)
- break;
+ error = priv_check(td, PRIV_NET_ADDIFADDR);
+ if (error)
+ return (error);
- if ((ifp->if_flags & IFF_BROADCAST) &&
- (ifra->ifra_broadaddr.sin_family == AF_INET))
- ia->ia_broadaddr = ifra->ifra_broadaddr;
- if (error == 0) {
- ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]);
- if (iaIsFirst &&
- (ifp->if_flags & IFF_MULTICAST) != 0) {
- error = in_joingroup(ifp, &allhosts_addr,
- NULL, &ii->ii_allhosts);
- }
- EVENTHANDLER_INVOKE(ifaddr_event, ifp);
- }
- goto out;
+ /*
+ * ifra_addr must be present and be of INET family.
+ * ifra_broadaddr/ifra_dstaddr and ifra_mask are optional.
+ */
+ if (addr->sin_len != sizeof(struct sockaddr_in) ||
+ addr->sin_family != AF_INET)
+ return (EINVAL);
+ if (broadaddr->sin_len != 0 &&
+ (broadaddr->sin_len != sizeof(struct sockaddr_in) ||
+ broadaddr->sin_family != AF_INET))
+ return (EINVAL);
+ if (mask->sin_len != 0 &&
+ (mask->sin_len != sizeof(struct sockaddr_in) ||
+ mask->sin_family != AF_INET))
+ return (EINVAL);
+ if ((ifp->if_flags & IFF_POINTOPOINT) &&
+ (dstaddr->sin_len != sizeof(struct sockaddr_in) ||
+ dstaddr->sin_addr.s_addr == INADDR_ANY))
+ return (EDESTADDRREQ);
+ if (vhid > 0 && carp_attach_p == NULL)
+ return (EPROTONOSUPPORT);
- case SIOCDIFADDR:
- /*
- * in_ifscrub kills the interface route.
- */
- in_ifscrub(ifp, ia, LLE_STATIC);
+ /*
+ * See whether address already exist.
+ */
+ iaIsFirst = true;
+ ia = NULL;
+ IF_ADDR_RLOCK(ifp);
+ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+ struct in_ifaddr *it;
- /*
- * in_ifadown gets rid of all the rest of
- * the routes. This is not quite the right
- * thing to do, but at least if we are running
- * a routing process they will come back.
- */
- in_ifadown(&ia->ia_ifa, 1);
- EVENTHANDLER_INVOKE(ifaddr_event, ifp);
- error = 0;
- break;
+ if (ifa->ifa_addr->sa_family != AF_INET)
+ continue;
- default:
- panic("in_control: unsupported ioctl");
+ it = (struct in_ifaddr *)ifa;
+ iaIsFirst = false;
+ if (it->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr &&
+ prison_check_ip4(td->td_ucred, &addr->sin_addr) == 0)
+ ia = it;
}
+ IF_ADDR_RUNLOCK(ifp);
+
+ if (ia != NULL)
+ (void )in_difaddr_ioctl(data, ifp, td);
+
+ ifa = ifa_alloc(sizeof(struct in_ifaddr), M_WAITOK);
+ ia = (struct in_ifaddr *)ifa;
+ ifa->ifa_addr = (struct sockaddr *)&ia->ia_addr;
+ ifa->ifa_dstaddr = (struct sockaddr *)&ia->ia_dstaddr;
+ ifa->ifa_netmask = (struct sockaddr *)&ia->ia_sockmask;
+
+ ia->ia_ifp = ifp;
+ ia->ia_addr = *addr;
+ if (mask->sin_len != 0) {
+ ia->ia_sockmask = *mask;
+ ia->ia_subnetmask = ntohl(ia->ia_sockmask.sin_addr.s_addr);
+ } else {
+ in_addr_t i = ntohl(addr->sin_addr.s_addr);
- IF_ADDR_WLOCK(ifp);
- /* Re-check that ia is still part of the list. */
- TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
- if (ifa == &ia->ia_ifa)
- break;
- }
- if (ifa == NULL) {
/*
- * If we lost the race with another thread, there is no need to
- * try it again for the next loop as there is no other exit
- * path between here and out.
- */
- IF_ADDR_WUNLOCK(ifp);
- error = EADDRNOTAVAIL;
- goto out;
+ * Be compatible with network classes, if netmask isn't
+ * supplied, guess it based on classes.
+ */
+ if (IN_CLASSA(i))
+ ia->ia_subnetmask = IN_CLASSA_NET;
+ else if (IN_CLASSB(i))
+ ia->ia_subnetmask = IN_CLASSB_NET;
+ else
+ ia->ia_subnetmask = IN_CLASSC_NET;
+ ia->ia_sockmask.sin_addr.s_addr = htonl(ia->ia_subnetmask);
}
- TAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifa_link);
- IF_ADDR_WUNLOCK(ifp);
- ifa_free(&ia->ia_ifa); /* if_addrhead */
+ ia->ia_subnet = ntohl(addr->sin_addr.s_addr) & ia->ia_subnetmask;
+ in_socktrim(&ia->ia_sockmask);
- IN_IFADDR_WLOCK();
- TAILQ_REMOVE(&V_in_ifaddrhead, ia, ia_link);
- if (ia->ia_addr.sin_family == AF_INET) {
- struct in_ifaddr *if_ia;
+ if (ifp->if_flags & IFF_BROADCAST) {
+ if (broadaddr->sin_len != 0) {
+ ia->ia_broadaddr = *broadaddr;
+ } else if (ia->ia_subnetmask == IN_RFC3021_MASK) {
+ ia->ia_broadaddr.sin_addr.s_addr = INADDR_BROADCAST;
+ ia->ia_broadaddr.sin_len = sizeof(struct sockaddr_in);
+ ia->ia_broadaddr.sin_family = AF_INET;
+ } else {
+ ia->ia_broadaddr.sin_addr.s_addr =
+ htonl(ia->ia_subnet | ~ia->ia_subnetmask);
+ ia->ia_broadaddr.sin_len = sizeof(struct sockaddr_in);
+ ia->ia_broadaddr.sin_family = AF_INET;
+ }
+ }
- LIST_REMOVE(ia, ia_hash);
- IN_IFADDR_WUNLOCK();
- /*
- * If this is the last IPv4 address configured on this
- * interface, leave the all-hosts group.
- * No state-change report need be transmitted.
- */
- if_ia = NULL;
- IFP_TO_IA(ifp, if_ia);
- if (if_ia == NULL) {
- ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]);
- IN_MULTI_LOCK();
- if (ii->ii_allhosts) {
- (void)in_leavegroup_locked(ii->ii_allhosts,
- NULL);
- ii->ii_allhosts = NULL;
- }
- IN_MULTI_UNLOCK();
- } else
- ifa_free(&if_ia->ia_ifa);
- } else
- IN_IFADDR_WUNLOCK();
- ifa_free(&ia->ia_ifa); /* in_ifaddrhead */
-out:
- if (ia != NULL)
- ifa_free(&ia->ia_ifa);
- return (error);
-}
+ if (ifp->if_flags & IFF_POINTOPOINT)
+ ia->ia_dstaddr = *dstaddr;
-/*
- * SIOC[GAD]LIFADDR.
- * SIOCGLIFADDR: get first address. (?!?)
- * SIOCGLIFADDR with IFLR_PREFIX:
- * get first address that matches the specified prefix.
- * SIOCALIFADDR: add the specified address.
- * SIOCALIFADDR with IFLR_PREFIX:
- * EINVAL since we can't deduce hostid part of the address.
- * SIOCDLIFADDR: delete the specified address.
- * SIOCDLIFADDR with IFLR_PREFIX:
- * delete the first address that matches the specified prefix.
- * return values:
- * EINVAL on invalid parameters
- * EADDRNOTAVAIL on prefix match failed/specified address not found
- * other values may be returned from in_ioctl()
- */
-static int
-in_lifaddr_ioctl(struct socket *so, u_long cmd, caddr_t data,
- struct ifnet *ifp, struct thread *td)
-{
- struct if_laddrreq *iflr = (struct if_laddrreq *)data;
- struct ifaddr *ifa;
+ /* XXXGL: rtinit() needs this strange assignment. */
+ if (ifp->if_flags & IFF_LOOPBACK)
+ ia->ia_dstaddr = ia->ia_addr;
- /* sanity checks */
- if (data == NULL || ifp == NULL) {
- panic("invalid argument to in_lifaddr_ioctl");
- /*NOTRECHED*/
+ if (vhid != 0) {
+ error = (*carp_attach_p)(&ia->ia_ifa, vhid);
+ if (error)
+ return (error);
}
- switch (cmd) {
- case SIOCGLIFADDR:
- /* address must be specified on GET with IFLR_PREFIX */
- if ((iflr->flags & IFLR_PREFIX) == 0)
- break;
- /*FALLTHROUGH*/
- case SIOCALIFADDR:
- case SIOCDLIFADDR:
- /* address must be specified on ADD and DELETE */
- if (iflr->addr.ss_family != AF_INET)
- return (EINVAL);
- if (iflr->addr.ss_len != sizeof(struct sockaddr_in))
- return (EINVAL);
- /* XXX need improvement */
- if (iflr->dstaddr.ss_family
- && iflr->dstaddr.ss_family != AF_INET)
- return (EINVAL);
- if (iflr->dstaddr.ss_family
- && iflr->dstaddr.ss_len != sizeof(struct sockaddr_in))
- return (EINVAL);
- break;
- default: /*shouldn't happen*/
- return (EOPNOTSUPP);
- }
- if (sizeof(struct in_addr) * 8 < iflr->prefixlen)
- return (EINVAL);
+ /* if_addrhead is already referenced by ifa_alloc() */
+ IF_ADDR_WLOCK(ifp);
+ TAILQ_INSERT_TAIL(&ifp->if_addrhead, ifa, ifa_link);
+ IF_ADDR_WUNLOCK(ifp);
- switch (cmd) {
- case SIOCALIFADDR:
- {
- struct in_aliasreq ifra;
+ ifa_ref(ifa); /* in_ifaddrhead */
+ IN_IFADDR_WLOCK();
+ TAILQ_INSERT_TAIL(&V_in_ifaddrhead, ia, ia_link);
+ LIST_INSERT_HEAD(INADDR_HASH(ia->ia_addr.sin_addr.s_addr), ia, ia_hash);
+ IN_IFADDR_WUNLOCK();
- if (iflr->flags & IFLR_PREFIX)
- return (EINVAL);
+ /*
+ * Give the interface a chance to initialize
+ * if this is its first address,
+ * and to validate the address if necessary.
+ */
+ if (ifp->if_ioctl != NULL) {
+ error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, (caddr_t)ia);
+ if (error)
+ goto fail1;
+ }
- /* copy args to in_aliasreq, perform ioctl(SIOCAIFADDR). */
- bzero(&ifra, sizeof(ifra));
- bcopy(iflr->iflr_name, ifra.ifra_name,
- sizeof(ifra.ifra_name));
+ /*
+ * Add route for the network.
+ */
+ if (vhid == 0) {
+ int flags = RTF_UP;
- bcopy(&iflr->addr, &ifra.ifra_addr, iflr->addr.ss_len);
+ if (ifp->if_flags & (IFF_LOOPBACK|IFF_POINTOPOINT))
+ flags |= RTF_HOST;
- if (iflr->dstaddr.ss_family) { /*XXX*/
- bcopy(&iflr->dstaddr, &ifra.ifra_dstaddr,
- iflr->dstaddr.ss_len);
- }
+ error = in_addprefix(ia, flags);
+ if (error)
+ goto fail1;
+ }
- ifra.ifra_mask.sin_family = AF_INET;
- ifra.ifra_mask.sin_len = sizeof(struct sockaddr_in);
- in_len2mask(&ifra.ifra_mask.sin_addr, iflr->prefixlen);
-
- return (in_control(so, SIOCAIFADDR, (caddr_t)&ifra, ifp, td));
- }
- case SIOCGLIFADDR:
- case SIOCDLIFADDR:
- {
- struct in_ifaddr *ia;
- struct in_addr mask, candidate, match;
- struct sockaddr_in *sin;
-
- bzero(&mask, sizeof(mask));
- bzero(&match, sizeof(match));
- if (iflr->flags & IFLR_PREFIX) {
- /* lookup a prefix rather than address. */
- in_len2mask(&mask, iflr->prefixlen);
-
- sin = (struct sockaddr_in *)&iflr->addr;
- match.s_addr = sin->sin_addr.s_addr;
- match.s_addr &= mask.s_addr;
-
- /* if you set extra bits, that's wrong */
- if (match.s_addr != sin->sin_addr.s_addr)
- return (EINVAL);
+ /*
+ * Add a loopback route to self.
+ */
+ if (vhid == 0 && (ifp->if_flags & IFF_LOOPBACK) == 0 &&
+ ia->ia_addr.sin_addr.s_addr != INADDR_ANY &&
+ !((ifp->if_flags & IFF_POINTOPOINT) &&
+ ia->ia_dstaddr.sin_addr.s_addr == ia->ia_addr.sin_addr.s_addr)) {
+ struct in_ifaddr *eia;
- } else {
- /* on getting an address, take the 1st match */
- /* on deleting an address, do exact match */
- if (cmd != SIOCGLIFADDR) {
- in_len2mask(&mask, 32);
- sin = (struct sockaddr_in *)&iflr->addr;
- match.s_addr = sin->sin_addr.s_addr;
- }
- }
+ eia = in_localip_more(ia);
- IF_ADDR_RLOCK(ifp);
- TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
- if (ifa->ifa_addr->sa_family != AF_INET)
- continue;
- if (match.s_addr == 0)
- break;
- candidate.s_addr = ((struct sockaddr_in *)&ifa->ifa_addr)->sin_addr.s_addr;
- candidate.s_addr &= mask.s_addr;
- if (candidate.s_addr == match.s_addr)
- break;
- }
- if (ifa != NULL)
- ifa_ref(ifa);
- IF_ADDR_RUNLOCK(ifp);
- if (ifa == NULL)
- return (EADDRNOTAVAIL);
- ia = (struct in_ifaddr *)ifa;
+ if (eia == NULL) {
+ error = ifa_add_loopback_route((struct ifaddr *)ia,
+ (struct sockaddr *)&ia->ia_addr);
+ if (error)
+ goto fail2;
+ } else
+ ifa_free(&eia->ia_ifa);
+ }
- if (cmd == SIOCGLIFADDR) {
- /* fill in the if_laddrreq structure */
- bcopy(&ia->ia_addr, &iflr->addr, ia->ia_addr.sin_len);
+ if (iaIsFirst && (ifp->if_flags & IFF_MULTICAST)) {
+ struct in_addr allhosts_addr;
+ struct in_ifinfo *ii;
- if ((ifp->if_flags & IFF_POINTOPOINT) != 0) {
- bcopy(&ia->ia_dstaddr, &iflr->dstaddr,
- ia->ia_dstaddr.sin_len);
- } else
- bzero(&iflr->dstaddr, sizeof(iflr->dstaddr));
+ ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]);
+ allhosts_addr.s_addr = htonl(INADDR_ALLHOSTS_GROUP);
- iflr->prefixlen =
- in_mask2len(&ia->ia_sockmask.sin_addr);
+ error = in_joingroup(ifp, &allhosts_addr, NULL,
+ &ii->ii_allhosts);
+ }
- iflr->flags = 0; /*XXX*/
- ifa_free(ifa);
+ EVENTHANDLER_INVOKE(ifaddr_event, ifp);
- return (0);
- } else {
- struct in_aliasreq ifra;
-
- /* fill in_aliasreq and do ioctl(SIOCDIFADDR) */
- bzero(&ifra, sizeof(ifra));
- bcopy(iflr->iflr_name, ifra.ifra_name,
- sizeof(ifra.ifra_name));
-
- bcopy(&ia->ia_addr, &ifra.ifra_addr,
- ia->ia_addr.sin_len);
- if ((ifp->if_flags & IFF_POINTOPOINT) != 0) {
- bcopy(&ia->ia_dstaddr, &ifra.ifra_dstaddr,
- ia->ia_dstaddr.sin_len);
- }
- bcopy(&ia->ia_sockmask, &ifra.ifra_dstaddr,
- ia->ia_sockmask.sin_len);
- ifa_free(ifa);
+ return (error);
- return (in_control(so, SIOCDIFADDR, (caddr_t)&ifra,
- ifp, td));
- }
- }
- }
+fail2:
+ if (vhid == 0)
+ (void )in_scrubprefix(ia, LLE_STATIC);
- return (EOPNOTSUPP); /*just for safety*/
-}
+fail1:
+ if (ia->ia_ifa.ifa_carp)
+ (*carp_detach_p)(&ia->ia_ifa);
-/*
- * Delete any existing route for an interface.
- */
-void
-in_ifscrub(struct ifnet *ifp, struct in_ifaddr *ia, u_int flags)
-{
+ IF_ADDR_WLOCK(ifp);
+ TAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifa_link);
+ IF_ADDR_WUNLOCK(ifp);
+ ifa_free(&ia->ia_ifa); /* if_addrhead */
+
+ IN_IFADDR_WLOCK();
+ TAILQ_REMOVE(&V_in_ifaddrhead, ia, ia_link);
+ LIST_REMOVE(ia, ia_hash);
+ IN_IFADDR_WUNLOCK();
+ ifa_free(&ia->ia_ifa); /* in_ifaddrhead */
- in_scrubprefix(ia, flags);
+ return (error);
}
-/*
- * Initialize an interface's internet address
- * and routing table entry.
- */
static int
-in_ifinit(struct ifnet *ifp, struct in_ifaddr *ia, struct sockaddr_in *sin,
- int scrub)
+in_difaddr_ioctl(caddr_t data, struct ifnet *ifp, struct thread *td)
{
- register u_long i = ntohl(sin->sin_addr.s_addr);
- struct sockaddr_in oldaddr;
- int s = splimp(), flags = RTF_UP, error = 0;
-
- oldaddr = ia->ia_addr;
- if (oldaddr.sin_family == AF_INET)
- LIST_REMOVE(ia, ia_hash);
- ia->ia_addr = *sin;
- if (ia->ia_addr.sin_family == AF_INET) {
- IN_IFADDR_WLOCK();
- LIST_INSERT_HEAD(INADDR_HASH(ia->ia_addr.sin_addr.s_addr),
- ia, ia_hash);
- IN_IFADDR_WUNLOCK();
- }
- /*
- * Give the interface a chance to initialize
- * if this is its first address,
- * and to validate the address if necessary.
- */
- if (ifp->if_ioctl != NULL) {
- error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, (caddr_t)ia);
- if (error) {
- splx(s);
- /* LIST_REMOVE(ia, ia_hash) is done in in_control */
- ia->ia_addr = oldaddr;
- IN_IFADDR_WLOCK();
- if (ia->ia_addr.sin_family == AF_INET)
- LIST_INSERT_HEAD(INADDR_HASH(
- ia->ia_addr.sin_addr.s_addr), ia, ia_hash);
- else
- /*
- * If oldaddr family is not AF_INET (e.g.
- * interface has been just created) in_control
- * does not call LIST_REMOVE, and we end up
- * with bogus ia entries in hash
- */
- LIST_REMOVE(ia, ia_hash);
- IN_IFADDR_WUNLOCK();
+ const struct ifreq *ifr = (struct ifreq *)data;
+ const struct sockaddr_in *addr = (const struct sockaddr_in *)
+ &ifr->ifr_addr;
+ struct ifaddr *ifa;
+ struct in_ifaddr *ia;
+ bool deleteAny, iaIsLast;
+ int error;
+
+ if (td != NULL) {
+ error = priv_check(td, PRIV_NET_DELIFADDR);
+ if (error)
return (error);
- }
}
- splx(s);
- if (scrub) {
- ia->ia_ifa.ifa_addr = (struct sockaddr *)&oldaddr;
- in_ifscrub(ifp, ia, LLE_STATIC);
- ia->ia_ifa.ifa_addr = (struct sockaddr *)&ia->ia_addr;
+
+ if (addr->sin_len != sizeof(struct sockaddr_in) ||
+ addr->sin_family != AF_INET)
+ deleteAny = true;
+ else
+ deleteAny = false;
+
+ iaIsLast = true;
+ ia = NULL;
+ IF_ADDR_WLOCK(ifp);
+ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+ struct in_ifaddr *it;
+
+ if (ifa->ifa_addr->sa_family != AF_INET)
+ continue;
+
+ it = (struct in_ifaddr *)ifa;
+ if (deleteAny && ia == NULL && (td == NULL ||
+ prison_check_ip4(td->td_ucred, &it->ia_addr.sin_addr) == 0))
+ ia = it;
+
+ if (it->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr &&
+ (td == NULL || prison_check_ip4(td->td_ucred,
+ &addr->sin_addr) == 0))
+ ia = it;
+
+ if (it != ia)
+ iaIsLast = false;
}
- /*
- * Be compatible with network classes, if netmask isn't supplied,
- * guess it based on classes.
- */
- if (ia->ia_subnetmask == 0) {
- if (IN_CLASSA(i))
- ia->ia_subnetmask = IN_CLASSA_NET;
- else if (IN_CLASSB(i))
- ia->ia_subnetmask = IN_CLASSB_NET;
- else
- ia->ia_subnetmask = IN_CLASSC_NET;
- ia->ia_sockmask.sin_addr.s_addr = htonl(ia->ia_subnetmask);
+
+ if (ia == NULL) {
+ IF_ADDR_WUNLOCK(ifp);
+ return (EADDRNOTAVAIL);
}
- ia->ia_subnet = i & ia->ia_subnetmask;
- in_socktrim(&ia->ia_sockmask);
+
+ TAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifa_link);
+ IF_ADDR_WUNLOCK(ifp);
+ ifa_free(&ia->ia_ifa); /* if_addrhead */
+
+ IN_IFADDR_WLOCK();
+ TAILQ_REMOVE(&V_in_ifaddrhead, ia, ia_link);
+ LIST_REMOVE(ia, ia_hash);
+ IN_IFADDR_WUNLOCK();
+
/*
- * XXX: carp(4) does not have interface route
+ * in_scrubprefix() kills the interface route.
*/
- if (ifp->if_type == IFT_CARP)
- return (0);
+ in_scrubprefix(ia, LLE_STATIC);
+
/*
- * Add route for the network.
+ * in_ifadown gets rid of all the rest of
+ * the routes. This is not quite the right
+ * thing to do, but at least if we are running
+ * a routing process they will come back.
*/
- ia->ia_ifa.ifa_metric = ifp->if_metric;
- if (ifp->if_flags & IFF_BROADCAST) {
- if (ia->ia_subnetmask == IN_RFC3021_MASK)
- ia->ia_broadaddr.sin_addr.s_addr = INADDR_BROADCAST;
- else
- ia->ia_broadaddr.sin_addr.s_addr =
- htonl(ia->ia_subnet | ~ia->ia_subnetmask);
- } else if (ifp->if_flags & IFF_LOOPBACK) {
- ia->ia_dstaddr = ia->ia_addr;
- flags |= RTF_HOST;
- } else if (ifp->if_flags & IFF_POINTOPOINT) {
- if (ia->ia_dstaddr.sin_family != AF_INET)
- return (0);
- flags |= RTF_HOST;
- }
- if ((error = in_addprefix(ia, flags)) != 0)
- return (error);
-
- if (ia->ia_addr.sin_addr.s_addr == INADDR_ANY)
- return (0);
-
- if (ifp->if_flags & IFF_POINTOPOINT) {
- if (ia->ia_dstaddr.sin_addr.s_addr == ia->ia_addr.sin_addr.s_addr)
- return (0);
- }
+ in_ifadown(&ia->ia_ifa, 1);
+ if (ia->ia_ifa.ifa_carp)
+ (*carp_detach_p)(&ia->ia_ifa);
/*
- * add a loopback route to self
+ * If this is the last IPv4 address configured on this
+ * interface, leave the all-hosts group.
+ * No state-change report need be transmitted.
*/
- if (V_useloopback && !(ifp->if_flags & IFF_LOOPBACK)) {
- struct route ia_ro;
-
- bzero(&ia_ro, sizeof(ia_ro));
- *((struct sockaddr_in *)(&ia_ro.ro_dst)) = ia->ia_addr;
- rtalloc_ign_fib(&ia_ro, 0, RT_DEFAULT_FIB);
- if ((ia_ro.ro_rt != NULL) && (ia_ro.ro_rt->rt_ifp != NULL) &&
- (ia_ro.ro_rt->rt_ifp == V_loif)) {
- RT_LOCK(ia_ro.ro_rt);
- RT_ADDREF(ia_ro.ro_rt);
- RTFREE_LOCKED(ia_ro.ro_rt);
- } else
- error = ifa_add_loopback_route((struct ifaddr *)ia,
- (struct sockaddr *)&ia->ia_addr);
- if (error == 0)
- ia->ia_flags |= IFA_RTSELF;
- if (ia_ro.ro_rt != NULL)
- RTFREE(ia_ro.ro_rt);
+ if (iaIsLast && (ifp->if_flags & IFF_MULTICAST)) {
+ struct in_ifinfo *ii;
+
+ ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]);
+ IN_MULTI_LOCK();
+ if (ii->ii_allhosts) {
+ (void)in_leavegroup_locked(ii->ii_allhosts, NULL);
+ ii->ii_allhosts = NULL;
+ }
+ IN_MULTI_UNLOCK();
}
- return (error);
+ EVENTHANDLER_INVOKE(ifaddr_event, ifp);
+ ifa_free(&ia->ia_ifa); /* in_ifaddrhead */
+
+ return (0);
}
#define rtinitflags(x) \
@@ -965,9 +652,10 @@ in_ifinit(struct ifnet *ifp, struct in_ifaddr *ia, struct sockaddr_in *sin,
/*
* Check if we have a route for the given prefix already or add one accordingly.
*/
-static int
+int
in_addprefix(struct in_ifaddr *target, int flags)
{
+ struct rm_priotracker in_ifa_tracker;
struct in_ifaddr *ia;
struct in_addr prefix, mask, p, m;
int error;
@@ -981,7 +669,7 @@ in_addprefix(struct in_ifaddr *target, int flags)
prefix.s_addr &= mask.s_addr;
}
- IN_IFADDR_RLOCK();
+ IN_IFADDR_RLOCK(&in_ifa_tracker);
/* Look for an existing address with the same prefix, mask, and fib */
TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
if (rtinitflags(ia)) {
@@ -1009,28 +697,26 @@ in_addprefix(struct in_ifaddr *target, int flags)
#ifdef RADIX_MPATH
if (ia->ia_addr.sin_addr.s_addr ==
target->ia_addr.sin_addr.s_addr) {
- IN_IFADDR_RUNLOCK();
+ IN_IFADDR_RUNLOCK(&in_ifa_tracker);
return (EEXIST);
} else
break;
#endif
- if (V_sameprefixcarponly &&
- target->ia_ifp->if_type != IFT_CARP &&
- ia->ia_ifp->if_type != IFT_CARP) {
- IN_IFADDR_RUNLOCK();
+ if (V_nosameprefix) {
+ IN_IFADDR_RUNLOCK(&in_ifa_tracker);
return (EEXIST);
} else {
int fibnum;
- fibnum = rt_add_addr_allfibs ? RT_ALL_FIBS :
+ fibnum = V_rt_add_addr_allfibs ? RT_ALL_FIBS :
target->ia_ifp->if_fib;
rt_addrmsg(RTM_ADD, &target->ia_ifa, fibnum);
- IN_IFADDR_RUNLOCK();
+ IN_IFADDR_RUNLOCK(&in_ifa_tracker);
return (0);
}
}
}
- IN_IFADDR_RUNLOCK();
+ IN_IFADDR_RUNLOCK(&in_ifa_tracker);
/*
* No one seems to have this prefix route, so we try to insert it.
@@ -1041,68 +727,87 @@ in_addprefix(struct in_ifaddr *target, int flags)
return (error);
}
-extern void arp_ifscrub(struct ifnet *ifp, uint32_t addr);
+/*
+ * Removes either all lle entries for given @ia, or lle
+ * corresponding to @ia address.
+ *
+ * When @all is non-zero, lltable_prefix_free() is called for AF_INET with
+ * the address of @ia converted by ntohl(), ia_subnetmask as the mask and
+ * the caller's @flags.  Otherwise only the entry for the address of @ia
+ * (left as stored in ia_addr) is deleted from LLTABLE(ia->ia_ifp) through
+ * lltable_delete_addr() with LLE_IFADDR; @flags is not used on that path.
+ */
+static void
+in_scrubprefixlle(struct in_ifaddr *ia, int all, u_int flags)
+{
+ struct sockaddr_in addr, mask;
+ struct sockaddr *saddr, *smask;
+ struct ifnet *ifp;
+
+ saddr = (struct sockaddr *)&addr;
+ bzero(&addr, sizeof(addr));
+ addr.sin_len = sizeof(addr);
+ addr.sin_family = AF_INET;
+ smask = (struct sockaddr *)&mask;
+ bzero(&mask, sizeof(mask));
+ mask.sin_len = sizeof(mask);
+ mask.sin_family = AF_INET;
+ mask.sin_addr.s_addr = ia->ia_subnetmask;
+ ifp = ia->ia_ifp;
+
+ if (all) {
+
+ /*
+ * Remove all L2 entries matching given prefix.
+ * Convert address to host representation to avoid
+ * doing this on every callback. ia_subnetmask is already
+ * stored in host representation.
+ */
+ addr.sin_addr.s_addr = ntohl(ia->ia_addr.sin_addr.s_addr);
+ lltable_prefix_free(AF_INET, saddr, smask, flags);
+ } else {
+ /* Remove interface address only */
+ addr.sin_addr.s_addr = ia->ia_addr.sin_addr.s_addr;
+ lltable_delete_addr(LLTABLE(ifp), LLE_IFADDR, saddr);
+ }
+}
/*
* If there is no other address in the system that can serve a route to the
* same prefix, remove the route. Hand over the route to the new address
* otherwise.
*/
-static int
+int
in_scrubprefix(struct in_ifaddr *target, u_int flags)
{
+ struct rm_priotracker in_ifa_tracker;
struct in_ifaddr *ia;
- struct in_addr prefix, mask, p;
+ struct in_addr prefix, mask, p, m;
int error = 0;
- struct sockaddr_in prefix0, mask0;
/*
* Remove the loopback route to the interface address.
- * The "useloopback" setting is not consulted because if the
- * user configures an interface address, turns off this
- * setting, and then tries to delete that interface address,
- * checking the current setting of "useloopback" would leave
- * that interface address loopback route untouched, which
- * would be wrong. Therefore the interface address loopback route
- * deletion is unconditional.
*/
if ((target->ia_addr.sin_addr.s_addr != INADDR_ANY) &&
!(target->ia_ifp->if_flags & IFF_LOOPBACK) &&
- (target->ia_flags & IFA_RTSELF)) {
- struct route ia_ro;
- int freeit = 0;
- int fibnum;
+ (flags & LLE_STATIC)) {
+ struct in_ifaddr *eia;
- bzero(&ia_ro, sizeof(ia_ro));
- *((struct sockaddr_in *)(&ia_ro.ro_dst)) = target->ia_addr;
- fibnum = target->ia_ifp->if_fib;
- rtalloc_ign_fib(&ia_ro, 0, fibnum);
- if ((ia_ro.ro_rt != NULL) && (ia_ro.ro_rt->rt_ifp != NULL) &&
- (ia_ro.ro_rt->rt_ifp == V_loif)) {
- RT_LOCK(ia_ro.ro_rt);
- if (ia_ro.ro_rt->rt_refcnt <= 1)
- freeit = 1;
- else if (flags & LLE_STATIC) {
- RT_REMREF(ia_ro.ro_rt);
- target->ia_flags &= ~IFA_RTSELF;
- }
- RTFREE_LOCKED(ia_ro.ro_rt);
- }
- if (freeit && (flags & LLE_STATIC)) {
+ /*
+ * XXXME: add fib-aware in_localip.
+ * We definitely don't want to switch between
+ * prefixes in different fibs.
+ */
+ eia = in_localip_more(target);
+
+ if (eia != NULL) {
+ error = ifa_switch_loopback_route((struct ifaddr *)eia,
+ (struct sockaddr *)&target->ia_addr);
+ ifa_free(&eia->ia_ifa);
+ } else {
error = ifa_del_loopback_route((struct ifaddr *)target,
(struct sockaddr *)&target->ia_addr);
- if (error == 0)
- target->ia_flags &= ~IFA_RTSELF;
}
- if ((flags & LLE_STATIC) &&
- !(target->ia_ifp->if_flags & IFF_NOARP))
- /* remove arp cache */
- arp_ifscrub(target->ia_ifp, IA_SIN(target)->sin_addr.s_addr);
}
- if (rtinitflags(target))
+ if (rtinitflags(target)) {
prefix = target->ia_dstaddr.sin_addr;
- else {
+ mask.s_addr = 0;
+ } else {
prefix = target->ia_addr.sin_addr;
mask = target->ia_sockmask.sin_addr;
prefix.s_addr &= mask.s_addr;
@@ -1111,38 +816,48 @@ in_scrubprefix(struct in_ifaddr *target, u_int flags)
if ((target->ia_flags & IFA_ROUTE) == 0) {
int fibnum;
- fibnum = rt_add_addr_allfibs ? RT_ALL_FIBS :
+ fibnum = V_rt_add_addr_allfibs ? RT_ALL_FIBS :
target->ia_ifp->if_fib;
rt_addrmsg(RTM_DELETE, &target->ia_ifa, fibnum);
+
+ /*
+ * Removing address from !IFF_UP interface or
+ * prefix which exists on other interface (along with route).
+ * No entries should exist here except target addr.
+ * Given that, delete this entry only.
+ */
+ in_scrubprefixlle(target, 0, flags);
return (0);
}
- IN_IFADDR_RLOCK();
+ IN_IFADDR_RLOCK(&in_ifa_tracker);
TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
- if (rtinitflags(ia))
+ if (rtinitflags(ia)) {
p = ia->ia_dstaddr.sin_addr;
- else {
+
+ if (prefix.s_addr != p.s_addr)
+ continue;
+ } else {
p = ia->ia_addr.sin_addr;
- p.s_addr &= ia->ia_sockmask.sin_addr.s_addr;
+ m = ia->ia_sockmask.sin_addr;
+ p.s_addr &= m.s_addr;
+
+ if (prefix.s_addr != p.s_addr ||
+ mask.s_addr != m.s_addr)
+ continue;
}
- if ((prefix.s_addr != p.s_addr) ||
- !(ia->ia_ifp->if_flags & IFF_UP))
+ if ((ia->ia_ifp->if_flags & IFF_UP) == 0)
continue;
/*
* If we got a matching prefix address, move IFA_ROUTE and
* the route itself to it. Make sure that routing daemons
* get a heads-up.
- *
- * XXX: a special case for carp(4) interface - this should
- * be more generally specified as an interface that
- * doesn't support such action.
*/
- if ((ia->ia_flags & IFA_ROUTE) == 0
- && (ia->ia_ifp->if_type != IFT_CARP)) {
+ if ((ia->ia_flags & IFA_ROUTE) == 0) {
ifa_ref(&ia->ia_ifa);
- IN_IFADDR_RUNLOCK();
+ IN_IFADDR_RUNLOCK(&in_ifa_tracker);
error = rtinit(&(target->ia_ifa), (int)RTM_DELETE,
rtinitflags(target));
if (error == 0)
@@ -1150,6 +865,9 @@ in_scrubprefix(struct in_ifaddr *target, u_int flags)
else
log(LOG_INFO, "in_scrubprefix: err=%d, old prefix delete failed\n",
error);
+ /* Scrub all entries IFF interface is different */
+ in_scrubprefixlle(target, target->ia_ifp != ia->ia_ifp,
+ flags);
error = rtinit(&ia->ia_ifa, (int)RTM_ADD,
rtinitflags(ia) | RTF_UP);
if (error == 0)
@@ -1161,21 +879,12 @@ in_scrubprefix(struct in_ifaddr *target, u_int flags)
return (error);
}
}
- IN_IFADDR_RUNLOCK();
+ IN_IFADDR_RUNLOCK(&in_ifa_tracker);
/*
* remove all L2 entries on the given prefix
*/
- bzero(&prefix0, sizeof(prefix0));
- prefix0.sin_len = sizeof(prefix0);
- prefix0.sin_family = AF_INET;
- prefix0.sin_addr.s_addr = target->ia_subnet;
- bzero(&mask0, sizeof(mask0));
- mask0.sin_len = sizeof(mask0);
- mask0.sin_family = AF_INET;
- mask0.sin_addr.s_addr = target->ia_subnetmask;
- lltable_prefix_free(AF_INET, (struct sockaddr *)&prefix0,
- (struct sockaddr *)&mask0, flags);
+ in_scrubprefixlle(target, 1, flags);
/*
* As no one seems to have this prefix, we can remove the route.
@@ -1190,6 +899,58 @@ in_scrubprefix(struct in_ifaddr *target, u_int flags)
#undef rtinitflags
+/*
+ * Walk every interface on V_ifnet while holding IFNET_RLOCK and, for each
+ * AF_INET entry on the interface's if_addrhead, build an ifaliasreq from
+ * the entry's address (and ifa_dstaddr, when set) and issue SIOCDIFADDR
+ * through in_control() with a NULL socket and a NULL thread.  Afterwards
+ * in_purgemaddrs() and igmp_domifdetach() are called for the interface.
+ *
+ * The result of in_control() is ignored.  The address list is walked
+ * without IF_ADDR_RLOCK (see the note in the loop) and with the _SAFE
+ * iterator, presumably because the SIOCDIFADDR request unlinks the
+ * current entry -- confirm against in_control().
+ */
+void
+in_ifscrub_all(void)
+{
+ struct ifnet *ifp;
+ struct ifaddr *ifa, *nifa;
+ struct ifaliasreq ifr;
+
+ IFNET_RLOCK();
+ TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
+ /* Cannot lock here - lock recursion. */
+ /* IF_ADDR_RLOCK(ifp); */
+ TAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, nifa) {
+ if (ifa->ifa_addr->sa_family != AF_INET)
+ continue;
+
+ /*
+ * This is ugly but the only way for legacy IP to
+ * cleanly remove addresses and everything attached.
+ */
+ bzero(&ifr, sizeof(ifr));
+ ifr.ifra_addr = *ifa->ifa_addr;
+ if (ifa->ifa_dstaddr)
+ ifr.ifra_broadaddr = *ifa->ifa_dstaddr;
+ (void)in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr,
+ ifp, NULL);
+ }
+ /* IF_ADDR_RUNLOCK(ifp); */
+ in_purgemaddrs(ifp);
+ igmp_domifdetach(ifp);
+ }
+ IFNET_RUNLOCK();
+}
+
+/*
+ * Return non-zero if @in is a broadcast address with respect to the
+ * interface address @ia, zero otherwise.
+ *
+ * @in matches when it equals ia_broadaddr, or when the subnet mask is not
+ * IN_RFC3021_MASK and ntohl(@in) equals ia_subnet.  Either way the result
+ * is zero when ia_subnetmask is 0xffffffff.
+ */
+int
+in_ifaddr_broadcast(struct in_addr in, struct in_ifaddr *ia)
+{
+
+ return ((in.s_addr == ia->ia_broadaddr.sin_addr.s_addr ||
+ /*
+ * Check for old-style (host 0) broadcast, but
+ * taking into account that RFC 3021 obsoletes it.
+ */
+ (ia->ia_subnetmask != IN_RFC3021_MASK &&
+ ntohl(in.s_addr) == ia->ia_subnet)) &&
+ /*
+ * Check for an all one subnetmask. These
+ * only exist when an interface gets a secondary
+ * address.
+ */
+ ia->ia_subnetmask != (u_long)0xffffffff);
+}
+
/*
* Return 1 if the address might be a local broadcast address.
*/
@@ -1197,37 +958,27 @@ int
in_broadcast(struct in_addr in, struct ifnet *ifp)
{
register struct ifaddr *ifa;
- u_long t;
+ int found;
if (in.s_addr == INADDR_BROADCAST ||
in.s_addr == INADDR_ANY)
return (1);
if ((ifp->if_flags & IFF_BROADCAST) == 0)
return (0);
- t = ntohl(in.s_addr);
+ found = 0;
/*
* Look through the list of addresses for a match
* with a broadcast address.
*/
-#define ia ((struct in_ifaddr *)ifa)
+ IF_ADDR_RLOCK(ifp);
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
if (ifa->ifa_addr->sa_family == AF_INET &&
- (in.s_addr == ia->ia_broadaddr.sin_addr.s_addr ||
- /*
- * Check for old-style (host 0) broadcast, but
- * taking into account that RFC 3021 obsoletes it.
- */
- (ia->ia_subnetmask != IN_RFC3021_MASK &&
- t == ia->ia_subnet)) &&
- /*
- * Check for an all one subnetmask. These
- * only exist when an interface gets a secondary
- * address.
- */
- ia->ia_subnetmask != (u_long)0xffffffff)
- return (1);
- return (0);
-#undef ia
+ in_ifaddr_broadcast(in, (struct in_ifaddr *)ifa)) {
+ found = 1;
+ break;
+ }
+ IF_ADDR_RUNLOCK(ifp);
+ return (found);
}
/*
@@ -1239,6 +990,7 @@ in_ifdetach(struct ifnet *ifp)
in_pcbpurgeif0(&V_ripcbinfo, ifp);
in_pcbpurgeif0(&V_udbinfo, ifp);
+ in_pcbpurgeif0(&V_ulitecbinfo, ifp);
in_purgemaddrs(ifp);
}
@@ -1288,34 +1040,44 @@ in_purgemaddrs(struct ifnet *ifp)
IN_MULTI_UNLOCK();
}
-#include <net/if_dl.h>
-#include <netinet/if_ether.h>
-
struct in_llentry {
struct llentry base;
- struct sockaddr_in l3_addr4;
};
+#define IN_LLTBL_DEFAULT_HSIZE 32
+#define IN_LLTBL_HASH(k, h) \
+ (((((((k >> 8) ^ k) >> 8) ^ k) >> 8) ^ k) & ((h) - 1))
+
/*
- * Deletes an address from the address table.
- * This function is called by the timer functions
- * such as arptimer() and nd6_llinfo_timer(), and
- * the caller does the locking.
+ * Do actual deallocation of @lle.
*/
static void
-in_lltable_free(struct lltable *llt, struct llentry *lle)
+in_lltable_destroy_lle_unlocked(struct llentry *lle)
{
- LLE_WUNLOCK(lle);
+
LLE_LOCK_DESTROY(lle);
+ LLE_REQ_DESTROY(lle);
free(lle, M_LLTABLE);
}
+/*
+ * Called by LLE_FREE_LOCKED when number of references
+ * drops to zero.
+ *
+ * Releases the write lock on @lle with LLE_WUNLOCK() and then passes the
+ * entry to in_lltable_destroy_lle_unlocked() for the actual deallocation.
+ * in_lltable_new() installs this function as the entry's lle_free hook.
+ */
+static void
+in_lltable_destroy_lle(struct llentry *lle)
+{
+
+ LLE_WUNLOCK(lle);
+ in_lltable_destroy_lle_unlocked(lle);
+}
+
static struct llentry *
-in_lltable_new(const struct sockaddr *l3addr, u_int flags)
+in_lltable_new(struct in_addr addr4, u_int flags)
{
struct in_llentry *lle;
- lle = malloc(sizeof(struct in_llentry), M_LLTABLE, M_DONTWAIT | M_ZERO);
+ lle = malloc(sizeof(struct in_llentry), M_LLTABLE, M_NOWAIT | M_ZERO);
if (lle == NULL) /* NB: caller generates msg */
return NULL;
@@ -1324,82 +1086,123 @@ in_lltable_new(const struct sockaddr *l3addr, u_int flags)
* an ARP request.
*/
lle->base.la_expire = time_uptime; /* mark expired */
- lle->l3_addr4 = *(const struct sockaddr_in *)l3addr;
+ lle->base.r_l3addr.addr4 = addr4;
lle->base.lle_refcnt = 1;
- lle->base.lle_free = in_lltable_free;
+ lle->base.lle_free = in_lltable_destroy_lle;
LLE_LOCK_INIT(&lle->base);
- callout_init_rw(&lle->base.la_timer, &lle->base.lle_lock,
- CALLOUT_RETURNUNLOCKED);
+ LLE_REQ_INIT(&lle->base);
+ callout_init(&lle->base.lle_timer, 1);
return (&lle->base);
}
-#define IN_ARE_MASKED_ADDR_EQUAL(d, a, m) ( \
- (((ntohl((d)->sin_addr.s_addr) ^ (a)->sin_addr.s_addr) & (m)->sin_addr.s_addr)) == 0 )
+#define IN_ARE_MASKED_ADDR_EQUAL(d, a, m) ( \
+ ((((d).s_addr ^ (a).s_addr) & (m).s_addr)) == 0 )
+
+/*
+ * Decide whether @lle falls under the prefix described by @saddr/@smask
+ * and should be selected given @flags.  Returns 1 for a match, 0 otherwise.
+ *
+ * The entry's IPv4 address is passed through ntohl() before the masked
+ * comparison, while @saddr and @smask are used as given -- presumably
+ * they are supplied in host byte order; confirm against the callers.
+ *
+ * An entry outside the prefix never matches.  An LLE_IFADDR entry matches
+ * only when its address equals @saddr exactly and @flags has LLE_STATIC.
+ * Any other entry matches when @flags has LLE_STATIC or the entry itself
+ * is not LLE_STATIC.
+ */
+static int
+in_lltable_match_prefix(const struct sockaddr *saddr,
+ const struct sockaddr *smask, u_int flags, struct llentry *lle)
+{
+ struct in_addr addr, mask, lle_addr;
+
+ addr = ((const struct sockaddr_in *)saddr)->sin_addr;
+ mask = ((const struct sockaddr_in *)smask)->sin_addr;
+ lle_addr.s_addr = ntohl(lle->r_l3addr.addr4.s_addr);
+
+ if (IN_ARE_MASKED_ADDR_EQUAL(lle_addr, addr, mask) == 0)
+ return (0);
+
+ if (lle->la_flags & LLE_IFADDR) {
+
+ /*
+ * Delete LLE_IFADDR records IFF address & flag matches.
+ * Note that addr is the interface address within prefix
+ * being matched.
+ * Note also we should handle 'ifdown' cases without removing
+ * ifaddr macs.
+ */
+ if (addr.s_addr == lle_addr.s_addr && (flags & LLE_STATIC) != 0)
+ return (1);
+ return (0);
+ }
+
+ /* flags & LLE_STATIC means deleting both dynamic and static entries */
+ if ((flags & LLE_STATIC) || !(lle->la_flags & LLE_STATIC))
+ return (1);
+
+ return (0);
+}
static void
-in_lltable_prefix_free(struct lltable *llt, const struct sockaddr *prefix,
- const struct sockaddr *mask, u_int flags)
+in_lltable_free_entry(struct lltable *llt, struct llentry *lle)
{
- const struct sockaddr_in *pfx = (const struct sockaddr_in *)prefix;
- const struct sockaddr_in *msk = (const struct sockaddr_in *)mask;
- struct llentry *lle, *next;
- int i;
+ struct ifnet *ifp;
size_t pkts_dropped;
- IF_AFDATA_WLOCK(llt->llt_ifp);
- for (i = 0; i < LLTBL_HASHTBL_SIZE; i++) {
- LIST_FOREACH_SAFE(lle, &llt->lle_head[i], lle_next, next) {
- /*
- * (flags & LLE_STATIC) means deleting all entries
- * including static ARP entries.
- */
- if (IN_ARE_MASKED_ADDR_EQUAL(satosin(L3_ADDR(lle)),
- pfx, msk) && ((flags & LLE_STATIC) ||
- !(lle->la_flags & LLE_STATIC))) {
- LLE_WLOCK(lle);
- if (callout_stop(&lle->la_timer))
- LLE_REMREF(lle);
- pkts_dropped = llentry_free(lle);
- ARPSTAT_ADD(dropped, pkts_dropped);
- }
- }
+ LLE_WLOCK_ASSERT(lle);
+ KASSERT(llt != NULL, ("lltable is NULL"));
+
+ /* Unlink entry from table if not already */
+ if ((lle->la_flags & LLE_LINKED) != 0) {
+ ifp = llt->llt_ifp;
+ IF_AFDATA_WLOCK_ASSERT(ifp);
+ lltable_unlink_entry(llt, lle);
}
- IF_AFDATA_WUNLOCK(llt->llt_ifp);
-}
+ /* cancel timer */
+ if (callout_stop(&lle->lle_timer) > 0)
+ LLE_REMREF(lle);
+
+ /* Drop hold queue */
+ pkts_dropped = llentry_free(lle);
+ ARPSTAT_ADD(dropped, pkts_dropped);
+}
static int
in_lltable_rtcheck(struct ifnet *ifp, u_int flags, const struct sockaddr *l3addr)
{
- struct rtentry *rt;
+ struct rt_addrinfo info;
+ struct sockaddr_in rt_key, rt_mask;
+ struct sockaddr rt_gateway;
+ int rt_flags;
KASSERT(l3addr->sa_family == AF_INET,
("sin_family %d", l3addr->sa_family));
- /* XXX rtalloc1_fib should take a const param */
- rt = rtalloc1_fib(__DECONST(struct sockaddr *, l3addr), 0, 0,
- ifp->if_fib);
+ bzero(&rt_key, sizeof(rt_key));
+ rt_key.sin_len = sizeof(rt_key);
+ bzero(&rt_mask, sizeof(rt_mask));
+ rt_mask.sin_len = sizeof(rt_mask);
+ bzero(&rt_gateway, sizeof(rt_gateway));
+ rt_gateway.sa_len = sizeof(rt_gateway);
+
+ bzero(&info, sizeof(info));
+ info.rti_info[RTAX_DST] = (struct sockaddr *)&rt_key;
+ info.rti_info[RTAX_NETMASK] = (struct sockaddr *)&rt_mask;
+ info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&rt_gateway;
- if (rt == NULL)
+ if (rib_lookup_info(ifp->if_fib, l3addr, NHR_REF, 0, &info) != 0)
return (EINVAL);
+ rt_flags = info.rti_flags;
+
/*
* If the gateway for an existing host route matches the target L3
* address, which is a special route inserted by some implementation
* such as MANET, and the interface is of the correct type, then
* allow for ARP to proceed.
*/
- if (rt->rt_flags & RTF_GATEWAY) {
- if (!(rt->rt_flags & RTF_HOST) || !rt->rt_ifp ||
- rt->rt_ifp->if_type != IFT_ETHER ||
- (rt->rt_ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) != 0 ||
- memcmp(rt->rt_gateway->sa_data, l3addr->sa_data,
+ if (rt_flags & RTF_GATEWAY) {
+ if (!(rt_flags & RTF_HOST) || !info.rti_ifp ||
+ info.rti_ifp->if_type != IFT_ETHER ||
+ (info.rti_ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) != 0 ||
+ memcmp(rt_gateway.sa_data, l3addr->sa_data,
sizeof(in_addr_t)) != 0) {
- RTFREE_LOCKED(rt);
+ rib_free_info(&info);
return (EINVAL);
}
}
+ rib_free_info(&info);
/*
* Make sure that at least the destination address is covered
@@ -1408,21 +1211,19 @@ in_lltable_rtcheck(struct ifnet *ifp, u_int flags, const struct sockaddr *l3addr
* on one interface and the corresponding outgoing packet leaves
* another interface.
*/
- if (!(rt->rt_flags & RTF_HOST) && rt->rt_ifp != ifp) {
+ if (!(rt_flags & RTF_HOST) && info.rti_ifp != ifp) {
const char *sa, *mask, *addr, *lim;
int len;
- mask = (const char *)rt_mask(rt);
+ mask = (const char *)&rt_mask;
/*
* Just being extra cautious to avoid some custom
* code getting into trouble.
*/
- if (mask == NULL) {
- RTFREE_LOCKED(rt);
+ if ((info.rti_addrs & RTA_NETMASK) == 0)
return (EINVAL);
- }
- sa = (const char *)rt_key(rt);
+ sa = (const char *)&rt_key;
addr = (const char *)l3addr;
len = ((const struct sockaddr_in *)l3addr)->sin_len;
lim = addr + len;
@@ -1433,151 +1234,188 @@ in_lltable_rtcheck(struct ifnet *ifp, u_int flags, const struct sockaddr *l3addr
log(LOG_INFO, "IPv4 address: \"%s\" is not on the network\n",
inet_ntoa(((const struct sockaddr_in *)l3addr)->sin_addr));
#endif
- RTFREE_LOCKED(rt);
return (EINVAL);
}
}
}
- RTFREE_LOCKED(rt);
return (0);
}
-/*
- * Return NULL if not found or marked for deletion.
- * If found return lle read locked.
- */
-static struct llentry *
-in_lltable_lookup(struct lltable *llt, u_int flags, const struct sockaddr *l3addr)
+static inline uint32_t
+in_lltable_hash_dst(const struct in_addr dst, uint32_t hsize)
+{
+
+ return (IN_LLTBL_HASH(dst.s_addr, hsize));
+}
+
+static uint32_t
+in_lltable_hash(const struct llentry *lle, uint32_t hsize)
+{
+
+ return (in_lltable_hash_dst(lle->r_l3addr.addr4, hsize));
+}
+
+static void
+in_lltable_fill_sa_entry(const struct llentry *lle, struct sockaddr *sa)
+{
+ struct sockaddr_in *sin;
+
+ sin = (struct sockaddr_in *)sa;
+ bzero(sin, sizeof(*sin));
+ sin->sin_family = AF_INET;
+ sin->sin_len = sizeof(*sin);
+ sin->sin_addr = lle->r_l3addr.addr4;
+}
+
+static inline struct llentry *
+in_lltable_find_dst(struct lltable *llt, struct in_addr dst)
{
- const struct sockaddr_in *sin = (const struct sockaddr_in *)l3addr;
- struct ifnet *ifp = llt->llt_ifp;
struct llentry *lle;
struct llentries *lleh;
- u_int hashkey;
-
- IF_AFDATA_LOCK_ASSERT(ifp);
- KASSERT(l3addr->sa_family == AF_INET,
- ("sin_family %d", l3addr->sa_family));
+ u_int hashidx;
- hashkey = sin->sin_addr.s_addr;
- lleh = &llt->lle_head[LLATBL_HASH(hashkey, LLTBL_HASHMASK)];
+ hashidx = in_lltable_hash_dst(dst, llt->llt_hsize);
+ lleh = &llt->lle_head[hashidx];
LIST_FOREACH(lle, lleh, lle_next) {
- struct sockaddr_in *sa2 = satosin(L3_ADDR(lle));
if (lle->la_flags & LLE_DELETED)
continue;
- if (sa2->sin_addr.s_addr == sin->sin_addr.s_addr)
+ if (lle->r_l3addr.addr4.s_addr == dst.s_addr)
break;
}
- if (lle == NULL) {
-#ifdef DIAGNOSTIC
- if (flags & LLE_DELETE)
- log(LOG_INFO, "interface address is missing from cache = %p in delete\n", lle);
-#endif
- if (!(flags & LLE_CREATE))
- return (NULL);
- IF_AFDATA_WLOCK_ASSERT(ifp);
- /*
- * A route that covers the given address must have
- * been installed 1st because we are doing a resolution,
- * verify this.
- */
- if (!(flags & LLE_IFADDR) &&
- in_lltable_rtcheck(ifp, flags, l3addr) != 0)
- goto done;
-
- lle = in_lltable_new(l3addr, flags);
- if (lle == NULL) {
- log(LOG_INFO, "lla_lookup: new lle malloc failed\n");
- goto done;
- }
- lle->la_flags = flags & ~LLE_CREATE;
- if ((flags & (LLE_CREATE | LLE_IFADDR)) == (LLE_CREATE | LLE_IFADDR)) {
- bcopy(IF_LLADDR(ifp), &lle->ll_addr, ifp->if_addrlen);
- lle->la_flags |= (LLE_VALID | LLE_STATIC);
- }
- lle->lle_tbl = llt;
- lle->lle_head = lleh;
- lle->la_flags |= LLE_LINKED;
- LIST_INSERT_HEAD(lleh, lle, lle_next);
- } else if (flags & LLE_DELETE) {
- if (!(lle->la_flags & LLE_IFADDR) || (flags & LLE_IFADDR)) {
- LLE_WLOCK(lle);
- lle->la_flags |= LLE_DELETED;
- EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_DELETED);
+ return (lle);
+}
+
+static void
+in_lltable_delete_entry(struct lltable *llt, struct llentry *lle)
+{
+
+ lle->la_flags |= LLE_DELETED;
+ EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_DELETED);
#ifdef DIAGNOSTIC
- log(LOG_INFO, "ifaddr cache = %p is deleted\n", lle);
+ log(LOG_INFO, "ifaddr cache = %p is deleted\n", lle);
#endif
- if ((lle->la_flags &
- (LLE_STATIC | LLE_IFADDR)) == LLE_STATIC)
- llentry_free(lle);
- else
- LLE_WUNLOCK(lle);
- }
- lle = (void *)-1;
+ llentry_free(lle);
+}
+
+static struct llentry *
+in_lltable_alloc(struct lltable *llt, u_int flags, const struct sockaddr *l3addr)
+{
+ const struct sockaddr_in *sin = (const struct sockaddr_in *)l3addr;
+ struct ifnet *ifp = llt->llt_ifp;
+ struct llentry *lle;
+ char linkhdr[LLE_MAX_LINKHDR];
+ size_t linkhdrsize;
+ int lladdr_off;
+
+ KASSERT(l3addr->sa_family == AF_INET,
+ ("sin_family %d", l3addr->sa_family));
+ /*
+ * A route that covers the given address must have
+ * been installed 1st because we are doing a resolution,
+ * verify this.
+ */
+ if (!(flags & LLE_IFADDR) &&
+ in_lltable_rtcheck(ifp, flags, l3addr) != 0)
+ return (NULL);
+
+ lle = in_lltable_new(sin->sin_addr, flags);
+ if (lle == NULL) {
+ log(LOG_INFO, "lla_lookup: new lle malloc failed\n");
+ return (NULL);
}
- if (LLE_IS_VALID(lle)) {
- if (flags & LLE_EXCLUSIVE)
- LLE_WLOCK(lle);
- else
- LLE_RLOCK(lle);
+ lle->la_flags = flags;
+ if (flags & LLE_STATIC)
+ lle->r_flags |= RLLE_VALID;
+ if ((flags & LLE_IFADDR) == LLE_IFADDR) {
+ linkhdrsize = LLE_MAX_LINKHDR;
+ if (lltable_calc_llheader(ifp, AF_INET, IF_LLADDR(ifp),
+ linkhdr, &linkhdrsize, &lladdr_off) != 0) {
+ in_lltable_destroy_lle_unlocked(lle);
+ return (NULL);
+ }
+ lltable_set_entry_addr(ifp, lle, linkhdr, linkhdrsize,
+ lladdr_off);
+ lle->la_flags |= LLE_STATIC;
+ lle->r_flags |= (RLLE_VALID | RLLE_IFADDR);
}
-done:
+
+ return (lle);
+}
+
+/*
+ * Return NULL if not found or marked for deletion.
+ * If found, return lle read locked, or per LLE_EXCLUSIVE/LLE_UNLOCKED in flags.
+ */
+static struct llentry *
+in_lltable_lookup(struct lltable *llt, u_int flags, const struct sockaddr *l3addr)
+{
+ const struct sockaddr_in *sin = (const struct sockaddr_in *)l3addr;
+ struct llentry *lle;
+
+ IF_AFDATA_LOCK_ASSERT(llt->llt_ifp);
+ KASSERT(l3addr->sa_family == AF_INET,
+ ("sin_family %d", l3addr->sa_family));
+ lle = in_lltable_find_dst(llt, sin->sin_addr);
+
+ if (lle == NULL)
+ return (NULL);
+
+ KASSERT((flags & (LLE_UNLOCKED|LLE_EXCLUSIVE)) !=
+ (LLE_UNLOCKED|LLE_EXCLUSIVE),("wrong lle request flags: 0x%X",
+ flags));
+
+ if (flags & LLE_UNLOCKED)
+ return (lle);
+
+ if (flags & LLE_EXCLUSIVE)
+ LLE_WLOCK(lle);
+ else
+ LLE_RLOCK(lle);
+
return (lle);
}
static int
-in_lltable_dump(struct lltable *llt, struct sysctl_req *wr)
+in_lltable_dump_entry(struct lltable *llt, struct llentry *lle,
+ struct sysctl_req *wr)
{
-#define SIN(lle) ((struct sockaddr_in *) L3_ADDR(lle))
struct ifnet *ifp = llt->llt_ifp;
- struct llentry *lle;
/* XXX stack use */
struct {
struct rt_msghdr rtm;
- struct sockaddr_inarp sin;
+ struct sockaddr_in sin;
struct sockaddr_dl sdl;
} arpc;
- int error, i;
-
- LLTABLE_LOCK_ASSERT();
-
- error = 0;
- for (i = 0; i < LLTBL_HASHTBL_SIZE; i++) {
- LIST_FOREACH(lle, &llt->lle_head[i], lle_next) {
- struct sockaddr_dl *sdl;
+ struct sockaddr_dl *sdl;
+ int error;
+ bzero(&arpc, sizeof(arpc));
/* skip deleted entries */
if ((lle->la_flags & LLE_DELETED) == LLE_DELETED)
- continue;
+ return (0);
/* Skip if jailed and not a valid IP of the prison. */
- if (prison_if(wr->td->td_ucred, L3_ADDR(lle)) != 0)
- continue;
+ lltable_fill_sa_entry(lle,(struct sockaddr *)&arpc.sin);
+ if (prison_if(wr->td->td_ucred,
+ (struct sockaddr *)&arpc.sin) != 0)
+ return (0);
/*
* produce a msg made of:
* struct rt_msghdr;
- * struct sockaddr_inarp; (IPv4)
+ * struct sockaddr_in; (IPv4)
* struct sockaddr_dl;
*/
- bzero(&arpc, sizeof(arpc));
arpc.rtm.rtm_msglen = sizeof(arpc);
arpc.rtm.rtm_version = RTM_VERSION;
arpc.rtm.rtm_type = RTM_GET;
arpc.rtm.rtm_flags = RTF_UP;
arpc.rtm.rtm_addrs = RTA_DST | RTA_GATEWAY;
- arpc.sin.sin_family = AF_INET;
- arpc.sin.sin_len = sizeof(arpc.sin);
- arpc.sin.sin_addr.s_addr = SIN(lle)->sin_addr.s_addr;
/* publish */
- if (lle->la_flags & LLE_PUB) {
+ if (lle->la_flags & LLE_PUB)
arpc.rtm.rtm_flags |= RTF_ANNOUNCE;
- /* proxy only */
- if (lle->la_flags & LLE_PROXY)
- arpc.sin.sin_other = SIN_PROXY;
- }
sdl = &arpc.sdl;
sdl->sdl_family = AF_LINK;
@@ -1586,7 +1424,7 @@ in_lltable_dump(struct lltable *llt, struct sysctl_req *wr)
sdl->sdl_type = ifp->if_type;
if ((lle->la_flags & LLE_VALID) == LLE_VALID) {
sdl->sdl_alen = ifp->if_addrlen;
- bcopy(&lle->ll_addr, LLADDR(sdl), ifp->if_addrlen);
+ bcopy(lle->ll_addr, LLADDR(sdl), ifp->if_addrlen);
} else {
sdl->sdl_alen = 0;
bzero(LLADDR(sdl), ifp->if_addrlen);
@@ -1597,35 +1435,47 @@ in_lltable_dump(struct lltable *llt, struct sysctl_req *wr)
arpc.rtm.rtm_flags |= (RTF_HOST | RTF_LLDATA);
if (lle->la_flags & LLE_STATIC)
arpc.rtm.rtm_flags |= RTF_STATIC;
+ if (lle->la_flags & LLE_IFADDR)
+ arpc.rtm.rtm_flags |= RTF_PINNED;
arpc.rtm.rtm_index = ifp->if_index;
error = SYSCTL_OUT(wr, &arpc, sizeof(arpc));
- if (error)
- break;
- }
- }
- return error;
-#undef SIN
+
+ return (error);
+}
+
+static struct lltable *
+in_lltattach(struct ifnet *ifp)
+{
+ struct lltable *llt;
+
+ llt = lltable_allocate_htbl(IN_LLTBL_DEFAULT_HSIZE);
+ llt->llt_af = AF_INET;
+ llt->llt_ifp = ifp;
+
+ llt->llt_lookup = in_lltable_lookup;
+ llt->llt_alloc_entry = in_lltable_alloc;
+ llt->llt_delete_entry = in_lltable_delete_entry;
+ llt->llt_dump_entry = in_lltable_dump_entry;
+ llt->llt_hash = in_lltable_hash;
+ llt->llt_fill_sa_entry = in_lltable_fill_sa_entry;
+ llt->llt_free_entry = in_lltable_free_entry;
+ llt->llt_match_prefix = in_lltable_match_prefix;
+ lltable_link(llt);
+
+ return (llt);
}
void *
in_domifattach(struct ifnet *ifp)
{
struct in_ifinfo *ii;
- struct lltable *llt;
ii = malloc(sizeof(struct in_ifinfo), M_IFADDR, M_WAITOK|M_ZERO);
- llt = lltable_init(ifp, AF_INET);
- if (llt != NULL) {
- llt->llt_prefix_free = in_lltable_prefix_free;
- llt->llt_lookup = in_lltable_lookup;
- llt->llt_dump = in_lltable_dump;
- }
- ii->ii_llt = llt;
-
+ ii->ii_llt = in_lltattach(ifp);
ii->ii_igmp = igmp_domifattach(ifp);
- return ii;
+ return (ii);
}
void
diff --git a/freebsd/sys/netinet/in.h b/freebsd/sys/netinet/in.h
index 06f9b793..b06e3334 100644
--- a/freebsd/sys/netinet/in.h
+++ b/freebsd/sys/netinet/in.h
@@ -47,8 +47,8 @@
#define IPPROTO_TCP 6 /* tcp */
#define IPPROTO_UDP 17 /* user datagram protocol */
-#define INADDR_ANY (u_int32_t)0x00000000
-#define INADDR_BROADCAST (u_int32_t)0xffffffff /* must be masked */
+#define INADDR_ANY ((in_addr_t)0x00000000)
+#define INADDR_BROADCAST ((in_addr_t)0xffffffff) /* must be masked */
#ifndef _UINT8_T_DECLARED
typedef __uint8_t uint8_t;
@@ -104,7 +104,7 @@ struct sockaddr_in {
char sin_zero[8];
};
-#if !defined(_KERNEL) && __BSD_VISIBLE
+#if !defined(_KERNEL) && __POSIX_VISIBLE >= 200112
#ifndef _BYTEORDER_PROTOTYPED
#define _BYTEORDER_PROTOTYPED
@@ -124,7 +124,7 @@ __END_DECLS
#define ntohs(x) __ntohs(x)
#endif
-#endif /* !_KERNEL && __BSD_VISIBLE */
+#endif /* !_KERNEL && __POSIX_VISIBLE >= 200112 */
#if __POSIX_VISIBLE >= 200112
#define IPPROTO_IPV6 41 /* IP6 header */
@@ -241,12 +241,17 @@ __END_DECLS
#define IPPROTO_IPCOMP 108 /* payload compression (IPComp) */
#define IPPROTO_SCTP 132 /* SCTP */
#define IPPROTO_MH 135 /* IPv6 Mobility Header */
+#define IPPROTO_UDPLITE 136 /* UDP-Lite */
+#define IPPROTO_HIP 139 /* IP6 Host Identity Protocol */
+#define IPPROTO_SHIM6 140 /* IP6 Shim6 Protocol */
/* 101-254: Partly Unassigned */
#define IPPROTO_PIM 103 /* Protocol Independent Mcast */
#define IPPROTO_CARP 112 /* CARP */
#define IPPROTO_PGM 113 /* PGM */
#define IPPROTO_MPLS 137 /* MPLS-in-IP */
#define IPPROTO_PFSYNC 240 /* PFSYNC */
+#define IPPROTO_RESERVED_253 253 /* Reserved */
+#define IPPROTO_RESERVED_254 254 /* Reserved */
/* 255: Reserved */
/* BSD Private, local use, namespace incursion, no longer used */
#define IPPROTO_OLD_DIVERT 254 /* OLD divert pseudo-proto */
@@ -343,61 +348,61 @@ __END_DECLS
* On subnets, the decomposition of addresses to host and net parts
* is done according to subnet mask, not the masks here.
*/
-#define IN_CLASSA(i) (((u_int32_t)(i) & 0x80000000) == 0)
+#define IN_CLASSA(i) (((in_addr_t)(i) & 0x80000000) == 0)
#define IN_CLASSA_NET 0xff000000
#define IN_CLASSA_NSHIFT 24
#define IN_CLASSA_HOST 0x00ffffff
#define IN_CLASSA_MAX 128
-#define IN_CLASSB(i) (((u_int32_t)(i) & 0xc0000000) == 0x80000000)
+#define IN_CLASSB(i) (((in_addr_t)(i) & 0xc0000000) == 0x80000000)
#define IN_CLASSB_NET 0xffff0000
#define IN_CLASSB_NSHIFT 16
#define IN_CLASSB_HOST 0x0000ffff
#define IN_CLASSB_MAX 65536
-#define IN_CLASSC(i) (((u_int32_t)(i) & 0xe0000000) == 0xc0000000)
+#define IN_CLASSC(i) (((in_addr_t)(i) & 0xe0000000) == 0xc0000000)
#define IN_CLASSC_NET 0xffffff00
#define IN_CLASSC_NSHIFT 8
#define IN_CLASSC_HOST 0x000000ff
-#define IN_CLASSD(i) (((u_int32_t)(i) & 0xf0000000) == 0xe0000000)
+#define IN_CLASSD(i) (((in_addr_t)(i) & 0xf0000000) == 0xe0000000)
#define IN_CLASSD_NET 0xf0000000 /* These ones aren't really */
#define IN_CLASSD_NSHIFT 28 /* net and host fields, but */
#define IN_CLASSD_HOST 0x0fffffff /* routing needn't know. */
#define IN_MULTICAST(i) IN_CLASSD(i)
-#define IN_EXPERIMENTAL(i) (((u_int32_t)(i) & 0xf0000000) == 0xf0000000)
-#define IN_BADCLASS(i) (((u_int32_t)(i) & 0xf0000000) == 0xf0000000)
+#define IN_EXPERIMENTAL(i) (((in_addr_t)(i) & 0xf0000000) == 0xf0000000)
+#define IN_BADCLASS(i) (((in_addr_t)(i) & 0xf0000000) == 0xf0000000)
-#define IN_LINKLOCAL(i) (((u_int32_t)(i) & 0xffff0000) == 0xa9fe0000)
-#define IN_LOOPBACK(i) (((u_int32_t)(i) & 0xff000000) == 0x7f000000)
-#define IN_ZERONET(i) (((u_int32_t)(i) & 0xff000000) == 0)
+#define IN_LINKLOCAL(i) (((in_addr_t)(i) & 0xffff0000) == 0xa9fe0000)
+#define IN_LOOPBACK(i) (((in_addr_t)(i) & 0xff000000) == 0x7f000000)
+#define IN_ZERONET(i) (((in_addr_t)(i) & 0xff000000) == 0)
-#define IN_PRIVATE(i) ((((u_int32_t)(i) & 0xff000000) == 0x0a000000) || \
- (((u_int32_t)(i) & 0xfff00000) == 0xac100000) || \
- (((u_int32_t)(i) & 0xffff0000) == 0xc0a80000))
+#define IN_PRIVATE(i) ((((in_addr_t)(i) & 0xff000000) == 0x0a000000) || \
+ (((in_addr_t)(i) & 0xfff00000) == 0xac100000) || \
+ (((in_addr_t)(i) & 0xffff0000) == 0xc0a80000))
-#define IN_LOCAL_GROUP(i) (((u_int32_t)(i) & 0xffffff00) == 0xe0000000)
+#define IN_LOCAL_GROUP(i) (((in_addr_t)(i) & 0xffffff00) == 0xe0000000)
#define IN_ANY_LOCAL(i) (IN_LINKLOCAL(i) || IN_LOCAL_GROUP(i))
-#define INADDR_LOOPBACK (u_int32_t)0x7f000001
+#define INADDR_LOOPBACK ((in_addr_t)0x7f000001)
#ifndef _KERNEL
-#define INADDR_NONE 0xffffffff /* -1 return */
+#define INADDR_NONE ((in_addr_t)0xffffffff) /* -1 return */
#endif
-#define INADDR_UNSPEC_GROUP (u_int32_t)0xe0000000 /* 224.0.0.0 */
-#define INADDR_ALLHOSTS_GROUP (u_int32_t)0xe0000001 /* 224.0.0.1 */
-#define INADDR_ALLRTRS_GROUP (u_int32_t)0xe0000002 /* 224.0.0.2 */
-#define INADDR_ALLRPTS_GROUP (u_int32_t)0xe0000016 /* 224.0.0.22, IGMPv3 */
-#define INADDR_CARP_GROUP (u_int32_t)0xe0000012 /* 224.0.0.18 */
-#define INADDR_PFSYNC_GROUP (u_int32_t)0xe00000f0 /* 224.0.0.240 */
-#define INADDR_ALLMDNS_GROUP (u_int32_t)0xe00000fb /* 224.0.0.251 */
-#define INADDR_MAX_LOCAL_GROUP (u_int32_t)0xe00000ff /* 224.0.0.255 */
+#define INADDR_UNSPEC_GROUP ((in_addr_t)0xe0000000) /* 224.0.0.0 */
+#define INADDR_ALLHOSTS_GROUP ((in_addr_t)0xe0000001) /* 224.0.0.1 */
+#define INADDR_ALLRTRS_GROUP ((in_addr_t)0xe0000002) /* 224.0.0.2 */
+#define INADDR_ALLRPTS_GROUP ((in_addr_t)0xe0000016) /* 224.0.0.22, IGMPv3 */
+#define INADDR_CARP_GROUP ((in_addr_t)0xe0000012) /* 224.0.0.18 */
+#define INADDR_PFSYNC_GROUP ((in_addr_t)0xe00000f0) /* 224.0.0.240 */
+#define INADDR_ALLMDNS_GROUP ((in_addr_t)0xe00000fb) /* 224.0.0.251 */
+#define INADDR_MAX_LOCAL_GROUP ((in_addr_t)0xe00000ff) /* 224.0.0.255 */
#define IN_LOOPBACKNET 127 /* official! */
-#define IN_RFC3021_MASK (u_int32_t)0xfffffffe
+#define IN_RFC3021_MASK ((in_addr_t)0xfffffffe)
/*
* Options for use with [gs]etsockopt at the IP level.
@@ -427,10 +432,11 @@ __END_DECLS
#define IP_RECVIF 20 /* bool; receive reception if w/dgram */
/* for IPSEC */
#define IP_IPSEC_POLICY 21 /* int; set/get security policy */
-#define IP_FAITH 22 /* bool; accept FAITH'ed connections */
-
+ /* unused; was IP_FAITH */
#define IP_ONESBCAST 23 /* bool: send all-ones broadcast */
#define IP_BINDANY 24 /* bool: allow bind to any address */
+#define IP_BINDMULTI 25 /* bool: allow multiple listeners on a tuple */
+#define IP_RSS_LISTEN_BUCKET 26 /* int; set RSS listen bucket */
/*
* Options for controlling the firewall and dummynet.
@@ -485,6 +491,13 @@ __END_DECLS
#define MCAST_BLOCK_SOURCE 84 /* block a source */
#define MCAST_UNBLOCK_SOURCE 85 /* unblock a source */
+/* Flow and RSS definitions */
+#define IP_FLOWID 90 /* get flow id for the given socket/inp */
+#define IP_FLOWTYPE 91 /* get flow type (M_HASHTYPE) */
+#define IP_RSSBUCKETID 92 /* get RSS flowid -> bucket mapping */
+#define IP_RECVFLOWID 93 /* bool; receive IP flowid/flowtype w/ datagram */
+#define IP_RECVRSSBUCKETID 94 /* bool; receive IP RSS bucket id w/ datagram */
+
/*
* Defaults and limits for options
*/
@@ -602,86 +615,7 @@ int getsourcefilter(int, uint32_t, struct sockaddr *, socklen_t,
#define IP_PORTRANGE_LOW 2 /* "low" - vouchsafe security */
/*
- * Definitions for inet sysctl operations.
- *
- * Third level is protocol number.
- * Fourth level is desired variable within that protocol.
- */
-#define IPPROTO_MAXID (IPPROTO_AH + 1) /* don't list to IPPROTO_MAX */
-
-#define CTL_IPPROTO_NAMES { \
- { "ip", CTLTYPE_NODE }, \
- { "icmp", CTLTYPE_NODE }, \
- { "igmp", CTLTYPE_NODE }, \
- { "ggp", CTLTYPE_NODE }, \
- { 0, 0 }, \
- { 0, 0 }, \
- { "tcp", CTLTYPE_NODE }, \
- { 0, 0 }, \
- { "egp", CTLTYPE_NODE }, \
- { 0, 0 }, \
- { 0, 0 }, \
- { 0, 0 }, \
- { "pup", CTLTYPE_NODE }, \
- { 0, 0 }, \
- { 0, 0 }, \
- { 0, 0 }, \
- { 0, 0 }, \
- { "udp", CTLTYPE_NODE }, \
- { 0, 0 }, \
- { 0, 0 }, \
- { 0, 0 }, \
- { 0, 0 }, \
- { "idp", CTLTYPE_NODE }, \
- { 0, 0 }, \
- { 0, 0 }, \
- { 0, 0 }, \
- { 0, 0 }, \
- { 0, 0 }, \
- { 0, 0 }, \
- { 0, 0 }, \
- { 0, 0 }, \
- { 0, 0 }, \
- { 0, 0 }, \
- { 0, 0 }, \
- { 0, 0 }, \
- { 0, 0 }, \
- { 0, 0 }, \
- { 0, 0 }, \
- { 0, 0 }, \
- { 0, 0 }, \
- { 0, 0 }, \
- { 0, 0 }, \
- { 0, 0 }, \
- { 0, 0 }, \
- { 0, 0 }, \
- { 0, 0 }, \
- { 0, 0 }, \
- { 0, 0 }, \
- { 0, 0 }, \
- { 0, 0 }, \
- { 0, 0 }, \
- { "ipsec", CTLTYPE_NODE }, \
- { 0, 0 }, \
- { 0, 0 }, \
- { 0, 0 }, \
- { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
- { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
- { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
- { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
- { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
- { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
- { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
- { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
- { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
- { 0, 0 }, \
- { 0, 0 }, \
- { 0, 0 }, \
- { "pim", CTLTYPE_NODE }, \
-}
-
-/*
- * Names for IP sysctl objects
+ * Identifiers for IP sysctl nodes
*/
#define IPCTL_FORWARDING 1 /* act as router */
#define IPCTL_SENDREDIRECTS 2 /* may send redirects when forwarding */
@@ -689,9 +623,9 @@ int getsourcefilter(int, uint32_t, struct sockaddr *, socklen_t,
#ifdef notyet
#define IPCTL_DEFMTU 4 /* default MTU */
#endif
-#define IPCTL_RTEXPIRE 5 /* cloned route expiration time */
-#define IPCTL_RTMINEXPIRE 6 /* min value for expiration time */
-#define IPCTL_RTMAXCACHE 7 /* trigger level for dynamic expire */
+/* IPCTL_RTEXPIRE 5 deprecated */
+/* IPCTL_RTMINEXPIRE 6 deprecated */
+/* IPCTL_RTMAXCACHE 7 deprecated */
#define IPCTL_SOURCEROUTE 8 /* may perform source routes */
#define IPCTL_DIRECTEDBROADCAST 9 /* may re-broadcast received packets */
#define IPCTL_INTRQMAXLEN 10 /* max length of netisr queue */
@@ -699,38 +633,22 @@ int getsourcefilter(int, uint32_t, struct sockaddr *, socklen_t,
#define IPCTL_STATS 12 /* ipstat structure */
#define IPCTL_ACCEPTSOURCEROUTE 13 /* may accept source routed packets */
#define IPCTL_FASTFORWARDING 14 /* use fast IP forwarding code */
-#define IPCTL_KEEPFAITH 15 /* FAITH IPv4->IPv6 translater ctl */
+ /* 15, unused, was: IPCTL_KEEPFAITH */
#define IPCTL_GIF_TTL 16 /* default TTL for gif encap packet */
-#define IPCTL_MAXID 17
-
-#define IPCTL_NAMES { \
- { 0, 0 }, \
- { "forwarding", CTLTYPE_INT }, \
- { "redirect", CTLTYPE_INT }, \
- { "ttl", CTLTYPE_INT }, \
- { "mtu", CTLTYPE_INT }, \
- { "rtexpire", CTLTYPE_INT }, \
- { "rtminexpire", CTLTYPE_INT }, \
- { "rtmaxcache", CTLTYPE_INT }, \
- { "sourceroute", CTLTYPE_INT }, \
- { "directed-broadcast", CTLTYPE_INT }, \
- { "intr-queue-maxlen", CTLTYPE_INT }, \
- { "intr-queue-drops", CTLTYPE_INT }, \
- { "stats", CTLTYPE_STRUCT }, \
- { "accept_sourceroute", CTLTYPE_INT }, \
- { "fastforwarding", CTLTYPE_INT }, \
-}
#endif /* __BSD_VISIBLE */
#ifdef _KERNEL
struct ifnet; struct mbuf; /* forward declarations for Standard C */
+struct in_ifaddr;
int in_broadcast(struct in_addr, struct ifnet *);
+int in_ifaddr_broadcast(struct in_addr, struct in_ifaddr *);
int in_canforward(struct in_addr);
int in_localaddr(struct in_addr);
int in_localip(struct in_addr);
+int in_ifhasaddr(struct ifnet *, struct in_addr);
int inet_aton(const char *, struct in_addr *); /* in libkern */
char *inet_ntoa(struct in_addr); /* in libkern */
char *inet_ntoa_r(struct in_addr ina, char *buf); /* in libkern */
@@ -745,33 +663,6 @@ void in_ifdetach(struct ifnet *);
#define satosin(sa) ((struct sockaddr_in *)(sa))
#define sintosa(sin) ((struct sockaddr *)(sin))
#define ifatoia(ifa) ((struct in_ifaddr *)(ifa))
-
-/*
- * Historically, BSD keeps ip_len and ip_off in host format
- * when doing layer 3 processing, and this often requires
- * to translate the format back and forth.
- * To make the process explicit, we define a couple of macros
- * that also take into account the fact that at some point
- * we may want to keep those fields always in net format.
- */
-
-#if (BYTE_ORDER == BIG_ENDIAN) || defined(HAVE_NET_IPLEN)
-#define SET_NET_IPLEN(p) do {} while (0)
-#define SET_HOST_IPLEN(p) do {} while (0)
-#else
-#define SET_NET_IPLEN(p) do { \
- struct ip *h_ip = (p); \
- h_ip->ip_len = htons(h_ip->ip_len); \
- h_ip->ip_off = htons(h_ip->ip_off); \
- } while (0)
-
-#define SET_HOST_IPLEN(p) do { \
- struct ip *h_ip = (p); \
- h_ip->ip_len = ntohs(h_ip->ip_len); \
- h_ip->ip_off = ntohs(h_ip->ip_off); \
- } while (0)
-#endif /* !HAVE_NET_IPLEN */
-
#endif /* _KERNEL */
/* INET6 stuff */
diff --git a/freebsd/sys/netinet/in_fib.c b/freebsd/sys/netinet/in_fib.c
new file mode 100644
index 00000000..f1edf976
--- /dev/null
+++ b/freebsd/sys/netinet/in_fib.c
@@ -0,0 +1,235 @@
+#include <machine/rtems-bsd-kernel-space.h>
+
+/*-
+ * Copyright (c) 2015
+ * Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <rtems/bsd/local/opt_inet.h>
+#include <rtems/bsd/local/opt_route.h>
+#include <rtems/bsd/local/opt_mpath.h>
+
+#include <rtems/bsd/sys/param.h>
+#include <sys/systm.h>
+#include <rtems/bsd/sys/lock.h>
+#include <sys/rwlock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/kernel.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/if_dl.h>
+#include <net/route.h>
+#include <net/route_var.h>
+#include <net/vnet.h>
+
+#ifdef RADIX_MPATH
+#include <net/radix_mpath.h>
+#endif
+
+#include <netinet/in.h>
+#include <netinet/in_var.h>
+#include <netinet/in_fib.h>
+
+#ifdef INET
+static void fib4_rte_to_nh_basic(struct rtentry *rte, struct in_addr dst,
+ uint32_t flags, struct nhop4_basic *pnh4);
+static void fib4_rte_to_nh_extended(struct rtentry *rte, struct in_addr dst,
+ uint32_t flags, struct nhop4_extended *pnh4);
+
+#define RNTORT(p) ((struct rtentry *)(p))
+
+static void
+fib4_rte_to_nh_basic(struct rtentry *rte, struct in_addr dst,
+ uint32_t flags, struct nhop4_basic *pnh4)
+{
+ struct sockaddr_in *gw;
+
+ if ((flags & NHR_IFAIF) != 0)
+ pnh4->nh_ifp = rte->rt_ifa->ifa_ifp;
+ else
+ pnh4->nh_ifp = rte->rt_ifp;
+ pnh4->nh_mtu = min(rte->rt_mtu, rte->rt_ifp->if_mtu);
+ if (rte->rt_flags & RTF_GATEWAY) {
+ gw = (struct sockaddr_in *)rte->rt_gateway;
+ pnh4->nh_addr = gw->sin_addr;
+ } else
+ pnh4->nh_addr = dst;
+ /* Set flags */
+ pnh4->nh_flags = fib_rte_to_nh_flags(rte->rt_flags);
+ gw = (struct sockaddr_in *)rt_key(rte);
+ if (gw->sin_addr.s_addr == 0)
+ pnh4->nh_flags |= NHF_DEFAULT;
+ /* TODO: Handle RTF_BROADCAST here */
+}
+
+static void
+fib4_rte_to_nh_extended(struct rtentry *rte, struct in_addr dst,
+ uint32_t flags, struct nhop4_extended *pnh4)
+{
+ struct sockaddr_in *gw;
+ struct in_ifaddr *ia;
+
+ if ((flags & NHR_IFAIF) != 0)
+ pnh4->nh_ifp = rte->rt_ifa->ifa_ifp;
+ else
+ pnh4->nh_ifp = rte->rt_ifp;
+ pnh4->nh_mtu = min(rte->rt_mtu, rte->rt_ifp->if_mtu);
+ if (rte->rt_flags & RTF_GATEWAY) {
+ gw = (struct sockaddr_in *)rte->rt_gateway;
+ pnh4->nh_addr = gw->sin_addr;
+ } else
+ pnh4->nh_addr = dst;
+ /* Set flags */
+ pnh4->nh_flags = fib_rte_to_nh_flags(rte->rt_flags);
+ gw = (struct sockaddr_in *)rt_key(rte);
+ if (gw->sin_addr.s_addr == 0)
+ pnh4->nh_flags |= NHF_DEFAULT;
+ /* XXX: Set RTF_BROADCAST if GW address is broadcast */
+
+ ia = ifatoia(rte->rt_ifa);
+ pnh4->nh_src = IA_SIN(ia)->sin_addr;
+}
+
+/*
+ * Performs IPv4 route table lookup on @dst. Returns 0 on success.
+ * Stores nexthop info in the provided @pnh4 structure.
+ * Note that
+ * - nh_ifp cannot be safely dereferenced
+ * - nh_ifp represents logical transmit interface (rt_ifp) (e.g. if
+ * looking up address on interface "ix0" pointer to "lo0" interface
+ * will be returned instead of "ix0")
+ * - nh_ifp represents "address" interface if NHR_IFAIF flag is passed
+ * - however mtu from "transmit" interface will be returned.
+ */
+int
+fib4_lookup_nh_basic(uint32_t fibnum, struct in_addr dst, uint32_t flags,
+ uint32_t flowid, struct nhop4_basic *pnh4)
+{
+ struct rib_head *rh;
+ struct radix_node *rn;
+ struct sockaddr_in sin;
+ struct rtentry *rte;
+
+ KASSERT((fibnum < rt_numfibs), ("fib4_lookup_nh_basic: bad fibnum"));
+ rh = rt_tables_get_rnh(fibnum, AF_INET);
+ if (rh == NULL)
+ return (ENOENT);
+
+ /* Prepare lookup key */
+ memset(&sin, 0, sizeof(sin));
+ sin.sin_len = sizeof(struct sockaddr_in);
+ sin.sin_addr = dst;
+
+ RIB_RLOCK(rh);
+ rn = rh->rnh_matchaddr((void *)&sin, &rh->head);
+ if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
+ rte = RNTORT(rn);
+ /* Ensure route & ifp is UP */
+ if (RT_LINK_IS_UP(rte->rt_ifp)) {
+ fib4_rte_to_nh_basic(rte, dst, flags, pnh4);
+ RIB_RUNLOCK(rh);
+
+ return (0);
+ }
+ }
+ RIB_RUNLOCK(rh);
+
+ return (ENOENT);
+}
+
+/*
+ * Performs IPv4 route table lookup on @dst. Returns 0 on success.
+ * Stores extended nexthop info in the provided @pnh4 structure.
+ * Note that
+ * - nh_ifp cannot be safely dereferenced unless NHR_REF is specified.
+ * - in that case you need to call fib4_free_nh_ext()
+ * - nh_ifp represents logical transmit interface (rt_ifp) (e.g. if
+ * looking up address of interface "ix0" pointer to "lo0" interface
+ * will be returned instead of "ix0")
+ * - nh_ifp represents "address" interface if NHR_IFAIF flag is passed
+ * - however mtu from "transmit" interface will be returned.
+ */
+int
+fib4_lookup_nh_ext(uint32_t fibnum, struct in_addr dst, uint32_t flags,
+ uint32_t flowid, struct nhop4_extended *pnh4)
+{
+ struct rib_head *rh;
+ struct radix_node *rn;
+ struct sockaddr_in sin;
+ struct rtentry *rte;
+
+ KASSERT((fibnum < rt_numfibs), ("fib4_lookup_nh_ext: bad fibnum"));
+ rh = rt_tables_get_rnh(fibnum, AF_INET);
+ if (rh == NULL)
+ return (ENOENT);
+
+ /* Prepare lookup key */
+ memset(&sin, 0, sizeof(sin));
+ sin.sin_len = sizeof(struct sockaddr_in);
+ sin.sin_addr = dst;
+
+ RIB_RLOCK(rh);
+ rn = rh->rnh_matchaddr((void *)&sin, &rh->head);
+ if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
+ rte = RNTORT(rn);
+#ifdef RADIX_MPATH
+ rte = rt_mpath_select(rte, flowid);
+ if (rte == NULL) {
+ RIB_RUNLOCK(rh);
+ return (ENOENT);
+ }
+#endif
+ /* Ensure route & ifp is UP */
+ if (RT_LINK_IS_UP(rte->rt_ifp)) {
+ fib4_rte_to_nh_extended(rte, dst, flags, pnh4);
+ if ((flags & NHR_REF) != 0) {
+ /* TODO: lwref on egress ifp's ? */
+ }
+ RIB_RUNLOCK(rh);
+
+ return (0);
+ }
+ }
+ RIB_RUNLOCK(rh);
+
+ return (ENOENT);
+}
+
+void
+fib4_free_nh_ext(uint32_t fibnum, struct nhop4_extended *pnh4)
+{
+
+}
+
+#endif
diff --git a/freebsd/sys/netinet/in_fib.h b/freebsd/sys/netinet/in_fib.h
new file mode 100644
index 00000000..754a2e3c
--- /dev/null
+++ b/freebsd/sys/netinet/in_fib.h
@@ -0,0 +1,61 @@
+/*-
+ * Copyright (c) 2015
+ * Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_IN_FIB_H_
+#define _NETINET_IN_FIB_H_
+
+/* Basic nexthop info used for uRPF/mtu checks */
+struct nhop4_basic {
+ struct ifnet *nh_ifp; /* Logical egress interface */
+ uint16_t nh_mtu; /* nexthop mtu */
+ uint16_t nh_flags; /* nhop flags */
+ struct in_addr nh_addr; /* GW/DST IPv4 address */
+};
+
+/* Extended nexthop info used for control protocols */
+struct nhop4_extended {
+ struct ifnet *nh_ifp; /* Logical egress interface */
+ uint16_t nh_mtu; /* nexthop mtu */
+ uint16_t nh_flags; /* nhop flags */
+ uint8_t spare[4];
+ struct in_addr nh_addr; /* GW/DST IPv4 address */
+ struct in_addr nh_src; /* default source IPv4 address */
+ uint64_t spare2[2];
+};
+
+int fib4_lookup_nh_basic(uint32_t fibnum, struct in_addr dst, uint32_t flags,
+ uint32_t flowid, struct nhop4_basic *pnh4);
+int fib4_lookup_nh_ext(uint32_t fibnum, struct in_addr dst, uint32_t flags,
+ uint32_t flowid, struct nhop4_extended *pnh4);
+void fib4_free_nh_ext(uint32_t fibnum, struct nhop4_extended *pnh4);
+
+#endif
+
diff --git a/freebsd/sys/netinet/in_gif.c b/freebsd/sys/netinet/in_gif.c
index 332d7ff4..02e2efd8 100644
--- a/freebsd/sys/netinet/in_gif.c
+++ b/freebsd/sys/netinet/in_gif.c
@@ -1,7 +1,5 @@
#include <machine/rtems-bsd-kernel-space.h>
-/* $KAME: in_gif.c,v 1.54 2001/05/14 14:02:16 itojun Exp $ */
-
/*-
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
@@ -29,16 +27,19 @@
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
+ *
+ * $KAME: in_gif.c,v 1.54 2001/05/14 14:02:16 itojun Exp $
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
-#include <rtems/bsd/local/opt_mrouting.h>
#include <rtems/bsd/local/opt_inet.h>
#include <rtems/bsd/local/opt_inet6.h>
#include <rtems/bsd/sys/param.h>
+#include <rtems/bsd/sys/lock.h>
+#include <sys/rmlock.h>
#include <sys/systm.h>
#include <sys/socket.h>
#include <sys/sockio.h>
@@ -50,6 +51,7 @@ __FBSDID("$FreeBSD$");
#include <sys/malloc.h>
#include <net/if.h>
+#include <net/if_var.h>
#include <net/route.h>
#include <net/vnet.h>
@@ -57,162 +59,56 @@ __FBSDID("$FreeBSD$");
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
-#include <netinet/in_gif.h>
#include <netinet/in_var.h>
#include <netinet/ip_encap.h>
#include <netinet/ip_ecn.h>
+#include <netinet/in_fib.h>
#ifdef INET6
#include <netinet/ip6.h>
#endif
-#ifdef MROUTING
-#include <netinet/ip_mroute.h>
-#endif /* MROUTING */
-
-#include <net/if_gif.h>
+#include <net/if_gif.h>
-static int gif_validate4(const struct ip *, struct gif_softc *,
- struct ifnet *);
+static int in_gif_input(struct mbuf **, int *, int);
extern struct domain inetdomain;
-struct protosw in_gif_protosw = {
+static struct protosw in_gif_protosw = {
.pr_type = SOCK_RAW,
.pr_domain = &inetdomain,
.pr_protocol = 0/* IPPROTO_IPV[46] */,
.pr_flags = PR_ATOMIC|PR_ADDR,
.pr_input = in_gif_input,
- .pr_output = (pr_output_t*)rip_output,
+ .pr_output = rip_output,
.pr_ctloutput = rip_ctloutput,
.pr_usrreqs = &rip_usrreqs
};
-VNET_DEFINE(int, ip_gif_ttl) = GIF_TTL;
+#define GIF_TTL 30
+static VNET_DEFINE(int, ip_gif_ttl) = GIF_TTL;
#define V_ip_gif_ttl VNET(ip_gif_ttl)
-SYSCTL_VNET_INT(_net_inet_ip, IPCTL_GIF_TTL, gifttl, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_ip, IPCTL_GIF_TTL, gifttl, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(ip_gif_ttl), 0, "");
int
-in_gif_output(struct ifnet *ifp, int family, struct mbuf *m)
+in_gif_output(struct ifnet *ifp, struct mbuf *m, int proto, uint8_t ecn)
{
+ GIF_RLOCK_TRACKER;
struct gif_softc *sc = ifp->if_softc;
- struct sockaddr_in *dst = (struct sockaddr_in *)&sc->gif_ro.ro_dst;
- struct sockaddr_in *sin_src = (struct sockaddr_in *)sc->gif_psrc;
- struct sockaddr_in *sin_dst = (struct sockaddr_in *)sc->gif_pdst;
- struct ip iphdr; /* capsule IP header, host byte ordered */
- struct etherip_header eiphdr;
- int error, len, proto;
- u_int8_t tos;
-
- GIF_LOCK_ASSERT(sc);
-
- if (sin_src == NULL || sin_dst == NULL ||
- sin_src->sin_family != AF_INET ||
- sin_dst->sin_family != AF_INET) {
- m_freem(m);
- return EAFNOSUPPORT;
- }
-
- switch (family) {
-#ifdef INET
- case AF_INET:
- {
- struct ip *ip;
-
- proto = IPPROTO_IPV4;
- if (m->m_len < sizeof(*ip)) {
- m = m_pullup(m, sizeof(*ip));
- if (!m)
- return ENOBUFS;
- }
- ip = mtod(m, struct ip *);
- tos = ip->ip_tos;
- break;
- }
-#endif /* INET */
-#ifdef INET6
- case AF_INET6:
- {
- struct ip6_hdr *ip6;
- proto = IPPROTO_IPV6;
- if (m->m_len < sizeof(*ip6)) {
- m = m_pullup(m, sizeof(*ip6));
- if (!m)
- return ENOBUFS;
- }
- ip6 = mtod(m, struct ip6_hdr *);
- tos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
- break;
- }
-#endif /* INET6 */
- case AF_LINK:
- proto = IPPROTO_ETHERIP;
-
- /*
- * GIF_SEND_REVETHIP (disabled by default) intentionally
- * sends an EtherIP packet with revered version field in
- * the header. This is a knob for backward compatibility
- * with FreeBSD 7.2R or prior.
- */
- if ((sc->gif_options & GIF_SEND_REVETHIP)) {
- eiphdr.eip_ver = 0;
- eiphdr.eip_resvl = ETHERIP_VERSION;
- eiphdr.eip_resvh = 0;
- } else {
- eiphdr.eip_ver = ETHERIP_VERSION;
- eiphdr.eip_resvl = 0;
- eiphdr.eip_resvh = 0;
- }
- /* prepend Ethernet-in-IP header */
- M_PREPEND(m, sizeof(struct etherip_header), M_DONTWAIT);
- if (m && m->m_len < sizeof(struct etherip_header))
- m = m_pullup(m, sizeof(struct etherip_header));
- if (m == NULL)
- return ENOBUFS;
- bcopy(&eiphdr, mtod(m, struct etherip_header *),
- sizeof(struct etherip_header));
- break;
-
- default:
-#ifdef DEBUG
- printf("in_gif_output: warning: unknown family %d passed\n",
- family);
-#endif
- m_freem(m);
- return EAFNOSUPPORT;
- }
-
- bzero(&iphdr, sizeof(iphdr));
- iphdr.ip_src = sin_src->sin_addr;
- /* bidirectional configured tunnel mode */
- if (sin_dst->sin_addr.s_addr != INADDR_ANY)
- iphdr.ip_dst = sin_dst->sin_addr;
- else {
- m_freem(m);
- return ENETUNREACH;
- }
- iphdr.ip_p = proto;
- /* version will be set in ip_output() */
- iphdr.ip_ttl = V_ip_gif_ttl;
- iphdr.ip_len = m->m_pkthdr.len + sizeof(struct ip);
- ip_ecn_ingress((ifp->if_flags & IFF_LINK1) ? ECN_ALLOWED : ECN_NOCARE,
- &iphdr.ip_tos, &tos);
+ struct ip *ip;
+ int len;
/* prepend new IP header */
len = sizeof(struct ip);
#ifndef __NO_STRICT_ALIGNMENT
- if (family == AF_LINK)
+ if (proto == IPPROTO_ETHERIP)
len += ETHERIP_ALIGN;
#endif
- M_PREPEND(m, len, M_DONTWAIT);
- if (m != NULL && m->m_len < len)
- m = m_pullup(m, len);
- if (m == NULL) {
- printf("ENOBUFS in in_gif_output %d\n", __LINE__);
- return ENOBUFS;
- }
+ M_PREPEND(m, len, M_NOWAIT);
+ if (m == NULL)
+ return (ENOBUFS);
#ifndef __NO_STRICT_ALIGNMENT
- if (family == AF_LINK) {
+ if (proto == IPPROTO_ETHERIP) {
len = mtod(m, vm_offset_t) & 3;
KASSERT(len == 0 || len == ETHERIP_ALIGN,
("in_gif_output: unexpected misalignment"));
@@ -220,212 +116,51 @@ in_gif_output(struct ifnet *ifp, int family, struct mbuf *m)
m->m_len -= ETHERIP_ALIGN;
}
#endif
- bcopy(&iphdr, mtod(m, struct ip *), sizeof(struct ip));
-
- M_SETFIB(m, sc->gif_fibnum);
-
- if (dst->sin_family != sin_dst->sin_family ||
- dst->sin_addr.s_addr != sin_dst->sin_addr.s_addr) {
- /* cache route doesn't match */
- bzero(dst, sizeof(*dst));
- dst->sin_family = sin_dst->sin_family;
- dst->sin_len = sizeof(struct sockaddr_in);
- dst->sin_addr = sin_dst->sin_addr;
- if (sc->gif_ro.ro_rt) {
- RTFREE(sc->gif_ro.ro_rt);
- sc->gif_ro.ro_rt = NULL;
- }
-#if 0
- GIF2IFP(sc)->if_mtu = GIF_MTU;
-#endif
- }
-
- if (sc->gif_ro.ro_rt == NULL) {
- in_rtalloc_ign(&sc->gif_ro, 0, sc->gif_fibnum);
- if (sc->gif_ro.ro_rt == NULL) {
- m_freem(m);
- return ENETUNREACH;
- }
-
- /* if it constitutes infinite encapsulation, punt. */
- if (sc->gif_ro.ro_rt->rt_ifp == ifp) {
- m_freem(m);
- return ENETUNREACH; /* XXX */
- }
-#if 0
- ifp->if_mtu = sc->gif_ro.ro_rt->rt_ifp->if_mtu
- - sizeof(struct ip);
-#endif
+ ip = mtod(m, struct ip *);
+ GIF_RLOCK(sc);
+ if (sc->gif_family != AF_INET) {
+ m_freem(m);
+ GIF_RUNLOCK(sc);
+ return (ENETDOWN);
}
+ bcopy(sc->gif_iphdr, ip, sizeof(struct ip));
+ GIF_RUNLOCK(sc);
- m_addr_changed(m);
-
- error = ip_output(m, NULL, &sc->gif_ro, 0, NULL, NULL);
-
- if (!(GIF2IFP(sc)->if_flags & IFF_LINK0) &&
- sc->gif_ro.ro_rt != NULL) {
- RTFREE(sc->gif_ro.ro_rt);
- sc->gif_ro.ro_rt = NULL;
- }
+ ip->ip_p = proto;
+ /* version will be set in ip_output() */
+ ip->ip_ttl = V_ip_gif_ttl;
+ ip->ip_len = htons(m->m_pkthdr.len);
+ ip->ip_tos = ecn;
- return (error);
+ return (ip_output(m, NULL, NULL, 0, NULL, NULL));
}
-void
-in_gif_input(struct mbuf *m, int off)
+static int
+in_gif_input(struct mbuf **mp, int *offp, int proto)
{
- struct ifnet *gifp = NULL;
+ struct mbuf *m = *mp;
struct gif_softc *sc;
+ struct ifnet *gifp;
struct ip *ip;
- int af;
- u_int8_t otos;
- int proto;
+ uint8_t ecn;
- ip = mtod(m, struct ip *);
- proto = ip->ip_p;
-
- sc = (struct gif_softc *)encap_getarg(m);
+ sc = encap_getarg(m);
if (sc == NULL) {
m_freem(m);
KMOD_IPSTAT_INC(ips_nogif);
- return;
+ return (IPPROTO_DONE);
}
-
gifp = GIF2IFP(sc);
- if (gifp == NULL || (gifp->if_flags & IFF_UP) == 0) {
- m_freem(m);
- KMOD_IPSTAT_INC(ips_nogif);
- return;
- }
-
- otos = ip->ip_tos;
- m_adj(m, off);
-
- switch (proto) {
-#ifdef INET
- case IPPROTO_IPV4:
- {
- struct ip *ip;
- af = AF_INET;
- if (m->m_len < sizeof(*ip)) {
- m = m_pullup(m, sizeof(*ip));
- if (!m)
- return;
- }
+ if ((gifp->if_flags & IFF_UP) != 0) {
ip = mtod(m, struct ip *);
- if (ip_ecn_egress((gifp->if_flags & IFF_LINK1) ?
- ECN_ALLOWED : ECN_NOCARE,
- &otos, &ip->ip_tos) == 0) {
- m_freem(m);
- return;
- }
- break;
- }
-#endif
-#ifdef INET6
- case IPPROTO_IPV6:
- {
- struct ip6_hdr *ip6;
- u_int8_t itos, oitos;
-
- af = AF_INET6;
- if (m->m_len < sizeof(*ip6)) {
- m = m_pullup(m, sizeof(*ip6));
- if (!m)
- return;
- }
- ip6 = mtod(m, struct ip6_hdr *);
- itos = oitos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
- if (ip_ecn_egress((gifp->if_flags & IFF_LINK1) ?
- ECN_ALLOWED : ECN_NOCARE,
- &otos, &itos) == 0) {
- m_freem(m);
- return;
- }
- if (itos != oitos) {
- ip6->ip6_flow &= ~htonl(0xff << 20);
- ip6->ip6_flow |= htonl((u_int32_t)itos << 20);
- }
- break;
- }
-#endif /* INET6 */
- case IPPROTO_ETHERIP:
- af = AF_LINK;
- break;
-
- default:
- KMOD_IPSTAT_INC(ips_nogif);
+ ecn = ip->ip_tos;
+ m_adj(m, *offp);
+ gif_input(m, gifp, proto, ecn);
+ } else {
m_freem(m);
- return;
- }
- gif_input(m, af, gifp);
- return;
-}
-
-/*
- * validate outer address.
- */
-static int
-gif_validate4(const struct ip *ip, struct gif_softc *sc, struct ifnet *ifp)
-{
- struct sockaddr_in *src, *dst;
- struct in_ifaddr *ia4;
-
- src = (struct sockaddr_in *)sc->gif_psrc;
- dst = (struct sockaddr_in *)sc->gif_pdst;
-
- /* check for address match */
- if (src->sin_addr.s_addr != ip->ip_dst.s_addr ||
- dst->sin_addr.s_addr != ip->ip_src.s_addr)
- return 0;
-
- /* martian filters on outer source - NOT done in ip_input! */
- if (IN_MULTICAST(ntohl(ip->ip_src.s_addr)))
- return 0;
- switch ((ntohl(ip->ip_src.s_addr) & 0xff000000) >> 24) {
- case 0: case 127: case 255:
- return 0;
- }
-
- /* reject packets with broadcast on source */
- /* XXXRW: should use hash lists? */
- IN_IFADDR_RLOCK();
- TAILQ_FOREACH(ia4, &V_in_ifaddrhead, ia_link) {
- if ((ia4->ia_ifa.ifa_ifp->if_flags & IFF_BROADCAST) == 0)
- continue;
- if (ip->ip_src.s_addr == ia4->ia_broadaddr.sin_addr.s_addr) {
- IN_IFADDR_RUNLOCK();
- return 0;
- }
- }
- IN_IFADDR_RUNLOCK();
-
- /* ingress filters on outer source */
- if ((GIF2IFP(sc)->if_flags & IFF_LINK2) == 0 && ifp) {
- struct sockaddr_in sin;
- struct rtentry *rt;
-
- bzero(&sin, sizeof(sin));
- sin.sin_family = AF_INET;
- sin.sin_len = sizeof(struct sockaddr_in);
- sin.sin_addr = ip->ip_src;
- /* XXX MRT check for the interface we would use on output */
- rt = in_rtalloc1((struct sockaddr *)&sin, 0,
- 0UL, sc->gif_fibnum);
- if (!rt || rt->rt_ifp != ifp) {
-#if 0
- log(LOG_WARNING, "%s: packet from 0x%x dropped "
- "due to ingress filter\n", if_name(GIF2IFP(sc)),
- (u_int32_t)ntohl(sin.sin_addr.s_addr));
-#endif
- if (rt)
- RTFREE_LOCKED(rt);
- return 0;
- }
- RTFREE_LOCKED(rt);
+ KMOD_IPSTAT_INC(ips_nogif);
}
-
- return 32 * 2;
+ return (IPPROTO_DONE);
}
/*
@@ -433,39 +168,51 @@ gif_validate4(const struct ip *ip, struct gif_softc *sc, struct ifnet *ifp)
* matched the physical addr family. see gif_encapcheck().
*/
int
-gif_encapcheck4(const struct mbuf *m, int off, int proto, void *arg)
+in_gif_encapcheck(const struct mbuf *m, int off, int proto, void *arg)
{
- struct ip ip;
+ const struct ip *ip;
struct gif_softc *sc;
- struct ifnet *ifp;
+ int ret;
/* sanity check done in caller */
sc = (struct gif_softc *)arg;
+ GIF_RLOCK_ASSERT(sc);
- /* LINTED const cast */
- m_copydata(m, 0, sizeof(ip), (caddr_t)&ip);
- ifp = ((m->m_flags & M_PKTHDR) != 0) ? m->m_pkthdr.rcvif : NULL;
+ /* check for address match */
+ ip = mtod(m, const struct ip *);
+ if (sc->gif_iphdr->ip_src.s_addr != ip->ip_dst.s_addr)
+ return (0);
+ ret = 32;
+ if (sc->gif_iphdr->ip_dst.s_addr != ip->ip_src.s_addr) {
+ if ((sc->gif_options & GIF_IGNORE_SOURCE) == 0)
+ return (0);
+ } else
+ ret += 32;
- return gif_validate4(&ip, sc, ifp);
-}
+ /* ingress filters on outer source */
+ if ((GIF2IFP(sc)->if_flags & IFF_LINK2) == 0) {
+ struct nhop4_basic nh4;
+ struct in_addr dst;
-int
-in_gif_attach(struct gif_softc *sc)
-{
- sc->encap_cookie4 = encap_attach_func(AF_INET, -1, gif_encapcheck,
- &in_gif_protosw, sc);
- if (sc->encap_cookie4 == NULL)
- return EEXIST;
- return 0;
+ dst = ip->ip_src;
+
+ if (fib4_lookup_nh_basic(sc->gif_fibnum, dst, 0, 0, &nh4) != 0)
+ return (0);
+
+ if (nh4.nh_ifp != m->m_pkthdr.rcvif)
+ return (0);
+ }
+ return (ret);
}
int
-in_gif_detach(struct gif_softc *sc)
+in_gif_attach(struct gif_softc *sc)
{
- int error;
- error = encap_detach(sc->encap_cookie4);
- if (error == 0)
- sc->encap_cookie4 = NULL;
- return error;
+ KASSERT(sc->gif_ecookie == NULL, ("gif_ecookie isn't NULL"));
+ sc->gif_ecookie = encap_attach_func(AF_INET, -1, gif_encapcheck,
+ &in_gif_protosw, sc);
+ if (sc->gif_ecookie == NULL)
+ return (EEXIST);
+ return (0);
}
diff --git a/freebsd/sys/netinet/in_kdtrace.h b/freebsd/sys/netinet/in_kdtrace.h
new file mode 100644
index 00000000..a36991ef
--- /dev/null
+++ b/freebsd/sys/netinet/in_kdtrace.h
@@ -0,0 +1,72 @@
+/*-
+ * Copyright (c) 2013 Mark Johnston <markj@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SYS_IN_KDTRACE_H_
+#define _SYS_IN_KDTRACE_H_
+
+#define IP_PROBE(probe, arg0, arg1, arg2, arg3, arg4, arg5) \
+ SDT_PROBE6(ip, , , probe, arg0, arg1, arg2, arg3, arg4, arg5)
+#define UDP_PROBE(probe, arg0, arg1, arg2, arg3, arg4) \
+ SDT_PROBE5(udp, , , probe, arg0, arg1, arg2, arg3, arg4)
+#define TCP_PROBE1(probe, arg0) \
+ SDT_PROBE1(tcp, , , probe, arg0)
+#define TCP_PROBE2(probe, arg0, arg1) \
+ SDT_PROBE2(tcp, , , probe, arg0, arg1)
+#define TCP_PROBE3(probe, arg0, arg1, arg2) \
+ SDT_PROBE3(tcp, , , probe, arg0, arg1, arg2)
+#define TCP_PROBE4(probe, arg0, arg1, arg2, arg3) \
+ SDT_PROBE4(tcp, , , probe, arg0, arg1, arg2, arg3)
+#define TCP_PROBE5(probe, arg0, arg1, arg2, arg3, arg4) \
+ SDT_PROBE5(tcp, , , probe, arg0, arg1, arg2, arg3, arg4)
+#define TCP_PROBE6(probe, arg0, arg1, arg2, arg3, arg4, arg5) \
+ SDT_PROBE6(tcp, , , probe, arg0, arg1, arg2, arg3, arg4, arg5)
+
+SDT_PROVIDER_DECLARE(ip);
+SDT_PROVIDER_DECLARE(tcp);
+SDT_PROVIDER_DECLARE(udp);
+
+SDT_PROBE_DECLARE(ip, , , receive);
+SDT_PROBE_DECLARE(ip, , , send);
+
+SDT_PROBE_DECLARE(tcp, , , accept__established);
+SDT_PROBE_DECLARE(tcp, , , accept__refused);
+SDT_PROBE_DECLARE(tcp, , , connect__established);
+SDT_PROBE_DECLARE(tcp, , , connect__refused);
+SDT_PROBE_DECLARE(tcp, , , connect__request);
+SDT_PROBE_DECLARE(tcp, , , receive);
+SDT_PROBE_DECLARE(tcp, , , send);
+SDT_PROBE_DECLARE(tcp, , , siftr);
+SDT_PROBE_DECLARE(tcp, , , state__change);
+SDT_PROBE_DECLARE(tcp, , , debug__input);
+SDT_PROBE_DECLARE(tcp, , , debug__output);
+SDT_PROBE_DECLARE(tcp, , , debug__user);
+SDT_PROBE_DECLARE(tcp, , , debug__drop);
+
+SDT_PROBE_DECLARE(udp, , , receive);
+SDT_PROBE_DECLARE(udp, , , send);
+
+#endif
diff --git a/freebsd/sys/netinet/in_mcast.c b/freebsd/sys/netinet/in_mcast.c
index 4112046c..3d68718e 100644
--- a/freebsd/sys/netinet/in_mcast.c
+++ b/freebsd/sys/netinet/in_mcast.c
@@ -40,23 +40,28 @@ __FBSDID("$FreeBSD$");
#include <rtems/bsd/sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
+#include <rtems/bsd/sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
+#include <sys/rmlock.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/sysctl.h>
#include <sys/ktr.h>
+#include <sys/taskqueue.h>
#include <sys/tree.h>
#include <net/if.h>
+#include <net/if_var.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <net/vnet.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
+#include <netinet/in_fib.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
@@ -148,6 +153,8 @@ static void inm_purge(struct in_multi *);
static void inm_reap(struct in_multi *);
static struct ip_moptions *
inp_findmoptions(struct inpcb *);
+static void inp_freemoptions_internal(struct ip_moptions *);
+static void inp_gcmoptions(void *, int);
static int inp_get_source_filters(struct inpcb *, struct sockopt *);
static int inp_join_group(struct inpcb *, struct sockopt *);
static int inp_leave_group(struct inpcb *, struct sockopt *);
@@ -164,25 +171,26 @@ static SYSCTL_NODE(_net_inet_ip, OID_AUTO, mcast, CTLFLAG_RW, 0,
static u_long in_mcast_maxgrpsrc = IP_MAX_GROUP_SRC_FILTER;
SYSCTL_ULONG(_net_inet_ip_mcast, OID_AUTO, maxgrpsrc,
- CTLFLAG_RW | CTLFLAG_TUN, &in_mcast_maxgrpsrc, 0,
+ CTLFLAG_RWTUN, &in_mcast_maxgrpsrc, 0,
"Max source filters per group");
-TUNABLE_ULONG("net.inet.ip.mcast.maxgrpsrc", &in_mcast_maxgrpsrc);
static u_long in_mcast_maxsocksrc = IP_MAX_SOCK_SRC_FILTER;
SYSCTL_ULONG(_net_inet_ip_mcast, OID_AUTO, maxsocksrc,
- CTLFLAG_RW | CTLFLAG_TUN, &in_mcast_maxsocksrc, 0,
+ CTLFLAG_RWTUN, &in_mcast_maxsocksrc, 0,
"Max source filters per socket");
-TUNABLE_ULONG("net.inet.ip.mcast.maxsocksrc", &in_mcast_maxsocksrc);
int in_mcast_loop = IP_DEFAULT_MULTICAST_LOOP;
-SYSCTL_INT(_net_inet_ip_mcast, OID_AUTO, loop, CTLFLAG_RW | CTLFLAG_TUN,
+SYSCTL_INT(_net_inet_ip_mcast, OID_AUTO, loop, CTLFLAG_RWTUN,
&in_mcast_loop, 0, "Loopback multicast datagrams by default");
-TUNABLE_INT("net.inet.ip.mcast.loop", &in_mcast_loop);
static SYSCTL_NODE(_net_inet_ip_mcast, OID_AUTO, filters,
CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_ip_mcast_filters,
"Per-interface stack-wide source filters");
+static STAILQ_HEAD(, ip_moptions) imo_gc_list =
+ STAILQ_HEAD_INITIALIZER(imo_gc_list);
+static struct task imo_gc_task = TASK_INITIALIZER(0, inp_gcmoptions, NULL);
+
#ifdef KTR
/*
* Inline function which wraps assertions for a valid ifp.
@@ -222,6 +230,49 @@ imf_init(struct in_mfilter *imf, const int st0, const int st1)
}
/*
+ * Function for looking up an in_multi record for an IPv4 multicast address
+ * on a given interface. ifp must be valid. If no record found, return NULL.
+ * The IN_MULTI_LOCK and IF_ADDR_LOCK on ifp must be held.
+ */
+struct in_multi *
+inm_lookup_locked(struct ifnet *ifp, const struct in_addr ina)
+{
+ struct ifmultiaddr *ifma;
+ struct in_multi *inm;
+
+ IN_MULTI_LOCK_ASSERT();
+ IF_ADDR_LOCK_ASSERT(ifp);
+
+ inm = NULL;
+ TAILQ_FOREACH(ifma, &((ifp)->if_multiaddrs), ifma_link) {
+ if (ifma->ifma_addr->sa_family == AF_INET) {
+ inm = (struct in_multi *)ifma->ifma_protospec;
+ if (inm->inm_addr.s_addr == ina.s_addr)
+ break;
+ inm = NULL;
+ }
+ }
+ return (inm);
+}
+
+/*
+ * Wrapper for inm_lookup_locked().
+ * The IF_ADDR_LOCK will be taken on ifp and released on return.
+ */
+struct in_multi *
+inm_lookup(struct ifnet *ifp, const struct in_addr ina)
+{
+ struct in_multi *inm;
+
+ IN_MULTI_LOCK_ASSERT();
+ IF_ADDR_RLOCK(ifp);
+ inm = inm_lookup_locked(ifp, ina);
+ IF_ADDR_RUNLOCK(ifp);
+
+ return (inm);
+}
+
+/*
* Resize the ip_moptions vector to the next power-of-two minus 1.
* May be called with locks held; do not sleep.
*/
@@ -467,8 +518,8 @@ in_getmulti(struct ifnet *ifp, const struct in_addr *group,
*/
inm = malloc(sizeof(*inm), M_IPMADDR, M_NOWAIT | M_ZERO);
if (inm == NULL) {
- if_delmulti_ifma(ifma);
IF_ADDR_WUNLOCK(ifp);
+ if_delmulti_ifma(ifma);
return (ENOMEM);
}
inm->inm_addr = *group;
@@ -477,12 +528,7 @@ in_getmulti(struct ifnet *ifp, const struct in_addr *group,
inm->inm_ifma = ifma;
inm->inm_refcount = 1;
inm->inm_state = IGMP_NOT_MEMBER;
-
- /*
- * Pending state-changes per group are subject to a bounds check.
- */
- IFQ_SET_MAXLEN(&inm->inm_scq, IGMP_MAX_STATE_CHANGES);
-
+ mbufq_init(&inm->inm_scq, IGMP_MAX_STATE_CHANGES);
inm->inm_st[0].iss_fmode = MCAST_UNDEFINED;
inm->inm_st[1].iss_fmode = MCAST_UNDEFINED;
RB_INIT(&inm->inm_srcs);
@@ -575,7 +621,7 @@ inm_clear_recorded(struct in_multi *inm)
*
* Return 0 if the source didn't exist or was already marked as recorded.
* Return 1 if the source was marked as recorded by this function.
- * Return <0 if any error occured (negated errno code).
+ * Return <0 if any error occurred (negated errno code).
*/
int
inm_record_source(struct in_multi *inm, const in_addr_t naddr)
@@ -1177,11 +1223,8 @@ out_inm_release:
int
in_leavegroup(struct in_multi *inm, /*const*/ struct in_mfilter *imf)
{
- struct ifnet *ifp;
int error;
- ifp = inm->inm_ifp;
-
IN_MULTI_LOCK();
error = in_leavegroup_locked(inm, imf);
IN_MULTI_UNLOCK();
@@ -1238,7 +1281,9 @@ in_leavegroup_locked(struct in_multi *inm, /*const*/ struct in_mfilter *imf)
KASSERT(error == 0, ("%s: failed to merge inm state", __func__));
CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__);
+ CURVNET_SET(inm->inm_ifp->if_vnet);
error = igmp_change_state(inm);
+ CURVNET_RESTORE();
if (error)
CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__);
@@ -1526,17 +1571,29 @@ inp_findmoptions(struct inpcb *inp)
}
/*
- * Discard the IP multicast options (and source filters).
+ * Discard the IP multicast options (and source filters). To minimize
+ * the amount of work done while holding locks such as the INP's
+ * pcbinfo lock (which is used in the receive path), the free
+ * operation is performed asynchronously in a separate task.
*
* SMPng: NOTE: assumes INP write lock is held.
*/
void
inp_freemoptions(struct ip_moptions *imo)
{
- struct in_mfilter *imf;
- size_t idx, nmships;
KASSERT(imo != NULL, ("%s: ip_moptions is NULL", __func__));
+ IN_MULTI_LOCK();
+ STAILQ_INSERT_TAIL(&imo_gc_list, imo, imo_link);
+ IN_MULTI_UNLOCK();
+ taskqueue_enqueue(taskqueue_thread, &imo_gc_task);
+}
+
+static void
+inp_freemoptions_internal(struct ip_moptions *imo)
+{
+ struct in_mfilter *imf;
+ size_t idx, nmships;
nmships = imo->imo_num_memberships;
for (idx = 0; idx < nmships; ++idx) {
@@ -1554,6 +1611,22 @@ inp_freemoptions(struct ip_moptions *imo)
free(imo, M_IPMOPTS);
}
+static void
+inp_gcmoptions(void *context, int pending)
+{
+ struct ip_moptions *imo;
+
+ IN_MULTI_LOCK();
+ while (!STAILQ_EMPTY(&imo_gc_list)) {
+ imo = STAILQ_FIRST(&imo_gc_list);
+ STAILQ_REMOVE_HEAD(&imo_gc_list, imo_link);
+ IN_MULTI_UNLOCK();
+ inp_freemoptions_internal(imo);
+ IN_MULTI_LOCK();
+ }
+ IN_MULTI_UNLOCK();
+}
+
/*
* Atomically get source filters on a socket for an IPv4 multicast group.
* Called with INP lock held; returns with lock released.
@@ -1680,6 +1753,7 @@ inp_get_source_filters(struct inpcb *inp, struct sockopt *sopt)
int
inp_getmoptions(struct inpcb *inp, struct sockopt *sopt)
{
+ struct rm_priotracker in_ifa_tracker;
struct ip_mreqn mreqn;
struct ip_moptions *imo;
struct ifnet *ifp;
@@ -1719,7 +1793,7 @@ inp_getmoptions(struct inpcb *inp, struct sockopt *sopt)
mreqn.imr_address = imo->imo_multicast_addr;
} else if (ifp != NULL) {
mreqn.imr_ifindex = ifp->if_index;
- IFP_TO_IA(ifp, ia);
+ IFP_TO_IA(ifp, ia, &in_ifa_tracker);
if (ia != NULL) {
mreqn.imr_address =
IA_SIN(ia)->sin_addr;
@@ -1738,7 +1812,7 @@ inp_getmoptions(struct inpcb *inp, struct sockopt *sopt)
break;
case IP_MULTICAST_TTL:
- if (imo == 0)
+ if (imo == NULL)
optval = coptval = IP_DEFAULT_MULTICAST_TTL;
else
optval = coptval = imo->imo_multicast_ttl;
@@ -1750,7 +1824,7 @@ inp_getmoptions(struct inpcb *inp, struct sockopt *sopt)
break;
case IP_MULTICAST_LOOP:
- if (imo == 0)
+ if (imo == NULL)
optval = coptval = IP_DEFAULT_MULTICAST_LOOP;
else
optval = coptval = imo->imo_multicast_loop;
@@ -1810,7 +1884,10 @@ static struct ifnet *
inp_lookup_mcast_ifp(const struct inpcb *inp,
const struct sockaddr_in *gsin, const struct in_addr ina)
{
+ struct rm_priotracker in_ifa_tracker;
struct ifnet *ifp;
+ struct nhop4_basic nh4;
+ uint32_t fibnum;
KASSERT(gsin->sin_family == AF_INET, ("%s: not AF_INET", __func__));
KASSERT(IN_MULTICAST(ntohl(gsin->sin_addr.s_addr)),
@@ -1820,21 +1897,15 @@ inp_lookup_mcast_ifp(const struct inpcb *inp,
if (!in_nullhost(ina)) {
INADDR_TO_IFP(ina, ifp);
} else {
- struct route ro;
-
- ro.ro_rt = NULL;
- memcpy(&ro.ro_dst, gsin, sizeof(struct sockaddr_in));
- in_rtalloc_ign(&ro, 0, inp ? inp->inp_inc.inc_fibnum : 0);
- if (ro.ro_rt != NULL) {
- ifp = ro.ro_rt->rt_ifp;
- KASSERT(ifp != NULL, ("%s: null ifp", __func__));
- RTFREE(ro.ro_rt);
- } else {
+ fibnum = inp ? inp->inp_inc.inc_fibnum : 0;
+ if (fib4_lookup_nh_basic(fibnum, gsin->sin_addr, 0, 0, &nh4)==0)
+ ifp = nh4.nh_ifp;
+ else {
struct in_ifaddr *ia;
struct ifnet *mifp;
mifp = NULL;
- IN_IFADDR_RLOCK();
+ IN_IFADDR_RLOCK(&in_ifa_tracker);
TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
mifp = ia->ia_ifp;
if (!(mifp->if_flags & IFF_LOOPBACK) &&
@@ -1843,7 +1914,7 @@ inp_lookup_mcast_ifp(const struct inpcb *inp,
break;
}
}
- IN_IFADDR_RUNLOCK();
+ IN_IFADDR_RUNLOCK(&in_ifa_tracker);
}
}
@@ -2855,7 +2926,7 @@ sysctl_ip_mcast_filters(SYSCTL_HANDLER_ARGS)
return (retval);
}
-#ifdef KTR
+#if defined(KTR) && (KTR_COMPILE & KTR_IGMPV3)
static const char *inm_modestrs[] = { "un", "in", "ex" };
@@ -2910,7 +2981,7 @@ inm_print(const struct in_multi *inm)
inm->inm_timer,
inm_state_str(inm->inm_state),
inm->inm_refcount,
- inm->inm_scq.ifq_len);
+ inm->inm_scq.mq_len);
printf("igi %p nsrc %lu sctimer %u scrv %u\n",
inm->inm_igi,
inm->inm_nsrc,
@@ -2927,7 +2998,7 @@ inm_print(const struct in_multi *inm)
printf("%s: --- end inm %p ---\n", __func__, inm);
}
-#else /* !KTR */
+#else /* !KTR || !(KTR_COMPILE & KTR_IGMPV3) */
void
inm_print(const struct in_multi *inm)
@@ -2935,6 +3006,6 @@ inm_print(const struct in_multi *inm)
}
-#endif /* KTR */
+#endif /* KTR && (KTR_COMPILE & KTR_IGMPV3) */
RB_GENERATE(ip_msource_tree, ip_msource, ims_link, ip_msource_cmp);
diff --git a/freebsd/sys/netinet/in_pcb.c b/freebsd/sys/netinet/in_pcb.c
index b93abadf..f8790938 100644
--- a/freebsd/sys/netinet/in_pcb.c
+++ b/freebsd/sys/netinet/in_pcb.c
@@ -49,14 +49,18 @@ __FBSDID("$FreeBSD$");
#include <rtems/bsd/local/opt_inet.h>
#include <rtems/bsd/local/opt_inet6.h>
#include <rtems/bsd/local/opt_pcbgroup.h>
+#include <rtems/bsd/local/opt_rss.h>
#include <rtems/bsd/sys/param.h>
#include <sys/systm.h>
+#include <rtems/bsd/sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/callout.h>
+#include <sys/eventhandler.h>
#include <sys/domain.h>
#include <sys/protosw.h>
+#include <sys/rmlock.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/priv.h>
@@ -73,8 +77,11 @@ __FBSDID("$FreeBSD$");
#include <vm/uma.h>
#include <net/if.h>
+#include <net/if_var.h>
#include <net/if_types.h>
+#include <net/if_llatbl.h>
#include <net/route.h>
+#include <net/rss_config.h>
#include <net/vnet.h>
#if defined(INET) || defined(INET6)
@@ -150,11 +157,7 @@ sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
{
int error;
-#ifdef VIMAGE
- error = vnet_sysctl_handle_int(oidp, arg1, arg2, req);
-#else
error = sysctl_handle_int(oidp, arg1, arg2, req);
-#endif
if (error == 0) {
RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
@@ -171,38 +174,42 @@ sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0,
"IP Ports");
-SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst,
- CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_lowfirstauto), 0,
- &sysctl_net_ipport_check, "I", "");
-SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast,
- CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_lowlastauto), 0,
- &sysctl_net_ipport_check, "I", "");
-SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, first,
- CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_firstauto), 0,
- &sysctl_net_ipport_check, "I", "");
-SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, last,
- CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_lastauto), 0,
- &sysctl_net_ipport_check, "I", "");
-SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst,
- CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_hifirstauto), 0,
- &sysctl_net_ipport_check, "I", "");
-SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, hilast,
- CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_hilastauto), 0,
- &sysctl_net_ipport_check, "I", "");
-SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh,
- CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedhigh), 0, "");
-SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow,
+SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst,
+ CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
+ &VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I", "");
+SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast,
+ CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
+ &VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I", "");
+SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first,
+ CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
+ &VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I", "");
+SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last,
+ CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
+ &VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I", "");
+SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst,
+ CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
+ &VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I", "");
+SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast,
+ CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
+ &VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I", "");
+SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh,
+ CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE,
+ &VNET_NAME(ipport_reservedhigh), 0, "");
+SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow,
CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, "");
-SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, randomized, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized,
+ CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(ipport_randomized), 0, "Enable random port allocation");
-SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, randomcps, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomcps,
+ CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(ipport_randomcps), 0, "Maximum number of random port "
"allocations before switching to a sequental one");
-SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, randomtime, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtime,
+ CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(ipport_randomtime), 0,
"Minimum time to keep sequental port "
"allocation before switching to a random one");
-#endif
+#endif /* INET */
/*
* in_pcb.c: manage the Protocol Control Blocks.
@@ -225,6 +232,7 @@ in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name,
INP_INFO_LOCK_INIT(pcbinfo, name);
INP_HASH_LOCK_INIT(pcbinfo, "pcbinfohash"); /* XXXRW: argument? */
+ INP_LIST_LOCK_INIT(pcbinfo, "pcbinfolist");
#ifdef VIMAGE
pcbinfo->ipi_vnet = curvnet;
#endif
@@ -242,6 +250,8 @@ in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name,
NULL, NULL, inpcbzone_init, inpcbzone_fini, UMA_ALIGN_PTR,
inpcbzone_flags);
uma_zone_set_max(pcbinfo->ipi_zone, maxsockets);
+ uma_zone_set_warning(pcbinfo->ipi_zone,
+ "kern.ipc.maxsockets limit reached");
}
/*
@@ -261,6 +271,7 @@ in_pcbinfo_destroy(struct inpcbinfo *pcbinfo)
in_pcbgroup_destroy(pcbinfo);
#endif
uma_zdestroy(pcbinfo->ipi_zone);
+ INP_LIST_LOCK_DESTROY(pcbinfo);
INP_HASH_LOCK_DESTROY(pcbinfo);
INP_INFO_LOCK_DESTROY(pcbinfo);
}
@@ -275,7 +286,14 @@ in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
struct inpcb *inp;
int error;
- INP_INFO_WLOCK_ASSERT(pcbinfo);
+#ifdef INVARIANTS
+ if (pcbinfo == &V_tcbinfo) {
+ INP_INFO_RLOCK_ASSERT(pcbinfo);
+ } else {
+ INP_INFO_WLOCK_ASSERT(pcbinfo);
+ }
+#endif
+
error = 0;
inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT);
if (inp == NULL)
@@ -307,6 +325,8 @@ in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
inp->inp_flags |= IN6P_IPV6_V6ONLY;
}
#endif
+ INP_WLOCK(inp);
+ INP_LIST_WLOCK(pcbinfo);
LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list);
pcbinfo->ipi_count++;
so->so_pcb = (caddr_t)inp;
@@ -314,9 +334,9 @@ in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
if (V_ip6_auto_flowlabel)
inp->inp_flags |= IN6P_AUTOFLOWLABEL;
#endif
- INP_WLOCK(inp);
inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
refcount_init(&inp->inp_refcount, 1); /* Reference from inpcbinfo */
+ INP_LIST_WUNLOCK(pcbinfo);
#if defined(IPSEC) || defined(MAC)
out:
if (error != 0) {
@@ -338,8 +358,7 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
return (EINVAL);
- anonport = inp->inp_lport == 0 && (nam == NULL ||
- ((struct sockaddr_in *)nam)->sin_port == 0);
+ anonport = nam == NULL || ((struct sockaddr_in *)nam)->sin_port == 0;
error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr,
&inp->inp_lport, cred);
if (error)
@@ -355,6 +374,9 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
}
#endif
+/*
+ * Select a local port (number) to use.
+ */
#if defined(INET) || defined(INET6)
int
in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp,
@@ -395,13 +417,14 @@ in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp,
lastport = &pcbinfo->ipi_lastport;
}
/*
- * For UDP, use random port allocation as long as the user
+ * For UDP(-Lite), use random port allocation as long as the user
* allows it. For TCP (and as of yet unknown) connections,
* use random port allocation only if the user allows it AND
* ipport_tick() allows it.
*/
if (V_ipport_randomized &&
- (!V_ipport_stoprandom || pcbinfo == &V_udbinfo))
+ (!V_ipport_stoprandom || pcbinfo == &V_udbinfo ||
+ pcbinfo == &V_ulitecbinfo))
dorandom = 1;
else
dorandom = 0;
@@ -411,8 +434,8 @@ in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp,
*/
if (first == last)
dorandom = 0;
- /* Make sure to not include UDP packets in the count. */
- if (pcbinfo != &V_udbinfo)
+ /* Make sure to not include UDP(-Lite) packets in the count. */
+ if (pcbinfo != &V_udbinfo && pcbinfo != &V_ulitecbinfo)
V_ipport_tcpallocs++;
/*
* Instead of having two loops further down counting up or down
@@ -467,7 +490,7 @@ in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp,
#ifdef INET
if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4)
laddrp->s_addr = laddr.s_addr;
-#endif
+#endif
*lportp = lport;
return (0);
@@ -491,6 +514,38 @@ inp_so_options(const struct inpcb *inp)
}
#endif /* INET || INET6 */
+/*
+ * Check if a new BINDMULTI socket is allowed to be created.
+ *
+ * ni points to the new inp.
+ * oi points to the existing inp.
+ *
+ * This checks whether the existing inp also has BINDMULTI and
+ * whether the credentials match.
+ */
+int
+in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi)
+{
+ /* Check permissions match */
+#ifndef __rtems__
+ if ((ni->inp_flags2 & INP_BINDMULTI) &&
+ (ni->inp_cred->cr_uid !=
+ oi->inp_cred->cr_uid))
+ return (0);
+#endif /* __rtems__ */
+
+ /* Check the existing inp has BINDMULTI set */
+ if ((ni->inp_flags2 & INP_BINDMULTI) &&
+ ((oi->inp_flags2 & INP_BINDMULTI) == 0))
+ return (0);
+
+ /*
+ * We're okay - either INP_BINDMULTI isn't set on ni, or
+ * it is and it matches the checks.
+ */
+ return (1);
+}
+
#ifdef INET
/*
* Set up a bind operation on a PCB, performing port allocation
@@ -594,6 +649,7 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
* This entire block sorely needs a rewrite.
*/
if (t &&
+ ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
((t->inp_flags & INP_TIMEWAIT) == 0) &&
(so->so_type != SOCK_STREAM ||
ntohl(t->inp_faddr.s_addr) == INADDR_ANY) &&
@@ -607,6 +663,15 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
0)
#endif /* __rtems__ */
return (EADDRINUSE);
+
+ /*
+ * If the socket is a BINDMULTI socket, then
+ * the credentials need to match and the
+ * original socket also has to have been bound
+ * with BINDMULTI.
+ */
+ if (t && (! in_pcbbind_check_bindmulti(inp, t)))
+ return (EADDRINUSE);
}
t = in_pcblookup_local(pcbinfo, sin->sin_addr,
lport, lookupflags, cred);
@@ -621,7 +686,9 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
if (tw == NULL ||
(reuseport & tw->tw_so_options) == 0)
return (EADDRINUSE);
- } else if (t && (reuseport & inp_so_options(t)) == 0) {
+ } else if (t &&
+ ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
+ (reuseport & inp_so_options(t)) == 0) {
#ifdef INET6
if (ntohl(sin->sin_addr.s_addr) !=
INADDR_ANY ||
@@ -631,6 +698,8 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
(t->inp_vflag & INP_IPV6PROTO) == 0)
#endif
return (EADDRINUSE);
+ if (t && (! in_pcbbind_check_bindmulti(inp, t)))
+ return (EADDRINUSE);
}
}
}
@@ -706,7 +775,7 @@ in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
* Do proper source address selection on an unbound socket in case
* of connect. Take jails into account as well.
*/
-static int
+int
in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr,
struct ucred *cred)
{
@@ -754,9 +823,11 @@ in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr,
struct in_ifaddr *ia;
struct ifnet *ifp;
- ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin));
+ ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin,
+ inp->inp_socket->so_fibnum));
if (ia == NULL)
- ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0));
+ ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0,
+ inp->inp_socket->so_fibnum));
if (ia == NULL) {
error = ENETUNREACH;
goto done;
@@ -871,9 +942,11 @@ in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr,
sain.sin_len = sizeof(struct sockaddr_in);
sain.sin_addr.s_addr = faddr->s_addr;
- ia = ifatoia(ifa_ifwithdstaddr(sintosa(&sain)));
+ ia = ifatoia(ifa_ifwithdstaddr(sintosa(&sain),
+ inp->inp_socket->so_fibnum));
if (ia == NULL)
- ia = ifatoia(ifa_ifwithnet(sintosa(&sain), 0));
+ ia = ifatoia(ifa_ifwithnet(sintosa(&sain), 0,
+ inp->inp_socket->so_fibnum));
if (ia == NULL)
ia = ifatoia(ifa_ifwithaddr(sintosa(&sain)));
@@ -946,6 +1019,7 @@ in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam,
in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp,
struct inpcb **oinpp, struct ucred *cred)
{
+ struct rm_priotracker in_ifa_tracker;
struct sockaddr_in *sin = (struct sockaddr_in *)nam;
struct in_ifaddr *ia;
struct inpcb *oinp;
@@ -982,20 +1056,20 @@ in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam,
* choose the broadcast address for that interface.
*/
if (faddr.s_addr == INADDR_ANY) {
- IN_IFADDR_RLOCK();
+ IN_IFADDR_RLOCK(&in_ifa_tracker);
faddr =
IA_SIN(TAILQ_FIRST(&V_in_ifaddrhead))->sin_addr;
- IN_IFADDR_RUNLOCK();
+ IN_IFADDR_RUNLOCK(&in_ifa_tracker);
if (cred != NULL &&
(error = prison_get_ip4(cred, &faddr)) != 0)
return (error);
} else if (faddr.s_addr == (u_long)INADDR_BROADCAST) {
- IN_IFADDR_RLOCK();
+ IN_IFADDR_RLOCK(&in_ifa_tracker);
if (TAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags &
IFF_BROADCAST)
faddr = satosin(&TAILQ_FIRST(
&V_in_ifaddrhead)->ia_broadaddr)->sin_addr;
- IN_IFADDR_RUNLOCK();
+ IN_IFADDR_RUNLOCK(&in_ifa_tracker);
}
}
if (laddr.s_addr == INADDR_ANY) {
@@ -1013,7 +1087,7 @@ in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam,
imo = inp->inp_moptions;
if (imo->imo_multicast_ifp != NULL) {
ifp = imo->imo_multicast_ifp;
- IN_IFADDR_RLOCK();
+ IN_IFADDR_RLOCK(&in_ifa_tracker);
TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
if ((ia->ia_ifp == ifp) &&
(cred == NULL ||
@@ -1027,7 +1101,7 @@ in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam,
laddr = ia->ia_addr.sin_addr;
error = 0;
}
- IN_IFADDR_RUNLOCK();
+ IN_IFADDR_RUNLOCK(&in_ifa_tracker);
}
}
if (error)
@@ -1064,7 +1138,7 @@ in_pcbdisconnect(struct inpcb *inp)
inp->inp_fport = 0;
in_pcbrehash(inp);
}
-#endif
+#endif /* INET */
/*
* in_pcbdetach() is responsibe for disassociating a socket from an inpcb.
@@ -1160,8 +1234,17 @@ in_pcbrele_wlocked(struct inpcb *inp)
INP_WLOCK_ASSERT(inp);
- if (refcount_release(&inp->inp_refcount) == 0)
+ if (refcount_release(&inp->inp_refcount) == 0) {
+ /*
+ * If the inpcb has been freed, let the caller know, even if
+ * this isn't the last reference.
+ */
+ if (inp->inp_flags2 & INP_FREED) {
+ INP_WUNLOCK(inp);
+ return (1);
+ }
return (0);
+ }
KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
@@ -1197,16 +1280,24 @@ in_pcbfree(struct inpcb *inp)
KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
- INP_INFO_WLOCK_ASSERT(pcbinfo);
+#ifdef INVARIANTS
+ if (pcbinfo == &V_tcbinfo) {
+ INP_INFO_LOCK_ASSERT(pcbinfo);
+ } else {
+ INP_INFO_WLOCK_ASSERT(pcbinfo);
+ }
+#endif
INP_WLOCK_ASSERT(inp);
/* XXXRW: Do as much as possible here. */
#ifdef IPSEC
if (inp->inp_sp != NULL)
ipsec_delete_pcbpolicy(inp);
-#endif /* IPSEC */
+#endif
+ INP_LIST_WLOCK(pcbinfo);
inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
in_pcbremlists(inp);
+ INP_LIST_WUNLOCK(pcbinfo);
#ifdef INET6
if (inp->inp_vflag & INP_IPV6PROTO) {
ip6_freepcbopts(inp->in6p_outputopts);
@@ -1220,6 +1311,13 @@ in_pcbfree(struct inpcb *inp)
if (inp->inp_moptions != NULL)
inp_freemoptions(inp->inp_moptions);
#endif
+ if (inp->inp_route.ro_rt) {
+ RTFREE(inp->inp_route.ro_rt);
+ inp->inp_route.ro_rt = (struct rtentry *)NULL;
+ }
+ if (inp->inp_route.ro_lle)
+ LLE_FREE(inp->inp_route.ro_lle); /* zeros ro_lle */
+
inp->inp_vflag = 0;
inp->inp_flags2 |= INP_FREED;
crfree(inp->inp_cred);
@@ -1363,7 +1461,7 @@ in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
struct ip_moptions *imo;
int i, gap;
- INP_INFO_RLOCK(pcbinfo);
+ INP_INFO_WLOCK(pcbinfo);
LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
INP_WLOCK(inp);
imo = inp->inp_moptions;
@@ -1393,7 +1491,7 @@ in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
}
INP_WUNLOCK(inp);
}
- INP_INFO_RUNLOCK(pcbinfo);
+ INP_INFO_WUNLOCK(pcbinfo);
}
/*
@@ -1565,6 +1663,83 @@ in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup,
goto found;
}
+#ifdef RSS
+ /*
+ * For incoming connections, we may wish to do a wildcard
+ * match for an RSS-local socket.
+ */
+ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
+ struct inpcb *local_wild = NULL, *local_exact = NULL;
+#ifdef INET6
+ struct inpcb *local_wild_mapped = NULL;
+#endif
+ struct inpcb *jail_wild = NULL;
+ struct inpcbhead *head;
+ int injail;
+
+ /*
+ * Order of socket selection - we always prefer jails.
+ * 1. jailed, non-wild.
+ * 2. jailed, wild.
+ * 3. non-jailed, non-wild.
+ * 4. non-jailed, wild.
+ */
+
+ head = &pcbgroup->ipg_hashbase[INP_PCBHASH(INADDR_ANY,
+ lport, 0, pcbgroup->ipg_hashmask)];
+ LIST_FOREACH(inp, head, inp_pcbgrouphash) {
+#ifdef INET6
+ /* XXX inp locking */
+ if ((inp->inp_vflag & INP_IPV4) == 0)
+ continue;
+#endif
+ if (inp->inp_faddr.s_addr != INADDR_ANY ||
+ inp->inp_lport != lport)
+ continue;
+
+ injail = prison_flag(inp->inp_cred, PR_IP4);
+ if (injail) {
+ if (prison_check_ip4(inp->inp_cred,
+ &laddr) != 0)
+ continue;
+ } else {
+ if (local_exact != NULL)
+ continue;
+ }
+
+ if (inp->inp_laddr.s_addr == laddr.s_addr) {
+ if (injail)
+ goto found;
+ else
+ local_exact = inp;
+ } else if (inp->inp_laddr.s_addr == INADDR_ANY) {
+#ifdef INET6
+ /* XXX inp locking, NULL check */
+ if (inp->inp_vflag & INP_IPV6PROTO)
+ local_wild_mapped = inp;
+ else
+#endif
+ if (injail)
+ jail_wild = inp;
+ else
+ local_wild = inp;
+ }
+ } /* LIST_FOREACH */
+
+ inp = jail_wild;
+ if (inp == NULL)
+ inp = local_exact;
+ if (inp == NULL)
+ inp = local_wild;
+#ifdef INET6
+ if (inp == NULL)
+ inp = local_wild_mapped;
+#endif
+ if (inp != NULL)
+ goto found;
+ }
+#endif
+
/*
* Then look for a wildcard match, if requested.
*/
@@ -1596,11 +1771,6 @@ in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup,
inp->inp_lport != lport)
continue;
- /* XXX inp locking */
- if (ifp && ifp->if_type == IFT_FAITH &&
- (inp->inp_flags & INP_FAITH) == 0)
- continue;
-
injail = prison_flag(inp->inp_cred, PR_IP4);
if (injail) {
if (prison_check_ip4(inp->inp_cred,
@@ -1622,7 +1792,7 @@ in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup,
if (inp->inp_vflag & INP_IPV6PROTO)
local_wild_mapped = inp;
else
-#endif /* INET6 */
+#endif
if (injail)
jail_wild = inp;
else
@@ -1637,7 +1807,7 @@ in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup,
#ifdef INET6
if (inp == NULL)
inp = local_wild_mapped;
-#endif /* defined(INET6) */
+#endif
if (inp != NULL)
goto found;
} /* if (lookupflags & INPLOOKUP_WILDCARD) */
@@ -1741,11 +1911,6 @@ in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
inp->inp_lport != lport)
continue;
- /* XXX inp locking */
- if (ifp && ifp->if_type == IFT_FAITH &&
- (inp->inp_flags & INP_FAITH) == 0)
- continue;
-
injail = prison_flag(inp->inp_cred, PR_IP4);
if (injail) {
if (prison_check_ip4(inp->inp_cred,
@@ -1767,7 +1932,7 @@ in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
if (inp->inp_vflag & INP_IPV6PROTO)
local_wild_mapped = inp;
else
-#endif /* INET6 */
+#endif
if (injail)
jail_wild = inp;
else
@@ -1783,7 +1948,7 @@ in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
#ifdef INET6
if (local_wild_mapped != NULL)
return (local_wild_mapped);
-#endif /* defined(INET6) */
+#endif
} /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */
return (NULL);
@@ -1832,7 +1997,7 @@ struct inpcb *
in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport,
struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp)
{
-#if defined(PCBGROUP)
+#if defined(PCBGROUP) && !defined(RSS)
struct inpcbgroup *pcbgroup;
#endif
@@ -1841,7 +2006,17 @@ in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport,
KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
("%s: LOCKPCB not set", __func__));
-#if defined(PCBGROUP)
+ /*
+ * When not using RSS, use connection groups in preference to the
+ * reservation table when looking up 4-tuples. When using RSS, just
+ * use the reservation table, due to the cost of the Toeplitz hash
+ * in software.
+ *
+ * XXXRW: This policy belongs in the pcbgroup code, as in principle
+ * we could be doing RSS with a non-Toeplitz hash that is affordable
+ * in software.
+ */
+#if defined(PCBGROUP) && !defined(RSS)
if (in_pcbgroup_enabled(pcbinfo)) {
pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr,
fport);
@@ -1868,16 +2043,27 @@ in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr,
("%s: LOCKPCB not set", __func__));
#ifdef PCBGROUP
- if (in_pcbgroup_enabled(pcbinfo)) {
+ /*
+ * If we can use a hardware-generated hash to look up the connection
+ * group, use that connection group to find the inpcb. Otherwise
+ * fall back on a software hash -- or the reservation table if we're
+ * using RSS.
+ *
+ * XXXRW: As above, that policy belongs in the pcbgroup code.
+ */
+ if (in_pcbgroup_enabled(pcbinfo) &&
+ !(M_HASHTYPE_TEST(m, M_HASHTYPE_NONE))) {
pcbgroup = in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m),
m->m_pkthdr.flowid);
if (pcbgroup != NULL)
return (in_pcblookup_group(pcbinfo, pcbgroup, faddr,
fport, laddr, lport, lookupflags, ifp));
+#ifndef RSS
pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr,
fport);
return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport,
laddr, lport, lookupflags, ifp));
+#endif
}
#endif
return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
@@ -1905,9 +2091,9 @@ in_pcbinshash_internal(struct inpcb *inp, int do_pcbgroup_update)
#ifdef INET6
if (inp->inp_vflag & INP_IPV6)
- hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */;
+ hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr);
else
-#endif /* INET6 */
+#endif
hashkey_faddr = inp->inp_faddr.s_addr;
pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr,
@@ -1992,9 +2178,9 @@ in_pcbrehash_mbuf(struct inpcb *inp, struct mbuf *m)
#ifdef INET6
if (inp->inp_vflag & INP_IPV6)
- hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */;
+ hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr);
else
-#endif /* INET6 */
+#endif
hashkey_faddr = inp->inp_faddr.s_addr;
head = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr,
@@ -2026,8 +2212,16 @@ in_pcbremlists(struct inpcb *inp)
{
struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
- INP_INFO_WLOCK_ASSERT(pcbinfo);
+#ifdef INVARIANTS
+ if (pcbinfo == &V_tcbinfo) {
+ INP_INFO_RLOCK_ASSERT(pcbinfo);
+ } else {
+ INP_INFO_WLOCK_ASSERT(pcbinfo);
+ }
+#endif
+
INP_WLOCK_ASSERT(inp);
+ INP_LIST_WLOCK_ASSERT(pcbinfo);
inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
if (inp->inp_flags & INP_INHASHLIST) {
@@ -2051,6 +2245,25 @@ in_pcbremlists(struct inpcb *inp)
}
/*
+ * Check for alternatives when higher level complains
+ * about service problems. For now, invalidate cached
+ * routing information. If the route was created dynamically
+ * (by a redirect), time to try a default gateway again.
+ */
+void
+in_losing(struct inpcb *inp)
+{
+
+ if (inp->inp_route.ro_rt) {
+ RTFREE(inp->inp_route.ro_rt);
+ inp->inp_route.ro_rt = (struct rtentry *)NULL;
+ }
+ if (inp->inp_route.ro_lle)
+ LLE_FREE(inp->inp_route.ro_lle); /* zeros ro_lle */
+ return;
+}
+
+/*
* A set label operation has occurred at the socket layer, propagate the
* label change into the in_pcb for the socket.
*/
@@ -2115,7 +2328,7 @@ ipport_tick_init(const void *unused __unused)
{
/* Start ipport_tick. */
- callout_init(&ipport_tick_callout, CALLOUT_MPSAFE);
+ callout_init(&ipport_tick_callout, 1);
callout_reset(&ipport_tick_callout, 1, ipport_tick, NULL);
EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL,
SHUTDOWN_PRI_DEFAULT);
@@ -2172,13 +2385,13 @@ inp_apply_all(void (*func)(struct inpcb *, void *), void *arg)
{
struct inpcb *inp;
- INP_INFO_RLOCK(&V_tcbinfo);
+ INP_INFO_WLOCK(&V_tcbinfo);
LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) {
INP_WLOCK(inp);
func(inp, arg);
INP_WUNLOCK(inp);
}
- INP_INFO_RUNLOCK(&V_tcbinfo);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
}
struct socket *
@@ -2262,14 +2475,13 @@ db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent)
/* IPv6. */
ip6_sprintf(laddr_str, &inc->inc6_laddr);
ip6_sprintf(faddr_str, &inc->inc6_faddr);
- } else {
+ } else
#endif
+ {
/* IPv4. */
inet_ntoa_r(inc->inc_laddr, laddr_str);
inet_ntoa_r(inc->inc_faddr, faddr_str);
-#ifdef INET6
}
-#endif
db_print_indent(indent);
db_printf("inc_laddr %s inc_lport %u\n", laddr_str,
ntohs(inc->inc_lport));
@@ -2320,10 +2532,6 @@ db_print_inpflags(int inp_flags)
db_printf("%sINP_MTUDISC", comma ? ", " : "");
comma = 1;
}
- if (inp_flags & INP_FAITH) {
- db_printf("%sINP_FAITH", comma ? ", " : "");
- comma = 1;
- }
if (inp_flags & INP_RECVTTL) {
db_printf("%sINP_RECVTTL", comma ? ", " : "");
comma = 1;
@@ -2486,4 +2694,4 @@ DB_SHOW_COMMAND(inpcb, db_show_inpcb)
db_print_inpcb(inp, "inpcb", 0);
}
-#endif
+#endif /* DDB */
diff --git a/freebsd/sys/netinet/in_pcb.h b/freebsd/sys/netinet/in_pcb.h
index a78c6ab6..ea47d6b2 100644
--- a/freebsd/sys/netinet/in_pcb.h
+++ b/freebsd/sys/netinet/in_pcb.h
@@ -42,6 +42,7 @@
#include <sys/_lock.h>
#include <sys/_mutex.h>
#include <sys/_rwlock.h>
+#include <net/route.h>
#ifdef _KERNEL
#include <rtems/bsd/sys/lock.h>
@@ -79,6 +80,8 @@ struct in_addr_4in6 {
/*
* NOTE: ipv6 addrs should be 64-bit aligned, per RFC 2553. in_conninfo has
* some extra padding to accomplish this.
+ * NOTE 2: tcp_syncache.c uses first 5 32-bit words, which identify fport,
+ * lport, faddr to generate hash, so these fields shouldn't be moved.
*/
struct in_endpoints {
u_int16_t ie_fport; /* foreign port */
@@ -94,6 +97,7 @@ struct in_endpoints {
struct in_addr_4in6 ie46_local;
struct in6_addr ie6_local;
} ie_dependladdr;
+ u_int32_t ie6_zoneid; /* scope zone id */
};
#define ie_faddr ie_dependfaddr.ie46_foreign.ia46_addr4
#define ie_laddr ie_dependladdr.ie46_local.ia46_addr4
@@ -117,34 +121,47 @@ struct in_conninfo {
*/
#define INC_ISIPV6 0x01
-#define inc_isipv6 inc_flags /* temp compatability */
+#define inc_isipv6 inc_flags /* temp compatibility */
#define inc_fport inc_ie.ie_fport
#define inc_lport inc_ie.ie_lport
#define inc_faddr inc_ie.ie_faddr
#define inc_laddr inc_ie.ie_laddr
#define inc6_faddr inc_ie.ie6_faddr
#define inc6_laddr inc_ie.ie6_laddr
+#define inc6_zoneid inc_ie.ie6_zoneid
struct icmp6_filter;
/*-
- * struct inpcb captures the network layer state for TCP, UDP, and raw IPv4
- * and IPv6 sockets. In the case of TCP, further per-connection state is
+ * struct inpcb captures the network layer state for TCP, UDP, and raw IPv4 and
+ * IPv6 sockets. In the case of TCP and UDP, further per-connection state is
* hung off of inp_ppcb most of the time. Almost all fields of struct inpcb
* are static after creation or protected by a per-inpcb rwlock, inp_lock. A
- * few fields also require the global pcbinfo lock for the inpcb to be held,
- * when modified, such as the global connection lists and hashes, as well as
- * binding information (which affects which hash a connection is on). This
- * model means that connections can be looked up without holding the
- * per-connection lock, which is important for performance when attempting to
- * find the connection for a packet given its IP and port tuple. Writing to
- * these fields that write locks be held on both the inpcb and global locks.
+ * few fields are protected by multiple locks as indicated in the locking notes
+ * below. For these fields, all of the listed locks must be write-locked for
+ * any modifications. However, these fields can be safely read while any one of
+ * the listed locks is read-locked. This model can permit greater concurrency
+ * for read operations. For example, connections can be looked up while only
+ * holding a read lock on the global pcblist lock. This is important for
+ * performance when attempting to find the connection for a packet given its IP
+ * and port tuple.
+ *
+ * One noteworthy exception is that the global pcbinfo lock follows a different
+ * set of rules in relation to the inp_list field. Rather than being
+ * write-locked for modifications and read-locked for list iterations, it must
+ * be read-locked during modifications and write-locked during list iterations.
+ * This ensures that the relatively rare global list iterations safely walk a
+ * stable snapshot of connections while allowing more common list modifications
+ * to safely grab the pcblist lock just while adding or removing a connection
+ * from the global list.
*
* Key:
* (c) - Constant after initialization
* (g) - Protected by the pcbgroup lock
* (i) - Protected by the inpcb lock
* (p) - Protected by the pcbinfo lock for the inpcb
+ * (l) - Protected by the pcblist lock for the inpcb
+ * (h) - Protected by the pcbhash lock for the inpcb
* (s) - Protected by another subsystem's locks
* (x) - Undefined locking
*
@@ -159,15 +176,21 @@ struct icmp6_filter;
* socket has been freed), or there may be close(2)-related races.
*
* The inp_vflag field is overloaded, and would otherwise ideally be (c).
+ *
+ * TODO: Currently only the TCP stack is leveraging the global pcbinfo lock
+ * read-lock usage during modification; this model can be applied to other
+ * protocols (especially SCTP).
*/
struct inpcb {
- LIST_ENTRY(inpcb) inp_hash; /* (i/p) hash list */
+ LIST_ENTRY(inpcb) inp_hash; /* (h/i) hash list */
LIST_ENTRY(inpcb) inp_pcbgrouphash; /* (g/i) hash list */
- LIST_ENTRY(inpcb) inp_list; /* (i/p) list for all PCBs for proto */
+ LIST_ENTRY(inpcb) inp_list; /* (p/l) list for all PCBs for proto */
+ /* (p[w]) for list iteration */
+ /* (p[r]/l) for addition/removal */
void *inp_ppcb; /* (i) pointer to per-protocol pcb */
struct inpcbinfo *inp_pcbinfo; /* (c) PCB list info */
struct inpcbgroup *inp_pcbgroup; /* (g/i) PCB group list */
- LIST_ENTRY(inpcb) inp_pcbgroup_wild; /* (g/i/p) group wildcard entry */
+ LIST_ENTRY(inpcb) inp_pcbgroup_wild; /* (g/i/h) group wildcard entry */
struct socket *inp_socket; /* (i) back pointer to socket */
struct ucred *inp_cred; /* (c) cache of socket cred */
u_int32_t inp_flow; /* (i) IPv6 flow information */
@@ -179,12 +202,14 @@ struct inpcb {
u_char inp_ip_minttl; /* (i) minimum TTL or drop */
uint32_t inp_flowid; /* (x) flow id / queue id */
u_int inp_refcount; /* (i) refcount */
- void *inp_pspare[5]; /* (x) route caching / general use */
- u_int inp_ispare[6]; /* (x) route caching / user cookie /
+ void *inp_pspare[5]; /* (x) packet pacing / general use */
+ uint32_t inp_flowtype; /* (x) M_HASHTYPE value */
+ uint32_t inp_rss_listen_bucket; /* (x) overridden RSS listen bucket */
+ u_int inp_ispare[4]; /* (x) packet pacing / user cookie /
* general use */
/* Local and foreign ports, local and foreign addr. */
- struct in_conninfo inp_inc; /* (i/p) list for PCB's local port */
+ struct in_conninfo inp_inc; /* (i) list for PCB's local port */
/* MAC and IPSEC policy information. */
struct label *inp_label; /* (i) MAC label */
@@ -209,13 +234,19 @@ struct inpcb {
int inp6_cksum;
short inp6_hops;
} inp_depend6;
- LIST_ENTRY(inpcb) inp_portlist; /* (i/p) */
- struct inpcbport *inp_phd; /* (i/p) head of this list */
+ LIST_ENTRY(inpcb) inp_portlist; /* (i/h) */
+ struct inpcbport *inp_phd; /* (i/h) head of this list */
#define inp_zero_size offsetof(struct inpcb, inp_gencnt)
inp_gen_t inp_gencnt; /* (c) generation count */
struct llentry *inp_lle; /* cached L2 information */
- struct rtentry *inp_rt; /* cached L3 information */
struct rwlock inp_lock;
+ rt_gen_t inp_rt_cookie; /* generation for route entry */
+ union { /* cached L3 information */
+ struct route inpu_route;
+ struct route_in6 inpu_route6;
+ } inp_rtu;
+#define inp_route inp_rtu.inpu_route
+#define inp_route6 inp_rtu.inpu_route6
};
#define inp_fport inp_inc.inc_fport
#define inp_lport inp_inc.inc_lport
@@ -227,6 +258,7 @@ struct inpcb {
#define in6p_faddr inp_inc.inc6_faddr
#define in6p_laddr inp_inc.inc6_laddr
+#define in6p_zoneid inp_inc.inc6_zoneid
#define in6p_hops inp_depend6.inp6_hops /* default hop limit */
#define in6p_flowinfo inp_flow
#define in6p_options inp_depend6.inp6_options
@@ -274,37 +306,46 @@ struct inpcbport {
* Global data structure for each high-level protocol (UDP, TCP, ...) in both
* IPv4 and IPv6. Holds inpcb lists and information for managing them.
*
- * Each pcbinfo is protected by two locks: ipi_lock and ipi_hash_lock,
- * the former covering mutable global fields (such as the global pcb list),
- * and the latter covering the hashed lookup tables. The lock order is:
+ * Each pcbinfo is protected by three locks: ipi_lock, ipi_hash_lock and
+ * ipi_list_lock:
+ * - ipi_lock covering the global pcb list stability during loop iteration,
+ * - ipi_hash_lock covering the hashed lookup tables,
+ * - ipi_list_lock covering mutable global fields (such as the global
+ * pcb list)
+ *
+ * The lock order is:
*
- * ipi_lock (before) inpcb locks (before) {ipi_hash_lock, pcbgroup locks}
+ * ipi_lock (before)
+ * inpcb locks (before)
+ * ipi_list locks (before)
+ * {ipi_hash_lock, pcbgroup locks}
*
* Locking key:
*
* (c) Constant or nearly constant after initialisation
* (g) Locked by ipi_lock
+ * (l) Locked by ipi_list_lock
* (h) Read using either ipi_hash_lock or inpcb lock; write requires both
* (p) Protected by one or more pcbgroup locks
* (x) Synchronisation properties poorly defined
*/
struct inpcbinfo {
/*
- * Global lock protecting global inpcb list, inpcb count, etc.
+ * Global lock protecting full inpcb list traversal
*/
struct rwlock ipi_lock;
/*
* Global list of inpcbs on the protocol.
*/
- struct inpcbhead *ipi_listhead; /* (g) */
- u_int ipi_count; /* (g) */
+ struct inpcbhead *ipi_listhead; /* (g/l) */
+ u_int ipi_count; /* (l) */
/*
* Generation count -- incremented each time a connection is allocated
* or freed.
*/
- u_quad_t ipi_gencnt; /* (g) */
+ u_quad_t ipi_gencnt; /* (l) */
/*
* Fields associated with port lookup and allocation.
@@ -362,6 +403,11 @@ struct inpcbinfo {
* general use 2
*/
void *ipi_pspare[2];
+
+ /*
+ * Global lock protecting global inpcb list, inpcb count, etc.
+ */
+ struct rwlock ipi_list_lock;
};
#ifdef _KERNEL
@@ -454,6 +500,7 @@ short inp_so_options(const struct inpcb *inp);
#define INP_INFO_TRY_RLOCK(ipi) rw_try_rlock(&(ipi)->ipi_lock)
#define INP_INFO_TRY_WLOCK(ipi) rw_try_wlock(&(ipi)->ipi_lock)
#define INP_INFO_TRY_UPGRADE(ipi) rw_try_upgrade(&(ipi)->ipi_lock)
+#define INP_INFO_WLOCKED(ipi) rw_wowned(&(ipi)->ipi_lock)
#define INP_INFO_RUNLOCK(ipi) rw_runlock(&(ipi)->ipi_lock)
#define INP_INFO_WUNLOCK(ipi) rw_wunlock(&(ipi)->ipi_lock)
#define INP_INFO_LOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_LOCKED)
@@ -461,6 +508,25 @@ short inp_so_options(const struct inpcb *inp);
#define INP_INFO_WLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_WLOCKED)
#define INP_INFO_UNLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_UNLOCKED)
+#define INP_LIST_LOCK_INIT(ipi, d) \
+ rw_init_flags(&(ipi)->ipi_list_lock, (d), 0)
+#define INP_LIST_LOCK_DESTROY(ipi) rw_destroy(&(ipi)->ipi_list_lock)
+#define INP_LIST_RLOCK(ipi) rw_rlock(&(ipi)->ipi_list_lock)
+#define INP_LIST_WLOCK(ipi) rw_wlock(&(ipi)->ipi_list_lock)
+#define INP_LIST_TRY_RLOCK(ipi) rw_try_rlock(&(ipi)->ipi_list_lock)
+#define INP_LIST_TRY_WLOCK(ipi) rw_try_wlock(&(ipi)->ipi_list_lock)
+#define INP_LIST_TRY_UPGRADE(ipi) rw_try_upgrade(&(ipi)->ipi_list_lock)
+#define INP_LIST_RUNLOCK(ipi) rw_runlock(&(ipi)->ipi_list_lock)
+#define INP_LIST_WUNLOCK(ipi) rw_wunlock(&(ipi)->ipi_list_lock)
+#define INP_LIST_LOCK_ASSERT(ipi) \
+ rw_assert(&(ipi)->ipi_list_lock, RA_LOCKED)
+#define INP_LIST_RLOCK_ASSERT(ipi) \
+ rw_assert(&(ipi)->ipi_list_lock, RA_RLOCKED)
+#define INP_LIST_WLOCK_ASSERT(ipi) \
+ rw_assert(&(ipi)->ipi_list_lock, RA_WLOCKED)
+#define INP_LIST_UNLOCK_ASSERT(ipi) \
+ rw_assert(&(ipi)->ipi_list_lock, RA_UNLOCKED)
+
#define INP_HASH_LOCK_INIT(ipi, d) \
rw_init_flags(&(ipi)->ipi_hash_lock, (d), 0)
#define INP_HASH_LOCK_DESTROY(ipi) rw_destroy(&(ipi)->ipi_hash_lock)
@@ -485,6 +551,7 @@ short inp_so_options(const struct inpcb *inp);
(((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport))) & (mask))
#define INP_PCBPORTHASH(lport, mask) \
(ntohs((lport)) & (mask))
+#define INP6_PCBHASHKEY(faddr) ((faddr)->s6_addr32[3])
/*
* Flags for inp_vflags -- historically version flags only
@@ -505,7 +572,7 @@ short inp_so_options(const struct inpcb *inp);
#define INP_ANONPORT 0x00000040 /* port chosen for user */
#define INP_RECVIF 0x00000080 /* receive incoming interface */
#define INP_MTUDISC 0x00000100 /* user can do MTU discovery */
-#define INP_FAITH 0x00000200 /* accept FAITH'ed connections */
+ /* 0x00000200 unused: was INP_FAITH */
#define INP_RECVTTL 0x00000400 /* receive incoming IP TTL */
#define INP_DONTFRAG 0x00000800 /* don't fragment packet */
#define INP_BINDANY 0x00001000 /* allow bind to any address */
@@ -524,8 +591,8 @@ short inp_so_options(const struct inpcb *inp);
#define INP_ONESBCAST 0x02000000 /* send all-ones broadcast */
#define INP_DROPPED 0x04000000 /* protocol drop flag */
#define INP_SOCKREF 0x08000000 /* strong socket reference */
-#define INP_SW_FLOWID 0x10000000 /* software generated flow id */
-#define INP_HW_FLOWID 0x20000000 /* hardware generated flow id */
+#define INP_RESERVED_0 0x10000000 /* reserved field */
+#define INP_RESERVED_1 0x20000000 /* reserved field */
#define IN6P_RFC2292 0x40000000 /* used RFC2292 API on the socket */
#define IN6P_MTU 0x80000000 /* receive path MTU */
@@ -545,6 +612,10 @@ short inp_so_options(const struct inpcb *inp);
#define INP_REUSEPORT 0x00000008 /* SO_REUSEPORT option is set */
#define INP_FREED 0x00000010 /* inp itself is not valid */
#define INP_REUSEADDR 0x00000020 /* SO_REUSEADDR option is set */
+#define INP_BINDMULTI 0x00000040 /* IP_BINDMULTI option is set */
+#define INP_RSS_BUCKET_SET 0x00000080 /* IP_RSS_LISTEN_BUCKET is set */
+#define INP_RECVFLOWID 0x00000100 /* populate recv datagram with flow info */
+#define INP_RECVRSSBUCKETID 0x00000200 /* populate recv datagram with bucket id */
/*
* Flags passed to in_pcblookup*() functions.
@@ -603,6 +674,9 @@ void in_pcbinfo_destroy(struct inpcbinfo *);
void in_pcbinfo_init(struct inpcbinfo *, const char *, struct inpcbhead *,
int, int, char *, uma_init, uma_fini, uint32_t, u_int);
+int in_pcbbind_check_bindmulti(const struct inpcb *ni,
+ const struct inpcb *oi);
+
struct inpcbgroup *
in_pcbgroup_byhash(struct inpcbinfo *, u_int, uint32_t);
struct inpcbgroup *
@@ -636,6 +710,8 @@ void in_pcbdrop(struct inpcb *);
void in_pcbfree(struct inpcb *);
int in_pcbinshash(struct inpcb *);
int in_pcbinshash_nopcbgroup(struct inpcb *);
+int in_pcbladdr(struct inpcb *, struct in_addr *, struct in_addr *,
+ struct ucred *);
struct inpcb *
in_pcblookup_local(struct inpcbinfo *,
struct in_addr, u_short, int, struct ucred *);
@@ -653,6 +729,7 @@ void in_pcbrehash_mbuf(struct inpcb *, struct mbuf *);
int in_pcbrele(struct inpcb *);
int in_pcbrele_rlocked(struct inpcb *);
int in_pcbrele_wlocked(struct inpcb *);
+void in_losing(struct inpcb *);
void in_pcbsetsolabel(struct socket *so);
int in_getpeeraddr(struct socket *so, struct sockaddr **nam);
int in_getsockaddr(struct socket *so, struct sockaddr **nam);
diff --git a/freebsd/sys/netinet/in_proto.c b/freebsd/sys/netinet/in_proto.c
index 1eef2c72..8c3efa4d 100644
--- a/freebsd/sys/netinet/in_proto.c
+++ b/freebsd/sys/netinet/in_proto.c
@@ -34,7 +34,6 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
-#include <rtems/bsd/local/opt_ipx.h>
#include <rtems/bsd/local/opt_mrouting.h>
#include <rtems/bsd/local/opt_ipsec.h>
#include <rtems/bsd/local/opt_inet.h>
@@ -45,6 +44,7 @@ __FBSDID("$FreeBSD$");
#include <rtems/bsd/sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
+#include <sys/malloc.h>
#include <sys/socket.h>
#include <sys/domain.h>
#include <sys/proc.h>
@@ -60,6 +60,7 @@ __FBSDID("$FreeBSD$");
*/
#ifdef INET
#include <net/if.h>
+#include <net/if_var.h>
#include <net/route.h>
#ifdef RADIX_MPATH
#include <net/radix_mpath.h>
@@ -120,9 +121,6 @@ struct protosw inetsw[] = {
.pr_domain = &inetdomain,
.pr_protocol = IPPROTO_IP,
.pr_init = ip_init,
-#ifdef VIMAGE
- .pr_destroy = ip_destroy,
-#endif
.pr_slowtimo = ip_slowtimo,
.pr_drain = ip_drain,
.pr_usrreqs = &nousrreqs
@@ -136,9 +134,6 @@ struct protosw inetsw[] = {
.pr_ctlinput = udp_ctlinput,
.pr_ctloutput = udp_ctloutput,
.pr_init = udp_init,
-#ifdef VIMAGE
- .pr_destroy = udp_destroy,
-#endif
.pr_usrreqs = &udp_usrreqs
},
{
@@ -150,9 +145,6 @@ struct protosw inetsw[] = {
.pr_ctlinput = tcp_ctlinput,
.pr_ctloutput = tcp_ctloutput,
.pr_init = tcp_init,
-#ifdef VIMAGE
- .pr_destroy = tcp_destroy,
-#endif
.pr_slowtimo = tcp_slowtimo,
.pr_drain = tcp_drain,
.pr_usrreqs = &tcp_usrreqs
@@ -167,9 +159,6 @@ struct protosw inetsw[] = {
.pr_ctlinput = sctp_ctlinput,
.pr_ctloutput = sctp_ctloutput,
.pr_init = sctp_init,
-#ifdef VIMAGE
- .pr_destroy = sctp_finish,
-#endif
.pr_drain = sctp_drain,
.pr_usrreqs = &sctp_usrreqs
},
@@ -177,7 +166,7 @@ struct protosw inetsw[] = {
.pr_type = SOCK_STREAM,
.pr_domain = &inetdomain,
.pr_protocol = IPPROTO_SCTP,
- .pr_flags = PR_WANTRCVD,
+ .pr_flags = PR_CONNREQUIRED|PR_WANTRCVD,
.pr_input = sctp_input,
.pr_ctlinput = sctp_ctlinput,
.pr_ctloutput = sctp_ctloutput,
@@ -186,6 +175,17 @@ struct protosw inetsw[] = {
},
#endif /* SCTP */
{
+ .pr_type = SOCK_DGRAM,
+ .pr_domain = &inetdomain,
+ .pr_protocol = IPPROTO_UDPLITE,
+ .pr_flags = PR_ATOMIC|PR_ADDR,
+ .pr_input = udp_input,
+ .pr_ctlinput = udplite_ctlinput,
+ .pr_ctloutput = udp_ctloutput,
+ .pr_init = udplite_init,
+ .pr_usrreqs = &udp_usrreqs
+},
+{
.pr_type = SOCK_RAW,
.pr_domain = &inetdomain,
.pr_protocol = IPPROTO_RAW,
@@ -330,9 +330,6 @@ IPPROTOSPACER,
.pr_input = rip_input,
.pr_ctloutput = rip_ctloutput,
.pr_init = rip_init,
-#ifdef VIMAGE
- .pr_destroy = rip_destroy,
-#endif
.pr_usrreqs = &rip_usrreqs
},
};
@@ -344,7 +341,7 @@ struct domain inetdomain = {
.dom_family = AF_INET,
.dom_name = "internet",
.dom_protosw = inetsw,
- .dom_protoswNPROTOSW = &inetsw[sizeof(inetsw)/sizeof(inetsw[0])],
+ .dom_protoswNPROTOSW = &inetsw[nitems(inetsw)],
#ifdef RADIX_MPATH
.dom_rtattach = rn4_mpath_inithead,
#else
@@ -353,8 +350,6 @@ struct domain inetdomain = {
#ifdef VIMAGE
.dom_rtdetach = in_detachhead,
#endif
- .dom_rtoffset = 32,
- .dom_maxrtkey = sizeof(struct sockaddr_in),
.dom_ifattach = in_domifattach,
.dom_ifdetach = in_domifdetach
};
@@ -382,3 +377,5 @@ SYSCTL_NODE(_net_inet, IPPROTO_IPCOMP, ipcomp, CTLFLAG_RW, 0, "IPCOMP");
SYSCTL_NODE(_net_inet, IPPROTO_IPIP, ipip, CTLFLAG_RW, 0, "IPIP");
#endif /* IPSEC */
SYSCTL_NODE(_net_inet, IPPROTO_RAW, raw, CTLFLAG_RW, 0, "RAW");
+SYSCTL_NODE(_net_inet, OID_AUTO, accf, CTLFLAG_RW, 0,
+ "Accept filters");
diff --git a/freebsd/sys/netinet/in_rmx.c b/freebsd/sys/netinet/in_rmx.c
index 939193f6..2062d1d1 100644
--- a/freebsd/sys/netinet/in_rmx.c
+++ b/freebsd/sys/netinet/in_rmx.c
@@ -38,11 +38,11 @@ __FBSDID("$FreeBSD$");
#include <sys/sysctl.h>
#include <sys/socket.h>
#include <sys/mbuf.h>
-#include <sys/syslog.h>
-#include <sys/callout.h>
#include <net/if.h>
+#include <net/if_var.h>
#include <net/route.h>
+#include <net/route_var.h>
#include <net/vnet.h>
#include <netinet/in.h>
@@ -56,19 +56,16 @@ extern int in_inithead(void **head, int off);
extern int in_detachhead(void **head, int off);
#endif
-#define RTPRF_OURS RTF_PROTO3 /* set on routes we manage */
-
/*
* Do what we need to do when inserting a route.
*/
static struct radix_node *
-in_addroute(void *v_arg, void *n_arg, struct radix_node_head *head,
+in_addroute(void *v_arg, void *n_arg, struct radix_head *head,
struct radix_node *treenodes)
{
struct rtentry *rt = (struct rtentry *)treenodes;
struct sockaddr_in *sin = (struct sockaddr_in *)rt_key(rt);
- RADIX_NODE_HEAD_WLOCK_ASSERT(head);
/*
* A little bit of help for both IP output and input:
* For host routes, we make sure that RTF_BROADCAST
@@ -95,247 +92,20 @@ in_addroute(void *v_arg, void *n_arg, struct radix_node_head *head,
if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
rt->rt_flags |= RTF_MULTICAST;
- if (!rt->rt_rmx.rmx_mtu && rt->rt_ifp)
- rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu;
-
- return (rn_addroute(v_arg, n_arg, head, treenodes));
-}
-
-/*
- * This code is the inverse of in_clsroute: on first reference, if we
- * were managing the route, stop doing so and set the expiration timer
- * back off again.
- */
-static struct radix_node *
-in_matroute(void *v_arg, struct radix_node_head *head)
-{
- struct radix_node *rn = rn_match(v_arg, head);
- struct rtentry *rt = (struct rtentry *)rn;
-
- if (rt) {
- RT_LOCK(rt);
- if (rt->rt_flags & RTPRF_OURS) {
- rt->rt_flags &= ~RTPRF_OURS;
- rt->rt_rmx.rmx_expire = 0;
- }
- RT_UNLOCK(rt);
- }
- return rn;
-}
-
-static VNET_DEFINE(int, rtq_reallyold) = 60*60; /* one hour is "really old" */
-#define V_rtq_reallyold VNET(rtq_reallyold)
-SYSCTL_VNET_INT(_net_inet_ip, IPCTL_RTEXPIRE, rtexpire, CTLFLAG_RW,
- &VNET_NAME(rtq_reallyold), 0,
- "Default expiration time on dynamically learned routes");
-
-/* never automatically crank down to less */
-static VNET_DEFINE(int, rtq_minreallyold) = 10;
-#define V_rtq_minreallyold VNET(rtq_minreallyold)
-SYSCTL_VNET_INT(_net_inet_ip, IPCTL_RTMINEXPIRE, rtminexpire, CTLFLAG_RW,
- &VNET_NAME(rtq_minreallyold), 0,
- "Minimum time to attempt to hold onto dynamically learned routes");
-
-/* 128 cached routes is "too many" */
-static VNET_DEFINE(int, rtq_toomany) = 128;
-#define V_rtq_toomany VNET(rtq_toomany)
-SYSCTL_VNET_INT(_net_inet_ip, IPCTL_RTMAXCACHE, rtmaxcache, CTLFLAG_RW,
- &VNET_NAME(rtq_toomany), 0,
- "Upper limit on dynamically learned routes");
-
-/*
- * On last reference drop, mark the route as belong to us so that it can be
- * timed out.
- */
-static void
-in_clsroute(struct radix_node *rn, struct radix_node_head *head)
-{
- struct rtentry *rt = (struct rtentry *)rn;
-
- RT_LOCK_ASSERT(rt);
-
- if (!(rt->rt_flags & RTF_UP))
- return; /* prophylactic measures */
-
- if (rt->rt_flags & RTPRF_OURS)
- return;
-
- if (!(rt->rt_flags & RTF_DYNAMIC))
- return;
-
- /*
- * If rtq_reallyold is 0, just delete the route without
- * waiting for a timeout cycle to kill it.
- */
- if (V_rtq_reallyold != 0) {
- rt->rt_flags |= RTPRF_OURS;
- rt->rt_rmx.rmx_expire = time_uptime + V_rtq_reallyold;
- } else {
- rtexpunge(rt);
- }
-}
-
-struct rtqk_arg {
- struct radix_node_head *rnh;
- int draining;
- int killed;
- int found;
- int updating;
- time_t nextstop;
-};
-
-/*
- * Get rid of old routes. When draining, this deletes everything, even when
- * the timeout is not expired yet. When updating, this makes sure that
- * nothing has a timeout longer than the current value of rtq_reallyold.
- */
-static int
-in_rtqkill(struct radix_node *rn, void *rock)
-{
- struct rtqk_arg *ap = rock;
- struct rtentry *rt = (struct rtentry *)rn;
- int err;
-
- RADIX_NODE_HEAD_WLOCK_ASSERT(ap->rnh);
-
- if (rt->rt_flags & RTPRF_OURS) {
- ap->found++;
-
- if (ap->draining || rt->rt_rmx.rmx_expire <= time_uptime) {
- if (rt->rt_refcnt > 0)
- panic("rtqkill route really not free");
-
- err = in_rtrequest(RTM_DELETE,
- (struct sockaddr *)rt_key(rt),
- rt->rt_gateway, rt_mask(rt),
- rt->rt_flags | RTF_RNH_LOCKED, 0,
- rt->rt_fibnum);
- if (err) {
- log(LOG_WARNING, "in_rtqkill: error %d\n", err);
- } else {
- ap->killed++;
- }
- } else {
- if (ap->updating &&
- (rt->rt_rmx.rmx_expire - time_uptime >
- V_rtq_reallyold)) {
- rt->rt_rmx.rmx_expire =
- time_uptime + V_rtq_reallyold;
- }
- ap->nextstop = lmin(ap->nextstop,
- rt->rt_rmx.rmx_expire);
- }
- }
-
- return 0;
-}
-
-#define RTQ_TIMEOUT 60*10 /* run no less than once every ten minutes */
-static VNET_DEFINE(int, rtq_timeout) = RTQ_TIMEOUT;
-static VNET_DEFINE(struct callout, rtq_timer);
-
-#define V_rtq_timeout VNET(rtq_timeout)
-#define V_rtq_timer VNET(rtq_timer)
-
-static void in_rtqtimo_one(void *rock);
-
-static void
-in_rtqtimo(void *rock)
-{
- CURVNET_SET((struct vnet *) rock);
- int fibnum;
- void *newrock;
- struct timeval atv;
-
- for (fibnum = 0; fibnum < rt_numfibs; fibnum++) {
- newrock = rt_tables_get_rnh(fibnum, AF_INET);
- if (newrock != NULL)
- in_rtqtimo_one(newrock);
- }
- atv.tv_usec = 0;
- atv.tv_sec = V_rtq_timeout;
- callout_reset(&V_rtq_timer, tvtohz(&atv), in_rtqtimo, rock);
- CURVNET_RESTORE();
-}
+ if (rt->rt_ifp != NULL) {
-static void
-in_rtqtimo_one(void *rock)
-{
- struct radix_node_head *rnh = rock;
- struct rtqk_arg arg;
- static time_t last_adjusted_timeout = 0;
-
- arg.found = arg.killed = 0;
- arg.rnh = rnh;
- arg.nextstop = time_uptime + V_rtq_timeout;
- arg.draining = arg.updating = 0;
- RADIX_NODE_HEAD_LOCK(rnh);
- rnh->rnh_walktree(rnh, in_rtqkill, &arg);
- RADIX_NODE_HEAD_UNLOCK(rnh);
-
- /*
- * Attempt to be somewhat dynamic about this:
- * If there are ``too many'' routes sitting around taking up space,
- * then crank down the timeout, and see if we can't make some more
- * go away. However, we make sure that we will never adjust more
- * than once in rtq_timeout seconds, to keep from cranking down too
- * hard.
- */
- if ((arg.found - arg.killed > V_rtq_toomany) &&
- (time_uptime - last_adjusted_timeout >= V_rtq_timeout) &&
- V_rtq_reallyold > V_rtq_minreallyold) {
- V_rtq_reallyold = 2 * V_rtq_reallyold / 3;
- if (V_rtq_reallyold < V_rtq_minreallyold) {
- V_rtq_reallyold = V_rtq_minreallyold;
- }
-
- last_adjusted_timeout = time_uptime;
-#ifdef DIAGNOSTIC
- log(LOG_DEBUG, "in_rtqtimo: adjusted rtq_reallyold to %d\n",
- V_rtq_reallyold);
-#endif
- arg.found = arg.killed = 0;
- arg.updating = 1;
- RADIX_NODE_HEAD_LOCK(rnh);
- rnh->rnh_walktree(rnh, in_rtqkill, &arg);
- RADIX_NODE_HEAD_UNLOCK(rnh);
- }
-
-}
-
-void
-in_rtqdrain(void)
-{
- VNET_ITERATOR_DECL(vnet_iter);
- struct radix_node_head *rnh;
- struct rtqk_arg arg;
- int fibnum;
-
- VNET_LIST_RLOCK_NOSLEEP();
- VNET_FOREACH(vnet_iter) {
- CURVNET_SET(vnet_iter);
-
- for ( fibnum = 0; fibnum < rt_numfibs; fibnum++) {
- rnh = rt_tables_get_rnh(fibnum, AF_INET);
- arg.found = arg.killed = 0;
- arg.rnh = rnh;
- arg.nextstop = 0;
- arg.draining = 1;
- arg.updating = 0;
- RADIX_NODE_HEAD_LOCK(rnh);
- rnh->rnh_walktree(rnh, in_rtqkill, &arg);
- RADIX_NODE_HEAD_UNLOCK(rnh);
- }
- CURVNET_RESTORE();
+ /*
+ * Check route MTU:
+ * inherit interface MTU if not set or
+ * check if MTU is too large.
+ */
+ if (rt->rt_mtu == 0) {
+ rt->rt_mtu = rt->rt_ifp->if_mtu;
+ } else if (rt->rt_mtu > rt->rt_ifp->if_mtu)
+ rt->rt_mtu = rt->rt_ifp->if_mtu;
}
- VNET_LIST_RUNLOCK_NOSLEEP();
-}
-void
-in_setmatchfunc(struct radix_node_head *rnh, int val)
-{
-
- rnh->rnh_matchaddr = (val != 0) ? rn_match : in_matroute;
+ return (rn_addroute(v_arg, n_arg, head, treenodes));
}
static int _in_rt_was_here;
@@ -345,29 +115,16 @@ static int _in_rt_was_here;
int
in_inithead(void **head, int off)
{
- struct radix_node_head *rnh;
+ struct rib_head *rh;
- /* XXX MRT
- * This can be called from vfs_export.c too in which case 'off'
- * will be 0. We know the correct value so just use that and
- * return directly if it was 0.
- * This is a hack that replaces an even worse hack on a bad hack
- * on a bad design. After RELENG_7 this should be fixed but that
- * will change the ABI, so for now do it this way.
- */
- if (!rn_inithead(head, 32))
- return 0;
+ rh = rt_table_init(32);
+ if (rh == NULL)
+ return (0);
- if (off == 0) /* XXX MRT see above */
- return 1; /* only do the rest for a real routing table */
+ rh->rnh_addaddr = in_addroute;
+ *head = (void *)rh;
- rnh = *head;
- rnh->rnh_addaddr = in_addroute;
- in_setmatchfunc(rnh, V_drop_redirect);
- rnh->rnh_close = in_clsroute;
if (_in_rt_was_here == 0 ) {
- callout_init(&V_rtq_timer, CALLOUT_MPSAFE);
- callout_reset(&V_rtq_timer, 1, in_rtqtimo, curvnet);
_in_rt_was_here = 1;
}
return 1;
@@ -378,7 +135,7 @@ int
in_detachhead(void **head, int off)
{
- callout_drain(&V_rtq_timer);
+ rt_table_destroy((struct rib_head *)(*head));
return (1);
}
#endif
@@ -398,62 +155,32 @@ struct in_ifadown_arg {
};
static int
-in_ifadownkill(struct radix_node *rn, void *xap)
+in_ifadownkill(const struct rtentry *rt, void *xap)
{
struct in_ifadown_arg *ap = xap;
- struct rtentry *rt = (struct rtentry *)rn;
- RT_LOCK(rt);
- if (rt->rt_ifa == ap->ifa &&
- (ap->del || !(rt->rt_flags & RTF_STATIC))) {
- /*
- * Aquire a reference so that it can later be freed
- * as the refcount would be 0 here in case of at least
- * ap->del.
- */
- RT_ADDREF(rt);
- /*
- * Disconnect it from the tree and permit protocols
- * to cleanup.
- */
- rtexpunge(rt);
- /*
- * At this point it is an rttrash node, and in case
- * the above is the only reference we must free it.
- * If we do not noone will have a pointer and the
- * rtentry will be leaked forever.
- * In case someone else holds a reference, we are
- * fine as we only decrement the refcount. In that
- * case if the other entity calls RT_REMREF, we
- * will still be leaking but at least we tried.
- */
- RTFREE_LOCKED(rt);
+ if (rt->rt_ifa != ap->ifa)
return (0);
- }
- RT_UNLOCK(rt);
- return 0;
+
+ if ((rt->rt_flags & RTF_STATIC) != 0 && ap->del == 0)
+ return (0);
+
+ return (1);
}
-int
+void
in_ifadown(struct ifaddr *ifa, int delete)
{
struct in_ifadown_arg arg;
- struct radix_node_head *rnh;
- int fibnum;
- if (ifa->ifa_addr->sa_family != AF_INET)
- return 1;
+ KASSERT(ifa->ifa_addr->sa_family == AF_INET,
+ ("%s: wrong family", __func__));
- for ( fibnum = 0; fibnum < rt_numfibs; fibnum++) {
- rnh = rt_tables_get_rnh(fibnum, AF_INET);
- arg.ifa = ifa;
- arg.del = delete;
- RADIX_NODE_HEAD_LOCK(rnh);
- rnh->rnh_walktree(rnh, in_ifadownkill, &arg);
- RADIX_NODE_HEAD_UNLOCK(rnh);
- ifa->ifa_flags &= ~IFA_ROUTE; /* XXXlocking? */
- }
- return 0;
+ arg.ifa = ifa;
+ arg.del = delete;
+
+ rt_foreach_fib_walk_del(AF_INET, in_ifadownkill, &arg);
+ ifa->ifa_flags &= ~IFA_ROUTE; /* XXXlocking? */
}
/*
@@ -467,25 +194,6 @@ in_rtalloc_ign(struct route *ro, u_long ignflags, u_int fibnum)
rtalloc_ign_fib(ro, ignflags, fibnum);
}
-int
-in_rtrequest( int req,
- struct sockaddr *dst,
- struct sockaddr *gateway,
- struct sockaddr *netmask,
- int flags,
- struct rtentry **ret_nrt,
- u_int fibnum)
-{
- return (rtrequest_fib(req, dst, gateway, netmask,
- flags, ret_nrt, fibnum));
-}
-
-struct rtentry *
-in_rtalloc1(struct sockaddr *dst, int report, u_long ignflags, u_int fibnum)
-{
- return (rtalloc1_fib(dst, report, ignflags, fibnum));
-}
-
void
in_rtredirect(struct sockaddr *dst,
struct sockaddr *gateway,
@@ -497,16 +205,3 @@ in_rtredirect(struct sockaddr *dst,
rtredirect_fib(dst, gateway, netmask, flags, src, fibnum);
}
-void
-in_rtalloc(struct route *ro, u_int fibnum)
-{
- rtalloc_ign_fib(ro, 0UL, fibnum);
-}
-
-#if 0
-int in_rt_getifa(struct rt_addrinfo *, u_int fibnum);
-int in_rtioctl(u_long, caddr_t, u_int);
-int in_rtrequest1(int, struct rt_addrinfo *, struct rtentry **, u_int);
-#endif
-
-
diff --git a/freebsd/sys/netinet/in_rss.h b/freebsd/sys/netinet/in_rss.h
new file mode 100644
index 00000000..fd300ac5
--- /dev/null
+++ b/freebsd/sys/netinet/in_rss.h
@@ -0,0 +1,57 @@
+/*-
+ * Copyright (c) 2010-2011 Juniper Networks, Inc.
+ * All rights reserved.
+ *
+ * This software was developed by Robert N. M. Watson under contract
+ * to Juniper Networks, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_IN_RSS_H_
+#define _NETINET_IN_RSS_H_
+
+#include <netinet/in.h> /* in_addr_t */
+
+/*
+ * Network stack interface to generate a hash for a protocol tuple.
+ */
+uint32_t rss_hash_ip4_4tuple(struct in_addr src, u_short srcport,
+ struct in_addr dst, u_short dstport);
+uint32_t rss_hash_ip4_2tuple(struct in_addr src, struct in_addr dst);
+
+/*
+ * Functions to calculate a software RSS hash for a given mbuf or
+ * packet detail.
+ */
+int rss_mbuf_software_hash_v4(const struct mbuf *m, int dir,
+ uint32_t *hashval, uint32_t *hashtype);
+int rss_proto_software_hash_v4(struct in_addr src,
+ struct in_addr dst, u_short src_port, u_short dst_port,
+ int proto, uint32_t *hashval,
+ uint32_t *hashtype);
+struct mbuf * rss_soft_m2cpuid_v4(struct mbuf *m, uintptr_t source,
+ u_int *cpuid);
+
+#endif /* !_NETINET_IN_RSS_H_ */
diff --git a/freebsd/sys/netinet/in_systm.h b/freebsd/sys/netinet/in_systm.h
index 4b34aa00..a4a56833 100644
--- a/freebsd/sys/netinet/in_systm.h
+++ b/freebsd/sys/netinet/in_systm.h
@@ -44,14 +44,26 @@
* Internally the system keeps counters in the headers with the bytes
* swapped so that VAX instructions will work on them. It reverses
* the bytes before transmission at each protocol level. The n_ types
- * represent the types with the bytes in ``high-ender'' order.
+ * represent the types with the bytes in ``high-ender'' order. Network
+ * byte order is usually referred to as big-endian these days rather
+ * than high-ender, which sadly invokes an Orson Scott Card novel, or
+ * worse, the movie.
*/
typedef u_int16_t n_short; /* short as received from the net */
typedef u_int32_t n_long; /* long as received from the net */
-typedef u_int32_t n_time; /* ms since 00:00 GMT, byte rev */
+typedef u_int32_t n_time; /* ms since 00:00 UTC, byte rev */
#ifdef _KERNEL
+struct inpcb;
+struct ucred;
+
+#ifndef __rtems__
+int cr_canseeinpcb(struct ucred *cred, struct inpcb *inp);
+#else /* __rtems__ */
+#define cr_canseeinpcb(cred, inp) 0
+#endif /* __rtems__ */
+
uint32_t iptime(void);
#endif
diff --git a/freebsd/sys/netinet/in_var.h b/freebsd/sys/netinet/in_var.h
index b8477309..af83e9a1 100644
--- a/freebsd/sys/netinet/in_var.h
+++ b/freebsd/sys/netinet/in_var.h
@@ -33,11 +33,24 @@
#ifndef _NETINET_IN_VAR_H_
#define _NETINET_IN_VAR_H_
+/*
+ * Argument structure for SIOCAIFADDR.
+ */
+struct in_aliasreq {
+ char ifra_name[IFNAMSIZ]; /* if name, e.g. "en0" */
+ struct sockaddr_in ifra_addr;
+ struct sockaddr_in ifra_broadaddr;
+#define ifra_dstaddr ifra_broadaddr
+ struct sockaddr_in ifra_mask;
+ int ifra_vhid;
+};
+
+#ifdef _KERNEL
#include <sys/queue.h>
#include <sys/fnv_hash.h>
#include <sys/tree.h>
-struct igmp_ifinfo;
+struct igmp_ifsoftc;
struct in_multi;
struct lltable;
@@ -46,7 +59,7 @@ struct lltable;
*/
struct in_ifinfo {
struct lltable *ii_llt; /* ARP state */
- struct igmp_ifinfo *ii_igmp; /* IGMP state */
+ struct igmp_ifsoftc *ii_igmp; /* IGMP state */
struct in_multi *ii_allhosts; /* 224.0.0.1 membership */
};
@@ -71,25 +84,17 @@ struct in_ifaddr {
struct sockaddr_in ia_sockmask; /* reserve space for general netmask */
};
-struct in_aliasreq {
- char ifra_name[IFNAMSIZ]; /* if name, e.g. "en0" */
- struct sockaddr_in ifra_addr;
- struct sockaddr_in ifra_broadaddr;
-#define ifra_dstaddr ifra_broadaddr
- struct sockaddr_in ifra_mask;
-};
/*
* Given a pointer to an in_ifaddr (ifaddr),
* return a pointer to the addr as a sockaddr_in.
*/
#define IA_SIN(ia) (&(((struct in_ifaddr *)(ia))->ia_addr))
#define IA_DSTSIN(ia) (&(((struct in_ifaddr *)(ia))->ia_dstaddr))
+#define IA_MASKSIN(ia) (&(((struct in_ifaddr *)(ia))->ia_sockmask))
/*
 * Given an IPv4 address (struct in_addr, network byte order) and a pointer
 * to an in_ifaddr, yield the bits of the address that lie outside the
 * interface's subnet mask.  The address is converted with ntohl() before
 * masking, so the result is in host byte order.
 *
 * `ifa' may be any pointer type; it is cast to struct in_ifaddr * before
 * the member access.  The cast operand is parenthesised because `->' binds
 * tighter than a cast.
 *
 * NOTE(review): assumes ia_subnetmask is kept in host byte order -- confirm
 * against the struct in_ifaddr definition.
 */
#define IN_LNAOF(in, ifa) \
	(ntohl((in).s_addr) & ~((struct in_ifaddr *)(ifa))->ia_subnetmask)
-
-#ifdef _KERNEL
extern u_char inetctlerrmap[];
#define LLTABLE(ifp) \
@@ -114,15 +119,15 @@ VNET_DECLARE(u_long, in_ifaddrhmask); /* mask for hash table */
#define INADDR_HASH(x) \
(&V_in_ifaddrhashtbl[INADDR_HASHVAL(x) & V_in_ifaddrhmask])
-extern struct rwlock in_ifaddr_lock;
+extern struct rmlock in_ifaddr_lock;
-#define IN_IFADDR_LOCK_ASSERT() rw_assert(&in_ifaddr_lock, RA_LOCKED)
-#define IN_IFADDR_RLOCK() rw_rlock(&in_ifaddr_lock)
-#define IN_IFADDR_RLOCK_ASSERT() rw_assert(&in_ifaddr_lock, RA_RLOCKED)
-#define IN_IFADDR_RUNLOCK() rw_runlock(&in_ifaddr_lock)
-#define IN_IFADDR_WLOCK() rw_wlock(&in_ifaddr_lock)
-#define IN_IFADDR_WLOCK_ASSERT() rw_assert(&in_ifaddr_lock, RA_WLOCKED)
-#define IN_IFADDR_WUNLOCK() rw_wunlock(&in_ifaddr_lock)
+#define IN_IFADDR_LOCK_ASSERT() rm_assert(&in_ifaddr_lock, RA_LOCKED)
+#define IN_IFADDR_RLOCK(t) rm_rlock(&in_ifaddr_lock, (t))
+#define IN_IFADDR_RLOCK_ASSERT() rm_assert(&in_ifaddr_lock, RA_RLOCKED)
+#define IN_IFADDR_RUNLOCK(t) rm_runlock(&in_ifaddr_lock, (t))
+#define IN_IFADDR_WLOCK() rm_wlock(&in_ifaddr_lock)
+#define IN_IFADDR_WLOCK_ASSERT() rm_assert(&in_ifaddr_lock, RA_WLOCKED)
+#define IN_IFADDR_WUNLOCK() rm_wunlock(&in_ifaddr_lock)
/*
* Macro for finding the internet address structure (in_ifaddr)
@@ -156,29 +161,20 @@ do { \
* Macro for finding the internet address structure (in_ifaddr) corresponding
* to a given interface (ifnet structure).
*/
-#define IFP_TO_IA(ifp, ia) \
+#define IFP_TO_IA(ifp, ia, t) \
/* struct ifnet *ifp; */ \
/* struct in_ifaddr *ia; */ \
+ /* struct rm_priotracker *t; */ \
do { \
- IN_IFADDR_RLOCK(); \
+ IN_IFADDR_RLOCK((t)); \
for ((ia) = TAILQ_FIRST(&V_in_ifaddrhead); \
(ia) != NULL && (ia)->ia_ifp != (ifp); \
(ia) = TAILQ_NEXT((ia), ia_link)) \
continue; \
if ((ia) != NULL) \
ifa_ref(&(ia)->ia_ifa); \
- IN_IFADDR_RUNLOCK(); \
+ IN_IFADDR_RUNLOCK((t)); \
} while (0)
-#endif
-
-/*
- * IP datagram reassembly.
- */
-#define IPREASS_NHASH_LOG2 6
-#define IPREASS_NHASH (1 << IPREASS_NHASH_LOG2)
-#define IPREASS_HMASK (IPREASS_NHASH - 1)
-#define IPREASS_HASH(x,y) \
- (((((x) & 0xF) | ((((x) >> 8) & 0xF) << 4)) ^ (y)) & IPREASS_HMASK)
/*
* Legacy IPv4 IGMP per-link structure.
@@ -191,28 +187,6 @@ struct router_info {
};
/*
- * Per-interface IGMP router version information.
- */
-struct igmp_ifinfo {
- LIST_ENTRY(igmp_ifinfo) igi_link;
- struct ifnet *igi_ifp; /* interface this instance belongs to */
- uint32_t igi_version; /* IGMPv3 Host Compatibility Mode */
- uint32_t igi_v1_timer; /* IGMPv1 Querier Present timer (s) */
- uint32_t igi_v2_timer; /* IGMPv2 Querier Present timer (s) */
- uint32_t igi_v3_timer; /* IGMPv3 General Query (interface) timer (s)*/
- uint32_t igi_flags; /* IGMP per-interface flags */
- uint32_t igi_rv; /* IGMPv3 Robustness Variable */
- uint32_t igi_qi; /* IGMPv3 Query Interval (s) */
- uint32_t igi_qri; /* IGMPv3 Query Response Interval (s) */
- uint32_t igi_uri; /* IGMPv3 Unsolicited Report Interval (s) */
- SLIST_HEAD(,in_multi) igi_relinmhead; /* released groups */
- struct ifqueue igi_gq; /* queue of general query responses */
-};
-
-#define IGIF_SILENT 0x00000001 /* Do not use IGMP on this ifp */
-#define IGIF_LOOPBACK 0x00000002 /* Send IGMP reports to loopback */
-
-/*
* IPv4 multicast IGMP-layer source entry.
*/
struct ip_msource {
@@ -290,12 +264,12 @@ struct in_multi {
u_int inm_refcount; /* reference count */
/* New fields for IGMPv3 follow. */
- struct igmp_ifinfo *inm_igi; /* IGMP info */
+ struct igmp_ifsoftc *inm_igi; /* IGMP info */
SLIST_ENTRY(in_multi) inm_nrele; /* to-be-released by IGMP */
struct ip_msource_tree inm_srcs; /* tree of sources */
u_long inm_nsrc; /* # of tree entries */
- struct ifqueue inm_scq; /* queue of pending
+ struct mbufq inm_scq; /* queue of pending
* state-change packets */
struct timeval inm_lastgsrtv; /* Time of last G-S-R query */
uint16_t inm_sctimer; /* state-change timer */
@@ -339,8 +313,6 @@ ims_get_mode(const struct in_multi *inm, const struct ip_msource *ims,
return (MCAST_UNDEFINED);
}
-#ifdef _KERNEL
-
#ifdef SYSCTL_DECL
SYSCTL_DECL(_net_inet);
SYSCTL_DECL(_net_inet_ip);
@@ -359,49 +331,6 @@ extern struct mtx in_multi_mtx;
#define IN_MULTI_LOCK_ASSERT() mtx_assert(&in_multi_mtx, MA_OWNED)
#define IN_MULTI_UNLOCK_ASSERT() mtx_assert(&in_multi_mtx, MA_NOTOWNED)
-/*
- * Function for looking up an in_multi record for an IPv4 multicast address
- * on a given interface. ifp must be valid. If no record found, return NULL.
- * The IN_MULTI_LOCK and IF_ADDR_LOCK on ifp must be held.
- */
-static __inline struct in_multi *
-inm_lookup_locked(struct ifnet *ifp, const struct in_addr ina)
-{
- struct ifmultiaddr *ifma;
- struct in_multi *inm;
-
- IN_MULTI_LOCK_ASSERT();
- IF_ADDR_LOCK_ASSERT(ifp);
-
- inm = NULL;
- TAILQ_FOREACH(ifma, &((ifp)->if_multiaddrs), ifma_link) {
- if (ifma->ifma_addr->sa_family == AF_INET) {
- inm = (struct in_multi *)ifma->ifma_protospec;
- if (inm->inm_addr.s_addr == ina.s_addr)
- break;
- inm = NULL;
- }
- }
- return (inm);
-}
-
-/*
- * Wrapper for inm_lookup_locked().
- * The IF_ADDR_LOCK will be taken on ifp and released on return.
- */
-static __inline struct in_multi *
-inm_lookup(struct ifnet *ifp, const struct in_addr ina)
-{
- struct in_multi *inm;
-
- IN_MULTI_LOCK_ASSERT();
- IF_ADDR_RLOCK(ifp);
- inm = inm_lookup_locked(ifp, ina);
- IF_ADDR_RUNLOCK(ifp);
-
- return (inm);
-}
-
/* Acquire an in_multi record. */
static __inline void
inm_acquire_locked(struct in_multi *inm)
@@ -422,8 +351,9 @@ inm_acquire_locked(struct in_multi *inm)
struct rtentry;
struct route;
struct ip_moptions;
-struct radix_node_head;
+struct in_multi *inm_lookup_locked(struct ifnet *, const struct in_addr);
+struct in_multi *inm_lookup(struct ifnet *, const struct in_addr);
int imo_multi_filter(const struct ip_moptions *, const struct ifnet *,
const struct sockaddr *, const struct sockaddr *);
void inm_commit(struct in_multi *);
@@ -444,30 +374,21 @@ int in_leavegroup_locked(struct in_multi *,
/*const*/ struct in_mfilter *);
int in_control(struct socket *, u_long, caddr_t, struct ifnet *,
struct thread *);
-void in_rtqdrain(void);
+int in_addprefix(struct in_ifaddr *, int);
+int in_scrubprefix(struct in_ifaddr *, u_int);
+void in_ifscrub_all(void);
void ip_input(struct mbuf *);
-int in_ifadown(struct ifaddr *ifa, int);
-void in_ifscrub(struct ifnet *, struct in_ifaddr *, u_int);
-struct mbuf *ip_fastforward(struct mbuf *);
+void ip_direct_input(struct mbuf *);
+void in_ifadown(struct ifaddr *ifa, int);
+struct mbuf *ip_tryforward(struct mbuf *);
void *in_domifattach(struct ifnet *);
void in_domifdetach(struct ifnet *, void *);
/* XXX */
void in_rtalloc_ign(struct route *ro, u_long ignflags, u_int fibnum);
-void in_rtalloc(struct route *ro, u_int fibnum);
-struct rtentry *in_rtalloc1(struct sockaddr *, int, u_long, u_int);
void in_rtredirect(struct sockaddr *, struct sockaddr *,
struct sockaddr *, int, struct sockaddr *, u_int);
-int in_rtrequest(int, struct sockaddr *,
- struct sockaddr *, struct sockaddr *, int, struct rtentry **, u_int);
-void in_setmatchfunc(struct radix_node_head *, int);
-
-#if 0
-int in_rt_getifa(struct rt_addrinfo *, u_int fibnum);
-int in_rtioctl(u_long, caddr_t, u_int);
-int in_rtrequest1(int, struct rt_addrinfo *, struct rtentry **, u_int);
-#endif
#endif /* _KERNEL */
/* INET6 stuff */
diff --git a/freebsd/sys/netinet/ip.h b/freebsd/sys/netinet/ip.h
index 79afeb8f..98bd1e99 100644
--- a/freebsd/sys/netinet/ip.h
+++ b/freebsd/sys/netinet/ip.h
@@ -67,7 +67,7 @@ struct ip {
u_char ip_p; /* protocol */
u_short ip_sum; /* checksum */
struct in_addr ip_src,ip_dst; /* source and dest address */
-} __packed __aligned(4);
+} __packed __aligned(2);
#define IP_MAXPACKET 65535 /* maximum packet size */
@@ -80,19 +80,19 @@ struct ip {
#define IPTOS_MINCOST 0x02
/*
- * Definitions for IP precedence (also in ip_tos) (hopefully unused).
+ * Definitions for IP precedence (also in ip_tos) (deprecated).
*/
-#define IPTOS_PREC_NETCONTROL 0xe0
-#define IPTOS_PREC_INTERNETCONTROL 0xc0
-#define IPTOS_PREC_CRITIC_ECP 0xa0
-#define IPTOS_PREC_FLASHOVERRIDE 0x80
-#define IPTOS_PREC_FLASH 0x60
-#define IPTOS_PREC_IMMEDIATE 0x40
-#define IPTOS_PREC_PRIORITY 0x20
-#define IPTOS_PREC_ROUTINE 0x00
+#define IPTOS_PREC_NETCONTROL IPTOS_DSCP_CS7
+#define IPTOS_PREC_INTERNETCONTROL IPTOS_DSCP_CS6
+#define IPTOS_PREC_CRITIC_ECP IPTOS_DSCP_CS5
+#define IPTOS_PREC_FLASHOVERRIDE IPTOS_DSCP_CS4
+#define IPTOS_PREC_FLASH IPTOS_DSCP_CS3
+#define IPTOS_PREC_IMMEDIATE IPTOS_DSCP_CS2
+#define IPTOS_PREC_PRIORITY IPTOS_DSCP_CS1
+#define IPTOS_PREC_ROUTINE IPTOS_DSCP_CS0
/*
- * Definitions for DiffServ Codepoints as per RFC2474
+ * Definitions for DiffServ Codepoints as per RFC2474 and RFC5865.
*/
#define IPTOS_DSCP_CS0 0x00
#define IPTOS_DSCP_CS1 0x20
@@ -112,6 +112,7 @@ struct ip {
#define IPTOS_DSCP_AF42 0x90
#define IPTOS_DSCP_AF43 0x98
#define IPTOS_DSCP_CS5 0xa0
+#define IPTOS_DSCP_VA 0xb0
#define IPTOS_DSCP_EF 0xb8
#define IPTOS_DSCP_CS6 0xc0
#define IPTOS_DSCP_CS7 0xe0
@@ -146,7 +147,7 @@ struct ip {
#define IPOPT_SECURITY 130 /* provide s,c,h,tcc */
#define IPOPT_LSRR 131 /* loose source route */
#define IPOPT_ESO 133 /* extended security */
-#define IPOPT_CIPSO 134 /* commerical security */
+#define IPOPT_CIPSO 134 /* commercial security */
#define IPOPT_SATID 136 /* satnet id */
#define IPOPT_SSRR 137 /* strict source route */
#define IPOPT_RA 148 /* router alert */
diff --git a/freebsd/sys/netinet/ip6.h b/freebsd/sys/netinet/ip6.h
index 8f498410..ff870579 100644
--- a/freebsd/sys/netinet/ip6.h
+++ b/freebsd/sys/netinet/ip6.h
@@ -277,12 +277,6 @@ do { \
(((m) = m_pullup((m), (off) + (hlen))) == NULL)) { \
IP6STAT_INC(ip6s_exthdrtoolong); \
return ret; \
- } else if ((m)->m_flags & M_EXT) { \
- if ((m)->m_len < (off) + (hlen)) { \
- IP6STAT_INC(ip6s_exthdrtoolong); \
- m_freem(m); \
- return ret; \
- } \
} else { \
if ((m)->m_len < (off) + (hlen)) { \
IP6STAT_INC(ip6s_exthdrtoolong); \
diff --git a/freebsd/sys/netinet/ip_carp.c b/freebsd/sys/netinet/ip_carp.c
index 330023b1..6b683f45 100644
--- a/freebsd/sys/netinet/ip_carp.c
+++ b/freebsd/sys/netinet/ip_carp.c
@@ -1,8 +1,10 @@
#include <machine/rtems-bsd-kernel-space.h>
-/*
- * Copyright (c) 2002 Michael Shalayeff. All rights reserved.
- * Copyright (c) 2003 Ryan McBride. All rights reserved.
+/*-
+ * Copyright (c) 2002 Michael Shalayeff.
+ * Copyright (c) 2003 Ryan McBride.
+ * Copyright (c) 2011 Gleb Smirnoff <glebius@FreeBSD.org>
+ * All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -33,38 +35,33 @@ __FBSDID("$FreeBSD$");
#include <rtems/bsd/local/opt_inet.h>
#include <rtems/bsd/local/opt_inet6.h>
-#include <sys/types.h>
#include <rtems/bsd/sys/param.h>
#include <sys/systm.h>
-#include <sys/conf.h>
+#include <sys/bus.h>
+#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/module.h>
-#include <sys/time.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/protosw.h>
+#include <sys/socket.h>
+#include <sys/sockio.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
-#include <sys/signalvar.h>
-#include <sys/filio.h>
-#include <sys/sockio.h>
-
-#include <sys/socket.h>
-#include <sys/vnode.h>
-
-#include <machine/stdarg.h>
+#include <sys/taskqueue.h>
+#include <sys/counter.h>
-#include <net/bpf.h>
#include <net/ethernet.h>
#include <net/fddi.h>
-#include <net/iso88025.h>
#include <net/if.h>
-#include <net/if_clone.h>
+#include <net/if_var.h>
#include <net/if_dl.h>
+#include <net/if_llatbl.h>
#include <net/if_types.h>
+#include <net/iso88025.h>
#include <net/route.h>
#include <net/vnet.h>
@@ -73,12 +70,9 @@ __FBSDID("$FreeBSD$");
#include <netinet/in_var.h>
#include <netinet/ip_carp.h>
#include <netinet/ip.h>
-
#include <machine/in_cksum.h>
#endif
-
#ifdef INET
-#include <netinet/in_systm.h>
#include <netinet/ip_var.h>
#include <netinet/if_ether.h>
#endif
@@ -86,182 +80,254 @@ __FBSDID("$FreeBSD$");
#ifdef INET6
#include <netinet/icmp6.h>
#include <netinet/ip6.h>
-#include <netinet6/ip6protosw.h>
+#include <netinet6/in6_var.h>
#include <netinet6/ip6_var.h>
#include <netinet6/scope6_var.h>
-#include <netinet6/in6_var.h>
#include <netinet6/nd6.h>
#endif
#include <crypto/sha1.h>
-#define CARP_IFNAME "carp"
-static MALLOC_DEFINE(M_CARP, "CARP", "CARP interfaces");
-SYSCTL_DECL(_net_inet_carp);
+static MALLOC_DEFINE(M_CARP, "CARP", "CARP addresses");
struct carp_softc {
- struct ifnet *sc_ifp; /* Interface clue */
- struct ifnet *sc_carpdev; /* Pointer to parent interface */
- struct in_ifaddr *sc_ia; /* primary iface address */
+ struct ifnet *sc_carpdev; /* Pointer to parent ifnet. */
+ struct ifaddr **sc_ifas; /* Our ifaddrs. */
+ struct sockaddr_dl sc_addr; /* Our link level address. */
+ struct callout sc_ad_tmo; /* Advertising timeout. */
#ifdef INET
- struct ip_moptions sc_imo;
+ struct callout sc_md_tmo; /* Master down timeout. */
#endif
#ifdef INET6
- struct in6_ifaddr *sc_ia6; /* primary iface address v6 */
- struct ip6_moptions sc_im6o;
-#endif /* INET6 */
- TAILQ_ENTRY(carp_softc) sc_list;
-
- enum { INIT = 0, BACKUP, MASTER } sc_state;
+ struct callout sc_md6_tmo; /* XXX: Master down timeout. */
+#endif
+ struct mtx sc_mtx;
- int sc_flags_backup;
- int sc_suppress;
+ int sc_vhid;
+ int sc_advskew;
+ int sc_advbase;
- int sc_sendad_errors;
+ int sc_naddrs;
+ int sc_naddrs6;
+ int sc_ifasiz;
+ enum { INIT = 0, BACKUP, MASTER } sc_state;
+ int sc_suppress;
+ int sc_sendad_errors;
#define CARP_SENDAD_MAX_ERRORS 3
- int sc_sendad_success;
+ int sc_sendad_success;
#define CARP_SENDAD_MIN_SUCCESS 3
- int sc_vhid;
- int sc_advskew;
- int sc_naddrs;
- int sc_naddrs6;
- int sc_advbase; /* seconds */
- int sc_init_counter;
- u_int64_t sc_counter;
+ int sc_init_counter;
+ uint64_t sc_counter;
/* authentication */
-#define CARP_HMAC_PAD 64
+#define CARP_HMAC_PAD 64
unsigned char sc_key[CARP_KEY_LEN];
unsigned char sc_pad[CARP_HMAC_PAD];
SHA1_CTX sc_sha1;
- struct callout sc_ad_tmo; /* advertisement timeout */
- struct callout sc_md_tmo; /* master down timeout */
- struct callout sc_md6_tmo; /* master down timeout */
-
- LIST_ENTRY(carp_softc) sc_next; /* Interface clue */
+ TAILQ_ENTRY(carp_softc) sc_list; /* On the carp_if list. */
+ LIST_ENTRY(carp_softc) sc_next; /* On the global list. */
};
-#define SC2IFP(sc) ((sc)->sc_ifp)
-
-int carp_suppress_preempt = 0;
-int carp_opts[CARPCTL_MAXID] = { 0, 1, 0, 1, 0, 0 }; /* XXX for now */
-SYSCTL_NODE(_net_inet, IPPROTO_CARP, carp, CTLFLAG_RW, 0, "CARP");
-SYSCTL_INT(_net_inet_carp, CARPCTL_ALLOW, allow, CTLFLAG_RW,
- &carp_opts[CARPCTL_ALLOW], 0, "Accept incoming CARP packets");
-SYSCTL_INT(_net_inet_carp, CARPCTL_PREEMPT, preempt, CTLFLAG_RW,
- &carp_opts[CARPCTL_PREEMPT], 0, "high-priority backup preemption mode");
-SYSCTL_INT(_net_inet_carp, CARPCTL_LOG, log, CTLFLAG_RW,
- &carp_opts[CARPCTL_LOG], 0, "log bad carp packets");
-SYSCTL_INT(_net_inet_carp, CARPCTL_ARPBALANCE, arpbalance, CTLFLAG_RW,
- &carp_opts[CARPCTL_ARPBALANCE], 0, "balance arp responses");
-SYSCTL_INT(_net_inet_carp, OID_AUTO, suppress_preempt, CTLFLAG_RD,
- &carp_suppress_preempt, 0, "Preemption is suppressed");
-
-struct carpstats carpstats;
-SYSCTL_STRUCT(_net_inet_carp, CARPCTL_STATS, stats, CTLFLAG_RW,
- &carpstats, carpstats,
- "CARP statistics (struct carpstats, netinet/ip_carp.h)");
struct carp_if {
- TAILQ_HEAD(, carp_softc) vhif_vrs;
- int vhif_nvrs;
-
- struct ifnet *vhif_ifp;
- struct mtx vhif_mtx;
+#ifdef INET
+ int cif_naddrs;
+#endif
+#ifdef INET6
+ int cif_naddrs6;
+#endif
+ TAILQ_HEAD(, carp_softc) cif_vrs;
+#ifdef INET
+ struct ip_moptions cif_imo;
+#endif
+#ifdef INET6
+ struct ip6_moptions cif_im6o;
+#endif
+ struct ifnet *cif_ifp;
+ struct mtx cif_mtx;
+ uint32_t cif_flags;
+#define CIF_PROMISC 0x00000001
};
#define CARP_INET 0
#define CARP_INET6 1
static int proto_reg[] = {-1, -1};
-/* Get carp_if from softc. Valid after carp_set_addr{,6}. */
-#define SC2CIF(sc) ((struct carp_if *)(sc)->sc_carpdev->if_carp)
+/*
+ * Brief design of carp(4).
+ *
+ * Any carp-capable ifnet may have a list of carp softcs hanging off
+ * its ifp->if_carp pointer. Each softc represents one unique virtual
+ * host id, or vhid. The softc has a back pointer to the ifnet. All
+ * softcs are joined in a global list, which has quite limited use.
+ *
+ * Any interface address that takes part in CARP negotiation has a
+ * pointer to the softc of its vhid, ifa->ifa_carp. That could be either an
+ * AF_INET or an AF_INET6 address.
+ *
+ * Although one can get the softc's backpointer to ifnet and traverse
+ * through its ifp->if_addrhead queue to find all interface addresses
+ * involved in CARP, we keep a growable array of ifaddr pointers. This
+ * allows us to avoid grabbing the IF_ADDR_LOCK() in many traversals that
+ * do calls into the network stack, thus avoiding LORs.
+ *
+ * Locking:
+ *
+ * Each softc has a lock sc_mtx. It is used to synchronise carp_input_c(),
+ * callout-driven events and ioctl()s.
+ *
+ * To traverse the list of softcs on an ifnet we use CIF_LOCK(), to
+ * traverse the global list we use the mutex carp_mtx.
+ *
+ * Known issues with locking:
+ *
+ * - When sending an ad, we put the pointer to the softc in an mtag, and no
+ * reference counting is done on the softc.
+ * - On module unload we may race (?) with a packet processing thread
+ * dereferencing our function pointers.
+ */
+
+/* Accept incoming CARP packets. */
+static VNET_DEFINE(int, carp_allow) = 1;
+#define V_carp_allow VNET(carp_allow)
-/* lock per carp_if queue */
-#define CARP_LOCK_INIT(cif) mtx_init(&(cif)->vhif_mtx, "carp_if", \
- NULL, MTX_DEF)
-#define CARP_LOCK_DESTROY(cif) mtx_destroy(&(cif)->vhif_mtx)
-#define CARP_LOCK_ASSERT(cif) mtx_assert(&(cif)->vhif_mtx, MA_OWNED)
-#define CARP_LOCK(cif) mtx_lock(&(cif)->vhif_mtx)
-#define CARP_UNLOCK(cif) mtx_unlock(&(cif)->vhif_mtx)
+/* Preempt slower nodes. */
+static VNET_DEFINE(int, carp_preempt) = 0;
+#define V_carp_preempt VNET(carp_preempt)
-#define CARP_SCLOCK(sc) mtx_lock(&SC2CIF(sc)->vhif_mtx)
-#define CARP_SCUNLOCK(sc) mtx_unlock(&SC2CIF(sc)->vhif_mtx)
-#define CARP_SCLOCK_ASSERT(sc) mtx_assert(&SC2CIF(sc)->vhif_mtx, MA_OWNED)
+/* Log level. */
+static VNET_DEFINE(int, carp_log) = 1;
+#define V_carp_log VNET(carp_log)
+
+/* Global advskew demotion. */
+static VNET_DEFINE(int, carp_demotion) = 0;
+#define V_carp_demotion VNET(carp_demotion)
+
+/* Send error demotion factor. */
+static VNET_DEFINE(int, carp_senderr_adj) = CARP_MAXSKEW;
+#define V_carp_senderr_adj VNET(carp_senderr_adj)
+
+/* Iface down demotion factor. */
+static VNET_DEFINE(int, carp_ifdown_adj) = CARP_MAXSKEW;
+#define V_carp_ifdown_adj VNET(carp_ifdown_adj)
+
+static int carp_demote_adj_sysctl(SYSCTL_HANDLER_ARGS);
+
+SYSCTL_NODE(_net_inet, IPPROTO_CARP, carp, CTLFLAG_RW, 0, "CARP");
+SYSCTL_INT(_net_inet_carp, OID_AUTO, allow, CTLFLAG_VNET | CTLFLAG_RW,
+ &VNET_NAME(carp_allow), 0, "Accept incoming CARP packets");
+SYSCTL_INT(_net_inet_carp, OID_AUTO, preempt, CTLFLAG_VNET | CTLFLAG_RW,
+ &VNET_NAME(carp_preempt), 0, "High-priority backup preemption mode");
+SYSCTL_INT(_net_inet_carp, OID_AUTO, log, CTLFLAG_VNET | CTLFLAG_RW,
+ &VNET_NAME(carp_log), 0, "CARP log level");
+SYSCTL_PROC(_net_inet_carp, OID_AUTO, demotion,
+ CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
+ 0, 0, carp_demote_adj_sysctl, "I",
+ "Adjust demotion factor (skew of advskew)");
+SYSCTL_INT(_net_inet_carp, OID_AUTO, senderr_demotion_factor,
+ CTLFLAG_VNET | CTLFLAG_RW,
+ &VNET_NAME(carp_senderr_adj), 0, "Send error demotion factor adjustment");
+SYSCTL_INT(_net_inet_carp, OID_AUTO, ifdown_demotion_factor,
+ CTLFLAG_VNET | CTLFLAG_RW,
+ &VNET_NAME(carp_ifdown_adj), 0,
+ "Interface down demotion factor adjustment");
+
+VNET_PCPUSTAT_DEFINE(struct carpstats, carpstats);
+VNET_PCPUSTAT_SYSINIT(carpstats);
+VNET_PCPUSTAT_SYSUNINIT(carpstats);
+
+#define CARPSTATS_ADD(name, val) \
+ counter_u64_add(VNET(carpstats)[offsetof(struct carpstats, name) / \
+ sizeof(uint64_t)], (val))
+#define CARPSTATS_INC(name) CARPSTATS_ADD(name, 1)
+
+SYSCTL_VNET_PCPUSTAT(_net_inet_carp, OID_AUTO, stats, struct carpstats,
+ carpstats, "CARP statistics (struct carpstats, netinet/ip_carp.h)");
+
+#define CARP_LOCK_INIT(sc) mtx_init(&(sc)->sc_mtx, "carp_softc", \
+ NULL, MTX_DEF)
+#define CARP_LOCK_DESTROY(sc) mtx_destroy(&(sc)->sc_mtx)
+#define CARP_LOCK_ASSERT(sc) mtx_assert(&(sc)->sc_mtx, MA_OWNED)
+#define CARP_LOCK(sc) mtx_lock(&(sc)->sc_mtx)
+#define CARP_UNLOCK(sc) mtx_unlock(&(sc)->sc_mtx)
+#define CIF_LOCK_INIT(cif) mtx_init(&(cif)->cif_mtx, "carp_if", \
+ NULL, MTX_DEF)
+#define CIF_LOCK_DESTROY(cif) mtx_destroy(&(cif)->cif_mtx)
+#define CIF_LOCK_ASSERT(cif) mtx_assert(&(cif)->cif_mtx, MA_OWNED)
+#define CIF_LOCK(cif) mtx_lock(&(cif)->cif_mtx)
+#define CIF_UNLOCK(cif) mtx_unlock(&(cif)->cif_mtx)
+#define CIF_FREE(cif) do { \
+ CIF_LOCK(cif); \
+ if (TAILQ_EMPTY(&(cif)->cif_vrs)) \
+ carp_free_if(cif); \
+ else \
+ CIF_UNLOCK(cif); \
+} while (0)
#define CARP_LOG(...) do { \
- if (carp_opts[CARPCTL_LOG] > 0) \
- log(LOG_INFO, __VA_ARGS__); \
+ if (V_carp_log > 0) \
+ log(LOG_INFO, "carp: " __VA_ARGS__); \
} while (0)
#define CARP_DEBUG(...) do { \
- if (carp_opts[CARPCTL_LOG] > 1) \
+ if (V_carp_log > 1) \
log(LOG_DEBUG, __VA_ARGS__); \
} while (0)
-static void carp_hmac_prepare(struct carp_softc *);
-static void carp_hmac_generate(struct carp_softc *, u_int32_t *,
- unsigned char *);
-static int carp_hmac_verify(struct carp_softc *, u_int32_t *,
- unsigned char *);
-static void carp_setroute(struct carp_softc *, int);
-static void carp_input_c(struct mbuf *, struct carp_header *, sa_family_t);
-static int carp_clone_create(struct if_clone *, int, caddr_t);
-static void carp_clone_destroy(struct ifnet *);
-static void carpdetach(struct carp_softc *, int);
-static int carp_prepare_ad(struct mbuf *, struct carp_softc *,
- struct carp_header *);
-static void carp_send_ad_all(void);
-static void carp_send_ad(void *);
-static void carp_send_ad_locked(struct carp_softc *);
-#ifdef INET
-static void carp_send_arp(struct carp_softc *);
-#endif
-static void carp_master_down(void *);
-static void carp_master_down_locked(struct carp_softc *);
-static int carp_ioctl(struct ifnet *, u_long, caddr_t);
-static int carp_looutput(struct ifnet *, struct mbuf *, struct sockaddr *,
- struct route *);
-static void carp_start(struct ifnet *);
-static void carp_setrun(struct carp_softc *, sa_family_t);
-static void carp_set_state(struct carp_softc *, int);
-#ifdef INET
-static int carp_addrcount(struct carp_if *, struct in_ifaddr *, int);
-#endif
-enum { CARP_COUNT_MASTER, CARP_COUNT_RUNNING };
+#define IFNET_FOREACH_IFA(ifp, ifa) \
+ IF_ADDR_LOCK_ASSERT(ifp); \
+ TAILQ_FOREACH((ifa), &(ifp)->if_addrhead, ifa_link) \
+ if ((ifa)->ifa_carp != NULL)
-#ifdef INET
-static void carp_multicast_cleanup(struct carp_softc *, int dofree);
-static int carp_set_addr(struct carp_softc *, struct sockaddr_in *);
-static int carp_del_addr(struct carp_softc *, struct sockaddr_in *);
-#endif
-static void carp_carpdev_state_locked(struct carp_if *);
-static void carp_sc_state_locked(struct carp_softc *);
-#ifdef INET6
-static void carp_send_na(struct carp_softc *);
-static int carp_set_addr6(struct carp_softc *, struct sockaddr_in6 *);
-static int carp_del_addr6(struct carp_softc *, struct sockaddr_in6 *);
-static void carp_multicast6_cleanup(struct carp_softc *, int dofree);
-#endif
+#define CARP_FOREACH_IFA(sc, ifa) \
+ CARP_LOCK_ASSERT(sc); /* caller must hold the softc mutex */ \
+ for (int _i = 0; /* walk sc_ifas[], stopping at a NULL slot */ \
+ _i < (sc)->sc_naddrs + (sc)->sc_naddrs6 && \
+ ((ifa) = (sc)->sc_ifas[_i]) != NULL; \
+ ++_i)
-static LIST_HEAD(, carp_softc) carpif_list;
-static struct mtx carp_mtx;
-IFC_SIMPLE_DECLARE(carp, 0);
+#define IFNET_FOREACH_CARP(ifp, sc) \
+ CIF_LOCK_ASSERT((ifp)->if_carp); /* caller must hold cif_mtx */ \
+ TAILQ_FOREACH((sc), &(ifp)->if_carp->cif_vrs, sc_list)
-static eventhandler_tag if_detach_event_tag;
+#define DEMOTE_ADVSKEW(sc) \
+ (((sc)->sc_advskew + V_carp_demotion > CARP_MAXSKEW) ? \
+ CARP_MAXSKEW : ((sc)->sc_advskew + V_carp_demotion))
-static __inline u_int16_t
-carp_cksum(struct mbuf *m, int len)
-{
- return (in_cksum(m, len));
-}
+static void carp_input_c(struct mbuf *, struct carp_header *, sa_family_t);
+static struct carp_softc
+ *carp_alloc(struct ifnet *);
+static void carp_destroy(struct carp_softc *);
+static struct carp_if
+ *carp_alloc_if(struct ifnet *);
+static void carp_free_if(struct carp_if *);
+static void carp_set_state(struct carp_softc *, int, const char* reason);
+static void carp_sc_state(struct carp_softc *);
+static void carp_setrun(struct carp_softc *, sa_family_t);
+static void carp_master_down(void *);
+static void carp_master_down_locked(struct carp_softc *,
+ const char* reason);
+static void carp_send_ad(void *);
+static void carp_send_ad_locked(struct carp_softc *);
+static void carp_addroute(struct carp_softc *);
+static void carp_ifa_addroute(struct ifaddr *);
+static void carp_delroute(struct carp_softc *);
+static void carp_ifa_delroute(struct ifaddr *);
+static void carp_send_ad_all(void *, int);
+static void carp_demote_adj(int, char *);
+
+static LIST_HEAD(, carp_softc) carp_list;
+static struct mtx carp_mtx;
+static struct sx carp_sx;
+static struct task carp_sendall_task =
+ TASK_INITIALIZER(0, carp_send_ad_all, NULL);
static void
carp_hmac_prepare(struct carp_softc *sc)
{
- u_int8_t version = CARP_VERSION, type = CARP_ADVERTISEMENT;
- u_int8_t vhid = sc->sc_vhid & 0xff;
+ uint8_t version = CARP_VERSION, type = CARP_ADVERTISEMENT;
+ uint8_t vhid = sc->sc_vhid & 0xff;
struct ifaddr *ifa;
int i, found;
#ifdef INET
@@ -271,18 +337,15 @@ carp_hmac_prepare(struct carp_softc *sc)
struct in6_addr last6, cur6, in6;
#endif
- if (sc->sc_carpdev)
- CARP_SCLOCK(sc);
+ CARP_LOCK_ASSERT(sc);
- /* XXX: possible race here */
-
- /* compute ipad from key */
+ /* Compute ipad from key. */
bzero(sc->sc_pad, sizeof(sc->sc_pad));
bcopy(sc->sc_key, sc->sc_pad, sizeof(sc->sc_key));
for (i = 0; i < sizeof(sc->sc_pad); i++)
sc->sc_pad[i] ^= 0x36;
- /* precompute first part of inner hash */
+ /* Precompute first part of inner hash. */
SHA1Init(&sc->sc_sha1);
SHA1Update(&sc->sc_sha1, sc->sc_pad, sizeof(sc->sc_pad));
SHA1Update(&sc->sc_sha1, (void *)&version, sizeof(version));
@@ -294,8 +357,7 @@ carp_hmac_prepare(struct carp_softc *sc)
found = 0;
last = cur;
cur.s_addr = 0xffffffff;
- IF_ADDR_RLOCK(SC2IFP(sc));
- TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) {
+ CARP_FOREACH_IFA(sc, ifa) {
in.s_addr = ifatoia(ifa)->ia_addr.sin_addr.s_addr;
if (ifa->ifa_addr->sa_family == AF_INET &&
ntohl(in.s_addr) > ntohl(last.s_addr) &&
@@ -304,7 +366,6 @@ carp_hmac_prepare(struct carp_softc *sc)
found++;
}
}
- IF_ADDR_RUNLOCK(SC2IFP(sc));
if (found)
SHA1Update(&sc->sc_sha1, (void *)&cur, sizeof(cur));
} while (found);
@@ -315,8 +376,7 @@ carp_hmac_prepare(struct carp_softc *sc)
found = 0;
last6 = cur6;
memset(&cur6, 0xff, sizeof(cur6));
- IF_ADDR_RLOCK(SC2IFP(sc));
- TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) {
+ CARP_FOREACH_IFA(sc, ifa) {
in6 = ifatoia6(ifa)->ia_addr.sin6_addr;
if (IN6_IS_SCOPE_EMBED(&in6))
in6.s6_addr16[1] = 0;
@@ -327,7 +387,6 @@ carp_hmac_prepare(struct carp_softc *sc)
found++;
}
}
- IF_ADDR_RUNLOCK(SC2IFP(sc));
if (found)
SHA1Update(&sc->sc_sha1, (void *)&cur6, sizeof(cur6));
} while (found);
@@ -336,17 +395,16 @@ carp_hmac_prepare(struct carp_softc *sc)
/* convert ipad to opad */
for (i = 0; i < sizeof(sc->sc_pad); i++)
sc->sc_pad[i] ^= 0x36 ^ 0x5c;
-
- if (sc->sc_carpdev)
- CARP_SCUNLOCK(sc);
}
static void
-carp_hmac_generate(struct carp_softc *sc, u_int32_t counter[2],
+carp_hmac_generate(struct carp_softc *sc, uint32_t counter[2],
unsigned char md[20])
{
SHA1_CTX sha1ctx;
+ CARP_LOCK_ASSERT(sc);
+
/* fetch first half of inner hash */
bcopy(&sc->sc_sha1, &sha1ctx, sizeof(sha1ctx));
@@ -361,260 +419,68 @@ carp_hmac_generate(struct carp_softc *sc, u_int32_t counter[2],
}
static int
-carp_hmac_verify(struct carp_softc *sc, u_int32_t counter[2],
+carp_hmac_verify(struct carp_softc *sc, uint32_t counter[2],
unsigned char md[20])
{
unsigned char md2[20];
- CARP_SCLOCK_ASSERT(sc);
+ CARP_LOCK_ASSERT(sc);
carp_hmac_generate(sc, counter, md2);
return (bcmp(md, md2, sizeof(md2)));
}
-static void
-carp_setroute(struct carp_softc *sc, int cmd)
-{
- struct ifaddr *ifa;
- int s;
-
- if (sc->sc_carpdev)
- CARP_SCLOCK_ASSERT(sc);
-
- s = splnet();
- TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) {
-#ifdef INET
- if (ifa->ifa_addr->sa_family == AF_INET &&
- sc->sc_carpdev != NULL) {
- int count = carp_addrcount(
- (struct carp_if *)sc->sc_carpdev->if_carp,
- ifatoia(ifa), CARP_COUNT_MASTER);
-
- if ((cmd == RTM_ADD && count == 1) ||
- (cmd == RTM_DELETE && count == 0))
- rtinit(ifa, cmd, RTF_UP | RTF_HOST);
- }
-#endif
- }
- splx(s);
-}
-
-static int
-carp_clone_create(struct if_clone *ifc, int unit, caddr_t params)
-{
-
- struct carp_softc *sc;
- struct ifnet *ifp;
-
- sc = malloc(sizeof(*sc), M_CARP, M_WAITOK|M_ZERO);
- ifp = SC2IFP(sc) = if_alloc(IFT_ETHER);
- if (ifp == NULL) {
- free(sc, M_CARP);
- return (ENOSPC);
- }
-
- sc->sc_flags_backup = 0;
- sc->sc_suppress = 0;
- sc->sc_advbase = CARP_DFLTINTV;
- sc->sc_vhid = -1; /* required setting */
- sc->sc_advskew = 0;
- sc->sc_init_counter = 1;
- sc->sc_naddrs = sc->sc_naddrs6 = 0; /* M_ZERO? */
-#ifdef INET
- sc->sc_imo.imo_membership = (struct in_multi **)malloc(
- (sizeof(struct in_multi *) * IP_MIN_MEMBERSHIPS), M_CARP,
- M_WAITOK);
- sc->sc_imo.imo_mfilters = NULL;
- sc->sc_imo.imo_max_memberships = IP_MIN_MEMBERSHIPS;
- sc->sc_imo.imo_multicast_vif = -1;
-#endif
-#ifdef INET6
- sc->sc_im6o.im6o_membership = (struct in6_multi **)malloc(
- (sizeof(struct in6_multi *) * IPV6_MIN_MEMBERSHIPS), M_CARP,
- M_WAITOK);
- sc->sc_im6o.im6o_mfilters = NULL;
- sc->sc_im6o.im6o_max_memberships = IPV6_MIN_MEMBERSHIPS;
- sc->sc_im6o.im6o_multicast_hlim = CARP_DFLTTL;
-#endif
-
- callout_init(&sc->sc_ad_tmo, CALLOUT_MPSAFE);
- callout_init(&sc->sc_md_tmo, CALLOUT_MPSAFE);
- callout_init(&sc->sc_md6_tmo, CALLOUT_MPSAFE);
-
- ifp->if_softc = sc;
- if_initname(ifp, CARP_IFNAME, unit);
- ifp->if_mtu = ETHERMTU;
- ifp->if_flags = IFF_LOOPBACK;
- ifp->if_ioctl = carp_ioctl;
- ifp->if_output = carp_looutput;
- ifp->if_start = carp_start;
- ifp->if_type = IFT_CARP;
- ifp->if_snd.ifq_maxlen = ifqmaxlen;
- ifp->if_hdrlen = 0;
- if_attach(ifp);
- bpfattach(SC2IFP(sc), DLT_NULL, sizeof(u_int32_t));
- mtx_lock(&carp_mtx);
- LIST_INSERT_HEAD(&carpif_list, sc, sc_next);
- mtx_unlock(&carp_mtx);
- return (0);
-}
-
-static void
-carp_clone_destroy(struct ifnet *ifp)
-{
- struct carp_softc *sc = ifp->if_softc;
-
- if (sc->sc_carpdev)
- CARP_SCLOCK(sc);
- carpdetach(sc, 1); /* Returns unlocked. */
-
- mtx_lock(&carp_mtx);
- LIST_REMOVE(sc, sc_next);
- mtx_unlock(&carp_mtx);
- bpfdetach(ifp);
- if_detach(ifp);
- if_free_type(ifp, IFT_ETHER);
-#ifdef INET
- free(sc->sc_imo.imo_membership, M_CARP);
-#endif
-#ifdef INET6
- free(sc->sc_im6o.im6o_membership, M_CARP);
-#endif
- free(sc, M_CARP);
-}
-
-/*
- * This function can be called on CARP interface destroy path,
- * and in case of the removal of the underlying interface as
- * well. We differentiate these two cases: in case of destruction
- * of the underlying interface, we do not cleanup our multicast
- * memberships, since they are already freed. But we purge pointers
- * to multicast structures, since they are no longer valid, to
- * avoid panic in future calls to carpdetach(). Also, we do not
- * release the lock on return, because the function will be
- * called once more, for another CARP instance on the same
- * interface.
- */
-static void
-carpdetach(struct carp_softc *sc, int unlock)
-{
- struct carp_if *cif;
-
- callout_stop(&sc->sc_ad_tmo);
- callout_stop(&sc->sc_md_tmo);
- callout_stop(&sc->sc_md6_tmo);
-
- if (sc->sc_suppress)
- carp_suppress_preempt--;
- sc->sc_suppress = 0;
-
- if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS)
- carp_suppress_preempt--;
- sc->sc_sendad_errors = 0;
-
- carp_set_state(sc, INIT);
- SC2IFP(sc)->if_flags &= ~IFF_UP;
- carp_setrun(sc, 0);
-#ifdef INET
- carp_multicast_cleanup(sc, unlock);
-#endif
-#ifdef INET6
- carp_multicast6_cleanup(sc, unlock);
-#endif
-
- if (sc->sc_carpdev != NULL) {
- cif = (struct carp_if *)sc->sc_carpdev->if_carp;
- CARP_LOCK_ASSERT(cif);
- TAILQ_REMOVE(&cif->vhif_vrs, sc, sc_list);
- if (!--cif->vhif_nvrs) {
- ifpromisc(sc->sc_carpdev, 0);
- sc->sc_carpdev->if_carp = NULL;
- CARP_LOCK_DESTROY(cif);
- free(cif, M_CARP);
- } else if (unlock)
- CARP_UNLOCK(cif);
- sc->sc_carpdev = NULL;
- }
-}
-
-/* Detach an interface from the carp. */
-static void
-carp_ifdetach(void *arg __unused, struct ifnet *ifp)
-{
- struct carp_if *cif = (struct carp_if *)ifp->if_carp;
- struct carp_softc *sc, *nextsc;
-
- if (cif == NULL)
- return;
-
- /*
- * XXX: At the end of for() cycle the lock will be destroyed.
- */
- CARP_LOCK(cif);
- for (sc = TAILQ_FIRST(&cif->vhif_vrs); sc; sc = nextsc) {
- nextsc = TAILQ_NEXT(sc, sc_list);
- carpdetach(sc, 0);
- }
-}
-
/*
* process input packet.
* we have rearranged checks order compared to the rfc,
* but it seems more efficient this way or not possible otherwise.
*/
#ifdef INET
-void
-carp_input(struct mbuf *m, int hlen)
+int
+carp_input(struct mbuf **mp, int *offp, int proto)
{
+ struct mbuf *m = *mp;
struct ip *ip = mtod(m, struct ip *);
struct carp_header *ch;
int iplen, len;
- CARPSTATS_INC(carps_ipackets);
+ iplen = *offp;
+ *mp = NULL;
- if (!carp_opts[CARPCTL_ALLOW]) {
- m_freem(m);
- return;
- }
+ CARPSTATS_INC(carps_ipackets);
- /* check if received on a valid carp interface */
- if (m->m_pkthdr.rcvif->if_carp == NULL) {
- CARPSTATS_INC(carps_badif);
- CARP_DEBUG("carp_input: packet received on non-carp "
- "interface: %s\n",
- m->m_pkthdr.rcvif->if_xname);
+ if (!V_carp_allow) {
m_freem(m);
- return;
+ return (IPPROTO_DONE);
}
/* verify that the IP TTL is 255. */
if (ip->ip_ttl != CARP_DFLTTL) {
CARPSTATS_INC(carps_badttl);
- CARP_DEBUG("carp_input: received ttl %d != 255 on %s\n",
+ CARP_DEBUG("%s: received ttl %d != 255 on %s\n", __func__,
ip->ip_ttl,
m->m_pkthdr.rcvif->if_xname);
m_freem(m);
- return;
+ return (IPPROTO_DONE);
}
iplen = ip->ip_hl << 2;
if (m->m_pkthdr.len < iplen + sizeof(*ch)) {
CARPSTATS_INC(carps_badlen);
- CARP_DEBUG("carp_input: received len %zd < "
- "sizeof(struct carp_header) on %s\n",
- m->m_len - sizeof(struct ip),
+ CARP_DEBUG("%s: received len %zd < sizeof(struct carp_header) "
+ "on %s\n", __func__, m->m_len - sizeof(struct ip),
m->m_pkthdr.rcvif->if_xname);
m_freem(m);
- return;
+ return (IPPROTO_DONE);
}
if (iplen + sizeof(*ch) < m->m_len) {
if ((m = m_pullup(m, iplen + sizeof(*ch))) == NULL) {
CARPSTATS_INC(carps_hdrops);
- CARP_DEBUG("carp_input: pullup failed\n");
- return;
+ CARP_DEBUG("%s: pullup failed\n", __func__);
+ return (IPPROTO_DONE);
}
ip = mtod(m, struct ip *);
}
@@ -627,32 +493,33 @@ carp_input(struct mbuf *m, int hlen)
len = iplen + sizeof(*ch);
if (len > m->m_pkthdr.len) {
CARPSTATS_INC(carps_badlen);
- CARP_DEBUG("carp_input: packet too short %d on %s\n",
+ CARP_DEBUG("%s: packet too short %d on %s\n", __func__,
m->m_pkthdr.len,
m->m_pkthdr.rcvif->if_xname);
m_freem(m);
- return;
+ return (IPPROTO_DONE);
}
if ((m = m_pullup(m, len)) == NULL) {
CARPSTATS_INC(carps_hdrops);
- return;
+ return (IPPROTO_DONE);
}
ip = mtod(m, struct ip *);
ch = (struct carp_header *)((char *)ip + iplen);
/* verify the CARP checksum */
m->m_data += iplen;
- if (carp_cksum(m, len - iplen)) {
+ if (in_cksum(m, len - iplen)) {
CARPSTATS_INC(carps_badsum);
- CARP_DEBUG("carp_input: checksum failed on %s\n",
+ CARP_DEBUG("%s: checksum failed on %s\n", __func__,
m->m_pkthdr.rcvif->if_xname);
m_freem(m);
- return;
+ return (IPPROTO_DONE);
}
m->m_data -= iplen;
carp_input_c(m, ch, AF_INET);
+ return (IPPROTO_DONE);
}
#endif
@@ -667,7 +534,7 @@ carp6_input(struct mbuf **mp, int *offp, int proto)
CARPSTATS_INC(carps_ipackets6);
- if (!carp_opts[CARPCTL_ALLOW]) {
+ if (!V_carp_allow) {
m_freem(m);
return (IPPROTO_DONE);
}
@@ -675,9 +542,8 @@ carp6_input(struct mbuf **mp, int *offp, int proto)
/* check if received on a valid carp interface */
if (m->m_pkthdr.rcvif->if_carp == NULL) {
CARPSTATS_INC(carps_badif);
- CARP_DEBUG("carp6_input: packet received on non-carp "
- "interface: %s\n",
- m->m_pkthdr.rcvif->if_xname);
+ CARP_DEBUG("%s: packet received on non-carp interface: %s\n",
+ __func__, m->m_pkthdr.rcvif->if_xname);
m_freem(m);
return (IPPROTO_DONE);
}
@@ -685,9 +551,8 @@ carp6_input(struct mbuf **mp, int *offp, int proto)
/* verify that the IP TTL is 255 */
if (ip6->ip6_hlim != CARP_DFLTTL) {
CARPSTATS_INC(carps_badttl);
- CARP_DEBUG("carp6_input: received ttl %d != 255 on %s\n",
- ip6->ip6_hlim,
- m->m_pkthdr.rcvif->if_xname);
+ CARP_DEBUG("%s: received ttl %d != 255 on %s\n", __func__,
+ ip6->ip6_hlim, m->m_pkthdr.rcvif->if_xname);
m_freem(m);
return (IPPROTO_DONE);
}
@@ -697,16 +562,16 @@ carp6_input(struct mbuf **mp, int *offp, int proto)
IP6_EXTHDR_GET(ch, struct carp_header *, m, *offp, sizeof(*ch));
if (ch == NULL) {
CARPSTATS_INC(carps_badlen);
- CARP_DEBUG("carp6_input: packet size %u too small\n", len);
+ CARP_DEBUG("%s: packet size %u too small\n", __func__, len);
return (IPPROTO_DONE);
}
/* verify the CARP checksum */
m->m_data += *offp;
- if (carp_cksum(m, sizeof(*ch))) {
+ if (in_cksum(m, sizeof(*ch))) {
CARPSTATS_INC(carps_badsum);
- CARP_DEBUG("carp6_input: checksum failed, on %s\n",
+ CARP_DEBUG("%s: checksum failed, on %s\n", __func__,
m->m_pkthdr.rcvif->if_xname);
m_freem(m);
return (IPPROTO_DONE);
@@ -722,62 +587,46 @@ static void
carp_input_c(struct mbuf *m, struct carp_header *ch, sa_family_t af)
{
struct ifnet *ifp = m->m_pkthdr.rcvif;
+ struct ifaddr *ifa;
struct carp_softc *sc;
- u_int64_t tmp_counter;
+ uint64_t tmp_counter;
struct timeval sc_tv, ch_tv;
/* verify that the VHID is valid on the receiving interface */
- CARP_LOCK(ifp->if_carp);
- TAILQ_FOREACH(sc, &((struct carp_if *)ifp->if_carp)->vhif_vrs, sc_list)
- if (sc->sc_vhid == ch->carp_vhid)
+ IF_ADDR_RLOCK(ifp);
+ IFNET_FOREACH_IFA(ifp, ifa)
+ if (ifa->ifa_addr->sa_family == af &&
+ ifa->ifa_carp->sc_vhid == ch->carp_vhid) {
+ ifa_ref(ifa);
break;
+ }
+ IF_ADDR_RUNLOCK(ifp);
- if (!sc || !((SC2IFP(sc)->if_flags & IFF_UP) &&
- (SC2IFP(sc)->if_drv_flags & IFF_DRV_RUNNING))) {
+ if (ifa == NULL) {
CARPSTATS_INC(carps_badvhid);
- CARP_UNLOCK(ifp->if_carp);
m_freem(m);
return;
}
- getmicrotime(&SC2IFP(sc)->if_lastchange);
- SC2IFP(sc)->if_ipackets++;
- SC2IFP(sc)->if_ibytes += m->m_pkthdr.len;
-
- if (bpf_peers_present(SC2IFP(sc)->if_bpf)) {
- uint32_t af1 = af;
-#ifdef INET
- struct ip *ip = mtod(m, struct ip *);
-
- /* BPF wants net byte order */
- if (af == AF_INET) {
- ip->ip_len = htons(ip->ip_len + (ip->ip_hl << 2));
- ip->ip_off = htons(ip->ip_off);
- }
-#endif
- bpf_mtap2(SC2IFP(sc)->if_bpf, &af1, sizeof(af1), m);
- }
-
/* verify the CARP version. */
if (ch->carp_version != CARP_VERSION) {
CARPSTATS_INC(carps_badver);
- SC2IFP(sc)->if_ierrors++;
- CARP_UNLOCK(ifp->if_carp);
- CARP_DEBUG("%s; invalid version %d\n",
- SC2IFP(sc)->if_xname,
+ CARP_DEBUG("%s: invalid version %d\n", ifp->if_xname,
ch->carp_version);
+ ifa_free(ifa);
m_freem(m);
return;
}
- /* verify the hash */
+ sc = ifa->ifa_carp;
+ CARP_LOCK(sc);
+ ifa_free(ifa);
+
if (carp_hmac_verify(sc, ch->carp_counter, ch->carp_md)) {
CARPSTATS_INC(carps_badauth);
- SC2IFP(sc)->if_ierrors++;
- CARP_UNLOCK(ifp->if_carp);
- CARP_DEBUG("%s: incorrect hash\n", SC2IFP(sc)->if_xname);
- m_freem(m);
- return;
+ CARP_DEBUG("%s: incorrect hash for VHID %u@%s\n", __func__,
+ sc->sc_vhid, ifp->if_xname);
+ goto out;
}
tmp_counter = ntohl(ch->carp_counter[0]);
@@ -790,10 +639,7 @@ carp_input_c(struct mbuf *m, struct carp_header *ch, sa_family_t af)
sc->sc_counter = tmp_counter;
sc_tv.tv_sec = sc->sc_advbase;
- if (carp_suppress_preempt && sc->sc_advskew < 240)
- sc_tv.tv_usec = 240 * 1000000 / 256;
- else
- sc_tv.tv_usec = sc->sc_advskew * 1000000 / 256;
+ sc_tv.tv_usec = DEMOTE_ADVSKEW(sc) * 1000000 / 256;
ch_tv.tv_sec = ch->carp_advbase;
ch_tv.tv_usec = ch->carp_advskew * 1000000 / 256;
@@ -808,12 +654,10 @@ carp_input_c(struct mbuf *m, struct carp_header *ch, sa_family_t af)
if (timevalcmp(&sc_tv, &ch_tv, >) ||
timevalcmp(&sc_tv, &ch_tv, ==)) {
callout_stop(&sc->sc_ad_tmo);
- CARP_LOG("%s: MASTER -> BACKUP "
- "(more frequent advertisement received)\n",
- SC2IFP(sc)->if_xname);
- carp_set_state(sc, BACKUP);
+ carp_set_state(sc, BACKUP,
+ "more frequent advertisement received");
carp_setrun(sc, 0);
- carp_setroute(sc, RTM_DELETE);
+ carp_delroute(sc);
}
break;
case BACKUP:
@@ -821,12 +665,9 @@ carp_input_c(struct mbuf *m, struct carp_header *ch, sa_family_t af)
* If we're pre-empting masters who advertise slower than us,
* and this one claims to be slower, treat him as down.
*/
- if (carp_opts[CARPCTL_PREEMPT] &&
- timevalcmp(&sc_tv, &ch_tv, <)) {
- CARP_LOG("%s: BACKUP -> MASTER "
- "(preempting a slower master)\n",
- SC2IFP(sc)->if_xname);
- carp_master_down_locked(sc);
+ if (V_carp_preempt && timevalcmp(&sc_tv, &ch_tv, <)) {
+ carp_master_down_locked(sc,
+ "preempting a slower master");
break;
}
@@ -837,10 +678,7 @@ carp_input_c(struct mbuf *m, struct carp_header *ch, sa_family_t af)
*/
sc_tv.tv_sec = sc->sc_advbase * 3;
if (timevalcmp(&sc_tv, &ch_tv, <)) {
- CARP_LOG("%s: BACKUP -> MASTER "
- "(master timed out)\n",
- SC2IFP(sc)->if_xname);
- carp_master_down_locked(sc);
+ carp_master_down_locked(sc, "master will time out");
break;
}
@@ -852,17 +690,15 @@ carp_input_c(struct mbuf *m, struct carp_header *ch, sa_family_t af)
break;
}
- CARP_UNLOCK(ifp->if_carp);
-
+out:
+ CARP_UNLOCK(sc);
m_freem(m);
- return;
}
static int
carp_prepare_ad(struct mbuf *m, struct carp_softc *sc, struct carp_header *ch)
{
struct m_tag *mtag;
- struct ifnet *ifp = SC2IFP(sc);
if (sc->sc_init_counter) {
/* this could also be seconds since unix epoch */
@@ -878,45 +714,79 @@ carp_prepare_ad(struct mbuf *m, struct carp_softc *sc, struct carp_header *ch)
carp_hmac_generate(sc, ch->carp_counter, ch->carp_md);
/* Tag packet for carp_output */
- mtag = m_tag_get(PACKET_TAG_CARP, sizeof(struct ifnet *), M_NOWAIT);
- if (mtag == NULL) {
+ if ((mtag = m_tag_get(PACKET_TAG_CARP, sizeof(struct carp_softc *),
+ M_NOWAIT)) == NULL) {
m_freem(m);
- SC2IFP(sc)->if_oerrors++;
+ CARPSTATS_INC(carps_onomem);
return (ENOMEM);
}
- bcopy(&ifp, (caddr_t)(mtag + 1), sizeof(struct ifnet *));
+ bcopy(&sc, mtag + 1, sizeof(sc));
m_tag_prepend(m, mtag);
return (0);
}
+/*
+ * To avoid LORs and possible recursions this function shouldn't
+ * be called directly, but scheduled via taskqueue.
+ */
static void
-carp_send_ad_all(void)
+carp_send_ad_all(void *ctx __unused, int pending __unused)
{
struct carp_softc *sc;
mtx_lock(&carp_mtx);
- LIST_FOREACH(sc, &carpif_list, sc_next) {
- if (sc->sc_carpdev == NULL)
- continue;
- CARP_SCLOCK(sc);
- if ((SC2IFP(sc)->if_flags & IFF_UP) &&
- (SC2IFP(sc)->if_drv_flags & IFF_DRV_RUNNING) &&
- sc->sc_state == MASTER)
+ LIST_FOREACH(sc, &carp_list, sc_next)
+ if (sc->sc_state == MASTER) {
+ CARP_LOCK(sc);
+ CURVNET_SET(sc->sc_carpdev->if_vnet);
carp_send_ad_locked(sc);
- CARP_SCUNLOCK(sc);
- }
+ CURVNET_RESTORE();
+ CARP_UNLOCK(sc);
+ }
mtx_unlock(&carp_mtx);
}
+/* Send a periodic advertisement, executed in callout context. */
static void
carp_send_ad(void *v)
{
struct carp_softc *sc = v;
- CARP_SCLOCK(sc);
+ CARP_LOCK_ASSERT(sc);
+ CURVNET_SET(sc->sc_carpdev->if_vnet);
carp_send_ad_locked(sc);
- CARP_SCUNLOCK(sc);
+ CURVNET_RESTORE();
+ CARP_UNLOCK(sc);
+}
+
+static void
+carp_send_ad_error(struct carp_softc *sc, int error)
+{
+
+ if (error) {
+ if (sc->sc_sendad_errors < INT_MAX)
+ sc->sc_sendad_errors++;
+ if (sc->sc_sendad_errors == CARP_SENDAD_MAX_ERRORS) {
+ static const char fmt[] = "send error %d on %s";
+ char msg[sizeof(fmt) + IFNAMSIZ];
+
+ sprintf(msg, fmt, error, sc->sc_carpdev->if_xname);
+ carp_demote_adj(V_carp_senderr_adj, msg);
+ }
+ sc->sc_sendad_success = 0;
+ } else {
+ if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS &&
+ ++sc->sc_sendad_success >= CARP_SENDAD_MIN_SUCCESS) {
+ static const char fmt[] = "send ok on %s";
+ char msg[sizeof(fmt) + IFNAMSIZ];
+
+ sprintf(msg, fmt, sc->sc_carpdev->if_xname);
+ carp_demote_adj(-V_carp_senderr_adj, msg);
+ sc->sc_sendad_errors = 0;
+ } else
+ sc->sc_sendad_errors = 0;
+ }
}
static void
@@ -924,190 +794,211 @@ carp_send_ad_locked(struct carp_softc *sc)
{
struct carp_header ch;
struct timeval tv;
+ struct sockaddr sa;
+ struct ifaddr *ifa;
struct carp_header *ch_ptr;
struct mbuf *m;
- int len, advbase, advskew;
+ int len, advskew;
- CARP_SCLOCK_ASSERT(sc);
+ CARP_LOCK_ASSERT(sc);
- /* bow out if we've lost our UPness or RUNNINGuiness */
- if (!((SC2IFP(sc)->if_flags & IFF_UP) &&
- (SC2IFP(sc)->if_drv_flags & IFF_DRV_RUNNING))) {
- advbase = 255;
- advskew = 255;
- } else {
- advbase = sc->sc_advbase;
- if (!carp_suppress_preempt || sc->sc_advskew > 240)
- advskew = sc->sc_advskew;
- else
- advskew = 240;
- tv.tv_sec = advbase;
- tv.tv_usec = advskew * 1000000 / 256;
- }
+ advskew = DEMOTE_ADVSKEW(sc);
+ tv.tv_sec = sc->sc_advbase;
+ tv.tv_usec = advskew * 1000000 / 256;
ch.carp_version = CARP_VERSION;
ch.carp_type = CARP_ADVERTISEMENT;
ch.carp_vhid = sc->sc_vhid;
- ch.carp_advbase = advbase;
+ ch.carp_advbase = sc->sc_advbase;
ch.carp_advskew = advskew;
ch.carp_authlen = 7; /* XXX DEFINE */
ch.carp_pad1 = 0; /* must be zero */
ch.carp_cksum = 0;
+ /* XXXGL: OpenBSD picks first ifaddr with needed family. */
+
#ifdef INET
- if (sc->sc_ia) {
+ if (sc->sc_naddrs) {
struct ip *ip;
- MGETHDR(m, M_DONTWAIT, MT_HEADER);
+ m = m_gethdr(M_NOWAIT, MT_DATA);
if (m == NULL) {
- SC2IFP(sc)->if_oerrors++;
CARPSTATS_INC(carps_onomem);
- /* XXX maybe less ? */
- if (advbase != 255 || advskew != 255)
- callout_reset(&sc->sc_ad_tmo, tvtohz(&tv),
- carp_send_ad, sc);
- return;
+ goto resched;
}
len = sizeof(*ip) + sizeof(ch);
m->m_pkthdr.len = len;
m->m_pkthdr.rcvif = NULL;
m->m_len = len;
- MH_ALIGN(m, m->m_len);
+ M_ALIGN(m, m->m_len);
m->m_flags |= M_MCAST;
ip = mtod(m, struct ip *);
ip->ip_v = IPVERSION;
ip->ip_hl = sizeof(*ip) >> 2;
ip->ip_tos = IPTOS_LOWDELAY;
- ip->ip_len = len;
- ip->ip_id = ip_newid();
- ip->ip_off = IP_DF;
+ ip->ip_len = htons(len);
+ ip->ip_off = htons(IP_DF);
ip->ip_ttl = CARP_DFLTTL;
ip->ip_p = IPPROTO_CARP;
ip->ip_sum = 0;
- ip->ip_src.s_addr = sc->sc_ia->ia_addr.sin_addr.s_addr;
+ ip_fillid(ip);
+
+ bzero(&sa, sizeof(sa));
+ sa.sa_family = AF_INET;
+ ifa = ifaof_ifpforaddr(&sa, sc->sc_carpdev);
+ if (ifa != NULL) {
+ ip->ip_src.s_addr =
+ ifatoia(ifa)->ia_addr.sin_addr.s_addr;
+ ifa_free(ifa);
+ } else
+ ip->ip_src.s_addr = 0;
ip->ip_dst.s_addr = htonl(INADDR_CARP_GROUP);
ch_ptr = (struct carp_header *)(&ip[1]);
bcopy(&ch, ch_ptr, sizeof(ch));
if (carp_prepare_ad(m, sc, ch_ptr))
- return;
+ goto resched;
m->m_data += sizeof(*ip);
- ch_ptr->carp_cksum = carp_cksum(m, len - sizeof(*ip));
+ ch_ptr->carp_cksum = in_cksum(m, len - sizeof(*ip));
m->m_data -= sizeof(*ip);
- getmicrotime(&SC2IFP(sc)->if_lastchange);
- SC2IFP(sc)->if_opackets++;
- SC2IFP(sc)->if_obytes += len;
CARPSTATS_INC(carps_opackets);
- if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &sc->sc_imo, NULL)) {
- SC2IFP(sc)->if_oerrors++;
- if (sc->sc_sendad_errors < INT_MAX)
- sc->sc_sendad_errors++;
- if (sc->sc_sendad_errors == CARP_SENDAD_MAX_ERRORS) {
- carp_suppress_preempt++;
- if (carp_suppress_preempt == 1) {
- CARP_SCUNLOCK(sc);
- carp_send_ad_all();
- CARP_SCLOCK(sc);
- }
- }
- sc->sc_sendad_success = 0;
- } else {
- if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS) {
- if (++sc->sc_sendad_success >=
- CARP_SENDAD_MIN_SUCCESS) {
- carp_suppress_preempt--;
- sc->sc_sendad_errors = 0;
- }
- } else
- sc->sc_sendad_errors = 0;
- }
+ carp_send_ad_error(sc, ip_output(m, NULL, NULL, IP_RAWOUTPUT,
+ &sc->sc_carpdev->if_carp->cif_imo, NULL));
}
#endif /* INET */
#ifdef INET6
- if (sc->sc_ia6) {
+ if (sc->sc_naddrs6) {
struct ip6_hdr *ip6;
- MGETHDR(m, M_DONTWAIT, MT_HEADER);
+ m = m_gethdr(M_NOWAIT, MT_DATA);
if (m == NULL) {
- SC2IFP(sc)->if_oerrors++;
CARPSTATS_INC(carps_onomem);
- /* XXX maybe less ? */
- if (advbase != 255 || advskew != 255)
- callout_reset(&sc->sc_ad_tmo, tvtohz(&tv),
- carp_send_ad, sc);
- return;
+ goto resched;
}
len = sizeof(*ip6) + sizeof(ch);
m->m_pkthdr.len = len;
m->m_pkthdr.rcvif = NULL;
m->m_len = len;
- MH_ALIGN(m, m->m_len);
+ M_ALIGN(m, m->m_len);
m->m_flags |= M_MCAST;
ip6 = mtod(m, struct ip6_hdr *);
bzero(ip6, sizeof(*ip6));
ip6->ip6_vfc |= IPV6_VERSION;
ip6->ip6_hlim = CARP_DFLTTL;
ip6->ip6_nxt = IPPROTO_CARP;
- bcopy(&sc->sc_ia6->ia_addr.sin6_addr, &ip6->ip6_src,
- sizeof(struct in6_addr));
- /* set the multicast destination */
+ bzero(&sa, sizeof(sa));
+
+ /* set the source address */
+ sa.sa_family = AF_INET6;
+ ifa = ifaof_ifpforaddr(&sa, sc->sc_carpdev);
+ if (ifa != NULL) {
+ bcopy(IFA_IN6(ifa), &ip6->ip6_src,
+ sizeof(struct in6_addr));
+ ifa_free(ifa);
+ } else
+ /* This should never happen with IPv6. */
+ bzero(&ip6->ip6_src, sizeof(struct in6_addr));
+ /* Set the multicast destination. */
ip6->ip6_dst.s6_addr16[0] = htons(0xff02);
ip6->ip6_dst.s6_addr8[15] = 0x12;
if (in6_setscope(&ip6->ip6_dst, sc->sc_carpdev, NULL) != 0) {
- SC2IFP(sc)->if_oerrors++;
m_freem(m);
CARP_DEBUG("%s: in6_setscope failed\n", __func__);
- return;
+ goto resched;
}
ch_ptr = (struct carp_header *)(&ip6[1]);
bcopy(&ch, ch_ptr, sizeof(ch));
if (carp_prepare_ad(m, sc, ch_ptr))
- return;
+ goto resched;
m->m_data += sizeof(*ip6);
- ch_ptr->carp_cksum = carp_cksum(m, len - sizeof(*ip6));
+ ch_ptr->carp_cksum = in_cksum(m, len - sizeof(*ip6));
m->m_data -= sizeof(*ip6);
- getmicrotime(&SC2IFP(sc)->if_lastchange);
- SC2IFP(sc)->if_opackets++;
- SC2IFP(sc)->if_obytes += len;
CARPSTATS_INC(carps_opackets6);
- if (ip6_output(m, NULL, NULL, 0, &sc->sc_im6o, NULL, NULL)) {
- SC2IFP(sc)->if_oerrors++;
- if (sc->sc_sendad_errors < INT_MAX)
- sc->sc_sendad_errors++;
- if (sc->sc_sendad_errors == CARP_SENDAD_MAX_ERRORS) {
- carp_suppress_preempt++;
- if (carp_suppress_preempt == 1) {
- CARP_SCUNLOCK(sc);
- carp_send_ad_all();
- CARP_SCLOCK(sc);
- }
- }
- sc->sc_sendad_success = 0;
- } else {
- if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS) {
- if (++sc->sc_sendad_success >=
- CARP_SENDAD_MIN_SUCCESS) {
- carp_suppress_preempt--;
- sc->sc_sendad_errors = 0;
- }
- } else
- sc->sc_sendad_errors = 0;
- }
+ carp_send_ad_error(sc, ip6_output(m, NULL, NULL, 0,
+ &sc->sc_carpdev->if_carp->cif_im6o, NULL, NULL));
}
#endif /* INET6 */
- if (advbase != 255 || advskew != 255)
- callout_reset(&sc->sc_ad_tmo, tvtohz(&tv),
- carp_send_ad, sc);
+resched:
+ callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), carp_send_ad, sc);
+}
+
+static void
+carp_addroute(struct carp_softc *sc)
+{
+ struct ifaddr *ifa;
+
+ CARP_FOREACH_IFA(sc, ifa)
+ carp_ifa_addroute(ifa);
+}
+
+static void
+carp_ifa_addroute(struct ifaddr *ifa)
+{
+
+ switch (ifa->ifa_addr->sa_family) {
+#ifdef INET
+ case AF_INET:
+ in_addprefix(ifatoia(ifa), RTF_UP);
+ ifa_add_loopback_route(ifa,
+ (struct sockaddr *)&ifatoia(ifa)->ia_addr);
+ break;
+#endif
+#ifdef INET6
+ case AF_INET6:
+ ifa_add_loopback_route(ifa,
+ (struct sockaddr *)&ifatoia6(ifa)->ia_addr);
+ nd6_add_ifa_lle(ifatoia6(ifa));
+ break;
+#endif
+ }
+}
+static void
+carp_delroute(struct carp_softc *sc)
+{
+ struct ifaddr *ifa;
+
+ CARP_FOREACH_IFA(sc, ifa)
+ carp_ifa_delroute(ifa);
+}
+
+static void
+carp_ifa_delroute(struct ifaddr *ifa)
+{
+
+ switch (ifa->ifa_addr->sa_family) {
+#ifdef INET
+ case AF_INET:
+ ifa_del_loopback_route(ifa,
+ (struct sockaddr *)&ifatoia(ifa)->ia_addr);
+ in_scrubprefix(ifatoia(ifa), LLE_STATIC);
+ break;
+#endif
+#ifdef INET6
+ case AF_INET6:
+ ifa_del_loopback_route(ifa,
+ (struct sockaddr *)&ifatoia6(ifa)->ia_addr);
+ nd6_rem_ifa_lle(ifatoia6(ifa), 1);
+ break;
+#endif
+ }
+}
+
+int
+carp_master(struct ifaddr *ifa)
+{
+ struct carp_softc *sc = ifa->ifa_carp;
+
+ return (sc->sc_state == MASTER);
}
#ifdef INET
@@ -1120,17 +1011,27 @@ static void
carp_send_arp(struct carp_softc *sc)
{
struct ifaddr *ifa;
+ struct in_addr addr;
- TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) {
-
+ CARP_FOREACH_IFA(sc, ifa) {
if (ifa->ifa_addr->sa_family != AF_INET)
continue;
+ addr = ((struct sockaddr_in *)ifa->ifa_addr)->sin_addr;
+ arp_announce_ifaddr(sc->sc_carpdev, addr, LLADDR(&sc->sc_addr));
+ }
+}
-/* arprequest(sc->sc_carpdev, &in, &in, IF_LLADDR(sc->sc_ifp)); */
- arp_ifinit2(sc->sc_carpdev, ifa, IF_LLADDR(sc->sc_ifp));
+int
+carp_iamatch(struct ifaddr *ifa, uint8_t **enaddr)
+{
+ struct carp_softc *sc = ifa->ifa_carp;
- DELAY(1000); /* XXX */
+ if (sc->sc_state == MASTER) {
+ *enaddr = LLADDR(&sc->sc_addr);
+ return (1);
}
+
+ return (0);
}
#endif
@@ -1138,262 +1039,148 @@ carp_send_arp(struct carp_softc *sc)
static void
carp_send_na(struct carp_softc *sc)
{
+ static struct in6_addr mcast = IN6ADDR_LINKLOCAL_ALLNODES_INIT;
struct ifaddr *ifa;
struct in6_addr *in6;
- static struct in6_addr mcast = IN6ADDR_LINKLOCAL_ALLNODES_INIT;
-
- TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) {
+ CARP_FOREACH_IFA(sc, ifa) {
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
- in6 = &ifatoia6(ifa)->ia_addr.sin6_addr;
+ in6 = IFA_IN6(ifa);
nd6_na_output(sc->sc_carpdev, &mcast, in6,
ND_NA_FLAG_OVERRIDE, 1, NULL);
DELAY(1000); /* XXX */
}
}
-#endif /* INET6 */
-
-#ifdef INET
-static int
-carp_addrcount(struct carp_if *cif, struct in_ifaddr *ia, int type)
-{
- struct carp_softc *vh;
- struct ifaddr *ifa;
- int count = 0;
-
- CARP_LOCK_ASSERT(cif);
-
- TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) {
- if ((type == CARP_COUNT_RUNNING &&
- (SC2IFP(vh)->if_flags & IFF_UP) &&
- (SC2IFP(vh)->if_drv_flags & IFF_DRV_RUNNING)) ||
- (type == CARP_COUNT_MASTER && vh->sc_state == MASTER)) {
- IF_ADDR_RLOCK(SC2IFP(vh));
- TAILQ_FOREACH(ifa, &SC2IFP(vh)->if_addrlist,
- ifa_list) {
- if (ifa->ifa_addr->sa_family == AF_INET &&
- ia->ia_addr.sin_addr.s_addr ==
- ifatoia(ifa)->ia_addr.sin_addr.s_addr)
- count++;
- }
- IF_ADDR_RUNLOCK(SC2IFP(vh));
- }
- }
- return (count);
-}
-
-int
-carp_iamatch(struct ifnet *ifp, struct in_ifaddr *ia,
- struct in_addr *isaddr, u_int8_t **enaddr)
-{
- struct carp_if *cif;
- struct carp_softc *vh;
- int index, count = 0;
- struct ifaddr *ifa;
-
- cif = ifp->if_carp;
- CARP_LOCK(cif);
-
- if (carp_opts[CARPCTL_ARPBALANCE]) {
- /*
- * XXX proof of concept implementation.
- * We use the source ip to decide which virtual host should
- * handle the request. If we're master of that virtual host,
- * then we respond, otherwise, just drop the arp packet on
- * the floor.
- */
- count = carp_addrcount(cif, ia, CARP_COUNT_RUNNING);
- if (count == 0) {
- /* should never reach this */
- CARP_UNLOCK(cif);
- return (0);
- }
-
- /* this should be a hash, like pf_hash() */
- index = ntohl(isaddr->s_addr) % count;
- count = 0;
-
- TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) {
- if ((SC2IFP(vh)->if_flags & IFF_UP) &&
- (SC2IFP(vh)->if_drv_flags & IFF_DRV_RUNNING)) {
- IF_ADDR_RLOCK(SC2IFP(vh));
- TAILQ_FOREACH(ifa, &SC2IFP(vh)->if_addrlist,
- ifa_list) {
- if (ifa->ifa_addr->sa_family ==
- AF_INET &&
- ia->ia_addr.sin_addr.s_addr ==
- ifatoia(ifa)->ia_addr.sin_addr.s_addr) {
- if (count == index) {
- if (vh->sc_state ==
- MASTER) {
- *enaddr = IF_LLADDR(vh->sc_ifp);
- IF_ADDR_RUNLOCK(SC2IFP(vh));
- CARP_UNLOCK(cif);
- return (1);
- } else {
- IF_ADDR_RUNLOCK(SC2IFP(vh));
- CARP_UNLOCK(cif);
- return (0);
- }
- }
- count++;
- }
- }
- IF_ADDR_RUNLOCK(SC2IFP(vh));
- }
- }
- } else {
- TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) {
- if ((SC2IFP(vh)->if_flags & IFF_UP) &&
- (SC2IFP(vh)->if_drv_flags & IFF_DRV_RUNNING) &&
- ia->ia_ifp == SC2IFP(vh) &&
- vh->sc_state == MASTER) {
- *enaddr = IF_LLADDR(vh->sc_ifp);
- CARP_UNLOCK(cif);
- return (1);
- }
- }
- }
- CARP_UNLOCK(cif);
- return (0);
-}
-#endif
-#ifdef INET6
+/*
+ * Returns ifa in case it's a carp address and it is MASTER, or if the address
+ * matches and is not a carp address. Returns NULL otherwise.
+ */
struct ifaddr *
carp_iamatch6(struct ifnet *ifp, struct in6_addr *taddr)
{
- struct carp_if *cif;
- struct carp_softc *vh;
struct ifaddr *ifa;
- cif = ifp->if_carp;
- CARP_LOCK(cif);
- TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) {
- IF_ADDR_RLOCK(SC2IFP(vh));
- TAILQ_FOREACH(ifa, &SC2IFP(vh)->if_addrlist, ifa_list) {
- if (IN6_ARE_ADDR_EQUAL(taddr,
- &ifatoia6(ifa)->ia_addr.sin6_addr) &&
- (SC2IFP(vh)->if_flags & IFF_UP) &&
- (SC2IFP(vh)->if_drv_flags & IFF_DRV_RUNNING) &&
- vh->sc_state == MASTER) {
- ifa_ref(ifa);
- IF_ADDR_RUNLOCK(SC2IFP(vh));
- CARP_UNLOCK(cif);
- return (ifa);
- }
- }
- IF_ADDR_RUNLOCK(SC2IFP(vh));
+ ifa = NULL;
+ IF_ADDR_RLOCK(ifp);
+ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+ if (ifa->ifa_addr->sa_family != AF_INET6)
+ continue;
+ if (!IN6_ARE_ADDR_EQUAL(taddr, IFA_IN6(ifa)))
+ continue;
+ if (ifa->ifa_carp && ifa->ifa_carp->sc_state != MASTER)
+ ifa = NULL;
+ else
+ ifa_ref(ifa);
+ break;
}
- CARP_UNLOCK(cif);
-
- return (NULL);
+ IF_ADDR_RUNLOCK(ifp);
+
+ return (ifa);
}
caddr_t
carp_macmatch6(struct ifnet *ifp, struct mbuf *m, const struct in6_addr *taddr)
{
- struct m_tag *mtag;
- struct carp_if *cif;
- struct carp_softc *sc;
struct ifaddr *ifa;
- cif = ifp->if_carp;
- CARP_LOCK(cif);
- TAILQ_FOREACH(sc, &cif->vhif_vrs, sc_list) {
- IF_ADDR_RLOCK(SC2IFP(sc));
- TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) {
- if (IN6_ARE_ADDR_EQUAL(taddr,
- &ifatoia6(ifa)->ia_addr.sin6_addr) &&
- (SC2IFP(sc)->if_flags & IFF_UP) &&
- (SC2IFP(sc)->if_drv_flags & IFF_DRV_RUNNING)) {
- struct ifnet *ifp = SC2IFP(sc);
- mtag = m_tag_get(PACKET_TAG_CARP,
- sizeof(struct ifnet *), M_NOWAIT);
- if (mtag == NULL) {
- /* better a bit than nothing */
- IF_ADDR_RUNLOCK(SC2IFP(sc));
- CARP_UNLOCK(cif);
- return (IF_LLADDR(sc->sc_ifp));
- }
- bcopy(&ifp, (caddr_t)(mtag + 1),
- sizeof(struct ifnet *));
- m_tag_prepend(m, mtag);
+ IF_ADDR_RLOCK(ifp);
+ IFNET_FOREACH_IFA(ifp, ifa)
+ if (ifa->ifa_addr->sa_family == AF_INET6 &&
+ IN6_ARE_ADDR_EQUAL(taddr, IFA_IN6(ifa))) {
+ struct carp_softc *sc = ifa->ifa_carp;
+ struct m_tag *mtag;
- IF_ADDR_RUNLOCK(SC2IFP(sc));
- CARP_UNLOCK(cif);
- return (IF_LLADDR(sc->sc_ifp));
- }
+ IF_ADDR_RUNLOCK(ifp);
+
+ mtag = m_tag_get(PACKET_TAG_CARP,
+ sizeof(struct carp_softc *), M_NOWAIT);
+ if (mtag == NULL)
+ /* Better a bit than nothing. */
+ return (LLADDR(&sc->sc_addr));
+
+ bcopy(&sc, mtag + 1, sizeof(sc));
+ m_tag_prepend(m, mtag);
+
+ return (LLADDR(&sc->sc_addr));
}
- IF_ADDR_RUNLOCK(SC2IFP(sc));
- }
- CARP_UNLOCK(cif);
+ IF_ADDR_RUNLOCK(ifp);
return (NULL);
}
-#endif
+#endif /* INET6 */
-struct ifnet *
+int
carp_forus(struct ifnet *ifp, u_char *dhost)
{
- struct carp_if *cif;
- struct carp_softc *vh;
- u_int8_t *ena = dhost;
+ struct carp_softc *sc;
+ uint8_t *ena = dhost;
if (ena[0] || ena[1] || ena[2] != 0x5e || ena[3] || ena[4] != 1)
- return (NULL);
-
- cif = ifp->if_carp;
- CARP_LOCK(cif);
- TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list)
- if ((SC2IFP(vh)->if_flags & IFF_UP) &&
- (SC2IFP(vh)->if_drv_flags & IFF_DRV_RUNNING) &&
- vh->sc_state == MASTER &&
- !bcmp(dhost, IF_LLADDR(vh->sc_ifp), ETHER_ADDR_LEN)) {
- CARP_UNLOCK(cif);
- return (SC2IFP(vh));
+ return (0);
+
+ CIF_LOCK(ifp->if_carp);
+ IFNET_FOREACH_CARP(ifp, sc) {
+ CARP_LOCK(sc);
+ if (sc->sc_state == MASTER && !bcmp(dhost, LLADDR(&sc->sc_addr),
+ ETHER_ADDR_LEN)) {
+ CARP_UNLOCK(sc);
+ CIF_UNLOCK(ifp->if_carp);
+ return (1);
}
+ CARP_UNLOCK(sc);
+ }
+ CIF_UNLOCK(ifp->if_carp);
- CARP_UNLOCK(cif);
- return (NULL);
+ return (0);
}
+/* Master down timeout event, executed in callout context. */
static void
carp_master_down(void *v)
{
struct carp_softc *sc = v;
- CARP_SCLOCK(sc);
- carp_master_down_locked(sc);
- CARP_SCUNLOCK(sc);
+ CARP_LOCK_ASSERT(sc);
+
+ CURVNET_SET(sc->sc_carpdev->if_vnet);
+ if (sc->sc_state == BACKUP) {
+ carp_master_down_locked(sc, "master timed out");
+ }
+ CURVNET_RESTORE();
+
+ CARP_UNLOCK(sc);
}
static void
-carp_master_down_locked(struct carp_softc *sc)
+carp_master_down_locked(struct carp_softc *sc, const char *reason)
{
- if (sc->sc_carpdev)
- CARP_SCLOCK_ASSERT(sc);
+
+ CARP_LOCK_ASSERT(sc);
switch (sc->sc_state) {
- case INIT:
- printf("%s: master_down event in INIT state\n",
- SC2IFP(sc)->if_xname);
- break;
- case MASTER:
- break;
case BACKUP:
- carp_set_state(sc, MASTER);
+ carp_set_state(sc, MASTER, reason);
carp_send_ad_locked(sc);
#ifdef INET
carp_send_arp(sc);
#endif
#ifdef INET6
carp_send_na(sc);
-#endif /* INET6 */
+#endif
carp_setrun(sc, 0);
- carp_setroute(sc, RTM_ADD);
+ carp_addroute(sc);
+ break;
+ case INIT:
+ case MASTER:
+#ifdef INVARIANTS
+ panic("carp: VHID %u@%s: master_down event in %s state\n",
+ sc->sc_vhid,
+ sc->sc_carpdev->if_xname,
+ sc->sc_state ? "MASTER" : "INIT");
+#endif
break;
}
}
@@ -1407,28 +1194,16 @@ carp_setrun(struct carp_softc *sc, sa_family_t af)
{
struct timeval tv;
- if (sc->sc_carpdev == NULL) {
- SC2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING;
- carp_set_state(sc, INIT);
- return;
- } else
- CARP_SCLOCK_ASSERT(sc);
-
- if (SC2IFP(sc)->if_flags & IFF_UP &&
- sc->sc_vhid > 0 && (sc->sc_naddrs || sc->sc_naddrs6) &&
- sc->sc_carpdev->if_link_state == LINK_STATE_UP)
- SC2IFP(sc)->if_drv_flags |= IFF_DRV_RUNNING;
- else {
- SC2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING;
- carp_setroute(sc, RTM_DELETE);
+ CARP_LOCK_ASSERT(sc);
+
+ if ((sc->sc_carpdev->if_flags & IFF_UP) == 0 ||
+ sc->sc_carpdev->if_link_state != LINK_STATE_UP ||
+ (sc->sc_naddrs == 0 && sc->sc_naddrs6 == 0))
return;
- }
switch (sc->sc_state) {
case INIT:
- CARP_LOG("%s: INIT -> BACKUP\n", SC2IFP(sc)->if_xname);
- carp_set_state(sc, BACKUP);
- carp_setroute(sc, RTM_DELETE);
+ carp_set_state(sc, BACKUP, "initialization complete");
carp_setrun(sc, 0);
break;
case BACKUP:
@@ -1441,20 +1216,24 @@ carp_setrun(struct carp_softc *sc, sa_family_t af)
callout_reset(&sc->sc_md_tmo, tvtohz(&tv),
carp_master_down, sc);
break;
-#endif /* INET */
+#endif
#ifdef INET6
case AF_INET6:
callout_reset(&sc->sc_md6_tmo, tvtohz(&tv),
carp_master_down, sc);
break;
-#endif /* INET6 */
+#endif
default:
+#ifdef INET
if (sc->sc_naddrs)
callout_reset(&sc->sc_md_tmo, tvtohz(&tv),
carp_master_down, sc);
+#endif
+#ifdef INET6
if (sc->sc_naddrs6)
callout_reset(&sc->sc_md6_tmo, tvtohz(&tv),
carp_master_down, sc);
+#endif
break;
}
break;
@@ -1467,842 +1246,779 @@ carp_setrun(struct carp_softc *sc, sa_family_t af)
}
}
-#ifdef INET
-static void
-carp_multicast_cleanup(struct carp_softc *sc, int dofree)
-{
- struct ip_moptions *imo = &sc->sc_imo;
- u_int16_t n = imo->imo_num_memberships;
-
- /* Clean up our own multicast memberships */
- while (n-- > 0) {
- if (imo->imo_membership[n] != NULL) {
- if (dofree)
- in_delmulti(imo->imo_membership[n]);
- imo->imo_membership[n] = NULL;
- }
- }
- KASSERT(imo->imo_mfilters == NULL,
- ("%s: imo_mfilters != NULL", __func__));
- imo->imo_num_memberships = 0;
- imo->imo_multicast_ifp = NULL;
-}
-#endif
-
-#ifdef INET6
-static void
-carp_multicast6_cleanup(struct carp_softc *sc, int dofree)
-{
- struct ip6_moptions *im6o = &sc->sc_im6o;
- u_int16_t n = im6o->im6o_num_memberships;
-
- while (n-- > 0) {
- if (im6o->im6o_membership[n] != NULL) {
- if (dofree)
- in6_mc_leave(im6o->im6o_membership[n], NULL);
- im6o->im6o_membership[n] = NULL;
- }
- }
- KASSERT(im6o->im6o_mfilters == NULL,
- ("%s: im6o_mfilters != NULL", __func__));
- im6o->im6o_num_memberships = 0;
- im6o->im6o_multicast_ifp = NULL;
-}
-#endif
-
-#ifdef INET
+/*
+ * Setup multicast structures.
+ */
static int
-carp_set_addr(struct carp_softc *sc, struct sockaddr_in *sin)
+carp_multicast_setup(struct carp_if *cif, sa_family_t sa)
{
- struct ifnet *ifp;
- struct carp_if *cif;
- struct in_ifaddr *ia, *ia_if;
- struct ip_moptions *imo = &sc->sc_imo;
- struct in_addr addr;
- u_long iaddr = htonl(sin->sin_addr.s_addr);
- int own, error;
-
- if (sin->sin_addr.s_addr == 0) {
- if (!(SC2IFP(sc)->if_flags & IFF_UP))
- carp_set_state(sc, INIT);
- if (sc->sc_naddrs)
- SC2IFP(sc)->if_flags |= IFF_UP;
- if (sc->sc_carpdev)
- CARP_SCLOCK(sc);
- carp_setrun(sc, 0);
- if (sc->sc_carpdev)
- CARP_SCUNLOCK(sc);
- return (0);
- }
-
- /* we have to do it by hands to check we won't match on us */
- ia_if = NULL; own = 0;
- IN_IFADDR_RLOCK();
- TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
- /* and, yeah, we need a multicast-capable iface too */
- if (ia->ia_ifp != SC2IFP(sc) &&
- (ia->ia_ifp->if_flags & IFF_MULTICAST) &&
- (iaddr & ia->ia_subnetmask) == ia->ia_subnet) {
- if (!ia_if)
- ia_if = ia;
- if (sin->sin_addr.s_addr ==
- ia->ia_addr.sin_addr.s_addr)
- own++;
- }
- }
-
- if (!ia_if) {
- IN_IFADDR_RUNLOCK();
- return (EADDRNOTAVAIL);
- }
+ struct ifnet *ifp = cif->cif_ifp;
+ int error = 0;
- ia = ia_if;
- ifa_ref(&ia->ia_ifa);
- IN_IFADDR_RUNLOCK();
+ switch (sa) {
+#ifdef INET
+ case AF_INET:
+ {
+ struct ip_moptions *imo = &cif->cif_imo;
+ struct in_addr addr;
- ifp = ia->ia_ifp;
+ if (imo->imo_membership)
+ return (0);
- if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0 ||
- (imo->imo_multicast_ifp && imo->imo_multicast_ifp != ifp)) {
- ifa_free(&ia->ia_ifa);
- return (EADDRNOTAVAIL);
- }
+ imo->imo_membership = (struct in_multi **)malloc(
+ (sizeof(struct in_multi *) * IP_MIN_MEMBERSHIPS), M_CARP,
+ M_WAITOK);
+ imo->imo_mfilters = NULL;
+ imo->imo_max_memberships = IP_MIN_MEMBERSHIPS;
+ imo->imo_multicast_vif = -1;
- if (imo->imo_num_memberships == 0) {
addr.s_addr = htonl(INADDR_CARP_GROUP);
- if ((imo->imo_membership[0] = in_addmulti(&addr, ifp)) ==
- NULL) {
- ifa_free(&ia->ia_ifa);
- return (ENOBUFS);
+ if ((error = in_joingroup(ifp, &addr, NULL,
+ &imo->imo_membership[0])) != 0) {
+ free(imo->imo_membership, M_CARP);
+ break;
}
imo->imo_num_memberships++;
imo->imo_multicast_ifp = ifp;
imo->imo_multicast_ttl = CARP_DFLTTL;
imo->imo_multicast_loop = 0;
- }
+ break;
+ }
+#endif
+#ifdef INET6
+ case AF_INET6:
+ {
+ struct ip6_moptions *im6o = &cif->cif_im6o;
+ struct in6_addr in6;
+ struct in6_multi *in6m;
+
+ if (im6o->im6o_membership)
+ return (0);
- if (!ifp->if_carp) {
+ im6o->im6o_membership = (struct in6_multi **)malloc(
+ (sizeof(struct in6_multi *) * IPV6_MIN_MEMBERSHIPS), M_CARP,
+ M_ZERO | M_WAITOK);
+ im6o->im6o_mfilters = NULL;
+ im6o->im6o_max_memberships = IPV6_MIN_MEMBERSHIPS;
+ im6o->im6o_multicast_hlim = CARP_DFLTTL;
+ im6o->im6o_multicast_ifp = ifp;
- cif = malloc(sizeof(*cif), M_CARP,
- M_WAITOK|M_ZERO);
- if (!cif) {
- error = ENOBUFS;
- goto cleanup;
+ /* Join IPv6 CARP multicast group. */
+ bzero(&in6, sizeof(in6));
+ in6.s6_addr16[0] = htons(0xff02);
+ in6.s6_addr8[15] = 0x12;
+ if ((error = in6_setscope(&in6, ifp, NULL)) != 0) {
+ free(im6o->im6o_membership, M_CARP);
+ break;
}
- if ((error = ifpromisc(ifp, 1))) {
- free(cif, M_CARP);
- goto cleanup;
+ in6m = NULL;
+ if ((error = in6_mc_join(ifp, &in6, NULL, &in6m, 0)) != 0) {
+ free(im6o->im6o_membership, M_CARP);
+ break;
}
-
- CARP_LOCK_INIT(cif);
- CARP_LOCK(cif);
- cif->vhif_ifp = ifp;
- TAILQ_INIT(&cif->vhif_vrs);
- ifp->if_carp = cif;
+ im6o->im6o_membership[0] = in6m;
+ im6o->im6o_num_memberships++;
- } else {
- struct carp_softc *vr;
-
- cif = (struct carp_if *)ifp->if_carp;
- CARP_LOCK(cif);
- TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list)
- if (vr != sc && vr->sc_vhid == sc->sc_vhid) {
- CARP_UNLOCK(cif);
- error = EEXIST;
- goto cleanup;
- }
+ /* Join solicited multicast address. */
+ bzero(&in6, sizeof(in6));
+ in6.s6_addr16[0] = htons(0xff02);
+ in6.s6_addr32[1] = 0;
+ in6.s6_addr32[2] = htonl(1);
+ in6.s6_addr32[3] = 0;
+ in6.s6_addr8[12] = 0xff;
+ if ((error = in6_setscope(&in6, ifp, NULL)) != 0) {
+ in6_mc_leave(im6o->im6o_membership[0], NULL);
+ free(im6o->im6o_membership, M_CARP);
+ break;
+ }
+ in6m = NULL;
+ if ((error = in6_mc_join(ifp, &in6, NULL, &in6m, 0)) != 0) {
+ in6_mc_leave(im6o->im6o_membership[0], NULL);
+ free(im6o->im6o_membership, M_CARP);
+ break;
+ }
+ im6o->im6o_membership[1] = in6m;
+ im6o->im6o_num_memberships++;
+ break;
+ }
+#endif
}
- sc->sc_ia = ia;
- sc->sc_carpdev = ifp;
- { /* XXX prevent endless loop if already in queue */
- struct carp_softc *vr, *after = NULL;
- int myself = 0;
- cif = (struct carp_if *)ifp->if_carp;
+ return (error);
+}
- /* XXX: cif should not change, right? So we still hold the lock */
- CARP_LOCK_ASSERT(cif);
+/*
+ * Free multicast structures.
+ */
+static void
+carp_multicast_cleanup(struct carp_if *cif, sa_family_t sa)
+{
- TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) {
- if (vr == sc)
- myself = 1;
- if (vr->sc_vhid < sc->sc_vhid)
- after = vr;
- }
+ sx_assert(&carp_sx, SA_XLOCKED);
+
+ switch (sa) {
+#ifdef INET
+ case AF_INET:
+ if (cif->cif_naddrs == 0) {
+ struct ip_moptions *imo = &cif->cif_imo;
+
+ in_leavegroup(imo->imo_membership[0], NULL);
+ KASSERT(imo->imo_mfilters == NULL,
+ ("%s: imo_mfilters != NULL", __func__));
+ free(imo->imo_membership, M_CARP);
+ imo->imo_membership = NULL;
- if (!myself) {
- /* We're trying to keep things in order */
- if (after == NULL) {
- TAILQ_INSERT_TAIL(&cif->vhif_vrs, sc, sc_list);
- } else {
- TAILQ_INSERT_AFTER(&cif->vhif_vrs, after, sc, sc_list);
}
- cif->vhif_nvrs++;
- }
+ break;
+#endif
+#ifdef INET6
+ case AF_INET6:
+ if (cif->cif_naddrs6 == 0) {
+ struct ip6_moptions *im6o = &cif->cif_im6o;
+
+ in6_mc_leave(im6o->im6o_membership[0], NULL);
+ in6_mc_leave(im6o->im6o_membership[1], NULL);
+ KASSERT(im6o->im6o_mfilters == NULL,
+ ("%s: im6o_mfilters != NULL", __func__));
+ free(im6o->im6o_membership, M_CARP);
+ im6o->im6o_membership = NULL;
+ }
+ break;
+#endif
}
+}
+
+int
+carp_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *sa)
+{
+ struct m_tag *mtag;
+ struct carp_softc *sc;
- sc->sc_naddrs++;
- SC2IFP(sc)->if_flags |= IFF_UP;
- if (own)
- sc->sc_advskew = 0;
- carp_sc_state_locked(sc);
- carp_setrun(sc, 0);
+ if (!sa)
+ return (0);
- CARP_UNLOCK(cif);
- ifa_free(&ia->ia_ifa); /* XXXRW: should hold reference for softc. */
+ switch (sa->sa_family) {
+#ifdef INET
+ case AF_INET:
+ break;
+#endif
+#ifdef INET6
+ case AF_INET6:
+ break;
+#endif
+ default:
+ return (0);
+ }
- return (0);
+ mtag = m_tag_find(m, PACKET_TAG_CARP, NULL);
+ if (mtag == NULL)
+ return (0);
-cleanup:
- in_delmulti(imo->imo_membership[--imo->imo_num_memberships]);
- ifa_free(&ia->ia_ifa);
- return (error);
-}
+ bcopy(mtag + 1, &sc, sizeof(sc));
-static int
-carp_del_addr(struct carp_softc *sc, struct sockaddr_in *sin)
-{
- int error = 0;
+ /* Set the source MAC address to the Virtual Router MAC Address. */
+ switch (ifp->if_type) {
+ case IFT_ETHER:
+ case IFT_BRIDGE:
+ case IFT_L2VLAN: {
+ struct ether_header *eh;
- if (!--sc->sc_naddrs) {
- struct carp_if *cif = (struct carp_if *)sc->sc_carpdev->if_carp;
- struct ip_moptions *imo = &sc->sc_imo;
+ eh = mtod(m, struct ether_header *);
+ eh->ether_shost[0] = 0;
+ eh->ether_shost[1] = 0;
+ eh->ether_shost[2] = 0x5e;
+ eh->ether_shost[3] = 0;
+ eh->ether_shost[4] = 1;
+ eh->ether_shost[5] = sc->sc_vhid;
+ }
+ break;
+ case IFT_FDDI: {
+ struct fddi_header *fh;
- CARP_LOCK(cif);
- callout_stop(&sc->sc_ad_tmo);
- SC2IFP(sc)->if_flags &= ~IFF_UP;
- SC2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING;
- sc->sc_vhid = -1;
- in_delmulti(imo->imo_membership[--imo->imo_num_memberships]);
- imo->imo_multicast_ifp = NULL;
- TAILQ_REMOVE(&cif->vhif_vrs, sc, sc_list);
- if (!--cif->vhif_nvrs) {
- sc->sc_carpdev->if_carp = NULL;
- CARP_LOCK_DESTROY(cif);
- free(cif, M_CARP);
- } else {
- CARP_UNLOCK(cif);
+ fh = mtod(m, struct fddi_header *);
+ fh->fddi_shost[0] = 0;
+ fh->fddi_shost[1] = 0;
+ fh->fddi_shost[2] = 0x5e;
+ fh->fddi_shost[3] = 0;
+ fh->fddi_shost[4] = 1;
+ fh->fddi_shost[5] = sc->sc_vhid;
+ }
+ break;
+ case IFT_ISO88025: {
+ struct iso88025_header *th;
+ th = mtod(m, struct iso88025_header *);
+ th->iso88025_shost[0] = 3;
+ th->iso88025_shost[1] = 0;
+ th->iso88025_shost[2] = 0x40 >> (sc->sc_vhid - 1);
+ th->iso88025_shost[3] = 0x40000 >> (sc->sc_vhid - 1);
+ th->iso88025_shost[4] = 0;
+ th->iso88025_shost[5] = 0;
}
+ break;
+ default:
+ printf("%s: carp is not supported for the %d interface type\n",
+ ifp->if_xname, ifp->if_type);
+ return (EOPNOTSUPP);
}
- return (error);
+ return (0);
}
-#endif
-#ifdef INET6
-static int
-carp_set_addr6(struct carp_softc *sc, struct sockaddr_in6 *sin6)
+static struct carp_softc*
+carp_alloc(struct ifnet *ifp) /* Allocate a softc for ifp, give it default settings, link it into the per-interface and global lists, and return it. */
{
- struct ifnet *ifp;
+ struct carp_softc *sc;
struct carp_if *cif;
- struct in6_ifaddr *ia, *ia_if;
- struct ip6_moptions *im6o = &sc->sc_im6o;
- struct in6_addr in6;
- int own, error;
-
- error = 0;
-
- if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
- if (!(SC2IFP(sc)->if_flags & IFF_UP))
- carp_set_state(sc, INIT);
- if (sc->sc_naddrs6)
- SC2IFP(sc)->if_flags |= IFF_UP;
- if (sc->sc_carpdev)
- CARP_SCLOCK(sc);
- carp_setrun(sc, 0);
- if (sc->sc_carpdev)
- CARP_SCUNLOCK(sc);
- return (0);
- }
- /* we have to do it by hands to check we won't match on us */
- ia_if = NULL; own = 0;
- IN6_IFADDR_RLOCK();
- TAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) {
- int i;
-
- for (i = 0; i < 4; i++) {
- if ((sin6->sin6_addr.s6_addr32[i] &
- ia->ia_prefixmask.sin6_addr.s6_addr32[i]) !=
- (ia->ia_addr.sin6_addr.s6_addr32[i] &
- ia->ia_prefixmask.sin6_addr.s6_addr32[i]))
- break;
- }
- /* and, yeah, we need a multicast-capable iface too */
- if (ia->ia_ifp != SC2IFP(sc) &&
- (ia->ia_ifp->if_flags & IFF_MULTICAST) &&
- (i == 4)) {
- if (!ia_if)
- ia_if = ia;
- if (IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr,
- &ia->ia_addr.sin6_addr))
- own++;
- }
- }
+ if ((cif = ifp->if_carp) == NULL)
+ cif = carp_alloc_if(ifp); /* no carp_if on this interface yet: create one */
- if (!ia_if) {
- IN6_IFADDR_RUNLOCK();
- return (EADDRNOTAVAIL);
- }
- ia = ia_if;
- ifa_ref(&ia->ia_ifa);
- IN6_IFADDR_RUNLOCK();
- ifp = ia->ia_ifp;
-
- if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0 ||
- (im6o->im6o_multicast_ifp && im6o->im6o_multicast_ifp != ifp)) {
- ifa_free(&ia->ia_ifa);
- return (EADDRNOTAVAIL);
- }
+ sc = malloc(sizeof(*sc), M_CARP, M_WAITOK|M_ZERO); /* zero-filled, so unset fields start at 0/NULL */
- if (!sc->sc_naddrs6) {
- struct in6_multi *in6m;
+ sc->sc_advbase = CARP_DFLTINTV;
+ sc->sc_vhid = -1; /* required setting */
+ sc->sc_init_counter = 1;
+ sc->sc_state = INIT;
- im6o->im6o_multicast_ifp = ifp;
+ sc->sc_ifasiz = sizeof(struct ifaddr *); /* sc_ifasiz is a byte count: room for one pointer */
+ sc->sc_ifas = malloc(sc->sc_ifasiz, M_CARP, M_WAITOK|M_ZERO);
+ sc->sc_carpdev = ifp;
- /* join CARP multicast address */
- bzero(&in6, sizeof(in6));
- in6.s6_addr16[0] = htons(0xff02);
- in6.s6_addr8[15] = 0x12;
- if (in6_setscope(&in6, ifp, NULL) != 0)
- goto cleanup;
- in6m = NULL;
- error = in6_mc_join(ifp, &in6, NULL, &in6m, 0);
- if (error)
- goto cleanup;
- im6o->im6o_membership[0] = in6m;
- im6o->im6o_num_memberships++;
+ CARP_LOCK_INIT(sc);
+#ifdef INET
+ callout_init_mtx(&sc->sc_md_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED);
+#endif
+#ifdef INET6
+ callout_init_mtx(&sc->sc_md6_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED);
+#endif
+ callout_init_mtx(&sc->sc_ad_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED); /* all three callouts are tied to the softc mutex */
- /* join solicited multicast address */
- bzero(&in6, sizeof(in6));
- in6.s6_addr16[0] = htons(0xff02);
- in6.s6_addr32[1] = 0;
- in6.s6_addr32[2] = htonl(1);
- in6.s6_addr32[3] = sin6->sin6_addr.s6_addr32[3];
- in6.s6_addr8[12] = 0xff;
- if (in6_setscope(&in6, ifp, NULL) != 0)
- goto cleanup;
- in6m = NULL;
- error = in6_mc_join(ifp, &in6, NULL, &in6m, 0);
- if (error)
- goto cleanup;
- im6o->im6o_membership[1] = in6m;
- im6o->im6o_num_memberships++;
- }
+ CIF_LOCK(cif);
+ TAILQ_INSERT_TAIL(&cif->cif_vrs, sc, sc_list); /* per-interface list of softcs */
+ CIF_UNLOCK(cif);
- if (!ifp->if_carp) {
- cif = malloc(sizeof(*cif), M_CARP,
- M_WAITOK|M_ZERO);
- if (!cif) {
- error = ENOBUFS;
- goto cleanup;
- }
- if ((error = ifpromisc(ifp, 1))) {
- free(cif, M_CARP);
- goto cleanup;
- }
+ mtx_lock(&carp_mtx);
+ LIST_INSERT_HEAD(&carp_list, sc, sc_next); /* global list of all softcs, guarded by carp_mtx */
+ mtx_unlock(&carp_mtx);
- CARP_LOCK_INIT(cif);
- CARP_LOCK(cif);
- cif->vhif_ifp = ifp;
- TAILQ_INIT(&cif->vhif_vrs);
- ifp->if_carp = cif;
+ return (sc);
+}
- } else {
- struct carp_softc *vr;
+static void
+carp_grow_ifas(struct carp_softc *sc) /* Replace sc->sc_ifas with a zero-filled array of twice the size, keeping the existing entries. */
+{
+ struct ifaddr **new;
+
+ new = malloc(sc->sc_ifasiz * 2, M_CARP, M_WAITOK | M_ZERO); /* allocated before the softc lock is taken */
+ CARP_LOCK(sc);
+ bcopy(sc->sc_ifas, new, sc->sc_ifasiz); /* sc_ifasiz is the old size in bytes */
+ free(sc->sc_ifas, M_CARP);
+ sc->sc_ifas = new;
+ sc->sc_ifasiz *= 2;
+ CARP_UNLOCK(sc);
+}
- cif = (struct carp_if *)ifp->if_carp;
- CARP_LOCK(cif);
- TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list)
- if (vr != sc && vr->sc_vhid == sc->sc_vhid) {
- CARP_UNLOCK(cif);
- error = EINVAL;
- goto cleanup;
- }
- }
- sc->sc_ia6 = ia;
- sc->sc_carpdev = ifp;
+static void
+carp_destroy(struct carp_softc *sc) /* Unlink sc from both lists, drain its callouts and free it; the softc lock is released (not acquired) here and carp_sx must be held exclusively. */
+{
+ struct ifnet *ifp = sc->sc_carpdev;
+ struct carp_if *cif = ifp->if_carp;
- { /* XXX prevent endless loop if already in queue */
- struct carp_softc *vr, *after = NULL;
- int myself = 0;
- cif = (struct carp_if *)ifp->if_carp;
- CARP_LOCK_ASSERT(cif);
-
- TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) {
- if (vr == sc)
- myself = 1;
- if (vr->sc_vhid < sc->sc_vhid)
- after = vr;
- }
+ sx_assert(&carp_sx, SA_XLOCKED);
- if (!myself) {
- /* We're trying to keep things in order */
- if (after == NULL) {
- TAILQ_INSERT_TAIL(&cif->vhif_vrs, sc, sc_list);
- } else {
- TAILQ_INSERT_AFTER(&cif->vhif_vrs, after, sc, sc_list);
- }
- cif->vhif_nvrs++;
- }
- }
+ if (sc->sc_suppress)
+ carp_demote_adj(-V_carp_ifdown_adj, "vhid removed"); /* take back the demotion this softc had applied */
+ CARP_UNLOCK(sc);
- sc->sc_naddrs6++;
- SC2IFP(sc)->if_flags |= IFF_UP;
- if (own)
- sc->sc_advskew = 0;
- carp_sc_state_locked(sc);
- carp_setrun(sc, 0);
+ CIF_LOCK(cif);
+ TAILQ_REMOVE(&cif->cif_vrs, sc, sc_list);
+ CIF_UNLOCK(cif);
- CARP_UNLOCK(cif);
- ifa_free(&ia->ia_ifa); /* XXXRW: should hold reference for softc. */
+ mtx_lock(&carp_mtx);
+ LIST_REMOVE(sc, sc_next);
+ mtx_unlock(&carp_mtx);
- return (0);
+ callout_drain(&sc->sc_ad_tmo); /* callouts are drained after the softc lock was dropped above */
+#ifdef INET
+ callout_drain(&sc->sc_md_tmo);
+#endif
+#ifdef INET6
+ callout_drain(&sc->sc_md6_tmo);
+#endif
+ CARP_LOCK_DESTROY(sc);
-cleanup:
- if (!sc->sc_naddrs6)
- carp_multicast6_cleanup(sc, 1);
- ifa_free(&ia->ia_ifa);
- return (error);
+ free(sc->sc_ifas, M_CARP);
+ free(sc, M_CARP);
}
-static int
-carp_del_addr6(struct carp_softc *sc, struct sockaddr_in6 *sin6)
+static struct carp_if*
+carp_alloc_if(struct ifnet *ifp) /* Allocate the per-interface carp_if, attach it to ifp->if_carp, take a reference on ifp, and return it. */
{
- int error = 0;
+ struct carp_if *cif;
+ int error;
- if (!--sc->sc_naddrs6) {
- struct carp_if *cif = (struct carp_if *)sc->sc_carpdev->if_carp;
+ cif = malloc(sizeof(*cif), M_CARP, M_WAITOK|M_ZERO);
- CARP_LOCK(cif);
- callout_stop(&sc->sc_ad_tmo);
- SC2IFP(sc)->if_flags &= ~IFF_UP;
- SC2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING;
- sc->sc_vhid = -1;
- carp_multicast6_cleanup(sc, 1);
- TAILQ_REMOVE(&cif->vhif_vrs, sc, sc_list);
- if (!--cif->vhif_nvrs) {
- CARP_LOCK_DESTROY(cif);
- sc->sc_carpdev->if_carp = NULL;
- free(cif, M_CARP);
- } else
- CARP_UNLOCK(cif);
- }
+ if ((error = ifpromisc(ifp, 1)) != 0) /* a promiscuous-mode failure is only logged, not returned */
+ printf("%s: ifpromisc(%s) failed: %d\n",
+ __func__, ifp->if_xname, error);
+ else
+ cif->cif_flags |= CIF_PROMISC; /* remembered so carp_free_if() undoes it only if it was set */
- return (error);
+ CIF_LOCK_INIT(cif);
+ cif->cif_ifp = ifp;
+ TAILQ_INIT(&cif->cif_vrs);
+
+ IF_ADDR_WLOCK(ifp);
+ ifp->if_carp = cif;
+ if_ref(ifp); /* released by if_rele() in carp_free_if() */
+ IF_ADDR_WUNLOCK(ifp);
+
+ return (cif);
}
-#endif /* INET6 */
-static int
-carp_ioctl(struct ifnet *ifp, u_long cmd, caddr_t addr)
+static void
+carp_free_if(struct carp_if *cif) /* Detach cif from its interface and free it; the CIF lock must be held and the softc list must be empty. */
+{
+ struct ifnet *ifp = cif->cif_ifp;
+
+ CIF_LOCK_ASSERT(cif);
+ KASSERT(TAILQ_EMPTY(&cif->cif_vrs), ("%s: softc list not empty",
+ __func__));
+
+ IF_ADDR_WLOCK(ifp);
+ ifp->if_carp = NULL;
+ IF_ADDR_WUNLOCK(ifp);
+
+ CIF_LOCK_DESTROY(cif); /* the lock asserted above is destroyed here rather than unlocked */
+
+ if (cif->cif_flags & CIF_PROMISC)
+ ifpromisc(ifp, 0); /* only undo promiscuous mode if carp_alloc_if() managed to enable it */
+ if_rele(ifp); /* pairs with if_ref() in carp_alloc_if() */
+
+ free(cif, M_CARP);
+}
+
+static void
+carp_carprcp(struct carpreq *carpr, struct carp_softc *sc, int priv) /* Copy state, vhid, advbase and advskew of sc into carpr under the softc lock; carpr_count is not touched. */
+{
+
+ CARP_LOCK(sc);
+ carpr->carpr_state = sc->sc_state;
+ carpr->carpr_vhid = sc->sc_vhid;
+ carpr->carpr_advbase = sc->sc_advbase;
+ carpr->carpr_advskew = sc->sc_advskew;
+ if (priv)
+ bcopy(sc->sc_key, carpr->carpr_key, sizeof(carpr->carpr_key));
+ else
+ bzero(carpr->carpr_key, sizeof(carpr->carpr_key)); /* callers with priv == 0 get a zeroed key instead of the real one */
+ CARP_UNLOCK(sc);
+}
+
+/*
+ * Handle the SIOCSVH (configure a vhid) and SIOCGVH (report one or all
+ * vhids) requests for the interface named in ifr.
+ *
+ * The struct carpreq is copied in from ifr->ifr_data first.  The interface
+ * is looked up with ifunit_ref() and released with if_rele() on every
+ * path that gets past the lookup.  Only the interface types listed in the
+ * switch below, with IFF_MULTICAST set, are accepted.
+ *
+ * Returns 0 on success or an errno value: the copyin()/copyout() error,
+ * ENXIO, EOPNOTSUPP, EADDRNOTAVAIL, EINVAL, EMSGSIZE, ENOENT or the
+ * priv_check() error.
+ */
+int
+carp_ioctl(struct ifreq *ifr, u_long cmd, struct thread *td)
{
- struct carp_softc *sc = ifp->if_softc, *vr;
struct carpreq carpr;
- struct ifaddr *ifa;
- struct ifreq *ifr;
- struct ifaliasreq *ifra;
- int locked = 0, error = 0;
+ struct ifnet *ifp;
+ struct carp_softc *sc = NULL;
+ int error = 0, locked = 0;
- ifa = (struct ifaddr *)addr;
- ifra = (struct ifaliasreq *)addr;
- ifr = (struct ifreq *)addr;
+ if ((error = copyin(ifr->ifr_data, &carpr, sizeof carpr)))
+ return (error);
- switch (cmd) {
- case SIOCSIFADDR:
- switch (ifa->ifa_addr->sa_family) {
-#ifdef INET
- case AF_INET:
- SC2IFP(sc)->if_flags |= IFF_UP;
- bcopy(ifa->ifa_addr, ifa->ifa_dstaddr,
- sizeof(struct sockaddr));
- error = carp_set_addr(sc, satosin(ifa->ifa_addr));
- break;
-#endif /* INET */
-#ifdef INET6
- case AF_INET6:
- SC2IFP(sc)->if_flags |= IFF_UP;
- error = carp_set_addr6(sc, satosin6(ifa->ifa_addr));
- break;
-#endif /* INET6 */
- default:
- error = EAFNOSUPPORT;
- break;
- }
- break;
+ ifp = ifunit_ref(ifr->ifr_name);
+ if (ifp == NULL)
+ return (ENXIO);
- case SIOCAIFADDR:
- switch (ifa->ifa_addr->sa_family) {
-#ifdef INET
- case AF_INET:
- SC2IFP(sc)->if_flags |= IFF_UP;
- bcopy(ifa->ifa_addr, ifa->ifa_dstaddr,
- sizeof(struct sockaddr));
- error = carp_set_addr(sc, satosin(&ifra->ifra_addr));
- break;
-#endif /* INET */
-#ifdef INET6
- case AF_INET6:
- SC2IFP(sc)->if_flags |= IFF_UP;
- error = carp_set_addr6(sc, satosin6(&ifra->ifra_addr));
- break;
-#endif /* INET6 */
- default:
- error = EAFNOSUPPORT;
- break;
- }
+ switch (ifp->if_type) {
+ case IFT_ETHER:
+ case IFT_L2VLAN:
+ case IFT_BRIDGE:
+ case IFT_FDDI:
+ case IFT_ISO88025:
break;
+ default:
+ error = EOPNOTSUPP;
+ goto out;
+ }
- case SIOCDIFADDR:
- switch (ifa->ifa_addr->sa_family) {
-#ifdef INET
- case AF_INET:
- error = carp_del_addr(sc, satosin(&ifra->ifra_addr));
- break;
-#endif /* INET */
-#ifdef INET6
- case AF_INET6:
- error = carp_del_addr6(sc, satosin6(&ifra->ifra_addr));
+ if ((ifp->if_flags & IFF_MULTICAST) == 0) {
+ error = EADDRNOTAVAIL;
+ goto out;
+ }
+
+ sx_xlock(&carp_sx);
+ switch (cmd) {
+ case SIOCSVH:
+ if ((error = priv_check(td, PRIV_NETINET_CARP)))
break;
-#endif /* INET6 */
- default:
- error = EAFNOSUPPORT;
+ if (carpr.carpr_vhid <= 0 || carpr.carpr_vhid > CARP_MAXVHID ||
+ carpr.carpr_advbase < 0 || carpr.carpr_advskew < 0) {
+ error = EINVAL;
break;
}
- break;
- case SIOCSIFFLAGS:
- if (sc->sc_carpdev) {
- locked = 1;
- CARP_SCLOCK(sc);
+ /* Look for an existing softc with this vhid; sc stays NULL if none. */
+ if (ifp->if_carp) {
+ CIF_LOCK(ifp->if_carp);
+ IFNET_FOREACH_CARP(ifp, sc)
+ if (sc->sc_vhid == carpr.carpr_vhid)
+ break;
+ CIF_UNLOCK(ifp->if_carp);
}
- if (sc->sc_state != INIT && !(ifr->ifr_flags & IFF_UP)) {
- callout_stop(&sc->sc_ad_tmo);
- callout_stop(&sc->sc_md_tmo);
- callout_stop(&sc->sc_md6_tmo);
- if (sc->sc_state == MASTER)
- carp_send_ad_locked(sc);
- carp_set_state(sc, INIT);
- carp_setrun(sc, 0);
- } else if (sc->sc_state == INIT && (ifr->ifr_flags & IFF_UP)) {
- SC2IFP(sc)->if_flags |= IFF_UP;
- carp_setrun(sc, 0);
+ if (sc == NULL) {
+ /* New vhid: allocate it and derive the 00:00:5e:00:01:<vhid> address. */
+ sc = carp_alloc(ifp);
+ CARP_LOCK(sc);
+ sc->sc_vhid = carpr.carpr_vhid;
+ LLADDR(&sc->sc_addr)[0] = 0;
+ LLADDR(&sc->sc_addr)[1] = 0;
+ LLADDR(&sc->sc_addr)[2] = 0x5e;
+ LLADDR(&sc->sc_addr)[3] = 0;
+ LLADDR(&sc->sc_addr)[4] = 1;
+ LLADDR(&sc->sc_addr)[5] = sc->sc_vhid;
+ } else
+ CARP_LOCK(sc);
+ /* From here on the softc lock is dropped at "out". */
+ locked = 1;
+ if (carpr.carpr_advbase > 0) {
+ if (carpr.carpr_advbase > 255 ||
+ carpr.carpr_advbase < CARP_DFLTINTV) {
+ error = EINVAL;
+ break;
+ }
+ sc->sc_advbase = carpr.carpr_advbase;
}
- break;
-
- case SIOCSVH:
- error = priv_check(curthread, PRIV_NETINET_CARP);
- if (error)
- break;
- if ((error = copyin(ifr->ifr_data, &carpr, sizeof carpr)))
+ if (carpr.carpr_advskew >= 255) {
+ error = EINVAL;
break;
- error = 1;
- if (sc->sc_carpdev) {
- locked = 1;
- CARP_SCLOCK(sc);
}
- if (sc->sc_state != INIT && carpr.carpr_state != sc->sc_state) {
+ sc->sc_advskew = carpr.carpr_advskew;
+ /* A key whose first byte is NUL leaves the stored key unchanged. */
+ if (carpr.carpr_key[0] != '\0') {
+ bcopy(carpr.carpr_key, sc->sc_key, sizeof(sc->sc_key));
+ carp_hmac_prepare(sc);
+ }
+ if (sc->sc_state != INIT &&
+ carpr.carpr_state != sc->sc_state) {
switch (carpr.carpr_state) {
case BACKUP:
callout_stop(&sc->sc_ad_tmo);
- carp_set_state(sc, BACKUP);
+ carp_set_state(sc, BACKUP,
+ "user requested via ifconfig");
carp_setrun(sc, 0);
- carp_setroute(sc, RTM_DELETE);
+ carp_delroute(sc);
break;
case MASTER:
- carp_master_down_locked(sc);
+ carp_master_down_locked(sc,
+ "user requested via ifconfig");
break;
default:
break;
}
}
- if (carpr.carpr_vhid > 0) {
- if (carpr.carpr_vhid > 255) {
- error = EINVAL;
- break;
- }
- if (sc->sc_carpdev) {
- struct carp_if *cif;
- cif = (struct carp_if *)sc->sc_carpdev->if_carp;
- TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list)
- if (vr != sc &&
- vr->sc_vhid == carpr.carpr_vhid) {
- error = EEXIST;
- break;
- }
- if (error == EEXIST)
- break;
- }
- sc->sc_vhid = carpr.carpr_vhid;
- IF_LLADDR(sc->sc_ifp)[0] = 0;
- IF_LLADDR(sc->sc_ifp)[1] = 0;
- IF_LLADDR(sc->sc_ifp)[2] = 0x5e;
- IF_LLADDR(sc->sc_ifp)[3] = 0;
- IF_LLADDR(sc->sc_ifp)[4] = 1;
- IF_LLADDR(sc->sc_ifp)[5] = sc->sc_vhid;
- error--;
+ break;
+
+ case SIOCGVH:
+ {
+ int priveleged;
+
+ if (carpr.carpr_vhid < 0 || carpr.carpr_vhid > CARP_MAXVHID) {
+ error = EINVAL;
+ break;
}
- if (carpr.carpr_advbase > 0 || carpr.carpr_advskew > 0) {
- if (carpr.carpr_advskew >= 255) {
- error = EINVAL;
+ if (carpr.carpr_count < 1) {
+ error = EMSGSIZE;
+ break;
+ }
+ if (ifp->if_carp == NULL) {
+ error = ENOENT;
+ break;
+ }
+
+ /* Unprivileged callers still get the settings, but not the key. */
+ priveleged = (priv_check(td, PRIV_NETINET_CARP) == 0);
+ if (carpr.carpr_vhid != 0) {
+ /* A non-zero vhid selects exactly one softc. */
+ CIF_LOCK(ifp->if_carp);
+ IFNET_FOREACH_CARP(ifp, sc)
+ if (sc->sc_vhid == carpr.carpr_vhid)
+ break;
+ CIF_UNLOCK(ifp->if_carp);
+ if (sc == NULL) {
+ error = ENOENT;
break;
}
- if (carpr.carpr_advbase > 255) {
- error = EINVAL;
+ carp_carprcp(&carpr, sc, priveleged);
+ error = copyout(&carpr, ifr->ifr_data, sizeof(carpr));
+ } else {
+ /* vhid 0: report every softc into an array of carpr_count slots. */
+ int i, count;
+
+ count = 0;
+ CIF_LOCK(ifp->if_carp);
+ IFNET_FOREACH_CARP(ifp, sc)
+ count++;
+
+ if (count > carpr.carpr_count) {
+ CIF_UNLOCK(ifp->if_carp);
+ error = EMSGSIZE;
break;
}
- sc->sc_advbase = carpr.carpr_advbase;
- sc->sc_advskew = carpr.carpr_advskew;
- error--;
- }
- bcopy(carpr.carpr_key, sc->sc_key, sizeof(sc->sc_key));
- if (error > 0)
- error = EINVAL;
- else {
- error = 0;
- carp_setrun(sc, 0);
- }
- break;
- case SIOCGVH:
- /* XXX: lockless read */
- bzero(&carpr, sizeof(carpr));
- carpr.carpr_state = sc->sc_state;
- carpr.carpr_vhid = sc->sc_vhid;
- carpr.carpr_advbase = sc->sc_advbase;
- carpr.carpr_advskew = sc->sc_advskew;
- error = priv_check(curthread, PRIV_NETINET_CARP);
- if (error == 0)
- bcopy(sc->sc_key, carpr.carpr_key,
- sizeof(carpr.carpr_key));
- error = copyout(&carpr, ifr->ifr_data, sizeof(carpr));
+ /*
+ * NOTE(review): copyout() runs with the CIF lock held;
+ * confirm that is acceptable for this lock type.
+ */
+ i = 0;
+ IFNET_FOREACH_CARP(ifp, sc) {
+ carp_carprcp(&carpr, sc, priveleged);
+ carpr.carpr_count = count;
+ error = copyout(&carpr, ifr->ifr_data +
+ (i * sizeof(carpr)), sizeof(carpr));
+ /*
+ * Only leave the loop on failure.  The CIF lock
+ * is released once, right after the loop;
+ * unlocking here too would release it twice.
+ */
+ if (error)
+ break;
+ i++;
+ }
+ CIF_UNLOCK(ifp->if_carp);
+ }
break;
-
+ }
default:
error = EINVAL;
}
+ sx_xunlock(&carp_sx);
+out:
if (locked)
- CARP_SCUNLOCK(sc);
-
- carp_hmac_prepare(sc);
+ CARP_UNLOCK(sc);
+ if_rele(ifp);
return (error);
}
-/*
- * XXX: this is looutput. We should eventually use it from there.
- */
static int
-carp_looutput(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst,
- struct route *ro)
+carp_get_vhid(struct ifaddr *ifa) /* Return the vhid of the softc attached to ifa, or 0 when ifa is NULL or has no softc. */
{
- u_int32_t af;
- struct rtentry *rt = NULL;
-
- M_ASSERTPKTHDR(m); /* check if we have the packet header */
-
- if (ro != NULL)
- rt = ro->ro_rt;
- if (rt && rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
- m_freem(m);
- return (rt->rt_flags & RTF_BLACKHOLE ? 0 :
- rt->rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH);
- }
- ifp->if_opackets++;
- ifp->if_obytes += m->m_pkthdr.len;
-
- /* BPF writes need to be handled specially. */
- if (dst->sa_family == AF_UNSPEC) {
- bcopy(dst->sa_data, &af, sizeof(af));
- dst->sa_family = af;
- }
-
-#if 1 /* XXX */
- switch (dst->sa_family) {
- case AF_INET:
- case AF_INET6:
- case AF_IPX:
- case AF_APPLETALK:
- break;
- default:
- printf("carp_looutput: af=%d unexpected\n", dst->sa_family);
- m_freem(m);
- return (EAFNOSUPPORT);
- }
-#endif
- return(if_simloop(ifp, m, dst->sa_family, 0));
-}
+ if (ifa == NULL || ifa->ifa_carp == NULL)
+ return (0); /* 0 doubles as "not a CARP address" */
-/*
- * Start output on carp interface. This function should never be called.
- */
-static void
-carp_start(struct ifnet *ifp)
-{
-#ifdef DEBUG
- printf("%s: start called\n", ifp->if_xname);
-#endif
+ return (ifa->ifa_carp->sc_vhid);
}
int
-carp_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *sa,
- struct rtentry *rt)
+/*
+ * Attach the address ifa to the softc with the given vhid on ifa's
+ * interface.
+ *
+ * Returns 0 on success, EPROTOTYPE for an unsupported address family,
+ * ENOPROTOOPT when the interface has no carp_if, ENOENT when no softc
+ * has this vhid, or the error from carp_multicast_setup().
+ */
+carp_attach(struct ifaddr *ifa, int vhid)
{
- struct m_tag *mtag;
+ struct ifnet *ifp = ifa->ifa_ifp;
+ struct carp_if *cif;
struct carp_softc *sc;
- struct ifnet *carp_ifp;
+ int index, error;
- if (!sa)
- return (0);
+ KASSERT(ifa->ifa_carp == NULL, ("%s: ifa %p attached", __func__, ifa));
- switch (sa->sa_family) {
+ switch (ifa->ifa_addr->sa_family) {
#ifdef INET
case AF_INET:
- break;
-#endif /* INET */
+#endif
#ifdef INET6
case AF_INET6:
+#endif
break;
-#endif /* INET6 */
default:
- return (0);
+ return (EPROTOTYPE);
}
- mtag = m_tag_find(m, PACKET_TAG_CARP, NULL);
- if (mtag == NULL)
- return (0);
+ sx_xlock(&carp_sx);
+ if (ifp->if_carp == NULL) {
+ sx_xunlock(&carp_sx);
+ return (ENOPROTOOPT);
+ }
+ /*
+ * Read if_carp only now that carp_sx is held.  A copy taken before
+ * the lock could be a stale NULL or point at a carp_if that was
+ * freed in the meantime.
+ */
+ cif = ifp->if_carp;
- bcopy(mtag + 1, &carp_ifp, sizeof(struct ifnet *));
- sc = carp_ifp->if_softc;
+ CIF_LOCK(cif);
+ IFNET_FOREACH_CARP(ifp, sc)
+ if (sc->sc_vhid == vhid)
+ break;
+ CIF_UNLOCK(cif);
+ if (sc == NULL) {
+ sx_xunlock(&carp_sx);
+ return (ENOENT);
+ }
- /* Set the source MAC address to Virtual Router MAC Address */
- switch (ifp->if_type) {
- case IFT_ETHER:
- case IFT_L2VLAN: {
- struct ether_header *eh;
+ error = carp_multicast_setup(cif, ifa->ifa_addr->sa_family);
+ if (error) {
+ CIF_FREE(cif);
+ sx_xunlock(&carp_sx);
+ return (error);
+ }
- eh = mtod(m, struct ether_header *);
- eh->ether_shost[0] = 0;
- eh->ether_shost[1] = 0;
- eh->ether_shost[2] = 0x5e;
- eh->ether_shost[3] = 0;
- eh->ether_shost[4] = 1;
- eh->ether_shost[5] = sc->sc_vhid;
- }
- break;
- case IFT_FDDI: {
- struct fddi_header *fh;
+ /* index is the 1-based slot the new address will occupy in sc_ifas. */
+ index = sc->sc_naddrs + sc->sc_naddrs6 + 1;
+ if (index > sc->sc_ifasiz / sizeof(struct ifaddr *))
+ carp_grow_ifas(sc);
- fh = mtod(m, struct fddi_header *);
- fh->fddi_shost[0] = 0;
- fh->fddi_shost[1] = 0;
- fh->fddi_shost[2] = 0x5e;
- fh->fddi_shost[3] = 0;
- fh->fddi_shost[4] = 1;
- fh->fddi_shost[5] = sc->sc_vhid;
- }
+ switch (ifa->ifa_addr->sa_family) {
+#ifdef INET
+ case AF_INET:
+ cif->cif_naddrs++;
+ sc->sc_naddrs++;
break;
- case IFT_ISO88025: {
- struct iso88025_header *th;
- th = mtod(m, struct iso88025_header *);
- th->iso88025_shost[0] = 3;
- th->iso88025_shost[1] = 0;
- th->iso88025_shost[2] = 0x40 >> (sc->sc_vhid - 1);
- th->iso88025_shost[3] = 0x40000 >> (sc->sc_vhid - 1);
- th->iso88025_shost[4] = 0;
- th->iso88025_shost[5] = 0;
- }
+#endif
+#ifdef INET6
+ case AF_INET6:
+ cif->cif_naddrs6++;
+ sc->sc_naddrs6++;
break;
- default:
- printf("%s: carp is not supported for this interface type\n",
- ifp->if_xname);
- return (EOPNOTSUPP);
+#endif
}
+ /* The softc keeps a reference on ifa; carp_detach() drops it. */
+ ifa_ref(ifa);
+
+ CARP_LOCK(sc);
+ sc->sc_ifas[index - 1] = ifa;
+ ifa->ifa_carp = sc;
+ carp_hmac_prepare(sc);
+ carp_sc_state(sc);
+ CARP_UNLOCK(sc);
+
+ sx_xunlock(&carp_sx);
+
return (0);
}
-static void
-carp_set_state(struct carp_softc *sc, int state)
+void
+carp_detach(struct ifaddr *ifa) /* Detach ifa from its softc; the softc is destroyed when this was its last address. */
{
- int link_state;
+ struct ifnet *ifp = ifa->ifa_ifp;
+ struct carp_if *cif = ifp->if_carp;
+ struct carp_softc *sc = ifa->ifa_carp;
+ int i, index;
- if (sc->sc_carpdev)
- CARP_SCLOCK_ASSERT(sc);
+ KASSERT(sc != NULL, ("%s: %p not attached", __func__, ifa));
- if (sc->sc_state == state)
- return;
+ sx_xlock(&carp_sx);
- sc->sc_state = state;
- switch (state) {
- case BACKUP:
- link_state = LINK_STATE_DOWN;
- break;
- case MASTER:
- link_state = LINK_STATE_UP;
+ CARP_LOCK(sc);
+ /* Shift array. */
+ index = sc->sc_naddrs + sc->sc_naddrs6; /* number of entries currently in sc_ifas */
+ for (i = 0; i < index; i++)
+ if (sc->sc_ifas[i] == ifa)
+ break;
+ KASSERT(i < index, ("%s: %p no backref", __func__, ifa));
+ for (; i < index - 1; i++)
+ sc->sc_ifas[i] = sc->sc_ifas[i+1]; /* close the gap left by ifa */
+ sc->sc_ifas[index - 1] = NULL;
+
+ switch (ifa->ifa_addr->sa_family) {
+#ifdef INET
+ case AF_INET:
+ cif->cif_naddrs--;
+ sc->sc_naddrs--;
break;
- default:
- link_state = LINK_STATE_UNKNOWN;
+#endif
+#ifdef INET6
+ case AF_INET6:
+ cif->cif_naddrs6--;
+ sc->sc_naddrs6--;
break;
+#endif
}
- if_link_state_change(SC2IFP(sc), link_state);
+
+ carp_ifa_delroute(ifa);
+ carp_multicast_cleanup(cif, ifa->ifa_addr->sa_family);
+
+ ifa->ifa_carp = NULL;
+ ifa_free(ifa); /* drops the reference taken by ifa_ref() in carp_attach() */
+
+ carp_hmac_prepare(sc);
+ carp_sc_state(sc);
+
+ if (sc->sc_naddrs == 0 && sc->sc_naddrs6 == 0)
+ carp_destroy(sc); /* last address gone; carp_destroy() releases the softc lock itself */
+ else
+ CARP_UNLOCK(sc);
+
+ CIF_FREE(cif);
+
+ sx_xunlock(&carp_sx);
}
-void
-carp_carpdev_state(struct ifnet *ifp)
+static void
+carp_set_state(struct carp_softc *sc, int state, const char *reason) /* Move sc to state if it differs, logging the transition with reason and sending a devctl notification; requires the softc lock. */
{
- struct carp_if *cif;
- cif = ifp->if_carp;
- CARP_LOCK(cif);
- carp_carpdev_state_locked(cif);
- CARP_UNLOCK(cif);
+ CARP_LOCK_ASSERT(sc);
+
+ if (sc->sc_state != state) {
+ const char *carp_states[] = { CARP_STATES };
+ char subsys[IFNAMSIZ+5];
+
+ snprintf(subsys, IFNAMSIZ+5, "%u@%s", sc->sc_vhid,
+ sc->sc_carpdev->if_xname); /* subsystem name has the form "<vhid>@<interface>" */
+
+ CARP_LOG("%s: %s -> %s (%s)\n", subsys,
+ carp_states[sc->sc_state], carp_states[state], reason);
+
+ sc->sc_state = state;
+
+ devctl_notify("CARP", subsys, carp_states[state], NULL); /* the notification type is the new state's name */
+ }
}
static void
-carp_carpdev_state_locked(struct carp_if *cif)
+carp_linkstate(struct ifnet *ifp) /* Re-evaluate the state of every softc on ifp; ifp->if_carp is dereferenced without a NULL check. */
{
struct carp_softc *sc;
- TAILQ_FOREACH(sc, &cif->vhif_vrs, sc_list)
- carp_sc_state_locked(sc);
+ CIF_LOCK(ifp->if_carp);
+ IFNET_FOREACH_CARP(ifp, sc) {
+ CARP_LOCK(sc); /* carp_sc_state() asserts the softc lock */
+ carp_sc_state(sc);
+ CARP_UNLOCK(sc);
+ }
+ CIF_UNLOCK(ifp->if_carp);
}
static void
-carp_sc_state_locked(struct carp_softc *sc)
+carp_sc_state(struct carp_softc *sc) /* Reset sc to INIT according to the parent interface's link and IFF_UP state, adjusting the demotion counter once per down/up change; requires the softc lock. */
{
- CARP_SCLOCK_ASSERT(sc);
+
+ CARP_LOCK_ASSERT(sc);
if (sc->sc_carpdev->if_link_state != LINK_STATE_UP ||
!(sc->sc_carpdev->if_flags & IFF_UP)) {
- sc->sc_flags_backup = SC2IFP(sc)->if_flags;
- SC2IFP(sc)->if_flags &= ~IFF_UP;
- SC2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING;
callout_stop(&sc->sc_ad_tmo);
+#ifdef INET
callout_stop(&sc->sc_md_tmo);
+#endif
+#ifdef INET6
callout_stop(&sc->sc_md6_tmo);
- carp_set_state(sc, INIT);
+#endif
+ carp_set_state(sc, INIT, "hardware interface down");
carp_setrun(sc, 0);
- if (!sc->sc_suppress) {
- carp_suppress_preempt++;
- if (carp_suppress_preempt == 1) {
- CARP_SCUNLOCK(sc);
- carp_send_ad_all();
- CARP_SCLOCK(sc);
- }
- }
+ if (!sc->sc_suppress)
+ carp_demote_adj(V_carp_ifdown_adj, "interface down"); /* demote only on the first transition to down */
sc->sc_suppress = 1;
} else {
- SC2IFP(sc)->if_flags |= sc->sc_flags_backup;
- carp_set_state(sc, INIT);
+ carp_set_state(sc, INIT, "hardware interface up");
carp_setrun(sc, 0);
if (sc->sc_suppress)
- carp_suppress_preempt--;
+ carp_demote_adj(-V_carp_ifdown_adj, "interface up"); /* undo the demotion applied when the interface went down */
sc->sc_suppress = 0;
}
+}
+
+static void
+carp_demote_adj(int adj, char *reason) /* Add adj (which may be negative) to the demotion counter, log it with reason, and queue the send-all task. */
+{
+ atomic_add_int(&V_carp_demotion, adj);
+ CARP_LOG("demoted by %d to %d (%s)\n", adj, V_carp_demotion, reason);
+ taskqueue_enqueue(taskqueue_swi, &carp_sendall_task); /* only queues the task; nothing is sent from this function directly */
+}
+
+static int
+carp_demote_adj_sysctl(SYSCTL_HANDLER_ARGS) /* Sysctl handler: reads report the current demotion counter; a written value is applied as an adjustment, not as a new absolute value. */
+{
+ int new, error;
- return;
+ new = V_carp_demotion;
+ error = sysctl_handle_int(oidp, &new, 0, req);
+ if (error || !req->newptr)
+ return (error); /* error, or a plain read with no new value supplied */
+
+ carp_demote_adj(new, "sysctl"); /* carp_demote_adj() adds its argument to the counter */
+
+ return (0);
}
#ifdef INET
@@ -2313,7 +2029,7 @@ static struct protosw in_carp_protosw = {
.pr_protocol = IPPROTO_CARP,
.pr_flags = PR_ATOMIC|PR_ADDR,
.pr_input = carp_input,
- .pr_output = (pr_output_t *)rip_output,
+ .pr_output = rip_output,
.pr_ctloutput = rip_ctloutput,
.pr_usrreqs = &rip_usrreqs
};
@@ -2321,7 +2037,7 @@ static struct protosw in_carp_protosw = {
#ifdef INET6
extern struct domain inet6domain;
-static struct ip6protosw in6_carp_protosw = {
+static struct protosw in6_carp_protosw = {
.pr_type = SOCK_RAW,
.pr_domain = &inet6domain,
.pr_protocol = IPPROTO_CARP,
@@ -2337,10 +2053,6 @@ static void
carp_mod_cleanup(void)
{
- if (if_detach_event_tag == NULL)
- return;
- EVENTHANDLER_DEREGISTER(ifnet_departure_event, if_detach_event_tag);
- if_clone_detach(&carp_cloner);
#ifdef INET
if (proto_reg[CARP_INET] == 0) {
(void)ipproto_unregister(IPPROTO_CARP);
@@ -2358,10 +2070,19 @@ carp_mod_cleanup(void)
carp_iamatch6_p = NULL;
carp_macmatch6_p = NULL;
#endif
+ carp_ioctl_p = NULL;
+ carp_attach_p = NULL;
+ carp_detach_p = NULL;
+ carp_get_vhid_p = NULL;
carp_linkstate_p = NULL;
carp_forus_p = NULL;
carp_output_p = NULL;
+ carp_demote_adj_p = NULL;
+ carp_master_p = NULL;
+ mtx_unlock(&carp_mtx);
+ taskqueue_drain(taskqueue_swi, &carp_sendall_task);
mtx_destroy(&carp_mtx);
+ sx_destroy(&carp_sx);
}
static int
@@ -2369,22 +2090,24 @@ carp_mod_load(void)
{
int err;
- if_detach_event_tag = EVENTHANDLER_REGISTER(ifnet_departure_event,
- carp_ifdetach, NULL, EVENTHANDLER_PRI_ANY);
- if (if_detach_event_tag == NULL)
- return (ENOMEM);
mtx_init(&carp_mtx, "carp_mtx", NULL, MTX_DEF);
- LIST_INIT(&carpif_list);
- if_clone_attach(&carp_cloner);
- carp_linkstate_p = carp_carpdev_state;
+ sx_init(&carp_sx, "carp_sx");
+ LIST_INIT(&carp_list);
+ carp_get_vhid_p = carp_get_vhid;
carp_forus_p = carp_forus;
carp_output_p = carp_output;
+ carp_linkstate_p = carp_linkstate;
+ carp_ioctl_p = carp_ioctl;
+ carp_attach_p = carp_attach;
+ carp_detach_p = carp_detach;
+ carp_demote_adj_p = carp_demote_adj;
+ carp_master_p = carp_master;
#ifdef INET6
carp_iamatch6_p = carp_iamatch6;
carp_macmatch6_p = carp_macmatch6;
proto_reg[CARP_INET6] = pf_proto_register(PF_INET6,
(struct protosw *)&in6_carp_protosw);
- if (proto_reg[CARP_INET6] != 0) {
+ if (proto_reg[CARP_INET6]) {
printf("carp: error %d attaching to PF_INET6\n",
proto_reg[CARP_INET6]);
carp_mod_cleanup();
@@ -2400,7 +2123,7 @@ carp_mod_load(void)
#ifdef INET
carp_iamatch_p = carp_iamatch;
proto_reg[CARP_INET] = pf_proto_register(PF_INET, &in_carp_protosw);
- if (proto_reg[CARP_INET] != 0) {
+ if (proto_reg[CARP_INET]) {
printf("carp: error %d attaching to PF_INET\n",
proto_reg[CARP_INET]);
carp_mod_cleanup();
@@ -2413,7 +2136,7 @@ carp_mod_load(void)
return (err);
}
#endif
- return 0;
+ return (0);
}
static int
@@ -2424,17 +2147,13 @@ carp_modevent(module_t mod, int type, void *data)
return carp_mod_load();
/* NOTREACHED */
case MOD_UNLOAD:
- /*
- * XXX: For now, disallow module unloading by default due to
- * a race condition where a thread may dereference one of the
- * function pointer hooks after the module has been
- * unloaded, during processing of a packet, causing a panic.
- */
-#ifdef CARPMOD_CAN_UNLOAD
- carp_mod_cleanup();
-#else
- return (EBUSY);
-#endif
+ mtx_lock(&carp_mtx);
+ if (LIST_EMPTY(&carp_list))
+ carp_mod_cleanup();
+ else {
+ mtx_unlock(&carp_mtx);
+ return (EBUSY);
+ }
break;
default:
diff --git a/freebsd/sys/netinet/ip_carp.h b/freebsd/sys/netinet/ip_carp.h
index 2f2b4f28..5b7e5064 100644
--- a/freebsd/sys/netinet/ip_carp.h
+++ b/freebsd/sys/netinet/ip_carp.h
@@ -117,69 +117,57 @@ struct carpstats {
uint64_t carps_preempt; /* if enabled, preemptions */
};
-#ifdef _KERNEL
-#define CARPSTATS_ADD(name, val) carpstats.name += (val)
-#define CARPSTATS_INC(name) CARPSTATS_ADD(name, 1)
-#endif
-
/*
* Configuration structure for SIOCSVH SIOCGVH
*/
struct carpreq {
+ int carpr_count;
+ int carpr_vhid;
+#define CARP_MAXVHID 255
int carpr_state;
#define CARP_STATES "INIT", "BACKUP", "MASTER"
#define CARP_MAXSTATE 2
- int carpr_vhid;
int carpr_advskew;
+#define CARP_MAXSKEW 240
int carpr_advbase;
unsigned char carpr_key[CARP_KEY_LEN];
};
#define SIOCSVH _IOWR('i', 245, struct ifreq)
#define SIOCGVH _IOWR('i', 246, struct ifreq)
-/*
- * Names for CARP sysctl objects
- */
-#define CARPCTL_ALLOW 1 /* accept incoming CARP packets */
-#define CARPCTL_PREEMPT 2 /* high-pri backup preemption mode */
-#define CARPCTL_LOG 3 /* log bad packets */
-#define CARPCTL_STATS 4 /* statistics (read-only) */
-#define CARPCTL_ARPBALANCE 5 /* balance arp responses */
-#define CARPCTL_MAXID 6
-
-#define CARPCTL_NAMES { \
- { 0, 0 }, \
- { "allow", CTLTYPE_INT }, \
- { "preempt", CTLTYPE_INT }, \
- { "log", CTLTYPE_INT }, \
- { "stats", CTLTYPE_STRUCT }, \
- { "arpbalance", CTLTYPE_INT }, \
-}
-
#ifdef _KERNEL
-void carp_carpdev_state(struct ifnet *);
-void carp_input (struct mbuf *, int);
-int carp6_input (struct mbuf **, int *, int);
-int carp_output (struct ifnet *, struct mbuf *, struct sockaddr *,
- struct rtentry *);
-int carp_iamatch (struct ifnet *, struct in_ifaddr *, struct in_addr *,
- u_int8_t **);
+int carp_ioctl(struct ifreq *, u_long, struct thread *);
+int carp_attach(struct ifaddr *, int);
+void carp_detach(struct ifaddr *);
+void carp_carpdev_state(struct ifnet *);
+int carp_input(struct mbuf **, int *, int);
+int carp6_input (struct mbuf **, int *, int);
+int carp_output (struct ifnet *, struct mbuf *,
+ const struct sockaddr *);
+int carp_master(struct ifaddr *);
+int carp_iamatch(struct ifaddr *, uint8_t **);
struct ifaddr *carp_iamatch6(struct ifnet *, struct in6_addr *);
caddr_t carp_macmatch6(struct ifnet *, struct mbuf *, const struct in6_addr *);
-struct ifnet *carp_forus (struct ifnet *, u_char *);
+int carp_forus(struct ifnet *, u_char *);
/* These are external networking stack hooks for CARP */
/* net/if.c */
+extern int (*carp_ioctl_p)(struct ifreq *, u_long, struct thread *);
+extern int (*carp_attach_p)(struct ifaddr *, int);
+extern void (*carp_detach_p)(struct ifaddr *);
extern void (*carp_linkstate_p)(struct ifnet *);
+extern void (*carp_demote_adj_p)(int, char *);
+extern int (*carp_master_p)(struct ifaddr *);
/* net/if_bridge.c net/if_ethersubr.c */
-extern struct ifnet *(*carp_forus_p)(struct ifnet *, u_char *);
+extern int (*carp_forus_p)(struct ifnet *, u_char *);
/* net/if_ethersubr.c */
extern int (*carp_output_p)(struct ifnet *, struct mbuf *,
- struct sockaddr *, struct rtentry *);
+ const struct sockaddr *);
+/* net/rtsock.c */
+extern int (*carp_get_vhid_p)(struct ifaddr *);
#ifdef INET
/* netinet/if_ether.c */
-extern int (*carp_iamatch_p)(struct ifnet *, struct in_ifaddr *,
- struct in_addr *, u_int8_t **);
+extern int (*carp_iamatch_p)(struct ifaddr *, uint8_t **);
#endif
#ifdef INET6
/* netinet6/nd6_nbr.c */
diff --git a/freebsd/sys/netinet/ip_divert.c b/freebsd/sys/netinet/ip_divert.c
index 879f411f..b43ebb7c 100644
--- a/freebsd/sys/netinet/ip_divert.c
+++ b/freebsd/sys/netinet/ip_divert.c
@@ -32,16 +32,15 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
-#if !defined(KLD_MODULE)
#include <rtems/bsd/local/opt_inet.h>
+#include <rtems/bsd/local/opt_inet6.h>
#include <rtems/bsd/local/opt_sctp.h>
#ifndef INET
-#error "IPDIVERT requires INET."
-#endif
+#error "IPDIVERT requires INET"
#endif
-#include <rtems/bsd/local/opt_inet6.h>
#include <rtems/bsd/sys/param.h>
+#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <rtems/bsd/sys/lock.h>
#include <sys/malloc.h>
@@ -57,6 +56,7 @@ __FBSDID("$FreeBSD$");
#include <net/vnet.h>
#include <net/if.h>
+#include <net/if_var.h>
#include <net/netisr.h>
#include <netinet/in.h>
@@ -160,27 +160,30 @@ div_init(void)
* place for hashbase == NULL.
*/
in_pcbinfo_init(&V_divcbinfo, "div", &V_divcb, 1, 1, "divcb",
- div_inpcb_init, div_inpcb_fini, UMA_ZONE_NOFREE,
- IPI_HASHFIELDS_NONE);
+ div_inpcb_init, div_inpcb_fini, 0, IPI_HASHFIELDS_NONE);
}
static void
-div_destroy(void)
+div_destroy(void *unused __unused)
{
in_pcbinfo_destroy(&V_divcbinfo);
}
+VNET_SYSUNINIT(divert, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY,
+ div_destroy, NULL);
/*
* IPPROTO_DIVERT is not in the real IP protocol number space; this
* function should never be called. Just in case, drop any packets.
*/
-static void
-div_input(struct mbuf *m, int off)
+static int
+div_input(struct mbuf **mp, int *offp, int proto)
{
+ struct mbuf *m = *mp;
KMOD_IPSTAT_INC(ips_noproto);
m_freem(m);
+ return (IPPROTO_DONE);
}
/*
@@ -206,23 +209,19 @@ divert_packet(struct mbuf *m, int incoming)
}
/* Assure header */
if (m->m_len < sizeof(struct ip) &&
- (m = m_pullup(m, sizeof(struct ip))) == 0)
+ (m = m_pullup(m, sizeof(struct ip))) == NULL)
return;
ip = mtod(m, struct ip *);
/* Delayed checksums are currently not compatible with divert. */
if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
- ip->ip_len = ntohs(ip->ip_len);
in_delayed_cksum(m);
m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
- ip->ip_len = htons(ip->ip_len);
}
#ifdef SCTP
if (m->m_pkthdr.csum_flags & CSUM_SCTP) {
- ip->ip_len = ntohs(ip->ip_len);
sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2));
m->m_pkthdr.csum_flags &= ~CSUM_SCTP;
- ip->ip_len = htons(ip->ip_len);
}
#endif
bzero(&divsrc, sizeof(divsrc));
@@ -394,10 +393,6 @@ div_output(struct socket *so, struct mbuf *m, struct sockaddr_in *sin,
INP_RUNLOCK(inp);
goto cantsend;
}
-
- /* Convert fields to host order for ip_output() */
- ip->ip_len = ntohs(ip->ip_len);
- ip->ip_off = ntohs(ip->ip_off);
break;
#ifdef INET6
case IPV6_VERSION >> 4:
@@ -410,8 +405,6 @@ div_output(struct socket *so, struct mbuf *m, struct sockaddr_in *sin,
INP_RUNLOCK(inp);
goto cantsend;
}
-
- ip6->ip6_plen = ntohs(ip6->ip6_plen);
break;
}
#endif
@@ -611,7 +604,7 @@ div_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
/* Packet must have a header (but that's about it) */
if (m->m_len < sizeof (struct ip) &&
- (m = m_pullup(m, sizeof (struct ip))) == 0) {
+ (m = m_pullup(m, sizeof (struct ip))) == NULL) {
KMOD_IPSTAT_INC(ips_toosmall);
m_freem(m);
return EINVAL;
@@ -677,7 +670,7 @@ div_pcblist(SYSCTL_HANDLER_ARGS)
return error;
inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
- if (inp_list == 0)
+ if (inp_list == NULL)
return ENOMEM;
INP_INFO_RLOCK(&V_divcbinfo);
@@ -766,9 +759,6 @@ struct protosw div_protosw = {
.pr_ctlinput = div_ctlinput,
.pr_ctloutput = ip_ctloutput,
.pr_init = div_init,
-#ifdef VIMAGE
- .pr_destroy = div_destroy,
-#endif
.pr_usrreqs = &div_usrreqs
};
@@ -776,9 +766,6 @@ static int
div_modevent(module_t mod, int type, void *unused)
{
int err = 0;
-#ifndef VIMAGE
- int n;
-#endif
switch (type) {
case MOD_LOAD:
@@ -803,10 +790,6 @@ div_modevent(module_t mod, int type, void *unused)
err = EPERM;
break;
case MOD_UNLOAD:
-#ifdef VIMAGE
- err = EPERM;
- break;
-#else
/*
* Forced unload.
*
@@ -819,8 +802,7 @@ div_modevent(module_t mod, int type, void *unused)
* we destroy the lock.
*/
INP_INFO_WLOCK(&V_divcbinfo);
- n = V_divcbinfo.ipi_count;
- if (n != 0) {
+ if (V_divcbinfo.ipi_count != 0) {
err = EBUSY;
INP_INFO_WUNLOCK(&V_divcbinfo);
break;
@@ -828,10 +810,11 @@ div_modevent(module_t mod, int type, void *unused)
ip_divert_ptr = NULL;
err = pf_proto_unregister(PF_INET, IPPROTO_DIVERT, SOCK_RAW);
INP_INFO_WUNLOCK(&V_divcbinfo);
- div_destroy();
+#ifndef VIMAGE
+ div_destroy(NULL);
+#endif
EVENTHANDLER_DEREGISTER(maxsockets_change, ip_divert_event_tag);
break;
-#endif /* !VIMAGE */
default:
err = EOPNOTSUPP;
break;
@@ -845,6 +828,6 @@ static moduledata_t ipdivertmod = {
0
};
-DECLARE_MODULE(ipdivert, ipdivertmod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY);
-MODULE_DEPEND(ipdivert, ipfw, 2, 2, 2);
+DECLARE_MODULE(ipdivert, ipdivertmod, SI_SUB_PROTO_FIREWALL, SI_ORDER_ANY);
+MODULE_DEPEND(ipdivert, ipfw, 3, 3, 3);
MODULE_VERSION(ipdivert, 1);
diff --git a/freebsd/sys/netinet/ip_dummynet.h b/freebsd/sys/netinet/ip_dummynet.h
index dc2c3412..377b5b09 100644
--- a/freebsd/sys/netinet/ip_dummynet.h
+++ b/freebsd/sys/netinet/ip_dummynet.h
@@ -29,7 +29,7 @@
#ifndef _IP_DUMMYNET_H
#define _IP_DUMMYNET_H
-
+#define NEW_AQM
/*
* Definition of the kernel-userland API for dummynet.
*
@@ -85,7 +85,13 @@ enum {
/* special commands for emulation of sysctl variables */
DN_SYSCTL_GET,
DN_SYSCTL_SET,
-
+#ifdef NEW_AQM
+ /* subtypes used for setting/getting extra parameters.
+ * these subtypes used with IP_DUMMYNET3 command (get)
+ * and DN_TEXT (set). */
+ DN_AQM_PARAMS, /* AQM extra params */
+ DN_SCH_PARAMS, /* scheduler extra params */
+#endif
DN_LAST,
};
@@ -104,6 +110,10 @@ enum { /* user flags */
DN_HAS_PROFILE = 0x0010, /* a link has a profile */
DN_IS_RED = 0x0020,
DN_IS_GENTLE_RED= 0x0040,
+ DN_IS_ECN = 0x0080,
+ #ifdef NEW_AQM
+ DN_IS_AQM = 0x0100, /* AQMs: e.g Codel & PIE */
+ #endif
DN_PIPE_CMD = 0x1000, /* pipe config... */
};
@@ -171,8 +181,8 @@ struct dn_flow {
struct ipfw_flow_id fid;
uint64_t tot_pkts; /* statistics counters */
uint64_t tot_bytes;
- uint32_t length; /* Queue lenght, in packets */
- uint32_t len_bytes; /* Queue lenght, in bytes */
+ uint32_t length; /* Queue length, in packets */
+ uint32_t len_bytes; /* Queue length, in bytes */
uint32_t drops;
};
@@ -209,7 +219,19 @@ struct dn_profile {
int samples[ED_MAX_SAMPLES_NO]; /* may be shorter */
};
-
+#ifdef NEW_AQM
+/* Extra parameters for AQM and scheduler.
+ * This struct is used to pass and retrieve parameters (configurations)
+ * to/from AQM and Scheduler.
+ */
+struct dn_extra_parms {
+ struct dn_id oid;
+ char name[16];
+ uint32_t nr;
+#define DN_MAX_EXTRA_PARM 10
+ int64_t par[DN_MAX_EXTRA_PARM];
+};
+#endif
/*
* Overall structure of dummynet
diff --git a/freebsd/sys/netinet/ip_ecn.h b/freebsd/sys/netinet/ip_ecn.h
index 6a814160..c5c1c4eb 100644
--- a/freebsd/sys/netinet/ip_ecn.h
+++ b/freebsd/sys/netinet/ip_ecn.h
@@ -38,10 +38,6 @@
#ifndef _NETINET_IP_ECN_H_
#define _NETINET_IP_ECN_H_
-#if defined(_KERNEL) && !defined(_LKM)
-#include <rtems/bsd/local/opt_inet.h>
-#endif
-
#define ECN_ALLOWED 1 /* ECN allowed */
#define ECN_FORBIDDEN 0 /* ECN forbidden */
#define ECN_NOCARE (-1) /* no consideration to ECN */
diff --git a/freebsd/sys/netinet/ip_encap.c b/freebsd/sys/netinet/ip_encap.c
index 14f8cd51..19ff1a09 100644
--- a/freebsd/sys/netinet/ip_encap.c
+++ b/freebsd/sys/netinet/ip_encap.c
@@ -67,6 +67,8 @@ __FBSDID("$FreeBSD$");
#include <rtems/bsd/sys/param.h>
#include <sys/systm.h>
+#include <rtems/bsd/sys/lock.h>
+#include <sys/mutex.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/mbuf.h>
@@ -86,7 +88,6 @@ __FBSDID("$FreeBSD$");
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
-#include <netinet6/ip6protosw.h>
#endif
#include <machine/stdarg.h>
@@ -98,14 +99,14 @@ static MALLOC_DEFINE(M_NETADDR, "encap_export_host", "Export host address struct
static void encap_add(struct encaptab *);
static int mask_match(const struct encaptab *, const struct sockaddr *,
const struct sockaddr *);
-static void encap_fillarg(struct mbuf *, const struct encaptab *);
+static void encap_fillarg(struct mbuf *, void *);
/*
* All global variables in ip_encap.c are locked using encapmtx.
*/
static struct mtx encapmtx;
MTX_SYSINIT(encapmtx, &encapmtx, "encapmtx", MTX_DEF);
-LIST_HEAD(, encaptab) encaptab = LIST_HEAD_INITIALIZER(encaptab);
+static LIST_HEAD(, encaptab) encaptab = LIST_HEAD_INITIALIZER(encaptab);
/*
* We currently keey encap_init() for source code compatibility reasons --
@@ -117,18 +118,20 @@ encap_init(void)
}
#ifdef INET
-void
-encap4_input(struct mbuf *m, int off)
+int
+encap4_input(struct mbuf **mp, int *offp, int proto)
{
struct ip *ip;
- int proto;
+ struct mbuf *m;
struct sockaddr_in s, d;
const struct protosw *psw;
struct encaptab *ep, *match;
- int prio, matchprio;
+ void *arg;
+ int matchprio, off, prio;
+ m = *mp;
+ off = *offp;
ip = mtod(m, struct ip *);
- proto = ip->ip_p;
bzero(&s, sizeof(s));
s.sin_family = AF_INET;
@@ -139,6 +142,8 @@ encap4_input(struct mbuf *m, int off)
d.sin_len = sizeof(struct sockaddr_in);
d.sin_addr = ip->ip_dst;
+ arg = NULL;
+ psw = NULL;
match = NULL;
matchprio = 0;
mtx_lock(&encapmtx);
@@ -183,21 +188,24 @@ encap4_input(struct mbuf *m, int off)
match = ep;
}
}
+ if (match != NULL) {
+ psw = match->psw;
+ arg = match->arg;
+ }
mtx_unlock(&encapmtx);
- if (match) {
+ if (match != NULL) {
/* found a match, "match" has the best one */
- psw = match->psw;
- if (psw && psw->pr_input) {
- encap_fillarg(m, match);
- (*psw->pr_input)(m, off);
+ if (psw != NULL && psw->pr_input != NULL) {
+ encap_fillarg(m, arg);
+ (*psw->pr_input)(mp, offp, proto);
} else
m_freem(m);
- return;
+ return (IPPROTO_DONE);
}
/* last resort: inject to raw socket */
- rip_input(m, off);
+ return (rip_input(mp, offp, proto));
}
#endif
@@ -208,8 +216,9 @@ encap6_input(struct mbuf **mp, int *offp, int proto)
struct mbuf *m = *mp;
struct ip6_hdr *ip6;
struct sockaddr_in6 s, d;
- const struct ip6protosw *psw;
+ const struct protosw *psw;
struct encaptab *ep, *match;
+ void *arg;
int prio, matchprio;
ip6 = mtod(m, struct ip6_hdr *);
@@ -223,6 +232,8 @@ encap6_input(struct mbuf **mp, int *offp, int proto)
d.sin6_len = sizeof(struct sockaddr_in6);
d.sin6_addr = ip6->ip6_dst;
+ arg = NULL;
+ psw = NULL;
match = NULL;
matchprio = 0;
mtx_lock(&encapmtx);
@@ -250,17 +261,20 @@ encap6_input(struct mbuf **mp, int *offp, int proto)
match = ep;
}
}
+ if (match != NULL) {
+ psw = match->psw;
+ arg = match->arg;
+ }
mtx_unlock(&encapmtx);
- if (match) {
+ if (match != NULL) {
/* found a match */
- psw = (const struct ip6protosw *)match->psw;
- if (psw && psw->pr_input) {
- encap_fillarg(m, match);
+ if (psw != NULL && psw->pr_input != NULL) {
+ encap_fillarg(m, arg);
return (*psw->pr_input)(mp, offp, proto);
} else {
m_freem(m);
- return IPPROTO_DONE;
+ return (IPPROTO_DONE);
}
}
@@ -439,14 +453,16 @@ mask_match(const struct encaptab *ep, const struct sockaddr *sp,
}
static void
-encap_fillarg(struct mbuf *m, const struct encaptab *ep)
+encap_fillarg(struct mbuf *m, void *arg)
{
struct m_tag *tag;
- tag = m_tag_get(PACKET_TAG_ENCAP, sizeof (void*), M_NOWAIT);
- if (tag) {
- *(void**)(tag+1) = ep->arg;
- m_tag_prepend(m, tag);
+ if (arg != NULL) {
+ tag = m_tag_get(PACKET_TAG_ENCAP, sizeof(void *), M_NOWAIT);
+ if (tag != NULL) {
+ *(void**)(tag+1) = arg;
+ m_tag_prepend(m, tag);
+ }
}
}
diff --git a/freebsd/sys/netinet/ip_encap.h b/freebsd/sys/netinet/ip_encap.h
index 3b1a5aee..0b8dbd6f 100644
--- a/freebsd/sys/netinet/ip_encap.h
+++ b/freebsd/sys/netinet/ip_encap.h
@@ -49,7 +49,7 @@ struct encaptab {
};
void encap_init(void);
-void encap4_input(struct mbuf *, int);
+int encap4_input(struct mbuf **, int *, int);
int encap6_input(struct mbuf **, int *, int);
const struct encaptab *encap_attach(int, int, const struct sockaddr *,
const struct sockaddr *, const struct sockaddr *,
diff --git a/freebsd/sys/netinet/ip_fastfwd.c b/freebsd/sys/netinet/ip_fastfwd.c
index 863b9a16..19dfb1ab 100644
--- a/freebsd/sys/netinet/ip_fastfwd.c
+++ b/freebsd/sys/netinet/ip_fastfwd.c
@@ -78,7 +78,6 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
-#include <rtems/bsd/local/opt_ipfw.h>
#include <rtems/bsd/local/opt_ipstealth.h>
#include <rtems/bsd/sys/param.h>
@@ -87,6 +86,7 @@ __FBSDID("$FreeBSD$");
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
+#include <sys/sdt.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
@@ -99,6 +99,7 @@ __FBSDID("$FreeBSD$");
#include <net/vnet.h>
#include <netinet/in.h>
+#include <netinet/in_kdtrace.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
@@ -108,12 +109,6 @@ __FBSDID("$FreeBSD$");
#include <machine/in_cksum.h>
-static VNET_DEFINE(int, ipfastforward_active);
-#define V_ipfastforward_active VNET(ipfastforward_active)
-
-SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, fastforwarding, CTLFLAG_RW,
- &VNET_NAME(ipfastforward_active), 0, "Enable fast IP forwarding");
-
static struct sockaddr_in *
ip_findroute(struct route *ro, struct in_addr dest, struct mbuf *m)
{
@@ -158,7 +153,7 @@ ip_findroute(struct route *ro, struct in_addr dest, struct mbuf *m)
* to ip_input for full processing.
*/
struct mbuf *
-ip_fastforward(struct mbuf *m)
+ip_tryforward(struct mbuf *m)
{
struct ip *ip;
struct mbuf *m0 = NULL;
@@ -166,119 +161,20 @@ ip_fastforward(struct mbuf *m)
struct sockaddr_in *dst = NULL;
struct ifnet *ifp;
struct in_addr odest, dest;
- u_short sum, ip_len;
+ uint16_t ip_len, ip_off;
int error = 0;
- int hlen, mtu;
+ int mtu;
struct m_tag *fwd_tag = NULL;
/*
* Are we active and forwarding packets?
*/
- if (!V_ipfastforward_active || !V_ipforwarding)
- return m;
M_ASSERTVALID(m);
M_ASSERTPKTHDR(m);
bzero(&ro, sizeof(ro));
- /*
- * Step 1: check for packet drop conditions (and sanity checks)
- */
-
- /*
- * Is entire packet big enough?
- */
- if (m->m_pkthdr.len < sizeof(struct ip)) {
- IPSTAT_INC(ips_tooshort);
- goto drop;
- }
-
- /*
- * Is first mbuf large enough for ip header and is header present?
- */
- if (m->m_len < sizeof (struct ip) &&
- (m = m_pullup(m, sizeof (struct ip))) == NULL) {
- IPSTAT_INC(ips_toosmall);
- return NULL; /* mbuf already free'd */
- }
-
- ip = mtod(m, struct ip *);
-
- /*
- * Is it IPv4?
- */
- if (ip->ip_v != IPVERSION) {
- IPSTAT_INC(ips_badvers);
- goto drop;
- }
-
- /*
- * Is IP header length correct and is it in first mbuf?
- */
- hlen = ip->ip_hl << 2;
- if (hlen < sizeof(struct ip)) { /* minimum header length */
- IPSTAT_INC(ips_badhlen);
- goto drop;
- }
- if (hlen > m->m_len) {
- if ((m = m_pullup(m, hlen)) == NULL) {
- IPSTAT_INC(ips_badhlen);
- return NULL; /* mbuf already free'd */
- }
- ip = mtod(m, struct ip *);
- }
-
- /*
- * Checksum correct?
- */
- if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED)
- sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID);
- else {
- if (hlen == sizeof(struct ip))
- sum = in_cksum_hdr(ip);
- else
- sum = in_cksum(m, hlen);
- }
- if (sum) {
- IPSTAT_INC(ips_badsum);
- goto drop;
- }
-
- /*
- * Remember that we have checked the IP header and found it valid.
- */
- m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID);
-
- ip_len = ntohs(ip->ip_len);
-
- /*
- * Is IP length longer than packet we have got?
- */
- if (m->m_pkthdr.len < ip_len) {
- IPSTAT_INC(ips_tooshort);
- goto drop;
- }
-
- /*
- * Is packet longer than IP header tells us? If yes, truncate packet.
- */
- if (m->m_pkthdr.len > ip_len) {
- if (m->m_len == m->m_pkthdr.len) {
- m->m_len = ip_len;
- m->m_pkthdr.len = ip_len;
- } else
- m_adj(m, ip_len - m->m_pkthdr.len);
- }
-
- /*
- * Is packet from or to 127/8?
- */
- if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
- (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
- IPSTAT_INC(ips_badaddr);
- goto drop;
- }
#ifdef ALTQ
/*
@@ -289,16 +185,14 @@ ip_fastforward(struct mbuf *m)
#endif
/*
- * Step 2: fallback conditions to normal ip_input path processing
- */
-
- /*
* Only IP packets without options
*/
+ ip = mtod(m, struct ip *);
+
if (ip->ip_hl != (sizeof(struct ip) >> 2)) {
- if (ip_doopts == 1)
+ if (V_ip_doopts == 1)
return m;
- else if (ip_doopts == 2) {
+ else if (V_ip_doopts == 2) {
icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_FILTER_PROHIB,
0, 0);
return NULL; /* mbuf already free'd */
@@ -312,7 +206,7 @@ ip_fastforward(struct mbuf *m)
*
* XXX: Probably some of these checks could be direct drop
* conditions. However it is not clear whether there are some
- * hacks or obscure behaviours which make it neccessary to
+ * hacks or obscure behaviours which make it necessary to
* let ip_input handle it. We play safe here and let ip_input
* deal with it until it is proven that we can directly drop it.
*/
@@ -340,12 +234,6 @@ ip_fastforward(struct mbuf *m)
* Step 3: incoming packet firewall processing
*/
- /*
- * Convert to host representation
- */
- ip->ip_len = ntohs(ip->ip_len);
- ip->ip_off = ntohs(ip->ip_off);
-
odest.s_addr = dest.s_addr = ip->ip_dst.s_addr;
/*
@@ -464,8 +352,6 @@ passin:
forwardlocal:
/*
* Return packet for processing by ip_input().
- * Keep host byte order as expected at ip_input's
- * "ours"-label.
*/
m->m_flags |= M_FASTFWD_OURS;
if (ro.ro_rt)
@@ -491,29 +377,18 @@ passout:
/*
* Step 6: send off the packet
*/
+ ip_len = ntohs(ip->ip_len);
+ ip_off = ntohs(ip->ip_off);
/*
* Check if route is dampned (when ARP is unable to resolve)
*/
if ((ro.ro_rt->rt_flags & RTF_REJECT) &&
- (ro.ro_rt->rt_rmx.rmx_expire == 0 ||
- time_uptime < ro.ro_rt->rt_rmx.rmx_expire)) {
+ (ro.ro_rt->rt_expire == 0 || time_uptime < ro.ro_rt->rt_expire)) {
icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
goto consumed;
}
-#ifndef ALTQ
- /*
- * Check if there is enough space in the interface queue
- */
- if ((ifp->if_snd.ifq_len + ip->ip_len / ifp->if_mtu + 1) >=
- ifp->if_snd.ifq_maxlen) {
- IPSTAT_INC(ips_odropped);
- /* would send source quench here but that is depreciated */
- goto drop;
- }
-#endif
-
/*
* Check if media link state of interface is not down
*/
@@ -525,28 +400,27 @@ passout:
/*
* Check if packet fits MTU or if hardware will fragment for us
*/
- if (ro.ro_rt->rt_rmx.rmx_mtu)
- mtu = min(ro.ro_rt->rt_rmx.rmx_mtu, ifp->if_mtu);
+ if (ro.ro_rt->rt_mtu)
+ mtu = min(ro.ro_rt->rt_mtu, ifp->if_mtu);
else
mtu = ifp->if_mtu;
- if (ip->ip_len <= mtu ||
- (ifp->if_hwassist & CSUM_FRAGMENT && (ip->ip_off & IP_DF) == 0)) {
+ if (ip_len <= mtu) {
/*
- * Restore packet header fields to original values
+ * Avoid confusing lower layers.
*/
- ip->ip_len = htons(ip->ip_len);
- ip->ip_off = htons(ip->ip_off);
+ m_clrprotoflags(m);
/*
* Send off the packet via outgoing interface
*/
+ IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL);
error = (*ifp->if_output)(ifp, m,
(struct sockaddr *)dst, &ro);
} else {
/*
* Handle EMSGSIZE with icmp reply needfrag for TCP MTU discovery
*/
- if (ip->ip_off & IP_DF) {
+ if (ip_off & IP_DF) {
IPSTAT_INC(ips_cantfrag);
icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG,
0, mtu);
@@ -556,14 +430,8 @@ passout:
* We have to fragment the packet
*/
m->m_pkthdr.csum_flags |= CSUM_IP;
- /*
- * ip_fragment expects ip_len and ip_off in host byte
- * order but returns all packets in network byte order
- */
- if (ip_fragment(ip, &m, mtu, ifp->if_hwassist,
- (~ifp->if_hwassist & CSUM_DELAY_IP))) {
+ if (ip_fragment(ip, &m, mtu, ifp->if_hwassist))
goto drop;
- }
KASSERT(m != NULL, ("null mbuf and no error"));
/*
* Send off the fragments via outgoing interface
@@ -572,7 +440,12 @@ passout:
do {
m0 = m->m_nextpkt;
m->m_nextpkt = NULL;
+ /*
+ * Avoid confusing lower layers.
+ */
+ m_clrprotoflags(m);
+ IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL);
error = (*ifp->if_output)(ifp, m,
(struct sockaddr *)dst, &ro);
if (error)
@@ -592,7 +465,7 @@ passout:
if (error != 0)
IPSTAT_INC(ips_odropped);
else {
- ro.ro_rt->rt_rmx.rmx_pksent++;
+ counter_u64_add(ro.ro_rt->rt_pksent, 1);
IPSTAT_INC(ips_forward);
IPSTAT_INC(ips_fastforward);
}
diff --git a/freebsd/sys/netinet/ip_fw.h b/freebsd/sys/netinet/ip_fw.h
index 14b08f5e..d274ab27 100644
--- a/freebsd/sys/netinet/ip_fw.h
+++ b/freebsd/sys/netinet/ip_fw.h
@@ -36,25 +36,31 @@
*/
#define IPFW_DEFAULT_RULE 65535
+#define RESVD_SET 31 /*set for default and persistent rules*/
+#define IPFW_MAX_SETS 32 /* Number of sets supported by ipfw*/
+
/*
- * Default number of ipfw tables.
+ * Compat values for old clients
*/
+#ifndef _KERNEL
#define IPFW_TABLES_MAX 65535
#define IPFW_TABLES_DEFAULT 128
+#endif
/*
* Most commands (queue, pipe, tag, untag, limit...) can have a 16-bit
- * argument between 1 and 65534. The value 0 is unused, the value
- * 65535 (IP_FW_TABLEARG) is used to represent 'tablearg', i.e. the
- * can be 1..65534, or 65535 to indicate the use of a 'tablearg'
+ * argument between 1 and 65534. The value 0 (IP_FW_TARG) is used
+ * to represent 'tablearg' value, i.e. to indicate the use of a 'tablearg'
* result of the most recent table() lookup.
* Note that 16bit is only a historical limit, resulting from
* the use of a 16-bit fields for that value. In reality, we can have
- * 2^32 pipes, queues, tag values and so on, and use 0 as a tablearg.
+ * 2^32 pipes, queues, tag values and so on.
*/
#define IPFW_ARG_MIN 1
#define IPFW_ARG_MAX 65534
-#define IP_FW_TABLEARG 65535 /* XXX should use 0 */
+#define IP_FW_TABLEARG 65535 /* Compat value for old clients */
+#define IP_FW_TARG 0 /* Current tablearg value */
+#define IP_FW_NAT44_GLOBAL 65535 /* arg1 value for "nat global" */
/*
* Number of entries in the call stack of the call/return commands.
@@ -65,15 +71,66 @@
/* IP_FW3 header/opcodes */
typedef struct _ip_fw3_opheader {
uint16_t opcode; /* Operation opcode */
- uint16_t reserved[3]; /* Align to 64-bit boundary */
+ uint16_t version; /* Opcode version */
+ uint16_t reserved[2]; /* Align to 64-bit boundary */
} ip_fw3_opheader;
-
-/* IPFW extented tables support */
+/* IP_FW3 opcodes */
#define IP_FW_TABLE_XADD 86 /* add entry */
#define IP_FW_TABLE_XDEL 87 /* delete entry */
-#define IP_FW_TABLE_XGETSIZE 88 /* get table size */
+#define IP_FW_TABLE_XGETSIZE 88 /* get table size (deprecated) */
#define IP_FW_TABLE_XLIST 89 /* list table contents */
+#define IP_FW_TABLE_XDESTROY 90 /* destroy table */
+#define IP_FW_TABLES_XLIST 92 /* list all tables */
+#define IP_FW_TABLE_XINFO 93 /* request info for one table */
+#define IP_FW_TABLE_XFLUSH 94 /* flush table data */
+#define IP_FW_TABLE_XCREATE 95 /* create new table */
+#define IP_FW_TABLE_XMODIFY 96 /* modify existing table */
+#define IP_FW_XGET 97 /* Retrieve configuration */
+#define IP_FW_XADD 98 /* add rule */
+#define IP_FW_XDEL 99 /* del rule */
+#define IP_FW_XMOVE 100 /* move rules to different set */
+#define IP_FW_XZERO 101 /* clear accounting */
+#define IP_FW_XRESETLOG 102 /* zero rules logs */
+#define IP_FW_SET_SWAP 103 /* Swap between 2 sets */
+#define IP_FW_SET_MOVE 104 /* Move one set to another one */
+#define IP_FW_SET_ENABLE 105 /* Enable/disable sets */
+#define IP_FW_TABLE_XFIND 106 /* finds an entry */
+#define IP_FW_XIFLIST 107 /* list tracked interfaces */
+#define IP_FW_TABLES_ALIST 108 /* list table algorithms */
+#define IP_FW_TABLE_XSWAP 109 /* swap two tables */
+#define IP_FW_TABLE_VLIST 110 /* dump table value hash */
+
+#define IP_FW_NAT44_XCONFIG 111 /* Create/modify NAT44 instance */
+#define IP_FW_NAT44_DESTROY 112 /* Destroys NAT44 instance */
+#define IP_FW_NAT44_XGETCONFIG 113 /* Get NAT44 instance config */
+#define IP_FW_NAT44_LIST_NAT 114 /* List all NAT44 instances */
+#define IP_FW_NAT44_XGETLOG 115 /* Get log from NAT44 instance */
+
+#define IP_FW_DUMP_SOPTCODES 116 /* Dump available sopts/versions */
+#define IP_FW_DUMP_SRVOBJECTS 117 /* Dump existing named objects */
+
+#define IP_FW_NAT64STL_CREATE 130 /* Create stateless NAT64 instance */
+#define IP_FW_NAT64STL_DESTROY 131 /* Destroy stateless NAT64 instance */
+#define IP_FW_NAT64STL_CONFIG 132 /* Modify stateless NAT64 instance */
+#define IP_FW_NAT64STL_LIST 133 /* List stateless NAT64 instances */
+#define IP_FW_NAT64STL_STATS 134 /* Get NAT64STL instance statistics */
+#define IP_FW_NAT64STL_RESET_STATS 135 /* Reset NAT64STL instance statistics */
+
+#define IP_FW_NAT64LSN_CREATE 140 /* Create stateful NAT64 instance */
+#define IP_FW_NAT64LSN_DESTROY 141 /* Destroy stateful NAT64 instance */
+#define IP_FW_NAT64LSN_CONFIG 142 /* Modify stateful NAT64 instance */
+#define IP_FW_NAT64LSN_LIST 143 /* List stateful NAT64 instances */
+#define IP_FW_NAT64LSN_STATS 144 /* Get NAT64LSN instance statistics */
+#define IP_FW_NAT64LSN_LIST_STATES 145 /* Get stateful NAT64 states */
+#define IP_FW_NAT64LSN_RESET_STATS 146 /* Reset NAT64LSN instance statistics */
+
+#define IP_FW_NPTV6_CREATE 150 /* Create NPTv6 instance */
+#define IP_FW_NPTV6_DESTROY 151 /* Destroy NPTv6 instance */
+#define IP_FW_NPTV6_CONFIG 152 /* Modify NPTv6 instance */
+#define IP_FW_NPTV6_LIST 153 /* List NPTv6 instances */
+#define IP_FW_NPTV6_STATS 154 /* Get NPTv6 instance statistics */
+#define IP_FW_NPTV6_RESET_STATS 155 /* Reset NPTv6 instance statistics */
/*
* The kernel representation of ipfw rules is made of a list of
@@ -220,11 +277,14 @@ enum ipfw_opcodes { /* arguments (4 byte each) */
O_DSCP, /* 2 u32 = DSCP mask */
O_SETDSCP, /* arg1=DSCP value */
+ O_IP_FLOW_LOOKUP, /* arg1=table number, u32=value */
+
+ O_EXTERNAL_ACTION, /* arg1=id of external action handler */
+ O_EXTERNAL_INSTANCE, /* arg1=id of eaction handler instance */
O_LAST_OPCODE /* not an opcode! */
};
-
/*
* The extension header are filtered only for presence using a bit
* vector with a flag for each header.
@@ -341,6 +401,7 @@ typedef struct _ipfw_insn_if {
union {
struct in_addr ip;
int glob;
+ uint16_t kidx;
} p;
char name[IFNAMSIZ];
} ipfw_insn_if;
@@ -377,6 +438,8 @@ typedef struct _ipfw_insn_log {
u_int32_t log_left; /* how many left to log */
} ipfw_insn_log;
+/* Legacy NAT structures, compat only */
+#ifndef _KERNEL
/*
* Data structures required by both ipfw(8) and ipfw(4) but not part of the
* management API are protected by IPFW_INTERNAL.
@@ -438,6 +501,44 @@ struct cfg_nat {
#define SOF_REDIR sizeof(struct cfg_redir)
#define SOF_SPOOL sizeof(struct cfg_spool)
+#endif /* ifndef _KERNEL */
+
+
+struct nat44_cfg_spool {
+ struct in_addr addr;
+ uint16_t port;
+ uint16_t spare;
+};
+#define NAT44_REDIR_ADDR 0x01
+#define NAT44_REDIR_PORT 0x02
+#define NAT44_REDIR_PROTO 0x04
+
+/* Nat redirect configuration. */
+struct nat44_cfg_redir {
+ struct in_addr laddr; /* local ip address */
+ struct in_addr paddr; /* public ip address */
+ struct in_addr raddr; /* remote ip address */
+ uint16_t lport; /* local port */
+ uint16_t pport; /* public port */
+ uint16_t rport; /* remote port */
+ uint16_t pport_cnt; /* number of public ports */
+ uint16_t rport_cnt; /* number of remote ports */
+ uint16_t mode; /* type of redirect mode */
+ uint16_t spool_cnt; /* num of entry in spool chain */
+ uint16_t spare;
+ uint32_t proto; /* protocol: tcp/udp */
+};
+
+/* Nat configuration data struct. */
+struct nat44_cfg_nat {
+ char name[64]; /* nat name */
+ char if_name[64]; /* interface name */
+ uint32_t size; /* structure size incl. redirs */
+ struct in_addr ip; /* nat IPv4 address */
+ uint32_t mode; /* aliasing mode */
+ uint32_t redir_cnt; /* number of entry in spool chain */
+};
+
/* Nat command. */
typedef struct _ipfw_insn_nat {
ipfw_insn o;
@@ -471,15 +572,17 @@ typedef struct _ipfw_insn_icmp6 {
/*
* Here we have the structure representing an ipfw rule.
*
- * It starts with a general area (with link fields and counters)
- * followed by an array of one or more instructions, which the code
- * accesses as an array of 32-bit values.
- *
- * Given a rule pointer r:
+ * Layout:
+ * struct ip_fw_rule
+ * [ counter block, size = rule->cntr_len ]
+ * [ one or more instructions, size = rule->cmd_len * 4 ]
*
- * r->cmd is the start of the first instruction.
- * ACTION_PTR(r) is the start of the first action (things to do
- * once a rule matched).
+ * It starts with a general area (with link fields).
+ * Counter block may be next (if rule->cntr_len > 0),
+ * followed by an array of one or more instructions, which the code
+ * accesses as an array of 32-bit values. rule->cmd_len represents
+ * the total instructions length in u32 words, while act_ofs represents
+ * rule action offset in u32 words.
*
* When assembling instruction, remember the following:
*
@@ -490,11 +593,41 @@ typedef struct _ipfw_insn_icmp6 {
* + if a rule has an "altq" option, it comes after "log"
* + if a rule has an O_TAG option, it comes after "log" and "altq"
*
- * NOTE: we use a simple linked list of rules because we never need
- * to delete a rule without scanning the list. We do not use
- * queue(3) macros for portability and readability.
+ *
+ * All structures (excluding instructions) are u64-aligned.
+ * Please keep this.
*/
+struct ip_fw_rule {
+ uint16_t act_ofs; /* offset of action in 32-bit units */
+ uint16_t cmd_len; /* # of 32-bit words in cmd */
+ uint16_t spare;
+ uint8_t set; /* rule set (0..31) */
+ uint8_t flags; /* rule flags */
+ uint32_t rulenum; /* rule number */
+ uint32_t id; /* rule id */
+
+ ipfw_insn cmd[1]; /* storage for commands */
+};
+#define IPFW_RULE_NOOPT 0x01 /* Has no options in body */
+
+/* Unaligned version */
+
+/* Base ipfw rule counter block. */
+struct ip_fw_bcounter {
+ uint16_t size; /* Size of counter block, bytes */
+ uint8_t flags; /* flags for given block */
+ uint8_t spare;
+ uint32_t timestamp; /* tv_sec of last match */
+ uint64_t pcnt; /* Packet counter */
+ uint64_t bcnt; /* Byte counter */
+};
+
+
+#ifndef _KERNEL
+/*
+ * Legacy rule format
+ */
struct ip_fw {
struct ip_fw *x_next; /* linked list of rules */
struct ip_fw *next_rule; /* ptr to next [skipto] rule */
@@ -503,8 +636,7 @@ struct ip_fw {
uint16_t act_ofs; /* offset of action in 32-bit units */
uint16_t cmd_len; /* # of 32-bit words in cmd */
uint16_t rulenum; /* rule number */
- uint8_t set; /* rule set (0..31) */
-#define RESVD_SET 31 /* set for default and persistent rules */
+ uint8_t set; /* rule set (0..31) */
uint8_t _pad; /* padding */
uint32_t id; /* rule id */
@@ -515,12 +647,13 @@ struct ip_fw {
ipfw_insn cmd[1]; /* storage for commands */
};
+#endif
#define ACTION_PTR(rule) \
(ipfw_insn *)( (u_int32_t *)((rule)->cmd) + ((rule)->act_ofs) )
-#define RULESIZE(rule) (sizeof(struct ip_fw) + \
- ((struct ip_fw *)(rule))->cmd_len * 4 - 4)
+#define RULESIZE(rule) (sizeof(*(rule)) + (rule)->cmd_len * 4 - 4)
+
#if 1 // should be moved to in.h
/*
@@ -572,7 +705,8 @@ struct _ipfw_dyn_rule {
/* to generate keepalives) */
u_int16_t dyn_type; /* rule type */
u_int16_t count; /* refcount */
-};
+ u_int16_t kidx; /* index of named object */
+} __packed __aligned(8);
/*
* Definitions for IP option names.
@@ -598,9 +732,27 @@ struct _ipfw_dyn_rule {
* These are used for lookup tables.
*/
-#define IPFW_TABLE_CIDR 1 /* Table for holding IPv4/IPv6 prefixes */
+#define IPFW_TABLE_ADDR 1 /* Table for holding IPv4/IPv6 prefixes */
#define IPFW_TABLE_INTERFACE 2 /* Table for holding interface names */
-#define IPFW_TABLE_MAXTYPE 2 /* Maximum valid number */
+#define IPFW_TABLE_NUMBER 3 /* Table for holding ports/uid/gid/etc */
+#define IPFW_TABLE_FLOW 4 /* Table for holding flow data */
+#define IPFW_TABLE_MAXTYPE 4 /* Maximum valid number */
+
+#define IPFW_TABLE_CIDR IPFW_TABLE_ADDR /* compat */
+
+/* Value types */
+#define IPFW_VTYPE_LEGACY 0xFFFFFFFF /* All data is filled in */
+#define IPFW_VTYPE_SKIPTO 0x00000001 /* skipto/call/callreturn */
+#define IPFW_VTYPE_PIPE 0x00000002 /* pipe/queue */
+#define IPFW_VTYPE_FIB 0x00000004 /* setfib */
+#define IPFW_VTYPE_NAT 0x00000008 /* nat */
+#define IPFW_VTYPE_DSCP 0x00000010 /* dscp */
+#define IPFW_VTYPE_TAG 0x00000020 /* tag/untag */
+#define IPFW_VTYPE_DIVERT 0x00000040 /* divert/tee */
+#define IPFW_VTYPE_NETGRAPH 0x00000080 /* netgraph/ngtee */
+#define IPFW_VTYPE_LIMIT 0x00000100 /* limit */
+#define IPFW_VTYPE_NH4 0x00000200 /* IPv4 nexthop */
+#define IPFW_VTYPE_NH6 0x00000400 /* IPv6 nexthop */
typedef struct _ipfw_table_entry {
in_addr_t addr; /* network address */
@@ -614,6 +766,7 @@ typedef struct _ipfw_table_xentry {
uint8_t type; /* entry type */
uint8_t masklen; /* mask length */
uint16_t tbl; /* table number */
+ uint16_t flags; /* record flags */
uint32_t value; /* value */
union {
/* Longest field needs to be aligned by 4-byte boundary */
@@ -621,6 +774,7 @@ typedef struct _ipfw_table_xentry {
char iface[IF_NAMESIZE]; /* interface name */
} k;
} ipfw_table_xentry;
+#define IPFW_TCF_INET 0x01 /* CIDR flags: IPv4 record */
typedef struct _ipfw_table {
u_int32_t size; /* size of entries in bytes */
@@ -630,7 +784,7 @@ typedef struct _ipfw_table {
} ipfw_table;
typedef struct _ipfw_xtable {
- ip_fw3_opheader opheader; /* eXtended tables are controlled via IP_FW3 */
+ ip_fw3_opheader opheader; /* IP_FW3 opcode */
uint32_t size; /* size of entries in bytes */
uint32_t cnt; /* # of entries */
uint16_t tbl; /* table number */
@@ -638,4 +792,259 @@ typedef struct _ipfw_xtable {
ipfw_table_xentry xent[0]; /* entries */
} ipfw_xtable;
+typedef struct _ipfw_obj_tlv {
+ uint16_t type; /* TLV type */
+ uint16_t flags; /* TLV-specific flags */
+ uint32_t length; /* Total length, aligned to u64 */
+} ipfw_obj_tlv;
+#define IPFW_TLV_TBL_NAME 1
+#define IPFW_TLV_TBLNAME_LIST 2
+#define IPFW_TLV_RULE_LIST 3
+#define IPFW_TLV_DYNSTATE_LIST 4
+#define IPFW_TLV_TBL_ENT 5
+#define IPFW_TLV_DYN_ENT 6
+#define IPFW_TLV_RULE_ENT 7
+#define IPFW_TLV_TBLENT_LIST 8
+#define IPFW_TLV_RANGE 9
+#define IPFW_TLV_EACTION 10
+#define IPFW_TLV_COUNTERS 11
+#define IPFW_TLV_OBJDATA 12
+#define IPFW_TLV_STATE_NAME 14
+
+#define IPFW_TLV_EACTION_BASE 1000
+#define IPFW_TLV_EACTION_NAME(arg) (IPFW_TLV_EACTION_BASE + (arg))
+
+/* Generic data object: a TLV header immediately followed by the data. */
+typedef struct _ipfw_obj_data {
+ ipfw_obj_tlv head; /* TLV header */
+ void *data[0]; /* zero-length array naming the bytes after the header */
+} ipfw_obj_data;
+
+/* Object name TLV */
+typedef struct _ipfw_obj_ntlv {
+ ipfw_obj_tlv head; /* TLV header */
+ uint16_t idx; /* Name index */
+ uint8_t set; /* set, if applicable */
+ uint8_t type; /* object type, if applicable */
+ uint32_t spare; /* unused */
+ char name[64]; /* Null-terminated name */
+} ipfw_obj_ntlv;
+
+/* IPv4/IPv6 L4 flow description */
+struct tflow_entry {
+ uint8_t af; /* address family; presumably selects a4 or a6 below -- confirm */
+ uint8_t proto; /* L4 protocol */
+ uint16_t spare;
+ uint16_t sport; /* source port */
+ uint16_t dport; /* destination port */
+ union {
+ struct {
+ struct in_addr sip; /* IPv4 source address */
+ struct in_addr dip; /* IPv4 destination address */
+ } a4;
+ struct {
+ struct in6_addr sip6; /* IPv6 source address */
+ struct in6_addr dip6; /* IPv6 destination address */
+ } a6;
+ } a;
+};
+
+/* Table value; the fields parallel the IPFW_VTYPE_* value types. */
+typedef struct _ipfw_table_value {
+ uint32_t tag; /* O_TAG/O_TAGGED */
+ uint32_t pipe; /* O_PIPE/O_QUEUE */
+ uint16_t divert; /* O_DIVERT/O_TEE */
+ uint16_t skipto; /* skipto, CALLRET */
+ uint32_t netgraph; /* O_NETGRAPH/O_NGTEE */
+ uint32_t fib; /* O_SETFIB */
+ uint32_t nat; /* O_NAT */
+ uint32_t nh4; /* IPv4 nexthop */
+ uint8_t dscp; /* dscp */
+ uint8_t spare0;
+ uint16_t spare1;
+ struct in6_addr nh6; /* IPv6 nexthop */
+ uint32_t limit; /* O_LIMIT */
+ uint32_t zoneid; /* scope zone id for nh6 */
+ uint64_t reserved;
+} ipfw_table_value;
+
+/* Table entry TLV */
+typedef struct _ipfw_obj_tentry {
+ ipfw_obj_tlv head; /* TLV header */
+ uint8_t subtype; /* subtype (IPv4,IPv6) */
+ uint8_t masklen; /* mask length */
+ uint8_t result; /* request result */
+ uint8_t spare0;
+ uint16_t idx; /* Table name index */
+ uint16_t spare1;
+ union {
+ /* Longest field needs to be aligned by 8-byte boundary */
+ struct in_addr addr; /* IPv4 address */
+ uint32_t key; /* uid/gid/port */
+ struct in6_addr addr6; /* IPv6 address */
+ char iface[IF_NAMESIZE]; /* interface name */
+ struct tflow_entry flow;
+ } k;
+ union {
+ ipfw_table_value value; /* value data */
+ uint32_t kidx; /* value kernel index */
+ } v;
+} ipfw_obj_tentry;
+#define IPFW_TF_UPDATE 0x01 /* Update record if exists */
+/* Container TLV */
+#define IPFW_CTF_ATOMIC 0x01 /* Perform atomic operation */
+/* Operation results */
+#define IPFW_TR_IGNORED 0 /* Entry was ignored (rollback) */
+#define IPFW_TR_ADDED 1 /* Entry was successfully added */
+#define IPFW_TR_UPDATED 2 /* Entry was successfully updated*/
+#define IPFW_TR_DELETED 3 /* Entry was successfully deleted*/
+#define IPFW_TR_LIMIT 4 /* Entry was ignored (limit) */
+#define IPFW_TR_NOTFOUND 5 /* Entry was not found */
+#define IPFW_TR_EXISTS 6 /* Entry already exists */
+#define IPFW_TR_ERROR 7 /* Request has failed (unknown) */
+
+typedef struct _ipfw_obj_dyntlv {
+ ipfw_obj_tlv head;
+ ipfw_dyn_rule state;
+} ipfw_obj_dyntlv;
+#define IPFW_DF_LAST 0x01 /* Last state in chain */
+
+/* Container TLVs */
+typedef struct _ipfw_obj_ctlv {
+ ipfw_obj_tlv head; /* TLV header */
+ uint32_t count; /* Number of sub-TLVs */
+ uint16_t objsize; /* Single object size */
+ uint8_t version; /* TLV version */
+ uint8_t flags; /* TLV-specific flags */
+} ipfw_obj_ctlv;
+
+/* Range TLV */
+typedef struct _ipfw_range_tlv {
+ ipfw_obj_tlv head; /* TLV header */
+ uint32_t flags; /* Range flags */
+ uint16_t start_rule; /* Range start */
+ uint16_t end_rule; /* Range end */
+ uint32_t set; /* Range set to match */
+ uint32_t new_set; /* New set to move/swap to */
+} ipfw_range_tlv;
+#define IPFW_RCFLAG_RANGE 0x01 /* rule range is set */
+#define IPFW_RCFLAG_ALL 0x02 /* match ALL rules */
+#define IPFW_RCFLAG_SET 0x04 /* match rules in given set */
+/* User-settable flags */
+#define IPFW_RCFLAG_USER (IPFW_RCFLAG_RANGE | IPFW_RCFLAG_ALL | \
+ IPFW_RCFLAG_SET)
+/* Internally used flags */
+#define IPFW_RCFLAG_DEFAULT 0x0100 /* Do not skip default rule */
+
+typedef struct _ipfw_ta_tinfo {
+ uint32_t flags; /* Format flags */
+ uint32_t spare;
+ uint8_t taclass4; /* algorithm class */
+ uint8_t spare4;
+ uint16_t itemsize4; /* item size in runtime */
+ uint32_t size4; /* runtime structure size */
+ uint32_t count4; /* number of items in runtime */
+ uint8_t taclass6; /* algorithm class */
+ uint8_t spare6;
+ uint16_t itemsize6; /* item size in runtime */
+ uint32_t size6; /* runtime structure size */
+ uint32_t count6; /* number of items in runtime */
+} ipfw_ta_tinfo;
+#define IPFW_TACLASS_HASH 1 /* algo is based on hash */
+#define IPFW_TACLASS_ARRAY 2 /* algo is based on array */
+#define IPFW_TACLASS_RADIX 3 /* algo is based on radix tree */
+
+#define IPFW_TATFLAGS_DATA 0x0001 /* Has data filled in */
+#define IPFW_TATFLAGS_AFDATA 0x0002 /* Separate data per AF */
+#define IPFW_TATFLAGS_AFITEM 0x0004 /* diff. items per AF */
+
+typedef struct _ipfw_xtable_info {
+ uint8_t type; /* table type (addr,iface,..) */
+ uint8_t tflags; /* type flags */
+ uint16_t mflags; /* modification flags */
+ uint16_t flags; /* generic table flags */
+ uint16_t spare[3];
+ uint32_t vmask; /* bitmask with value types */
+ uint32_t set; /* set table is in */
+ uint32_t kidx; /* kernel index */
+ uint32_t refcnt; /* number of references */
+ uint32_t count; /* Number of records */
+ uint32_t size; /* Total size of records(export)*/
+ uint32_t limit; /* Max number of records */
+ char tablename[64]; /* table name */
+ char algoname[64]; /* algorithm name */
+ ipfw_ta_tinfo ta_info; /* additional algo stats */
+} ipfw_xtable_info;
+/* Generic table flags */
+#define IPFW_TGFLAGS_LOCKED 0x01 /* Table is locked from changes */
+/* Table type-specific flags */
+#define IPFW_TFFLAG_SRCIP 0x01
+#define IPFW_TFFLAG_DSTIP 0x02
+#define IPFW_TFFLAG_SRCPORT 0x04
+#define IPFW_TFFLAG_DSTPORT 0x08
+#define IPFW_TFFLAG_PROTO 0x10
+/* Table modification flags */
+#define IPFW_TMFLAGS_LIMIT 0x0002 /* Change limit value */
+#define IPFW_TMFLAGS_LOCK 0x0004 /* Change table lock state */
+
+typedef struct _ipfw_iface_info {
+ char ifname[64]; /* interface name */
+ uint32_t ifindex; /* interface index */
+ uint32_t flags; /* flags */
+ uint32_t refcnt; /* number of references */
+ uint32_t gencnt; /* number of changes */
+ uint64_t spare;
+} ipfw_iface_info;
+#define IPFW_IFFLAG_RESOLVED 0x01 /* Interface exists */
+
+typedef struct _ipfw_ta_info {
+ char algoname[64]; /* algorithm name */
+ uint32_t type; /* lookup type */
+ uint32_t flags;
+ uint32_t refcnt;
+ uint32_t spare0;
+ uint64_t spare1;
+} ipfw_ta_info;
+
+typedef struct _ipfw_obj_header {
+ ip_fw3_opheader opheader; /* IP_FW3 opcode */
+ uint32_t spare;
+ uint16_t idx; /* object name index */
+ uint8_t objtype; /* object type */
+ uint8_t objsubtype; /* object subtype */
+ ipfw_obj_ntlv ntlv; /* object name tlv */
+} ipfw_obj_header;
+
+typedef struct _ipfw_obj_lheader {
+ ip_fw3_opheader opheader; /* IP_FW3 opcode */
+ uint32_t set_mask; /* disabled set mask */
+ uint32_t count; /* Total objects count */
+ uint32_t size; /* Total size (incl. header) */
+ uint32_t objsize; /* Size of one object */
+} ipfw_obj_lheader;
+
+#define IPFW_CFG_GET_STATIC 0x01
+#define IPFW_CFG_GET_STATES 0x02
+#define IPFW_CFG_GET_COUNTERS 0x04
+typedef struct _ipfw_cfg_lheader {
+ ip_fw3_opheader opheader; /* IP_FW3 opcode */
+ uint32_t set_mask; /* enabled set mask */
+ uint32_t spare;
+ uint32_t flags; /* Request flags */
+ uint32_t size; /* needed buffer size */
+ uint32_t start_rule;
+ uint32_t end_rule;
+} ipfw_cfg_lheader;
+
+typedef struct _ipfw_range_header {
+ ip_fw3_opheader opheader; /* IP_FW3 opcode */
+ ipfw_range_tlv range;
+} ipfw_range_header;
+
+typedef struct _ipfw_sopt_info {
+ uint16_t opcode;
+ uint8_t version;
+ uint8_t dir;
+ uint8_t spare;
+ uint64_t refcnt;
+} ipfw_sopt_info;
+
#endif /* _IPFW2_H */
diff --git a/freebsd/sys/netinet/ip_gre.c b/freebsd/sys/netinet/ip_gre.c
index 9289be96..36d3ed69 100644
--- a/freebsd/sys/netinet/ip_gre.c
+++ b/freebsd/sys/netinet/ip_gre.c
@@ -1,9 +1,8 @@
#include <machine/rtems-bsd-kernel-space.h>
-/* $NetBSD: ip_gre.c,v 1.29 2003/09/05 23:02:43 itojun Exp $ */
-
/*-
* Copyright (c) 1998 The NetBSD Foundation, Inc.
+ * Copyright (c) 2014 Andrey V. Elsukov <ae@FreeBSD.org>
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
@@ -31,19 +30,14 @@
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
- */
-
-/*
- * deencapsulate tunneled packets and send them on
- * output half is in net/if_gre.[ch]
- * This currently handles IPPROTO_GRE, IPPROTO_MOBILE
+ *
+ * $NetBSD: ip_gre.c,v 1.29 2003/09/05 23:02:43 itojun Exp $
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <rtems/bsd/local/opt_inet.h>
-#include <rtems/bsd/local/opt_atalk.h>
#include <rtems/bsd/local/opt_inet6.h>
#include <rtems/bsd/sys/param.h>
@@ -55,285 +49,121 @@ __FBSDID("$FreeBSD$");
#include <rtems/bsd/sys/errno.h>
#include <sys/time.h>
#include <sys/kernel.h>
-#include <sys/syslog.h>
-#include <net/bpf.h>
+#include <rtems/bsd/sys/lock.h>
+#include <sys/rmlock.h>
+#include <sys/sysctl.h>
#include <net/ethernet.h>
#include <net/if.h>
-#include <net/netisr.h>
-#include <net/route.h>
-#include <net/raw_cb.h>
+#include <net/if_var.h>
+#include <net/vnet.h>
-#ifdef INET
#include <netinet/in.h>
#include <netinet/in_var.h>
-#include <netinet/in_systm.h>
#include <netinet/ip.h>
+#include <netinet/ip_encap.h>
#include <netinet/ip_var.h>
-#include <netinet/ip_gre.h>
-#include <machine/in_cksum.h>
-#else
-#error ip_gre input without IP?
-#endif
-#ifdef NETATALK
-#include <netatalk/at.h>
-#include <netatalk/at_var.h>
-#include <netatalk/at_extern.h>
+#ifdef INET6
+#include <netinet/ip6.h>
#endif
-/* Needs IP headers. */
#include <net/if_gre.h>
-#include <machine/stdarg.h>
-
-#if 1
-void gre_inet_ntoa(struct in_addr in); /* XXX */
-#endif
-
-static struct gre_softc *gre_lookup(struct mbuf *, u_int8_t);
-
-static struct mbuf *gre_input2(struct mbuf *, int, u_char);
-
-/*
- * De-encapsulate a packet and feed it back through ip input (this
- * routine is called whenever IP gets a packet with proto type
- * IPPROTO_GRE and a local destination address).
- * This really is simple
- */
-void
-gre_input(struct mbuf *m, int off)
+extern struct domain inetdomain;
+/*
+ * Protocol switch entry passed to encap_attach_func() by in_gre_attach().
+ * Input is handled by gre_input(); output, control input, control output
+ * and user requests reuse the raw IP (rip_*) handlers.
+ */
+static const struct protosw in_gre_protosw = {
+ .pr_type = SOCK_RAW,
+ .pr_domain = &inetdomain,
+ .pr_protocol = IPPROTO_GRE,
+ .pr_flags = PR_ATOMIC|PR_ADDR,
+ .pr_input = gre_input,
+ .pr_output = rip_output,
+ .pr_ctlinput = rip_ctlinput,
+ .pr_ctloutput = rip_ctloutput,
+ .pr_usrreqs = &rip_usrreqs
+};
+
+/*
+ * Per-VNET TTL that in_gre_output() writes into the outer IPv4 header of
+ * every encapsulated packet; tunable through net.inet.ip.grettl.
+ */
+#define GRE_TTL 30
+VNET_DEFINE(int, ip_gre_ttl) = GRE_TTL;
+#define V_ip_gre_ttl VNET(ip_gre_ttl)
+SYSCTL_INT(_net_inet_ip, OID_AUTO, grettl, CTLFLAG_VNET | CTLFLAG_RW,
+ &VNET_NAME(ip_gre_ttl), 0, "Default TTL value for encapsulated packets");
+
+static int
+in_gre_encapcheck(const struct mbuf *m, int off, int proto, void *arg)
{
- int proto;
-
- proto = (mtod(m, struct ip *))->ip_p;
+ GRE_RLOCK_TRACKER;
+ struct gre_softc *sc;
+ struct ip *ip;
- m = gre_input2(m, off, proto);
+ sc = (struct gre_softc *)arg;
+ if ((GRE2IFP(sc)->if_flags & IFF_UP) == 0)
+ return (0);
+ M_ASSERTPKTHDR(m);
/*
- * If no matching tunnel that is up is found. We inject
- * the mbuf to raw ip socket to see if anyone picks it up.
+ * We expect that payload contains at least IPv4
+ * or IPv6 packet.
*/
- if (m != NULL)
- rip_input(m, off);
-}
-
-/*
- * Decapsulate. Does the real work and is called from gre_input()
- * (above). Returns an mbuf back if packet is not yet processed,
- * and NULL if it needs no further processing. proto is the protocol
- * number of the "calling" foo_input() routine.
- */
-static struct mbuf *
-gre_input2(struct mbuf *m ,int hlen, u_char proto)
-{
- struct greip *gip;
- int isr;
- struct gre_softc *sc;
- u_int16_t flags;
- u_int32_t af;
-
- if ((sc = gre_lookup(m, proto)) == NULL) {
- /* No matching tunnel or tunnel is down. */
- return (m);
- }
-
- if (m->m_len < sizeof(*gip)) {
- m = m_pullup(m, sizeof(*gip));
- if (m == NULL)
- return (NULL);
- }
- gip = mtod(m, struct greip *);
-
- GRE2IFP(sc)->if_ipackets++;
- GRE2IFP(sc)->if_ibytes += m->m_pkthdr.len;
-
- switch (proto) {
- case IPPROTO_GRE:
- hlen += sizeof(struct gre_h);
-
- /* process GRE flags as packet can be of variable len */
- flags = ntohs(gip->gi_flags);
-
- /* Checksum & Offset are present */
- if ((flags & GRE_CP) | (flags & GRE_RP))
- hlen += 4;
- /* We don't support routing fields (variable length) */
- if (flags & GRE_RP)
- return (m);
- if (flags & GRE_KP)
- hlen += 4;
- if (flags & GRE_SP)
- hlen += 4;
-
- switch (ntohs(gip->gi_ptype)) { /* ethertypes */
- case WCCP_PROTOCOL_TYPE:
- if (sc->wccp_ver == WCCP_V2)
- hlen += 4;
- /* FALLTHROUGH */
- case ETHERTYPE_IP: /* shouldn't need a schednetisr(), */
- isr = NETISR_IP;/* as we are in ip_input */
- af = AF_INET;
- break;
-#ifdef INET6
- case ETHERTYPE_IPV6:
- isr = NETISR_IPV6;
- af = AF_INET6;
- break;
-#endif
-#ifdef NETATALK
- case ETHERTYPE_ATALK:
- isr = NETISR_ATALK1;
- af = AF_APPLETALK;
- break;
-#endif
- default:
- /* Others not yet supported. */
- return (m);
- }
- break;
- default:
- /* Others not yet supported. */
- return (m);
- }
-
- if (hlen > m->m_pkthdr.len) {
- m_freem(m);
- return (NULL);
- }
- /* Unlike NetBSD, in FreeBSD m_adj() adjusts m->m_pkthdr.len as well */
- m_adj(m, hlen);
-
- if (bpf_peers_present(GRE2IFP(sc)->if_bpf)) {
- bpf_mtap2(GRE2IFP(sc)->if_bpf, &af, sizeof(af), m);
- }
+ if (m->m_pkthdr.len < sizeof(struct greip) + sizeof(struct ip))
+ return (0);
- if ((GRE2IFP(sc)->if_flags & IFF_MONITOR) != 0) {
- m_freem(m);
- return(NULL);
- }
-
- m->m_pkthdr.rcvif = GRE2IFP(sc);
+ GRE_RLOCK(sc);
+ if (sc->gre_family == 0)
+ goto bad;
- netisr_queue(isr, m);
+ KASSERT(sc->gre_family == AF_INET,
+ ("wrong gre_family: %d", sc->gre_family));
- /* Packet is done, no further processing needed. */
- return (NULL);
+ ip = mtod(m, struct ip *);
+ if (sc->gre_oip.ip_src.s_addr != ip->ip_dst.s_addr ||
+ sc->gre_oip.ip_dst.s_addr != ip->ip_src.s_addr)
+ goto bad;
+
+ GRE_RUNLOCK(sc);
+ return (32 * 2);
+bad:
+ GRE_RUNLOCK(sc);
+ return (0);
}
-/*
- * input routine for IPPRPOTO_MOBILE
- * This is a little bit diffrent from the other modes, as the
- * encapsulating header was not prepended, but instead inserted
- * between IP header and payload
- */
-
-void
-gre_mobile_input(struct mbuf *m, int hlen)
+int
+in_gre_output(struct mbuf *m, int af, int hlen)
{
- struct ip *ip;
- struct mobip_h *mip;
- struct gre_softc *sc;
- int msiz;
-
- if ((sc = gre_lookup(m, IPPROTO_MOBILE)) == NULL) {
- /* No matching tunnel or tunnel is down. */
- m_freem(m);
- return;
- }
-
- if (m->m_len < sizeof(*mip)) {
- m = m_pullup(m, sizeof(*mip));
- if (m == NULL)
- return;
- }
- ip = mtod(m, struct ip *);
- mip = mtod(m, struct mobip_h *);
-
- GRE2IFP(sc)->if_ipackets++;
- GRE2IFP(sc)->if_ibytes += m->m_pkthdr.len;
-
- if (ntohs(mip->mh.proto) & MOB_H_SBIT) {
- msiz = MOB_H_SIZ_L;
- mip->mi.ip_src.s_addr = mip->mh.osrc;
- } else
- msiz = MOB_H_SIZ_S;
-
- if (m->m_len < (ip->ip_hl << 2) + msiz) {
- m = m_pullup(m, (ip->ip_hl << 2) + msiz);
- if (m == NULL)
- return;
- ip = mtod(m, struct ip *);
- mip = mtod(m, struct mobip_h *);
- }
-
- mip->mi.ip_dst.s_addr = mip->mh.odst;
- mip->mi.ip_p = (ntohs(mip->mh.proto) >> 8);
-
- if (gre_in_cksum((u_int16_t *)&mip->mh, msiz) != 0) {
- m_freem(m);
- return;
- }
-
- bcopy((caddr_t)(ip) + (ip->ip_hl << 2) + msiz, (caddr_t)(ip) +
- (ip->ip_hl << 2), m->m_len - msiz - (ip->ip_hl << 2));
- m->m_len -= msiz;
- m->m_pkthdr.len -= msiz;
-
- /*
- * On FreeBSD, rip_input() supplies us with ip->ip_len
- * already converted into host byteorder and also decreases
- * it by the lengh of IP header, however, ip_input() expects
- * that this field is in the original format (network byteorder
- * and full size of IP packet), so that adjust accordingly.
- */
- ip->ip_len = htons(ip->ip_len + sizeof(struct ip) - msiz);
-
- ip->ip_sum = 0;
- ip->ip_sum = in_cksum(m, (ip->ip_hl << 2));
-
- if (bpf_peers_present(GRE2IFP(sc)->if_bpf)) {
- u_int32_t af = AF_INET;
- bpf_mtap2(GRE2IFP(sc)->if_bpf, &af, sizeof(af), m);
- }
-
- if ((GRE2IFP(sc)->if_flags & IFF_MONITOR) != 0) {
- m_freem(m);
- return;
+ struct greip *gi;
+
+ gi = mtod(m, struct greip *);
+ switch (af) {
+ case AF_INET:
+ /*
+ * gre_transmit() has used M_PREPEND() that doesn't guarantee
+ * m_data is contiguous more than hlen bytes. Use m_copydata()
+ * here to avoid m_pullup().
+ */
+ m_copydata(m, hlen + offsetof(struct ip, ip_tos),
+ sizeof(u_char), &gi->gi_ip.ip_tos);
+ m_copydata(m, hlen + offsetof(struct ip, ip_id),
+ sizeof(u_short), (caddr_t)&gi->gi_ip.ip_id);
+ break;
+#ifdef INET6
+ case AF_INET6:
+ gi->gi_ip.ip_tos = 0; /* XXX */
+ ip_fillid(&gi->gi_ip);
+ break;
+#endif
}
-
- m->m_pkthdr.rcvif = GRE2IFP(sc);
-
- netisr_queue(NETISR_IP, m);
+ gi->gi_ip.ip_ttl = V_ip_gre_ttl;
+ gi->gi_ip.ip_len = htons(m->m_pkthdr.len);
+ return (ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL));
}
-/*
- * Find the gre interface associated with our src/dst/proto set.
- *
- * XXXRW: Need some sort of drain/refcount mechanism so that the softc
- * reference remains valid after it's returned from gre_lookup(). Right
- * now, I'm thinking it should be reference-counted with a gre_dropref()
- * when the caller is done with the softc. This is complicated by how
- * to handle destroying the gre softc; probably using a gre_drain() in
- * in_gre.c during destroy.
- */
-static struct gre_softc *
-gre_lookup(struct mbuf *m, u_int8_t proto)
+int
+in_gre_attach(struct gre_softc *sc)
{
- struct ip *ip = mtod(m, struct ip *);
- struct gre_softc *sc;
-
- mtx_lock(&gre_mtx);
- for (sc = LIST_FIRST(&gre_softc_list); sc != NULL;
- sc = LIST_NEXT(sc, sc_list)) {
- if ((sc->g_dst.s_addr == ip->ip_src.s_addr) &&
- (sc->g_src.s_addr == ip->ip_dst.s_addr) &&
- (sc->g_proto == proto) &&
- ((GRE2IFP(sc)->if_flags & IFF_UP) != 0)) {
- mtx_unlock(&gre_mtx);
- return (sc);
- }
- }
- mtx_unlock(&gre_mtx);
- return (NULL);
+ KASSERT(sc->gre_ecookie == NULL, ("gre_ecookie isn't NULL"));
+ sc->gre_ecookie = encap_attach_func(AF_INET, IPPROTO_GRE,
+ in_gre_encapcheck, &in_gre_protosw, sc);
+ if (sc->gre_ecookie == NULL)
+ return (EEXIST);
+ return (0);
}
diff --git a/freebsd/sys/netinet/ip_icmp.c b/freebsd/sys/netinet/ip_icmp.c
index cd581948..f34cc4bd 100644
--- a/freebsd/sys/netinet/ip_icmp.c
+++ b/freebsd/sys/netinet/ip_icmp.c
@@ -35,7 +35,6 @@
__FBSDID("$FreeBSD$");
#include <rtems/bsd/local/opt_inet.h>
-#include <rtems/bsd/local/opt_ipsec.h>
#include <rtems/bsd/sys/param.h>
#include <sys/systm.h>
@@ -44,15 +43,19 @@ __FBSDID("$FreeBSD$");
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/kernel.h>
+#include <rtems/bsd/sys/lock.h>
+#include <sys/rmlock.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <net/if.h>
+#include <net/if_var.h>
#include <net/if_types.h>
#include <net/route.h>
#include <net/vnet.h>
#include <netinet/in.h>
+#include <netinet/in_fib.h>
#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
@@ -60,16 +63,13 @@ __FBSDID("$FreeBSD$");
#include <netinet/ip_icmp.h>
#include <netinet/ip_var.h>
#include <netinet/ip_options.h>
+#include <netinet/sctp.h>
#include <netinet/tcp.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/icmp_var.h>
#ifdef INET
-#ifdef IPSEC
-#include <netipsec/ipsec.h>
-#include <netipsec/key.h>
-#endif
#include <machine/in_cksum.h>
@@ -83,68 +83,79 @@ __FBSDID("$FreeBSD$");
*/
static VNET_DEFINE(int, icmplim) = 200;
#define V_icmplim VNET(icmplim)
-SYSCTL_VNET_INT(_net_inet_icmp, ICMPCTL_ICMPLIM, icmplim, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_icmp, ICMPCTL_ICMPLIM, icmplim, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(icmplim), 0,
"Maximum number of ICMP responses per second");
static VNET_DEFINE(int, icmplim_output) = 1;
#define V_icmplim_output VNET(icmplim_output)
-SYSCTL_VNET_INT(_net_inet_icmp, OID_AUTO, icmplim_output, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_icmp, OID_AUTO, icmplim_output, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(icmplim_output), 0,
- "Enable rate limiting of ICMP responses");
+ "Enable logging of ICMP response rate limiting");
#ifdef INET
-VNET_DEFINE(struct icmpstat, icmpstat);
-SYSCTL_VNET_STRUCT(_net_inet_icmp, ICMPCTL_STATS, stats, CTLFLAG_RW,
- &VNET_NAME(icmpstat), icmpstat, "");
+VNET_PCPUSTAT_DEFINE(struct icmpstat, icmpstat);
+VNET_PCPUSTAT_SYSINIT(icmpstat);
+SYSCTL_VNET_PCPUSTAT(_net_inet_icmp, ICMPCTL_STATS, stats, struct icmpstat,
+ icmpstat, "ICMP statistics (struct icmpstat, netinet/icmp_var.h)");
+
+#ifdef VIMAGE
+VNET_PCPUSTAT_SYSUNINIT(icmpstat);
+#endif /* VIMAGE */
static VNET_DEFINE(int, icmpmaskrepl) = 0;
#define V_icmpmaskrepl VNET(icmpmaskrepl)
-SYSCTL_VNET_INT(_net_inet_icmp, ICMPCTL_MASKREPL, maskrepl, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_icmp, ICMPCTL_MASKREPL, maskrepl, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(icmpmaskrepl), 0,
- "Reply to ICMP Address Mask Request packets.");
+ "Reply to ICMP Address Mask Request packets");
static VNET_DEFINE(u_int, icmpmaskfake) = 0;
#define V_icmpmaskfake VNET(icmpmaskfake)
-SYSCTL_VNET_UINT(_net_inet_icmp, OID_AUTO, maskfake, CTLFLAG_RW,
+SYSCTL_UINT(_net_inet_icmp, OID_AUTO, maskfake, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(icmpmaskfake), 0,
- "Fake reply to ICMP Address Mask Request packets.");
+ "Fake reply to ICMP Address Mask Request packets");
VNET_DEFINE(int, drop_redirect) = 0;
+#define V_drop_redirect VNET(drop_redirect)
+SYSCTL_INT(_net_inet_icmp, OID_AUTO, drop_redirect, CTLFLAG_VNET | CTLFLAG_RW,
+ &VNET_NAME(drop_redirect), 0,
+ "Ignore ICMP redirects");
static VNET_DEFINE(int, log_redirect) = 0;
#define V_log_redirect VNET(log_redirect)
-SYSCTL_VNET_INT(_net_inet_icmp, OID_AUTO, log_redirect, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_icmp, OID_AUTO, log_redirect, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(log_redirect), 0,
"Log ICMP redirects to the console");
static VNET_DEFINE(char, reply_src[IFNAMSIZ]);
#define V_reply_src VNET(reply_src)
-SYSCTL_VNET_STRING(_net_inet_icmp, OID_AUTO, reply_src, CTLFLAG_RW,
+SYSCTL_STRING(_net_inet_icmp, OID_AUTO, reply_src, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(reply_src), IFNAMSIZ,
- "icmp reply source for non-local packets.");
+ "ICMP reply source for non-local packets");
static VNET_DEFINE(int, icmp_rfi) = 0;
#define V_icmp_rfi VNET(icmp_rfi)
-SYSCTL_VNET_INT(_net_inet_icmp, OID_AUTO, reply_from_interface, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_icmp, OID_AUTO, reply_from_interface, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(icmp_rfi), 0,
"ICMP reply from incoming interface for non-local packets");
static VNET_DEFINE(int, icmp_quotelen) = 8;
#define V_icmp_quotelen VNET(icmp_quotelen)
-SYSCTL_VNET_INT(_net_inet_icmp, OID_AUTO, quotelen, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_icmp, OID_AUTO, quotelen, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(icmp_quotelen), 0,
"Number of bytes from original packet to quote in ICMP reply");
-/*
- * ICMP broadcast echo sysctl
- */
static VNET_DEFINE(int, icmpbmcastecho) = 0;
#define V_icmpbmcastecho VNET(icmpbmcastecho)
-SYSCTL_VNET_INT(_net_inet_icmp, OID_AUTO, bmcastecho, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_icmp, OID_AUTO, bmcastecho, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(icmpbmcastecho), 0,
- "");
+ "Reply to multicast ICMP Echo Request and Timestamp packets");
+/*
+ * When zero, icmp_input() does not reflect a reply to ICMP_TSTAMP requests.
+ * The variable is per-VNET (VNET_DEFINE/VNET_NAME), so the sysctl must carry
+ * CTLFLAG_VNET like the other per-VNET ICMP knobs in this file.
+ */
+static VNET_DEFINE(int, icmptstamprepl) = 1;
+#define V_icmptstamprepl VNET(icmptstamprepl)
+SYSCTL_INT(_net_inet_icmp, OID_AUTO, tstamprepl, CTLFLAG_VNET | CTLFLAG_RW,
+ &VNET_NAME(icmptstamprepl), 0,
+ "Respond to ICMP Timestamp packets");
#ifdef ICMPPRINTFS
int icmpprintfs = 0;
@@ -155,39 +166,6 @@ static void icmp_send(struct mbuf *, struct mbuf *);
extern struct protosw inetsw[];
-static int
-sysctl_net_icmp_drop_redir(SYSCTL_HANDLER_ARGS)
-{
- int error, new;
- int i;
- struct radix_node_head *rnh;
-
- new = V_drop_redirect;
- error = sysctl_handle_int(oidp, &new, 0, req);
- if (error == 0 && req->newptr) {
- new = (new != 0) ? 1 : 0;
-
- if (new == V_drop_redirect)
- return (0);
-
- for (i = 0; i < rt_numfibs; i++) {
- if ((rnh = rt_tables_get_rnh(i, AF_INET)) == NULL)
- continue;
- RADIX_NODE_HEAD_LOCK(rnh);
- in_setmatchfunc(rnh, new);
- RADIX_NODE_HEAD_UNLOCK(rnh);
- }
-
- V_drop_redirect = new;
- }
-
- return (error);
-}
-
-SYSCTL_VNET_PROC(_net_inet_icmp, OID_AUTO, drop_redirect,
- CTLTYPE_INT|CTLFLAG_RW, 0, 0,
- sysctl_net_icmp_drop_redir, "I", "Ignore ICMP redirects");
-
/*
* Kernel module interface for updating icmpstat. The argument is an index
* into icmpstat treated as an array of u_long. While this encodes the
@@ -199,7 +177,7 @@ void
kmod_icmpstat_inc(int statnum)
{
- (*((u_long *)&V_icmpstat + statnum))++;
+ counter_u64_add(VNET(icmpstat)[statnum], 1);
}
/*
@@ -231,7 +209,7 @@ icmp_error(struct mbuf *n, int type, int code, uint32_t dest, int mtu)
*/
if (n->m_flags & M_DECRYPTED)
goto freeit;
- if (oip->ip_off & ~(IP_MF|IP_DF))
+ if (oip->ip_off & htons(~(IP_MF|IP_DF)))
goto freeit;
if (n->m_flags & (M_BCAST|M_MCAST))
goto freeit;
@@ -247,7 +225,7 @@ icmp_error(struct mbuf *n, int type, int code, uint32_t dest, int mtu)
/*
* Calculate length to quote from original packet and
* prevent the ICMP mbuf from overflowing.
- * Unfortunatly this is non-trivial since ip_forward()
+ * Unfortunately this is non-trivial since ip_forward()
* sends us truncated packets.
*/
nlen = m_length(n, NULL);
@@ -265,25 +243,54 @@ icmp_error(struct mbuf *n, int type, int code, uint32_t dest, int mtu)
tcphlen = th->th_off << 2;
if (tcphlen < sizeof(struct tcphdr))
goto freeit;
- if (oip->ip_len < oiphlen + tcphlen)
+ if (ntohs(oip->ip_len) < oiphlen + tcphlen)
goto freeit;
if (oiphlen + tcphlen > n->m_len && n->m_next == NULL)
goto stdreply;
if (n->m_len < oiphlen + tcphlen &&
((n = m_pullup(n, oiphlen + tcphlen)) == NULL))
goto freeit;
- icmpelen = max(tcphlen, min(V_icmp_quotelen, oip->ip_len - oiphlen));
+ /*
+ * m_pullup() may have replaced the leading mbuf, so the old
+ * header pointer can be dangling; re-derive it from n.
+ */
+ oip = mtod(n, struct ip *);
+ icmpelen = max(tcphlen, min(V_icmp_quotelen,
+ ntohs(oip->ip_len) - oiphlen));
+ } else if (oip->ip_p == IPPROTO_SCTP) {
+ struct sctphdr *sh;
+ struct sctp_chunkhdr *ch;
+
+ if (ntohs(oip->ip_len) < oiphlen + sizeof(struct sctphdr))
+ goto stdreply;
+ if (oiphlen + sizeof(struct sctphdr) > n->m_len &&
+ n->m_next == NULL)
+ goto stdreply;
+ if (n->m_len < oiphlen + sizeof(struct sctphdr) &&
+ (n = m_pullup(n, oiphlen + sizeof(struct sctphdr))) == NULL)
+ goto freeit;
+ /* Refresh oip: m_pullup() may have moved the header. */
+ oip = mtod(n, struct ip *);
+ icmpelen = max(sizeof(struct sctphdr),
+ min(V_icmp_quotelen, ntohs(oip->ip_len) - oiphlen));
+ sh = (struct sctphdr *)((caddr_t)oip + oiphlen);
+ if (ntohl(sh->v_tag) == 0 &&
+ ntohs(oip->ip_len) >= oiphlen + sizeof(struct sctphdr) + 8 &&
+ (n->m_len >= oiphlen + sizeof(struct sctphdr) + 8 ||
+ n->m_next != NULL)) {
+ if (n->m_len < oiphlen + sizeof(struct sctphdr) + 8 &&
+ (n = m_pullup(n, oiphlen + sizeof(struct sctphdr) + 8)) == NULL)
+ goto freeit;
+ /* Second pullup: both oip and sh must be re-derived. */
+ oip = mtod(n, struct ip *);
+ sh = (struct sctphdr *)((caddr_t)oip + oiphlen);
+ ch = (struct sctp_chunkhdr *)(sh + 1);
+ if (ch->chunk_type == SCTP_INITIATION) {
+ icmpelen = max(sizeof(struct sctphdr) + 8,
+ min(V_icmp_quotelen, ntohs(oip->ip_len) - oiphlen));
+ }
+ }
} else
-stdreply: icmpelen = max(8, min(V_icmp_quotelen, oip->ip_len - oiphlen));
+stdreply: icmpelen = max(8, min(V_icmp_quotelen, ntohs(oip->ip_len) - oiphlen));
icmplen = min(oiphlen + icmpelen, nlen);
if (icmplen < sizeof(struct ip))
goto freeit;
if (MHLEN > sizeof(struct ip) + ICMP_MINLEN + icmplen)
- m = m_gethdr(M_DONTWAIT, MT_DATA);
+ m = m_gethdr(M_NOWAIT, MT_DATA);
else
- m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
+ m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
if (m == NULL)
goto freeit;
#ifdef MAC
@@ -324,8 +331,6 @@ stdreply: icmpelen = max(8, min(V_icmp_quotelen, oip->ip_len - oiphlen));
*/
m_copydata(n, 0, icmplen, (caddr_t)&icp->icmp_ip);
nip = &icp->icmp_ip;
- nip->ip_len = htons(nip->ip_len);
- nip->ip_off = htons(nip->ip_off);
/*
* Set up ICMP message mbuf and copy old IP header (without options
@@ -340,7 +345,7 @@ stdreply: icmpelen = max(8, min(V_icmp_quotelen, oip->ip_len - oiphlen));
m->m_pkthdr.rcvif = n->m_pkthdr.rcvif;
nip = mtod(m, struct ip *);
bcopy((caddr_t)oip, (caddr_t)nip, sizeof(struct ip));
- nip->ip_len = m->m_len;
+ nip->ip_len = htons(m->m_len);
nip->ip_v = IPVERSION;
nip->ip_hl = 5;
nip->ip_p = IPPROTO_ICMP;
@@ -355,19 +360,22 @@ freeit:
/*
* Process a received ICMP message.
*/
-void
-icmp_input(struct mbuf *m, int off)
+int
+icmp_input(struct mbuf **mp, int *offp, int proto)
{
struct icmp *icp;
struct in_ifaddr *ia;
+ struct mbuf *m = *mp;
struct ip *ip = mtod(m, struct ip *);
struct sockaddr_in icmpsrc, icmpdst, icmpgw;
- int hlen = off;
- int icmplen = ip->ip_len;
+ int hlen = *offp;
+ int icmplen = ntohs(ip->ip_len) - *offp;
int i, code;
void (*ctlfunc)(int, struct sockaddr *, void *);
int fibnum;
+ *mp = NULL;
+
/*
* Locate icmp structure in mbuf, and check
* that not corrupted and of at least minimum length.
@@ -387,7 +395,7 @@ icmp_input(struct mbuf *m, int off)
i = hlen + min(icmplen, ICMP_ADVLENMIN);
if (m->m_len < i && (m = m_pullup(m, i)) == NULL) {
ICMPSTAT_INC(icps_tooshort);
- return;
+ return (IPPROTO_DONE);
}
ip = mtod(m, struct ip *);
m->m_len -= hlen;
@@ -400,19 +408,6 @@ icmp_input(struct mbuf *m, int off)
m->m_len += hlen;
m->m_data -= hlen;
- if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_type == IFT_FAITH) {
- /*
- * Deliver very specific ICMP type only.
- */
- switch (icp->icmp_type) {
- case ICMP_UNREACH:
- case ICMP_TIMXCEED:
- break;
- default:
- goto freeit;
- }
- }
-
#ifdef ICMPPRINTFS
if (icmpprintfs)
printf("icmp_input, type %d code %d\n", icp->icmp_type,
@@ -489,12 +484,6 @@ icmp_input(struct mbuf *m, int off)
if (code > 1)
goto badcode;
code = PRC_PARAMPROB;
- goto deliver;
-
- case ICMP_SOURCEQUENCH:
- if (code)
- goto badcode;
- code = PRC_QUENCH;
deliver:
/*
* Problem with datagram; advise higher level routines.
@@ -504,7 +493,6 @@ icmp_input(struct mbuf *m, int off)
ICMPSTAT_INC(icps_badlen);
goto freeit;
}
- icp->icmp_ip.ip_len = ntohs(icp->icmp_ip.ip_len);
/* Discard ICMP's in response to multicast packets */
if (IN_MULTICAST(ntohl(icp->icmp_ip.ip_dst.s_addr)))
goto badcode;
@@ -517,6 +505,23 @@ icmp_input(struct mbuf *m, int off)
* XXX if the packet contains [IPv4 AH TCP], we can't make a
* notification to TCP layer.
*/
+ i = sizeof(struct ip) + min(icmplen, ICMP_ADVLENPREF(icp));
+ ip_stripoptions(m);
+ if (m->m_len < i && (m = m_pullup(m, i)) == NULL) {
+ /* This should actually not happen */
+ ICMPSTAT_INC(icps_tooshort);
+ return (IPPROTO_DONE);
+ }
+ ip = mtod(m, struct ip *);
+ icp = (struct icmp *)(ip + 1);
+ /*
+ * The upper layer handler can rely on:
+ * - The outer IP header has no options.
+ * - The outer IP header, the ICMP header, the inner IP header,
+ * and the first n bytes of the inner payload are contiguous.
+ * n is at least 8, but might be larger based on
+ * ICMP_ADVLENPREF. See its definition in ip_icmp.h.
+ */
ctlfunc = inetsw[ip_protox[icp->icmp_ip.ip_p]].pr_ctlinput;
if (ctlfunc)
(*ctlfunc)(code, (struct sockaddr *)&icmpsrc,
@@ -540,6 +545,8 @@ icmp_input(struct mbuf *m, int off)
goto reflect;
case ICMP_TSTAMP:
+ if (V_icmptstamprepl == 0)
+ break;
if (!V_icmpbmcastecho
&& (m->m_flags & (M_MCAST | M_BCAST)) != 0) {
ICMPSTAT_INC(icps_bmcasttstamp);
@@ -597,11 +604,10 @@ icmp_input(struct mbuf *m, int off)
}
ifa_free(&ia->ia_ifa);
reflect:
- ip->ip_len += hlen; /* since ip_input deducts this */
ICMPSTAT_INC(icps_reflect);
ICMPSTAT_INC(icps_outhist[icp->icmp_type]);
icmp_reflect(m);
- return;
+ return (IPPROTO_DONE);
case ICMP_REDIRECT:
if (V_log_redirect) {
@@ -658,9 +664,6 @@ reflect:
(struct sockaddr *)&icmpgw, fibnum);
}
pfctlinput(PRC_REDIRECT_HOST, (struct sockaddr *)&icmpsrc);
-#ifdef IPSEC
- key_sa_routechange((struct sockaddr *)&icmpsrc);
-#endif
break;
/*
@@ -673,16 +676,19 @@ reflect:
case ICMP_TSTAMPREPLY:
case ICMP_IREQREPLY:
case ICMP_MASKREPLY:
+ case ICMP_SOURCEQUENCH:
default:
break;
}
raw:
- rip_input(m, off);
- return;
+ *mp = m;
+ rip_input(mp, offp, proto);
+ return (IPPROTO_DONE);
freeit:
m_freem(m);
+ return (IPPROTO_DONE);
}
/*
@@ -691,12 +697,14 @@ freeit:
static void
icmp_reflect(struct mbuf *m)
{
+ struct rm_priotracker in_ifa_tracker;
struct ip *ip = mtod(m, struct ip *);
struct ifaddr *ifa;
struct ifnet *ifp;
struct in_ifaddr *ia;
struct in_addr t;
- struct mbuf *opts = 0;
+ struct nhop4_extended nh_ext;
+ struct mbuf *opts = NULL;
int optlen = (ip->ip_hl << 2) - sizeof(struct ip);
if (IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
@@ -707,8 +715,6 @@ icmp_reflect(struct mbuf *m)
goto done; /* Ip_output() will check for broadcast */
}
- m_addr_changed(m);
-
t = ip->ip_dst;
ip->ip_dst = ip->ip_src;
@@ -718,15 +724,15 @@ icmp_reflect(struct mbuf *m)
* If the incoming packet was addressed directly to one of our
* own addresses, use dst as the src for the reply.
*/
- IN_IFADDR_RLOCK();
+ IN_IFADDR_RLOCK(&in_ifa_tracker);
LIST_FOREACH(ia, INADDR_HASH(t.s_addr), ia_hash) {
if (t.s_addr == IA_SIN(ia)->sin_addr.s_addr) {
t = IA_SIN(ia)->sin_addr;
- IN_IFADDR_RUNLOCK();
+ IN_IFADDR_RUNLOCK(&in_ifa_tracker);
goto match;
}
}
- IN_IFADDR_RUNLOCK();
+ IN_IFADDR_RUNLOCK(&in_ifa_tracker);
/*
* If the incoming packet was addressed to one of our broadcast
@@ -791,14 +797,12 @@ icmp_reflect(struct mbuf *m)
* When we don't have a route back to the packet source, stop here
* and drop the packet.
*/
- ia = ip_rtaddr(ip->ip_dst, M_GETFIB(m));
- if (ia == NULL) {
+ if (fib4_lookup_nh_ext(M_GETFIB(m), ip->ip_dst, 0, 0, &nh_ext) != 0) {
m_freem(m);
ICMPSTAT_INC(icps_noroute);
goto done;
}
- t = IA_SIN(ia)->sin_addr;
- ifa_free(&ia->ia_ifa);
+ t = nh_ext.nh_src;
match:
#ifdef MAC
mac_netinet_icmp_replyinplace(m);
@@ -816,8 +820,8 @@ match:
* add on any record-route or timestamp options.
*/
cp = (u_char *) (ip + 1);
- if ((opts = ip_srcroute(m)) == 0 &&
- (opts = m_gethdr(M_DONTWAIT, MT_DATA))) {
+ if ((opts = ip_srcroute(m)) == NULL &&
+ (opts = m_gethdr(M_NOWAIT, MT_DATA))) {
opts->m_len = sizeof(struct in_addr);
mtod(opts, struct in_addr *)->s_addr = 0;
}
@@ -865,19 +869,7 @@ match:
printf("%d\n", opts->m_len);
#endif
}
- /*
- * Now strip out original options by copying rest of first
- * mbuf's data back, and adjust the IP length.
- */
- ip->ip_len -= optlen;
- ip->ip_v = IPVERSION;
- ip->ip_hl = 5;
- m->m_len -= optlen;
- if (m->m_flags & M_PKTHDR)
- m->m_pkthdr.len -= optlen;
- optlen += sizeof(struct ip);
- bcopy((caddr_t)ip + optlen, (caddr_t)(ip + 1),
- (unsigned)(m->m_len - sizeof(struct ip)));
+ ip_stripoptions(m);
}
m_tag_delete_nonpersistent(m);
m->m_flags &= ~(M_BCAST|M_MCAST);
@@ -903,7 +895,7 @@ icmp_send(struct mbuf *m, struct mbuf *opts)
m->m_len -= hlen;
icp = mtod(m, struct icmp *);
icp->icmp_cksum = 0;
- icp->icmp_cksum = in_cksum(m, ip->ip_len - hlen);
+ icp->icmp_cksum = in_cksum(m, ntohs(ip->ip_len) - hlen);
m->m_data -= hlen;
m->m_len += hlen;
m->m_pkthdr.rcvif = (struct ifnet *)0;
@@ -919,7 +911,7 @@ icmp_send(struct mbuf *m, struct mbuf *opts)
}
/*
- * Return milliseconds since 00:00 GMT in network format.
+ * Return milliseconds since 00:00 UTC in network format.
*/
uint32_t
iptime(void)
diff --git a/freebsd/sys/netinet/ip_icmp.h b/freebsd/sys/netinet/ip_icmp.h
index 9cabdb58..64db0064 100644
--- a/freebsd/sys/netinet/ip_icmp.h
+++ b/freebsd/sys/netinet/ip_icmp.h
@@ -99,7 +99,7 @@ struct icmp {
struct id_ts { /* ICMP Timestamp */
/*
* The next 3 fields are in network format,
- * milliseconds since 00:00 GMT
+ * milliseconds since 00:00 UTC
*/
uint32_t its_otime; /* Originate */
uint32_t its_rtime; /* Receive */
@@ -136,6 +136,14 @@ struct icmp {
#define ICMP_ADVLENMIN (8 + sizeof (struct ip) + 8) /* min */
#define ICMP_ADVLEN(p) (8 + ((p)->icmp_ip.ip_hl << 2) + 8)
/* N.B.: must separately check that ip_hl >= 5 */
+ /* This is the minimum length required by RFC 792. */
+/*
+ * ICMP_ADVLENPREF is the preferred number of bytes which should be contiguous.
+ * SCTP needs additional 12 bytes to be able to access the initiate tag
+ * in packets containing an INIT chunk. For also supporting SCTP/UDP,
+ * additional 8 bytes are needed.
+ */
+#define ICMP_ADVLENPREF(p) (8 + ((p)->icmp_ip.ip_hl << 2) + 8 + 8 + 12)
/*
* Definition of type and code field values.
@@ -207,7 +215,7 @@ struct icmp {
#ifdef _KERNEL
void icmp_error(struct mbuf *, int, int, uint32_t, int);
-void icmp_input(struct mbuf *, int);
+int icmp_input(struct mbuf **, int *, int);
int ip_next_mtu(int, int);
#endif
diff --git a/freebsd/sys/netinet/ip_id.c b/freebsd/sys/netinet/ip_id.c
index a76c7b78..17352cfb 100644
--- a/freebsd/sys/netinet/ip_id.c
+++ b/freebsd/sys/netinet/ip_id.c
@@ -76,119 +76,149 @@ __FBSDID("$FreeBSD$");
* enabled.
*/
-#include <sys/types.h>
-#include <sys/malloc.h>
#include <rtems/bsd/sys/param.h>
-#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/counter.h>
#include <sys/kernel.h>
-#include <sys/libkern.h>
+#include <sys/malloc.h>
#include <rtems/bsd/sys/lock.h>
#include <sys/mutex.h>
#include <sys/random.h>
-#include <sys/systm.h>
+#include <sys/smp.h>
#include <sys/sysctl.h>
+#include <sys/bitstring.h>
+
+#include <net/vnet.h>
+
#include <netinet/in.h>
+#include <netinet/ip.h>
#include <netinet/ip_var.h>
-#include <sys/bitstring.h>
+/*
+ * By default we generate IP ID only for non-atomic datagrams, as
+ * suggested by RFC6864. We use per-CPU counter for that, or if
+ * user wants to, we can turn on random ID generation.
+ */
+static VNET_DEFINE(int, ip_rfc6864) = 1;
+static VNET_DEFINE(int, ip_do_randomid) = 0;
+#define V_ip_rfc6864 VNET(ip_rfc6864)
+#define V_ip_do_randomid VNET(ip_do_randomid)
+
+/*
+ * Random ID state engine.
+ */
static MALLOC_DEFINE(M_IPID, "ipid", "randomized ip id state");
+static VNET_DEFINE(uint16_t *, id_array);
+static VNET_DEFINE(bitstr_t *, id_bits);
+static VNET_DEFINE(int, array_ptr);
+static VNET_DEFINE(int, array_size);
+static VNET_DEFINE(int, random_id_collisions);
+static VNET_DEFINE(int, random_id_total);
+static VNET_DEFINE(struct mtx, ip_id_mtx);
+#define V_id_array VNET(id_array)
+#define V_id_bits VNET(id_bits)
+#define V_array_ptr VNET(array_ptr)
+#define V_array_size VNET(array_size)
+#define V_random_id_collisions VNET(random_id_collisions)
+#define V_random_id_total VNET(random_id_total)
+#define V_ip_id_mtx VNET(ip_id_mtx)
-static u_int16_t *id_array = NULL;
-static bitstr_t *id_bits = NULL;
-static int array_ptr = 0;
-static int array_size = 8192;
-static int random_id_collisions = 0;
-static int random_id_total = 0;
-static struct mtx ip_id_mtx;
+/*
+ * Non-random ID state engine is simply a per-cpu counter.
+ */
+static VNET_DEFINE(counter_u64_t, ip_id);
+#define V_ip_id VNET(ip_id)
-static void ip_initid(void);
+static int sysctl_ip_randomid(SYSCTL_HANDLER_ARGS);
static int sysctl_ip_id_change(SYSCTL_HANDLER_ARGS);
-
-MTX_SYSINIT(ip_id_mtx, &ip_id_mtx, "ip_id_mtx", MTX_DEF);
+static void ip_initid(int);
+static uint16_t ip_randomid(void);
+static void ipid_sysinit(void);
+static void ipid_sysuninit(void);
SYSCTL_DECL(_net_inet_ip);
-SYSCTL_PROC(_net_inet_ip, OID_AUTO, random_id_period, CTLTYPE_INT|CTLFLAG_RW,
- &array_size, 0, sysctl_ip_id_change, "IU", "IP ID Array size");
-SYSCTL_INT(_net_inet_ip, OID_AUTO, random_id_collisions, CTLFLAG_RD,
- &random_id_collisions, 0, "Count of IP ID collisions");
-SYSCTL_INT(_net_inet_ip, OID_AUTO, random_id_total, CTLFLAG_RD,
- &random_id_total, 0, "Count of IP IDs created");
+SYSCTL_PROC(_net_inet_ip, OID_AUTO, random_id,
+ CTLTYPE_INT | CTLFLAG_VNET | CTLFLAG_RW,
+ &VNET_NAME(ip_do_randomid), 0, sysctl_ip_randomid, "IU",
+ "Assign random ip_id values");
+SYSCTL_INT(_net_inet_ip, OID_AUTO, rfc6864, CTLFLAG_VNET | CTLFLAG_RW,
+ &VNET_NAME(ip_rfc6864), 0,
+ "Use constant IP ID for atomic datagrams");
+SYSCTL_PROC(_net_inet_ip, OID_AUTO, random_id_period,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_VNET,
+ &VNET_NAME(array_size), 0, sysctl_ip_id_change, "IU", "IP ID Array size");
+SYSCTL_INT(_net_inet_ip, OID_AUTO, random_id_collisions,
+ CTLFLAG_RD | CTLFLAG_VNET,
+ &VNET_NAME(random_id_collisions), 0, "Count of IP ID collisions");
+SYSCTL_INT(_net_inet_ip, OID_AUTO, random_id_total, CTLFLAG_RD | CTLFLAG_VNET,
+ &VNET_NAME(random_id_total), 0, "Count of IP IDs created");
+
+static int
+sysctl_ip_randomid(SYSCTL_HANDLER_ARGS)
+{
+ int error, new;
+
+ new = V_ip_do_randomid;
+ error = sysctl_handle_int(oidp, &new, 0, req);
+ if (error || req->newptr == NULL)
+ return (error);
+ if (new != 0 && new != 1)
+ return (EINVAL);
+ if (new == V_ip_do_randomid)
+ return (0);
+ if (new == 1 && V_ip_do_randomid == 0)
+ ip_initid(8192);
+ /* We don't free memory when turning random ID off, due to race. */
+ V_ip_do_randomid = new;
+ return (0);
+}
static int
sysctl_ip_id_change(SYSCTL_HANDLER_ARGS)
{
int error, new;
- new = array_size;
+ new = V_array_size;
error = sysctl_handle_int(oidp, &new, 0, req);
if (error == 0 && req->newptr) {
- if (new >= 512 && new <= 32768) {
- mtx_lock(&ip_id_mtx);
- array_size = new;
- ip_initid();
- mtx_unlock(&ip_id_mtx);
- } else
+ if (new >= 512 && new <= 32768)
+ ip_initid(new);
+ else
error = EINVAL;
}
return (error);
}
-/*
- * ip_initid() runs with a mutex held and may execute in a network context.
- * As a result, it uses M_NOWAIT. Ideally, we would always do this
- * allocation from the sysctl contact and have it be an invariant that if
- * this random ID allocation mode is selected, the buffers are present. This
- * would also avoid potential network context failures of IP ID generation.
- */
static void
-ip_initid(void)
+ip_initid(int new_size)
{
+ uint16_t *new_array;
+ bitstr_t *new_bits;
- mtx_assert(&ip_id_mtx, MA_OWNED);
+ new_array = malloc(new_size * sizeof(uint16_t), M_IPID,
+ M_WAITOK | M_ZERO);
+ new_bits = malloc(bitstr_size(65536), M_IPID, M_WAITOK | M_ZERO);
- if (id_array != NULL) {
- free(id_array, M_IPID);
- free(id_bits, M_IPID);
- }
- random_id_collisions = 0;
- random_id_total = 0;
- array_ptr = 0;
- id_array = (u_int16_t *) malloc(array_size * sizeof(u_int16_t),
- M_IPID, M_NOWAIT | M_ZERO);
- id_bits = (bitstr_t *) malloc(bitstr_size(65536), M_IPID,
- M_NOWAIT | M_ZERO);
- if (id_array == NULL || id_bits == NULL) {
- /* Neither or both. */
- if (id_array != NULL) {
- free(id_array, M_IPID);
- id_array = NULL;
- }
- if (id_bits != NULL) {
- free(id_bits, M_IPID);
- id_bits = NULL;
- }
+ mtx_lock(&V_ip_id_mtx);
+ if (V_id_array != NULL) {
+ free(V_id_array, M_IPID);
+ free(V_id_bits, M_IPID);
}
+ V_id_array = new_array;
+ V_id_bits = new_bits;
+ V_array_size = new_size;
+ V_array_ptr = 0;
+ V_random_id_collisions = 0;
+ V_random_id_total = 0;
+ mtx_unlock(&V_ip_id_mtx);
}
-u_int16_t
+static uint16_t
ip_randomid(void)
{
- u_int16_t new_id;
-
- mtx_lock(&ip_id_mtx);
- if (id_array == NULL)
- ip_initid();
-
- /*
- * Fail gracefully; return a fixed id if memory allocation failed;
- * ideally we wouldn't do allocation in this context in order to
- * avoid the possibility of this failure mode.
- */
- if (id_array == NULL) {
- mtx_unlock(&ip_id_mtx);
- return (1);
- }
+ uint16_t new_id;
+ mtx_lock(&V_ip_id_mtx);
/*
* To avoid a conflict with the zeros that the array is initially
* filled with, we never hand out an id of zero.
@@ -196,16 +226,76 @@ ip_randomid(void)
new_id = 0;
do {
if (new_id != 0)
- random_id_collisions++;
+ V_random_id_collisions++;
arc4rand(&new_id, sizeof(new_id), 0);
- } while (bit_test(id_bits, new_id) || new_id == 0);
- bit_clear(id_bits, id_array[array_ptr]);
- bit_set(id_bits, new_id);
- id_array[array_ptr] = new_id;
- array_ptr++;
- if (array_ptr == array_size)
- array_ptr = 0;
- random_id_total++;
- mtx_unlock(&ip_id_mtx);
+ } while (bit_test(V_id_bits, new_id) || new_id == 0);
+ bit_clear(V_id_bits, V_id_array[V_array_ptr]);
+ bit_set(V_id_bits, new_id);
+ V_id_array[V_array_ptr] = new_id;
+ V_array_ptr++;
+ if (V_array_ptr == V_array_size)
+ V_array_ptr = 0;
+ V_random_id_total++;
+ mtx_unlock(&V_ip_id_mtx);
return (new_id);
}
+
+void
+ip_fillid(struct ip *ip)
+{
+
+ /*
+ * Per RFC6864 Section 4
+ *
+ * o Atomic datagrams: (DF==1) && (MF==0) && (frag_offset==0)
+ * o Non-atomic datagrams: (DF==0) || (MF==1) || (frag_offset>0)
+ */
+ if (V_ip_rfc6864 && (ip->ip_off & htons(IP_DF)) == htons(IP_DF))
+ ip->ip_id = 0;
+ else if (V_ip_do_randomid)
+ ip->ip_id = ip_randomid();
+ else {
+ counter_u64_add(V_ip_id, 1);
+ /*
+ * There are two issues about this trick, to be kept in mind.
+ * 1) We can migrate between counter_u64_add() and next
+ * line, and grab counter from other CPU, resulting in too
+ * quick ID reuse. This is tolerable in our particular case,
+ * since probability of such event is much lower than reuse
+ * of ID due to legitimate overflow, that at modern Internet
+ * speeds happens all the time.
+ * 2) We are relying on the fact that counter(9) is based on
+ * UMA_ZONE_PCPU uma(9) zone. We also take only last
+ * sixteen bits of a counter, so we don't care about the
+ * fact that machines with 32-bit word update their counters
+ * not atomically.
+ */
+ ip->ip_id = htons((*(uint64_t *)zpcpu_get(V_ip_id)) & 0xffff);
+ }
+}
+
+static void
+ipid_sysinit(void)
+{
+ int i;
+
+ mtx_init(&V_ip_id_mtx, "ip_id_mtx", NULL, MTX_DEF);
+ V_ip_id = counter_u64_alloc(M_WAITOK);
+
+ CPU_FOREACH(i)
+ arc4rand(zpcpu_get_cpu(V_ip_id, i), sizeof(uint64_t), 0);
+}
+VNET_SYSINIT(ip_id, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY, ipid_sysinit, NULL);
+
+static void
+ipid_sysuninit(void)
+{
+
+ if (V_id_array != NULL) {
+ free(V_id_array, M_IPID);
+ free(V_id_bits, M_IPID);
+ }
+ counter_u64_free(V_ip_id);
+ mtx_destroy(&V_ip_id_mtx);
+}
+VNET_SYSUNINIT(ip_id, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, ipid_sysuninit, NULL);
diff --git a/freebsd/sys/netinet/ip_input.c b/freebsd/sys/netinet/ip_input.c
index 24002aac..425dbc1f 100644
--- a/freebsd/sys/netinet/ip_input.c
+++ b/freebsd/sys/netinet/ip_input.c
@@ -35,13 +35,14 @@
__FBSDID("$FreeBSD$");
#include <rtems/bsd/local/opt_bootp.h>
-#include <rtems/bsd/local/opt_ipfw.h>
#include <rtems/bsd/local/opt_ipstealth.h>
#include <rtems/bsd/local/opt_ipsec.h>
#include <rtems/bsd/local/opt_route.h>
+#include <rtems/bsd/local/opt_rss.h>
#include <rtems/bsd/sys/param.h>
#include <sys/systm.h>
+#include <sys/hhook.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/domain.h>
@@ -50,7 +51,9 @@ __FBSDID("$FreeBSD$");
#include <sys/time.h>
#include <sys/kernel.h>
#include <rtems/bsd/sys/lock.h>
+#include <sys/rmlock.h>
#include <sys/rwlock.h>
+#include <sys/sdt.h>
#include <sys/syslog.h>
#include <sys/sysctl.h>
@@ -61,10 +64,11 @@ __FBSDID("$FreeBSD$");
#include <net/if_dl.h>
#include <net/route.h>
#include <net/netisr.h>
+#include <net/rss_config.h>
#include <net/vnet.h>
-#include <net/flowtable.h>
#include <netinet/in.h>
+#include <netinet/in_kdtrace.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
@@ -77,7 +81,10 @@ __FBSDID("$FreeBSD$");
#include <netinet/ip_carp.h>
#ifdef IPSEC
#include <netinet/ip_ipsec.h>
+#include <netipsec/ipsec.h>
+#include <netipsec/key.h>
#endif /* IPSEC */
+#include <netinet/in_rss.h>
#include <sys/socketvar.h>
@@ -87,39 +94,30 @@ __FBSDID("$FreeBSD$");
CTASSERT(sizeof(struct ip) == 20);
#endif
-struct rwlock in_ifaddr_lock;
-RW_SYSINIT(in_ifaddr_lock, &in_ifaddr_lock, "in_ifaddr_lock");
+/* IP reassembly functions are defined in ip_reass.c. */
+extern void ipreass_init(void);
+extern void ipreass_drain(void);
+extern void ipreass_slowtimo(void);
+#ifdef VIMAGE
+extern void ipreass_destroy(void);
+#endif
+
+struct rmlock in_ifaddr_lock;
+RM_SYSINIT(in_ifaddr_lock, &in_ifaddr_lock, "in_ifaddr_lock");
VNET_DEFINE(int, rsvp_on);
VNET_DEFINE(int, ipforwarding);
-SYSCTL_VNET_INT(_net_inet_ip, IPCTL_FORWARDING, forwarding, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_ip, IPCTL_FORWARDING, forwarding, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(ipforwarding), 0,
"Enable IP forwarding between interfaces");
static VNET_DEFINE(int, ipsendredirects) = 1; /* XXX */
#define V_ipsendredirects VNET(ipsendredirects)
-SYSCTL_VNET_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(ipsendredirects), 0,
"Enable sending IP redirects");
-static VNET_DEFINE(int, ip_keepfaith);
-#define V_ip_keepfaith VNET(ip_keepfaith)
-SYSCTL_VNET_INT(_net_inet_ip, IPCTL_KEEPFAITH, keepfaith, CTLFLAG_RW,
- &VNET_NAME(ip_keepfaith), 0,
- "Enable packet capture for FAITH IPv4->IPv6 translater daemon");
-
-static VNET_DEFINE(int, ip_sendsourcequench);
-#define V_ip_sendsourcequench VNET(ip_sendsourcequench)
-SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, sendsourcequench, CTLFLAG_RW,
- &VNET_NAME(ip_sendsourcequench), 0,
- "Enable the transmission of source quench packets");
-
-VNET_DEFINE(int, ip_do_randomid);
-SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, random_id, CTLFLAG_RW,
- &VNET_NAME(ip_do_randomid), 0,
- "Assign random ip_id values");
-
/*
* XXX - Setting ip_checkinterface mostly implements the receive side of
* the Strong ES model described in RFC 1122, but since the routing table
@@ -135,7 +133,7 @@ SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, random_id, CTLFLAG_RW,
*/
static VNET_DEFINE(int, ip_checkinterface);
#define V_ip_checkinterface VNET(ip_checkinterface)
-SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(ip_checkinterface), 0,
"Verify packet arrives on correct interface");
@@ -145,8 +143,32 @@ static struct netisr_handler ip_nh = {
.nh_name = "ip",
.nh_handler = ip_input,
.nh_proto = NETISR_IP,
+#ifdef RSS
+ .nh_m2cpuid = rss_soft_m2cpuid_v4,
+ .nh_policy = NETISR_POLICY_CPU,
+ .nh_dispatch = NETISR_DISPATCH_HYBRID,
+#else
.nh_policy = NETISR_POLICY_FLOW,
+#endif
+};
+
+#ifdef RSS
+/*
+ * Directly dispatched frames are currently assumed
+ * to have a flowid already calculated.
+ *
+ * There should likely be a check that asserts a frame
+ * actually has valid flow details.
+ */
+static struct netisr_handler ip_direct_nh = {
+ .nh_name = "ip_direct",
+ .nh_handler = ip_direct_input,
+ .nh_proto = NETISR_IP_DIRECT,
+ .nh_m2cpuid = rss_soft_m2cpuid_v4,
+ .nh_policy = NETISR_POLICY_CPU,
+ .nh_dispatch = NETISR_DISPATCH_HYBRID,
};
+#endif
extern struct domain inetdomain;
extern struct protosw inetsw[];
@@ -155,41 +177,6 @@ VNET_DEFINE(struct in_ifaddrhead, in_ifaddrhead); /* first inet address */
VNET_DEFINE(struct in_ifaddrhashhead *, in_ifaddrhashtbl); /* inet addr hash table */
VNET_DEFINE(u_long, in_ifaddrhmask); /* mask for hash table */
-VNET_DEFINE(struct ipstat, ipstat);
-SYSCTL_VNET_STRUCT(_net_inet_ip, IPCTL_STATS, stats, CTLFLAG_RW,
- &VNET_NAME(ipstat), ipstat,
- "IP statistics (struct ipstat, netinet/ip_var.h)");
-
-static VNET_DEFINE(uma_zone_t, ipq_zone);
-static VNET_DEFINE(TAILQ_HEAD(ipqhead, ipq), ipq[IPREASS_NHASH]);
-static struct mtx ipqlock;
-
-#define V_ipq_zone VNET(ipq_zone)
-#define V_ipq VNET(ipq)
-
-#define IPQ_LOCK() mtx_lock(&ipqlock)
-#define IPQ_UNLOCK() mtx_unlock(&ipqlock)
-#define IPQ_LOCK_INIT() mtx_init(&ipqlock, "ipqlock", NULL, MTX_DEF)
-#define IPQ_LOCK_ASSERT() mtx_assert(&ipqlock, MA_OWNED)
-
-static void maxnipq_update(void);
-static void ipq_zone_change(void *);
-static void ip_drain_locked(void);
-
-static VNET_DEFINE(int, maxnipq); /* Administrative limit on # reass queues. */
-static VNET_DEFINE(int, nipq); /* Total # of reass queues */
-#define V_maxnipq VNET(maxnipq)
-#define V_nipq VNET(nipq)
-SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, fragpackets, CTLFLAG_RD,
- &VNET_NAME(nipq), 0,
- "Current number of IPv4 fragment reassembly queue entries");
-
-static VNET_DEFINE(int, maxfragsperpacket);
-#define V_maxfragsperpacket VNET(maxfragsperpacket)
-SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, maxfragsperpacket, CTLFLAG_RW,
- &VNET_NAME(maxfragsperpacket), 0,
- "Maximum number of IPv4 fragments allowed per packet");
-
#ifdef IPCTL_DEFMTU
SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW,
&ip_mtu, 0, "Default MTU");
@@ -197,42 +184,39 @@ SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW,
#ifdef IPSTEALTH
VNET_DEFINE(int, ipstealth);
-SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(ipstealth), 0,
"IP stealth mode, no TTL decrementation on forwarding");
#endif
-#ifdef FLOWTABLE
-static VNET_DEFINE(int, ip_output_flowtable_size) = 2048;
-VNET_DEFINE(struct flowtable *, ip_ft);
-#define V_ip_output_flowtable_size VNET(ip_output_flowtable_size)
-
-SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, output_flowtable_size, CTLFLAG_RDTUN,
- &VNET_NAME(ip_output_flowtable_size), 2048,
- "number of entries in the per-cpu output flow caches");
-#endif
+/*
+ * IP statistics are stored in the "array" of counter(9)s.
+ */
+VNET_PCPUSTAT_DEFINE(struct ipstat, ipstat);
+VNET_PCPUSTAT_SYSINIT(ipstat);
+SYSCTL_VNET_PCPUSTAT(_net_inet_ip, IPCTL_STATS, stats, struct ipstat, ipstat,
+ "IP statistics (struct ipstat, netinet/ip_var.h)");
-static void ip_freef(struct ipqhead *, struct ipq *);
+#ifdef VIMAGE
+VNET_PCPUSTAT_SYSUNINIT(ipstat);
+#endif /* VIMAGE */
/*
* Kernel module interface for updating ipstat. The argument is an index
- * into ipstat treated as an array of u_long. While this encodes the general
- * layout of ipstat into the caller, it doesn't encode its location, so that
- * future changes to add, for example, per-CPU stats support won't cause
- * binary compatibility problems for kernel modules.
+ * into ipstat treated as an array.
*/
void
kmod_ipstat_inc(int statnum)
{
- (*((u_long *)&V_ipstat + statnum))++;
+ counter_u64_add(VNET(ipstat)[statnum], 1);
}
void
kmod_ipstat_dec(int statnum)
{
- (*((u_long *)&V_ipstat + statnum))--;
+ counter_u64_add(VNET(ipstat)[statnum], -1);
}
static int
@@ -273,6 +257,46 @@ SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQDROPS, intr_queue_drops,
CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_netinet_intr_queue_drops, "I",
"Number of packets dropped from the IP input queue");
+#ifdef RSS
+static int
+sysctl_netinet_intr_direct_queue_maxlen(SYSCTL_HANDLER_ARGS)
+{
+ int error, qlimit;
+
+ netisr_getqlimit(&ip_direct_nh, &qlimit);
+ error = sysctl_handle_int(oidp, &qlimit, 0, req);
+ if (error || !req->newptr)
+ return (error);
+ if (qlimit < 1)
+ return (EINVAL);
+ return (netisr_setqlimit(&ip_direct_nh, qlimit));
+}
+/*
+ * Use OID_AUTO rather than IPCTL_INTRQMAXLEN: a fixed OID number can
+ * identify only one leaf under net.inet.ip.
+ * NOTE(review): IPCTL_INTRQMAXLEN is presumably already taken by the
+ * regular intr_queue_maxlen sysctl -- confirm.
+ */
+SYSCTL_PROC(_net_inet_ip, OID_AUTO, intr_direct_queue_maxlen,
+    CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_netinet_intr_direct_queue_maxlen, "I",
+    "Maximum size of the IP direct input queue");
+
+static int
+sysctl_netinet_intr_direct_queue_drops(SYSCTL_HANDLER_ARGS)
+{
+ u_int64_t qdrops_long;
+ int error, qdrops;
+
+ netisr_getqdrops(&ip_direct_nh, &qdrops_long);
+ qdrops = qdrops_long;
+ error = sysctl_handle_int(oidp, &qdrops, 0, req);
+ if (error || !req->newptr)
+ return (error);
+ if (qdrops != 0)
+ return (EINVAL);
+ netisr_clearqdrops(&ip_direct_nh);
+ return (0);
+}
+
+/*
+ * Use OID_AUTO rather than IPCTL_INTRQDROPS: that fixed OID number is
+ * already assigned to net.inet.ip.intr_queue_drops, so it cannot also
+ * identify this leaf.
+ */
+SYSCTL_PROC(_net_inet_ip, OID_AUTO, intr_direct_queue_drops,
+    CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_netinet_intr_direct_queue_drops, "I",
+    "Number of packets dropped from the IP direct input queue");
+#endif /* RSS */
+
/*
* IP initialization: fill in IP protocol switch table.
* All protocols not implemented in kernel go to raw IP protocol handler.
@@ -283,19 +307,11 @@ ip_init(void)
struct protosw *pr;
int i;
- V_ip_id = time_second & 0xffff;
-
TAILQ_INIT(&V_in_ifaddrhead);
V_in_ifaddrhashtbl = hashinit(INADDR_NHASH, M_IFADDR, &V_in_ifaddrhmask);
/* Initialize IP reassembly queue. */
- for (i = 0; i < IPREASS_NHASH; i++)
- TAILQ_INIT(&V_ipq[i]);
- V_maxnipq = nmbclusters / 32;
- V_maxfragsperpacket = 16;
- V_ipq_zone = uma_zcreate("ipq", sizeof(struct ipq), NULL, NULL, NULL,
- NULL, UMA_ALIGN_PTR, 0);
- maxnipq_update();
+ ipreass_init();
/* Initialize packet filter hooks. */
V_inet_pfil_hook.ph_type = PFIL_TYPE_AF;
@@ -304,27 +320,27 @@ ip_init(void)
printf("%s: WARNING: unable to register pfil hook, "
"error %d\n", __func__, i);
-#ifdef FLOWTABLE
- if (TUNABLE_INT_FETCH("net.inet.ip.output_flowtable_size",
- &V_ip_output_flowtable_size)) {
- if (V_ip_output_flowtable_size < 256)
- V_ip_output_flowtable_size = 256;
- if (!powerof2(V_ip_output_flowtable_size)) {
- printf("flowtable must be power of 2 size\n");
- V_ip_output_flowtable_size = 2048;
- }
- } else {
- /*
- * round up to the next power of 2
- */
- V_ip_output_flowtable_size = 1 << fls((1024 + maxusers * 64)-1);
- }
- V_ip_ft = flowtable_alloc("ipv4", V_ip_output_flowtable_size, FL_PCPU);
-#endif
+ if (hhook_head_register(HHOOK_TYPE_IPSEC_IN, AF_INET,
+ &V_ipsec_hhh_in[HHOOK_IPSEC_INET],
+ HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0)
+ printf("%s: WARNING: unable to register input helper hook\n",
+ __func__);
+ if (hhook_head_register(HHOOK_TYPE_IPSEC_OUT, AF_INET,
+ &V_ipsec_hhh_out[HHOOK_IPSEC_INET],
+ HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0)
+ printf("%s: WARNING: unable to register output helper hook\n",
+ __func__);
/* Skip initialization of globals for non-default instances. */
- if (!IS_DEFAULT_VNET(curvnet))
+#ifdef VIMAGE
+ if (!IS_DEFAULT_VNET(curvnet)) {
+ netisr_register_vnet(&ip_nh);
+#ifdef RSS
+ netisr_register_vnet(&ip_direct_nh);
+#endif
return;
+ }
+#endif
pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW);
if (pr == NULL)
@@ -346,27 +362,79 @@ ip_init(void)
ip_protox[pr->pr_protocol] = pr - inetsw;
}
- EVENTHANDLER_REGISTER(nmbclusters_change, ipq_zone_change,
- NULL, EVENTHANDLER_PRI_ANY);
-
- /* Initialize various other remaining things. */
- IPQ_LOCK_INIT();
netisr_register(&ip_nh);
+#ifdef RSS
+ netisr_register(&ip_direct_nh);
+#endif
}
#ifdef VIMAGE
-void
-ip_destroy(void)
+static void
+ip_destroy(void *unused __unused)
{
+ struct ifnet *ifp;
+ int error;
+
+#ifdef RSS
+ netisr_unregister_vnet(&ip_direct_nh);
+#endif
+ netisr_unregister_vnet(&ip_nh);
+
+ if ((error = pfil_head_unregister(&V_inet_pfil_hook)) != 0)
+ printf("%s: WARNING: unable to unregister pfil hook, "
+ "error %d\n", __func__, error);
+
+ error = hhook_head_deregister(V_ipsec_hhh_in[HHOOK_IPSEC_INET]);
+ if (error != 0) {
+ printf("%s: WARNING: unable to deregister input helper hook "
+ "type HHOOK_TYPE_IPSEC_IN, id HHOOK_IPSEC_INET: "
+ "error %d returned\n", __func__, error);
+ }
+ error = hhook_head_deregister(V_ipsec_hhh_out[HHOOK_IPSEC_INET]);
+ if (error != 0) {
+ printf("%s: WARNING: unable to deregister output helper hook "
+ "type HHOOK_TYPE_IPSEC_OUT, id HHOOK_IPSEC_INET: "
+ "error %d returned\n", __func__, error);
+ }
+
+ /* Remove the IPv4 addresses from all interfaces. */
+ in_ifscrub_all();
+
+ /* Make sure the IPv4 routes are gone as well. */
+ IFNET_RLOCK();
+ TAILQ_FOREACH(ifp, &V_ifnet, if_link)
+ rt_flushifroutes_af(ifp, AF_INET);
+ IFNET_RUNLOCK();
+
+ /* Destroy IP reassembly queue. */
+ ipreass_destroy();
/* Cleanup in_ifaddr hash table; should be empty. */
hashdestroy(V_in_ifaddrhashtbl, M_IFADDR, V_in_ifaddrhmask);
+}
- IPQ_LOCK();
- ip_drain_locked();
- IPQ_UNLOCK();
+VNET_SYSUNINIT(ip, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, ip_destroy, NULL);
+#endif
- uma_zdestroy(V_ipq_zone);
+#ifdef RSS
+/*
+ * IP direct input routine.
+ *
+ * This is called when reinjecting completed fragments where
+ * all of the previous checking and book-keeping has been done.
+ */
+void
+ip_direct_input(struct mbuf *m)
+{
+ struct ip *ip;
+ int hlen;
+
+ ip = mtod(m, struct ip *);
+ hlen = ip->ip_hl << 2;
+
+ IPSTAT_INC(ips_delivered);
+ (*inetsw[ip_protox[ip->ip_p]].pr_input)(&m, &hlen, ip->ip_p);
+ return;
}
#endif
@@ -382,21 +450,18 @@ ip_input(struct mbuf *m)
struct ifaddr *ifa;
struct ifnet *ifp;
int checkif, hlen = 0;
- u_short sum;
+ uint16_t sum, ip_len;
int dchg = 0; /* dest changed after fw */
struct in_addr odst; /* original dst address */
M_ASSERTPKTHDR(m);
if (m->m_flags & M_FASTFWD_OURS) {
- /*
- * Firewall or NAT changed destination to local.
- * We expect ip_len and ip_off to be in host byte order.
- */
m->m_flags &= ~M_FASTFWD_OURS;
/* Set up some basics that will be used later. */
ip = mtod(m, struct ip *);
hlen = ip->ip_hl << 2;
+ ip_len = ntohs(ip->ip_len);
goto ours;
}
@@ -430,6 +495,8 @@ ip_input(struct mbuf *m)
ip = mtod(m, struct ip *);
}
+ IP_PROBE(receive, NULL, NULL, ip, m->m_pkthdr.rcvif, ip, NULL);
+
/* 127/8 must not appear on wire - RFC1122 */
ifp = m->m_pkthdr.rcvif;
if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
@@ -460,15 +527,11 @@ ip_input(struct mbuf *m)
return;
#endif
- /*
- * Convert fields to host representation.
- */
- ip->ip_len = ntohs(ip->ip_len);
- if (ip->ip_len < hlen) {
+ ip_len = ntohs(ip->ip_len);
+ if (ip_len < hlen) {
IPSTAT_INC(ips_badlen);
goto bad;
}
- ip->ip_off = ntohs(ip->ip_off);
/*
* Check that the amount of data in the buffers
@@ -476,24 +539,35 @@ ip_input(struct mbuf *m)
* Trim mbufs if longer than we expect.
* Drop packet if shorter than we expect.
*/
- if (m->m_pkthdr.len < ip->ip_len) {
+ if (m->m_pkthdr.len < ip_len) {
tooshort:
IPSTAT_INC(ips_tooshort);
goto bad;
}
- if (m->m_pkthdr.len > ip->ip_len) {
+ if (m->m_pkthdr.len > ip_len) {
if (m->m_len == m->m_pkthdr.len) {
- m->m_len = ip->ip_len;
- m->m_pkthdr.len = ip->ip_len;
+ m->m_len = ip_len;
+ m->m_pkthdr.len = ip_len;
} else
- m_adj(m, ip->ip_len - m->m_pkthdr.len);
+ m_adj(m, ip_len - m->m_pkthdr.len);
}
+
+ /* Try to forward the packet, but if we fail continue */
#ifdef IPSEC
+ /* For now we do not handle IPSEC in tryforward. */
+ if (!key_havesp(IPSEC_DIR_INBOUND) && !key_havesp(IPSEC_DIR_OUTBOUND) &&
+ (V_ipforwarding == 1))
+ if (ip_tryforward(m) == NULL)
+ return;
/*
* Bypass packet filtering for packets previously handled by IPsec.
*/
if (ip_ipsec_filtertunnel(m))
goto passin;
+#else
+ if (V_ipforwarding == 1)
+ if (ip_tryforward(m) == NULL)
+ return;
#endif /* IPSEC */
/*
@@ -523,8 +597,7 @@ tooshort:
goto ours;
}
if (m->m_flags & M_IP_NEXTHOP) {
- dchg = (m_tag_find(m, PACKET_TAG_IPFORWARD, NULL) != NULL);
- if (dchg != 0) {
+ if (m_tag_find(m, PACKET_TAG_IPFORWARD, NULL) != NULL) {
/*
* Directly ship the packet on. This allows
* forwarding packets originally destined to us
@@ -535,6 +608,7 @@ tooshort:
}
}
passin:
+
/*
* Process options and, if not destined for us,
* ship it on. ip_dooptions returns 1 when an
@@ -597,7 +671,9 @@ passin:
*/
if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_dst.s_addr &&
(!checkif || ia->ia_ifp == ifp)) {
- ifa_ref(&ia->ia_ifa);
+ counter_u64_add(ia->ia_ifa.ifa_ipackets, 1);
+ counter_u64_add(ia->ia_ifa.ifa_ibytes,
+ m->m_pkthdr.len);
/* IN_IFADDR_RUNLOCK(); */
goto ours;
}
@@ -620,13 +696,17 @@ passin:
ia = ifatoia(ifa);
if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr ==
ip->ip_dst.s_addr) {
- ifa_ref(ifa);
+ counter_u64_add(ia->ia_ifa.ifa_ipackets, 1);
+ counter_u64_add(ia->ia_ifa.ifa_ibytes,
+ m->m_pkthdr.len);
IF_ADDR_RUNLOCK(ifp);
goto ours;
}
#ifdef BOOTP_COMPAT
if (IA_SIN(ia)->sin_addr.s_addr == INADDR_ANY) {
- ifa_ref(ifa);
+ counter_u64_add(ia->ia_ifa.ifa_ipackets, 1);
+ counter_u64_add(ia->ia_ifa.ifa_ibytes,
+ m->m_pkthdr.len);
IF_ADDR_RUNLOCK(ifp);
goto ours;
}
@@ -679,28 +759,12 @@ passin:
goto ours;
/*
- * FAITH(Firewall Aided Internet Translator)
- */
- if (ifp && ifp->if_type == IFT_FAITH) {
- if (V_ip_keepfaith) {
- if (ip->ip_p == IPPROTO_TCP || ip->ip_p == IPPROTO_ICMP)
- goto ours;
- }
- m_freem(m);
- return;
- }
-
- /*
* Not for us; forward if possible and desirable.
*/
if (V_ipforwarding == 0) {
IPSTAT_INC(ips_cantforward);
m_freem(m);
} else {
-#ifdef IPSEC
- if (ip_ipsec_fwd(m))
- goto bad;
-#endif /* IPSEC */
ip_forward(m, dchg);
}
return;
@@ -711,25 +775,16 @@ ours:
* IPSTEALTH: Process non-routing options only
* if the packet is destined for us.
*/
- if (V_ipstealth && hlen > sizeof (struct ip) && ip_dooptions(m, 1)) {
- if (ia != NULL)
- ifa_free(&ia->ia_ifa);
+ if (V_ipstealth && hlen > sizeof (struct ip) && ip_dooptions(m, 1))
return;
- }
#endif /* IPSTEALTH */
- /* Count the packet in the ip address stats */
- if (ia != NULL) {
- ia->ia_ifa.if_ipackets++;
- ia->ia_ifa.if_ibytes += m->m_pkthdr.len;
- ifa_free(&ia->ia_ifa);
- }
-
/*
* Attempt reassembly; if it succeeds, proceed.
* ip_reass() will return a different mbuf.
*/
- if (ip->ip_off & (IP_MF | IP_OFFMASK)) {
+ if (ip->ip_off & htons(IP_MF | IP_OFFMASK)) {
+ /* XXXGL: shouldn't we save & set m_flags? */
m = ip_reass(m);
if (m == NULL)
return;
@@ -738,19 +793,13 @@ ours:
hlen = ip->ip_hl << 2;
}
- /*
- * Further protocols expect the packet length to be w/o the
- * IP header.
- */
- ip->ip_len -= hlen;
-
#ifdef IPSEC
/*
* enforce IPsec policy checking if we are seeing last header.
* note that we do not visit this with protocols with pcb layer
* code - like udp/tcp/raw ip.
*/
- if (ip_ipsec_input(m))
+ if (ip_ipsec_input(m, ip->ip_p) != 0)
goto bad;
#endif /* IPSEC */
@@ -759,419 +808,13 @@ ours:
*/
IPSTAT_INC(ips_delivered);
- (*inetsw[ip_protox[ip->ip_p]].pr_input)(m, hlen);
+ (*inetsw[ip_protox[ip->ip_p]].pr_input)(&m, &hlen, ip->ip_p);
return;
bad:
m_freem(m);
}
/*
- * After maxnipq has been updated, propagate the change to UMA. The UMA zone
- * max has slightly different semantics than the sysctl, for historical
- * reasons.
- */
-static void
-maxnipq_update(void)
-{
-
- /*
- * -1 for unlimited allocation.
- */
- if (V_maxnipq < 0)
- uma_zone_set_max(V_ipq_zone, 0);
- /*
- * Positive number for specific bound.
- */
- if (V_maxnipq > 0)
- uma_zone_set_max(V_ipq_zone, V_maxnipq);
- /*
- * Zero specifies no further fragment queue allocation -- set the
- * bound very low, but rely on implementation elsewhere to actually
- * prevent allocation and reclaim current queues.
- */
- if (V_maxnipq == 0)
- uma_zone_set_max(V_ipq_zone, 1);
-}
-
-static void
-ipq_zone_change(void *tag)
-{
-
- if (V_maxnipq > 0 && V_maxnipq < (nmbclusters / 32)) {
- V_maxnipq = nmbclusters / 32;
- maxnipq_update();
- }
-}
-
-static int
-sysctl_maxnipq(SYSCTL_HANDLER_ARGS)
-{
- int error, i;
-
- i = V_maxnipq;
- error = sysctl_handle_int(oidp, &i, 0, req);
- if (error || !req->newptr)
- return (error);
-
- /*
- * XXXRW: Might be a good idea to sanity check the argument and place
- * an extreme upper bound.
- */
- if (i < -1)
- return (EINVAL);
- V_maxnipq = i;
- maxnipq_update();
- return (0);
-}
-
-SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragpackets, CTLTYPE_INT|CTLFLAG_RW,
- NULL, 0, sysctl_maxnipq, "I",
- "Maximum number of IPv4 fragment reassembly queue entries");
-
-/*
- * Take incoming datagram fragment and try to reassemble it into
- * whole datagram. If the argument is the first fragment or one
- * in between the function will return NULL and store the mbuf
- * in the fragment chain. If the argument is the last fragment
- * the packet will be reassembled and the pointer to the new
- * mbuf returned for further processing. Only m_tags attached
- * to the first packet/fragment are preserved.
- * The IP header is *NOT* adjusted out of iplen.
- */
-struct mbuf *
-ip_reass(struct mbuf *m)
-{
- struct ip *ip;
- struct mbuf *p, *q, *nq, *t;
- struct ipq *fp = NULL;
- struct ipqhead *head;
- int i, hlen, next;
- u_int8_t ecn, ecn0;
- u_short hash;
-
- /* If maxnipq or maxfragsperpacket are 0, never accept fragments. */
- if (V_maxnipq == 0 || V_maxfragsperpacket == 0) {
- IPSTAT_INC(ips_fragments);
- IPSTAT_INC(ips_fragdropped);
- m_freem(m);
- return (NULL);
- }
-
- ip = mtod(m, struct ip *);
- hlen = ip->ip_hl << 2;
-
- hash = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id);
- head = &V_ipq[hash];
- IPQ_LOCK();
-
- /*
- * Look for queue of fragments
- * of this datagram.
- */
- TAILQ_FOREACH(fp, head, ipq_list)
- if (ip->ip_id == fp->ipq_id &&
- ip->ip_src.s_addr == fp->ipq_src.s_addr &&
- ip->ip_dst.s_addr == fp->ipq_dst.s_addr &&
-#ifdef MAC
- mac_ipq_match(m, fp) &&
-#endif
- ip->ip_p == fp->ipq_p)
- goto found;
-
- fp = NULL;
-
- /*
- * Attempt to trim the number of allocated fragment queues if it
- * exceeds the administrative limit.
- */
- if ((V_nipq > V_maxnipq) && (V_maxnipq > 0)) {
- /*
- * drop something from the tail of the current queue
- * before proceeding further
- */
- struct ipq *q = TAILQ_LAST(head, ipqhead);
- if (q == NULL) { /* gak */
- for (i = 0; i < IPREASS_NHASH; i++) {
- struct ipq *r = TAILQ_LAST(&V_ipq[i], ipqhead);
- if (r) {
- IPSTAT_ADD(ips_fragtimeout,
- r->ipq_nfrags);
- ip_freef(&V_ipq[i], r);
- break;
- }
- }
- } else {
- IPSTAT_ADD(ips_fragtimeout, q->ipq_nfrags);
- ip_freef(head, q);
- }
- }
-
-found:
- /*
- * Adjust ip_len to not reflect header,
- * convert offset of this to bytes.
- */
- ip->ip_len -= hlen;
- if (ip->ip_off & IP_MF) {
- /*
- * Make sure that fragments have a data length
- * that's a non-zero multiple of 8 bytes.
- */
- if (ip->ip_len == 0 || (ip->ip_len & 0x7) != 0) {
- IPSTAT_INC(ips_toosmall); /* XXX */
- goto dropfrag;
- }
- m->m_flags |= M_FRAG;
- } else
- m->m_flags &= ~M_FRAG;
- ip->ip_off <<= 3;
-
-
- /*
- * Attempt reassembly; if it succeeds, proceed.
- * ip_reass() will return a different mbuf.
- */
- IPSTAT_INC(ips_fragments);
- m->m_pkthdr.header = ip;
-
- /* Previous ip_reass() started here. */
- /*
- * Presence of header sizes in mbufs
- * would confuse code below.
- */
- m->m_data += hlen;
- m->m_len -= hlen;
-
- /*
- * If first fragment to arrive, create a reassembly queue.
- */
- if (fp == NULL) {
- fp = uma_zalloc(V_ipq_zone, M_NOWAIT);
- if (fp == NULL)
- goto dropfrag;
-#ifdef MAC
- if (mac_ipq_init(fp, M_NOWAIT) != 0) {
- uma_zfree(V_ipq_zone, fp);
- fp = NULL;
- goto dropfrag;
- }
- mac_ipq_create(m, fp);
-#endif
- TAILQ_INSERT_HEAD(head, fp, ipq_list);
- V_nipq++;
- fp->ipq_nfrags = 1;
- fp->ipq_ttl = IPFRAGTTL;
- fp->ipq_p = ip->ip_p;
- fp->ipq_id = ip->ip_id;
- fp->ipq_src = ip->ip_src;
- fp->ipq_dst = ip->ip_dst;
- fp->ipq_frags = m;
- m->m_nextpkt = NULL;
- goto done;
- } else {
- fp->ipq_nfrags++;
-#ifdef MAC
- mac_ipq_update(m, fp);
-#endif
- }
-
-#define GETIP(m) ((struct ip*)((m)->m_pkthdr.header))
-
- /*
- * Handle ECN by comparing this segment with the first one;
- * if CE is set, do not lose CE.
- * drop if CE and not-ECT are mixed for the same packet.
- */
- ecn = ip->ip_tos & IPTOS_ECN_MASK;
- ecn0 = GETIP(fp->ipq_frags)->ip_tos & IPTOS_ECN_MASK;
- if (ecn == IPTOS_ECN_CE) {
- if (ecn0 == IPTOS_ECN_NOTECT)
- goto dropfrag;
- if (ecn0 != IPTOS_ECN_CE)
- GETIP(fp->ipq_frags)->ip_tos |= IPTOS_ECN_CE;
- }
- if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT)
- goto dropfrag;
-
- /*
- * Find a segment which begins after this one does.
- */
- for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt)
- if (GETIP(q)->ip_off > ip->ip_off)
- break;
-
- /*
- * If there is a preceding segment, it may provide some of
- * our data already. If so, drop the data from the incoming
- * segment. If it provides all of our data, drop us, otherwise
- * stick new segment in the proper place.
- *
- * If some of the data is dropped from the preceding
- * segment, then it's checksum is invalidated.
- */
- if (p) {
- i = GETIP(p)->ip_off + GETIP(p)->ip_len - ip->ip_off;
- if (i > 0) {
- if (i >= ip->ip_len)
- goto dropfrag;
- m_adj(m, i);
- m->m_pkthdr.csum_flags = 0;
- ip->ip_off += i;
- ip->ip_len -= i;
- }
- m->m_nextpkt = p->m_nextpkt;
- p->m_nextpkt = m;
- } else {
- m->m_nextpkt = fp->ipq_frags;
- fp->ipq_frags = m;
- }
-
- /*
- * While we overlap succeeding segments trim them or,
- * if they are completely covered, dequeue them.
- */
- for (; q != NULL && ip->ip_off + ip->ip_len > GETIP(q)->ip_off;
- q = nq) {
- i = (ip->ip_off + ip->ip_len) - GETIP(q)->ip_off;
- if (i < GETIP(q)->ip_len) {
- GETIP(q)->ip_len -= i;
- GETIP(q)->ip_off += i;
- m_adj(q, i);
- q->m_pkthdr.csum_flags = 0;
- break;
- }
- nq = q->m_nextpkt;
- m->m_nextpkt = nq;
- IPSTAT_INC(ips_fragdropped);
- fp->ipq_nfrags--;
- m_freem(q);
- }
-
- /*
- * Check for complete reassembly and perform frag per packet
- * limiting.
- *
- * Frag limiting is performed here so that the nth frag has
- * a chance to complete the packet before we drop the packet.
- * As a result, n+1 frags are actually allowed per packet, but
- * only n will ever be stored. (n = maxfragsperpacket.)
- *
- */
- next = 0;
- for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) {
- if (GETIP(q)->ip_off != next) {
- if (fp->ipq_nfrags > V_maxfragsperpacket) {
- IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags);
- ip_freef(head, fp);
- }
- goto done;
- }
- next += GETIP(q)->ip_len;
- }
- /* Make sure the last packet didn't have the IP_MF flag */
- if (p->m_flags & M_FRAG) {
- if (fp->ipq_nfrags > V_maxfragsperpacket) {
- IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags);
- ip_freef(head, fp);
- }
- goto done;
- }
-
- /*
- * Reassembly is complete. Make sure the packet is a sane size.
- */
- q = fp->ipq_frags;
- ip = GETIP(q);
- if (next + (ip->ip_hl << 2) > IP_MAXPACKET) {
- IPSTAT_INC(ips_toolong);
- IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags);
- ip_freef(head, fp);
- goto done;
- }
-
- /*
- * Concatenate fragments.
- */
- m = q;
- t = m->m_next;
- m->m_next = NULL;
- m_cat(m, t);
- nq = q->m_nextpkt;
- q->m_nextpkt = NULL;
- for (q = nq; q != NULL; q = nq) {
- nq = q->m_nextpkt;
- q->m_nextpkt = NULL;
- m->m_pkthdr.csum_flags &= q->m_pkthdr.csum_flags;
- m->m_pkthdr.csum_data += q->m_pkthdr.csum_data;
- m_cat(m, q);
- }
- /*
- * In order to do checksumming faster we do 'end-around carry' here
- * (and not in for{} loop), though it implies we are not going to
- * reassemble more than 64k fragments.
- */
- while (m->m_pkthdr.csum_data & 0xffff0000)
- m->m_pkthdr.csum_data = (m->m_pkthdr.csum_data & 0xffff) +
- (m->m_pkthdr.csum_data >> 16);
-#ifdef MAC
- mac_ipq_reassemble(fp, m);
- mac_ipq_destroy(fp);
-#endif
-
- /*
- * Create header for new ip packet by modifying header of first
- * packet; dequeue and discard fragment reassembly header.
- * Make header visible.
- */
- ip->ip_len = (ip->ip_hl << 2) + next;
- ip->ip_src = fp->ipq_src;
- ip->ip_dst = fp->ipq_dst;
- TAILQ_REMOVE(head, fp, ipq_list);
- V_nipq--;
- uma_zfree(V_ipq_zone, fp);
- m->m_len += (ip->ip_hl << 2);
- m->m_data -= (ip->ip_hl << 2);
- /* some debugging cruft by sklower, below, will go away soon */
- if (m->m_flags & M_PKTHDR) /* XXX this should be done elsewhere */
- m_fixhdr(m);
- IPSTAT_INC(ips_reassembled);
- IPQ_UNLOCK();
- return (m);
-
-dropfrag:
- IPSTAT_INC(ips_fragdropped);
- if (fp != NULL)
- fp->ipq_nfrags--;
- m_freem(m);
-done:
- IPQ_UNLOCK();
- return (NULL);
-
-#undef GETIP
-}
-
-/*
- * Free a fragment reassembly header and all
- * associated datagrams.
- */
-static void
-ip_freef(struct ipqhead *fhp, struct ipq *fp)
-{
- struct mbuf *q;
-
- IPQ_LOCK_ASSERT();
-
- while (fp->ipq_frags) {
- q = fp->ipq_frags;
- fp->ipq_frags = q->m_nextpkt;
- m_freem(q);
- }
- TAILQ_REMOVE(fhp, fp, ipq_list);
- uma_zfree(V_ipq_zone, fp);
- V_nipq--;
-}
-
-/*
* IP timer processing;
* if a timer expires on a reassembly
* queue, discard it.
@@ -1180,82 +823,28 @@ void
ip_slowtimo(void)
{
VNET_ITERATOR_DECL(vnet_iter);
- struct ipq *fp;
- int i;
VNET_LIST_RLOCK_NOSLEEP();
- IPQ_LOCK();
VNET_FOREACH(vnet_iter) {
CURVNET_SET(vnet_iter);
- for (i = 0; i < IPREASS_NHASH; i++) {
- for(fp = TAILQ_FIRST(&V_ipq[i]); fp;) {
- struct ipq *fpp;
-
- fpp = fp;
- fp = TAILQ_NEXT(fp, ipq_list);
- if(--fpp->ipq_ttl == 0) {
- IPSTAT_ADD(ips_fragtimeout,
- fpp->ipq_nfrags);
- ip_freef(&V_ipq[i], fpp);
- }
- }
- }
- /*
- * If we are over the maximum number of fragments
- * (due to the limit being lowered), drain off
- * enough to get down to the new limit.
- */
- if (V_maxnipq >= 0 && V_nipq > V_maxnipq) {
- for (i = 0; i < IPREASS_NHASH; i++) {
- while (V_nipq > V_maxnipq &&
- !TAILQ_EMPTY(&V_ipq[i])) {
- IPSTAT_ADD(ips_fragdropped,
- TAILQ_FIRST(&V_ipq[i])->ipq_nfrags);
- ip_freef(&V_ipq[i],
- TAILQ_FIRST(&V_ipq[i]));
- }
- }
- }
+ ipreass_slowtimo();
CURVNET_RESTORE();
}
- IPQ_UNLOCK();
VNET_LIST_RUNLOCK_NOSLEEP();
}
-/*
- * Drain off all datagram fragments.
- */
-static void
-ip_drain_locked(void)
-{
- int i;
-
- IPQ_LOCK_ASSERT();
-
- for (i = 0; i < IPREASS_NHASH; i++) {
- while(!TAILQ_EMPTY(&V_ipq[i])) {
- IPSTAT_ADD(ips_fragdropped,
- TAILQ_FIRST(&V_ipq[i])->ipq_nfrags);
- ip_freef(&V_ipq[i], TAILQ_FIRST(&V_ipq[i]));
- }
- }
-}
-
void
ip_drain(void)
{
VNET_ITERATOR_DECL(vnet_iter);
VNET_LIST_RLOCK_NOSLEEP();
- IPQ_LOCK();
VNET_FOREACH(vnet_iter) {
CURVNET_SET(vnet_iter);
- ip_drain_locked();
+ ipreass_drain();
CURVNET_RESTORE();
}
- IPQ_UNLOCK();
VNET_LIST_RUNLOCK_NOSLEEP();
- in_rtqdrain();
}
/*
@@ -1314,33 +903,6 @@ ipproto_unregister(short ipproto)
return (0);
}
-/*
- * Given address of next destination (final or next hop), return (referenced)
- * internet address info of interface to be used to get there.
- */
-struct in_ifaddr *
-ip_rtaddr(struct in_addr dst, u_int fibnum)
-{
- struct route sro;
- struct sockaddr_in *sin;
- struct in_ifaddr *ia;
-
- bzero(&sro, sizeof(sro));
- sin = (struct sockaddr_in *)&sro.ro_dst;
- sin->sin_family = AF_INET;
- sin->sin_len = sizeof(*sin);
- sin->sin_addr = dst;
- in_rtalloc_ign(&sro, 0, fibnum);
-
- if (sro.ro_rt == NULL)
- return (NULL);
-
- ia = ifatoia(sro.ro_rt->rt_ifa);
- ifa_ref(&ia->ia_ifa);
- RTFREE(sro.ro_rt);
- return (ia);
-}
-
u_char inetctlerrmap[PRC_NCMDS] = {
0, 0, 0, 0,
0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH,
@@ -1370,6 +932,7 @@ ip_forward(struct mbuf *m, int srcrt)
struct ip *ip = mtod(m, struct ip *);
struct in_ifaddr *ia;
struct mbuf *mcopy;
+ struct sockaddr_in *sin;
struct in_addr dest;
struct route ro;
int error, type = 0, code = 0, mtu = 0;
@@ -1379,6 +942,13 @@ ip_forward(struct mbuf *m, int srcrt)
m_freem(m);
return;
}
+#ifdef IPSEC
+ if (ip_ipsec_fwd(m) != 0) {
+ IPSTAT_INC(ips_cantforward);
+ m_freem(m);
+ return;
+ }
+#endif /* IPSEC */
#ifdef IPSTEALTH
if (!V_ipstealth) {
#endif
@@ -1391,7 +961,23 @@ ip_forward(struct mbuf *m, int srcrt)
}
#endif
- ia = ip_rtaddr(ip->ip_dst, M_GETFIB(m));
+ bzero(&ro, sizeof(ro));
+ sin = (struct sockaddr_in *)&ro.ro_dst;
+ sin->sin_family = AF_INET;
+ sin->sin_len = sizeof(*sin);
+ sin->sin_addr = ip->ip_dst;
+#ifdef RADIX_MPATH
+ rtalloc_mpath_fib(&ro,
+ ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr),
+ M_GETFIB(m));
+#else
+ in_rtalloc_ign(&ro, 0, M_GETFIB(m));
+#endif
+ if (ro.ro_rt != NULL) {
+ ia = ifatoia(ro.ro_rt->rt_ifa);
+ ifa_ref(&ia->ia_ifa);
+ } else
+ ia = NULL;
#ifndef IPSEC
/*
* 'ia' may be NULL if there is no route for this destination.
@@ -1400,6 +986,7 @@ ip_forward(struct mbuf *m, int srcrt)
*/
if (!srcrt && ia == NULL) {
icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
+ RO_RTFREE(&ro);
return;
}
#endif
@@ -1420,8 +1007,8 @@ ip_forward(struct mbuf *m, int srcrt)
* assume exclusive access to the IP header in `m', so any
* data in a cluster may change before we reach icmp_error().
*/
- MGETHDR(mcopy, M_DONTWAIT, m->m_type);
- if (mcopy != NULL && !m_dup_pkthdr(mcopy, m, M_DONTWAIT)) {
+ mcopy = m_gethdr(M_NOWAIT, m->m_type);
+ if (mcopy != NULL && !m_dup_pkthdr(mcopy, m, M_NOWAIT)) {
/*
* It's probably ok if the pkthdr dup fails (because
* the deep copy of the tag chain failed), but for now
@@ -1432,7 +1019,7 @@ ip_forward(struct mbuf *m, int srcrt)
mcopy = NULL;
}
if (mcopy != NULL) {
- mcopy->m_len = min(ip->ip_len, M_TRAILINGSPACE(mcopy));
+ mcopy->m_len = min(ntohs(ip->ip_len), M_TRAILINGSPACE(mcopy));
mcopy->m_pkthdr.len = mcopy->m_len;
m_copydata(m, 0, mcopy->m_len, mtod(mcopy, caddr_t));
}
@@ -1456,16 +1043,8 @@ ip_forward(struct mbuf *m, int srcrt)
dest.s_addr = 0;
if (!srcrt && V_ipsendredirects &&
ia != NULL && ia->ia_ifp == m->m_pkthdr.rcvif) {
- struct sockaddr_in *sin;
struct rtentry *rt;
- bzero(&ro, sizeof(ro));
- sin = (struct sockaddr_in *)&ro.ro_dst;
- sin->sin_family = AF_INET;
- sin->sin_len = sizeof(*sin);
- sin->sin_addr = ip->ip_dst;
- in_rtalloc_ign(&ro, 0, M_GETFIB(m));
-
rt = ro.ro_rt;
if (rt && (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 &&
@@ -1484,20 +1063,12 @@ ip_forward(struct mbuf *m, int srcrt)
code = ICMP_REDIRECT_HOST;
}
}
- if (rt)
- RTFREE(rt);
}
- /*
- * Try to cache the route MTU from ip_output so we can consider it for
- * the ICMP_UNREACH_NEEDFRAG "Next-Hop MTU" field described in RFC1191.
- */
- bzero(&ro, sizeof(ro));
-
error = ip_output(m, NULL, &ro, IP_FORWARDING, NULL, NULL);
if (error == EMSGSIZE && ro.ro_rt)
- mtu = ro.ro_rt->rt_rmx.rmx_mtu;
+ mtu = ro.ro_rt->rt_mtu;
RO_RTFREE(&ro);
if (error)
@@ -1560,31 +1131,12 @@ ip_forward(struct mbuf *m, int srcrt)
if (ia != NULL)
mtu = ia->ia_ifp->if_mtu;
else
- mtu = ip_next_mtu(ip->ip_len, 0);
+ mtu = ip_next_mtu(ntohs(ip->ip_len), 0);
}
IPSTAT_INC(ips_cantfrag);
break;
case ENOBUFS:
- /*
- * A router should not generate ICMP_SOURCEQUENCH as
- * required in RFC1812 Requirements for IP Version 4 Routers.
- * Source quench could be a big problem under DoS attacks,
- * or if the underlying interface is rate-limited.
- * Those who need source quench packets may re-enable them
- * via the net.inet.ip.sendsourcequench sysctl.
- */
- if (V_ip_sendsourcequench == 0) {
- m_freem(mcopy);
- if (ia != NULL)
- ifa_free(&ia->ia_ifa);
- return;
- } else {
- type = ICMP_SOURCEQUENCH;
- code = 0;
- }
- break;
-
case EACCES: /* ipfw denied packet */
m_freem(mcopy);
if (ia != NULL)
@@ -1606,8 +1158,8 @@ ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip,
bintime(&bt);
if (inp->inp_socket->so_options & SO_BINTIME) {
- *mp = sbcreatecontrol((caddr_t) &bt, sizeof(bt),
- SCM_BINTIME, SOL_SOCKET);
+ *mp = sbcreatecontrol((caddr_t)&bt, sizeof(bt),
+ SCM_BINTIME, SOL_SOCKET);
if (*mp)
mp = &(*mp)->m_next;
}
@@ -1615,20 +1167,20 @@ ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip,
struct timeval tv;
bintime2timeval(&bt, &tv);
- *mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv),
- SCM_TIMESTAMP, SOL_SOCKET);
+ *mp = sbcreatecontrol((caddr_t)&tv, sizeof(tv),
+ SCM_TIMESTAMP, SOL_SOCKET);
if (*mp)
mp = &(*mp)->m_next;
}
}
if (inp->inp_flags & INP_RECVDSTADDR) {
- *mp = sbcreatecontrol((caddr_t) &ip->ip_dst,
+ *mp = sbcreatecontrol((caddr_t)&ip->ip_dst,
sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP);
if (*mp)
mp = &(*mp)->m_next;
}
if (inp->inp_flags & INP_RECVTTL) {
- *mp = sbcreatecontrol((caddr_t) &ip->ip_ttl,
+ *mp = sbcreatecontrol((caddr_t)&ip->ip_ttl,
sizeof(u_char), IP_RECVTTL, IPPROTO_IP);
if (*mp)
mp = &(*mp)->m_next;
@@ -1640,14 +1192,14 @@ ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip,
*/
/* options were tossed already */
if (inp->inp_flags & INP_RECVOPTS) {
- *mp = sbcreatecontrol((caddr_t) opts_deleted_above,
+ *mp = sbcreatecontrol((caddr_t)opts_deleted_above,
sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP);
if (*mp)
mp = &(*mp)->m_next;
}
/* ip_srcroute doesn't do what we want here, need to fix */
if (inp->inp_flags & INP_RECVRETOPTS) {
- *mp = sbcreatecontrol((caddr_t) ip_srcroute(m),
+ *mp = sbcreatecontrol((caddr_t)ip_srcroute(m),
sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP);
if (*mp)
mp = &(*mp)->m_next;
@@ -1662,36 +1214,73 @@ ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip,
struct sockaddr_dl *sdp;
struct sockaddr_dl *sdl2 = &sdlbuf.sdl;
- if (((ifp = m->m_pkthdr.rcvif))
- && ( ifp->if_index && (ifp->if_index <= V_if_index))) {
+ if ((ifp = m->m_pkthdr.rcvif) &&
+ ifp->if_index && ifp->if_index <= V_if_index) {
sdp = (struct sockaddr_dl *)ifp->if_addr->ifa_addr;
/*
* Change our mind and don't try copy.
*/
- if ((sdp->sdl_family != AF_LINK)
- || (sdp->sdl_len > sizeof(sdlbuf))) {
+ if (sdp->sdl_family != AF_LINK ||
+ sdp->sdl_len > sizeof(sdlbuf)) {
goto makedummy;
}
bcopy(sdp, sdl2, sdp->sdl_len);
} else {
makedummy:
- sdl2->sdl_len
- = offsetof(struct sockaddr_dl, sdl_data[0]);
+ sdl2->sdl_len =
+ offsetof(struct sockaddr_dl, sdl_data[0]);
sdl2->sdl_family = AF_LINK;
sdl2->sdl_index = 0;
sdl2->sdl_nlen = sdl2->sdl_alen = sdl2->sdl_slen = 0;
}
- *mp = sbcreatecontrol((caddr_t) sdl2, sdl2->sdl_len,
- IP_RECVIF, IPPROTO_IP);
+ *mp = sbcreatecontrol((caddr_t)sdl2, sdl2->sdl_len,
+ IP_RECVIF, IPPROTO_IP);
if (*mp)
mp = &(*mp)->m_next;
}
if (inp->inp_flags & INP_RECVTOS) {
- *mp = sbcreatecontrol((caddr_t) &ip->ip_tos,
+ *mp = sbcreatecontrol((caddr_t)&ip->ip_tos,
sizeof(u_char), IP_RECVTOS, IPPROTO_IP);
if (*mp)
mp = &(*mp)->m_next;
}
+
+ if (inp->inp_flags2 & INP_RECVFLOWID) {
+ uint32_t flowid, flow_type;
+
+ flowid = m->m_pkthdr.flowid;
+ flow_type = M_HASHTYPE_GET(m);
+
+ /*
+ * XXX should handle the failure of one or the
+ * other - don't populate both?
+ */
+ *mp = sbcreatecontrol((caddr_t) &flowid,
+ sizeof(uint32_t), IP_FLOWID, IPPROTO_IP);
+ if (*mp)
+ mp = &(*mp)->m_next;
+ *mp = sbcreatecontrol((caddr_t) &flow_type,
+ sizeof(uint32_t), IP_FLOWTYPE, IPPROTO_IP);
+ if (*mp)
+ mp = &(*mp)->m_next;
+ }
+
+#ifdef RSS
+ if (inp->inp_flags2 & INP_RECVRSSBUCKETID) {
+ uint32_t flowid, flow_type;
+ uint32_t rss_bucketid;
+
+ flowid = m->m_pkthdr.flowid;
+ flow_type = M_HASHTYPE_GET(m);
+
+ if (rss_hash2bucket(flowid, flow_type, &rss_bucketid) == 0) {
+ *mp = sbcreatecontrol((caddr_t) &rss_bucketid,
+ sizeof(uint32_t), IP_RSSBUCKETID, IPPROTO_IP);
+ if (*mp)
+ mp = &(*mp)->m_next;
+ }
+ }
+#endif
}
/*
@@ -1745,13 +1334,18 @@ ip_rsvp_done(void)
return 0;
}
-void
-rsvp_input(struct mbuf *m, int off) /* XXX must fixup manually */
+int
+rsvp_input(struct mbuf **mp, int *offp, int proto)
{
+ struct mbuf *m;
+
+ m = *mp;
+ *mp = NULL;
if (rsvp_input_p) { /* call the real one if loaded */
- rsvp_input_p(m, off);
- return;
+ *mp = m;
+ rsvp_input_p(mp, offp, proto);
+ return (IPPROTO_DONE);
}
/* Can still get packets with rsvp_on = 0 if there is a local member
@@ -1761,13 +1355,15 @@ rsvp_input(struct mbuf *m, int off) /* XXX must fixup manually */
if (!V_rsvp_on) {
m_freem(m);
- return;
+ return (IPPROTO_DONE);
}
if (V_ip_rsvpd != NULL) {
- rip_input(m, off);
- return;
+ *mp = m;
+ rip_input(mp, offp, proto);
+ return (IPPROTO_DONE);
}
/* Drop the packet */
m_freem(m);
+ return (IPPROTO_DONE);
}
diff --git a/freebsd/sys/netinet/ip_ipsec.h b/freebsd/sys/netinet/ip_ipsec.h
index 2870c114..f499b740 100644
--- a/freebsd/sys/netinet/ip_ipsec.h
+++ b/freebsd/sys/netinet/ip_ipsec.h
@@ -34,7 +34,7 @@
int ip_ipsec_filtertunnel(struct mbuf *);
int ip_ipsec_fwd(struct mbuf *);
-int ip_ipsec_input(struct mbuf *);
+int ip_ipsec_input(struct mbuf *, int);
int ip_ipsec_mtu(struct mbuf *, int);
-int ip_ipsec_output(struct mbuf **, struct inpcb *, int *, int *);
+int ip_ipsec_output(struct mbuf **, struct inpcb *, int *);
#endif
diff --git a/freebsd/sys/netinet/ip_mroute.c b/freebsd/sys/netinet/ip_mroute.c
index f4aeed24..f8b14735 100644
--- a/freebsd/sys/netinet/ip_mroute.c
+++ b/freebsd/sys/netinet/ip_mroute.c
@@ -79,6 +79,7 @@ __FBSDID("$FreeBSD$");
#include <rtems/bsd/sys/param.h>
#include <sys/kernel.h>
#include <sys/stddef.h>
+#include <sys/eventhandler.h>
#include <rtems/bsd/sys/lock.h>
#include <sys/ktr.h>
#include <sys/malloc.h>
@@ -95,8 +96,10 @@ __FBSDID("$FreeBSD$");
#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/time.h>
+#include <sys/counter.h>
#include <net/if.h>
+#include <net/if_var.h>
#include <net/netisr.h>
#include <net/route.h>
#include <net/vnet.h>
@@ -121,7 +124,6 @@ __FBSDID("$FreeBSD$");
#endif
#define VIFI_INVALID ((vifi_t) -1)
-#define M_HASCL(m) ((m)->m_flags & M_EXT)
static VNET_DEFINE(uint32_t, last_tv_sec); /* last time we processed this */
#define V_last_tv_sec VNET(last_tv_sec)
@@ -147,11 +149,11 @@ static struct mtx mrouter_mtx;
static int ip_mrouter_cnt; /* # of vnets with active mrouters */
static int ip_mrouter_unloading; /* Allow no more V_ip_mrouter sockets */
-static VNET_DEFINE(struct mrtstat, mrtstat);
-#define V_mrtstat VNET(mrtstat)
-SYSCTL_VNET_STRUCT(_net_inet_ip, OID_AUTO, mrtstat, CTLFLAG_RW,
- &VNET_NAME(mrtstat), mrtstat,
- "IPv4 Multicast Forwarding Statistics (struct mrtstat, "
+static VNET_PCPUSTAT_DEFINE(struct mrtstat, mrtstat);
+VNET_PCPUSTAT_SYSINIT(mrtstat);
+VNET_PCPUSTAT_SYSUNINIT(mrtstat);
+SYSCTL_VNET_PCPUSTAT(_net_inet_ip, OID_AUTO, mrtstat, struct mrtstat,
+ mrtstat, "IPv4 Multicast Forwarding Statistics (struct mrtstat, "
"netinet/ip_mroute.h)");
static VNET_DEFINE(u_long, mfchash);
@@ -179,7 +181,7 @@ static VNET_DEFINE(vifi_t, numvifs);
#define V_numvifs VNET(numvifs)
static VNET_DEFINE(struct vif, viftable[MAXVIFS]);
#define V_viftable VNET(viftable)
-SYSCTL_VNET_OPAQUE(_net_inet_ip, OID_AUTO, viftable, CTLFLAG_RD,
+SYSCTL_OPAQUE(_net_inet_ip, OID_AUTO, viftable, CTLFLAG_VNET | CTLFLAG_RD,
&VNET_NAME(viftable), sizeof(V_viftable), "S,vif[MAXVIFS]",
"IPv4 Multicast Interfaces (struct vif[MAXVIFS], netinet/ip_mroute.h)");
@@ -227,13 +229,13 @@ static VNET_DEFINE(struct callout, bw_upcalls_ch);
#define BW_UPCALLS_PERIOD (hz) /* periodical flush of bw upcalls */
-static VNET_DEFINE(struct pimstat, pimstat);
-#define V_pimstat VNET(pimstat)
+static VNET_PCPUSTAT_DEFINE(struct pimstat, pimstat);
+VNET_PCPUSTAT_SYSINIT(pimstat);
+VNET_PCPUSTAT_SYSUNINIT(pimstat);
SYSCTL_NODE(_net_inet, IPPROTO_PIM, pim, CTLFLAG_RW, 0, "PIM");
-SYSCTL_VNET_STRUCT(_net_inet_pim, PIMCTL_STATS, stats, CTLFLAG_RD,
- &VNET_NAME(pimstat), pimstat,
- "PIM Statistics (struct pimstat, netinet/pim_var.h)");
+SYSCTL_VNET_PCPUSTAT(_net_inet_pim, PIMCTL_STATS, stats, struct pimstat,
+ pimstat, "PIM Statistics (struct pimstat, netinet/pim_var.h)");
static u_long pim_squelch_wholepkt = 0;
SYSCTL_ULONG(_net_inet_pim, OID_AUTO, squelch_wholepkt, CTLFLAG_RW,
@@ -247,7 +249,7 @@ static const struct protosw in_pim_protosw = {
.pr_protocol = IPPROTO_PIM,
.pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR,
.pr_input = pim_input,
- .pr_output = (pr_output_t*)rip_output,
+ .pr_output = rip_output,
.pr_ctloutput = rip_ctloutput,
.pr_usrreqs = &rip_usrreqs
};
@@ -538,7 +540,7 @@ X_mrt_ioctl(u_long cmd, caddr_t data, int fibnum __unused)
int error = 0;
/*
- * Currently the only function calling this ioctl routine is rtioctl().
+ * Currently the only function calling this ioctl routine is rtioctl_fib().
* Typically, only root can create the raw socket in order to execute
* this ioctl method, however the request might be coming from a prison
*/
@@ -635,8 +637,8 @@ if_detached_event(void *arg __unused, struct ifnet *ifp)
continue;
for (i = 0; i < mfchashsize; i++) {
struct mfc *rt, *nrt;
- for (rt = LIST_FIRST(&V_mfchashtbl[i]); rt; rt = nrt) {
- nrt = LIST_NEXT(rt, mfc_hash);
+
+ LIST_FOREACH_SAFE(rt, &V_mfchashtbl[i], mfc_hash, nrt) {
if (rt->mfc_parent == vifi) {
expire_mfc(rt);
}
@@ -754,8 +756,8 @@ X_ip_mrouter_done(void)
*/
for (i = 0; i < mfchashsize; i++) {
struct mfc *rt, *nrt;
- for (rt = LIST_FIRST(&V_mfchashtbl[i]); rt; rt = nrt) {
- nrt = LIST_NEXT(rt, mfc_hash);
+
+ LIST_FOREACH_SAFE(rt, &V_mfchashtbl[i], mfc_hash, nrt) {
expire_mfc(rt);
}
}
@@ -1303,8 +1305,8 @@ X_ip_mforward(struct ip *ip, struct ifnet *ifp, struct mbuf *m,
return ENOBUFS;
}
- mb0 = m_copypacket(m, M_DONTWAIT);
- if (mb0 && (M_HASCL(mb0) || mb0->m_len < hlen))
+ mb0 = m_copypacket(m, M_NOWAIT);
+ if (mb0 && (!M_WRITABLE(mb0) || mb0->m_len < hlen))
mb0 = m_pullup(mb0, hlen);
if (mb0 == NULL) {
free(rte, M_MRTABLE);
@@ -1446,9 +1448,7 @@ expire_upcalls(void *arg)
if (V_nexpire[i] == 0)
continue;
- for (rt = LIST_FIRST(&V_mfchashtbl[i]); rt; rt = nrt) {
- nrt = LIST_NEXT(rt, mfc_hash);
-
+ LIST_FOREACH_SAFE(rt, &V_mfchashtbl[i], mfc_hash, nrt) {
if (TAILQ_EMPTY(&rt->mfc_stall))
continue;
@@ -1490,7 +1490,7 @@ ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt, vifi_t xmt_vif)
{
struct ip *ip = mtod(m, struct ip *);
vifi_t vifi;
- int plen = ip->ip_len;
+ int plen = ntohs(ip->ip_len);
VIF_LOCK_ASSERT();
@@ -1546,7 +1546,7 @@ ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt, vifi_t xmt_vif)
int hlen = ip->ip_hl << 2;
struct mbuf *mm = m_copy(m, 0, hlen);
- if (mm && (M_HASCL(mm) || mm->m_len < hlen))
+ if (mm && (!M_WRITABLE(mm) || mm->m_len < hlen))
mm = m_pullup(mm, hlen);
if (mm == NULL)
return ENOBUFS;
@@ -1666,8 +1666,8 @@ phyint_send(struct ip *ip, struct vif *vifp, struct mbuf *m)
* the IP header is actually copied, not just referenced,
* so that ip_output() only scribbles on the copy.
*/
- mb_copy = m_copypacket(m, M_DONTWAIT);
- if (mb_copy && (M_HASCL(mb_copy) || mb_copy->m_len < hlen))
+ mb_copy = m_copypacket(m, M_NOWAIT);
+ if (mb_copy && (!M_WRITABLE(mb_copy) || mb_copy->m_len < hlen))
mb_copy = m_pullup(mb_copy, hlen);
if (mb_copy == NULL)
return;
@@ -1720,12 +1720,16 @@ X_ip_rsvp_force_done(struct socket *so __unused)
}
-static void
-X_rsvp_input(struct mbuf *m, int off __unused)
+static int
+X_rsvp_input(struct mbuf **mp, int *offp, int proto)
{
+ struct mbuf *m;
+ m = *mp;
+ *mp = NULL;
if (!V_rsvp_on)
m_freem(m);
+ return (IPPROTO_DONE);
}
/*
@@ -2080,13 +2084,12 @@ bw_upcalls_send(void)
* Allocate a new mbuf, initialize it with the header and
* the payload for the pending calls.
*/
- MGETHDR(m, M_DONTWAIT, MT_DATA);
+ m = m_gethdr(M_NOWAIT, MT_DATA);
if (m == NULL) {
log(LOG_WARNING, "bw_upcalls_send: cannot allocate mbuf\n");
return;
}
- m->m_len = m->m_pkthdr.len = 0;
m_copyback(m, 0, sizeof(struct igmpmsg), (caddr_t)&igmpmsg);
m_copyback(m, sizeof(struct igmpmsg), len, (caddr_t)&V_bw_upcalls[0]);
@@ -2381,7 +2384,7 @@ pim_register_prepare(struct ip *ip, struct mbuf *m)
* Copy the old packet & pullup its IP header into the
* new mbuf so we can modify it.
*/
- mb_copy = m_copypacket(m, M_DONTWAIT);
+ mb_copy = m_copypacket(m, M_NOWAIT);
if (mb_copy == NULL)
return NULL;
mb_copy = m_pullup(mb_copy, ip->ip_hl << 2);
@@ -2395,15 +2398,14 @@ pim_register_prepare(struct ip *ip, struct mbuf *m)
/* Compute the MTU after the PIM Register encapsulation */
mtu = 0xffff - sizeof(pim_encap_iphdr) - sizeof(pim_encap_pimhdr);
- if (ip->ip_len <= mtu) {
+ if (ntohs(ip->ip_len) <= mtu) {
/* Turn the IP header into a valid one */
- ip->ip_len = htons(ip->ip_len);
- ip->ip_off = htons(ip->ip_off);
ip->ip_sum = 0;
ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2);
} else {
/* Fragment the packet */
- if (ip_fragment(ip, &mb_copy, mtu, 0, CSUM_DELAY_IP) != 0) {
+ mb_copy->m_pkthdr.csum_flags |= CSUM_IP;
+ if (ip_fragment(ip, &mb_copy, mtu, 0) != 0) {
m_freem(mb_copy);
return NULL;
}
@@ -2428,7 +2430,7 @@ pim_register_send_upcall(struct ip *ip, struct vif *vifp,
/*
* Add a new mbuf with an upcall header
*/
- MGETHDR(mb_first, M_DONTWAIT, MT_DATA);
+ mb_first = m_gethdr(M_NOWAIT, MT_DATA);
if (mb_first == NULL) {
m_freem(mb_copy);
return ENOBUFS;
@@ -2486,7 +2488,7 @@ pim_register_send_rp(struct ip *ip, struct vif *vifp, struct mbuf *mb_copy,
/*
* Add a new mbuf with the encapsulating header
*/
- MGETHDR(mb_first, M_DONTWAIT, MT_DATA);
+ mb_first = m_gethdr(M_NOWAIT, MT_DATA);
if (mb_first == NULL) {
m_freem(mb_copy);
return ENOBUFS;
@@ -2502,8 +2504,8 @@ pim_register_send_rp(struct ip *ip, struct vif *vifp, struct mbuf *mb_copy,
*/
ip_outer = mtod(mb_first, struct ip *);
*ip_outer = pim_encap_iphdr;
- ip_outer->ip_id = ip_newid();
- ip_outer->ip_len = len + sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr);
+ ip_outer->ip_len = htons(len + sizeof(pim_encap_iphdr) +
+ sizeof(pim_encap_pimhdr));
ip_outer->ip_src = V_viftable[vifi].v_lcl_addr;
ip_outer->ip_dst = rt->mfc_rp;
/*
@@ -2511,8 +2513,9 @@ pim_register_send_rp(struct ip *ip, struct vif *vifp, struct mbuf *mb_copy,
* IP_DF bit.
*/
ip_outer->ip_tos = ip->ip_tos;
- if (ntohs(ip->ip_off) & IP_DF)
- ip_outer->ip_off |= IP_DF;
+ if (ip->ip_off & htons(IP_DF))
+ ip_outer->ip_off |= htons(IP_DF);
+ ip_fillid(ip_outer);
pimhdr = (struct pim_encap_pimhdr *)((caddr_t)ip_outer
+ sizeof(pim_encap_iphdr));
*pimhdr = pim_encap_pimhdr;
@@ -2559,15 +2562,18 @@ pim_encapcheck(const struct mbuf *m, int off, int proto, void *arg)
* (used by PIM-SM): the PIM header is stripped off, and the inner packet
* is passed to if_simloop().
*/
-void
-pim_input(struct mbuf *m, int off)
+int
+pim_input(struct mbuf **mp, int *offp, int proto)
{
+ struct mbuf *m = *mp;
struct ip *ip = mtod(m, struct ip *);
struct pim *pim;
+ int iphlen = *offp;
int minlen;
- int datalen = ip->ip_len;
+ int datalen = ntohs(ip->ip_len) - iphlen;
int ip_tos;
- int iphlen = off;
+
+ *mp = NULL;
/* Keep statistics */
PIMSTAT_INC(pims_rcv_total_msgs);
@@ -2581,7 +2587,7 @@ pim_input(struct mbuf *m, int off)
CTR3(KTR_IPMF, "%s: short packet (%d) from %s",
__func__, datalen, inet_ntoa(ip->ip_src));
m_freem(m);
- return;
+ return (IPPROTO_DONE);
}
/*
@@ -2597,10 +2603,9 @@ pim_input(struct mbuf *m, int off)
* Get the IP and PIM headers in contiguous memory, and
* possibly the PIM REGISTER header.
*/
- if ((m->m_flags & M_EXT || m->m_len < minlen) &&
- (m = m_pullup(m, minlen)) == 0) {
+ if (m->m_len < minlen && (m = m_pullup(m, minlen)) == NULL) {
CTR1(KTR_IPMF, "%s: m_pullup() failed", __func__);
- return;
+ return (IPPROTO_DONE);
}
/* m_pullup() may have given us a new mbuf so reset ip. */
@@ -2625,7 +2630,7 @@ pim_input(struct mbuf *m, int off)
PIMSTAT_INC(pims_rcv_badsum);
CTR1(KTR_IPMF, "%s: invalid checksum", __func__);
m_freem(m);
- return;
+ return (IPPROTO_DONE);
}
/* PIM version check */
@@ -2634,7 +2639,7 @@ pim_input(struct mbuf *m, int off)
CTR3(KTR_IPMF, "%s: bad version %d expect %d", __func__,
(int)PIM_VT_V(pim->pim_vt), PIM_VERSION);
m_freem(m);
- return;
+ return (IPPROTO_DONE);
}
/* restore mbuf back to the outer IP */
@@ -2659,7 +2664,7 @@ pim_input(struct mbuf *m, int off)
CTR2(KTR_IPMF, "%s: register vif not set: %d", __func__,
(int)V_reg_vif_num);
m_freem(m);
- return;
+ return (IPPROTO_DONE);
}
/* XXX need refcnt? */
vifp = V_viftable[V_reg_vif_num].v_ifp;
@@ -2673,7 +2678,7 @@ pim_input(struct mbuf *m, int off)
PIMSTAT_INC(pims_rcv_badregisters);
CTR1(KTR_IPMF, "%s: register packet size too small", __func__);
m_freem(m);
- return;
+ return (IPPROTO_DONE);
}
reghdr = (u_int32_t *)(pim + 1);
@@ -2687,7 +2692,7 @@ pim_input(struct mbuf *m, int off)
PIMSTAT_INC(pims_rcv_badregisters);
CTR1(KTR_IPMF, "%s: bad encap ip version", __func__);
m_freem(m);
- return;
+ return (IPPROTO_DONE);
}
/* verify the inner packet is destined to a mcast group */
@@ -2696,7 +2701,7 @@ pim_input(struct mbuf *m, int off)
CTR2(KTR_IPMF, "%s: bad encap ip dest %s", __func__,
inet_ntoa(encap_ip->ip_dst));
m_freem(m);
- return;
+ return (IPPROTO_DONE);
}
/* If a NULL_REGISTER, pass it to the daemon */
@@ -2735,7 +2740,7 @@ pim_input(struct mbuf *m, int off)
if (mcp == NULL) {
CTR1(KTR_IPMF, "%s: m_copy() failed", __func__);
m_freem(m);
- return;
+ return (IPPROTO_DONE);
}
/* Keep statistics */
@@ -2771,9 +2776,10 @@ pim_input_to_daemon:
* XXX: the outer IP header pkt size of a Register is not adjust to
* reflect the fact that the inner multicast data is truncated.
*/
- rip_input(m, iphlen);
+ *mp = m;
+ rip_input(mp, offp, proto);
- return;
+ return (IPPROTO_DONE);
}
static int
@@ -2813,12 +2819,12 @@ vnet_mroute_init(const void *unused __unused)
MALLOC(V_nexpire, u_char *, mfchashsize, M_MRTABLE, M_WAITOK|M_ZERO);
bzero(V_bw_meter_timers, sizeof(V_bw_meter_timers));
- callout_init(&V_expire_upcalls_ch, CALLOUT_MPSAFE);
- callout_init(&V_bw_upcalls_ch, CALLOUT_MPSAFE);
- callout_init(&V_bw_meter_ch, CALLOUT_MPSAFE);
+ callout_init(&V_expire_upcalls_ch, 1);
+ callout_init(&V_bw_upcalls_ch, 1);
+ callout_init(&V_bw_meter_ch, 1);
}
-VNET_SYSINIT(vnet_mroute_init, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_mroute_init,
+VNET_SYSINIT(vnet_mroute_init, SI_SUB_PROTO_MC, SI_ORDER_ANY, vnet_mroute_init,
NULL);
static void
@@ -2829,7 +2835,7 @@ vnet_mroute_uninit(const void *unused __unused)
V_nexpire = NULL;
}
-VNET_SYSUNINIT(vnet_mroute_uninit, SI_SUB_PSEUDO, SI_ORDER_MIDDLE,
+VNET_SYSUNINIT(vnet_mroute_uninit, SI_SUB_PROTO_MC, SI_ORDER_MIDDLE,
vnet_mroute_uninit, NULL);
static int
@@ -2944,4 +2950,4 @@ static moduledata_t ip_mroutemod = {
0
};
-DECLARE_MODULE(ip_mroute, ip_mroutemod, SI_SUB_PSEUDO, SI_ORDER_MIDDLE);
+DECLARE_MODULE(ip_mroute, ip_mroutemod, SI_SUB_PROTO_MC, SI_ORDER_MIDDLE);
diff --git a/freebsd/sys/netinet/ip_mroute.h b/freebsd/sys/netinet/ip_mroute.h
index e945b92c..65f7d83c 100644
--- a/freebsd/sys/netinet/ip_mroute.h
+++ b/freebsd/sys/netinet/ip_mroute.h
@@ -206,23 +206,24 @@ struct bw_upcall {
* The kernel's multicast routing statistics.
*/
struct mrtstat {
- u_long mrts_mfc_lookups; /* # forw. cache hash table hits */
- u_long mrts_mfc_misses; /* # forw. cache hash table misses */
- u_long mrts_upcalls; /* # calls to multicast routing daemon */
- u_long mrts_no_route; /* no route for packet's origin */
- u_long mrts_bad_tunnel; /* malformed tunnel options */
- u_long mrts_cant_tunnel; /* no room for tunnel options */
- u_long mrts_wrong_if; /* arrived on wrong interface */
- u_long mrts_upq_ovflw; /* upcall Q overflow */
- u_long mrts_cache_cleanups; /* # entries with no upcalls */
- u_long mrts_drop_sel; /* pkts dropped selectively */
- u_long mrts_q_overflow; /* pkts dropped - Q overflow */
- u_long mrts_pkt2large; /* pkts dropped - size > BKT SIZE */
- u_long mrts_upq_sockfull; /* upcalls dropped - socket full */
+ uint64_t mrts_mfc_lookups; /* # forw. cache hash table hits */
+ uint64_t mrts_mfc_misses; /* # forw. cache hash table misses */
+ uint64_t mrts_upcalls; /* # calls to multicast routing daemon */
+ uint64_t mrts_no_route; /* no route for packet's origin */
+ uint64_t mrts_bad_tunnel; /* malformed tunnel options */
+ uint64_t mrts_cant_tunnel; /* no room for tunnel options */
+ uint64_t mrts_wrong_if; /* arrived on wrong interface */
+ uint64_t mrts_upq_ovflw; /* upcall Q overflow */
+ uint64_t mrts_cache_cleanups; /* # entries with no upcalls */
+ uint64_t mrts_drop_sel; /* pkts dropped selectively */
+ uint64_t mrts_q_overflow; /* pkts dropped - Q overflow */
+ uint64_t mrts_pkt2large; /* pkts dropped - size > BKT SIZE */
+ uint64_t mrts_upq_sockfull; /* upcalls dropped - socket full */
};
#ifdef _KERNEL
-#define MRTSTAT_ADD(name, val) V_mrtstat.name += (val)
+#define MRTSTAT_ADD(name, val) \
+ VNET_PCPUSTAT_ADD(struct mrtstat, mrtstat, name, (val))
#define MRTSTAT_INC(name) MRTSTAT_ADD(name, 1)
#endif
diff --git a/freebsd/sys/netinet/ip_options.c b/freebsd/sys/netinet/ip_options.c
index 6431aaa1..134479c9 100644
--- a/freebsd/sys/netinet/ip_options.c
+++ b/freebsd/sys/netinet/ip_options.c
@@ -56,6 +56,7 @@ __FBSDID("$FreeBSD$");
#include <net/vnet.h>
#include <netinet/in.h>
+#include <netinet/in_fib.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
@@ -67,18 +68,21 @@ __FBSDID("$FreeBSD$");
#include <sys/socketvar.h>
-static int ip_dosourceroute = 0;
-SYSCTL_INT(_net_inet_ip, IPCTL_SOURCEROUTE, sourceroute, CTLFLAG_RW,
- &ip_dosourceroute, 0, "Enable forwarding source routed IP packets");
+static VNET_DEFINE(int, ip_dosourceroute);
+SYSCTL_INT(_net_inet_ip, IPCTL_SOURCEROUTE, sourceroute,
+ CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_dosourceroute), 0,
+ "Enable forwarding source routed IP packets");
+#define V_ip_dosourceroute VNET(ip_dosourceroute)
-static int ip_acceptsourceroute = 0;
+static VNET_DEFINE(int, ip_acceptsourceroute);
SYSCTL_INT(_net_inet_ip, IPCTL_ACCEPTSOURCEROUTE, accept_sourceroute,
- CTLFLAG_RW, &ip_acceptsourceroute, 0,
+ CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_acceptsourceroute), 0,
"Enable accepting source routed IP packets");
+#define V_ip_acceptsourceroute VNET(ip_acceptsourceroute)
-int ip_doopts = 1; /* 0 = ignore, 1 = process, 2 = reject */
-SYSCTL_INT(_net_inet_ip, OID_AUTO, process_options, CTLFLAG_RW,
- &ip_doopts, 0, "Enable IP options processing ([LS]SRR, RR, TS)");
+VNET_DEFINE(int, ip_doopts) = 1; /* 0 = ignore, 1 = process, 2 = reject */
+SYSCTL_INT(_net_inet_ip, OID_AUTO, process_options, CTLFLAG_VNET | CTLFLAG_RW,
+ &VNET_NAME(ip_doopts), 0, "Enable IP options processing ([LS]SRR, RR, TS)");
static void save_rte(struct mbuf *m, u_char *, struct in_addr);
@@ -103,12 +107,13 @@ ip_dooptions(struct mbuf *m, int pass)
int opt, optlen, cnt, off, code, type = ICMP_PARAMPROB, forward = 0;
struct in_addr *sin, dst;
uint32_t ntime;
+ struct nhop4_extended nh_ext;
struct sockaddr_in ipaddr = { sizeof(ipaddr), AF_INET };
/* Ignore or reject packets with IP options. */
- if (ip_doopts == 0)
+ if (V_ip_doopts == 0)
return 0;
- else if (ip_doopts == 2) {
+ else if (V_ip_doopts == 2) {
type = ICMP_UNREACH;
code = ICMP_UNREACH_FILTER_PROHIB;
goto bad;
@@ -169,7 +174,7 @@ ip_dooptions(struct mbuf *m, int pass)
code = ICMP_UNREACH_SRCFAIL;
goto bad;
}
- if (!ip_dosourceroute)
+ if (!V_ip_dosourceroute)
goto nosourcerouting;
/*
* Loose routing, and not at next destination
@@ -182,7 +187,7 @@ ip_dooptions(struct mbuf *m, int pass)
/*
* End of source route. Should be for us.
*/
- if (!ip_acceptsourceroute)
+ if (!V_ip_acceptsourceroute)
goto nosourcerouting;
save_rte(m, cp, ip->ip_src);
break;
@@ -191,7 +196,7 @@ ip_dooptions(struct mbuf *m, int pass)
if (V_ipstealth)
goto dropit;
#endif
- if (!ip_dosourceroute) {
+ if (!V_ip_dosourceroute) {
if (V_ipforwarding) {
char buf[16]; /* aaa.bbb.ccc.ddd\0 */
/*
@@ -226,23 +231,34 @@ dropit:
(void)memcpy(&ipaddr.sin_addr, cp + off,
sizeof(ipaddr.sin_addr));
+ type = ICMP_UNREACH;
+ code = ICMP_UNREACH_SRCFAIL;
+
if (opt == IPOPT_SSRR) {
#define INA struct in_ifaddr *
#define SA struct sockaddr *
- if ((ia = (INA)ifa_ifwithdstaddr((SA)&ipaddr)) == NULL)
- ia = (INA)ifa_ifwithnet((SA)&ipaddr, 0);
- } else
-/* XXX MRT 0 for routing */
- ia = ip_rtaddr(ipaddr.sin_addr, M_GETFIB(m));
- if (ia == NULL) {
- type = ICMP_UNREACH;
- code = ICMP_UNREACH_SRCFAIL;
- goto bad;
+ ia = (INA)ifa_ifwithdstaddr((SA)&ipaddr,
+ RT_ALL_FIBS);
+ if (ia == NULL)
+ ia = (INA)ifa_ifwithnet((SA)&ipaddr, 0,
+ RT_ALL_FIBS);
+ if (ia == NULL)
+ goto bad;
+
+ memcpy(cp + off, &(IA_SIN(ia)->sin_addr),
+ sizeof(struct in_addr));
+ ifa_free(&ia->ia_ifa);
+ } else {
+ /* XXX MRT 0 for routing */
+ if (fib4_lookup_nh_ext(M_GETFIB(m),
+ ipaddr.sin_addr, 0, 0, &nh_ext) != 0)
+ goto bad;
+
+ memcpy(cp + off, &nh_ext.nh_src,
+ sizeof(struct in_addr));
}
+
ip->ip_dst = ipaddr.sin_addr;
- (void)memcpy(cp + off, &(IA_SIN(ia)->sin_addr),
- sizeof(struct in_addr));
- ifa_free(&ia->ia_ifa);
cp[IPOPT_OFFSET] += sizeof(struct in_addr);
/*
* Let ip_intr's mcast routing check handle mcast pkts
@@ -276,15 +292,19 @@ dropit:
* destination, use the incoming interface (should be
* same).
*/
- if ((ia = (INA)ifa_ifwithaddr((SA)&ipaddr)) == NULL &&
- (ia = ip_rtaddr(ipaddr.sin_addr, M_GETFIB(m))) == NULL) {
+ if ((ia = (INA)ifa_ifwithaddr((SA)&ipaddr)) != NULL) {
+ memcpy(cp + off, &(IA_SIN(ia)->sin_addr),
+ sizeof(struct in_addr));
+ ifa_free(&ia->ia_ifa);
+ } else if (fib4_lookup_nh_ext(M_GETFIB(m),
+ ipaddr.sin_addr, 0, 0, &nh_ext) == 0) {
+ memcpy(cp + off, &nh_ext.nh_src,
+ sizeof(struct in_addr));
+ } else {
type = ICMP_UNREACH;
code = ICMP_UNREACH_HOST;
goto bad;
}
- (void)memcpy(cp + off, &(IA_SIN(ia)->sin_addr),
- sizeof(struct in_addr));
- ifa_free(&ia->ia_ifa);
cp[IPOPT_OFFSET] += sizeof(struct in_addr);
break;
@@ -413,7 +433,7 @@ ip_srcroute(struct mbuf *m0)
if (opts->ip_nhops == 0)
return (NULL);
- m = m_get(M_DONTWAIT, MT_DATA);
+ m = m_get(M_NOWAIT, MT_DATA);
if (m == NULL)
return (NULL);
@@ -455,29 +475,23 @@ ip_srcroute(struct mbuf *m0)
}
/*
- * Strip out IP options, at higher level protocol in the kernel. Second
- * argument is buffer to which options will be moved, and return value is
- * their length.
- *
- * XXX should be deleted; last arg currently ignored.
+ * Strip out IP options, at higher level protocol in the kernel.
*/
void
-ip_stripoptions(struct mbuf *m, struct mbuf *mopt)
+ip_stripoptions(struct mbuf *m)
{
- int i;
struct ip *ip = mtod(m, struct ip *);
- caddr_t opts;
int olen;
- olen = (ip->ip_hl << 2) - sizeof (struct ip);
- opts = (caddr_t)(ip + 1);
- i = m->m_len - (sizeof (struct ip) + olen);
- bcopy(opts + olen, opts, (unsigned)i);
+ olen = (ip->ip_hl << 2) - sizeof(struct ip);
m->m_len -= olen;
if (m->m_flags & M_PKTHDR)
m->m_pkthdr.len -= olen;
- ip->ip_v = IPVERSION;
+ ip->ip_len = htons(ntohs(ip->ip_len) - olen);
ip->ip_hl = sizeof(struct ip) >> 2;
+
+ bcopy((char *)ip + sizeof(struct ip) + olen, (ip + 1),
+ (size_t )(m->m_len - sizeof(struct ip)));
}
/*
@@ -496,19 +510,19 @@ ip_insertoptions(struct mbuf *m, struct mbuf *opt, int *phlen)
unsigned optlen;
optlen = opt->m_len - sizeof(p->ipopt_dst);
- if (optlen + ip->ip_len > IP_MAXPACKET) {
+ if (optlen + ntohs(ip->ip_len) > IP_MAXPACKET) {
*phlen = 0;
return (m); /* XXX should fail */
}
if (p->ipopt_dst.s_addr)
ip->ip_dst = p->ipopt_dst;
- if (m->m_flags & M_EXT || m->m_data - optlen < m->m_pktdat) {
- MGETHDR(n, M_DONTWAIT, MT_DATA);
+ if (!M_WRITABLE(m) || M_LEADINGSPACE(m) < optlen) {
+ n = m_gethdr(M_NOWAIT, MT_DATA);
if (n == NULL) {
*phlen = 0;
return (m);
}
- M_MOVE_PKTHDR(n, m);
+ m_move_pkthdr(n, m);
n->m_pkthdr.rcvif = NULL;
n->m_pkthdr.len += optlen;
m->m_len -= sizeof(struct ip);
@@ -529,7 +543,7 @@ ip_insertoptions(struct mbuf *m, struct mbuf *opt, int *phlen)
*phlen = sizeof(struct ip) + optlen;
ip->ip_v = IPVERSION;
ip->ip_hl = *phlen >> 2;
- ip->ip_len += optlen;
+ ip->ip_len = htons(ntohs(ip->ip_len) + optlen);
return (m);
}
@@ -596,7 +610,7 @@ ip_pcbopts(struct inpcb *inp, int optname, struct mbuf *m)
/* turn off any old options */
if (*pcbopt)
(void)m_free(*pcbopt);
- *pcbopt = 0;
+ *pcbopt = NULL;
if (m == NULL || m->m_len == 0) {
/*
* Only turning off any previous options.
@@ -694,7 +708,7 @@ bad:
* may change in future.
* Router alert options SHOULD be passed if running in IPSTEALTH mode and
* we are not the endpoint.
- * Length checks on individual options should already have been peformed
+ * Length checks on individual options should already have been performed
* by ip_dooptions() therefore they are folded under INVARIANTS here.
*
* Return zero if not present or options are invalid, non-zero if present.
diff --git a/freebsd/sys/netinet/ip_options.h b/freebsd/sys/netinet/ip_options.h
index 7ba5ae64..4a6ea420 100644
--- a/freebsd/sys/netinet/ip_options.h
+++ b/freebsd/sys/netinet/ip_options.h
@@ -47,14 +47,15 @@ struct ipopt_tag {
struct ipoptrt ip_srcrt;
};
-extern int ip_doopts; /* process or ignore IP options */
+VNET_DECLARE(int, ip_doopts); /* process or ignore IP options */
+#define V_ip_doopts VNET(ip_doopts)
int ip_checkrouteralert(struct mbuf *);
int ip_dooptions(struct mbuf *, int);
struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *);
int ip_optcopy(struct ip *, struct ip *);
int ip_pcbopts(struct inpcb *, int, struct mbuf *);
-void ip_stripoptions(struct mbuf *, struct mbuf *);
+void ip_stripoptions(struct mbuf *);
struct mbuf *ip_srcroute(struct mbuf *);
#endif /* !_NETINET_IP_OPTIONS_H_ */
diff --git a/freebsd/sys/netinet/ip_output.c b/freebsd/sys/netinet/ip_output.c
index a06fed68..81e7b123 100644
--- a/freebsd/sys/netinet/ip_output.c
+++ b/freebsd/sys/netinet/ip_output.c
@@ -34,27 +34,32 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
-#include <rtems/bsd/local/opt_ipfw.h>
+#include <rtems/bsd/local/opt_inet.h>
#include <rtems/bsd/local/opt_ipsec.h>
-#include <rtems/bsd/local/opt_route.h>
#include <rtems/bsd/local/opt_mbuf_stress_test.h>
#include <rtems/bsd/local/opt_mpath.h>
+#include <rtems/bsd/local/opt_route.h>
#include <rtems/bsd/local/opt_sctp.h>
+#include <rtems/bsd/local/opt_rss.h>
#include <rtems/bsd/sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
+#include <rtems/bsd/sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/protosw.h>
+#include <sys/rmlock.h>
+#include <sys/sdt.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/ucred.h>
#include <net/if.h>
+#include <net/if_var.h>
#include <net/if_llatbl.h>
#include <net/netisr.h>
#include <net/pfil.h>
@@ -63,12 +68,15 @@ __FBSDID("$FreeBSD$");
#ifdef RADIX_MPATH
#include <net/radix_mpath.h>
#endif
+#include <net/rss_config.h>
#include <net/vnet.h>
#include <netinet/in.h>
+#include <netinet/in_kdtrace.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
+#include <netinet/in_rss.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#include <netinet/ip_options.h>
@@ -86,25 +94,112 @@ __FBSDID("$FreeBSD$");
#include <security/mac/mac_framework.h>
-VNET_DEFINE(u_short, ip_id);
-
#ifdef MBUF_STRESS_TEST
static int mbuf_frag_size = 0;
SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW,
&mbuf_frag_size, 0, "Fragment outgoing mbufs to this size");
#endif
-static void ip_mloopback
- (struct ifnet *, struct mbuf *, struct sockaddr_in *, int);
+static void ip_mloopback(struct ifnet *, const struct mbuf *, int);
extern int in_mcast_loop;
extern struct protosw inetsw[];
+static inline int
+ip_output_pfil(struct mbuf **mp, struct ifnet *ifp, struct inpcb *inp,
+ struct sockaddr_in *dst, int *fibnum, int *error)
+{
+ struct m_tag *fwd_tag = NULL;
+ struct mbuf *m;
+ struct in_addr odst;
+ struct ip *ip;
+
+ m = *mp;
+ ip = mtod(m, struct ip *);
+
+ /* Run through list of hooks for output packets. */
+ odst.s_addr = ip->ip_dst.s_addr;
+ *error = pfil_run_hooks(&V_inet_pfil_hook, mp, ifp, PFIL_OUT, inp);
+ m = *mp;
+ if ((*error) != 0 || m == NULL)
+ return 1; /* Finished */
+
+ ip = mtod(m, struct ip *);
+
+ /* See if destination IP address was changed by packet filter. */
+ if (odst.s_addr != ip->ip_dst.s_addr) {
+ m->m_flags |= M_SKIP_FIREWALL;
+ /* If destination is now ourself drop to ip_input(). */
+ if (in_localip(ip->ip_dst)) {
+ m->m_flags |= M_FASTFWD_OURS;
+ if (m->m_pkthdr.rcvif == NULL)
+ m->m_pkthdr.rcvif = V_loif;
+ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
+ m->m_pkthdr.csum_flags |=
+ CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
+ m->m_pkthdr.csum_data = 0xffff;
+ }
+ m->m_pkthdr.csum_flags |=
+ CSUM_IP_CHECKED | CSUM_IP_VALID;
+#ifdef SCTP
+ if (m->m_pkthdr.csum_flags & CSUM_SCTP)
+ m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
+#endif
+ *error = netisr_queue(NETISR_IP, m);
+ return 1; /* Finished */
+ }
+
+ bzero(dst, sizeof(*dst));
+ dst->sin_family = AF_INET;
+ dst->sin_len = sizeof(*dst);
+ dst->sin_addr = ip->ip_dst;
+
+ return -1; /* Reloop */
+ }
+ /* See if fib was changed by packet filter. */
+ if ((*fibnum) != M_GETFIB(m)) {
+ m->m_flags |= M_SKIP_FIREWALL;
+ *fibnum = M_GETFIB(m);
+ return -1; /* Reloop for FIB change */
+ }
+
+ /* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */
+ if (m->m_flags & M_FASTFWD_OURS) {
+ if (m->m_pkthdr.rcvif == NULL)
+ m->m_pkthdr.rcvif = V_loif;
+ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
+ m->m_pkthdr.csum_flags |=
+ CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
+ m->m_pkthdr.csum_data = 0xffff;
+ }
+#ifdef SCTP
+ if (m->m_pkthdr.csum_flags & CSUM_SCTP)
+ m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
+#endif
+ m->m_pkthdr.csum_flags |=
+ CSUM_IP_CHECKED | CSUM_IP_VALID;
+
+ *error = netisr_queue(NETISR_IP, m);
+ return 1; /* Finished */
+ }
+ /* Or forward to some other address? */
+ if ((m->m_flags & M_IP_NEXTHOP) &&
+ ((fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL)) {
+ bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in));
+ m->m_flags |= M_SKIP_FIREWALL;
+ m->m_flags &= ~M_IP_NEXTHOP;
+ m_tag_delete(m, fwd_tag);
+
+ return -1; /* Reloop for CHANGE of dst */
+ }
+
+ return 0;
+}
+
/*
* IP output. The packet in mbuf chain m contains a skeletal IP
* header (with len, off, ttl, proto, tos, src, dst).
- * ip_len and ip_off are in host format.
* The mbuf chain containing the packet will be freed.
* The mbuf opt, if present, will not be freed.
* If route ro is present and has ro_rt initialized, route lookup would be
@@ -118,20 +213,22 @@ int
ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
struct ip_moptions *imo, struct inpcb *inp)
{
+ struct rm_priotracker in_ifa_tracker;
struct ip *ip;
struct ifnet *ifp = NULL; /* keep compiler happy */
struct mbuf *m0;
int hlen = sizeof (struct ip);
int mtu;
- int n; /* scratchpad */
int error = 0;
struct sockaddr_in *dst;
+ const struct sockaddr_in *gw;
struct in_ifaddr *ia;
- int isbroadcast, sw_csum;
+ int isbroadcast;
+ uint16_t ip_len, ip_off;
struct route iproute;
struct rtentry *rte; /* cache for ro->ro_rt */
- struct in_addr odst;
- struct m_tag *fwd_tag = NULL;
+ uint32_t fibnum;
+ int have_ia_ref;
#ifdef IPSEC
int no_route_but_check_spd = 0;
#endif
@@ -140,31 +237,21 @@ ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
if (inp != NULL) {
INP_LOCK_ASSERT(inp);
M_SETFIB(m, inp->inp_inc.inc_fibnum);
- if (inp->inp_flags & (INP_HW_FLOWID|INP_SW_FLOWID)) {
+ if ((flags & IP_NODEFAULTFLOWID) == 0) {
m->m_pkthdr.flowid = inp->inp_flowid;
- m->m_flags |= M_FLOWID;
+ M_HASHTYPE_SET(m, inp->inp_flowtype);
}
}
if (ro == NULL) {
ro = &iproute;
bzero(ro, sizeof (*ro));
- }
+ } else
+ ro->ro_flags |= RT_LLE_CACHE;
#ifdef FLOWTABLE
- if (ro->ro_rt == NULL) {
- struct flentry *fle;
-
- /*
- * The flow table returns route entries valid for up to 30
- * seconds; we rely on the remainder of ip_output() taking no
- * longer than that long for the stability of ro_rt. The
- * flow ID assignment must have happened before this point.
- */
- fle = flowtable_lookup_mbuf(V_ip_ft, m, AF_INET);
- if (fle != NULL)
- flow_to_route(fle, ro);
- }
+ if (ro->ro_rt == NULL)
+ (void )flowtable_lookup(AF_INET, m, ro);
#endif
if (opt) {
@@ -174,37 +261,49 @@ ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
hlen = len; /* ip->ip_hl is updated above */
}
ip = mtod(m, struct ip *);
+ ip_len = ntohs(ip->ip_len);
+ ip_off = ntohs(ip->ip_off);
- /*
- * Fill in IP header. If we are not allowing fragmentation,
- * then the ip_id field is meaningless, but we don't set it
- * to zero. Doing so causes various problems when devices along
- * the path (routers, load balancers, firewalls, etc.) illegally
- * disable DF on our packet. Note that a 16-bit counter
- * will wrap around in less than 10 seconds at 100 Mbit/s on a
- * medium with MTU 1500. See Steven M. Bellovin, "A Technique
- * for Counting NATted Hosts", Proc. IMW'02, available at
- * <http://www.cs.columbia.edu/~smb/papers/fnat.pdf>.
- */
if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
ip->ip_v = IPVERSION;
ip->ip_hl = hlen >> 2;
- ip->ip_id = ip_newid();
+ ip_fillid(ip);
IPSTAT_INC(ips_localout);
} else {
/* Header already set, fetch hlen from there */
hlen = ip->ip_hl << 2;
}
+ /*
+ * dst/gw handling:
+ *
+ * dst can be rewritten but always points to &ro->ro_dst.
+ * gw is readonly but can point either to dst OR rt_gateway,
+ * therefore we need restore gw if we're redoing lookup.
+ */
+ gw = dst = (struct sockaddr_in *)&ro->ro_dst;
+ fibnum = (inp != NULL) ? inp->inp_inc.inc_fibnum : M_GETFIB(m);
+ rte = ro->ro_rt;
+ if (rte == NULL) {
+ bzero(dst, sizeof(*dst));
+ dst->sin_family = AF_INET;
+ dst->sin_len = sizeof(*dst);
+ dst->sin_addr = ip->ip_dst;
+ }
again:
- dst = (struct sockaddr_in *)&ro->ro_dst;
- ia = NULL;
+ /*
+ * Validate route against routing table additions;
+ * a better/more specific route might have been added.
+ */
+ if (inp)
+ RT_VALIDATE(ro, &inp->inp_rt_cookie, fibnum);
/*
* If there is a cached route,
* check that it is to the same destination
* and is still up. If not, free it and try again.
* The address family should also be checked in case of sharing the
* cache with IPv6.
+ * Also check whether routing cache needs invalidation.
*/
rte = ro->ro_rt;
if (rte && ((rte->rt_flags & RTF_UP) == 0 ||
@@ -212,16 +311,14 @@ again:
!RT_LINK_IS_UP(rte->rt_ifp) ||
dst->sin_family != AF_INET ||
dst->sin_addr.s_addr != ip->ip_dst.s_addr)) {
- RO_RTFREE(ro);
- ro->ro_lle = NULL;
- rte = NULL;
- }
- if (rte == NULL && fwd_tag == NULL) {
- bzero(dst, sizeof(*dst));
- dst->sin_family = AF_INET;
- dst->sin_len = sizeof(*dst);
- dst->sin_addr = ip->ip_dst;
+ RTFREE(rte);
+ rte = ro->ro_rt = (struct rtentry *)NULL;
+ if (ro->ro_lle)
+ LLE_FREE(ro->ro_lle); /* zeros ro_lle */
+ ro->ro_lle = (struct llentry *)NULL;
}
+ ia = NULL;
+ have_ia_ref = 0;
/*
* If routing to interface only, short circuit routing lookup.
* The use of an all-ones broadcast address implies this; an
@@ -229,27 +326,33 @@ again:
* or the destination address of a ptp interface.
*/
if (flags & IP_SENDONES) {
- if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst)))) == NULL &&
- (ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL) {
+ if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst),
+ M_GETFIB(m)))) == NULL &&
+ (ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst),
+ M_GETFIB(m)))) == NULL) {
IPSTAT_INC(ips_noroute);
error = ENETUNREACH;
goto bad;
}
+ have_ia_ref = 1;
ip->ip_dst.s_addr = INADDR_BROADCAST;
dst->sin_addr = ip->ip_dst;
ifp = ia->ia_ifp;
ip->ip_ttl = 1;
isbroadcast = 1;
} else if (flags & IP_ROUTETOIF) {
- if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL &&
- (ia = ifatoia(ifa_ifwithnet(sintosa(dst), 0))) == NULL) {
+ if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst),
+ M_GETFIB(m)))) == NULL &&
+ (ia = ifatoia(ifa_ifwithnet(sintosa(dst), 0,
+ M_GETFIB(m)))) == NULL) {
IPSTAT_INC(ips_noroute);
error = ENETUNREACH;
goto bad;
}
+ have_ia_ref = 1;
ifp = ia->ia_ifp;
ip->ip_ttl = 1;
- isbroadcast = in_broadcast(dst->sin_addr, ifp);
+ isbroadcast = in_ifaddr_broadcast(dst->sin_addr, ia);
} else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) &&
imo != NULL && imo->imo_multicast_ifp != NULL) {
/*
@@ -257,7 +360,9 @@ again:
* packets if the interface is specified.
*/
ifp = imo->imo_multicast_ifp;
- IFP_TO_IA(ifp, ia);
+ IFP_TO_IA(ifp, ia, &in_ifa_tracker);
+ if (ia)
+ have_ia_ref = 1;
isbroadcast = 0; /* fool gcc */
} else {
/*
@@ -269,14 +374,14 @@ again:
#ifdef RADIX_MPATH
rtalloc_mpath_fib(ro,
ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr),
- inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m));
+ fibnum);
#else
- in_rtalloc_ign(ro, 0,
- inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m));
+ in_rtalloc_ign(ro, 0, fibnum);
#endif
rte = ro->ro_rt;
}
if (rte == NULL ||
+ (rte->rt_flags & RTF_UP) == 0 ||
rte->rt_ifp == NULL ||
!RT_LINK_IS_UP(rte->rt_ifp)) {
#ifdef IPSEC
@@ -293,45 +398,37 @@ again:
goto bad;
}
ia = ifatoia(rte->rt_ifa);
- ifa_ref(&ia->ia_ifa);
ifp = rte->rt_ifp;
- rte->rt_rmx.rmx_pksent++;
+ counter_u64_add(rte->rt_pksent, 1);
+ rt_update_ro_flags(ro);
if (rte->rt_flags & RTF_GATEWAY)
- dst = (struct sockaddr_in *)rte->rt_gateway;
+ gw = (struct sockaddr_in *)rte->rt_gateway;
if (rte->rt_flags & RTF_HOST)
isbroadcast = (rte->rt_flags & RTF_BROADCAST);
else
- isbroadcast = in_broadcast(dst->sin_addr, ifp);
+ isbroadcast = in_ifaddr_broadcast(gw->sin_addr, ia);
}
+
/*
* Calculate MTU. If we have a route that is up, use that,
* otherwise use the interface's MTU.
*/
- if (rte != NULL && (rte->rt_flags & (RTF_UP|RTF_HOST))) {
- /*
- * This case can happen if the user changed the MTU
- * of an interface after enabling IP on it. Because
- * most netifs don't keep track of routes pointing to
- * them, there is no way for one to update all its
- * routes when the MTU is changed.
- */
- if (rte->rt_rmx.rmx_mtu > ifp->if_mtu)
- rte->rt_rmx.rmx_mtu = ifp->if_mtu;
- mtu = rte->rt_rmx.rmx_mtu;
- } else {
+ if (rte != NULL && (rte->rt_flags & (RTF_UP|RTF_HOST)))
+ mtu = rte->rt_mtu;
+ else
mtu = ifp->if_mtu;
- }
/* Catch a possible divide by zero later. */
KASSERT(mtu > 0, ("%s: mtu %d <= 0, rte=%p (rt_flags=0x%08x) ifp=%p",
__func__, mtu, rte, (rte != NULL) ? rte->rt_flags : 0, ifp));
+
if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
m->m_flags |= M_MCAST;
/*
- * IP destination address is multicast. Make sure "dst"
+ * IP destination address is multicast. Make sure "gw"
* still points to the address in "ro". (It may have been
* changed to point to a gateway address, above.)
*/
- dst = (struct sockaddr_in *)&ro->ro_dst;
+ gw = dst;
/*
* See if the caller provided any multicast options
*/
@@ -373,7 +470,7 @@ again:
* thus deferring a hash lookup and mutex acquisition
* at the expense of a cheap copy using m_copym().
*/
- ip_mloopback(ifp, m, dst, hlen);
+ ip_mloopback(ifp, m, hlen);
} else {
/*
* If we are acting as a multicast router, perform
@@ -433,23 +530,6 @@ again:
}
/*
- * Verify that we have any chance at all of being able to queue the
- * packet or packet fragments, unless ALTQ is enabled on the given
- * interface in which case packetdrop should be done by queueing.
- */
- n = ip->ip_len / mtu + 1; /* how many fragments ? */
- if (
-#ifdef ALTQ
- (!ALTQ_IS_ENABLED(&ifp->if_snd)) &&
-#endif /* ALTQ */
- (ifp->if_snd.ifq_len + n) >= ifp->if_snd.ifq_maxlen ) {
- error = ENOBUFS;
- IPSTAT_INC(ips_odropped);
- ifp->if_snd.ifq_drops += n;
- goto bad;
- }
-
- /*
* Look for broadcast address and
* verify user is allowed to send
* such a packet.
@@ -464,7 +544,7 @@ again:
goto bad;
}
/* don't allow broadcast messages to be fragmented */
- if (ip->ip_len > mtu) {
+ if (ip_len > mtu) {
error = EMSGSIZE;
goto bad;
}
@@ -475,7 +555,7 @@ again:
sendit:
#ifdef IPSEC
- switch(ip_ipsec_output(&m, inp, &flags, &error)) {
+ switch(ip_ipsec_output(&m, inp, &error)) {
case 1:
goto bad;
case -1:
@@ -498,78 +578,29 @@ sendit:
#endif /* IPSEC */
/* Jump over all PFIL processing if hooks are not active. */
- if (!PFIL_HOOKED(&V_inet_pfil_hook))
- goto passout;
-
- /* Run through list of hooks for output packets. */
- odst.s_addr = ip->ip_dst.s_addr;
- error = pfil_run_hooks(&V_inet_pfil_hook, &m, ifp, PFIL_OUT, inp);
- if (error != 0 || m == NULL)
- goto done;
+ if (PFIL_HOOKED(&V_inet_pfil_hook)) {
+ switch (ip_output_pfil(&m, ifp, inp, dst, &fibnum, &error)) {
+ case 1: /* Finished */
+ goto done;
- ip = mtod(m, struct ip *);
+ case 0: /* Continue normally */
+ ip = mtod(m, struct ip *);
+ break;
- /* See if destination IP address was changed by packet filter. */
- if (odst.s_addr != ip->ip_dst.s_addr) {
- m->m_flags |= M_SKIP_FIREWALL;
- /* If destination is now ourself drop to ip_input(). */
- if (in_localip(ip->ip_dst)) {
- m->m_flags |= M_FASTFWD_OURS;
- if (m->m_pkthdr.rcvif == NULL)
- m->m_pkthdr.rcvif = V_loif;
- if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
- m->m_pkthdr.csum_flags |=
- CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
- m->m_pkthdr.csum_data = 0xffff;
- }
- m->m_pkthdr.csum_flags |=
- CSUM_IP_CHECKED | CSUM_IP_VALID;
-#ifdef SCTP
- if (m->m_pkthdr.csum_flags & CSUM_SCTP)
- m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
-#endif
- error = netisr_queue(NETISR_IP, m);
- goto done;
- } else {
- if (ia != NULL)
+ case -1: /* Need to try again */
+ /* Reset everything for a new round */
+ RO_RTFREE(ro);
+ if (have_ia_ref)
ifa_free(&ia->ia_ifa);
- goto again; /* Redo the routing table lookup. */
- }
- }
+ ro->ro_prepend = NULL;
+ rte = NULL;
+ gw = dst;
+ ip = mtod(m, struct ip *);
+ goto again;
- /* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */
- if (m->m_flags & M_FASTFWD_OURS) {
- if (m->m_pkthdr.rcvif == NULL)
- m->m_pkthdr.rcvif = V_loif;
- if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
- m->m_pkthdr.csum_flags |=
- CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
- m->m_pkthdr.csum_data = 0xffff;
}
-#ifdef SCTP
- if (m->m_pkthdr.csum_flags & CSUM_SCTP)
- m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
-#endif
- m->m_pkthdr.csum_flags |=
- CSUM_IP_CHECKED | CSUM_IP_VALID;
-
- error = netisr_queue(NETISR_IP, m);
- goto done;
- }
- /* Or forward to some other address? */
- if ((m->m_flags & M_IP_NEXTHOP) &&
- (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) {
- dst = (struct sockaddr_in *)&ro->ro_dst;
- bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in));
- m->m_flags |= M_SKIP_FIREWALL;
- m->m_flags &= ~M_IP_NEXTHOP;
- m_tag_delete(m, fwd_tag);
- if (ia != NULL)
- ifa_free(&ia->ia_ifa);
- goto again;
}
-passout:
/* 127/8 must not appear on wire - RFC1122. */
if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
(ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
@@ -581,31 +612,28 @@ passout:
}
m->m_pkthdr.csum_flags |= CSUM_IP;
- sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_hwassist;
- if (sw_csum & CSUM_DELAY_DATA) {
+ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA & ~ifp->if_hwassist) {
in_delayed_cksum(m);
- sw_csum &= ~CSUM_DELAY_DATA;
+ m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
}
#ifdef SCTP
- if (sw_csum & CSUM_SCTP) {
+ if (m->m_pkthdr.csum_flags & CSUM_SCTP & ~ifp->if_hwassist) {
sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2));
- sw_csum &= ~CSUM_SCTP;
+ m->m_pkthdr.csum_flags &= ~CSUM_SCTP;
}
#endif
- m->m_pkthdr.csum_flags &= ifp->if_hwassist;
/*
* If small enough for interface, or the interface will take
* care of the fragmentation for us, we can just send directly.
*/
- if (ip->ip_len <= mtu ||
- (m->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0 ||
- ((ip->ip_off & IP_DF) == 0 && (ifp->if_hwassist & CSUM_FRAGMENT))) {
- ip->ip_len = htons(ip->ip_len);
- ip->ip_off = htons(ip->ip_off);
+ if (ip_len <= mtu ||
+ (m->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0) {
ip->ip_sum = 0;
- if (sw_csum & CSUM_DELAY_IP)
+ if (m->m_pkthdr.csum_flags & CSUM_IP & ~ifp->if_hwassist) {
ip->ip_sum = in_cksum(m, hlen);
+ m->m_pkthdr.csum_flags &= ~CSUM_IP;
+ }
/*
* Record statistics for this interface address.
@@ -615,28 +643,30 @@ passout:
*/
if (!(flags & IP_FORWARDING) && ia) {
if (m->m_pkthdr.csum_flags & CSUM_TSO)
- ia->ia_ifa.if_opackets +=
- m->m_pkthdr.len / m->m_pkthdr.tso_segsz;
+ counter_u64_add(ia->ia_ifa.ifa_opackets,
+ m->m_pkthdr.len / m->m_pkthdr.tso_segsz);
else
- ia->ia_ifa.if_opackets++;
- ia->ia_ifa.if_obytes += m->m_pkthdr.len;
+ counter_u64_add(ia->ia_ifa.ifa_opackets, 1);
+
+ counter_u64_add(ia->ia_ifa.ifa_obytes, m->m_pkthdr.len);
}
#ifdef MBUF_STRESS_TEST
if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size)
- m = m_fragment(m, M_DONTWAIT, mbuf_frag_size);
+ m = m_fragment(m, M_NOWAIT, mbuf_frag_size);
#endif
/*
* Reset layer specific mbuf flags
* to avoid confusing lower layers.
*/
- m->m_flags &= ~(M_PROTOFLAGS);
+ m_clrprotoflags(m);
+ IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL);
error = (*ifp->if_output)(ifp, m,
- (struct sockaddr *)dst, ro);
+ (const struct sockaddr *)gw, ro);
goto done;
}
/* Balk when DF bit is set or the interface didn't support TSO. */
- if ((ip->ip_off & IP_DF) || (m->m_pkthdr.csum_flags & CSUM_TSO)) {
+ if ((ip_off & IP_DF) || (m->m_pkthdr.csum_flags & CSUM_TSO)) {
error = EMSGSIZE;
IPSTAT_INC(ips_cantfrag);
goto bad;
@@ -646,7 +676,7 @@ passout:
* Too large for interface; fragment if possible. If successful,
* on return, m will point to a list of packets to be sent.
*/
- error = ip_fragment(ip, &m, mtu, ifp->if_hwassist, sw_csum);
+ error = ip_fragment(ip, &m, mtu, ifp->if_hwassist);
if (error)
goto bad;
for (; m; m = m0) {
@@ -655,17 +685,19 @@ passout:
if (error == 0) {
/* Record statistics for this interface address. */
if (ia != NULL) {
- ia->ia_ifa.if_opackets++;
- ia->ia_ifa.if_obytes += m->m_pkthdr.len;
+ counter_u64_add(ia->ia_ifa.ifa_opackets, 1);
+ counter_u64_add(ia->ia_ifa.ifa_obytes,
+ m->m_pkthdr.len);
}
/*
* Reset layer specific mbuf flags
* to avoid confusing upper layers.
*/
- m->m_flags &= ~(M_PROTOFLAGS);
+ m_clrprotoflags(m);
+ IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL);
error = (*ifp->if_output)(ifp, m,
- (struct sockaddr *)dst, ro);
+ (const struct sockaddr *)gw, ro);
} else
m_freem(m);
}
@@ -674,9 +706,20 @@ passout:
IPSTAT_INC(ips_fragmented);
done:
- if (ro == &iproute)
+ /*
+ * Release the route if using our private route, or if
+ * (with flowtable) we don't have our own reference.
+ */
+ if (ro == &iproute || ro->ro_flags & RT_NORTREF)
RO_RTFREE(ro);
- if (ia != NULL)
+ else if (rte == NULL)
+ /*
+ * If the caller supplied a route but somehow the reference
+ * to it has been released need to prevent the caller
+ * calling RTFREE on it again.
+ */
+ ro->ro_rt = NULL;
+ if (have_ia_ref)
ifa_free(&ia->ia_ifa);
return (error);
bad:
@@ -691,11 +734,10 @@ bad:
* chain of fragments that should be freed by the caller.
*
* if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist)
- * sw_csum contains the delayed checksums flags (e.g., CSUM_DELAY_IP).
*/
int
ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
- u_long if_hwassist_flags, int sw_csum)
+ u_long if_hwassist_flags)
{
int error = 0;
int hlen = ip->ip_hl << 2;
@@ -705,8 +747,12 @@ ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
int firstlen;
struct mbuf **mnext;
int nfrags;
+ uint16_t ip_len, ip_off;
+
+ ip_len = ntohs(ip->ip_len);
+ ip_off = ntohs(ip->ip_off);
- if (ip->ip_off & IP_DF) { /* Fragmentation not allowed */
+ if (ip_off & IP_DF) { /* Fragmentation not allowed */
IPSTAT_INC(ips_cantfrag);
return EMSGSIZE;
}
@@ -732,10 +778,10 @@ ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
}
#endif
if (len > PAGE_SIZE) {
- /*
- * Fragment large datagrams such that each segment
- * contains a multiple of PAGE_SIZE amount of data,
- * plus headers. This enables a receiver to perform
+ /*
+ * Fragment large datagrams such that each segment
+ * contains a multiple of PAGE_SIZE amount of data,
+ * plus headers. This enables a receiver to perform
* page-flipping zero-copy optimizations.
*
* XXX When does this help given that sender and receiver
@@ -747,7 +793,7 @@ ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
off = MIN(mtu, m0->m_pkthdr.len);
/*
- * firstlen (off - hlen) must be aligned on an
+ * firstlen (off - hlen) must be aligned on an
* 8-byte boundary
*/
if (off < hlen)
@@ -776,22 +822,30 @@ smart_frag_failure:
* The fragments are linked off the m_nextpkt of the original
* packet, which after processing serves as the first fragment.
*/
- for (nfrags = 1; off < ip->ip_len; off += len, nfrags++) {
+ for (nfrags = 1; off < ip_len; off += len, nfrags++) {
struct ip *mhip; /* ip header on the fragment */
struct mbuf *m;
int mhlen = sizeof (struct ip);
- MGETHDR(m, M_DONTWAIT, MT_DATA);
+ m = m_gethdr(M_NOWAIT, MT_DATA);
if (m == NULL) {
error = ENOBUFS;
IPSTAT_INC(ips_odropped);
goto done;
}
- /* copy multicast and flowid flag, if any */
- m->m_flags |= (m0->m_flags & (M_FLOWID | M_MCAST)) | M_FRAG;
- /* make sure the flowid is the same for the fragmented mbufs */
- M_HASHTYPE_SET(m, M_HASHTYPE_GET(m0));
- m->m_pkthdr.flowid = m0->m_pkthdr.flowid;
+ /*
+ * Make sure the complete packet header gets copied
+ * from the originating mbuf to the newly created
+ * mbuf. This also ensures that existing firewall
+ * classification(s), VLAN tags and so on get copied
+ * to the resulting fragmented packet(s):
+ */
+ if (m_dup_pkthdr(m, m0, M_NOWAIT) == 0) {
+ m_free(m);
+ error = ENOBUFS;
+ IPSTAT_INC(ips_odropped);
+ goto done;
+ }
/*
* In the first mbuf, leave room for the link header, then
* copy the original IP header including options. The payload
@@ -806,15 +860,14 @@ smart_frag_failure:
mhip->ip_hl = mhlen >> 2;
}
m->m_len = mhlen;
- /* XXX do we need to add ip->ip_off below ? */
- mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off;
- if (off + len >= ip->ip_len) { /* last fragment */
- len = ip->ip_len - off;
- m->m_flags |= M_LASTFRAG;
- } else
+ /* XXX do we need to add ip_off below ? */
+ mhip->ip_off = ((off - hlen) >> 3) + ip_off;
+ if (off + len >= ip_len)
+ len = ip_len - off;
+ else
mhip->ip_off |= IP_MF;
mhip->ip_len = htons((u_short)(len + mhlen));
- m->m_next = m_copym(m0, off, len, M_DONTWAIT);
+ m->m_next = m_copym(m0, off, len, M_NOWAIT);
if (m->m_next == NULL) { /* copy failed */
m_free(m);
error = ENOBUFS; /* ??? */
@@ -822,36 +875,33 @@ smart_frag_failure:
goto done;
}
m->m_pkthdr.len = mhlen + len;
- m->m_pkthdr.rcvif = NULL;
#ifdef MAC
mac_netinet_fragment(m0, m);
#endif
- m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags;
mhip->ip_off = htons(mhip->ip_off);
mhip->ip_sum = 0;
- if (sw_csum & CSUM_DELAY_IP)
+ if (m->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) {
mhip->ip_sum = in_cksum(m, mhlen);
+ m->m_pkthdr.csum_flags &= ~CSUM_IP;
+ }
*mnext = m;
mnext = &m->m_nextpkt;
}
IPSTAT_ADD(ips_ofragments, nfrags);
- /* set first marker for fragment chain */
- m0->m_flags |= M_FIRSTFRAG | M_FRAG;
- m0->m_pkthdr.csum_data = nfrags;
-
/*
* Update first fragment by trimming what's been copied out
* and updating header.
*/
- m_adj(m0, hlen + firstlen - ip->ip_len);
+ m_adj(m0, hlen + firstlen - ip_len);
m0->m_pkthdr.len = hlen + firstlen;
ip->ip_len = htons((u_short)m0->m_pkthdr.len);
- ip->ip_off |= IP_MF;
- ip->ip_off = htons(ip->ip_off);
+ ip->ip_off = htons(ip_off | IP_MF);
ip->ip_sum = 0;
- if (sw_csum & CSUM_DELAY_IP)
+ if (m0->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) {
ip->ip_sum = in_cksum(m0, hlen);
+ m0->m_pkthdr.csum_flags &= ~CSUM_IP;
+ }
done:
*m_frag = m0;
@@ -862,11 +912,12 @@ void
in_delayed_cksum(struct mbuf *m)
{
struct ip *ip;
- u_short csum, offset;
+ uint16_t csum, offset, ip_len;
ip = mtod(m, struct ip *);
offset = ip->ip_hl << 2 ;
- csum = in_cksum_skip(m, ip->ip_len, offset);
+ ip_len = ntohs(ip->ip_len);
+ csum = in_cksum_skip(m, ip_len, offset);
if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0)
csum = 0xffff;
offset += m->m_pkthdr.csum_data; /* checksum offset */
@@ -889,6 +940,10 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt)
{
struct inpcb *inp = sotoinpcb(so);
int error, optval;
+#ifdef RSS
+ uint32_t rss_bucket;
+ int retval;
+#endif
error = optval = 0;
if (sopt->sopt_level != IPPROTO_IP) {
@@ -941,7 +996,7 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt)
error = EMSGSIZE;
break;
}
- MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA);
+ m = m_get(sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
if (m == NULL) {
error = ENOBUFS;
break;
@@ -967,6 +1022,10 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt)
break;
}
/* FALLTHROUGH */
+ case IP_BINDMULTI:
+#ifdef RSS
+ case IP_RSS_LISTEN_BUCKET:
+#endif
case IP_TOS:
case IP_TTL:
case IP_MINTTL:
@@ -975,10 +1034,13 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt)
case IP_RECVDSTADDR:
case IP_RECVTTL:
case IP_RECVIF:
- case IP_FAITH:
case IP_ONESBCAST:
case IP_DONTFRAG:
case IP_RECVTOS:
+ case IP_RECVFLOWID:
+#ifdef RSS
+ case IP_RECVRSSBUCKETID:
+#endif
error = sooptcopyin(sopt, &optval, sizeof optval,
sizeof optval);
if (error)
@@ -1009,6 +1071,15 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt)
INP_WUNLOCK(inp); \
} while (0)
+#define OPTSET2(bit, val) do { \
+ INP_WLOCK(inp); \
+ if (val) \
+ inp->inp_flags2 |= bit; \
+ else \
+ inp->inp_flags2 &= ~bit; \
+ INP_WUNLOCK(inp); \
+} while (0)
+
case IP_RECVOPTS:
OPTSET(INP_RECVOPTS);
break;
@@ -1029,10 +1100,6 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt)
OPTSET(INP_RECVIF);
break;
- case IP_FAITH:
- OPTSET(INP_FAITH);
- break;
-
case IP_ONESBCAST:
OPTSET(INP_ONESBCAST);
break;
@@ -1045,9 +1112,30 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt)
case IP_RECVTOS:
OPTSET(INP_RECVTOS);
break;
+ case IP_BINDMULTI:
+ OPTSET2(INP_BINDMULTI, optval);
+ break;
+ case IP_RECVFLOWID:
+ OPTSET2(INP_RECVFLOWID, optval);
+ break;
+#ifdef RSS
+ case IP_RSS_LISTEN_BUCKET:
+ if ((optval >= 0) &&
+ (optval < rss_getnumbuckets())) {
+ inp->inp_rss_listen_bucket = optval;
+ OPTSET2(INP_RSS_BUCKET_SET, 1);
+ } else {
+ error = EINVAL;
+ }
+ break;
+ case IP_RECVRSSBUCKETID:
+ OPTSET2(INP_RECVRSSBUCKETID, optval);
+ break;
+#endif
}
break;
#undef OPTSET
+#undef OPTSET2
/*
* Multicast socket options are processed by the in_mcast
@@ -1133,7 +1221,7 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt)
case IP_OPTIONS:
case IP_RETOPTS:
if (inp->inp_options)
- error = sooptcopyout(sopt,
+ error = sooptcopyout(sopt,
mtod(inp->inp_options,
char *),
inp->inp_options->m_len);
@@ -1150,11 +1238,18 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt)
case IP_RECVTTL:
case IP_RECVIF:
case IP_PORTRANGE:
- case IP_FAITH:
case IP_ONESBCAST:
case IP_DONTFRAG:
case IP_BINDANY:
case IP_RECVTOS:
+ case IP_BINDMULTI:
+ case IP_FLOWID:
+ case IP_FLOWTYPE:
+ case IP_RECVFLOWID:
+#ifdef RSS
+ case IP_RSSBUCKETID:
+ case IP_RECVRSSBUCKETID:
+#endif
switch (sopt->sopt_name) {
case IP_TOS:
@@ -1170,6 +1265,7 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt)
break;
#define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0)
+#define OPTBIT2(bit) (inp->inp_flags2 & bit ? 1 : 0)
case IP_RECVOPTS:
optval = OPTBIT(INP_RECVOPTS);
@@ -1200,10 +1296,6 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt)
optval = 0;
break;
- case IP_FAITH:
- optval = OPTBIT(INP_FAITH);
- break;
-
case IP_ONESBCAST:
optval = OPTBIT(INP_ONESBCAST);
break;
@@ -1216,6 +1308,32 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt)
case IP_RECVTOS:
optval = OPTBIT(INP_RECVTOS);
break;
+ case IP_FLOWID:
+ optval = inp->inp_flowid;
+ break;
+ case IP_FLOWTYPE:
+ optval = inp->inp_flowtype;
+ break;
+ case IP_RECVFLOWID:
+ optval = OPTBIT2(INP_RECVFLOWID);
+ break;
+#ifdef RSS
+ case IP_RSSBUCKETID:
+ retval = rss_hash2bucket(inp->inp_flowid,
+ inp->inp_flowtype,
+ &rss_bucket);
+ if (retval == 0)
+ optval = rss_bucket;
+ else
+ error = EINVAL;
+ break;
+ case IP_RECVRSSBUCKETID:
+ optval = OPTBIT2(INP_RECVRSSBUCKETID);
+ break;
+#endif
+ case IP_BINDMULTI:
+ optval = OPTBIT2(INP_BINDMULTI);
+ break;
}
error = sooptcopyout(sopt, &optval, sizeof optval);
break;
@@ -1239,7 +1357,7 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt)
caddr_t req = NULL;
size_t len = 0;
- if (m != 0) {
+ if (m != NULL) {
req = mtod(m, caddr_t);
len = m->m_len;
}
@@ -1269,18 +1387,17 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt)
* replicating that code here.
*/
static void
-ip_mloopback(struct ifnet *ifp, struct mbuf *m, struct sockaddr_in *dst,
- int hlen)
+ip_mloopback(struct ifnet *ifp, const struct mbuf *m, int hlen)
{
- register struct ip *ip;
+ struct ip *ip;
struct mbuf *copym;
/*
* Make a deep copy of the packet because we're going to
* modify the pack in order to generate checksums.
*/
- copym = m_dup(m, M_DONTWAIT);
- if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen))
+ copym = m_dup(m, M_NOWAIT);
+ if (copym != NULL && (!M_WRITABLE(copym) || copym->m_len < hlen))
copym = m_pullup(copym, hlen);
if (copym != NULL) {
/* If needed, compute the checksum and mark it as valid. */
@@ -1296,17 +1413,8 @@ ip_mloopback(struct ifnet *ifp, struct mbuf *m, struct sockaddr_in *dst,
* than the interface's MTU. Can this possibly matter?
*/
ip = mtod(copym, struct ip *);
- ip->ip_len = htons(ip->ip_len);
- ip->ip_off = htons(ip->ip_off);
ip->ip_sum = 0;
ip->ip_sum = in_cksum(copym, hlen);
-#if 1 /* XXX */
- if (dst->sin_family != AF_INET) {
- printf("ip_mloopback: bad address family %d\n",
- dst->sin_family);
- dst->sin_family = AF_INET;
- }
-#endif
- if_simloop(ifp, copym, dst->sin_family, 0);
+ if_simloop(ifp, copym, AF_INET, 0);
}
}
diff --git a/freebsd/sys/netinet/ip_reass.c b/freebsd/sys/netinet/ip_reass.c
new file mode 100644
index 00000000..aae24b9d
--- /dev/null
+++ b/freebsd/sys/netinet/ip_reass.c
@@ -0,0 +1,660 @@
+#include <machine/rtems-bsd-kernel-space.h>
+
+/*-
+ * Copyright (c) 2015 Gleb Smirnoff <glebius@FreeBSD.org>
+ * Copyright (c) 2015 Adrian Chadd <adrian@FreeBSD.org>
+ * Copyright (c) 1982, 1986, 1988, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ip_input.c 8.2 (Berkeley) 1/4/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <rtems/bsd/local/opt_rss.h>
+
+#include <rtems/bsd/sys/param.h>
+#include <sys/systm.h>
+#include <sys/eventhandler.h>
+#include <sys/hash.h>
+#include <sys/mbuf.h>
+#include <sys/malloc.h>
+#include <rtems/bsd/sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/sysctl.h>
+
+#include <net/rss_config.h>
+#include <net/netisr.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>
+#include <netinet/in_rss.h>
+#ifdef MAC
+#include <security/mac/mac_framework.h>
+#endif
+
+SYSCTL_DECL(_net_inet_ip);
+
+/*
+ * Reassembly headers are stored in hash buckets.
+ *
+ * There are IPREASS_NHASH buckets; each one is a tail queue of struct ipq
+ * with its own mutex. ip_reass() picks the bucket by hashing the source
+ * address and IP id with jenkins_hash32(), seeded with V_ipq_hashseed,
+ * and masking the result with IPREASS_HMASK.
+ */
+#define IPREASS_NHASH_LOG2 6
+#define IPREASS_NHASH (1 << IPREASS_NHASH_LOG2)
+#define IPREASS_HMASK (IPREASS_NHASH - 1)
+
+struct ipqbucket {
+ TAILQ_HEAD(ipqhead, ipq) head; /* reassembly queues hashed to this bucket */
+ struct mtx lock; /* taken via IPQ_LOCK() around accesses to head */
+};
+
+static VNET_DEFINE(struct ipqbucket, ipq[IPREASS_NHASH]);
+#define V_ipq VNET(ipq)
+static VNET_DEFINE(uint32_t, ipq_hashseed);
+#define V_ipq_hashseed VNET(ipq_hashseed)
+
+#define IPQ_LOCK(i) mtx_lock(&V_ipq[i].lock)
+#define IPQ_TRYLOCK(i) mtx_trylock(&V_ipq[i].lock)
+#define IPQ_UNLOCK(i) mtx_unlock(&V_ipq[i].lock)
+#define IPQ_LOCK_ASSERT(i) mtx_assert(&V_ipq[i].lock, MA_OWNED)
+
+void ipreass_init(void);
+void ipreass_drain(void);
+void ipreass_slowtimo(void);
+#ifdef VIMAGE
+void ipreass_destroy(void);
+#endif
+static int sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS);
+static void ipreass_zone_change(void *);
+static void ipreass_drain_tomax(void);
+static void ipq_free(struct ipqhead *, struct ipq *);
+static struct ipq * ipq_reuse(int);
+
+static inline void
+ipq_timeout(struct ipqhead *head, struct ipq *fp)
+{
+ /* Count all fragments of the queue as timed out, then free the queue. */
+ IPSTAT_ADD(ips_fragtimeout, fp->ipq_nfrags);
+ ipq_free(head, fp);
+}
+
+static inline void
+ipq_drop(struct ipqhead *head, struct ipq *fp)
+{
+ /* Count all fragments of the queue as dropped, then free the queue. */
+ IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags);
+ ipq_free(head, fp);
+}
+
+static VNET_DEFINE(uma_zone_t, ipq_zone); /* zone the struct ipq headers are allocated from */
+#define V_ipq_zone VNET(ipq_zone)
+SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragpackets, CTLFLAG_VNET |
+ CTLTYPE_INT | CTLFLAG_RW, NULL, 0, sysctl_maxfragpackets, "I",
+ "Maximum number of IPv4 fragment reassembly queue entries");
+SYSCTL_UMA_CUR(_net_inet_ip, OID_AUTO, fragpackets, CTLFLAG_VNET,
+ &VNET_NAME(ipq_zone),
+ "Current number of IPv4 fragment reassembly queue entries");
+
+static VNET_DEFINE(int, noreass); /* 1: ip_reass() drops every fragment; set by maxfragpackets=0 */
+#define V_noreass VNET(noreass)
+
+static VNET_DEFINE(int, maxfragsperpacket); /* 0 also makes ip_reass() drop every fragment */
+#define V_maxfragsperpacket VNET(maxfragsperpacket)
+SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfragsperpacket, CTLFLAG_VNET | CTLFLAG_RW,
+ &VNET_NAME(maxfragsperpacket), 0,
+ "Maximum number of IPv4 fragments allowed per packet");
+
+/*
+ * Take incoming datagram fragment and try to reassemble it into
+ * whole datagram. If the argument is the first fragment or one
+ * in between the function will return NULL and store the mbuf
+ * in the fragment chain. If the argument is the last fragment
+ * the packet will be reassembled and the pointer to the new
+ * mbuf returned for further processing. Only m_tags attached
+ * to the first packet/fragment are preserved.
+ * The IP header is *NOT* adjusted out of iplen.
+ *
+ * NULL is also returned when the fragment is dropped (reassembly
+ * disabled, bad fragment length, ECN mismatch, data already fully
+ * covered by a queued fragment); the mbuf is freed in those cases.
+ * When RSS is defined a completed datagram is not returned to the
+ * caller either: it is passed to netisr_dispatch(NETISR_IP_DIRECT)
+ * and NULL is returned.
+ *
+ * M_IP_FRAG marks queued mbufs whose IP header had IP_MF set.
+ */
+#define M_IP_FRAG M_PROTO9
+struct mbuf *
+ip_reass(struct mbuf *m)
+{
+ struct ip *ip;
+ struct mbuf *p, *q, *nq, *t;
+ struct ipq *fp;
+ struct ipqhead *head;
+ int i, hlen, next;
+ u_int8_t ecn, ecn0;
+ uint32_t hash;
+#ifdef RSS
+ uint32_t rss_hash, rss_type;
+#endif
+
+ /*
+ * If no reassembling or maxfragsperpacket are 0,
+ * never accept fragments.
+ */
+ if (V_noreass == 1 || V_maxfragsperpacket == 0) {
+ IPSTAT_INC(ips_fragments);
+ IPSTAT_INC(ips_fragdropped);
+ m_freem(m);
+ return (NULL);
+ }
+
+ ip = mtod(m, struct ip *);
+ hlen = ip->ip_hl << 2;
+
+ /*
+ * Adjust ip_len to not reflect header,
+ * convert offset of this to bytes.
+ * Both fields are stored back in network byte order.
+ */
+ ip->ip_len = htons(ntohs(ip->ip_len) - hlen);
+ if (ip->ip_off & htons(IP_MF)) {
+ /*
+ * Make sure that fragments have a data length
+ * that's a non-zero multiple of 8 bytes.
+ */
+ if (ip->ip_len == htons(0) || (ntohs(ip->ip_len) & 0x7) != 0) {
+ IPSTAT_INC(ips_toosmall); /* XXX */
+ IPSTAT_INC(ips_fragdropped);
+ m_freem(m);
+ return (NULL);
+ }
+ m->m_flags |= M_IP_FRAG;
+ } else
+ m->m_flags &= ~M_IP_FRAG;
+ ip->ip_off = htons(ntohs(ip->ip_off) << 3);
+
+ /*
+ * Attempt reassembly; if it succeeds, proceed.
+ * ip_reass() will return a different mbuf.
+ *
+ * The header pointer is saved in the packet header so that it
+ * can still be reached (GETIP() below) once m_data has been
+ * advanced past the header.
+ */
+ IPSTAT_INC(ips_fragments);
+ m->m_pkthdr.PH_loc.ptr = ip;
+
+ /*
+ * Presence of header sizes in mbufs
+ * would confuse code below.
+ */
+ m->m_data += hlen;
+ m->m_len -= hlen;
+
+ hash = ip->ip_src.s_addr ^ ip->ip_id;
+ hash = jenkins_hash32(&hash, 1, V_ipq_hashseed) & IPREASS_HMASK;
+ head = &V_ipq[hash].head;
+ IPQ_LOCK(hash);
+
+ /*
+ * Look for queue of fragments
+ * of this datagram.
+ */
+ TAILQ_FOREACH(fp, head, ipq_list)
+ if (ip->ip_id == fp->ipq_id &&
+ ip->ip_src.s_addr == fp->ipq_src.s_addr &&
+ ip->ip_dst.s_addr == fp->ipq_dst.s_addr &&
+#ifdef MAC
+ mac_ipq_match(m, fp) &&
+#endif
+ ip->ip_p == fp->ipq_p)
+ break;
+ /*
+ * If first fragment to arrive, create a reassembly queue.
+ * When the zone allocation fails an existing queue header is
+ * recycled through ipq_reuse().
+ */
+ if (fp == NULL) {
+ fp = uma_zalloc(V_ipq_zone, M_NOWAIT);
+ if (fp == NULL)
+ fp = ipq_reuse(hash);
+#ifdef MAC
+ if (mac_ipq_init(fp, M_NOWAIT) != 0) {
+ uma_zfree(V_ipq_zone, fp);
+ fp = NULL;
+ goto dropfrag;
+ }
+ mac_ipq_create(m, fp);
+#endif
+ TAILQ_INSERT_HEAD(head, fp, ipq_list);
+ fp->ipq_nfrags = 1;
+ fp->ipq_ttl = IPFRAGTTL;
+ fp->ipq_p = ip->ip_p;
+ fp->ipq_id = ip->ip_id;
+ fp->ipq_src = ip->ip_src;
+ fp->ipq_dst = ip->ip_dst;
+ fp->ipq_frags = m;
+ m->m_nextpkt = NULL;
+ goto done;
+ } else {
+ fp->ipq_nfrags++;
+#ifdef MAC
+ mac_ipq_update(m, fp);
+#endif
+ }
+
+#define GETIP(m) ((struct ip*)((m)->m_pkthdr.PH_loc.ptr))
+
+ /*
+ * Handle ECN by comparing this segment with the first one;
+ * if CE is set, do not lose CE.
+ * drop if CE and not-ECT are mixed for the same packet.
+ */
+ ecn = ip->ip_tos & IPTOS_ECN_MASK;
+ ecn0 = GETIP(fp->ipq_frags)->ip_tos & IPTOS_ECN_MASK;
+ if (ecn == IPTOS_ECN_CE) {
+ if (ecn0 == IPTOS_ECN_NOTECT)
+ goto dropfrag;
+ if (ecn0 != IPTOS_ECN_CE)
+ GETIP(fp->ipq_frags)->ip_tos |= IPTOS_ECN_CE;
+ }
+ if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT)
+ goto dropfrag;
+
+ /*
+ * Find a segment which begins after this one does.
+ */
+ for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt)
+ if (ntohs(GETIP(q)->ip_off) > ntohs(ip->ip_off))
+ break;
+
+ /*
+ * If there is a preceding segment, it may provide some of
+ * our data already. If so, drop the data from the incoming
+ * segment. If it provides all of our data, drop us, otherwise
+ * stick new segment in the proper place.
+ *
+ * If some of the data is dropped from the preceding
+ * segment, then its checksum is invalidated.
+ */
+ if (p) {
+ i = ntohs(GETIP(p)->ip_off) + ntohs(GETIP(p)->ip_len) -
+ ntohs(ip->ip_off);
+ if (i > 0) {
+ if (i >= ntohs(ip->ip_len))
+ goto dropfrag;
+ m_adj(m, i);
+ m->m_pkthdr.csum_flags = 0;
+ ip->ip_off = htons(ntohs(ip->ip_off) + i);
+ ip->ip_len = htons(ntohs(ip->ip_len) - i);
+ }
+ m->m_nextpkt = p->m_nextpkt;
+ p->m_nextpkt = m;
+ } else {
+ m->m_nextpkt = fp->ipq_frags;
+ fp->ipq_frags = m;
+ }
+
+ /*
+ * While we overlap succeeding segments trim them or,
+ * if they are completely covered, dequeue them.
+ */
+ for (; q != NULL && ntohs(ip->ip_off) + ntohs(ip->ip_len) >
+ ntohs(GETIP(q)->ip_off); q = nq) {
+ i = (ntohs(ip->ip_off) + ntohs(ip->ip_len)) -
+ ntohs(GETIP(q)->ip_off);
+ if (i < ntohs(GETIP(q)->ip_len)) {
+ GETIP(q)->ip_len = htons(ntohs(GETIP(q)->ip_len) - i);
+ GETIP(q)->ip_off = htons(ntohs(GETIP(q)->ip_off) + i);
+ m_adj(q, i);
+ q->m_pkthdr.csum_flags = 0;
+ break;
+ }
+ nq = q->m_nextpkt;
+ m->m_nextpkt = nq;
+ IPSTAT_INC(ips_fragdropped);
+ fp->ipq_nfrags--;
+ m_freem(q);
+ }
+
+ /*
+ * Check for complete reassembly and perform frag per packet
+ * limiting.
+ *
+ * Frag limiting is performed here so that the nth frag has
+ * a chance to complete the packet before we drop the packet.
+ * As a result, n+1 frags are actually allowed per packet, but
+ * only n will ever be stored. (n = maxfragsperpacket.)
+ *
+ */
+ next = 0;
+ for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) {
+ if (ntohs(GETIP(q)->ip_off) != next) {
+ if (fp->ipq_nfrags > V_maxfragsperpacket)
+ ipq_drop(head, fp);
+ goto done;
+ }
+ next += ntohs(GETIP(q)->ip_len);
+ }
+ /* Make sure the last packet didn't have the IP_MF flag */
+ if (p->m_flags & M_IP_FRAG) {
+ if (fp->ipq_nfrags > V_maxfragsperpacket)
+ ipq_drop(head, fp);
+ goto done;
+ }
+
+ /*
+ * Reassembly is complete. Make sure the packet is a sane size.
+ */
+ q = fp->ipq_frags;
+ ip = GETIP(q);
+ if (next + (ip->ip_hl << 2) > IP_MAXPACKET) {
+ IPSTAT_INC(ips_toolong);
+ ipq_drop(head, fp);
+ goto done;
+ }
+
+ /*
+ * Concatenate fragments.
+ */
+ m = q;
+ t = m->m_next;
+ m->m_next = NULL;
+ m_cat(m, t);
+ nq = q->m_nextpkt;
+ q->m_nextpkt = NULL;
+ for (q = nq; q != NULL; q = nq) {
+ nq = q->m_nextpkt;
+ q->m_nextpkt = NULL;
+ m->m_pkthdr.csum_flags &= q->m_pkthdr.csum_flags;
+ m->m_pkthdr.csum_data += q->m_pkthdr.csum_data;
+ m_cat(m, q);
+ }
+ /*
+ * In order to do checksumming faster we do 'end-around carry' here
+ * (and not in for{} loop), though it implies we are not going to
+ * reassemble more than 64k fragments.
+ */
+ while (m->m_pkthdr.csum_data & 0xffff0000)
+ m->m_pkthdr.csum_data = (m->m_pkthdr.csum_data & 0xffff) +
+ (m->m_pkthdr.csum_data >> 16);
+#ifdef MAC
+ mac_ipq_reassemble(fp, m);
+ mac_ipq_destroy(fp);
+#endif
+
+ /*
+ * Create header for new ip packet by modifying header of first
+ * packet; dequeue and discard fragment reassembly header.
+ * Make header visible.
+ */
+ ip->ip_len = htons((ip->ip_hl << 2) + next);
+ ip->ip_src = fp->ipq_src;
+ ip->ip_dst = fp->ipq_dst;
+ TAILQ_REMOVE(head, fp, ipq_list);
+ uma_zfree(V_ipq_zone, fp);
+ m->m_len += (ip->ip_hl << 2);
+ m->m_data -= (ip->ip_hl << 2);
+ /* some debugging cruft by sklower, below, will go away soon */
+ if (m->m_flags & M_PKTHDR) /* XXX this should be done elsewhere */
+ m_fixhdr(m);
+ IPSTAT_INC(ips_reassembled);
+ IPQ_UNLOCK(hash);
+
+#ifdef RSS
+ /*
+ * Query the RSS layer for the flowid / flowtype for the
+ * mbuf payload.
+ *
+ * For now, just assume we have to calculate a new one.
+ * Later on we should check to see if the assigned flowid matches
+ * what RSS wants for the given IP protocol and if so, just keep it.
+ *
+ * We then queue into the relevant netisr so it can be dispatched
+ * to the correct CPU.
+ *
+ * Note - this may return 1, which means the flowid in the mbuf
+ * is correct for the configured RSS hash types and can be used.
+ */
+ if (rss_mbuf_software_hash_v4(m, 0, &rss_hash, &rss_type) == 0) {
+ m->m_pkthdr.flowid = rss_hash;
+ M_HASHTYPE_SET(m, rss_type);
+ }
+
+ /*
+ * Queue/dispatch for reprocessing.
+ *
+ * Note: this is much slower than just handling the frame in the
+ * current receive context. It's likely worth investigating
+ * why this is.
+ */
+ netisr_dispatch(NETISR_IP_DIRECT, m);
+ return (NULL);
+#endif
+
+ /* Handle in-line; not reached when RSS is defined (see return above) */
+ return (m);
+
+dropfrag:
+ IPSTAT_INC(ips_fragdropped);
+ if (fp != NULL)
+ fp->ipq_nfrags--;
+ m_freem(m);
+done:
+ IPQ_UNLOCK(hash);
+ return (NULL);
+
+#undef GETIP
+}
+
+/*
+ * Initialize IP reassembly structures.
+ *
+ * Sets up the list head and mutex of every hash bucket, picks a random
+ * hash seed, sets maxfragsperpacket to 16 and creates the ipq UMA zone
+ * with a limit of nmbclusters / 32. The nmbclusters_change event handler
+ * is registered only when running in the default vnet.
+ */
+void
+ipreass_init(void)
+{
+
+ for (int i = 0; i < IPREASS_NHASH; i++) {
+ TAILQ_INIT(&V_ipq[i].head);
+ mtx_init(&V_ipq[i].lock, "IP reassembly", NULL,
+ MTX_DEF | MTX_DUPOK);
+ }
+ V_ipq_hashseed = arc4random();
+ V_maxfragsperpacket = 16;
+ V_ipq_zone = uma_zcreate("ipq", sizeof(struct ipq), NULL, NULL, NULL,
+ NULL, UMA_ALIGN_PTR, 0);
+ uma_zone_set_max(V_ipq_zone, nmbclusters / 32);
+
+ if (IS_DEFAULT_VNET(curvnet))
+ EVENTHANDLER_REGISTER(nmbclusters_change, ipreass_zone_change,
+ NULL, EVENTHANDLER_PRI_ANY);
+}
+
+/*
+ * If a timer expires on a reassembly queue, discard it.
+ *
+ * Each call decrements ipq_ttl of every queue in every bucket while
+ * holding that bucket's lock; a queue whose ttl reaches zero is freed
+ * through ipq_timeout(). The _SAFE iterator is used because
+ * ipq_timeout() unlinks the current element.
+ */
+void
+ipreass_slowtimo(void)
+{
+ struct ipq *fp, *tmp;
+
+ for (int i = 0; i < IPREASS_NHASH; i++) {
+ IPQ_LOCK(i);
+ TAILQ_FOREACH_SAFE(fp, &V_ipq[i].head, ipq_list, tmp)
+ if (--fp->ipq_ttl == 0)
+ ipq_timeout(&V_ipq[i].head, fp);
+ IPQ_UNLOCK(i);
+ }
+}
+
+/*
+ * Drain off all datagram fragments.
+ *
+ * Every queue of every bucket is removed with ipq_drop(), so the
+ * discarded fragments are counted in ips_fragdropped. Buckets are
+ * processed one at a time under their own lock.
+ */
+void
+ipreass_drain(void)
+{
+
+ for (int i = 0; i < IPREASS_NHASH; i++) {
+ IPQ_LOCK(i);
+ while(!TAILQ_EMPTY(&V_ipq[i].head))
+ ipq_drop(&V_ipq[i].head, TAILQ_FIRST(&V_ipq[i].head));
+ IPQ_UNLOCK(i);
+ }
+}
+
+#ifdef VIMAGE
+/*
+ * Destroy IP reassembly structures.
+ *
+ * Drops all queued fragments first, then destroys the ipq zone and
+ * finally the bucket mutexes. Only compiled when VIMAGE is defined.
+ */
+void
+ipreass_destroy(void)
+{
+
+ ipreass_drain();
+ uma_zdestroy(V_ipq_zone);
+ for (int i = 0; i < IPREASS_NHASH; i++)
+ mtx_destroy(&V_ipq[i].lock);
+}
+#endif
+
+/*
+ * Bring the number of allocated reassembly queue headers down to the
+ * current limit of the ipq UMA zone. Called after that limit has been
+ * changed, from ipreass_zone_change() and sysctl_maxfragpackets().
+ *
+ * NOTE(review): this comment used to read "After maxnipq has been
+ * updated, propagate the change to UMA. The UMA zone max has slightly
+ * different semantics than the sysctl, for historical reasons."; no
+ * maxnipq variable is visible in this file, so that wording looks stale.
+ */
+static void
+ipreass_drain_tomax(void)
+{
+ int target;
+
+ /*
+ * If we are over the maximum number of fragments,
+ * drain off enough to get down to the new limit,
+ * stripping off last elements on queues. Every
+ * run we strip the oldest element from each bucket.
+ * (New queues are inserted at the head in ip_reass(), so the
+ * tail of a bucket is its oldest queue.)
+ */
+ target = uma_zone_get_max(V_ipq_zone);
+ while (uma_zone_get_cur(V_ipq_zone) > target) {
+ struct ipq *fp;
+
+ for (int i = 0; i < IPREASS_NHASH; i++) {
+ IPQ_LOCK(i);
+ fp = TAILQ_LAST(&V_ipq[i].head, ipqhead);
+ if (fp != NULL)
+ ipq_timeout(&V_ipq[i].head, fp);
+ IPQ_UNLOCK(i);
+ }
+ }
+}
+
+static void
+ipreass_zone_change(void *tag)
+{
+ /*
+ * nmbclusters_change event handler (registered in ipreass_init()):
+ * recompute the zone limit from nmbclusters and trim the queues
+ * down to it. The tag argument is not used.
+ */
+ uma_zone_set_max(V_ipq_zone, nmbclusters / 32);
+ ipreass_drain_tomax();
+}
+
+/*
+ * Change the limit on the UMA zone, or disable the fragment allocation
+ * at all. Since 0 and -1 are special values here, we need our own handler,
+ * instead of sysctl_handle_uma_zone_max().
+ *
+ * Value reported: 0 while reassembly is disabled (V_noreass set), -1 when
+ * the zone limit is 0, otherwise the zone limit.
+ * Value written:
+ *   > 0  set the zone limit, drain down to it and enable reassembly;
+ *   0    disable reassembly and drop everything that is queued;
+ *   -1   enable reassembly and set the zone limit to 0;
+ *   any other value is rejected with EINVAL.
+ */
+static int
+sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS)
+{
+ int error, max;
+
+ if (V_noreass == 0) {
+ max = uma_zone_get_max(V_ipq_zone);
+ if (max == 0)
+ max = -1;
+ } else
+ max = 0;
+ error = sysctl_handle_int(oidp, &max, 0, req);
+ if (error || !req->newptr) /* error, or a read-only request */
+ return (error);
+ if (max > 0) {
+ /*
+ * XXXRW: Might be a good idea to sanity check the argument
+ * and place an extreme upper bound.
+ */
+ max = uma_zone_set_max(V_ipq_zone, max);
+ ipreass_drain_tomax();
+ V_noreass = 0;
+ } else if (max == 0) {
+ V_noreass = 1;
+ ipreass_drain();
+ } else if (max == -1) {
+ V_noreass = 0;
+ uma_zone_set_max(V_ipq_zone, 0);
+ } else
+ return (EINVAL);
+ return (0);
+}
+
+/*
+ * Seek for old fragment queue header that can be reused. Try to
+ * reuse a header from currently locked hash bucket.
+ *
+ * The caller must hold the lock of bucket 'start' (asserted below);
+ * it is still held on return. Other buckets are only try-locked and
+ * skipped when busy. The last queue of the first non-empty bucket
+ * found is taken: its fragments are freed and counted in
+ * ips_fragtimeout, the header is unlinked and returned without being
+ * given back to the zone.
+ *
+ * NOTE(review): the loop has no exit other than finding a queue. If
+ * every bucket is empty, or stays locked by someone else, it keeps
+ * cycling while holding the lock of 'start' - confirm that callers can
+ * only get here when at least one queue exists.
+ */
+static struct ipq *
+ipq_reuse(int start)
+{
+ struct ipq *fp;
+ int i;
+
+ IPQ_LOCK_ASSERT(start);
+
+ for (i = start;; i++) {
+ if (i == IPREASS_NHASH) /* wrap around to the first bucket */
+ i = 0;
+ if (i != start && IPQ_TRYLOCK(i) == 0)
+ continue;
+ fp = TAILQ_LAST(&V_ipq[i].head, ipqhead);
+ if (fp) {
+ struct mbuf *m;
+
+ IPSTAT_ADD(ips_fragtimeout, fp->ipq_nfrags);
+ while (fp->ipq_frags) {
+ m = fp->ipq_frags;
+ fp->ipq_frags = m->m_nextpkt;
+ m_freem(m);
+ }
+ TAILQ_REMOVE(&V_ipq[i].head, fp, ipq_list);
+ if (i != start)
+ IPQ_UNLOCK(i);
+ IPQ_LOCK_ASSERT(start);
+ return (fp);
+ }
+ if (i != start)
+ IPQ_UNLOCK(i);
+ }
+}
+
+/*
+ * Free a fragment reassembly header and all associated datagrams.
+ *
+ * Every mbuf chain linked through m_nextpkt is freed, the header is
+ * unlinked from the bucket list fhp and returned to the ipq zone.
+ * The callers visible in this file hold the lock of the bucket that
+ * fhp belongs to.
+ */
+static void
+ipq_free(struct ipqhead *fhp, struct ipq *fp)
+{
+ struct mbuf *q;
+
+ while (fp->ipq_frags) {
+ q = fp->ipq_frags;
+ fp->ipq_frags = q->m_nextpkt;
+ m_freem(q);
+ }
+ TAILQ_REMOVE(fhp, fp, ipq_list);
+ uma_zfree(V_ipq_zone, fp);
+}
diff --git a/freebsd/sys/netinet/ip_var.h b/freebsd/sys/netinet/ip_var.h
index b07ef162..847704fd 100644
--- a/freebsd/sys/netinet/ip_var.h
+++ b/freebsd/sys/netinet/ip_var.h
@@ -93,50 +93,54 @@ struct ip_moptions {
u_short imo_max_memberships; /* max memberships this socket */
struct in_multi **imo_membership; /* group memberships */
struct in_mfilter *imo_mfilters; /* source filters */
+ STAILQ_ENTRY(ip_moptions) imo_link;
};
struct ipstat {
- u_long ips_total; /* total packets received */
- u_long ips_badsum; /* checksum bad */
- u_long ips_tooshort; /* packet too short */
- u_long ips_toosmall; /* not enough data */
- u_long ips_badhlen; /* ip header length < data size */
- u_long ips_badlen; /* ip length < ip header length */
- u_long ips_fragments; /* fragments received */
- u_long ips_fragdropped; /* frags dropped (dups, out of space) */
- u_long ips_fragtimeout; /* fragments timed out */
- u_long ips_forward; /* packets forwarded */
- u_long ips_fastforward; /* packets fast forwarded */
- u_long ips_cantforward; /* packets rcvd for unreachable dest */
- u_long ips_redirectsent; /* packets forwarded on same net */
- u_long ips_noproto; /* unknown or unsupported protocol */
- u_long ips_delivered; /* datagrams delivered to upper level*/
- u_long ips_localout; /* total ip packets generated here */
- u_long ips_odropped; /* lost packets due to nobufs, etc. */
- u_long ips_reassembled; /* total packets reassembled ok */
- u_long ips_fragmented; /* datagrams successfully fragmented */
- u_long ips_ofragments; /* output fragments created */
- u_long ips_cantfrag; /* don't fragment flag was set, etc. */
- u_long ips_badoptions; /* error in option processing */
- u_long ips_noroute; /* packets discarded due to no route */
- u_long ips_badvers; /* ip version != 4 */
- u_long ips_rawout; /* total raw ip packets generated */
- u_long ips_toolong; /* ip length > max ip packet size */
- u_long ips_notmember; /* multicasts for unregistered grps */
- u_long ips_nogif; /* no match gif found */
- u_long ips_badaddr; /* invalid address on header */
+ uint64_t ips_total; /* total packets received */
+ uint64_t ips_badsum; /* checksum bad */
+ uint64_t ips_tooshort; /* packet too short */
+ uint64_t ips_toosmall; /* not enough data */
+ uint64_t ips_badhlen; /* ip header length < data size */
+ uint64_t ips_badlen; /* ip length < ip header length */
+ uint64_t ips_fragments; /* fragments received */
+ uint64_t ips_fragdropped; /* frags dropped (dups, out of space) */
+ uint64_t ips_fragtimeout; /* fragments timed out */
+ uint64_t ips_forward; /* packets forwarded */
+ uint64_t ips_fastforward; /* packets fast forwarded */
+ uint64_t ips_cantforward; /* packets rcvd for unreachable dest */
+ uint64_t ips_redirectsent; /* packets forwarded on same net */
+ uint64_t ips_noproto; /* unknown or unsupported protocol */
+ uint64_t ips_delivered; /* datagrams delivered to upper level*/
+ uint64_t ips_localout; /* total ip packets generated here */
+ uint64_t ips_odropped; /* lost packets due to nobufs, etc. */
+ uint64_t ips_reassembled; /* total packets reassembled ok */
+ uint64_t ips_fragmented; /* datagrams successfully fragmented */
+ uint64_t ips_ofragments; /* output fragments created */
+ uint64_t ips_cantfrag; /* don't fragment flag was set, etc. */
+ uint64_t ips_badoptions; /* error in option processing */
+ uint64_t ips_noroute; /* packets discarded due to no route */
+ uint64_t ips_badvers; /* ip version != 4 */
+ uint64_t ips_rawout; /* total raw ip packets generated */
+ uint64_t ips_toolong; /* ip length > max ip packet size */
+ uint64_t ips_notmember; /* multicasts for unregistered grps */
+ uint64_t ips_nogif; /* no match gif found */
+ uint64_t ips_badaddr; /* invalid address on header */
};
#ifdef _KERNEL
+#include <sys/counter.h>
#include <net/vnet.h>
+VNET_PCPUSTAT_DECLARE(struct ipstat, ipstat);
/*
* In-kernel consumers can use these accessor macros directly to update
* stats.
*/
-#define IPSTAT_ADD(name, val) V_ipstat.name += (val)
-#define IPSTAT_SUB(name, val) V_ipstat.name -= (val)
+#define IPSTAT_ADD(name, val) \
+ VNET_PCPUSTAT_ADD(struct ipstat, ipstat, name, (val))
+#define IPSTAT_SUB(name, val) IPSTAT_ADD(name, -(val))
#define IPSTAT_INC(name) IPSTAT_ADD(name, 1)
#define IPSTAT_DEC(name) IPSTAT_SUB(name, 1)
@@ -144,11 +148,11 @@ struct ipstat {
* Kernel module consumers must use this accessor macro.
*/
void kmod_ipstat_inc(int statnum);
-#define KMOD_IPSTAT_INC(name) \
- kmod_ipstat_inc(offsetof(struct ipstat, name) / sizeof(u_long))
+#define KMOD_IPSTAT_INC(name) \
+ kmod_ipstat_inc(offsetof(struct ipstat, name) / sizeof(uint64_t))
void kmod_ipstat_dec(int statnum);
-#define KMOD_IPSTAT_DEC(name) \
- kmod_ipstat_dec(offsetof(struct ipstat, name) / sizeof(u_long))
+#define KMOD_IPSTAT_DEC(name) \
+ kmod_ipstat_dec(offsetof(struct ipstat, name) / sizeof(uint64_t))
/* flags passed to ip_output as last parameter */
#define IP_FORWARDING 0x1 /* most of ip header exists */
@@ -157,12 +161,7 @@ void kmod_ipstat_dec(int statnum);
#define IP_SENDTOIF 0x8 /* send on specific ifnet */
#define IP_ROUTETOIF SO_DONTROUTE /* 0x10 bypass routing tables */
#define IP_ALLOWBROADCAST SO_BROADCAST /* 0x20 can send broadcast packets */
-
-/*
- * mbuf flag used by ip_fastfwd
- */
-#define M_FASTFWD_OURS M_PROTO1 /* changed dst to local */
-#define M_IP_NEXTHOP M_PROTO2 /* explicit ip nexthop */
+#define IP_NODEFAULTFLOWID 0x40 /* Don't set the flowid from inp */
#ifdef __NO_STRICT_ALIGNMENT
#define IP_HDR_ALIGNED_P(ip) 1
@@ -175,8 +174,6 @@ struct inpcb;
struct route;
struct sockopt;
-VNET_DECLARE(struct ipstat, ipstat);
-VNET_DECLARE(u_short, ip_id); /* ip packet ctr, for ids */
VNET_DECLARE(int, ip_defttl); /* default IP ttl */
VNET_DECLARE(int, ipforwarding); /* ip forwarding */
#ifdef IPSTEALTH
@@ -191,7 +188,6 @@ VNET_DECLARE(int, rsvp_on);
VNET_DECLARE(int, drop_redirect);
extern struct pr_usrreqs rip_usrreqs;
-#define V_ipstat VNET(ipstat)
#define V_ip_id VNET(ip_id)
#define V_ip_defttl VNET(ip_defttl)
#define V_ipforwarding VNET(ipforwarding)
@@ -210,12 +206,9 @@ int inp_setmoptions(struct inpcb *, struct sockopt *);
int ip_ctloutput(struct socket *, struct sockopt *sopt);
void ip_drain(void);
int ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
- u_long if_hwassist_flags, int sw_csum);
+ u_long if_hwassist_flags);
void ip_forward(struct mbuf *m, int srcrt);
void ip_init(void);
-#ifdef VIMAGE
-void ip_destroy(void);
-#endif
extern int
(*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *,
struct ip_moptions *);
@@ -226,27 +219,22 @@ int ipproto_register(short);
int ipproto_unregister(short);
struct mbuf *
ip_reass(struct mbuf *);
-struct in_ifaddr *
- ip_rtaddr(struct in_addr, u_int fibnum);
void ip_savecontrol(struct inpcb *, struct mbuf **, struct ip *,
struct mbuf *);
void ip_slowtimo(void);
-u_int16_t ip_randomid(void);
+void ip_fillid(struct ip *);
int rip_ctloutput(struct socket *, struct sockopt *);
void rip_ctlinput(int, struct sockaddr *, void *);
void rip_init(void);
-#ifdef VIMAGE
-void rip_destroy(void);
-#endif
-void rip_input(struct mbuf *, int);
-int rip_output(struct mbuf *, struct socket *, u_long);
-void ipip_input(struct mbuf *, int);
-void rsvp_input(struct mbuf *, int);
+int rip_input(struct mbuf **, int *, int);
+int rip_output(struct mbuf *, struct socket *, ...);
+int ipip_input(struct mbuf **, int *, int);
+int rsvp_input(struct mbuf **, int *, int);
int ip_rsvp_init(struct socket *);
int ip_rsvp_done(void);
extern int (*ip_rsvp_vif)(struct socket *, struct sockopt *);
extern void (*ip_rsvp_force_done)(struct socket *);
-extern void (*rsvp_input_p)(struct mbuf *m, int off);
+extern int (*rsvp_input_p)(struct mbuf **, int *, int);
VNET_DECLARE(struct pfil_head, inet_pfil_hook); /* packet filter hooks */
#define V_inet_pfil_hook VNET(inet_pfil_hook)
@@ -285,7 +273,7 @@ enum {
IPFW_IS_MASK = 0x30000000, /* which source ? */
IPFW_IS_DIVERT = 0x20000000,
IPFW_IS_DUMMYNET =0x10000000,
- IPFW_IS_PIPE = 0x08000000, /* pip1=1, queue = 0 */
+ IPFW_IS_PIPE = 0x08000000, /* pipe=1, queue = 0 */
};
#define MTAG_IPFW 1148380143 /* IPFW-tagged cookie */
#define MTAG_IPFW_RULE 1262273568 /* rule reference */
@@ -294,9 +282,7 @@ enum {
struct ip_fw_args;
typedef int (*ip_fw_chk_ptr_t)(struct ip_fw_args *args);
typedef int (*ip_fw_ctl_ptr_t)(struct sockopt *);
-VNET_DECLARE(ip_fw_chk_ptr_t, ip_fw_chk_ptr);
VNET_DECLARE(ip_fw_ctl_ptr_t, ip_fw_ctl_ptr);
-#define V_ip_fw_chk_ptr VNET(ip_fw_chk_ptr)
#define V_ip_fw_ctl_ptr VNET(ip_fw_ctl_ptr)
/* Divert hooks. */
@@ -307,12 +293,6 @@ extern int (*ng_ipfw_input_p)(struct mbuf **, int,
extern int (*ip_dn_ctl_ptr)(struct sockopt *);
extern int (*ip_dn_io_ptr)(struct mbuf **, int, struct ip_fw_args *);
-
-VNET_DECLARE(int, ip_do_randomid);
-#define V_ip_do_randomid VNET(ip_do_randomid)
-#define ip_newid() ((V_ip_do_randomid != 0) ? ip_randomid() : \
- htons(V_ip_id++))
-
#endif /* _KERNEL */
#endif /* !_NETINET_IP_VAR_H_ */
diff --git a/freebsd/sys/netinet/libalias/alias.c b/freebsd/sys/netinet/libalias/alias.c
index 9e975122..a2cd987c 100644
--- a/freebsd/sys/netinet/libalias/alias.c
+++ b/freebsd/sys/netinet/libalias/alias.c
@@ -1724,7 +1724,7 @@ LibAliasUnLoadAllModule(void)
/* Unload all modules then reload everything. */
while ((p = first_handler()) != NULL) {
- detach_handler(p);
+ LibAliasDetachHandlers(p);
}
while ((t = walk_dll_chain()) != NULL) {
dlclose(t->handle);
@@ -1751,40 +1751,22 @@ LibAliasUnLoadAllModule(void)
struct mbuf *
m_megapullup(struct mbuf *m, int len) {
struct mbuf *mcl;
-
+
if (len > m->m_pkthdr.len)
goto bad;
-
- /* Do not reallocate packet if it is sequentional,
- * writable and has some extra space for expansion.
- * XXX: Constant 100bytes is completely empirical. */
-#define RESERVE 100
- if (m->m_next == NULL && M_WRITABLE(m) && M_TRAILINGSPACE(m) >= RESERVE)
+
+ if (m->m_next == NULL && M_WRITABLE(m))
return (m);
- if (len <= MCLBYTES - RESERVE) {
- mcl = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
- } else if (len < MJUM16BYTES) {
- int size;
- if (len <= MJUMPAGESIZE - RESERVE) {
- size = MJUMPAGESIZE;
- } else if (len <= MJUM9BYTES - RESERVE) {
- size = MJUM9BYTES;
- } else {
- size = MJUM16BYTES;
- };
- mcl = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, size);
- } else {
- goto bad;
- }
+ mcl = m_get2(len, M_NOWAIT, MT_DATA, M_PKTHDR);
if (mcl == NULL)
goto bad;
-
+ m_align(mcl, len);
m_move_pkthdr(mcl, m);
m_copydata(m, 0, len, mtod(mcl, caddr_t));
mcl->m_len = mcl->m_pkthdr.len = len;
m_freem(m);
-
+
return (mcl);
bad:
m_freem(m);
diff --git a/freebsd/sys/netinet/libalias/alias_cuseeme.c b/freebsd/sys/netinet/libalias/alias_cuseeme.c
index 1bdb7c4a..d6c9520c 100644
--- a/freebsd/sys/netinet/libalias/alias_cuseeme.c
+++ b/freebsd/sys/netinet/libalias/alias_cuseeme.c
@@ -58,14 +58,14 @@ __FBSDID("$FreeBSD$");
#define CUSEEME_PORT_NUMBER 7648
static void
-AliasHandleCUSeeMeOut(struct libalias *la, struct ip *pip,
+AliasHandleCUSeeMeOut(struct libalias *la, struct ip *pip,
struct alias_link *lnk);
static void
-AliasHandleCUSeeMeIn(struct libalias *la, struct ip *pip,
+AliasHandleCUSeeMeIn(struct libalias *la, struct ip *pip,
struct in_addr original_addr);
-static int
+static int
fingerprint(struct libalias *la, struct alias_data *ah)
{
@@ -76,7 +76,7 @@ fingerprint(struct libalias *la, struct alias_data *ah)
return (-1);
}
-static int
+static int
protohandlerin(struct libalias *la, struct ip *pip, struct alias_data *ah)
{
@@ -84,7 +84,7 @@ protohandlerin(struct libalias *la, struct ip *pip, struct alias_data *ah)
return (0);
}
-static int
+static int
protohandlerout(struct libalias *la, struct ip *pip, struct alias_data *ah)
{
@@ -94,20 +94,20 @@ protohandlerout(struct libalias *la, struct ip *pip, struct alias_data *ah)
/* Kernel module definition. */
struct proto_handler handlers[] = {
- {
- .pri = 120,
- .dir = OUT,
- .proto = UDP,
- .fingerprint = &fingerprint,
+ {
+ .pri = 120,
+ .dir = OUT,
+ .proto = UDP,
+ .fingerprint = &fingerprint,
.protohandler = &protohandlerout
- },
+ },
{
- .pri = 120,
- .dir = IN,
- .proto = UDP,
- .fingerprint = &fingerprint,
+ .pri = 120,
+ .dir = IN,
+ .proto = UDP,
+ .fingerprint = &fingerprint,
.protohandler = &protohandlerin
- },
+ },
{ EOH }
};
@@ -132,9 +132,9 @@ mod_handler(module_t mod, int type, void *data)
}
#ifdef _KERNEL
-static
+static
#endif
-moduledata_t
+moduledata_t
alias_mod = {
"alias_cuseeme", mod_handler, NULL
};
diff --git a/freebsd/sys/netinet/libalias/alias_db.c b/freebsd/sys/netinet/libalias/alias_db.c
index fabe586e..219d5d34 100644
--- a/freebsd/sys/netinet/libalias/alias_db.c
+++ b/freebsd/sys/netinet/libalias/alias_db.c
@@ -148,6 +148,7 @@ __FBSDID("$FreeBSD$");
#include <machine/stdarg.h>
#include <rtems/bsd/sys/param.h>
#include <sys/kernel.h>
+#include <sys/systm.h>
#include <rtems/bsd/sys/lock.h>
#include <sys/module.h>
#include <sys/rwlock.h>
@@ -350,24 +351,16 @@ MODULE_VERSION(libalias, 1);
static int
alias_mod_handler(module_t mod, int type, void *data)
{
- int error;
switch (type) {
- case MOD_LOAD:
- error = 0;
- handler_chain_init();
- break;
case MOD_QUIESCE:
case MOD_UNLOAD:
- handler_chain_destroy();
finishoff();
- error = 0;
- break;
+ case MOD_LOAD:
+ return (0);
default:
- error = EINVAL;
+ return (EINVAL);
}
-
- return (error);
}
static moduledata_t alias_mod = {
@@ -793,9 +786,9 @@ FindNewPortGroup(struct libalias *la,
struct alias_link *search_result;
for (j = 0; j < port_count; j++)
- if (0 != (search_result = FindLinkIn(la, dst_addr, alias_addr,
- dst_port, htons(port_sys + j),
- link_type, 0)))
+ if ((search_result = FindLinkIn(la, dst_addr,
+ alias_addr, dst_port, htons(port_sys + j),
+ link_type, 0)) != NULL)
break;
/* Found a good range, return base */
diff --git a/freebsd/sys/netinet/libalias/alias_dummy.c b/freebsd/sys/netinet/libalias/alias_dummy.c
index eacfac86..b4c00c20 100644
--- a/freebsd/sys/netinet/libalias/alias_dummy.c
+++ b/freebsd/sys/netinet/libalias/alias_dummy.c
@@ -29,7 +29,7 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
-/*
+/*
* Alias_dummy is just an empty skeleton used to demostrate how to write
* a module for libalias, that will run unalterated in userland or in
* kernel land.
@@ -61,19 +61,19 @@ __FBSDID("$FreeBSD$");
static void
AliasHandleDummy(struct libalias *la, struct ip *ip, struct alias_data *ah);
-static int
+static int
fingerprint(struct libalias *la, struct alias_data *ah)
{
- /*
- * Check here all the data that will be used later, if any field
+ /*
+ * Check here all the data that will be used later, if any field
* is empy/NULL, return a -1 value.
*/
- if (ah->dport == NULL || ah->sport == NULL || ah->lnk == NULL ||
+ if (ah->dport == NULL || ah->sport == NULL || ah->lnk == NULL ||
ah->maxpktsize == 0)
return (-1);
- /*
- * Fingerprint the incoming packet, if it matches any conditions
+ /*
+ * Fingerprint the incoming packet, if it matches any conditions
* return an OK value.
*/
if (ntohs(*ah->dport) == 123
@@ -82,12 +82,12 @@ fingerprint(struct libalias *la, struct alias_data *ah)
return (-1); /* I don't recognize this packet. */
}
-/*
- * Wrap in this general purpose function, the real function used to alias the
+/*
+ * Wrap in this general purpose function, the real function used to alias the
* packets.
*/
-static int
+static int
protohandler(struct libalias *la, struct ip *pip, struct alias_data *ah)
{
@@ -95,22 +95,22 @@ protohandler(struct libalias *la, struct ip *pip, struct alias_data *ah)
return (0);
}
-/*
- * NOTA BENE: the next variable MUST NOT be renamed in any case if you want
- * your module to work in userland, cause it's used to find and use all
+/*
+ * NOTA BENE: the next variable MUST NOT be renamed in any case if you want
+ * your module to work in userland, cause it's used to find and use all
* the protocol handlers present in every module.
- * So WATCH OUT, your module needs this variables and it needs it with
+ * So WATCH OUT, your module needs this variables and it needs it with
* ITS EXACT NAME: handlers.
*/
struct proto_handler handlers [] = {
- {
- .pri = 666,
- .dir = IN|OUT,
- .proto = UDP|TCP,
- .fingerprint = &fingerprint,
+ {
+ .pri = 666,
+ .dir = IN|OUT,
+ .proto = UDP|TCP,
+ .fingerprint = &fingerprint,
.protohandler = &protohandler
- },
+ },
{ EOH }
};
@@ -119,7 +119,7 @@ mod_handler(module_t mod, int type, void *data)
{
int error;
- switch (type) {
+ switch (type) {
case MOD_LOAD:
error = 0;
LibAliasAttachHandlers(handlers);
diff --git a/freebsd/sys/netinet/libalias/alias_irc.c b/freebsd/sys/netinet/libalias/alias_irc.c
index 880d897e..44ff6d92 100644
--- a/freebsd/sys/netinet/libalias/alias_irc.c
+++ b/freebsd/sys/netinet/libalias/alias_irc.c
@@ -46,7 +46,7 @@ __FBSDID("$FreeBSD$");
Version 2.1: May, 1997 (cjm)
Very minor changes to conform with
local/global/function naming conventions
- withing the packet alising module.
+ within the packet aliasing module.
*/
/* Includes */
@@ -94,11 +94,11 @@ static void
AliasHandleIrcOut(struct libalias *, struct ip *, struct alias_link *,
int maxpacketsize);
-static int
+static int
fingerprint(struct libalias *la, struct alias_data *ah)
{
- if (ah->dport == NULL || ah->dport == NULL || ah->lnk == NULL ||
+ if (ah->dport == NULL || ah->dport == NULL || ah->lnk == NULL ||
ah->maxpktsize == 0)
return (-1);
if (ntohs(*ah->dport) == IRC_CONTROL_PORT_NUMBER_1
@@ -107,7 +107,7 @@ fingerprint(struct libalias *la, struct alias_data *ah)
return (-1);
}
-static int
+static int
protohandler(struct libalias *la, struct ip *pip, struct alias_data *ah)
{
@@ -120,13 +120,13 @@ protohandler(struct libalias *la, struct ip *pip, struct alias_data *ah)
}
struct proto_handler handlers[] = {
- {
- .pri = 90,
- .dir = OUT,
- .proto = TCP,
- .fingerprint = &fingerprint,
+ {
+ .pri = 90,
+ .dir = OUT,
+ .proto = TCP,
+ .fingerprint = &fingerprint,
.protohandler = &protohandler
- },
+ },
{ EOH }
};
@@ -151,7 +151,7 @@ mod_handler(module_t mod, int type, void *data)
}
#ifdef _KERNEL
-static
+static
#endif
moduledata_t alias_mod = {
"alias_irc", mod_handler, NULL
@@ -484,7 +484,7 @@ lPACKET_DONE:
which will generate a type-error on all but 32-bit machines.
[Note 2] This routine really ought to be replaced with one that
- creates a transparent proxy on the aliasing host, to allow arbitary
+ creates a transparent proxy on the aliasing host, to allow arbitrary
changes in the TCP stream. This should not be too difficult given
this base; I (ee) will try to do this some time later.
*/
diff --git a/freebsd/sys/netinet/libalias/alias_local.h b/freebsd/sys/netinet/libalias/alias_local.h
index a7b3fe19..3010be84 100644
--- a/freebsd/sys/netinet/libalias/alias_local.h
+++ b/freebsd/sys/netinet/libalias/alias_local.h
@@ -357,7 +357,7 @@ void PunchFWHole(struct alias_link *_lnk);
/* Housekeeping function */
void HouseKeeping(struct libalias *);
-/* Tcp specfic routines */
+/* Tcp specific routines */
/* lint -save -library Suppress flexelint warnings */
/* Transparent proxy routines */
diff --git a/freebsd/sys/netinet/libalias/alias_mod.c b/freebsd/sys/netinet/libalias/alias_mod.c
index 0e0bd56a..6acbbee6 100644
--- a/freebsd/sys/netinet/libalias/alias_mod.c
+++ b/freebsd/sys/netinet/libalias/alias_mod.c
@@ -54,201 +54,82 @@ __FBSDID("$FreeBSD$");
#endif
/* Protocol and userland module handlers chains. */
-LIST_HEAD(handler_chain, proto_handler) handler_chain = LIST_HEAD_INITIALIZER(handler_chain);
-#ifdef _KERNEL
-struct rwlock handler_rw;
-#endif
-SLIST_HEAD(dll_chain, dll) dll_chain = SLIST_HEAD_INITIALIZER(dll_chain);
-
-#ifdef _KERNEL
-
-#define LIBALIAS_RWLOCK_INIT() \
- rw_init(&handler_rw, "Libalias_modules_rwlock")
-#define LIBALIAS_RWLOCK_DESTROY() rw_destroy(&handler_rw)
-#define LIBALIAS_WLOCK_ASSERT() \
- rw_assert(&handler_rw, RA_WLOCKED)
-
-static __inline void
-LIBALIAS_RLOCK(void)
-{
- rw_rlock(&handler_rw);
-}
-
-static __inline void
-LIBALIAS_RUNLOCK(void)
-{
- rw_runlock(&handler_rw);
-}
-
-static __inline void
-LIBALIAS_WLOCK(void)
-{
- rw_wlock(&handler_rw);
-}
-
-static __inline void
-LIBALIAS_WUNLOCK(void)
-{
- rw_wunlock(&handler_rw);
-}
-
-static void
-_handler_chain_init(void)
-{
-
- if (!rw_initialized(&handler_rw))
- LIBALIAS_RWLOCK_INIT();
-}
-
-static void
-_handler_chain_destroy(void)
-{
-
- if (rw_initialized(&handler_rw))
- LIBALIAS_RWLOCK_DESTROY();
-}
-
-#else
-#define LIBALIAS_RWLOCK_INIT() ;
-#define LIBALIAS_RWLOCK_DESTROY() ;
-#define LIBALIAS_WLOCK_ASSERT() ;
-#define LIBALIAS_RLOCK() ;
-#define LIBALIAS_RUNLOCK() ;
-#define LIBALIAS_WLOCK() ;
-#define LIBALIAS_WUNLOCK() ;
-#define _handler_chain_init() ;
-#define _handler_chain_destroy() ;
-#endif
-
-void
-handler_chain_init(void)
-{
- _handler_chain_init();
-}
-
-void
-handler_chain_destroy(void)
-{
- _handler_chain_destroy();
-}
+static TAILQ_HEAD(handler_chain, proto_handler) handler_chain =
+ TAILQ_HEAD_INITIALIZER(handler_chain);
static int
-_attach_handler(struct proto_handler *p)
+attach_handler(struct proto_handler *p)
{
struct proto_handler *b;
- LIBALIAS_WLOCK_ASSERT();
- b = NULL;
- LIST_FOREACH(b, &handler_chain, entries) {
- if ((b->pri == p->pri) &&
+ TAILQ_FOREACH(b, &handler_chain, link) {
+ if ((b->pri == p->pri) &&
(b->dir == p->dir) &&
(b->proto == p->proto))
- return (EEXIST); /* Priority conflict. */
+ return (EEXIST);
if (b->pri > p->pri) {
- LIST_INSERT_BEFORE(b, p, entries);
+ TAILQ_INSERT_BEFORE(b, p, link);
return (0);
}
}
- /* End of list or found right position, inserts here. */
- if (b)
- LIST_INSERT_AFTER(b, p, entries);
- else
- LIST_INSERT_HEAD(&handler_chain, p, entries);
- return (0);
-}
-static int
-_detach_handler(struct proto_handler *p)
-{
- struct proto_handler *b, *b_tmp;
+ TAILQ_INSERT_TAIL(&handler_chain, p, link);
- LIBALIAS_WLOCK_ASSERT();
- LIST_FOREACH_SAFE(b, &handler_chain, entries, b_tmp) {
- if (b == p) {
- LIST_REMOVE(b, entries);
- return (0);
- }
- }
- return (ENOENT); /* Handler not found. */
+ return (0);
}
int
-LibAliasAttachHandlers(struct proto_handler *_p)
+LibAliasAttachHandlers(struct proto_handler *p)
{
- int i, error;
+ int error;
- LIBALIAS_WLOCK();
- error = -1;
- for (i = 0; 1; i++) {
- if (*((int *)&_p[i]) == EOH)
- break;
- error = _attach_handler(&_p[i]);
- if (error != 0)
- break;
+ while (p->dir != NODIR) {
+ error = attach_handler(p);
+ if (error)
+ return (error);
+ p++;
}
- LIBALIAS_WUNLOCK();
- return (error);
+
+ return (0);
}
+/* XXXGL: should be void, but no good reason to break ABI */
int
-LibAliasDetachHandlers(struct proto_handler *_p)
+LibAliasDetachHandlers(struct proto_handler *p)
{
- int i, error;
- LIBALIAS_WLOCK();
- error = -1;
- for (i = 0; 1; i++) {
- if (*((int *)&_p[i]) == EOH)
- break;
- error = _detach_handler(&_p[i]);
- if (error != 0)
- break;
+ while (p->dir != NODIR) {
+ TAILQ_REMOVE(&handler_chain, p, link);
+ p++;
}
- LIBALIAS_WUNLOCK();
- return (error);
-}
-
-int
-detach_handler(struct proto_handler *_p)
-{
- int error;
- LIBALIAS_WLOCK();
- error = -1;
- error = _detach_handler(_p);
- LIBALIAS_WUNLOCK();
- return (error);
+ return (0);
}
int
-find_handler(int8_t dir, int8_t proto, struct libalias *la, __unused struct ip *pip,
+find_handler(int8_t dir, int8_t proto, struct libalias *la, struct ip *ip,
struct alias_data *ad)
{
struct proto_handler *p;
- int error;
- LIBALIAS_RLOCK();
- error = ENOENT;
- LIST_FOREACH(p, &handler_chain, entries) {
- if ((p->dir & dir) && (p->proto & proto))
- if (p->fingerprint(la, ad) == 0) {
- error = p->protohandler(la, pip, ad);
- break;
- }
- }
- LIBALIAS_RUNLOCK();
- return (error);
+ TAILQ_FOREACH(p, &handler_chain, link)
+ if ((p->dir & dir) && (p->proto & proto) &&
+ p->fingerprint(la, ad) == 0)
+ return (p->protohandler(la, ip, ad));
+
+ return (ENOENT);
}
struct proto_handler *
first_handler(void)
{
-
- return (LIST_FIRST(&handler_chain));
+
+ return (TAILQ_FIRST(&handler_chain));
}
+#ifndef _KERNEL
/* Dll manipulation code - this code is not thread safe... */
-
+SLIST_HEAD(dll_chain, dll) dll_chain = SLIST_HEAD_INITIALIZER(dll_chain);
int
attach_dll(struct dll *p)
{
@@ -272,7 +153,7 @@ detach_dll(char *p)
error = NULL;
SLIST_FOREACH_SAFE(b, &dll_chain, next, b_tmp)
if (!strncmp(b->name, p, DLL_LEN)) {
- SLIST_REMOVE(&dll_chain, b, dll, next);
+ SLIST_REMOVE(&dll_chain, b, dll, next);
error = b;
break;
}
@@ -290,3 +171,4 @@ walk_dll_chain(void)
SLIST_REMOVE_HEAD(&dll_chain, next);
return (t);
}
+#endif /* !_KERNEL */
diff --git a/freebsd/sys/netinet/libalias/alias_mod.h b/freebsd/sys/netinet/libalias/alias_mod.h
index 727df8e6..fd020c46 100644
--- a/freebsd/sys/netinet/libalias/alias_mod.h
+++ b/freebsd/sys/netinet/libalias/alias_mod.h
@@ -54,102 +54,92 @@ MALLOC_DECLARE(M_ALIAS);
#endif
#endif
-/* Protocol handlers struct & function. */
+/* Packet flow direction flags. */
+#define IN 0x0001
+#define OUT 0x0002
+#define NODIR 0x4000
-/* Packet flow direction. */
-#define IN 1
-#define OUT 2
+/* Working protocol flags. */
+#define IP 0x01
+#define TCP 0x02
+#define UDP 0x04
-/* Working protocol. */
-#define IP 1
-#define TCP 2
-#define UDP 4
-
-/*
+/*
* Data passed to protocol handler module, it must be filled
* right before calling find_handler() to determine which
* module is elegible to be called.
*/
+struct alias_data {
+ struct alias_link *lnk;
+ struct in_addr *oaddr; /* Original address. */
+ struct in_addr *aaddr; /* Alias address. */
+ uint16_t *aport; /* Alias port. */
+ uint16_t *sport, *dport; /* Source & destination port */
+ uint16_t maxpktsize; /* Max packet size. */
+};
-struct alias_data {
- struct alias_link *lnk;
- struct in_addr *oaddr; /* Original address. */
- struct in_addr *aaddr; /* Alias address. */
- uint16_t *aport; /* Alias port. */
- uint16_t *sport, *dport; /* Source & destination port */
- uint16_t maxpktsize; /* Max packet size. */
-};
-
-/*
+/*
* This structure contains all the information necessary to make
* a protocol handler correctly work.
*/
-
struct proto_handler {
- u_int pri; /* Handler priority. */
- int16_t dir; /* Flow direction. */
- uint8_t proto; /* Working protocol. */
- int (*fingerprint)(struct libalias *, /* Fingerprint * function. */
- struct alias_data *);
- int (*protohandler)(struct libalias *, /* Aliasing * function. */
- struct ip *, struct alias_data *);
- LIST_ENTRY(proto_handler) entries;
+ u_int pri; /* Handler priority. */
+ int16_t dir; /* Flow direction. */
+ uint8_t proto; /* Working protocol. */
+ /* Fingerprint function. */
+ int (*fingerprint)(struct libalias *, struct alias_data *);
+ /* Aliasing function. */
+ int (*protohandler)(struct libalias *, struct ip *,
+ struct alias_data *);
+ TAILQ_ENTRY(proto_handler) link;
};
+/* End of handlers. */
+#define EOH .dir = NODIR
-/*
+/* Functions used with protocol handlers. */
+int LibAliasAttachHandlers(struct proto_handler *);
+int LibAliasDetachHandlers(struct proto_handler *);
+int find_handler(int8_t, int8_t, struct libalias *, struct ip *,
+ struct alias_data *);
+struct proto_handler *first_handler(void);
+
+#ifndef _KERNEL
+/*
* Used only in userland when libalias needs to keep track of all
* module loaded. In kernel land (kld mode) we don't need to care
* care about libalias modules cause it's kld to do it for us.
*/
-
-#define DLL_LEN 32
-struct dll {
- char name[DLL_LEN]; /* Name of module. */
- void *handle; /*
- * Ptr to shared obj obtained through
- * dlopen() - use this ptr to get access
- * to any symbols from a loaded module
- * via dlsym().
- */
- SLIST_ENTRY(dll) next;
+#define DLL_LEN 32
+struct dll {
+ char name[DLL_LEN]; /* Name of module. */
+ void *handle; /*
+ * Ptr to shared obj obtained through
+ * dlopen() - use this ptr to get access
+ * to any symbols from a loaded module
+ * via dlsym().
+ */
+ SLIST_ENTRY(dll) next;
};
-/* Functions used with protocol handlers. */
-
-void handler_chain_init(void);
-void handler_chain_destroy(void);
-int LibAliasAttachHandlers(struct proto_handler *);
-int LibAliasDetachHandlers(struct proto_handler *);
-int detach_handler(struct proto_handler *);
-int find_handler(int8_t, int8_t, struct libalias *,
- struct ip *, struct alias_data *);
-struct proto_handler *first_handler(void);
-
/* Functions used with dll module. */
+void dll_chain_init(void);
+void dll_chain_destroy(void);
+int attach_dll(struct dll *);
+void *detach_dll(char *);
+struct dll *walk_dll_chain(void);
-void dll_chain_init(void);
-void dll_chain_destroy(void);
-int attach_dll(struct dll *);
-void *detach_dll(char *);
-struct dll *walk_dll_chain(void);
-
-/* End of handlers. */
-#define EOH -1
-
-/*
+/*
* Some defines borrowed from sys/module.h used to compile a kld
* in userland as a shared lib.
*/
-
-#ifndef _KERNEL
typedef enum modeventtype {
- MOD_LOAD,
- MOD_UNLOAD,
- MOD_SHUTDOWN,
- MOD_QUIESCE
+ MOD_LOAD,
+ MOD_UNLOAD,
+ MOD_SHUTDOWN,
+ MOD_QUIESCE
} modeventtype_t;
-
+
typedef struct module *module_t;
typedef int (*modeventhand_t)(module_t, int /* modeventtype_t */, void *);
@@ -157,10 +147,10 @@ typedef int (*modeventhand_t)(module_t, int /* modeventtype_t */, void *);
* Struct for registering modules statically via SYSINIT.
*/
typedef struct moduledata {
- const char *name; /* module name */
- modeventhand_t evhand; /* event handler */
- void *priv; /* extra data */
+ const char *name; /* module name */
+ modeventhand_t evhand; /* event handler */
+ void *priv; /* extra data */
} moduledata_t;
-#endif
+#endif /* !_KERNEL */
-#endif /* !_ALIAS_MOD_H_ */
+#endif /* !_ALIAS_MOD_H_ */
diff --git a/freebsd/sys/netinet/libalias/alias_nbt.c b/freebsd/sys/netinet/libalias/alias_nbt.c
index 5a917872..c10f9b48 100644
--- a/freebsd/sys/netinet/libalias/alias_nbt.c
+++ b/freebsd/sys/netinet/libalias/alias_nbt.c
@@ -72,17 +72,17 @@ __FBSDID("$FreeBSD$");
#define NETBIOS_DGM_PORT_NUMBER 138
static int
-AliasHandleUdpNbt(struct libalias *, struct ip *, struct alias_link *,
+AliasHandleUdpNbt(struct libalias *, struct ip *, struct alias_link *,
struct in_addr *, u_short);
static int
AliasHandleUdpNbtNS(struct libalias *, struct ip *, struct alias_link *,
struct in_addr *, u_short *, struct in_addr *, u_short *);
-static int
+static int
fingerprint1(struct libalias *la, struct alias_data *ah)
{
- if (ah->dport == NULL || ah->sport == NULL || ah->lnk == NULL ||
+ if (ah->dport == NULL || ah->sport == NULL || ah->lnk == NULL ||
ah->aaddr == NULL || ah->aport == NULL)
return (-1);
if (ntohs(*ah->dport) == NETBIOS_DGM_PORT_NUMBER
@@ -91,18 +91,18 @@ fingerprint1(struct libalias *la, struct alias_data *ah)
return (-1);
}
-static int
+static int
protohandler1(struct libalias *la, struct ip *pip, struct alias_data *ah)
{
return (AliasHandleUdpNbt(la, pip, ah->lnk, ah->aaddr, *ah->aport));
}
-static int
+static int
fingerprint2(struct libalias *la, struct alias_data *ah)
{
- if (ah->dport == NULL || ah->sport == NULL || ah->lnk == NULL ||
+ if (ah->dport == NULL || ah->sport == NULL || ah->lnk == NULL ||
ah->aaddr == NULL || ah->aport == NULL)
return (-1);
if (ntohs(*ah->dport) == NETBIOS_NS_PORT_NUMBER
@@ -111,7 +111,7 @@ fingerprint2(struct libalias *la, struct alias_data *ah)
return (-1);
}
-static int
+static int
protohandler2in(struct libalias *la, struct ip *pip, struct alias_data *ah)
{
@@ -120,7 +120,7 @@ protohandler2in(struct libalias *la, struct ip *pip, struct alias_data *ah)
return (0);
}
-static int
+static int
protohandler2out(struct libalias *la, struct ip *pip, struct alias_data *ah)
{
@@ -130,27 +130,27 @@ protohandler2out(struct libalias *la, struct ip *pip, struct alias_data *ah)
/* Kernel module definition. */
struct proto_handler handlers[] = {
- {
- .pri = 130,
- .dir = IN|OUT,
- .proto = UDP,
- .fingerprint = &fingerprint1,
+ {
+ .pri = 130,
+ .dir = IN|OUT,
+ .proto = UDP,
+ .fingerprint = &fingerprint1,
.protohandler = &protohandler1
- },
- {
- .pri = 140,
- .dir = IN,
- .proto = UDP,
- .fingerprint = &fingerprint2,
+ },
+ {
+ .pri = 140,
+ .dir = IN,
+ .proto = UDP,
+ .fingerprint = &fingerprint2,
.protohandler = &protohandler2in
- },
- {
- .pri = 140,
- .dir = OUT,
- .proto = UDP,
- .fingerprint = &fingerprint2,
+ },
+ {
+ .pri = 140,
+ .dir = OUT,
+ .proto = UDP,
+ .fingerprint = &fingerprint2,
.protohandler = &protohandler2out
- },
+ },
{ EOH }
};
@@ -175,7 +175,7 @@ mod_handler(module_t mod, int type, void *data)
}
#ifdef _KERNEL
-static
+static
#endif
moduledata_t alias_mod = {
"alias_nbt", mod_handler, NULL
diff --git a/freebsd/sys/netinet/libalias/alias_pptp.c b/freebsd/sys/netinet/libalias/alias_pptp.c
index e8205db0..39861c5c 100644
--- a/freebsd/sys/netinet/libalias/alias_pptp.c
+++ b/freebsd/sys/netinet/libalias/alias_pptp.c
@@ -80,7 +80,7 @@ AliasHandlePptpGreOut(struct libalias *, struct ip *);
static int
AliasHandlePptpGreIn(struct libalias *, struct ip *);
-static int
+static int
fingerprint(struct libalias *la, struct alias_data *ah)
{
@@ -92,14 +92,14 @@ fingerprint(struct libalias *la, struct alias_data *ah)
return (-1);
}
-static int
+static int
fingerprintgre(struct libalias *la, struct alias_data *ah)
{
return (0);
}
-static int
+static int
protohandlerin(struct libalias *la, struct ip *pip, struct alias_data *ah)
{
@@ -107,7 +107,7 @@ protohandlerin(struct libalias *la, struct ip *pip, struct alias_data *ah)
return (0);
}
-static int
+static int
protohandlerout(struct libalias *la, struct ip *pip, struct alias_data *ah)
{
@@ -115,7 +115,7 @@ protohandlerout(struct libalias *la, struct ip *pip, struct alias_data *ah)
return (0);
}
-static int
+static int
protohandlergrein(struct libalias *la, struct ip *pip, struct alias_data *ah)
{
@@ -125,7 +125,7 @@ protohandlergrein(struct libalias *la, struct ip *pip, struct alias_data *ah)
return (-1);
}
-static int
+static int
protohandlergreout(struct libalias *la, struct ip *pip, struct alias_data *ah)
{
@@ -136,39 +136,39 @@ protohandlergreout(struct libalias *la, struct ip *pip, struct alias_data *ah)
/* Kernel module definition. */
struct proto_handler handlers[] = {
- {
- .pri = 200,
- .dir = IN,
- .proto = TCP,
- .fingerprint = &fingerprint,
+ {
+ .pri = 200,
+ .dir = IN,
+ .proto = TCP,
+ .fingerprint = &fingerprint,
.protohandler = &protohandlerin
},
- {
- .pri = 210,
- .dir = OUT,
- .proto = TCP,
- .fingerprint = &fingerprint,
+ {
+ .pri = 210,
+ .dir = OUT,
+ .proto = TCP,
+ .fingerprint = &fingerprint,
.protohandler = &protohandlerout
},
-/*
- * WATCH OUT!!! these 2 handlers NEED a priority of INT_MAX (highest possible)
+/*
+ * WATCH OUT!!! these 2 handlers NEED a priority of INT_MAX (highest possible)
* cause they will ALWAYS process packets, so they must be the last one
* in chain: look fingerprintgre() above.
*/
- {
- .pri = INT_MAX,
- .dir = IN,
- .proto = IP,
- .fingerprint = &fingerprintgre,
+ {
+ .pri = INT_MAX,
+ .dir = IN,
+ .proto = IP,
+ .fingerprint = &fingerprintgre,
.protohandler = &protohandlergrein
},
- {
- .pri = INT_MAX,
- .dir = OUT,
- .proto = IP,
- .fingerprint = &fingerprintgre,
+ {
+ .pri = INT_MAX,
+ .dir = OUT,
+ .proto = IP,
+ .fingerprint = &fingerprintgre,
.protohandler = &protohandlergreout
- },
+ },
{ EOH }
};
static int
@@ -192,7 +192,7 @@ mod_handler(module_t mod, int type, void *data)
}
#ifdef _KERNEL
-static
+static
#endif
moduledata_t alias_mod = {
"alias_pptp", mod_handler, NULL
diff --git a/freebsd/sys/netinet/libalias/alias_sctp.h b/freebsd/sys/netinet/libalias/alias_sctp.h
index 840917ad..99cceee4 100644
--- a/freebsd/sys/netinet/libalias/alias_sctp.h
+++ b/freebsd/sys/netinet/libalias/alias_sctp.h
@@ -92,7 +92,6 @@
#ifndef _KERNEL
#include <stdlib.h>
#include <stdio.h>
-#include <curses.h>
#endif //#ifdef _KERNEL
diff --git a/freebsd/sys/netinet/libalias/alias_skinny.c b/freebsd/sys/netinet/libalias/alias_skinny.c
index 9f292916..b1f8f8c7 100644
--- a/freebsd/sys/netinet/libalias/alias_skinny.c
+++ b/freebsd/sys/netinet/libalias/alias_skinny.c
@@ -58,7 +58,7 @@
static void
AliasHandleSkinny(struct libalias *, struct ip *, struct alias_link *);
-static int
+static int
fingerprint(struct libalias *la, struct alias_data *ah)
{
@@ -70,7 +70,7 @@ fingerprint(struct libalias *la, struct alias_data *ah)
return (-1);
}
-static int
+static int
protohandler(struct libalias *la, struct ip *pip, struct alias_data *ah)
{
@@ -79,13 +79,13 @@ protohandler(struct libalias *la, struct ip *pip, struct alias_data *ah)
}
struct proto_handler handlers[] = {
- {
- .pri = 110,
- .dir = IN|OUT,
- .proto = TCP,
- .fingerprint = &fingerprint,
+ {
+ .pri = 110,
+ .dir = IN|OUT,
+ .proto = TCP,
+ .fingerprint = &fingerprint,
.protohandler = &protohandler
- },
+ },
{ EOH }
};
@@ -110,7 +110,7 @@ mod_handler(module_t mod, int type, void *data)
}
#ifdef _KERNEL
-static
+static
#endif
moduledata_t alias_mod = {
"alias_skinny", mod_handler, NULL
@@ -342,7 +342,7 @@ AliasHandleSkinny(struct libalias *la, struct ip *pip, struct alias_link *lnk)
* through the packet using len to determine message boundaries.
* This comes into play big time with port messages being in the
* same packet as register messages. Also, open receive channel
- * acks are usually buried in a pakcet some 400 bytes long.
+ * acks are usually buried in a packet some 400 bytes long.
*/
while (dlen >= skinny_hdr_len) {
len = (sd->len);
diff --git a/freebsd/sys/netinet/libalias/alias_smedia.c b/freebsd/sys/netinet/libalias/alias_smedia.c
index 47ae2748..9578a4af 100644
--- a/freebsd/sys/netinet/libalias/alias_smedia.c
+++ b/freebsd/sys/netinet/libalias/alias_smedia.c
@@ -133,14 +133,14 @@ __FBSDID("$FreeBSD$");
static void
AliasHandleRtspOut(struct libalias *, struct ip *, struct alias_link *,
int maxpacketsize);
-static int
+static int
fingerprint(struct libalias *la, struct alias_data *ah)
{
if (ah->dport != NULL && ah->aport != NULL && ah->sport != NULL &&
ntohs(*ah->dport) == TFTP_PORT_NUMBER)
return (0);
- if (ah->dport == NULL || ah->sport == NULL || ah->lnk == NULL ||
+ if (ah->dport == NULL || ah->sport == NULL || ah->lnk == NULL ||
ah->maxpktsize == 0)
return (-1);
if (ntohs(*ah->dport) == RTSP_CONTROL_PORT_NUMBER_1
@@ -151,7 +151,7 @@ fingerprint(struct libalias *la, struct alias_data *ah)
return (-1);
}
-static int
+static int
protohandler(struct libalias *la, struct ip *pip, struct alias_data *ah)
{
@@ -163,13 +163,13 @@ protohandler(struct libalias *la, struct ip *pip, struct alias_data *ah)
}
struct proto_handler handlers[] = {
- {
- .pri = 100,
- .dir = OUT,
+ {
+ .pri = 100,
+ .dir = OUT,
.proto = TCP|UDP,
- .fingerprint = &fingerprint,
+ .fingerprint = &fingerprint,
.protohandler = &protohandler
- },
+ },
{ EOH }
};
@@ -194,7 +194,7 @@ mod_handler(module_t mod, int type, void *data)
}
#ifdef _KERNEL
-static
+static
#endif
moduledata_t alias_mod = {
"alias_smedia", mod_handler, NULL
@@ -408,7 +408,7 @@ alias_rtsp_out(struct libalias *la, struct ip *pip,
SetAckModified(lnk);
tc = (struct tcphdr *)ip_next(pip);
delta = GetDeltaSeqOut(tc->th_seq, lnk);
- AddSeq(lnk, delta + new_dlen - dlen, pip->ip_hl, pip->ip_len,
+ AddSeq(lnk, delta + new_dlen - dlen, pip->ip_hl, pip->ip_len,
tc->th_seq, tc->th_off);
new_len = htons(hlen + new_dlen);
@@ -520,7 +520,7 @@ AliasHandleRtspOut(struct libalias *la, struct ip *pip, struct alias_link *lnk,
/*
* When aliasing a server, check for the 200 reply
- * Accomodate varying number of blanks between 200 & OK
+ * Accommodate varying number of blanks between 200 & OK
*/
if (dlen >= (int)strlen(str200)) {
diff --git a/freebsd/sys/netinet/pim_var.h b/freebsd/sys/netinet/pim_var.h
index 41657b61..ae876c94 100644
--- a/freebsd/sys/netinet/pim_var.h
+++ b/freebsd/sys/netinet/pim_var.h
@@ -46,38 +46,33 @@
* PIM statistics kept in the kernel
*/
struct pimstat {
- u_quad_t pims_rcv_total_msgs; /* total PIM messages received */
- u_quad_t pims_rcv_total_bytes; /* total PIM bytes received */
- u_quad_t pims_rcv_tooshort; /* rcvd with too few bytes */
- u_quad_t pims_rcv_badsum; /* rcvd with bad checksum */
- u_quad_t pims_rcv_badversion; /* rcvd bad PIM version */
- u_quad_t pims_rcv_registers_msgs; /* rcvd regs. msgs (data only) */
- u_quad_t pims_rcv_registers_bytes; /* rcvd regs. bytes (data only) */
- u_quad_t pims_rcv_registers_wrongiif; /* rcvd regs. on wrong iif */
- u_quad_t pims_rcv_badregisters; /* rcvd invalid registers */
- u_quad_t pims_snd_registers_msgs; /* sent regs. msgs (data only) */
- u_quad_t pims_snd_registers_bytes; /* sent regs. bytes (data only) */
+ uint64_t pims_rcv_total_msgs; /* total PIM messages received */
+ uint64_t pims_rcv_total_bytes; /* total PIM bytes received */
+ uint64_t pims_rcv_tooshort; /* rcvd with too few bytes */
+ uint64_t pims_rcv_badsum; /* rcvd with bad checksum */
+ uint64_t pims_rcv_badversion; /* rcvd bad PIM version */
+ uint64_t pims_rcv_registers_msgs; /* rcvd regs. msgs (data only) */
+ uint64_t pims_rcv_registers_bytes; /* rcvd regs. bytes (data only) */
+ uint64_t pims_rcv_registers_wrongiif; /* rcvd regs. on wrong iif */
+ uint64_t pims_rcv_badregisters; /* rcvd invalid registers */
+ uint64_t pims_snd_registers_msgs; /* sent regs. msgs (data only) */
+ uint64_t pims_snd_registers_bytes; /* sent regs. bytes (data only) */
};
#ifdef _KERNEL
-#define PIMSTAT_ADD(name, val) V_pimstat.name += (val)
+#define PIMSTAT_ADD(name, val) \
+ VNET_PCPUSTAT_ADD(struct pimstat, pimstat, name, (val))
#define PIMSTAT_INC(name) PIMSTAT_ADD(name, 1)
#endif
/*
- * Names for PIM sysctl objects
+ * Identifiers for PIM sysctl nodes
*/
#define PIMCTL_STATS 1 /* statistics (read-only) */
-#define PIMCTL_MAXID 2
-
-#define PIMCTL_NAMES { \
- { 0, 0 }, \
- { "stats", CTLTYPE_STRUCT }, \
-}
#ifdef _KERNEL
-void pim_input(struct mbuf *, int);
+int pim_input(struct mbuf **, int *, int);
SYSCTL_DECL(_net_inet_pim);
#endif
diff --git a/freebsd/sys/netinet/raw_ip.c b/freebsd/sys/netinet/raw_ip.c
index 827eca6e..a4679586 100644
--- a/freebsd/sys/netinet/raw_ip.c
+++ b/freebsd/sys/netinet/raw_ip.c
@@ -42,12 +42,14 @@ __FBSDID("$FreeBSD$");
#include <rtems/bsd/sys/param.h>
#include <sys/jail.h>
#include <sys/kernel.h>
+#include <sys/eventhandler.h>
#include <rtems/bsd/sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/protosw.h>
+#include <sys/rmlock.h>
#include <sys/rwlock.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
@@ -59,6 +61,7 @@ __FBSDID("$FreeBSD$");
#include <vm/uma.h>
#include <net/if.h>
+#include <net/if_var.h>
#include <net/route.h>
#include <net/vnet.h>
@@ -70,15 +73,17 @@ __FBSDID("$FreeBSD$");
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/ip_mroute.h>
+#include <netinet/ip_icmp.h>
#ifdef IPSEC
#include <netipsec/ipsec.h>
#endif /*IPSEC*/
+#include <machine/stdarg.h>
#include <security/mac/mac_framework.h>
VNET_DEFINE(int, ip_defttl) = IPDEFTTL;
-SYSCTL_VNET_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(ip_defttl), 0,
"Maximum TTL on IP packets");
@@ -102,9 +107,6 @@ void (*ip_divert_ptr)(struct mbuf *, int);
int (*ng_ipfw_input_p)(struct mbuf **, int,
struct ip_fw_args *, int);
-/* Hook for telling pf that the destination address changed */
-void (*m_addr_chg_pf_p)(struct mbuf *m);
-
#ifdef INET
/*
* Hooks for multicast routing. They all default to NULL, so leave them not
@@ -128,11 +130,13 @@ int (*mrt_ioctl)(u_long, caddr_t, int);
int (*legal_vif_num)(int);
u_long (*ip_mcast_src)(int);
-void (*rsvp_input_p)(struct mbuf *m, int off);
+int (*rsvp_input_p)(struct mbuf **, int *, int);
int (*ip_rsvp_vif)(struct socket *, struct sockopt *);
void (*ip_rsvp_force_done)(struct socket *);
#endif /* INET */
+extern struct protosw inetsw[];
+
u_long rip_sendspace = 9216;
SYSCTL_ULONG(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW,
&rip_sendspace, 0, "Maximum outgoing raw IP datagram size");
@@ -210,19 +214,19 @@ rip_init(void)
{
in_pcbinfo_init(&V_ripcbinfo, "rip", &V_ripcb, INP_PCBHASH_RAW_SIZE,
- 1, "ripcb", rip_inpcb_init, NULL, UMA_ZONE_NOFREE,
- IPI_HASHFIELDS_NONE);
+ 1, "ripcb", rip_inpcb_init, NULL, 0, IPI_HASHFIELDS_NONE);
EVENTHANDLER_REGISTER(maxsockets_change, rip_zone_change, NULL,
EVENTHANDLER_PRI_ANY);
}
#ifdef VIMAGE
-void
-rip_destroy(void)
+static void
+rip_destroy(void *unused __unused)
{
in_pcbinfo_destroy(&V_ripcbinfo);
}
+VNET_SYSUNINIT(raw_ip, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, rip_destroy, NULL);
#endif
#ifdef INET
@@ -274,16 +278,18 @@ rip_append(struct inpcb *last, struct ip *ip, struct mbuf *n,
* Setup generic address and protocol structures for raw_input routine, then
* pass them along with mbuf chain.
*/
-void
-rip_input(struct mbuf *m, int off)
+int
+rip_input(struct mbuf **mp, int *offp, int proto)
{
struct ifnet *ifp;
+ struct mbuf *m = *mp;
struct ip *ip = mtod(m, struct ip *);
- int proto = ip->ip_p;
struct inpcb *inp, *last;
struct sockaddr_in ripsrc;
int hash;
+ *mp = NULL;
+
bzero(&ripsrc, sizeof(ripsrc));
ripsrc.sin_len = sizeof(ripsrc);
ripsrc.sin_family = AF_INET;
@@ -411,10 +417,15 @@ rip_input(struct mbuf *m, int off)
IPSTAT_INC(ips_delivered);
INP_RUNLOCK(last);
} else {
- m_freem(m);
- IPSTAT_INC(ips_noproto);
- IPSTAT_DEC(ips_delivered);
+ if (inetsw[ip_protox[ip->ip_p]].pr_input == rip_input) {
+ IPSTAT_INC(ips_noproto);
+ IPSTAT_DEC(ips_delivered);
+ icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PROTOCOL, 0, 0);
+ } else {
+ m_freem(m);
+ }
}
+ return (IPPROTO_DONE);
}
/*
@@ -422,14 +433,20 @@ rip_input(struct mbuf *m, int off)
* have setup with control call.
*/
int
-rip_output(struct mbuf *m, struct socket *so, u_long dst)
+rip_output(struct mbuf *m, struct socket *so, ...)
{
struct ip *ip;
int error;
struct inpcb *inp = sotoinpcb(so);
+ va_list ap;
+ u_long dst;
int flags = ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0) |
IP_ALLOWBROADCAST;
+ va_start(ap, so);
+ dst = va_arg(ap, u_long);
+ va_end(ap);
+
/*
* If the user handed us a complete IP packet, use it. Otherwise,
* allocate an mbuf for a header and fill it in.
@@ -439,7 +456,7 @@ rip_output(struct mbuf *m, struct socket *so, u_long dst)
m_freem(m);
return(EMSGSIZE);
}
- M_PREPEND(m, sizeof(struct ip), M_DONTWAIT);
+ M_PREPEND(m, sizeof(struct ip), M_NOWAIT);
if (m == NULL)
return(ENOBUFS);
@@ -447,32 +464,32 @@ rip_output(struct mbuf *m, struct socket *so, u_long dst)
ip = mtod(m, struct ip *);
ip->ip_tos = inp->inp_ip_tos;
if (inp->inp_flags & INP_DONTFRAG)
- ip->ip_off = IP_DF;
+ ip->ip_off = htons(IP_DF);
else
- ip->ip_off = 0;
+ ip->ip_off = htons(0);
ip->ip_p = inp->inp_ip_p;
- ip->ip_len = m->m_pkthdr.len;
+ ip->ip_len = htons(m->m_pkthdr.len);
ip->ip_src = inp->inp_laddr;
+ ip->ip_dst.s_addr = dst;
if (jailed(inp->inp_cred)) {
/*
* prison_local_ip4() would be good enough but would
* let a source of INADDR_ANY pass, which we do not
- * want to see from jails. We do not go through the
- * pain of in_pcbladdr() for raw sockets.
+ * want to see from jails.
*/
- if (ip->ip_src.s_addr == INADDR_ANY)
- error = prison_get_ip4(inp->inp_cred,
- &ip->ip_src);
- else
+ if (ip->ip_src.s_addr == INADDR_ANY) {
+ error = in_pcbladdr(inp, &ip->ip_dst, &ip->ip_src,
+ inp->inp_cred);
+ } else {
error = prison_local_ip4(inp->inp_cred,
&ip->ip_src);
+ }
if (error != 0) {
INP_RUNLOCK(inp);
m_freem(m);
return (error);
}
}
- ip->ip_dst.s_addr = dst;
ip->ip_ttl = inp->inp_ip_ttl;
} else {
if (m->m_pkthdr.len > IP_MAXPACKET) {
@@ -493,14 +510,18 @@ rip_output(struct mbuf *m, struct socket *so, u_long dst)
* and don't allow packet length sizes that will crash.
*/
if (((ip->ip_hl != (sizeof (*ip) >> 2)) && inp->inp_options)
- || (ip->ip_len > m->m_pkthdr.len)
- || (ip->ip_len < (ip->ip_hl << 2))) {
+ || (ntohs(ip->ip_len) > m->m_pkthdr.len)
+ || (ntohs(ip->ip_len) < (ip->ip_hl << 2))) {
INP_RUNLOCK(inp);
m_freem(m);
return (EINVAL);
}
+ /*
+ * This doesn't allow application to specify ID of zero,
+ * but we got this limitation from the beginning of history.
+ */
if (ip->ip_id == 0)
- ip->ip_id = ip_newid();
+ ip_fillid(ip);
/*
* XXX prevent ip_output from overwriting header fields.
@@ -539,6 +560,8 @@ rip_output(struct mbuf *m, struct socket *so, u_long dst)
*
* When adding new socket options here, make sure to add access control
* checks here as necessary.
+ *
+ * XXX-BZ inp locking?
*/
int
rip_ctloutput(struct socket *so, struct sockopt *sopt)
@@ -712,6 +735,7 @@ rip_ctloutput(struct socket *so, struct sockopt *sopt)
void
rip_ctlinput(int cmd, struct sockaddr *sa, void *vip)
{
+ struct rm_priotracker in_ifa_tracker;
struct in_ifaddr *ia;
struct ifnet *ifp;
int err;
@@ -719,16 +743,16 @@ rip_ctlinput(int cmd, struct sockaddr *sa, void *vip)
switch (cmd) {
case PRC_IFDOWN:
- IN_IFADDR_RLOCK();
+ IN_IFADDR_RLOCK(&in_ifa_tracker);
TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
if (ia->ia_ifa.ifa_addr == sa
&& (ia->ia_flags & IFA_ROUTE)) {
ifa_ref(&ia->ia_ifa);
- IN_IFADDR_RUNLOCK();
+ IN_IFADDR_RUNLOCK(&in_ifa_tracker);
/*
- * in_ifscrub kills the interface route.
+ * in_scrubprefix() kills the interface route.
*/
- in_ifscrub(ia->ia_ifp, ia, 0);
+ in_scrubprefix(ia, 0);
/*
* in_ifadown gets rid of all the rest of the
* routes. This is not quite the right thing
@@ -741,21 +765,21 @@ rip_ctlinput(int cmd, struct sockaddr *sa, void *vip)
}
}
if (ia == NULL) /* If ia matched, already unlocked. */
- IN_IFADDR_RUNLOCK();
+ IN_IFADDR_RUNLOCK(&in_ifa_tracker);
break;
case PRC_IFUP:
- IN_IFADDR_RLOCK();
+ IN_IFADDR_RLOCK(&in_ifa_tracker);
TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
if (ia->ia_ifa.ifa_addr == sa)
break;
}
if (ia == NULL || (ia->ia_flags & IFA_ROUTE)) {
- IN_IFADDR_RUNLOCK();
+ IN_IFADDR_RUNLOCK(&in_ifa_tracker);
return;
}
ifa_ref(&ia->ia_ifa);
- IN_IFADDR_RUNLOCK();
+ IN_IFADDR_RUNLOCK(&in_ifa_tracker);
flags = RTF_UP;
ifp = ia->ia_ifa.ifa_ifp;
@@ -764,16 +788,12 @@ rip_ctlinput(int cmd, struct sockaddr *sa, void *vip)
flags |= RTF_HOST;
err = ifa_del_loopback_route((struct ifaddr *)ia, sa);
- if (err == 0)
- ia->ia_flags &= ~IFA_RTSELF;
err = rtinit(&ia->ia_ifa, RTM_ADD, flags);
if (err == 0)
ia->ia_flags |= IFA_ROUTE;
err = ifa_add_loopback_route((struct ifaddr *)ia, sa);
- if (err == 0)
- ia->ia_flags |= IFA_RTSELF;
ifa_free(&ia->ia_ifa);
break;
@@ -1036,7 +1056,7 @@ rip_pcblist(SYSCTL_HANDLER_ARGS)
return (error);
inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
- if (inp_list == 0)
+ if (inp_list == NULL)
return (ENOMEM);
INP_INFO_RLOCK(&V_ripcbinfo);
diff --git a/freebsd/sys/netinet/sctp.h b/freebsd/sys/netinet/sctp.h
index 4c5c03dc..ec42cffa 100644
--- a/freebsd/sys/netinet/sctp.h
+++ b/freebsd/sys/netinet/sctp.h
@@ -121,6 +121,14 @@ struct sctp_paramhdr {
#define SCTP_DEFAULT_PRINFO 0x00000022
#define SCTP_PEER_ADDR_THLDS 0x00000023
#define SCTP_REMOTE_UDP_ENCAPS_PORT 0x00000024
+#define SCTP_ECN_SUPPORTED 0x00000025
+#define SCTP_PR_SUPPORTED 0x00000026
+#define SCTP_AUTH_SUPPORTED 0x00000027
+#define SCTP_ASCONF_SUPPORTED 0x00000028
+#define SCTP_RECONFIG_SUPPORTED 0x00000029
+#define SCTP_NRSACK_SUPPORTED 0x00000030
+#define SCTP_PKTDROP_SUPPORTED 0x00000031
+#define SCTP_MAX_CWND 0x00000032
/*
* read-only options
@@ -133,6 +141,8 @@ struct sctp_paramhdr {
#define SCTP_GET_ASSOC_NUMBER 0x00000104 /* ro */
#define SCTP_GET_ASSOC_ID_LIST 0x00000105 /* ro */
#define SCTP_TIMEOUTS 0x00000106
+#define SCTP_PR_STREAM_STATUS 0x00000107
+#define SCTP_PR_ASSOC_STATUS 0x00000108
/*
* user socket options: BSD implementation specific
@@ -186,6 +196,9 @@ struct sctp_paramhdr {
#define SCTP_SS_VALUE 0x00001204
#define SCTP_CC_OPTION 0x00001205 /* Options for CC
* modules */
+/* For I-DATA */
+#define SCTP_INTERLEAVING_SUPPORTED 0x00001206
+
/* read only */
#define SCTP_GET_SNDBUF_USE 0x00001101
#define SCTP_GET_STAT_LOG 0x00001103
@@ -378,33 +391,32 @@ struct sctp_error_cause {
} SCTP_PACKED;
struct sctp_error_invalid_stream {
- struct sctp_error_cause cause; /* code=SCTP_ERROR_INVALID_STREAM */
+ struct sctp_error_cause cause; /* code=SCTP_CAUSE_INVALID_STREAM */
uint16_t stream_id; /* stream id of the DATA in error */
uint16_t reserved;
} SCTP_PACKED;
struct sctp_error_missing_param {
- struct sctp_error_cause cause; /* code=SCTP_ERROR_MISSING_PARAM */
+ struct sctp_error_cause cause; /* code=SCTP_CAUSE_MISSING_PARAM */
uint32_t num_missing_params; /* number of missing parameters */
- /* uint16_t param_type's follow */
+ uint16_t type[];
} SCTP_PACKED;
struct sctp_error_stale_cookie {
- struct sctp_error_cause cause; /* code=SCTP_ERROR_STALE_COOKIE */
+ struct sctp_error_cause cause; /* code=SCTP_CAUSE_STALE_COOKIE */
uint32_t stale_time; /* time in usec of staleness */
} SCTP_PACKED;
struct sctp_error_out_of_resource {
- struct sctp_error_cause cause; /* code=SCTP_ERROR_OUT_OF_RESOURCES */
+ struct sctp_error_cause cause; /* code=SCTP_CAUSE_OUT_OF_RESOURCES */
} SCTP_PACKED;
struct sctp_error_unresolv_addr {
- struct sctp_error_cause cause; /* code=SCTP_ERROR_UNRESOLVABLE_ADDR */
-
+ struct sctp_error_cause cause; /* code=SCTP_CAUSE_UNRESOLVABLE_ADDR */
} SCTP_PACKED;
struct sctp_error_unrecognized_chunk {
- struct sctp_error_cause cause; /* code=SCTP_ERROR_UNRECOG_CHUNK */
+ struct sctp_error_cause cause; /* code=SCTP_CAUSE_UNRECOG_CHUNK */
struct sctp_chunkhdr ch;/* header from chunk in error */
} SCTP_PACKED;
@@ -413,6 +425,11 @@ struct sctp_error_no_user_data {
uint32_t tsn; /* TSN of the empty data chunk */
} SCTP_PACKED;
+struct sctp_error_auth_invalid_hmac {
+ struct sctp_error_cause cause; /* code=SCTP_CAUSE_UNSUPPORTED_HMACID */
+ uint16_t hmac_id;
+} SCTP_PACKED;
+
/*
* Main SCTP chunk types we place these here so natd and f/w's in user land
* can find them.
@@ -438,6 +455,7 @@ struct sctp_error_no_user_data {
/* EY nr_sack chunk id*/
#define SCTP_NR_SELECTIVE_ACK 0x10
/************0x40 series ***********/
+#define SCTP_IDATA 0x40
/************0x80 series ***********/
/* RFC5061 */
#define SCTP_ASCONF_ACK 0x80
@@ -453,7 +471,7 @@ struct sctp_error_no_user_data {
#define SCTP_FORWARD_CUM_TSN 0xc0
/* RFC5061 */
#define SCTP_ASCONF 0xc1
-
+#define SCTP_IFORWARD_CUM_TSN 0xc2
/* ABORT and SHUTDOWN COMPLETE FLAG */
#define SCTP_HAD_NO_TCB 0x01
diff --git a/freebsd/sys/netinet/sctp_asconf.c b/freebsd/sys/netinet/sctp_asconf.c
index 551f0690..4256ab51 100644
--- a/freebsd/sys/netinet/sctp_asconf.c
+++ b/freebsd/sys/netinet/sctp_asconf.c
@@ -82,7 +82,7 @@ sctp_asconf_success_response(uint32_t id)
struct sctp_asconf_paramhdr *aph;
m_reply = sctp_get_mbuf_for_msg(sizeof(struct sctp_asconf_paramhdr),
- 0, M_DONTWAIT, 1, MT_DATA);
+ 0, M_NOWAIT, 1, MT_DATA);
if (m_reply == NULL) {
SCTPDBG(SCTP_DEBUG_ASCONF1,
"asconf_success_response: couldn't get mbuf!\n");
@@ -110,7 +110,7 @@ sctp_asconf_error_response(uint32_t id, uint16_t cause, uint8_t * error_tlv,
m_reply = sctp_get_mbuf_for_msg((sizeof(struct sctp_asconf_paramhdr) +
tlv_length +
sizeof(struct sctp_error_cause)),
- 0, M_DONTWAIT, 1, MT_DATA);
+ 0, M_NOWAIT, 1, MT_DATA);
if (m_reply == NULL) {
SCTPDBG(SCTP_DEBUG_ASCONF1,
"asconf_error_response: couldn't get mbuf!\n");
@@ -150,7 +150,7 @@ sctp_process_asconf_add_ip(struct sockaddr *src, struct sctp_asconf_paramhdr *ap
{
struct sctp_nets *net;
struct mbuf *m_reply = NULL;
- struct sockaddr_storage sa_store;
+ union sctp_sockstore store;
struct sctp_paramhdr *ph;
uint16_t param_type, aparam_length;
@@ -179,7 +179,7 @@ sctp_process_asconf_add_ip(struct sockaddr *src, struct sctp_asconf_paramhdr *ap
#if defined(INET) || defined(INET6)
param_length = ntohs(ph->param_length);
#endif
- sa = (struct sockaddr *)&sa_store;
+ sa = &store.sa;
switch (param_type) {
#ifdef INET
case SCTP_IPV4_ADDRESS:
@@ -188,7 +188,7 @@ sctp_process_asconf_add_ip(struct sockaddr *src, struct sctp_asconf_paramhdr *ap
return (NULL);
}
v4addr = (struct sctp_ipv4addr_param *)ph;
- sin = (struct sockaddr_in *)&sa_store;
+ sin = &store.sin;
bzero(sin, sizeof(*sin));
sin->sin_family = AF_INET;
sin->sin_len = sizeof(struct sockaddr_in);
@@ -211,7 +211,7 @@ sctp_process_asconf_add_ip(struct sockaddr *src, struct sctp_asconf_paramhdr *ap
return (NULL);
}
v6addr = (struct sctp_ipv6addr_param *)ph;
- sin6 = (struct sockaddr_in6 *)&sa_store;
+ sin6 = &store.sin6;
bzero(sin6, sizeof(*sin6));
sin6->sin6_family = AF_INET6;
sin6->sin6_len = sizeof(struct sockaddr_in6);
@@ -246,7 +246,8 @@ sctp_process_asconf_add_ip(struct sockaddr *src, struct sctp_asconf_paramhdr *ap
m_reply = sctp_asconf_error_response(aph->correlation_id,
SCTP_CAUSE_INVALID_PARAM, (uint8_t *) aph,
aparam_length);
- } else if (sctp_add_remote_addr(stcb, sa, &net, SCTP_DONOT_SETSCOPE,
+ } else if (sctp_add_remote_addr(stcb, sa, &net, stcb->asoc.port,
+ SCTP_DONOT_SETSCOPE,
SCTP_ADDR_DYNAMIC_ADDED) != 0) {
SCTPDBG(SCTP_DEBUG_ASCONF1,
"process_asconf_add_ip: error adding address\n");
@@ -304,7 +305,7 @@ sctp_process_asconf_delete_ip(struct sockaddr *src,
struct sctp_tcb *stcb, int response_required)
{
struct mbuf *m_reply = NULL;
- struct sockaddr_storage sa_store;
+ union sctp_sockstore store;
struct sctp_paramhdr *ph;
uint16_t param_type, aparam_length;
@@ -333,7 +334,7 @@ sctp_process_asconf_delete_ip(struct sockaddr *src,
#if defined(INET) || defined(INET6)
param_length = ntohs(ph->param_length);
#endif
- sa = (struct sockaddr *)&sa_store;
+ sa = &store.sa;
switch (param_type) {
#ifdef INET
case SCTP_IPV4_ADDRESS:
@@ -342,7 +343,7 @@ sctp_process_asconf_delete_ip(struct sockaddr *src,
return (NULL);
}
v4addr = (struct sctp_ipv4addr_param *)ph;
- sin = (struct sockaddr_in *)&sa_store;
+ sin = &store.sin;
bzero(sin, sizeof(*sin));
sin->sin_family = AF_INET;
sin->sin_len = sizeof(struct sockaddr_in);
@@ -362,7 +363,7 @@ sctp_process_asconf_delete_ip(struct sockaddr *src,
return (NULL);
}
v6addr = (struct sctp_ipv6addr_param *)ph;
- sin6 = (struct sockaddr_in6 *)&sa_store;
+ sin6 = &store.sin6;
bzero(sin6, sizeof(*sin6));
sin6->sin6_family = AF_INET6;
sin6->sin6_len = sizeof(struct sockaddr_in6);
@@ -439,7 +440,7 @@ sctp_process_asconf_set_primary(struct sockaddr *src,
struct sctp_tcb *stcb, int response_required)
{
struct mbuf *m_reply = NULL;
- struct sockaddr_storage sa_store;
+ union sctp_sockstore store;
struct sctp_paramhdr *ph;
uint16_t param_type, aparam_length;
@@ -467,7 +468,7 @@ sctp_process_asconf_set_primary(struct sockaddr *src,
#if defined(INET) || defined(INET6)
param_length = ntohs(ph->param_length);
#endif
- sa = (struct sockaddr *)&sa_store;
+ sa = &store.sa;
switch (param_type) {
#ifdef INET
case SCTP_IPV4_ADDRESS:
@@ -476,7 +477,7 @@ sctp_process_asconf_set_primary(struct sockaddr *src,
return (NULL);
}
v4addr = (struct sctp_ipv4addr_param *)ph;
- sin = (struct sockaddr_in *)&sa_store;
+ sin = &store.sin;
bzero(sin, sizeof(*sin));
sin->sin_family = AF_INET;
sin->sin_len = sizeof(struct sockaddr_in);
@@ -494,7 +495,7 @@ sctp_process_asconf_set_primary(struct sockaddr *src,
return (NULL);
}
v6addr = (struct sctp_ipv6addr_param *)ph;
- sin6 = (struct sockaddr_in6 *)&sa_store;
+ sin6 = &store.sin6;
bzero(sin6, sizeof(*sin6));
sin6->sin6_family = AF_INET6;
sin6->sin6_len = sizeof(struct sockaddr_in6);
@@ -557,7 +558,9 @@ sctp_process_asconf_set_primary(struct sockaddr *src,
(stcb->asoc.primary_destination->dest_state &
SCTP_ADDR_UNCONFIRMED) == 0) {
- sctp_timer_stop(SCTP_TIMER_TYPE_PRIM_DELETED, stcb->sctp_ep, stcb, NULL, SCTP_FROM_SCTP_TIMER + SCTP_LOC_7);
+ sctp_timer_stop(SCTP_TIMER_TYPE_PRIM_DELETED,
+ stcb->sctp_ep, stcb, NULL,
+ SCTP_FROM_SCTP_ASCONF + SCTP_LOC_1);
if (sctp_is_mobility_feature_on(stcb->sctp_ep,
SCTP_MOBILITY_FASTHANDOFF)) {
sctp_assoc_immediate_retrans(stcb,
@@ -598,7 +601,7 @@ sctp_handle_asconf(struct mbuf *m, unsigned int offset,
uint32_t serial_num;
struct mbuf *n, *m_ack, *m_result, *m_tail;
struct sctp_asconf_ack_chunk *ack_cp;
- struct sctp_asconf_paramhdr *aph, *ack_aph;
+ struct sctp_asconf_paramhdr *aph;
struct sctp_ipv6addr_param *p_addr;
unsigned int asconf_limit, cnt;
int error = 0; /* did an error occur? */
@@ -653,7 +656,7 @@ sctp_handle_asconf(struct mbuf *m, unsigned int offset,
}
}
m_ack = sctp_get_mbuf_for_msg(sizeof(struct sctp_asconf_ack_chunk), 0,
- M_DONTWAIT, 1, MT_DATA);
+ M_NOWAIT, 1, MT_DATA);
if (m_ack == NULL) {
SCTPDBG(SCTP_DEBUG_ASCONF1,
"handle_asconf: couldn't get mbuf!\n");
@@ -681,13 +684,6 @@ sctp_handle_asconf(struct mbuf *m, unsigned int offset,
}
/* param_length is already validated in process_control... */
offset += ntohs(p_addr->ph.param_length); /* skip lookup addr */
-
- /* get pointer to first asconf param in ASCONF-ACK */
- ack_aph = (struct sctp_asconf_paramhdr *)(mtod(m_ack, caddr_t)+sizeof(struct sctp_asconf_ack_chunk));
- if (ack_aph == NULL) {
- SCTPDBG(SCTP_DEBUG_ASCONF1, "Gak in asconf2\n");
- return;
- }
/* get pointer to first asconf param in ASCONF */
aph = (struct sctp_asconf_paramhdr *)sctp_m_getptr(m, offset, sizeof(struct sctp_asconf_paramhdr), (uint8_t *) & aparam_buf);
if (aph == NULL) {
@@ -726,13 +722,11 @@ sctp_handle_asconf(struct mbuf *m, unsigned int offset,
}
switch (param_type) {
case SCTP_ADD_IP_ADDRESS:
- asoc->peer_supports_asconf = 1;
m_result = sctp_process_asconf_add_ip(src, aph, stcb,
(cnt < SCTP_BASE_SYSCTL(sctp_hb_maxburst)), error);
cnt++;
break;
case SCTP_DEL_IP_ADDRESS:
- asoc->peer_supports_asconf = 1;
m_result = sctp_process_asconf_delete_ip(src, aph, stcb,
error);
break;
@@ -740,7 +734,6 @@ sctp_handle_asconf(struct mbuf *m, unsigned int offset,
/* not valid in an ASCONF chunk */
break;
case SCTP_SET_PRIM_ADDR:
- asoc->peer_supports_asconf = 1;
m_result = sctp_process_asconf_set_primary(src, aph,
stcb, error);
break;
@@ -932,8 +925,6 @@ sctp_addr_match(struct sctp_paramhdr *ph, struct sockaddr *sa)
void
sctp_asconf_cleanup(struct sctp_tcb *stcb, struct sctp_nets *net)
{
- /* mark peer as ASCONF incapable */
- stcb->asoc.peer_supports_asconf = 0;
/*
* clear out any existing asconfs going out
*/
@@ -1005,7 +996,7 @@ sctp_assoc_immediate_retrans(struct sctp_tcb *stcb, struct sctp_nets *dstnet)
SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, &stcb->asoc.primary_destination->ro._l_addr.sa);
sctp_timer_stop(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb,
stcb->asoc.deleted_primary,
- SCTP_FROM_SCTP_TIMER + SCTP_LOC_8);
+ SCTP_FROM_SCTP_ASCONF + SCTP_LOC_3);
stcb->asoc.num_send_timers_up--;
if (stcb->asoc.num_send_timers_up < 0) {
stcb->asoc.num_send_timers_up = 0;
@@ -1044,7 +1035,7 @@ sctp_net_immediate_retrans(struct sctp_tcb *stcb, struct sctp_nets *net)
SCTPDBG(SCTP_DEBUG_ASCONF1, "net_immediate_retrans: RTO is %d\n", net->RTO);
sctp_timer_stop(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb, net,
- SCTP_FROM_SCTP_TIMER + SCTP_LOC_5);
+ SCTP_FROM_SCTP_ASCONF + SCTP_LOC_4);
stcb->asoc.cc_functions.sctp_set_initial_cc_param(stcb, net);
net->error_count = 0;
TAILQ_FOREACH(chk, &stcb->asoc.sent_queue, sctp_next) {
@@ -1121,7 +1112,8 @@ sctp_path_check_and_react(struct sctp_tcb *stcb, struct sctp_ifa *newifa)
* not be changed.
*/
SCTP_RTALLOC((sctp_route_t *) & net->ro,
- stcb->sctp_ep->def_vrf_id);
+ stcb->sctp_ep->def_vrf_id,
+ stcb->sctp_ep->fibnum);
if (net->ro.ro_rt == NULL)
continue;
@@ -1275,7 +1267,7 @@ sctp_asconf_queue_mgmt(struct sctp_tcb *stcb, struct sctp_ifa *ifa,
{
struct sockaddr_in6 *sin6;
- sin6 = (struct sockaddr_in6 *)&ifa->address.sa;
+ sin6 = &ifa->address.sin6;
aa->ap.addrp.ph.param_type = SCTP_IPV6_ADDRESS;
aa->ap.addrp.ph.param_length = (sizeof(struct sctp_ipv6addr_param));
aa->ap.aph.ph.param_length = sizeof(struct sctp_asconf_paramhdr) +
@@ -1290,7 +1282,7 @@ sctp_asconf_queue_mgmt(struct sctp_tcb *stcb, struct sctp_ifa *ifa,
{
struct sockaddr_in *sin;
- sin = (struct sockaddr_in *)&ifa->address.sa;
+ sin = &ifa->address.sin;
aa->ap.addrp.ph.param_type = SCTP_IPV4_ADDRESS;
aa->ap.addrp.ph.param_length = (sizeof(struct sctp_ipv4addr_param));
aa->ap.aph.ph.param_length = sizeof(struct sctp_asconf_paramhdr) +
@@ -1340,24 +1332,31 @@ sctp_asconf_queue_add(struct sctp_tcb *stcb, struct sctp_ifa *ifa,
{
uint32_t status;
int pending_delete_queued = 0;
+ int last;
/* see if peer supports ASCONF */
- if (stcb->asoc.peer_supports_asconf == 0) {
+ if (stcb->asoc.asconf_supported == 0) {
return (-1);
}
/*
* if this is deleting the last address from the assoc, mark it as
* pending.
*/
- if ((type == SCTP_DEL_IP_ADDRESS) && !stcb->asoc.asconf_del_pending &&
- (sctp_local_addr_count(stcb) < 2)) {
- /* set the pending delete info only */
- stcb->asoc.asconf_del_pending = 1;
- stcb->asoc.asconf_addr_del_pending = ifa;
- atomic_add_int(&ifa->refcount, 1);
- SCTPDBG(SCTP_DEBUG_ASCONF2,
- "asconf_queue_add: mark delete last address pending\n");
- return (-1);
+ if ((type == SCTP_DEL_IP_ADDRESS) && !stcb->asoc.asconf_del_pending) {
+ if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) {
+ last = (sctp_local_addr_count(stcb) == 0);
+ } else {
+ last = (sctp_local_addr_count(stcb) == 1);
+ }
+ if (last) {
+ /* set the pending delete info only */
+ stcb->asoc.asconf_del_pending = 1;
+ stcb->asoc.asconf_addr_del_pending = ifa;
+ atomic_add_int(&ifa->refcount, 1);
+ SCTPDBG(SCTP_DEBUG_ASCONF2,
+ "asconf_queue_add: mark delete last address pending\n");
+ return (-1);
+ }
}
/* queue an asconf parameter */
status = sctp_asconf_queue_mgmt(stcb, ifa, type);
@@ -1426,13 +1425,12 @@ sctp_asconf_queue_sa_delete(struct sctp_tcb *stcb, struct sockaddr *sa)
{
struct sctp_ifa *ifa;
struct sctp_asconf_addr *aa, *aa_next;
- uint32_t vrf_id;
if (stcb == NULL) {
return (-1);
}
/* see if peer supports ASCONF */
- if (stcb->asoc.peer_supports_asconf == 0) {
+ if (stcb->asoc.asconf_supported == 0) {
return (-1);
}
/* make sure the request isn't already in the queue */
@@ -1458,12 +1456,7 @@ sctp_asconf_queue_sa_delete(struct sctp_tcb *stcb, struct sockaddr *sa)
} /* for each aa */
/* find any existing ifa-- NOTE ifa CAN be allowed to be NULL */
- if (stcb) {
- vrf_id = stcb->asoc.vrf_id;
- } else {
- vrf_id = SCTP_DEFAULT_VRFID;
- }
- ifa = sctp_find_ifa_by_addr(sa, vrf_id, SCTP_ADDR_NOT_LOCKED);
+ ifa = sctp_find_ifa_by_addr(sa, stcb->asoc.vrf_id, SCTP_ADDR_NOT_LOCKED);
/* adding new request to the queue */
SCTP_MALLOC(aa, struct sctp_asconf_addr *, sizeof(*aa),
@@ -1552,7 +1545,7 @@ sctp_asconf_find_param(struct sctp_tcb *stcb, uint32_t correlation_id)
* notifications based on the error response
*/
static void
-sctp_asconf_process_error(struct sctp_tcb *stcb,
+sctp_asconf_process_error(struct sctp_tcb *stcb SCTP_UNUSED,
struct sctp_asconf_paramhdr *aph)
{
struct sctp_error_cause *eh;
@@ -1590,10 +1583,7 @@ sctp_asconf_process_error(struct sctp_tcb *stcb,
switch (param_type) {
case SCTP_ADD_IP_ADDRESS:
case SCTP_DEL_IP_ADDRESS:
- stcb->asoc.peer_supports_asconf = 0;
- break;
case SCTP_SET_PRIM_ADDR:
- stcb->asoc.peer_supports_asconf = 0;
break;
default:
break;
@@ -1629,8 +1619,6 @@ sctp_asconf_process_param_ack(struct sctp_tcb *stcb,
SCTPDBG(SCTP_DEBUG_ASCONF1,
"process_param_ack: set primary IP address\n");
/* nothing to do... peer may start using this addr */
- if (flag == 0)
- stcb->asoc.peer_supports_asconf = 0;
break;
default:
/* should NEVER happen */
@@ -1648,11 +1636,11 @@ sctp_asconf_process_param_ack(struct sctp_tcb *stcb,
* cleanup from a bad asconf ack parameter
*/
static void
-sctp_asconf_ack_clear(struct sctp_tcb *stcb)
+sctp_asconf_ack_clear(struct sctp_tcb *stcb SCTP_UNUSED)
{
/* assume peer doesn't really know how to do asconfs */
- stcb->asoc.peer_supports_asconf = 0;
/* XXX we could free the pending queue here */
+
}
void
@@ -1695,8 +1683,14 @@ sctp_handle_asconf_ack(struct mbuf *m, int offset,
* abort the asoc, since someone probably just hijacked us...
*/
if (serial_num == (asoc->asconf_seq_out + 1)) {
+ struct mbuf *op_err;
+ char msg[SCTP_DIAG_INFO_LEN];
+
SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf_ack: got unexpected next serial number! Aborting asoc!\n");
- sctp_abort_an_association(stcb->sctp_ep, stcb, NULL, SCTP_SO_NOT_LOCKED);
+ snprintf(msg, sizeof(msg), "Never sent serial number %8.8x",
+ serial_num);
+ op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg);
+ sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED);
*abort_no_unlock = 1;
return;
}
@@ -1709,7 +1703,7 @@ sctp_handle_asconf_ack(struct mbuf *m, int offset,
if (serial_num == asoc->asconf_seq_out - 1) {
/* stop our timer */
sctp_timer_stop(SCTP_TIMER_TYPE_ASCONF, stcb->sctp_ep, stcb, net,
- SCTP_FROM_SCTP_ASCONF + SCTP_LOC_3);
+ SCTP_FROM_SCTP_ASCONF + SCTP_LOC_5);
}
/* process the ASCONF-ACK contents */
ack_length = ntohs(cp->ch.chunk_length) -
@@ -1937,7 +1931,7 @@ sctp_addr_mgmt_assoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
{
struct sockaddr_in6 *sin6;
- sin6 = (struct sockaddr_in6 *)&ifa->address.sin6;
+ sin6 = &ifa->address.sin6;
if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
/* we skip unspecifed addresses */
return;
@@ -1970,7 +1964,7 @@ sctp_addr_mgmt_assoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
SCTP_IPV6_V6ONLY(inp6))
return;
- sin = (struct sockaddr_in *)&ifa->address.sa;
+ sin = &ifa->address.sin;
if (sin->sin_addr.s_addr == 0) {
/* we skip unspecifed addresses */
return;
@@ -1990,7 +1984,7 @@ sctp_addr_mgmt_assoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
/* queue an asconf for this address add/delete */
if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_DO_ASCONF)) {
/* does the peer do asconf? */
- if (stcb->asoc.peer_supports_asconf) {
+ if (stcb->asoc.asconf_supported) {
/* queue an asconf for this addr */
status = sctp_asconf_queue_add(stcb, ifa, type);
@@ -2000,7 +1994,8 @@ sctp_addr_mgmt_assoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
* sent when the state goes open.
*/
if (status == 0 &&
- SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) {
+ ((SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) ||
+ (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED))) {
#ifdef SCTP_TIMER_BASED_ASCONF
sctp_timer_start(SCTP_TIMER_TYPE_ASCONF, inp,
stcb, stcb->asoc.primary_destination);
@@ -2127,7 +2122,7 @@ sctp_asconf_iterator_stcb(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
else
continue;
}
- sin6 = (struct sockaddr_in6 *)&ifa->address.sin6;
+ sin6 = &ifa->address.sin6;
if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
/* we skip unspecifed addresses */
continue;
@@ -2161,7 +2156,7 @@ sctp_asconf_iterator_stcb(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
SCTP_IPV6_V6ONLY(inp6))
continue;
- sin = (struct sockaddr_in *)&ifa->address.sa;
+ sin = &ifa->address.sin;
if (sin->sin_addr.s_addr == 0) {
/* we skip unspecifed addresses */
continue;
@@ -2240,7 +2235,7 @@ sctp_asconf_iterator_stcb(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
}
/* queue an asconf for this address add/delete */
if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_DO_ASCONF) &&
- stcb->asoc.peer_supports_asconf) {
+ stcb->asoc.asconf_supported == 1) {
/* queue an asconf for this addr */
status = sctp_asconf_queue_add(stcb, ifa, type);
/*
@@ -2248,7 +2243,8 @@ sctp_asconf_iterator_stcb(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
* count of queued params. If in the non-open
* state, these get sent when the assoc goes open.
*/
- if (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) {
+ if ((SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) ||
+ (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) {
if (status >= 0) {
num_queued++;
}
@@ -2308,7 +2304,8 @@ sctp_set_primary_ip_address_sa(struct sctp_tcb *stcb, struct sockaddr *sa)
"set_primary_ip_address_sa: queued on tcb=%p, ",
(void *)stcb);
SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, sa);
- if (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) {
+ if ((SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) ||
+ (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) {
#ifdef SCTP_TIMER_BASED_ASCONF
sctp_timer_start(SCTP_TIMER_TYPE_ASCONF,
stcb->sctp_ep, stcb,
@@ -2344,7 +2341,8 @@ sctp_set_primary_ip_address(struct sctp_ifa *ifa)
SCTPDBG(SCTP_DEBUG_ASCONF1, "set_primary_ip_address: queued on stcb=%p, ",
(void *)stcb);
SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, &ifa->address.sa);
- if (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) {
+ if ((SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) ||
+ (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) {
#ifdef SCTP_TIMER_BASED_ASCONF
sctp_timer_start(SCTP_TIMER_TYPE_ASCONF,
stcb->sctp_ep, stcb,
@@ -2478,7 +2476,7 @@ sctp_find_valid_localaddr(struct sctp_tcb *stcb, int addr_locked)
if (stcb->asoc.scope.ipv4_addr_legal) {
struct sockaddr_in *sin;
- sin = (struct sockaddr_in *)&sctp_ifa->address.sa;
+ sin = &sctp_ifa->address.sin;
if (sin->sin_addr.s_addr == 0) {
/* skip unspecifed addresses */
continue;
@@ -2512,7 +2510,7 @@ sctp_find_valid_localaddr(struct sctp_tcb *stcb, int addr_locked)
if (sctp_ifa->localifa_flags & SCTP_ADDR_IFA_UNUSEABLE) {
continue;
}
- sin6 = (struct sockaddr_in6 *)&sctp_ifa->address.sa;
+ sin6 = &sctp_ifa->address.sin6;
if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
/*
* we skip unspecifed
@@ -2606,14 +2604,14 @@ sctp_compose_asconf(struct sctp_tcb *stcb, int *retlen, int addr_locked)
* it's simpler to fill in the asconf chunk header lookup address on
* the fly
*/
- m_asconf_chk = sctp_get_mbuf_for_msg(sizeof(struct sctp_asconf_chunk), 0, M_DONTWAIT, 1, MT_DATA);
+ m_asconf_chk = sctp_get_mbuf_for_msg(sizeof(struct sctp_asconf_chunk), 0, M_NOWAIT, 1, MT_DATA);
if (m_asconf_chk == NULL) {
/* no mbuf's */
SCTPDBG(SCTP_DEBUG_ASCONF1,
"compose_asconf: couldn't get chunk mbuf!\n");
return (NULL);
}
- m_asconf = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_DONTWAIT, 1, MT_DATA);
+ m_asconf = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_DATA);
if (m_asconf == NULL) {
/* no mbuf's */
SCTPDBG(SCTP_DEBUG_ASCONF1,
@@ -2784,19 +2782,16 @@ sctp_process_initack_addresses(struct sctp_tcb *stcb, struct mbuf *m,
struct sctp_paramhdr tmp_param, *ph;
uint16_t plen, ptype;
struct sctp_ifa *sctp_ifa;
+ union sctp_sockstore store;
#ifdef INET6
struct sctp_ipv6addr_param addr6_store;
- struct sockaddr_in6 sin6;
#endif
#ifdef INET
struct sctp_ipv4addr_param addr4_store;
- struct sockaddr_in sin;
#endif
- struct sockaddr *sa;
- uint32_t vrf_id;
SCTPDBG(SCTP_DEBUG_ASCONF2, "processing init-ack addresses\n");
if (stcb == NULL) /* Un-needed check for SA */
@@ -2808,21 +2803,6 @@ sctp_process_initack_addresses(struct sctp_tcb *stcb, struct mbuf *m,
if ((offset + sizeof(struct sctp_paramhdr)) > length) {
return;
}
- /* init the addresses */
-#ifdef INET6
- bzero(&sin6, sizeof(sin6));
- sin6.sin6_family = AF_INET6;
- sin6.sin6_len = sizeof(sin6);
- sin6.sin6_port = stcb->rport;
-#endif
-
-#ifdef INET
- bzero(&sin, sizeof(sin));
- sin.sin_family = AF_INET;
- sin.sin_len = sizeof(sin);
- sin.sin_port = stcb->rport;
-#endif
-
/* go through the addresses in the init-ack */
ph = (struct sctp_paramhdr *)
sctp_m_getptr(m, offset, sizeof(struct sctp_paramhdr),
@@ -2845,9 +2825,11 @@ sctp_process_initack_addresses(struct sctp_tcb *stcb, struct mbuf *m,
a6p == NULL) {
return;
}
- memcpy(&sin6.sin6_addr, a6p->addr,
- sizeof(struct in6_addr));
- sa = (struct sockaddr *)&sin6;
+ memset(&store, 0, sizeof(union sctp_sockstore));
+ store.sin6.sin6_family = AF_INET6;
+ store.sin6.sin6_len = sizeof(struct sockaddr_in6);
+ store.sin6.sin6_port = stcb->rport;
+ memcpy(&store.sin6.sin6_addr, a6p->addr, sizeof(struct in6_addr));
break;
}
#endif
@@ -2864,8 +2846,11 @@ sctp_process_initack_addresses(struct sctp_tcb *stcb, struct mbuf *m,
a4p == NULL) {
return;
}
- sin.sin_addr.s_addr = a4p->addr;
- sa = (struct sockaddr *)&sin;
+ memset(&store, 0, sizeof(union sctp_sockstore));
+ store.sin.sin_family = AF_INET;
+ store.sin.sin_len = sizeof(struct sockaddr_in);
+ store.sin.sin_port = stcb->rport;
+ store.sin.sin_addr.s_addr = a4p->addr;
break;
}
#endif
@@ -2874,12 +2859,7 @@ sctp_process_initack_addresses(struct sctp_tcb *stcb, struct mbuf *m,
}
/* see if this address really (still) exists */
- if (stcb) {
- vrf_id = stcb->asoc.vrf_id;
- } else {
- vrf_id = SCTP_DEFAULT_VRFID;
- }
- sctp_ifa = sctp_find_ifa_by_addr(sa, vrf_id,
+ sctp_ifa = sctp_find_ifa_by_addr(&store.sa, stcb->asoc.vrf_id,
SCTP_ADDR_NOT_LOCKED);
if (sctp_ifa == NULL) {
/* address doesn't exist anymore */
@@ -2888,9 +2868,9 @@ sctp_process_initack_addresses(struct sctp_tcb *stcb, struct mbuf *m,
/* are ASCONFs allowed ? */
if ((sctp_is_feature_on(stcb->sctp_ep,
SCTP_PCB_FLAGS_DO_ASCONF)) &&
- stcb->asoc.peer_supports_asconf) {
+ stcb->asoc.asconf_supported) {
/* queue an ASCONF DEL_IP_ADDRESS */
- status = sctp_asconf_queue_sa_delete(stcb, sa);
+ status = sctp_asconf_queue_sa_delete(stcb, &store.sa);
/*
* if queued ok, and in correct state, send
* out the ASCONF.
@@ -3137,7 +3117,7 @@ sctp_check_address_list_all(struct sctp_tcb *stcb, struct mbuf *m, int offset,
switch (sctp_ifa->address.sa.sa_family) {
#ifdef INET
case AF_INET:
- sin = (struct sockaddr_in *)&sctp_ifa->address.sin;
+ sin = &sctp_ifa->address.sin;
if (prison_check_ip4(stcb->sctp_ep->ip_inp.inp.inp_cred,
&sin->sin_addr) != 0) {
continue;
@@ -3151,7 +3131,7 @@ sctp_check_address_list_all(struct sctp_tcb *stcb, struct mbuf *m, int offset,
#endif
#ifdef INET6
case AF_INET6:
- sin6 = (struct sockaddr_in6 *)&sctp_ifa->address.sin6;
+ sin6 = &sctp_ifa->address.sin6;
if (prison_check_ip6(stcb->sctp_ep->ip_inp.inp.inp_cred,
&sin6->sin6_addr) != 0) {
continue;
@@ -3271,6 +3251,7 @@ sctp_addr_mgmt_ep_sa(struct sctp_inpcb *inp, struct sockaddr *sa,
} else {
struct sctp_asconf_iterator *asc;
struct sctp_laddr *wi;
+ int ret;
SCTP_MALLOC(asc, struct sctp_asconf_iterator *,
sizeof(struct sctp_asconf_iterator),
@@ -3292,7 +3273,7 @@ sctp_addr_mgmt_ep_sa(struct sctp_inpcb *inp, struct sockaddr *sa,
wi->action = type;
atomic_add_int(&ifa->refcount, 1);
LIST_INSERT_HEAD(&asc->list_of_work, wi, sctp_nxt_addr);
- (void)sctp_initiate_iterator(sctp_asconf_iterator_ep,
+ ret = sctp_initiate_iterator(sctp_asconf_iterator_ep,
sctp_asconf_iterator_stcb,
sctp_asconf_iterator_ep_end,
SCTP_PCB_ANY_FLAGS,
@@ -3300,6 +3281,12 @@ sctp_addr_mgmt_ep_sa(struct sctp_inpcb *inp, struct sockaddr *sa,
SCTP_ASOC_ANY_STATE,
(void *)asc, 0,
sctp_asconf_iterator_end, inp, 0);
+ if (ret) {
+ SCTP_PRINTF("Failed to initiate iterator for addr_mgmt_ep_sa\n");
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_ASCONF, EFAULT);
+ sctp_asconf_iterator_end(asc, 0);
+ return (EFAULT);
+ }
}
return (0);
} else {
@@ -3389,6 +3376,11 @@ sctp_asconf_send_nat_state_update(struct sctp_tcb *stcb,
TAILQ_INSERT_TAIL(&stcb->asoc.asconf_queue, aa, next);
break;
#endif
+ default:
+ SCTPDBG(SCTP_DEBUG_ASCONF1,
+ "sctp_asconf_send_nat_state_update: unknown address family\n");
+ SCTP_FREE(aa, SCTP_M_ASC_ADDR);
+ return;
}
SCTP_MALLOC(aa, struct sctp_asconf_addr *, sizeof(*aa),
SCTP_M_ASC_ADDR);
@@ -3422,6 +3414,11 @@ sctp_asconf_send_nat_state_update(struct sctp_tcb *stcb,
TAILQ_INSERT_TAIL(&stcb->asoc.asconf_queue, aa, next);
break;
#endif
+ default:
+ SCTPDBG(SCTP_DEBUG_ASCONF1,
+ "sctp_asconf_send_nat_state_update: unknown address family\n");
+ SCTP_FREE(aa, SCTP_M_ASC_ADDR);
+ return;
}
/* Now we must hunt the addresses and add all global addresses */
if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) {
diff --git a/freebsd/sys/netinet/sctp_auth.c b/freebsd/sys/netinet/sctp_auth.c
index fc649032..19e30718 100644
--- a/freebsd/sys/netinet/sctp_auth.c
+++ b/freebsd/sys/netinet/sctp_auth.c
@@ -135,11 +135,6 @@ sctp_auth_delete_chunk(uint8_t chunk, sctp_auth_chklist_t * list)
if (list == NULL)
return (-1);
- /* is chunk restricted? */
- if ((chunk == SCTP_ASCONF) ||
- (chunk == SCTP_ASCONF_ACK)) {
- return (-1);
- }
if (list->chunks[chunk] == 1) {
list->chunks[chunk] = 0;
list->num_chunks--;
@@ -160,16 +155,6 @@ sctp_auth_get_chklist_size(const sctp_auth_chklist_t * list)
}
/*
- * set the default list of chunks requiring AUTH
- */
-void
-sctp_auth_set_default_chunks(sctp_auth_chklist_t * list)
-{
- (void)sctp_auth_add_chunk(SCTP_ASCONF, list);
- (void)sctp_auth_add_chunk(SCTP_ASCONF_ACK, list);
-}
-
-/*
* return the current number and list of required chunks caller must
* guarantee ptr has space for up to 256 bytes
*/
@@ -559,7 +544,7 @@ sctp_insert_sharedkey(struct sctp_keyhead *shared_keys,
}
}
/* shouldn't reach here */
- return (0);
+ return (EINVAL);
}
void
@@ -575,7 +560,7 @@ sctp_auth_key_acquire(struct sctp_tcb *stcb, uint16_t key_id)
atomic_add_int(&skey->refcount, 1);
SCTPDBG(SCTP_DEBUG_AUTH2,
"%s: stcb %p key %u refcount acquire to %d\n",
- __FUNCTION__, (void *)stcb, key_id, skey->refcount);
+ __func__, (void *)stcb, key_id, skey->refcount);
}
}
@@ -593,20 +578,20 @@ sctp_auth_key_release(struct sctp_tcb *stcb, uint16_t key_id, int so_locked
/* decrement the ref count */
if (skey) {
- sctp_free_sharedkey(skey);
SCTPDBG(SCTP_DEBUG_AUTH2,
"%s: stcb %p key %u refcount release to %d\n",
- __FUNCTION__, (void *)stcb, key_id, skey->refcount);
+ __func__, (void *)stcb, key_id, skey->refcount);
/* see if a notification should be generated */
- if ((skey->refcount <= 1) && (skey->deactivated)) {
+ if ((skey->refcount <= 2) && (skey->deactivated)) {
/* notify ULP that key is no longer used */
sctp_ulp_notify(SCTP_NOTIFY_AUTH_FREE_KEY, stcb,
key_id, 0, so_locked);
SCTPDBG(SCTP_DEBUG_AUTH2,
"%s: stcb %p key %u no longer used, %d\n",
- __FUNCTION__, (void *)stcb, key_id, skey->refcount);
+ __func__, (void *)stcb, key_id, skey->refcount);
}
+ sctp_free_sharedkey(skey);
}
}
@@ -639,8 +624,11 @@ sctp_copy_skeylist(const struct sctp_keyhead *src, struct sctp_keyhead *dest)
LIST_FOREACH(skey, src, next) {
new_skey = sctp_copy_sharedkey(skey);
if (new_skey != NULL) {
- (void)sctp_insert_sharedkey(dest, new_skey);
- count++;
+ if (sctp_insert_sharedkey(dest, new_skey)) {
+ sctp_free_sharedkey(new_skey);
+ } else {
+ count++;
+ }
}
}
return (count);
@@ -648,7 +636,7 @@ sctp_copy_skeylist(const struct sctp_keyhead *src, struct sctp_keyhead *dest)
sctp_hmaclist_t *
-sctp_alloc_hmaclist(uint8_t num_hmacs)
+sctp_alloc_hmaclist(uint16_t num_hmacs)
{
sctp_hmaclist_t *new_list;
int alloc_size;
@@ -1455,8 +1443,8 @@ sctp_auth_get_cookie_params(struct sctp_tcb *stcb, struct mbuf *m,
p_random = (struct sctp_auth_random *)phdr;
random_len = plen - sizeof(*p_random);
} else if (ptype == SCTP_HMAC_LIST) {
- int num_hmacs;
- int i;
+ uint16_t num_hmacs;
+ uint16_t i;
if (plen > sizeof(hmacs_store))
break;
@@ -1668,8 +1656,8 @@ sctp_handle_auth(struct sctp_tcb *stcb, struct sctp_auth_chunk *auth,
/* is the indicated HMAC supported? */
if (!sctp_auth_is_supported_hmac(stcb->asoc.local_hmacs, hmac_id)) {
- struct mbuf *m_err;
- struct sctp_auth_invalid_hmac *err;
+ struct mbuf *op_err;
+ struct sctp_error_auth_invalid_hmac *cause;
SCTP_STAT_INCR(sctps_recvivalhmacid);
SCTPDBG(SCTP_DEBUG_AUTH1,
@@ -1679,20 +1667,19 @@ sctp_handle_auth(struct sctp_tcb *stcb, struct sctp_auth_chunk *auth,
* report this in an Error Chunk: Unsupported HMAC
* Identifier
*/
- m_err = sctp_get_mbuf_for_msg(sizeof(*err), 0, M_DONTWAIT,
- 1, MT_HEADER);
- if (m_err != NULL) {
+ op_err = sctp_get_mbuf_for_msg(sizeof(struct sctp_error_auth_invalid_hmac),
+ 0, M_NOWAIT, 1, MT_HEADER);
+ if (op_err != NULL) {
/* pre-reserve some space */
- SCTP_BUF_RESV_UF(m_err, sizeof(struct sctp_chunkhdr));
+ SCTP_BUF_RESV_UF(op_err, sizeof(struct sctp_chunkhdr));
/* fill in the error */
- err = mtod(m_err, struct sctp_auth_invalid_hmac *);
- bzero(err, sizeof(*err));
- err->ph.param_type = htons(SCTP_CAUSE_UNSUPPORTED_HMACID);
- err->ph.param_length = htons(sizeof(*err));
- err->hmac_id = ntohs(hmac_id);
- SCTP_BUF_LEN(m_err) = sizeof(*err);
+ cause = mtod(op_err, struct sctp_error_auth_invalid_hmac *);
+ cause->cause.code = htons(SCTP_CAUSE_UNSUPPORTED_HMACID);
+ cause->cause.length = htons(sizeof(struct sctp_error_auth_invalid_hmac));
+ cause->hmac_id = ntohs(hmac_id);
+ SCTP_BUF_LEN(op_err) = sizeof(struct sctp_error_auth_invalid_hmac);
/* queue it */
- sctp_queue_op_err(stcb, m_err);
+ sctp_queue_op_err(stcb, op_err);
}
return (-1);
}
@@ -1785,7 +1772,7 @@ sctp_notify_authentication(struct sctp_tcb *stcb, uint32_t indication,
return;
m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_authkey_event),
- 0, M_DONTWAIT, 1, MT_HEADER);
+ 0, M_NOWAIT, 1, MT_HEADER);
if (m_notify == NULL)
/* no space left */
return;
@@ -1951,8 +1938,7 @@ sctp_validate_init_auth_params(struct mbuf *m, int offset, int limit)
"SCTP: peer sent chunk list w/o AUTH\n");
return (-1);
}
- if (!SCTP_BASE_SYSCTL(sctp_asconf_auth_nochk) && peer_supports_asconf &&
- !peer_supports_auth) {
+ if (peer_supports_asconf && !peer_supports_auth) {
SCTPDBG(SCTP_DEBUG_AUTH1,
"SCTP: peer supports ASCONF but not AUTH\n");
return (-1);
diff --git a/freebsd/sys/netinet/sctp_auth.h b/freebsd/sys/netinet/sctp_auth.h
index 535c0fc0..b98764e2 100644
--- a/freebsd/sys/netinet/sctp_auth.h
+++ b/freebsd/sys/netinet/sctp_auth.h
@@ -112,7 +112,6 @@ extern sctp_auth_chklist_t *sctp_copy_chunklist(sctp_auth_chklist_t * chklist);
extern int sctp_auth_add_chunk(uint8_t chunk, sctp_auth_chklist_t * list);
extern int sctp_auth_delete_chunk(uint8_t chunk, sctp_auth_chklist_t * list);
extern size_t sctp_auth_get_chklist_size(const sctp_auth_chklist_t * list);
-extern void sctp_auth_set_default_chunks(sctp_auth_chklist_t * list);
extern int
sctp_serialize_auth_chunks(const sctp_auth_chklist_t * list,
uint8_t * ptr);
@@ -155,7 +154,7 @@ sctp_auth_key_release(struct sctp_tcb *stcb, uint16_t keyid,
/* hmac list handling */
-extern sctp_hmaclist_t *sctp_alloc_hmaclist(uint8_t num_hmacs);
+extern sctp_hmaclist_t *sctp_alloc_hmaclist(uint16_t num_hmacs);
extern void sctp_free_hmaclist(sctp_hmaclist_t * list);
extern int sctp_auth_add_hmacid(sctp_hmaclist_t * list, uint16_t hmac_id);
extern sctp_hmaclist_t *sctp_copy_hmaclist(sctp_hmaclist_t * list);
diff --git a/freebsd/sys/netinet/sctp_bsd_addr.c b/freebsd/sys/netinet/sctp_bsd_addr.c
index d558bd82..bfd7f816 100644
--- a/freebsd/sys/netinet/sctp_bsd_addr.c
+++ b/freebsd/sys/netinet/sctp_bsd_addr.c
@@ -295,9 +295,12 @@ sctp_addr_change(struct ifaddr *ifa, int cmd)
{
uint32_t ifa_flags = 0;
+ if (SCTP_BASE_VAR(sctp_pcb_initialized) == 0) {
+ return;
+ }
/*
* BSD only has one VRF, if this changes we will need to hook in the
- * right things here to get the id to pass to the address managment
+ * right things here to get the id to pass to the address management
* routine.
*/
if (SCTP_BASE_VAR(first_time) == 0) {
@@ -383,17 +386,7 @@ sctp_get_mbuf_for_msg(unsigned int space_needed, int want_header,
return (m);
}
if (allonebuf) {
- int siz;
-
- if (SCTP_BUF_IS_EXTENDED(m)) {
- siz = SCTP_BUF_EXTEND_SIZE(m);
- } else {
- if (want_header)
- siz = MHLEN;
- else
- siz = MLEN;
- }
- if (siz < space_needed) {
+ if (SCTP_BUF_SIZE(m) < space_needed) {
m_freem(m);
return (NULL);
}
@@ -404,9 +397,7 @@ sctp_get_mbuf_for_msg(unsigned int space_needed, int want_header,
}
#ifdef SCTP_MBUF_LOGGING
if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) {
- if (SCTP_BUF_IS_EXTENDED(m)) {
- sctp_log_mb(m, SCTP_MBUF_IALLOC);
- }
+ sctp_log_mb(m, SCTP_MBUF_IALLOC);
}
#endif
return (m);
diff --git a/freebsd/sys/netinet/sctp_cc_functions.c b/freebsd/sys/netinet/sctp_cc_functions.c
index 9758e011..68dc460a 100644
--- a/freebsd/sys/netinet/sctp_cc_functions.c
+++ b/freebsd/sys/netinet/sctp_cc_functions.c
@@ -55,6 +55,19 @@ __FBSDID("$FreeBSD$");
#define SHIFT_MPTCP_MULTI 8
static void
+sctp_enforce_cwnd_limit(struct sctp_association *assoc, struct sctp_nets *net)
+{
+ if ((assoc->max_cwnd > 0) &&
+ (net->cwnd > assoc->max_cwnd) &&
+ (net->cwnd > (net->mtu - sizeof(struct sctphdr)))) {
+ net->cwnd = assoc->max_cwnd;
+ if (net->cwnd < (net->mtu - sizeof(struct sctphdr))) {
+ net->cwnd = net->mtu - sizeof(struct sctphdr);
+ }
+ }
+}
+
+static void
sctp_set_initial_cc_param(struct sctp_tcb *stcb, struct sctp_nets *net)
{
struct sctp_association *assoc;
@@ -82,8 +95,9 @@ sctp_set_initial_cc_param(struct sctp_tcb *stcb, struct sctp_nets *net)
net->cwnd = net->mtu - sizeof(struct sctphdr);
}
}
+ sctp_enforce_cwnd_limit(assoc, net);
net->ssthresh = assoc->peers_rwnd;
- SDT_PROBE(sctp, cwnd, net, init,
+ SDT_PROBE5(sctp, cwnd, net, init,
stcb->asoc.my_vtag, ((stcb->sctp_ep->sctp_lport << 16) | (stcb->rport)), net,
0, net->cwnd);
if (SCTP_BASE_SYSCTL(sctp_logging_level) &
@@ -180,7 +194,8 @@ sctp_cwnd_update_after_fr(struct sctp_tcb *stcb,
}
}
net->cwnd = net->ssthresh;
- SDT_PROBE(sctp, cwnd, net, fr,
+ sctp_enforce_cwnd_limit(asoc, net);
+ SDT_PROBE5(sctp, cwnd, net, fr,
stcb->asoc.my_vtag, ((stcb->sctp_ep->sctp_lport << 16) | (stcb->rport)), net,
old_cwnd, net->cwnd);
if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) {
@@ -213,7 +228,8 @@ sctp_cwnd_update_after_fr(struct sctp_tcb *stcb,
}
sctp_timer_stop(SCTP_TIMER_TYPE_SEND,
- stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INDATA + SCTP_LOC_32);
+ stcb->sctp_ep, stcb, net,
+ SCTP_FROM_SCTP_CC_FUNCTIONS + SCTP_LOC_1);
sctp_timer_start(SCTP_TIMER_TYPE_SEND,
stcb->sctp_ep, stcb, net);
}
@@ -228,7 +244,7 @@ sctp_cwnd_update_after_fr(struct sctp_tcb *stcb,
}
/* Defines for instantaneous bw decisions */
-#define SCTP_INST_LOOSING 1 /* Loosing to other flows */
+#define SCTP_INST_LOOSING 1 /* Losing to other flows */
#define SCTP_INST_NEUTRAL 2 /* Neutral, no indication */
#define SCTP_INST_GAINING 3 /* Gaining, step down possible */
@@ -247,7 +263,7 @@ cc_bw_same(struct sctp_tcb *stcb, struct sctp_nets *net, uint64_t nbw,
*/
/* Probe point 5 */
probepoint |= ((5 << 16) | 1);
- SDT_PROBE(sctp, cwnd, net, rttvar,
+ SDT_PROBE5(sctp, cwnd, net, rttvar,
vtag,
((net->cc_mod.rtcc.lbw << 32) | nbw),
((net->cc_mod.rtcc.lbw_rtt << 32) | net->rtt),
@@ -268,7 +284,7 @@ cc_bw_same(struct sctp_tcb *stcb, struct sctp_nets *net, uint64_t nbw,
oth |= net->cc_mod.rtcc.step_cnt;
oth <<= 16;
oth |= net->cc_mod.rtcc.last_step_state;
- SDT_PROBE(sctp, cwnd, net, rttstep,
+ SDT_PROBE5(sctp, cwnd, net, rttstep,
vtag,
((net->cc_mod.rtcc.lbw << 32) | nbw),
((net->cc_mod.rtcc.lbw_rtt << 32) | net->rtt),
@@ -292,7 +308,7 @@ cc_bw_same(struct sctp_tcb *stcb, struct sctp_nets *net, uint64_t nbw,
*/
/* Probe point 6 */
probepoint |= ((6 << 16) | 0);
- SDT_PROBE(sctp, cwnd, net, rttvar,
+ SDT_PROBE5(sctp, cwnd, net, rttvar,
vtag,
((net->cc_mod.rtcc.lbw << 32) | nbw),
((net->cc_mod.rtcc.lbw_rtt << 32) | net->rtt),
@@ -304,7 +320,7 @@ cc_bw_same(struct sctp_tcb *stcb, struct sctp_nets *net, uint64_t nbw,
oth |= net->cc_mod.rtcc.step_cnt;
oth <<= 16;
oth |= net->cc_mod.rtcc.last_step_state;
- SDT_PROBE(sctp, cwnd, net, rttstep,
+ SDT_PROBE5(sctp, cwnd, net, rttstep,
vtag,
((net->cc_mod.rtcc.lbw << 32) | nbw),
((net->cc_mod.rtcc.lbw_rtt << 32) | net->rtt),
@@ -335,7 +351,7 @@ cc_bw_same(struct sctp_tcb *stcb, struct sctp_nets *net, uint64_t nbw,
*/
/* Probe point 7 */
probepoint |= ((7 << 16) | net->cc_mod.rtcc.ret_from_eq);
- SDT_PROBE(sctp, cwnd, net, rttvar,
+ SDT_PROBE5(sctp, cwnd, net, rttvar,
vtag,
((net->cc_mod.rtcc.lbw << 32) | nbw),
((net->cc_mod.rtcc.lbw_rtt << 32) | net->rtt),
@@ -384,7 +400,7 @@ cc_bw_decrease(struct sctp_tcb *stcb, struct sctp_nets *net, uint64_t nbw, uint6
/* We caused it maybe.. back off? */
/* PROBE POINT 1 */
probepoint |= ((1 << 16) | 1);
- SDT_PROBE(sctp, cwnd, net, rttvar,
+ SDT_PROBE5(sctp, cwnd, net, rttvar,
vtag,
((net->cc_mod.rtcc.lbw << 32) | nbw),
((net->cc_mod.rtcc.lbw_rtt << 32) | net->rtt),
@@ -402,7 +418,7 @@ cc_bw_decrease(struct sctp_tcb *stcb, struct sctp_nets *net, uint64_t nbw, uint6
}
/* Probe point 2 */
probepoint |= ((2 << 16) | 0);
- SDT_PROBE(sctp, cwnd, net, rttvar,
+ SDT_PROBE5(sctp, cwnd, net, rttvar,
vtag,
((net->cc_mod.rtcc.lbw << 32) | nbw),
((net->cc_mod.rtcc.lbw_rtt << 32) | net->rtt),
@@ -415,7 +431,7 @@ cc_bw_decrease(struct sctp_tcb *stcb, struct sctp_nets *net, uint64_t nbw, uint6
oth |= net->cc_mod.rtcc.step_cnt;
oth <<= 16;
oth |= net->cc_mod.rtcc.last_step_state;
- SDT_PROBE(sctp, cwnd, net, rttstep,
+ SDT_PROBE5(sctp, cwnd, net, rttstep,
vtag,
((net->cc_mod.rtcc.lbw << 32) | nbw),
((net->cc_mod.rtcc.lbw_rtt << 32) | net->rtt),
@@ -428,6 +444,7 @@ cc_bw_decrease(struct sctp_tcb *stcb, struct sctp_nets *net, uint64_t nbw, uint6
if ((net->cc_mod.rtcc.vol_reduce) &&
(inst_ind != SCTP_INST_GAINING)) {
net->cwnd += net->mtu;
+ sctp_enforce_cwnd_limit(&stcb->asoc, net);
net->cc_mod.rtcc.vol_reduce--;
}
net->cc_mod.rtcc.last_step_state = 2;
@@ -438,7 +455,7 @@ cc_bw_decrease(struct sctp_tcb *stcb, struct sctp_nets *net, uint64_t nbw, uint6
/* bw & rtt decreased */
/* Probe point 3 */
probepoint |= ((3 << 16) | 0);
- SDT_PROBE(sctp, cwnd, net, rttvar,
+ SDT_PROBE5(sctp, cwnd, net, rttvar,
vtag,
((net->cc_mod.rtcc.lbw << 32) | nbw),
((net->cc_mod.rtcc.lbw_rtt << 32) | net->rtt),
@@ -450,7 +467,7 @@ cc_bw_decrease(struct sctp_tcb *stcb, struct sctp_nets *net, uint64_t nbw, uint6
oth |= net->cc_mod.rtcc.step_cnt;
oth <<= 16;
oth |= net->cc_mod.rtcc.last_step_state;
- SDT_PROBE(sctp, cwnd, net, rttstep,
+ SDT_PROBE5(sctp, cwnd, net, rttstep,
vtag,
((net->cc_mod.rtcc.lbw << 32) | nbw),
((net->cc_mod.rtcc.lbw_rtt << 32) | net->rtt),
@@ -459,6 +476,7 @@ cc_bw_decrease(struct sctp_tcb *stcb, struct sctp_nets *net, uint64_t nbw, uint6
if ((net->cc_mod.rtcc.vol_reduce) &&
(inst_ind != SCTP_INST_GAINING)) {
net->cwnd += net->mtu;
+ sctp_enforce_cwnd_limit(&stcb->asoc, net);
net->cc_mod.rtcc.vol_reduce--;
}
net->cc_mod.rtcc.last_step_state = 3;
@@ -469,7 +487,7 @@ cc_bw_decrease(struct sctp_tcb *stcb, struct sctp_nets *net, uint64_t nbw, uint6
/* The bw decreased but rtt stayed the same */
/* Probe point 4 */
probepoint |= ((4 << 16) | 0);
- SDT_PROBE(sctp, cwnd, net, rttvar,
+ SDT_PROBE5(sctp, cwnd, net, rttvar,
vtag,
((net->cc_mod.rtcc.lbw << 32) | nbw),
((net->cc_mod.rtcc.lbw_rtt << 32) | net->rtt),
@@ -481,7 +499,7 @@ cc_bw_decrease(struct sctp_tcb *stcb, struct sctp_nets *net, uint64_t nbw, uint6
oth |= net->cc_mod.rtcc.step_cnt;
oth <<= 16;
oth |= net->cc_mod.rtcc.last_step_state;
- SDT_PROBE(sctp, cwnd, net, rttstep,
+ SDT_PROBE5(sctp, cwnd, net, rttstep,
vtag,
((net->cc_mod.rtcc.lbw << 32) | nbw),
((net->cc_mod.rtcc.lbw_rtt << 32) | net->rtt),
@@ -490,6 +508,7 @@ cc_bw_decrease(struct sctp_tcb *stcb, struct sctp_nets *net, uint64_t nbw, uint6
if ((net->cc_mod.rtcc.vol_reduce) &&
(inst_ind != SCTP_INST_GAINING)) {
net->cwnd += net->mtu;
+ sctp_enforce_cwnd_limit(&stcb->asoc, net);
net->cc_mod.rtcc.vol_reduce--;
}
net->cc_mod.rtcc.last_step_state = 4;
@@ -518,7 +537,7 @@ cc_bw_increase(struct sctp_tcb *stcb, struct sctp_nets *net, uint64_t nbw, uint6
*/
/* PROBE POINT 0 */
probepoint = (((uint64_t) net->cwnd) << 32);
- SDT_PROBE(sctp, cwnd, net, rttvar,
+ SDT_PROBE5(sctp, cwnd, net, rttvar,
vtag,
((net->cc_mod.rtcc.lbw << 32) | nbw),
((net->cc_mod.rtcc.lbw_rtt << 32) | net->rtt),
@@ -530,7 +549,7 @@ cc_bw_increase(struct sctp_tcb *stcb, struct sctp_nets *net, uint64_t nbw, uint6
oth |= net->cc_mod.rtcc.step_cnt;
oth <<= 16;
oth |= net->cc_mod.rtcc.last_step_state;
- SDT_PROBE(sctp, cwnd, net, rttstep,
+ SDT_PROBE5(sctp, cwnd, net, rttstep,
vtag,
((net->cc_mod.rtcc.lbw << 32) | nbw),
((net->cc_mod.rtcc.lbw_rtt << 32) | net->rtt),
@@ -546,7 +565,7 @@ cc_bw_increase(struct sctp_tcb *stcb, struct sctp_nets *net, uint64_t nbw, uint6
return (0);
}
-/* RTCC Algoritm to limit growth of cwnd, return
+/* RTCC Algorithm to limit growth of cwnd, return
* true if you want to NOT allow cwnd growth
*/
static int
@@ -630,7 +649,7 @@ cc_bw_limit(struct sctp_tcb *stcb, struct sctp_nets *net, uint64_t nbw)
/* Can't determine do not change */
probepoint |= ((0xd << 16) | inst_ind);
}
- SDT_PROBE(sctp, cwnd, net, rttvar,
+ SDT_PROBE5(sctp, cwnd, net, rttvar,
vtag,
((nbw << 32) | inst_bw),
((net->cc_mod.rtcc.lbw_rtt << 32) | rtt),
@@ -790,7 +809,7 @@ sctp_cwnd_update_after_sack_common(struct sctp_tcb *stcb,
(((uint32_t) (stcb->sctp_ep->sctp_lport)) << 16) |
(stcb->rport);
- SDT_PROBE(sctp, cwnd, net, rttvar,
+ SDT_PROBE5(sctp, cwnd, net, rttvar,
vtag,
nbw,
((net->cc_mod.rtcc.lbw_rtt << 32) | net->rtt),
@@ -884,11 +903,12 @@ sctp_cwnd_update_after_sack_common(struct sctp_tcb *stcb,
break;
}
net->cwnd += incr;
+ sctp_enforce_cwnd_limit(asoc, net);
if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) {
sctp_log_cwnd(stcb, net, incr,
SCTP_CWND_LOG_FROM_SS);
}
- SDT_PROBE(sctp, cwnd, net, ack,
+ SDT_PROBE5(sctp, cwnd, net, ack,
stcb->asoc.my_vtag,
((stcb->sctp_ep->sctp_lport << 16) | (stcb->rport)),
net,
@@ -950,7 +970,8 @@ sctp_cwnd_update_after_sack_common(struct sctp_tcb *stcb,
break;
}
net->cwnd += incr;
- SDT_PROBE(sctp, cwnd, net, ack,
+ sctp_enforce_cwnd_limit(asoc, net);
+ SDT_PROBE5(sctp, cwnd, net, ack,
stcb->asoc.my_vtag,
((stcb->sctp_ep->sctp_lport << 16) | (stcb->rport)),
net,
@@ -982,7 +1003,7 @@ sctp_cwnd_update_exit_pf_common(struct sctp_tcb *stcb, struct sctp_nets *net)
old_cwnd = net->cwnd;
net->cwnd = net->mtu;
- SDT_PROBE(sctp, cwnd, net, ack,
+ SDT_PROBE5(sctp, cwnd, net, ack,
stcb->asoc.my_vtag, ((stcb->sctp_ep->sctp_lport << 16) | (stcb->rport)), net,
old_cwnd, net->cwnd);
SCTPDBG(SCTP_DEBUG_INDATA1, "Destination %p moved from PF to reachable with cwnd %d.\n",
@@ -1053,7 +1074,7 @@ sctp_cwnd_update_after_timeout(struct sctp_tcb *stcb, struct sctp_nets *net)
}
net->cwnd = net->mtu;
net->partial_bytes_acked = 0;
- SDT_PROBE(sctp, cwnd, net, to,
+ SDT_PROBE5(sctp, cwnd, net, to,
stcb->asoc.my_vtag,
((stcb->sctp_ep->sctp_lport << 16) | (stcb->rport)),
net,
@@ -1091,7 +1112,7 @@ sctp_cwnd_update_after_ecn_echo_common(struct sctp_tcb *stcb, struct sctp_nets *
} else {
/*
* Further tuning down required over the drastic
- * orginal cut
+ * original cut
*/
net->ssthresh -= (net->mtu * num_pkt_lost);
net->cwnd -= (net->mtu * num_pkt_lost);
@@ -1113,7 +1134,7 @@ sctp_cwnd_update_after_ecn_echo_common(struct sctp_tcb *stcb, struct sctp_nets *
net->RTO <<= 1;
}
net->cwnd = net->ssthresh;
- SDT_PROBE(sctp, cwnd, net, ecn,
+ SDT_PROBE5(sctp, cwnd, net, ecn,
stcb->asoc.my_vtag,
((stcb->sctp_ep->sctp_lport << 16) | (stcb->rport)),
net,
@@ -1132,12 +1153,9 @@ sctp_cwnd_update_after_packet_dropped(struct sctp_tcb *stcb,
uint32_t * bottle_bw, uint32_t * on_queue)
{
uint32_t bw_avail;
- int rtt;
unsigned int incr;
int old_cwnd = net->cwnd;
- /* need real RTT in msd for this calc */
- rtt = net->rtt / 1000;
/* get bottle neck bw */
*bottle_bw = ntohl(cp->bottle_bw);
/* and whats on queue */
@@ -1146,10 +1164,11 @@ sctp_cwnd_update_after_packet_dropped(struct sctp_tcb *stcb,
* adjust the on-queue if our flight is more it could be that the
* router has not yet gotten data "in-flight" to it
*/
- if (*on_queue < net->flight_size)
+ if (*on_queue < net->flight_size) {
*on_queue = net->flight_size;
- /* calculate the available space */
- bw_avail = (*bottle_bw * rtt) / 1000;
+ }
+ /* rtt is measured in micro seconds, bottle_bw in bytes per second */
+ bw_avail = (uint32_t) (((uint64_t) (*bottle_bw) * net->rtt) / (uint64_t) 1000000);
if (bw_avail > *bottle_bw) {
/*
* Cap the growth to no more than the bottle neck. This can
@@ -1169,7 +1188,6 @@ sctp_cwnd_update_after_packet_dropped(struct sctp_tcb *stcb,
int seg_inflight, seg_onqueue, my_portion;
net->partial_bytes_acked = 0;
-
/* how much are we over queue size? */
incr = *on_queue - bw_avail;
if (stcb->asoc.seen_a_sack_this_pkt) {
@@ -1232,9 +1250,10 @@ sctp_cwnd_update_after_packet_dropped(struct sctp_tcb *stcb,
/* We always have 1 MTU */
net->cwnd = net->mtu;
}
+ sctp_enforce_cwnd_limit(&stcb->asoc, net);
if (net->cwnd - old_cwnd != 0) {
/* log only changes */
- SDT_PROBE(sctp, cwnd, net, pd,
+ SDT_PROBE5(sctp, cwnd, net, pd,
stcb->asoc.my_vtag,
((stcb->sctp_ep->sctp_lport << 16) | (stcb->rport)),
net,
@@ -1256,7 +1275,8 @@ sctp_cwnd_update_after_output(struct sctp_tcb *stcb,
net->ssthresh = net->cwnd;
if (burst_limit) {
net->cwnd = (net->flight_size + (burst_limit * net->mtu));
- SDT_PROBE(sctp, cwnd, net, bl,
+ sctp_enforce_cwnd_limit(&stcb->asoc, net);
+ SDT_PROBE5(sctp, cwnd, net, bl,
stcb->asoc.my_vtag,
((stcb->sctp_ep->sctp_lport << 16) | (stcb->rport)),
net,
@@ -1272,7 +1292,7 @@ sctp_cwnd_update_after_sack(struct sctp_tcb *stcb,
struct sctp_association *asoc,
int accum_moved, int reneged_all, int will_exit)
{
- /* Passing a zero argument in last disables the rtcc algoritm */
+ /* Passing a zero argument in last disables the rtcc algorithm */
sctp_cwnd_update_after_sack_common(stcb, asoc, accum_moved, reneged_all, will_exit, 0);
}
@@ -1280,13 +1300,13 @@ static void
sctp_cwnd_update_after_ecn_echo(struct sctp_tcb *stcb, struct sctp_nets *net,
int in_window, int num_pkt_lost)
{
- /* Passing a zero argument in last disables the rtcc algoritm */
+ /* Passing a zero argument in last disables the rtcc algorithm */
sctp_cwnd_update_after_ecn_echo_common(stcb, net, in_window, num_pkt_lost, 0);
}
/* Here starts the RTCCVAR type CC invented by RRS which
* is a slight mod to RFC2581. We reuse a common routine or
- * two since these algoritms are so close and need to
+ * two since these algorithms are so close and need to
* remain the same.
*/
static void
@@ -1332,7 +1352,7 @@ sctp_cwnd_new_rtcc_transmission_begins(struct sctp_tcb *stcb,
probepoint = (((uint64_t) net->cwnd) << 32);
/* Probe point 8 */
probepoint |= ((8 << 16) | 0);
- SDT_PROBE(sctp, cwnd, net, rttvar,
+ SDT_PROBE5(sctp, cwnd, net, rttvar,
vtag,
((net->cc_mod.rtcc.lbw << 32) | 0),
((net->cc_mod.rtcc.lbw_rtt << 32) | net->rtt),
@@ -1395,7 +1415,7 @@ sctp_set_rtcc_initial_cc_param(struct sctp_tcb *stcb,
vtag = (net->rtt << 32) |
(((uint32_t) (stcb->sctp_ep->sctp_lport)) << 16) |
(stcb->rport);
- SDT_PROBE(sctp, cwnd, net, rttvar,
+ SDT_PROBE5(sctp, cwnd, net, rttvar,
vtag,
0,
0,
@@ -1492,7 +1512,7 @@ sctp_cwnd_update_rtcc_after_sack(struct sctp_tcb *stcb,
struct sctp_association *asoc,
int accum_moved, int reneged_all, int will_exit)
{
- /* Passing a one argument at the last enables the rtcc algoritm */
+ /* Passing a one argument at the last enables the rtcc algorithm */
sctp_cwnd_update_after_sack_common(stcb, asoc, accum_moved, reneged_all, will_exit, 1);
}
@@ -1508,13 +1528,13 @@ sctp_rtt_rtcc_calculated(struct sctp_tcb *stcb SCTP_UNUSED,
struct sctp_hs_raise_drop {
int32_t cwnd;
- int32_t increase;
- int32_t drop_percent;
+ int8_t increase;
+ int8_t drop_percent;
};
#define SCTP_HS_TABLE_SIZE 73
-struct sctp_hs_raise_drop sctp_cwnd_adjust[SCTP_HS_TABLE_SIZE] = {
+static const struct sctp_hs_raise_drop sctp_cwnd_adjust[SCTP_HS_TABLE_SIZE] = {
{38, 1, 50}, /* 0 */
{118, 2, 44}, /* 1 */
{221, 3, 41}, /* 2 */
@@ -1594,6 +1614,7 @@ static void
sctp_hs_cwnd_increase(struct sctp_tcb *stcb, struct sctp_nets *net)
{
int cur_val, i, indx, incr;
+ int old_cwnd = net->cwnd;
cur_val = net->cwnd >> 10;
indx = SCTP_HS_TABLE_SIZE - 1;
@@ -1602,14 +1623,8 @@ sctp_hs_cwnd_increase(struct sctp_tcb *stcb, struct sctp_nets *net)
/* normal mode */
if (net->net_ack > net->mtu) {
net->cwnd += net->mtu;
- if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) {
- sctp_log_cwnd(stcb, net, net->mtu, SCTP_CWND_LOG_FROM_SS);
- }
} else {
net->cwnd += net->net_ack;
- if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) {
- sctp_log_cwnd(stcb, net, net->net_ack, SCTP_CWND_LOG_FROM_SS);
- }
}
} else {
for (i = net->last_hs_used; i < SCTP_HS_TABLE_SIZE; i++) {
@@ -1619,11 +1634,12 @@ sctp_hs_cwnd_increase(struct sctp_tcb *stcb, struct sctp_nets *net)
}
}
net->last_hs_used = indx;
- incr = ((sctp_cwnd_adjust[indx].increase) << 10);
+ incr = (((int32_t) sctp_cwnd_adjust[indx].increase) << 10);
net->cwnd += incr;
- if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) {
- sctp_log_cwnd(stcb, net, incr, SCTP_CWND_LOG_FROM_SS);
- }
+ }
+ sctp_enforce_cwnd_limit(&stcb->asoc, net);
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) {
+ sctp_log_cwnd(stcb, net, (net->cwnd - old_cwnd), SCTP_CWND_LOG_FROM_SS);
}
}
@@ -1644,7 +1660,7 @@ sctp_hs_cwnd_decrease(struct sctp_tcb *stcb, struct sctp_nets *net)
} else {
/* drop by the proper amount */
net->ssthresh = net->cwnd - (int)((net->cwnd / 100) *
- sctp_cwnd_adjust[net->last_hs_used].drop_percent);
+ (int32_t) sctp_cwnd_adjust[net->last_hs_used].drop_percent);
net->cwnd = net->ssthresh;
/* now where are we */
indx = net->last_hs_used;
@@ -1662,6 +1678,7 @@ sctp_hs_cwnd_decrease(struct sctp_tcb *stcb, struct sctp_nets *net)
net->last_hs_used = indx;
}
}
+ sctp_enforce_cwnd_limit(&stcb->asoc, net);
if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) {
sctp_log_cwnd(stcb, net, (net->cwnd - old_cwnd), SCTP_CWND_LOG_FROM_FR);
}
@@ -1718,7 +1735,8 @@ sctp_hs_cwnd_update_after_fr(struct sctp_tcb *stcb,
}
sctp_timer_stop(SCTP_TIMER_TYPE_SEND,
- stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INDATA + SCTP_LOC_32);
+ stcb->sctp_ep, stcb, net,
+ SCTP_FROM_SCTP_CC_FUNCTIONS + SCTP_LOC_2);
sctp_timer_start(SCTP_TIMER_TYPE_SEND,
stcb->sctp_ep, stcb, net);
}
@@ -1793,9 +1811,7 @@ sctp_hs_cwnd_update_after_sack(struct sctp_tcb *stcb,
if (net->cwnd <= net->ssthresh) {
/* We are in slow start */
if (net->flight_size + net->net_ack >= net->cwnd) {
-
sctp_hs_cwnd_increase(stcb, net);
-
} else {
if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) {
sctp_log_cwnd(stcb, net, net->net_ack,
@@ -1809,6 +1825,7 @@ sctp_hs_cwnd_update_after_sack(struct sctp_tcb *stcb,
(net->partial_bytes_acked >= net->cwnd)) {
net->partial_bytes_acked -= net->cwnd;
net->cwnd += net->mtu;
+ sctp_enforce_cwnd_limit(asoc, net);
if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) {
sctp_log_cwnd(stcb, net, net->mtu,
SCTP_CWND_LOG_FROM_CA);
@@ -2047,6 +2064,7 @@ htcp_cong_avoid(struct sctp_tcb *stcb, struct sctp_nets *net)
SCTP_CWND_LOG_FROM_SS);
}
}
+ sctp_enforce_cwnd_limit(&stcb->asoc, net);
} else {
if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) {
sctp_log_cwnd(stcb, net, net->net_ack,
@@ -2068,6 +2086,7 @@ htcp_cong_avoid(struct sctp_tcb *stcb, struct sctp_nets *net)
*/
net->cwnd += net->mtu;
net->partial_bytes_acked = 0;
+ sctp_enforce_cwnd_limit(&stcb->asoc, net);
htcp_alpha_update(&net->cc_mod.htcp_ca);
if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) {
sctp_log_cwnd(stcb, net, net->mtu,
@@ -2114,6 +2133,7 @@ sctp_htcp_set_initial_cc_param(struct sctp_tcb *stcb, struct sctp_nets *net)
*/
net->cwnd = min((net->mtu * 4), max((2 * net->mtu), SCTP_INITIAL_CWND));
net->ssthresh = stcb->asoc.peers_rwnd;
+ sctp_enforce_cwnd_limit(&stcb->asoc, net);
htcp_init(net);
if (SCTP_BASE_SYSCTL(sctp_logging_level) & (SCTP_CWND_MONITOR_ENABLE | SCTP_CWND_LOGGING_ENABLE)) {
@@ -2217,6 +2237,7 @@ sctp_htcp_cwnd_update_after_fr(struct sctp_tcb *stcb,
htcp_reset(&net->cc_mod.htcp_ca);
net->ssthresh = htcp_recalc_ssthresh(net);
net->cwnd = net->ssthresh;
+ sctp_enforce_cwnd_limit(asoc, net);
if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) {
sctp_log_cwnd(stcb, net, (net->cwnd - old_cwnd),
SCTP_CWND_LOG_FROM_FR);
@@ -2247,7 +2268,8 @@ sctp_htcp_cwnd_update_after_fr(struct sctp_tcb *stcb,
}
sctp_timer_stop(SCTP_TIMER_TYPE_SEND,
- stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INDATA + SCTP_LOC_32);
+ stcb->sctp_ep, stcb, net,
+ SCTP_FROM_SCTP_CC_FUNCTIONS + SCTP_LOC_3);
sctp_timer_start(SCTP_TIMER_TYPE_SEND,
stcb->sctp_ep, stcb, net);
}
@@ -2296,13 +2318,14 @@ sctp_htcp_cwnd_update_after_ecn_echo(struct sctp_tcb *stcb,
net->RTO <<= 1;
}
net->cwnd = net->ssthresh;
+ sctp_enforce_cwnd_limit(&stcb->asoc, net);
if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) {
sctp_log_cwnd(stcb, net, (net->cwnd - old_cwnd), SCTP_CWND_LOG_FROM_SAT);
}
}
}
-struct sctp_cc_functions sctp_cc_functions[] = {
+const struct sctp_cc_functions sctp_cc_functions[] = {
{
.sctp_set_initial_cc_param = sctp_set_initial_cc_param,
.sctp_cwnd_update_after_sack = sctp_cwnd_update_after_sack,
diff --git a/freebsd/sys/netinet/sctp_constants.h b/freebsd/sys/netinet/sctp_constants.h
index 0ede04ca..ecde4fee 100644
--- a/freebsd/sys/netinet/sctp_constants.h
+++ b/freebsd/sys/netinet/sctp_constants.h
@@ -66,6 +66,10 @@ __FBSDID("$FreeBSD$");
*/
#define SCTP_LARGEST_INIT_ACCEPTED (65535 - 2048)
+/* Largest length of a chunk */
+#define SCTP_MAX_CHUNK_LENGTH 0xffff
+/* Largest length of an error cause */
+#define SCTP_MAX_CAUSE_LENGTH 0xffff
/* Number of addresses where we just skip the counting */
#define SCTP_COUNT_LIMIT 40
@@ -267,20 +271,11 @@ __FBSDID("$FreeBSD$");
/* how many addresses per assoc remote and local */
#define SCTP_SCALE_FOR_ADDR 2
-/* default AUTO_ASCONF mode enable(1)/disable(0) value (sysctl) */
-#define SCTP_DEFAULT_AUTO_ASCONF 1
-
/* default MULTIPLE_ASCONF mode enable(1)/disable(0) value (sysctl) */
#define SCTP_DEFAULT_MULTIPLE_ASCONFS 0
-/* default MOBILITY_BASE mode enable(1)/disable(0) value (sysctl) */
-#define SCTP_DEFAULT_MOBILITY_BASE 0
-
-/* default MOBILITY_FASTHANDOFF mode enable(1)/disable(0) value (sysctl) */
-#define SCTP_DEFAULT_MOBILITY_FASTHANDOFF 0
-
/*
- * Theshold for rwnd updates, we have to read (sb_hiwat >>
+ * Threshold for rwnd updates, we have to read (sb_hiwat >>
* SCTP_RWND_HIWAT_SHIFT) before we will look to see if we need to send a
* window update sack. When we look, we compare the last rwnd we sent vs the
* current rwnd. It too must be greater than this value. Using 3 divdes the
@@ -350,6 +345,7 @@ __FBSDID("$FreeBSD$");
#define SCTP_RTT_FROM_NON_DATA 0
#define SCTP_RTT_FROM_DATA 1
+#define PR_SCTP_UNORDERED_FLAG 0x0001
/* IP hdr (20/40) + 12+2+2 (enet) + sctp common 12 */
#define SCTP_FIRST_MBUF_RESV 68
@@ -391,8 +387,8 @@ __FBSDID("$FreeBSD$");
/* align to 32-bit sizes */
#define SCTP_SIZE32(x) ((((x) + 3) >> 2) << 2)
-#define IS_SCTP_CONTROL(a) ((a)->chunk_type != SCTP_DATA)
-#define IS_SCTP_DATA(a) ((a)->chunk_type == SCTP_DATA)
+#define IS_SCTP_CONTROL(a) (((a)->chunk_type != SCTP_DATA) && ((a)->chunk_type != SCTP_IDATA))
+#define IS_SCTP_DATA(a) (((a)->chunk_type == SCTP_DATA) || ((a)->chunk_type == SCTP_IDATA))
/* SCTP parameter types */
@@ -467,7 +463,7 @@ __FBSDID("$FreeBSD$");
/*
- * SCTP states for internal state machine XXX (should match "user" values)
+ * SCTP states for internal state machine
*/
#define SCTP_STATE_EMPTY 0x0000
#define SCTP_STATE_INUSE 0x0001
@@ -518,7 +514,7 @@ __FBSDID("$FreeBSD$");
/* Maximum the mapping array will grow to (TSN mapping array) */
#define SCTP_MAPPING_ARRAY 512
-/* size of the inital malloc on the mapping array */
+/* size of the initial malloc on the mapping array */
#define SCTP_INITIAL_MAPPING_ARRAY 16
/* how much we grow the mapping array each call */
#define SCTP_MAPPING_ARRAY_INCR 32
@@ -621,10 +617,6 @@ __FBSDID("$FreeBSD$");
/* 30 seconds + RTO (in ms) */
#define SCTP_HB_DEFAULT_MSEC 30000
-/* Max time I will wait for Shutdown to complete */
-#define SCTP_DEF_MAX_SHUTDOWN_SEC 180
-
-
/*
* This is how long a secret lives, NOT how long a cookie lives how many
* ticks the current secret will live.
@@ -647,7 +639,7 @@ __FBSDID("$FreeBSD$");
#define SCTP_DEF_PMTU_RAISE_SEC 600 /* 10 min between raise attempts */
-/* How many streams I request initally by default */
+/* How many streams I request initially by default */
#define SCTP_OSTREAM_INITIAL 10
#define SCTP_ISTREAM_INITIAL 2048
@@ -774,18 +766,19 @@ __FBSDID("$FreeBSD$");
*/
/* File defines */
-#define SCTP_FROM_SCTP_INPUT 0x10000000
-#define SCTP_FROM_SCTP_PCB 0x20000000
-#define SCTP_FROM_SCTP_INDATA 0x30000000
-#define SCTP_FROM_SCTP_TIMER 0x40000000
-#define SCTP_FROM_SCTP_USRREQ 0x50000000
-#define SCTP_FROM_SCTPUTIL 0x60000000
-#define SCTP_FROM_SCTP6_USRREQ 0x70000000
-#define SCTP_FROM_SCTP_ASCONF 0x80000000
-#define SCTP_FROM_SCTP_OUTPUT 0x90000000
-#define SCTP_FROM_SCTP_PEELOFF 0xa0000000
-#define SCTP_FROM_SCTP_PANDA 0xb0000000
-#define SCTP_FROM_SCTP_SYSCTL 0xc0000000
+#define SCTP_FROM_SCTP_INPUT 0x10000000
+#define SCTP_FROM_SCTP_PCB 0x20000000
+#define SCTP_FROM_SCTP_INDATA 0x30000000
+#define SCTP_FROM_SCTP_TIMER 0x40000000
+#define SCTP_FROM_SCTP_USRREQ 0x50000000
+#define SCTP_FROM_SCTPUTIL 0x60000000
+#define SCTP_FROM_SCTP6_USRREQ 0x70000000
+#define SCTP_FROM_SCTP_ASCONF 0x80000000
+#define SCTP_FROM_SCTP_OUTPUT 0x90000000
+#define SCTP_FROM_SCTP_PEELOFF 0xa0000000
+#define SCTP_FROM_SCTP_PANDA 0xb0000000
+#define SCTP_FROM_SCTP_SYSCTL 0xc0000000
+#define SCTP_FROM_SCTP_CC_FUNCTIONS 0xd0000000
/* Location ID's */
#define SCTP_LOC_1 0x00000001
@@ -821,6 +814,8 @@ __FBSDID("$FreeBSD$");
#define SCTP_LOC_31 0x0000001f
#define SCTP_LOC_32 0x00000020
#define SCTP_LOC_33 0x00000021
+#define SCTP_LOC_34 0x00000022
+#define SCTP_LOC_35 0x00000023
/* Free assoc codes */
@@ -892,12 +887,19 @@ __FBSDID("$FreeBSD$");
/* modular comparison */
/* See RFC 1982 for details. */
-#define SCTP_SSN_GT(a, b) (((a < b) && ((uint16_t)(b - a) > (1U<<15))) || \
- ((a > b) && ((uint16_t)(a - b) < (1U<<15))))
-#define SCTP_SSN_GE(a, b) (SCTP_SSN_GT(a, b) || (a == b))
-#define SCTP_TSN_GT(a, b) (((a < b) && ((uint32_t)(b - a) > (1U<<31))) || \
- ((a > b) && ((uint32_t)(a - b) < (1U<<31))))
-#define SCTP_TSN_GE(a, b) (SCTP_TSN_GT(a, b) || (a == b))
+#define SCTP_UINT16_GT(a, b) (((a < b) && ((uint16_t)(b - a) > (1U<<15))) || \
+ ((a > b) && ((uint16_t)(a - b) < (1U<<15))))
+#define SCTP_UINT16_GE(a, b) (SCTP_UINT16_GT(a, b) || (a == b))
+#define SCTP_UINT32_GT(a, b) (((a < b) && ((uint32_t)(b - a) > (1U<<31))) || \
+ ((a > b) && ((uint32_t)(a - b) < (1U<<31))))
+#define SCTP_UINT32_GE(a, b) (SCTP_UINT32_GT(a, b) || (a == b))
+
+#define SCTP_SSN_GT(a, b) SCTP_UINT16_GT(a, b)
+#define SCTP_SSN_GE(a, b) SCTP_UINT16_GE(a, b)
+#define SCTP_TSN_GT(a, b) SCTP_UINT32_GT(a, b)
+#define SCTP_TSN_GE(a, b) SCTP_UINT32_GE(a, b)
+#define SCTP_MSGID_GT(o, a, b) ((o == 1) ? SCTP_UINT16_GT((uint16_t)a, (uint16_t)b) : SCTP_UINT32_GT(a, b))
+#define SCTP_MSGID_GE(o, a, b) ((o == 1) ? SCTP_UINT16_GE((uint16_t)a, (uint16_t)b) : SCTP_UINT32_GE(a, b))
/* Mapping array manipulation routines */
#define SCTP_IS_TSN_PRESENT(arry, gap) ((arry[(gap >> 3)] >> (gap & 0x07)) & 0x01)
@@ -920,7 +922,7 @@ __FBSDID("$FreeBSD$");
* element. Each entry will take 2 4 byte ints (and of course the overhead
* of the next pointer as well). Using 15 as an example will yield * ((8 *
* 15) + 8) or 128 bytes of overhead for each timewait block that gets
- * initialized. Increasing it to 31 would yeild 256 bytes per block.
+ * initialized. Increasing it to 31 would yield 256 bytes per block.
*/
#define SCTP_NUMBER_IN_VTAG_BLOCK 15
/*
@@ -986,10 +988,7 @@ __FBSDID("$FreeBSD$");
(((uint8_t *)&(a)->s_addr)[1] == 168)))
#define IN4_ISLOOPBACK_ADDRESS(a) \
- ((((uint8_t *)&(a)->s_addr)[0] == 127) && \
- (((uint8_t *)&(a)->s_addr)[1] == 0) && \
- (((uint8_t *)&(a)->s_addr)[2] == 0) && \
- (((uint8_t *)&(a)->s_addr)[3] == 1))
+ (((uint8_t *)&(a)->s_addr)[0] == 127)
#define IN4_ISLINKLOCAL_ADDRESS(a) \
((((uint8_t *)&(a)->s_addr)[0] == 169) && \
diff --git a/freebsd/sys/netinet/sctp_dtrace_declare.h b/freebsd/sys/netinet/sctp_dtrace_declare.h
index f6fe48bd..c5c8f9ce 100644
--- a/freebsd/sys/netinet/sctp_dtrace_declare.h
+++ b/freebsd/sys/netinet/sctp_dtrace_declare.h
@@ -35,7 +35,6 @@ __FBSDID("$FreeBSD$");
#ifndef _NETINET_SCTP_DTRACE_DECLARE_H_
#define _NETINET_SCTP_DTRACE_DECLARE_H_
-#include <rtems/bsd/local/opt_kdtrace.h>
#include <sys/kernel.h>
#include <sys/sdt.h>
diff --git a/freebsd/sys/netinet/sctp_dtrace_define.h b/freebsd/sys/netinet/sctp_dtrace_define.h
index 0bfe18c0..19f44da4 100644
--- a/freebsd/sys/netinet/sctp_dtrace_define.h
+++ b/freebsd/sys/netinet/sctp_dtrace_define.h
@@ -35,7 +35,6 @@ __FBSDID("$FreeBSD$");
#ifndef _NETINET_SCTP_DTRACE_DEFINE_H_
#define _NETINET_SCTP_DTRACE_DEFINE_H_
-#include <rtems/bsd/local/opt_kdtrace.h>
#include <sys/kernel.h>
#include <sys/sdt.h>
@@ -46,131 +45,131 @@ SDT_PROVIDER_DEFINE(sctp);
/********************************************************/
/* Initial */
SDT_PROBE_DEFINE5(sctp, cwnd, net, init,
- "uint32_t", /* The Vtag for this end */
- "uint32_t", /*
- * The port number of the local side << 16 | port number
- * of remote in network byte order.
- */
- "uintptr_t", /* The pointer to the struct sctp_nets * changing */
- "int", /* The old value of the cwnd */
- "int"); /* The new value of the cwnd */
+ "uint32_t", /* The Vtag for this end */
+ "uint32_t", /* The port number of the local side << 16 |
+ * port number of remote in network byte
+ * order. */
+ "uintptr_t", /* The pointer to the struct sctp_nets *
+ * changing */
+ "int", /* The old value of the cwnd */
+ "int"); /* The new value of the cwnd */
/* ACK-INCREASE */
SDT_PROBE_DEFINE5(sctp, cwnd, net, ack,
- "uint32_t", /* The Vtag for this end */
- "uint32_t", /*
- * The port number of the local side << 16 | port number
- * of remote in network byte order.
- */
- "uintptr_t", /* The pointer to the struct sctp_nets * changing */
- "int", /* The old value of the cwnd */
- "int"); /* The new value of the cwnd */
+ "uint32_t", /* The Vtag for this end */
+ "uint32_t", /* The port number of the local side << 16 |
+ * port number of remote in network byte
+ * order. */
+ "uintptr_t", /* The pointer to the struct sctp_nets *
+ * changing */
+ "int", /* The old value of the cwnd */
+ "int"); /* The new value of the cwnd */
/* ACK-INCREASE */
SDT_PROBE_DEFINE5(sctp, cwnd, net, rttvar,
- "uint64_t", /* The Vtag << 32 | localport << 16 | remoteport */
- "uint64_t", /* obw | nbw */
- "uint64_t", /* bwrtt | newrtt */
- "uint64_t", /* flight */
- "uint64_t"); /* (cwnd << 32) | point << 16 | retval(0/1) */
+ "uint64_t", /* The Vtag << 32 | localport << 16 |
+ * remoteport */
+ "uint64_t", /* obw | nbw */
+ "uint64_t", /* bwrtt | newrtt */
+ "uint64_t", /* flight */
+ "uint64_t"); /* (cwnd << 32) | point << 16 | retval(0/1) */
SDT_PROBE_DEFINE5(sctp, cwnd, net, rttstep,
- "uint64_t", /* The Vtag << 32 | localport << 16 | remoteport */
- "uint64_t", /* obw | nbw */
- "uint64_t", /* bwrtt | newrtt */
- "uint64_t", /* flight */
- "uint64_t"); /* (cwnd << 32) | point << 16 | retval(0/1) */
+ "uint64_t", /* The Vtag << 32 | localport << 16 |
+ * remoteport */
+ "uint64_t", /* obw | nbw */
+ "uint64_t", /* bwrtt | newrtt */
+ "uint64_t", /* flight */
+ "uint64_t"); /* (cwnd << 32) | point << 16 | retval(0/1) */
/* FastRetransmit-DECREASE */
SDT_PROBE_DEFINE5(sctp, cwnd, net, fr,
- "uint32_t", /* The Vtag for this end */
- "uint32_t", /*
- * The port number of the local side << 16 | port number
- * of remote in network byte order.
- */
- "uintptr_t", /* The pointer to the struct sctp_nets * changing */
- "int", /* The old value of the cwnd */
- "int"); /* The new value of the cwnd */
+ "uint32_t", /* The Vtag for this end */
+ "uint32_t", /* The port number of the local side << 16 |
+ * port number of remote in network byte
+ * order. */
+ "uintptr_t", /* The pointer to the struct sctp_nets *
+ * changing */
+ "int", /* The old value of the cwnd */
+ "int"); /* The new value of the cwnd */
/* TimeOut-DECREASE */
SDT_PROBE_DEFINE5(sctp, cwnd, net, to,
- "uint32_t", /* The Vtag for this end */
- "uint32_t", /*
- * The port number of the local side << 16 | port number
- * of remote in network byte order.
- */
- "uintptr_t", /* The pointer to the struct sctp_nets * changing */
- "int", /* The old value of the cwnd */
- "int"); /* The new value of the cwnd */
+ "uint32_t", /* The Vtag for this end */
+ "uint32_t", /* The port number of the local side << 16 |
+ * port number of remote in network byte
+ * order. */
+ "uintptr_t", /* The pointer to the struct sctp_nets *
+ * changing */
+ "int", /* The old value of the cwnd */
+ "int"); /* The new value of the cwnd */
/* BurstLimit-DECREASE */
SDT_PROBE_DEFINE5(sctp, cwnd, net, bl,
- "uint32_t", /* The Vtag for this end */
- "uint32_t", /*
- * The port number of the local side << 16 | port number
- * of remote in network byte order.
- */
- "uintptr_t", /* The pointer to the struct sctp_nets * changing */
- "int", /* The old value of the cwnd */
- "int"); /* The new value of the cwnd */
+ "uint32_t", /* The Vtag for this end */
+ "uint32_t", /* The port number of the local side << 16 |
+ * port number of remote in network byte
+ * order. */
+ "uintptr_t", /* The pointer to the struct sctp_nets *
+ * changing */
+ "int", /* The old value of the cwnd */
+ "int"); /* The new value of the cwnd */
/* ECN-DECREASE */
SDT_PROBE_DEFINE5(sctp, cwnd, net, ecn,
- "uint32_t", /* The Vtag for this end */
- "uint32_t", /*
- * The port number of the local side << 16 | port number
- * of remote in network byte order.
- */
- "uintptr_t", /* The pointer to the struct sctp_nets * changing */
- "int", /* The old value of the cwnd */
- "int"); /* The new value of the cwnd */
+ "uint32_t", /* The Vtag for this end */
+ "uint32_t", /* The port number of the local side << 16 |
+ * port number of remote in network byte
+ * order. */
+ "uintptr_t", /* The pointer to the struct sctp_nets *
+ * changing */
+ "int", /* The old value of the cwnd */
+ "int"); /* The new value of the cwnd */
/* PacketDrop-DECREASE */
SDT_PROBE_DEFINE5(sctp, cwnd, net, pd,
- "uint32_t", /* The Vtag for this end */
- "uint32_t", /*
- * The port number of the local side << 16 | port number
- * of remote in network byte order.
- */
- "uintptr_t", /* The pointer to the struct sctp_nets * changing */
- "int", /* The old value of the cwnd */
- "int"); /* The new value of the cwnd */
+ "uint32_t", /* The Vtag for this end */
+ "uint32_t", /* The port number of the local side << 16 |
+ * port number of remote in network byte
+ * order. */
+ "uintptr_t", /* The pointer to the struct sctp_nets *
+ * changing */
+ "int", /* The old value of the cwnd */
+ "int"); /* The new value of the cwnd */
/********************************************************/
/* Rwnd probe - tracks changes in the receiver window for an assoc */
/********************************************************/
SDT_PROBE_DEFINE4(sctp, rwnd, assoc, val,
- "uint32_t", /* The Vtag for this end */
- "uint32_t", /*
- * The port number of the local side << 16 | port number
- * of remote in network byte order.
- */
- "int", /* The up/down amount */
- "int"); /* The new value of the cwnd */
+ "uint32_t", /* The Vtag for this end */
+ "uint32_t", /* The port number of the local side << 16 |
+ * port number of remote in network byte
+ * order. */
+ "int", /* The up/down amount */
+ "int"); /* The new value of the cwnd */
/********************************************************/
/* flight probe - tracks changes in the flight size on a net or assoc */
/********************************************************/
SDT_PROBE_DEFINE5(sctp, flightsize, net, val,
- "uint32_t", /* The Vtag for this end */
- "uint32_t", /*
- * The port number of the local side << 16 | port number
- * of remote in network byte order.
- */
- "uintptr_t", /* The pointer to the struct sctp_nets * changing */
- "int", /* The up/down amount */
- "int"); /* The new value of the cwnd */
+ "uint32_t", /* The Vtag for this end */
+ "uint32_t", /* The port number of the local side << 16 |
+ * port number of remote in network byte
+ * order. */
+ "uintptr_t", /* The pointer to the struct sctp_nets *
+ * changing */
+ "int", /* The up/down amount */
+ "int"); /* The new value of the cwnd */
/********************************************************/
/* The total flight version */
/********************************************************/
SDT_PROBE_DEFINE4(sctp, flightsize, assoc, val,
- "uint32_t", /* The Vtag for this end */
- "uint32_t", /*
- * The port number of the local side << 16 | port number
- * of remote in network byte order.
- */
- "int", /* The up/down amount */
- "int"); /* The new value of the cwnd */
+ "uint32_t", /* The Vtag for this end */
+ "uint32_t", /* The port number of the local side << 16 |
+ * port number of remote in network byte
+ * order. */
+ "int", /* The up/down amount */
+ "int"); /* The new value of the cwnd */
#endif
diff --git a/freebsd/sys/netinet/sctp_header.h b/freebsd/sys/netinet/sctp_header.h
index 8f898a4b..3f4948dd 100644
--- a/freebsd/sys/netinet/sctp_header.h
+++ b/freebsd/sys/netinet/sctp_header.h
@@ -82,12 +82,6 @@ struct sctp_supported_addr_param {
uint16_t addr_type[2]; /* array of supported address types */
} SCTP_PACKED;
-/* ECN parameter */
-struct sctp_ecn_supported_param {
- struct sctp_paramhdr ph;/* type=SCTP_ECN_CAPABLE */
-} SCTP_PACKED;
-
-
/* heartbeat info parameter */
struct sctp_heartbeat_info_param {
struct sctp_paramhdr ph;
@@ -158,6 +152,23 @@ struct sctp_data_chunk {
struct sctp_data dp;
} SCTP_PACKED;
+struct sctp_idata {
+ uint32_t tsn;
+ uint16_t stream_id;
+ uint16_t reserved; /* Where does the SSN go? */
+ uint32_t msg_id;
+ union {
+ uint32_t protocol_id;
+ uint32_t fsn; /* Fragment Sequence Number */
+ } ppid_fsn;
+ /* user data follows */
+} SCTP_PACKED;
+
+struct sctp_idata_chunk {
+ struct sctp_chunkhdr ch;
+ struct sctp_idata dp;
+} SCTP_PACKED;
+
/*
* Structures for the control chunks
*/
@@ -208,34 +219,6 @@ struct sctp_state_cookie { /* this is our definition... */
*/
} SCTP_PACKED;
-
-/* Used for NAT state error cause */
-struct sctp_missing_nat_state {
- uint16_t cause;
- uint16_t length;
- uint8_t data[];
-} SCTP_PACKED;
-
-
-struct sctp_inv_mandatory_param {
- uint16_t cause;
- uint16_t length;
- uint32_t num_param;
- uint16_t param;
- /*
- * We include this to 0 it since only a missing cookie will cause
- * this error.
- */
- uint16_t resv;
-} SCTP_PACKED;
-
-struct sctp_unresolv_addr {
- uint16_t cause;
- uint16_t length;
- uint16_t addr_type;
- uint16_t reserved; /* Only one invalid addr type */
-} SCTP_PACKED;
-
/* state cookie parameter */
struct sctp_state_cookie_param {
struct sctp_paramhdr ph;
@@ -376,28 +359,11 @@ struct sctp_shutdown_complete_chunk {
struct sctp_chunkhdr ch;
} SCTP_PACKED;
-/* Oper error holding a stale cookie */
-struct sctp_stale_cookie_msg {
- struct sctp_paramhdr ph;/* really an error cause */
- uint32_t time_usec;
-} SCTP_PACKED;
-
struct sctp_adaptation_layer_indication {
struct sctp_paramhdr ph;
uint32_t indication;
} SCTP_PACKED;
-struct sctp_cookie_while_shutting_down {
- struct sctphdr sh;
- struct sctp_chunkhdr ch;
- struct sctp_paramhdr ph;/* really an error cause */
-} SCTP_PACKED;
-
-struct sctp_shutdown_complete_msg {
- struct sctphdr sh;
- struct sctp_shutdown_complete_chunk shut_cmp;
-} SCTP_PACKED;
-
/*
* draft-ietf-tsvwg-addip-sctp
*/
@@ -429,6 +395,12 @@ struct sctp_strseq {
uint16_t sequence;
} SCTP_PACKED;
+struct sctp_strseq_mid {
+ uint16_t stream;
+ uint16_t flags;
+ uint32_t msg_id;
+};
+
struct sctp_forward_tsn_msg {
struct sctphdr sh;
struct sctp_forward_tsn_chunk msg;
@@ -456,6 +428,11 @@ struct sctp_pktdrop_chunk {
/**********STREAM RESET STUFF ******************/
+struct sctp_stream_reset_request {
+ struct sctp_paramhdr ph;
+ uint32_t request_seq;
+} SCTP_PACKED;
+
struct sctp_stream_reset_out_request {
struct sctp_paramhdr ph;
uint32_t request_seq; /* monotonically increasing seq no */
@@ -470,7 +447,6 @@ struct sctp_stream_reset_in_request {
uint16_t list_of_streams[]; /* if not all list of streams */
} SCTP_PACKED;
-
struct sctp_stream_reset_tsn_request {
struct sctp_paramhdr ph;
uint32_t request_seq;
@@ -556,12 +532,6 @@ struct sctp_auth_chunk {
uint8_t hmac[];
} SCTP_PACKED;
-struct sctp_auth_invalid_hmac {
- struct sctp_paramhdr ph;
- uint16_t hmac_id;
- uint16_t padding;
-} SCTP_PACKED;
-
/*
* we pre-reserve enough room for a ECNE or CWR AND a SACK with no missing
* pieces. If ENCE is missing we could have a couple of blocks. This way we
diff --git a/freebsd/sys/netinet/sctp_indata.c b/freebsd/sys/netinet/sctp_indata.c
index 07d8fd2b..12c2c80f 100644
--- a/freebsd/sys/netinet/sctp_indata.c
+++ b/freebsd/sys/netinet/sctp_indata.c
@@ -36,18 +36,22 @@
__FBSDID("$FreeBSD$");
#include <netinet/sctp_os.h>
+#include <sys/proc.h>
#include <netinet/sctp_var.h>
#include <netinet/sctp_sysctl.h>
-#include <netinet/sctp_pcb.h>
#include <netinet/sctp_header.h>
+#include <netinet/sctp_pcb.h>
#include <netinet/sctputil.h>
#include <netinet/sctp_output.h>
-#include <netinet/sctp_input.h>
-#include <netinet/sctp_indata.h>
#include <netinet/sctp_uio.h>
+#include <netinet/sctp_auth.h>
#include <netinet/sctp_timer.h>
-
-
+#include <netinet/sctp_asconf.h>
+#include <netinet/sctp_indata.h>
+#include <netinet/sctp_bsd_addr.h>
+#include <netinet/sctp_input.h>
+#include <netinet/sctp_crc32.h>
+#include <netinet/sctp_lock_bsd.h>
/*
* NOTES: On the outbound side of things I need to check the sack timer to
* see if I should generate a sack into the chunk queue (if I have data to
@@ -57,6 +61,13 @@ __FBSDID("$FreeBSD$");
* This will cause sctp_service_queues() to get called on the top entry in
* the list.
*/
+static void
+sctp_add_chk_to_control(struct sctp_queued_to_read *control,
+ struct sctp_stream_in *strm,
+ struct sctp_tcb *stcb,
+ struct sctp_association *asoc,
+ struct sctp_tmit_chunk *chk, int lock_held);
+
void
sctp_set_rwnd(struct sctp_tcb *stcb, struct sctp_association *asoc)
@@ -76,9 +87,9 @@ sctp_calc_rwnd(struct sctp_tcb *stcb, struct sctp_association *asoc)
* sctp_soreceive then we will fix this so that ONLY this
* associations data is taken into account.
*/
- if (stcb->sctp_socket == NULL)
+ if (stcb->sctp_socket == NULL) {
return (calc);
-
+ }
if (stcb->asoc.sb_cc == 0 &&
asoc->size_on_reasm_queue == 0 &&
asoc->size_on_all_streams == 0) {
@@ -88,7 +99,6 @@ sctp_calc_rwnd(struct sctp_tcb *stcb, struct sctp_association *asoc)
}
/* get actual space */
calc = (uint32_t) sctp_sbspace(&stcb->asoc, &stcb->sctp_socket->so_rcv);
-
/*
* take out what has NOT been put on socket queue and we yet hold
* for putting up.
@@ -97,7 +107,6 @@ sctp_calc_rwnd(struct sctp_tcb *stcb, struct sctp_association *asoc)
asoc->cnt_on_reasm_queue * MSIZE));
calc = sctp_sbspace_sub(calc, (uint32_t) (asoc->size_on_all_streams +
asoc->cnt_on_all_streams * MSIZE));
-
if (calc == 0) {
/* out of space */
return (calc);
@@ -124,7 +133,7 @@ sctp_build_readq_entry(struct sctp_tcb *stcb,
struct sctp_nets *net,
uint32_t tsn, uint32_t ppid,
uint32_t context, uint16_t stream_no,
- uint16_t stream_seq, uint8_t flags,
+ uint32_t stream_seq, uint8_t flags,
struct mbuf *dm)
{
struct sctp_queued_to_read *read_queue_e = NULL;
@@ -133,73 +142,26 @@ sctp_build_readq_entry(struct sctp_tcb *stcb,
if (read_queue_e == NULL) {
goto failed_build;
}
+ memset(read_queue_e, 0, sizeof(struct sctp_queued_to_read));
read_queue_e->sinfo_stream = stream_no;
read_queue_e->sinfo_ssn = stream_seq;
read_queue_e->sinfo_flags = (flags << 8);
read_queue_e->sinfo_ppid = ppid;
read_queue_e->sinfo_context = context;
- read_queue_e->sinfo_timetolive = 0;
read_queue_e->sinfo_tsn = tsn;
read_queue_e->sinfo_cumtsn = tsn;
read_queue_e->sinfo_assoc_id = sctp_get_associd(stcb);
+ read_queue_e->top_fsn = read_queue_e->fsn_included = 0xffffffff;
+ TAILQ_INIT(&read_queue_e->reasm);
read_queue_e->whoFrom = net;
- read_queue_e->length = 0;
atomic_add_int(&net->ref_count, 1);
read_queue_e->data = dm;
- read_queue_e->spec_flags = 0;
- read_queue_e->tail_mbuf = NULL;
- read_queue_e->aux_data = NULL;
read_queue_e->stcb = stcb;
read_queue_e->port_from = stcb->rport;
- read_queue_e->do_not_ref_stcb = 0;
- read_queue_e->end_added = 0;
- read_queue_e->some_taken = 0;
- read_queue_e->pdapi_aborted = 0;
failed_build:
return (read_queue_e);
}
-
-/*
- * Build out our readq entry based on the incoming packet.
- */
-static struct sctp_queued_to_read *
-sctp_build_readq_entry_chk(struct sctp_tcb *stcb,
- struct sctp_tmit_chunk *chk)
-{
- struct sctp_queued_to_read *read_queue_e = NULL;
-
- sctp_alloc_a_readq(stcb, read_queue_e);
- if (read_queue_e == NULL) {
- goto failed_build;
- }
- read_queue_e->sinfo_stream = chk->rec.data.stream_number;
- read_queue_e->sinfo_ssn = chk->rec.data.stream_seq;
- read_queue_e->sinfo_flags = (chk->rec.data.rcv_flags << 8);
- read_queue_e->sinfo_ppid = chk->rec.data.payloadtype;
- read_queue_e->sinfo_context = stcb->asoc.context;
- read_queue_e->sinfo_timetolive = 0;
- read_queue_e->sinfo_tsn = chk->rec.data.TSN_seq;
- read_queue_e->sinfo_cumtsn = chk->rec.data.TSN_seq;
- read_queue_e->sinfo_assoc_id = sctp_get_associd(stcb);
- read_queue_e->whoFrom = chk->whoTo;
- read_queue_e->aux_data = NULL;
- read_queue_e->length = 0;
- atomic_add_int(&chk->whoTo->ref_count, 1);
- read_queue_e->data = chk->data;
- read_queue_e->tail_mbuf = NULL;
- read_queue_e->stcb = stcb;
- read_queue_e->port_from = stcb->rport;
- read_queue_e->spec_flags = 0;
- read_queue_e->do_not_ref_stcb = 0;
- read_queue_e->end_added = 0;
- read_queue_e->some_taken = 0;
- read_queue_e->pdapi_aborted = 0;
-failed_build:
- return (read_queue_e);
-}
-
-
struct mbuf *
sctp_build_ctl_nchunk(struct sctp_inpcb *inp, struct sctp_sndrcvinfo *sinfo)
{
@@ -225,9 +187,9 @@ sctp_build_ctl_nchunk(struct sctp_inpcb *inp, struct sctp_sndrcvinfo *sinfo)
}
seinfo = (struct sctp_extrcvinfo *)sinfo;
if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVNXTINFO) &&
- (seinfo->sreinfo_next_flags & SCTP_NEXT_MSG_AVAIL)) {
+ (seinfo->serinfo_next_flags & SCTP_NEXT_MSG_AVAIL)) {
provide_nxt = 1;
- len += CMSG_SPACE(sizeof(struct sctp_rcvinfo));
+ len += CMSG_SPACE(sizeof(struct sctp_nxtinfo));
} else {
provide_nxt = 0;
}
@@ -243,7 +205,7 @@ sctp_build_ctl_nchunk(struct sctp_inpcb *inp, struct sctp_sndrcvinfo *sinfo)
use_extended = 0;
}
- ret = sctp_get_mbuf_for_msg(len, 0, M_DONTWAIT, 1, MT_DATA);
+ ret = sctp_get_mbuf_for_msg(len, 0, M_NOWAIT, 1, MT_DATA);
if (ret == NULL) {
/* No space */
return (ret);
@@ -278,20 +240,20 @@ sctp_build_ctl_nchunk(struct sctp_inpcb *inp, struct sctp_sndrcvinfo *sinfo)
cmh->cmsg_len = CMSG_LEN(sizeof(struct sctp_nxtinfo));
cmh->cmsg_type = SCTP_NXTINFO;
nxtinfo = (struct sctp_nxtinfo *)CMSG_DATA(cmh);
- nxtinfo->nxt_sid = seinfo->sreinfo_next_stream;
+ nxtinfo->nxt_sid = seinfo->serinfo_next_stream;
nxtinfo->nxt_flags = 0;
- if (seinfo->sreinfo_next_flags & SCTP_NEXT_MSG_IS_UNORDERED) {
+ if (seinfo->serinfo_next_flags & SCTP_NEXT_MSG_IS_UNORDERED) {
nxtinfo->nxt_flags |= SCTP_UNORDERED;
}
- if (seinfo->sreinfo_next_flags & SCTP_NEXT_MSG_IS_NOTIFICATION) {
+ if (seinfo->serinfo_next_flags & SCTP_NEXT_MSG_IS_NOTIFICATION) {
nxtinfo->nxt_flags |= SCTP_NOTIFICATION;
}
- if (seinfo->sreinfo_next_flags & SCTP_NEXT_MSG_ISCOMPLETE) {
+ if (seinfo->serinfo_next_flags & SCTP_NEXT_MSG_ISCOMPLETE) {
nxtinfo->nxt_flags |= SCTP_COMPLETE;
}
- nxtinfo->nxt_ppid = seinfo->sreinfo_next_ppid;
- nxtinfo->nxt_length = seinfo->sreinfo_next_length;
- nxtinfo->nxt_assoc_id = seinfo->sreinfo_next_aid;
+ nxtinfo->nxt_ppid = seinfo->serinfo_next_ppid;
+ nxtinfo->nxt_length = seinfo->serinfo_next_length;
+ nxtinfo->nxt_assoc_id = seinfo->serinfo_next_aid;
cmh = (struct cmsghdr *)((caddr_t)cmh + CMSG_SPACE(sizeof(struct sctp_nxtinfo)));
SCTP_BUF_LEN(ret) += CMSG_SPACE(sizeof(struct sctp_nxtinfo));
}
@@ -319,6 +281,7 @@ sctp_mark_non_revokable(struct sctp_association *asoc, uint32_t tsn)
{
uint32_t gap, i, cumackp1;
int fnd = 0;
+ int in_r = 0, in_nr = 0;
if (SCTP_BASE_SYSCTL(sctp_do_drain) == 0) {
return;
@@ -332,15 +295,20 @@ sctp_mark_non_revokable(struct sctp_association *asoc, uint32_t tsn)
return;
}
SCTP_CALC_TSN_TO_GAP(gap, tsn, asoc->mapping_array_base_tsn);
- if (!SCTP_IS_TSN_PRESENT(asoc->mapping_array, gap)) {
+ in_r = SCTP_IS_TSN_PRESENT(asoc->mapping_array, gap);
+ in_nr = SCTP_IS_TSN_PRESENT(asoc->nr_mapping_array, gap);
+ if ((in_r == 0) && (in_nr == 0)) {
+#ifdef INVARIANTS
+ panic("Things are really messed up now");
+#else
SCTP_PRINTF("gap:%x tsn:%x\n", gap, tsn);
sctp_print_mapping_array(asoc);
-#ifdef INVARIANTS
- panic("Things are really messed up now!!");
#endif
}
- SCTP_SET_TSN_PRESENT(asoc->nr_mapping_array, gap);
- SCTP_UNSET_TSN_PRESENT(asoc->mapping_array, gap);
+ if (in_nr == 0)
+ SCTP_SET_TSN_PRESENT(asoc->nr_mapping_array, gap);
+ if (in_r)
+ SCTP_UNSET_TSN_PRESENT(asoc->mapping_array, gap);
if (SCTP_TSN_GT(tsn, asoc->highest_tsn_inside_nr_map)) {
asoc->highest_tsn_inside_nr_map = tsn;
}
@@ -360,197 +328,162 @@ sctp_mark_non_revokable(struct sctp_association *asoc, uint32_t tsn)
}
}
-
-/*
- * We are delivering currently from the reassembly queue. We must continue to
- * deliver until we either: 1) run out of space. 2) run out of sequential
- * TSN's 3) hit the SCTP_DATA_LAST_FRAG flag.
- */
-static void
-sctp_service_reassembly(struct sctp_tcb *stcb, struct sctp_association *asoc)
+static int
+sctp_place_control_in_stream(struct sctp_stream_in *strm,
+ struct sctp_association *asoc,
+ struct sctp_queued_to_read *control)
{
- struct sctp_tmit_chunk *chk, *nchk;
- uint16_t nxt_todel;
- uint16_t stream_no;
- int end = 0;
- int cntDel;
- struct sctp_queued_to_read *control, *ctl, *nctl;
-
- if (stcb == NULL)
- return;
-
- cntDel = stream_no = 0;
- if ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) ||
- (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) ||
- (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET)) {
- /* socket above is long gone or going.. */
-abandon:
- asoc->fragmented_delivery_inprogress = 0;
- TAILQ_FOREACH_SAFE(chk, &asoc->reasmqueue, sctp_next, nchk) {
- TAILQ_REMOVE(&asoc->reasmqueue, chk, sctp_next);
- asoc->size_on_reasm_queue -= chk->send_size;
- sctp_ucount_decr(asoc->cnt_on_reasm_queue);
- /*
- * Lose the data pointer, since its in the socket
- * buffer
- */
- if (chk->data) {
- sctp_m_freem(chk->data);
- chk->data = NULL;
+ struct sctp_queued_to_read *at;
+ struct sctp_readhead *q;
+ uint8_t bits, unordered;
+
+ bits = (control->sinfo_flags >> 8);
+ unordered = bits & SCTP_DATA_UNORDERED;
+ if (unordered) {
+ q = &strm->uno_inqueue;
+ if (asoc->idata_supported == 0) {
+ if (!TAILQ_EMPTY(q)) {
+ /*
+ * Only one stream can be here in old style
+ * -- abort
+ */
+ return (-1);
}
- /* Now free the address and data */
- sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED);
- /* sa_ignore FREED_MEMORY */
+ TAILQ_INSERT_TAIL(q, control, next_instrm);
+ control->on_strm_q = SCTP_ON_UNORDERED;
+ return (0);
}
- return;
+ } else {
+ q = &strm->inqueue;
}
- SCTP_TCB_LOCK_ASSERT(stcb);
- TAILQ_FOREACH_SAFE(chk, &asoc->reasmqueue, sctp_next, nchk) {
- if (chk->rec.data.TSN_seq != (asoc->tsn_last_delivered + 1)) {
- /* Can't deliver more :< */
- return;
- }
- stream_no = chk->rec.data.stream_number;
- nxt_todel = asoc->strmin[stream_no].last_sequence_delivered + 1;
- if (nxt_todel != chk->rec.data.stream_seq &&
- (chk->rec.data.rcv_flags & SCTP_DATA_UNORDERED) == 0) {
- /*
- * Not the next sequence to deliver in its stream OR
- * unordered
- */
- return;
- }
- if (chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) {
-
- control = sctp_build_readq_entry_chk(stcb, chk);
- if (control == NULL) {
- /* out of memory? */
- return;
- }
- /* save it off for our future deliveries */
- stcb->asoc.control_pdapi = control;
- if (chk->rec.data.rcv_flags & SCTP_DATA_LAST_FRAG)
- end = 1;
- else
- end = 0;
- sctp_mark_non_revokable(asoc, chk->rec.data.TSN_seq);
- sctp_add_to_readq(stcb->sctp_ep,
- stcb, control, &stcb->sctp_socket->so_rcv, end,
- SCTP_READ_LOCK_NOT_HELD, SCTP_SO_NOT_LOCKED);
- cntDel++;
+ if ((bits & SCTP_DATA_NOT_FRAG) == SCTP_DATA_NOT_FRAG) {
+ control->end_added = control->last_frag_seen = control->first_frag_seen = 1;
+ }
+ if (TAILQ_EMPTY(q)) {
+ /* Empty queue */
+ TAILQ_INSERT_HEAD(q, control, next_instrm);
+ if (unordered) {
+ control->on_strm_q = SCTP_ON_UNORDERED;
} else {
- if (chk->rec.data.rcv_flags & SCTP_DATA_LAST_FRAG)
- end = 1;
- else
- end = 0;
- sctp_mark_non_revokable(asoc, chk->rec.data.TSN_seq);
- if (sctp_append_to_readq(stcb->sctp_ep, stcb,
- stcb->asoc.control_pdapi,
- chk->data, end, chk->rec.data.TSN_seq,
- &stcb->sctp_socket->so_rcv)) {
+ control->on_strm_q = SCTP_ON_ORDERED;
+ }
+ return (0);
+ } else {
+ TAILQ_FOREACH(at, q, next_instrm) {
+ if (SCTP_TSN_GT(at->msg_id, control->msg_id)) {
/*
- * something is very wrong, either
- * control_pdapi is NULL, or the tail_mbuf
- * is corrupt, or there is a EOM already on
- * the mbuf chain.
+ * one in queue is bigger than the new one,
+ * insert before this one
*/
- if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) {
- goto abandon;
+ TAILQ_INSERT_BEFORE(at, control, next_instrm);
+ if (unordered) {
+ control->on_strm_q = SCTP_ON_UNORDERED;
} else {
-#ifdef INVARIANTS
- if ((stcb->asoc.control_pdapi == NULL) || (stcb->asoc.control_pdapi->tail_mbuf == NULL)) {
- panic("This should not happen control_pdapi NULL?");
+ control->on_strm_q = SCTP_ON_ORDERED;
+ }
+ break;
+ } else if (at->msg_id == control->msg_id) {
+ /*
+ * Gak, He sent me a duplicate msg id
+ * number?? return -1 to abort.
+ */
+ return (-1);
+ } else {
+ if (TAILQ_NEXT(at, next_instrm) == NULL) {
+ /*
+ * We are at the end, insert it
+ * after this one
+ */
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_STR_LOGGING_ENABLE) {
+ sctp_log_strm_del(control, at,
+ SCTP_STR_LOG_FROM_INSERT_TL);
}
- /* if we did not panic, it was a EOM */
- panic("Bad chunking ??");
-#else
- if ((stcb->asoc.control_pdapi == NULL) || (stcb->asoc.control_pdapi->tail_mbuf == NULL)) {
- SCTP_PRINTF("This should not happen control_pdapi NULL?\n");
+ TAILQ_INSERT_AFTER(q,
+ at, control, next_instrm);
+ if (unordered) {
+ control->on_strm_q = SCTP_ON_UNORDERED;
+ } else {
+ control->on_strm_q = SCTP_ON_ORDERED;
}
- SCTP_PRINTF("Bad chunking ??\n");
- SCTP_PRINTF("Dumping re-assembly queue this will probably hose the association\n");
-
-#endif
- goto abandon;
+ break;
}
}
- cntDel++;
}
- /* pull it we did it */
- TAILQ_REMOVE(&asoc->reasmqueue, chk, sctp_next);
- if (chk->rec.data.rcv_flags & SCTP_DATA_LAST_FRAG) {
- asoc->fragmented_delivery_inprogress = 0;
- if ((chk->rec.data.rcv_flags & SCTP_DATA_UNORDERED) == 0) {
- asoc->strmin[stream_no].last_sequence_delivered++;
- }
- if ((chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) == 0) {
- SCTP_STAT_INCR_COUNTER64(sctps_reasmusrmsgs);
- }
- } else if (chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) {
- /*
- * turn the flag back on since we just delivered
- * yet another one.
- */
- asoc->fragmented_delivery_inprogress = 1;
- }
- asoc->tsn_of_pdapi_last_delivered = chk->rec.data.TSN_seq;
- asoc->last_flags_delivered = chk->rec.data.rcv_flags;
- asoc->last_strm_seq_delivered = chk->rec.data.stream_seq;
- asoc->last_strm_no_delivered = chk->rec.data.stream_number;
+ }
+ return (0);
+}
- asoc->tsn_last_delivered = chk->rec.data.TSN_seq;
- asoc->size_on_reasm_queue -= chk->send_size;
- sctp_ucount_decr(asoc->cnt_on_reasm_queue);
- /* free up the chk */
- chk->data = NULL;
- sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED);
+static void
+sctp_abort_in_reasm(struct sctp_tcb *stcb,
+ struct sctp_queued_to_read *control,
+ struct sctp_tmit_chunk *chk,
+ int *abort_flag, int opspot)
+{
+ char msg[SCTP_DIAG_INFO_LEN];
+ struct mbuf *oper;
+
+ if (stcb->asoc.idata_supported) {
+ snprintf(msg, sizeof(msg),
+ "Reass %x,CF:%x,TSN=%8.8x,SID=%4.4x,FSN=%8.8x,MID:%8.8x",
+ opspot,
+ control->fsn_included,
+ chk->rec.data.TSN_seq,
+ chk->rec.data.stream_number,
+ chk->rec.data.fsn_num, chk->rec.data.stream_seq);
+ } else {
+ snprintf(msg, sizeof(msg),
+ "Reass %x,CI:%x,TSN=%8.8x,SID=%4.4x,FSN=%4.4x,SSN:%4.4x",
+ opspot,
+ control->fsn_included,
+ chk->rec.data.TSN_seq,
+ chk->rec.data.stream_number,
+ chk->rec.data.fsn_num,
+ (uint16_t) chk->rec.data.stream_seq);
+ }
+ oper = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg);
+ sctp_m_freem(chk->data);
+ chk->data = NULL;
+ sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED);
+ stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_1;
+ sctp_abort_an_association(stcb->sctp_ep, stcb, oper, SCTP_SO_NOT_LOCKED);
+ *abort_flag = 1;
+}
- if (asoc->fragmented_delivery_inprogress == 0) {
- /*
- * Now lets see if we can deliver the next one on
- * the stream
- */
- struct sctp_stream_in *strm;
+static void
+sctp_clean_up_control(struct sctp_tcb *stcb, struct sctp_queued_to_read *control)
+{
+ /*
+ * The control could not be placed and must be cleaned.
+ */
+ struct sctp_tmit_chunk *chk, *nchk;
- strm = &asoc->strmin[stream_no];
- nxt_todel = strm->last_sequence_delivered + 1;
- TAILQ_FOREACH_SAFE(ctl, &strm->inqueue, next, nctl) {
- /* Deliver more if we can. */
- if (nxt_todel == ctl->sinfo_ssn) {
- TAILQ_REMOVE(&strm->inqueue, ctl, next);
- asoc->size_on_all_streams -= ctl->length;
- sctp_ucount_decr(asoc->cnt_on_all_streams);
- strm->last_sequence_delivered++;
- sctp_mark_non_revokable(asoc, ctl->sinfo_tsn);
- sctp_add_to_readq(stcb->sctp_ep, stcb,
- ctl,
- &stcb->sctp_socket->so_rcv, 1,
- SCTP_READ_LOCK_NOT_HELD, SCTP_SO_NOT_LOCKED);
- } else {
- break;
- }
- nxt_todel = strm->last_sequence_delivered + 1;
- }
- break;
- }
+ TAILQ_FOREACH_SAFE(chk, &control->reasm, sctp_next, nchk) {
+ TAILQ_REMOVE(&control->reasm, chk, sctp_next);
+ if (chk->data)
+ sctp_m_freem(chk->data);
+ chk->data = NULL;
+ sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED);
}
+ sctp_free_a_readq(stcb, control);
}
/*
* Queue the chunk either right into the socket buffer if it is the next one
* to go OR put it in the correct place in the delivery queue. If we do
- * append to the so_buf, keep doing so until we are out of order. One big
- * question still remains, what to do when the socket buffer is FULL??
+ * append to the so_buf, keep doing so until we are out of order as
+ * long as the control's entered are non-fragmented.
*/
static void
-sctp_queue_data_to_stream(struct sctp_tcb *stcb, struct sctp_association *asoc,
- struct sctp_queued_to_read *control, int *abort_flag)
+sctp_queue_data_to_stream(struct sctp_tcb *stcb,
+ struct sctp_stream_in *strm,
+ struct sctp_association *asoc,
+ struct sctp_queued_to_read *control, int *abort_flag, int *need_reasm)
{
/*
* FIX-ME maybe? What happens when the ssn wraps? If we are getting
* all the data in one stream this could happen quite rapidly. One
* could use the TSN to keep track of things, but this scheme breaks
- * down in the other type of stream useage that could occur. Send a
+ * down in the other type of stream usage that could occur. Send a
* single msg to stream 0, send 4Billion messages to stream 1, now
* send a message to stream 0. You have a situation where the TSN
* has wrapped but not in the stream. Is this worth worrying about
@@ -564,47 +497,57 @@ sctp_queue_data_to_stream(struct sctp_tcb *stcb, struct sctp_association *asoc,
* SSN alone. Maybe a hybred approach is the answer
*
*/
- struct sctp_stream_in *strm;
struct sctp_queued_to_read *at;
int queue_needed;
- uint16_t nxt_todel;
+ uint32_t nxt_todel;
struct mbuf *op_err;
char msg[SCTP_DIAG_INFO_LEN];
- queue_needed = 1;
- asoc->size_on_all_streams += control->length;
- sctp_ucount_incr(asoc->cnt_on_all_streams);
- strm = &asoc->strmin[control->sinfo_stream];
- nxt_todel = strm->last_sequence_delivered + 1;
if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_STR_LOGGING_ENABLE) {
sctp_log_strm_del(control, NULL, SCTP_STR_LOG_FROM_INTO_STRD);
}
- SCTPDBG(SCTP_DEBUG_INDATA1,
- "queue to stream called for ssn:%u lastdel:%u nxt:%u\n",
- (uint32_t) control->sinfo_stream,
- (uint32_t) strm->last_sequence_delivered,
- (uint32_t) nxt_todel);
- if (SCTP_SSN_GE(strm->last_sequence_delivered, control->sinfo_ssn)) {
+ if (SCTP_MSGID_GT((!asoc->idata_supported), strm->last_sequence_delivered, control->sinfo_ssn)) {
/* The incoming sseq is behind where we last delivered? */
- SCTPDBG(SCTP_DEBUG_INDATA1, "Duplicate S-SEQ:%d delivered:%d from peer, Abort association\n",
+ SCTPDBG(SCTP_DEBUG_INDATA1, "Duplicate S-SEQ: %u delivered: %u from peer, Abort association\n",
control->sinfo_ssn, strm->last_sequence_delivered);
protocol_error:
/*
* throw it in the stream so it gets cleaned up in
* association destruction
*/
- TAILQ_INSERT_HEAD(&strm->inqueue, control, next);
+ TAILQ_INSERT_HEAD(&strm->inqueue, control, next_instrm);
snprintf(msg, sizeof(msg), "Delivered SSN=%4.4x, got TSN=%8.8x, SID=%4.4x, SSN=%4.4x",
strm->last_sequence_delivered, control->sinfo_tsn,
control->sinfo_stream, control->sinfo_ssn);
op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg);
- stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_1;
+ stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_2;
sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED);
*abort_flag = 1;
return;
}
+ if ((SCTP_TSN_GE(asoc->cumulative_tsn, control->sinfo_tsn)) && (asoc->idata_supported == 0)) {
+ goto protocol_error;
+ }
+ queue_needed = 1;
+ asoc->size_on_all_streams += control->length;
+ sctp_ucount_incr(asoc->cnt_on_all_streams);
+ nxt_todel = strm->last_sequence_delivered + 1;
if (nxt_todel == control->sinfo_ssn) {
+#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ struct socket *so;
+
+ so = SCTP_INP_SO(stcb->sctp_ep);
+ atomic_add_int(&stcb->asoc.refcnt, 1);
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_SOCKET_LOCK(so, 1);
+ SCTP_TCB_LOCK(stcb);
+ atomic_subtract_int(&stcb->asoc.refcnt, 1);
+ if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) {
+ SCTP_SOCKET_UNLOCK(so, 1);
+ return;
+ }
+#endif
/* can be delivered right away? */
if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_STR_LOGGING_ENABLE) {
sctp_log_strm_del(control, NULL, SCTP_STR_LOG_FROM_IMMED_DEL);
@@ -614,19 +557,27 @@ protocol_error:
asoc->size_on_all_streams -= control->length;
sctp_ucount_decr(asoc->cnt_on_all_streams);
strm->last_sequence_delivered++;
-
sctp_mark_non_revokable(asoc, control->sinfo_tsn);
sctp_add_to_readq(stcb->sctp_ep, stcb,
control,
&stcb->sctp_socket->so_rcv, 1,
- SCTP_READ_LOCK_NOT_HELD, SCTP_SO_NOT_LOCKED);
- TAILQ_FOREACH_SAFE(control, &strm->inqueue, next, at) {
+ SCTP_READ_LOCK_NOT_HELD, SCTP_SO_LOCKED);
+ TAILQ_FOREACH_SAFE(control, &strm->inqueue, next_instrm, at) {
/* all delivered */
nxt_todel = strm->last_sequence_delivered + 1;
- if (nxt_todel == control->sinfo_ssn) {
- TAILQ_REMOVE(&strm->inqueue, control, next);
+ if ((nxt_todel == control->sinfo_ssn) &&
+ (((control->sinfo_flags >> 8) & SCTP_DATA_NOT_FRAG) == SCTP_DATA_NOT_FRAG)) {
asoc->size_on_all_streams -= control->length;
sctp_ucount_decr(asoc->cnt_on_all_streams);
+ if (control->on_strm_q == SCTP_ON_ORDERED) {
+ TAILQ_REMOVE(&strm->inqueue, control, next_instrm);
+#ifdef INVARIANTS
+ } else {
+ panic("Huh control: %p is on_strm_q: %d",
+ control, control->on_strm_q);
+#endif
+ }
+ control->on_strm_q = 0;
strm->last_sequence_delivered++;
/*
* We ignore the return of deliver_data here
@@ -643,184 +594,686 @@ protocol_error:
control,
&stcb->sctp_socket->so_rcv, 1,
SCTP_READ_LOCK_NOT_HELD,
- SCTP_SO_NOT_LOCKED);
+ SCTP_SO_LOCKED);
continue;
+ } else if (nxt_todel == control->sinfo_ssn) {
+ *need_reasm = 1;
}
break;
}
+#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ SCTP_SOCKET_UNLOCK(so, 1);
+#endif
}
if (queue_needed) {
/*
* Ok, we did not deliver this guy, find the correct place
* to put it on the queue.
*/
- if (SCTP_TSN_GE(asoc->cumulative_tsn, control->sinfo_tsn)) {
- goto protocol_error;
+ if (sctp_place_control_in_stream(strm, asoc, control)) {
+ snprintf(msg, sizeof(msg),
+ "Queue to str msg_id: %u duplicate",
+ control->msg_id);
+ sctp_clean_up_control(stcb, control);
+ op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg);
+ stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_3;
+ sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED);
+ *abort_flag = 1;
}
- if (TAILQ_EMPTY(&strm->inqueue)) {
- /* Empty queue */
- if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_STR_LOGGING_ENABLE) {
- sctp_log_strm_del(control, NULL, SCTP_STR_LOG_FROM_INSERT_HD);
+ }
+}
+
+
+static void
+sctp_setup_tail_pointer(struct sctp_queued_to_read *control)
+{
+ struct mbuf *m, *prev = NULL;
+ struct sctp_tcb *stcb;
+
+ stcb = control->stcb;
+ control->held_length = 0;
+ control->length = 0;
+ m = control->data;
+ while (m) {
+ if (SCTP_BUF_LEN(m) == 0) {
+ /* Skip mbufs with NO length */
+ if (prev == NULL) {
+ /* First one */
+ control->data = sctp_m_free(m);
+ m = control->data;
+ } else {
+ SCTP_BUF_NEXT(prev) = sctp_m_free(m);
+ m = SCTP_BUF_NEXT(prev);
}
- TAILQ_INSERT_HEAD(&strm->inqueue, control, next);
- } else {
- TAILQ_FOREACH(at, &strm->inqueue, next) {
- if (SCTP_SSN_GT(at->sinfo_ssn, control->sinfo_ssn)) {
+ if (m == NULL) {
+ control->tail_mbuf = prev;
+ }
+ continue;
+ }
+ prev = m;
+ atomic_add_int(&control->length, SCTP_BUF_LEN(m));
+ if (control->on_read_q) {
+ /*
+ * On read queue so we must increment the SB stuff,
+ * we assume caller has done any locks of SB.
+ */
+ sctp_sballoc(stcb, &stcb->sctp_socket->so_rcv, m);
+ }
+ m = SCTP_BUF_NEXT(m);
+ }
+ if (prev) {
+ control->tail_mbuf = prev;
+ }
+}
+
+static void
+sctp_add_to_tail_pointer(struct sctp_queued_to_read *control, struct mbuf *m)
+{
+ struct mbuf *prev = NULL;
+ struct sctp_tcb *stcb;
+
+ stcb = control->stcb;
+ if (stcb == NULL) {
+#ifdef INVARIANTS
+ panic("Control broken");
+#else
+ return;
+#endif
+ }
+ if (control->tail_mbuf == NULL) {
+ /* TSNH */
+ control->data = m;
+ sctp_setup_tail_pointer(control);
+ return;
+ }
+ control->tail_mbuf->m_next = m;
+ while (m) {
+ if (SCTP_BUF_LEN(m) == 0) {
+ /* Skip mbufs with NO length */
+ if (prev == NULL) {
+ /* First one */
+ control->tail_mbuf->m_next = sctp_m_free(m);
+ m = control->tail_mbuf->m_next;
+ } else {
+ SCTP_BUF_NEXT(prev) = sctp_m_free(m);
+ m = SCTP_BUF_NEXT(prev);
+ }
+ if (m == NULL) {
+ control->tail_mbuf = prev;
+ }
+ continue;
+ }
+ prev = m;
+ if (control->on_read_q) {
+ /*
+ * On read queue so we must increment the SB stuff,
+ * we assume caller has done any locks of SB.
+ */
+ sctp_sballoc(stcb, &stcb->sctp_socket->so_rcv, m);
+ }
+ atomic_add_int(&control->length, SCTP_BUF_LEN(m));
+ m = SCTP_BUF_NEXT(m);
+ }
+ if (prev) {
+ control->tail_mbuf = prev;
+ }
+}
+
+static void
+sctp_build_readq_entry_from_ctl(struct sctp_queued_to_read *nc, struct sctp_queued_to_read *control)
+{
+ memset(nc, 0, sizeof(struct sctp_queued_to_read));
+ nc->sinfo_stream = control->sinfo_stream;
+ nc->sinfo_ssn = control->sinfo_ssn;
+ TAILQ_INIT(&nc->reasm);
+ nc->top_fsn = control->top_fsn;
+ nc->msg_id = control->msg_id;
+ nc->sinfo_flags = control->sinfo_flags;
+ nc->sinfo_ppid = control->sinfo_ppid;
+ nc->sinfo_context = control->sinfo_context;
+ nc->fsn_included = 0xffffffff;
+ nc->sinfo_tsn = control->sinfo_tsn;
+ nc->sinfo_cumtsn = control->sinfo_cumtsn;
+ nc->sinfo_assoc_id = control->sinfo_assoc_id;
+ nc->whoFrom = control->whoFrom;
+ atomic_add_int(&nc->whoFrom->ref_count, 1);
+ nc->stcb = control->stcb;
+ nc->port_from = control->port_from;
+}
+
+static void
+sctp_reset_a_control(struct sctp_queued_to_read *control,
+ struct sctp_inpcb *inp, uint32_t tsn)
+{
+ control->fsn_included = tsn;
+ if (control->on_read_q) {
+ /*
+ * We have to purge it from there, hopefully this will work
+ * :-)
+ */
+ TAILQ_REMOVE(&inp->read_queue, control, next);
+ control->on_read_q = 0;
+ }
+}
+
+static int
+sctp_handle_old_unordered_data(struct sctp_tcb *stcb,
+ struct sctp_association *asoc,
+ struct sctp_stream_in *strm,
+ struct sctp_queued_to_read *control,
+ uint32_t pd_point,
+ int inp_read_lock_held)
+{
+ /*
+ * Special handling for the old un-ordered data chunk. All the
+ * chunks/TSN's go to msg_id 0. So we have to do the old style
+ * watching to see if we have it all. If you return one, no other
+ * control entries on the un-ordered queue will be looked at. In
+ * theory there should be no others entries in reality, unless the
+ * guy is sending both unordered NDATA and unordered DATA...
+ */
+ struct sctp_tmit_chunk *chk, *lchk, *tchk;
+ uint32_t fsn;
+ struct sctp_queued_to_read *nc;
+ int cnt_added;
+
+ if (control->first_frag_seen == 0) {
+ /* Nothing we can do, we have not seen the first piece yet */
+ return (1);
+ }
+ /* Collapse any we can */
+ cnt_added = 0;
+restart:
+ fsn = control->fsn_included + 1;
+ /* Now what can we add? */
+ TAILQ_FOREACH_SAFE(chk, &control->reasm, sctp_next, lchk) {
+ if (chk->rec.data.fsn_num == fsn) {
+ /* Ok lets add it */
+ sctp_alloc_a_readq(stcb, nc);
+ if (nc == NULL) {
+ break;
+ }
+ memset(nc, 0, sizeof(struct sctp_queued_to_read));
+ TAILQ_REMOVE(&control->reasm, chk, sctp_next);
+ sctp_add_chk_to_control(control, strm, stcb, asoc, chk, SCTP_READ_LOCK_NOT_HELD);
+ fsn++;
+ cnt_added++;
+ chk = NULL;
+ if (control->end_added) {
+ /* We are done */
+ if (!TAILQ_EMPTY(&control->reasm)) {
/*
- * one in queue is bigger than the
- * new one, insert before this one
+ * Ok we have to move anything left
+ * on the control queue to a new
+ * control.
*/
- if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_STR_LOGGING_ENABLE) {
- sctp_log_strm_del(control, at,
- SCTP_STR_LOG_FROM_INSERT_MD);
+ sctp_build_readq_entry_from_ctl(nc, control);
+ tchk = TAILQ_FIRST(&control->reasm);
+ if (tchk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) {
+ TAILQ_REMOVE(&control->reasm, tchk, sctp_next);
+ nc->first_frag_seen = 1;
+ nc->fsn_included = tchk->rec.data.fsn_num;
+ nc->data = tchk->data;
+ nc->sinfo_ppid = tchk->rec.data.payloadtype;
+ nc->sinfo_tsn = tchk->rec.data.TSN_seq;
+ sctp_mark_non_revokable(asoc, tchk->rec.data.TSN_seq);
+ tchk->data = NULL;
+ sctp_free_a_chunk(stcb, tchk, SCTP_SO_NOT_LOCKED);
+ sctp_setup_tail_pointer(nc);
+ tchk = TAILQ_FIRST(&control->reasm);
+ }
+ /* Spin the rest onto the queue */
+ while (tchk) {
+ TAILQ_REMOVE(&control->reasm, tchk, sctp_next);
+ TAILQ_INSERT_TAIL(&nc->reasm, tchk, sctp_next);
+ tchk = TAILQ_FIRST(&control->reasm);
}
- TAILQ_INSERT_BEFORE(at, control, next);
- break;
- } else if (at->sinfo_ssn == control->sinfo_ssn) {
/*
- * Gak, He sent me a duplicate str
- * seq number
+ * Now lets add it to the queue
+ * after removing control
*/
+ TAILQ_INSERT_TAIL(&strm->uno_inqueue, nc, next_instrm);
+ nc->on_strm_q = SCTP_ON_UNORDERED;
+ if (control->on_strm_q) {
+ TAILQ_REMOVE(&strm->uno_inqueue, control, next_instrm);
+ control->on_strm_q = 0;
+ }
+ }
+ if (control->pdapi_started) {
+ strm->pd_api_started = 0;
+ control->pdapi_started = 0;
+ }
+ if (control->on_strm_q) {
+ TAILQ_REMOVE(&strm->uno_inqueue, control, next_instrm);
+ control->on_strm_q = 0;
+ SCTP_STAT_INCR_COUNTER64(sctps_reasmusrmsgs);
+ }
+ if (control->on_read_q == 0) {
+ sctp_add_to_readq(stcb->sctp_ep, stcb, control,
+ &stcb->sctp_socket->so_rcv, control->end_added,
+ inp_read_lock_held, SCTP_SO_NOT_LOCKED);
+ }
+ sctp_wakeup_the_read_socket(stcb->sctp_ep, stcb, SCTP_SO_NOT_LOCKED);
+ if ((nc->first_frag_seen) && !TAILQ_EMPTY(&nc->reasm)) {
/*
- * foo bar, I guess I will just free
- * this new guy, should we abort
- * too? FIX ME MAYBE? Or it COULD be
- * that the SSN's have wrapped.
- * Maybe I should compare to TSN
- * somehow... sigh for now just blow
- * away the chunk!
+ * Switch to the new guy and
+ * continue
*/
-
- if (control->data)
- sctp_m_freem(control->data);
- control->data = NULL;
- asoc->size_on_all_streams -= control->length;
- sctp_ucount_decr(asoc->cnt_on_all_streams);
- if (control->whoFrom) {
- sctp_free_remote_addr(control->whoFrom);
- control->whoFrom = NULL;
- }
- sctp_free_a_readq(stcb, control);
- return;
+ control = nc;
+ goto restart;
} else {
- if (TAILQ_NEXT(at, next) == NULL) {
- /*
- * We are at the end, insert
- * it after this one
- */
- if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_STR_LOGGING_ENABLE) {
- sctp_log_strm_del(control, at,
- SCTP_STR_LOG_FROM_INSERT_TL);
- }
- TAILQ_INSERT_AFTER(&strm->inqueue,
- at, control, next);
- break;
+ if (nc->on_strm_q == 0) {
+ sctp_free_a_readq(stcb, nc);
}
}
+ return (1);
+ } else {
+ sctp_free_a_readq(stcb, nc);
}
+ } else {
+ /* Can't add more */
+ break;
}
}
+ if ((control->length > pd_point) && (strm->pd_api_started == 0)) {
+ strm->pd_api_started = 1;
+ control->pdapi_started = 1;
+ sctp_add_to_readq(stcb->sctp_ep, stcb, control,
+ &stcb->sctp_socket->so_rcv, control->end_added,
+ inp_read_lock_held, SCTP_SO_NOT_LOCKED);
+ sctp_wakeup_the_read_socket(stcb->sctp_ep, stcb, SCTP_SO_NOT_LOCKED);
+ return (0);
+ } else {
+ return (1);
+ }
+}
+
+static void
+sctp_inject_old_unordered_data(struct sctp_tcb *stcb,
+ struct sctp_association *asoc,
+ struct sctp_queued_to_read *control,
+ struct sctp_tmit_chunk *chk,
+ int *abort_flag)
+{
+ struct sctp_tmit_chunk *at;
+ int inserted;
+
+ /*
+ * Here we need to place the chunk into the control structure sorted
+ * in the correct order.
+ */
+ if (chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) {
+ /* Its the very first one. */
+ SCTPDBG(SCTP_DEBUG_XXX,
+ "chunk is a first fsn: %u becomes fsn_included\n",
+ chk->rec.data.fsn_num);
+ if (control->first_frag_seen) {
+ /*
+ * In old un-ordered we can reassembly on one
+ * control multiple messages. As long as the next
+ * FIRST is greater then the old first (TSN i.e. FSN
+ * wise)
+ */
+ struct mbuf *tdata;
+ uint32_t tmp;
+
+ if (SCTP_TSN_GT(chk->rec.data.fsn_num, control->fsn_included)) {
+ /*
+ * Easy way the start of a new guy beyond
+ * the lowest
+ */
+ goto place_chunk;
+ }
+ if ((chk->rec.data.fsn_num == control->fsn_included) ||
+ (control->pdapi_started)) {
+ /*
+ * Ok this should not happen, if it does we
+ * started the pd-api on the higher TSN
+ * (since the equals part is a TSN failure
+ * it must be that).
+ *
+ * We are completly hosed in that case since I
+ * have no way to recover. This really will
+ * only happen if we can get more TSN's
+ * higher before the pd-api-point.
+ */
+ sctp_abort_in_reasm(stcb, control, chk,
+ abort_flag,
+ SCTP_FROM_SCTP_INDATA + SCTP_LOC_4);
+
+ return;
+ }
+ /*
+ * Ok we have two firsts and the one we just got is
+ * smaller than the one we previously placed.. yuck!
+ * We must swap them out.
+ */
+ /* swap the mbufs */
+ tdata = control->data;
+ control->data = chk->data;
+ chk->data = tdata;
+ /* Save the lengths */
+ chk->send_size = control->length;
+ /* Recompute length of control and tail pointer */
+ sctp_setup_tail_pointer(control);
+ /* Fix the FSN included */
+ tmp = control->fsn_included;
+ control->fsn_included = chk->rec.data.fsn_num;
+ chk->rec.data.fsn_num = tmp;
+ /* Fix the TSN included */
+ tmp = control->sinfo_tsn;
+ control->sinfo_tsn = chk->rec.data.TSN_seq;
+ chk->rec.data.TSN_seq = tmp;
+ /* Fix the PPID included */
+ tmp = control->sinfo_ppid;
+ control->sinfo_ppid = chk->rec.data.payloadtype;
+ chk->rec.data.payloadtype = tmp;
+ /* Fix tail pointer */
+ goto place_chunk;
+ }
+ control->first_frag_seen = 1;
+ control->top_fsn = control->fsn_included = chk->rec.data.fsn_num;
+ control->sinfo_tsn = chk->rec.data.TSN_seq;
+ control->sinfo_ppid = chk->rec.data.payloadtype;
+ control->data = chk->data;
+ sctp_mark_non_revokable(asoc, chk->rec.data.TSN_seq);
+ chk->data = NULL;
+ sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED);
+ sctp_setup_tail_pointer(control);
+ return;
+ }
+place_chunk:
+ inserted = 0;
+ TAILQ_FOREACH(at, &control->reasm, sctp_next) {
+ if (SCTP_TSN_GT(at->rec.data.fsn_num, chk->rec.data.fsn_num)) {
+ /*
+ * This one in queue is bigger than the new one,
+ * insert the new one before at.
+ */
+ asoc->size_on_reasm_queue += chk->send_size;
+ sctp_ucount_incr(asoc->cnt_on_reasm_queue);
+ inserted = 1;
+ TAILQ_INSERT_BEFORE(at, chk, sctp_next);
+ break;
+ } else if (at->rec.data.fsn_num == chk->rec.data.fsn_num) {
+ /*
+ * They sent a duplicate fsn number. This really
+ * should not happen since the FSN is a TSN and it
+ * should have been dropped earlier.
+ */
+ sctp_abort_in_reasm(stcb, control, chk,
+ abort_flag,
+ SCTP_FROM_SCTP_INDATA + SCTP_LOC_5);
+ return;
+ }
+ }
+ if (inserted == 0) {
+ /* Its at the end */
+ asoc->size_on_reasm_queue += chk->send_size;
+ sctp_ucount_incr(asoc->cnt_on_reasm_queue);
+ control->top_fsn = chk->rec.data.fsn_num;
+ TAILQ_INSERT_TAIL(&control->reasm, chk, sctp_next);
+ }
}
-/*
- * Returns two things: You get the total size of the deliverable parts of the
- * first fragmented message on the reassembly queue. And you get a 1 back if
- * all of the message is ready or a 0 back if the message is still incomplete
- */
static int
-sctp_is_all_msg_on_reasm(struct sctp_association *asoc, uint32_t * t_size)
+sctp_deliver_reasm_check(struct sctp_tcb *stcb, struct sctp_association *asoc,
+ struct sctp_stream_in *strm, int inp_read_lock_held)
{
- struct sctp_tmit_chunk *chk;
- uint32_t tsn;
+ /*
+ * Given a stream, strm, see if any of the SSN's on it that are
+ * fragmented are ready to deliver. If so go ahead and place them on
+ * the read queue. In so placing if we have hit the end, then we
+ * need to remove them from the stream's queue.
+ */
+ struct sctp_queued_to_read *control, *nctl = NULL;
+ uint32_t next_to_del;
+ uint32_t pd_point;
+ int ret = 0;
- *t_size = 0;
- chk = TAILQ_FIRST(&asoc->reasmqueue);
- if (chk == NULL) {
- /* nothing on the queue */
- return (0);
+ if (stcb->sctp_socket) {
+ pd_point = min(SCTP_SB_LIMIT_RCV(stcb->sctp_socket) >> SCTP_PARTIAL_DELIVERY_SHIFT,
+ stcb->sctp_ep->partial_delivery_point);
+ } else {
+ pd_point = stcb->sctp_ep->partial_delivery_point;
+ }
+ control = TAILQ_FIRST(&strm->uno_inqueue);
+
+ if ((control) &&
+ (asoc->idata_supported == 0)) {
+ /* Special handling needed for "old" data format */
+ if (sctp_handle_old_unordered_data(stcb, asoc, strm, control, pd_point, inp_read_lock_held)) {
+ goto done_un;
+ }
}
- if ((chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) == 0) {
- /* Not a first on the queue */
+ if (strm->pd_api_started) {
+ /* Can't add more */
return (0);
}
- tsn = chk->rec.data.TSN_seq;
- TAILQ_FOREACH(chk, &asoc->reasmqueue, sctp_next) {
- if (tsn != chk->rec.data.TSN_seq) {
- return (0);
+ while (control) {
+ SCTPDBG(SCTP_DEBUG_XXX, "Looking at control: %p e(%d) ssn: %u top_fsn: %u inc_fsn: %u -uo\n",
+ control, control->end_added, control->sinfo_ssn, control->top_fsn, control->fsn_included);
+ nctl = TAILQ_NEXT(control, next_instrm);
+ if (control->end_added) {
+ /* We just put the last bit on */
+ if (control->on_strm_q) {
+#ifdef INVARIANTS
+ if (control->on_strm_q != SCTP_ON_UNORDERED) {
+ panic("Huh control: %p on_q: %d -- not unordered?",
+ control, control->on_strm_q);
+ }
+#endif
+ SCTP_STAT_INCR_COUNTER64(sctps_reasmusrmsgs);
+ TAILQ_REMOVE(&strm->uno_inqueue, control, next_instrm);
+ control->on_strm_q = 0;
+ }
+ if (control->on_read_q == 0) {
+ sctp_add_to_readq(stcb->sctp_ep, stcb,
+ control,
+ &stcb->sctp_socket->so_rcv, control->end_added,
+ inp_read_lock_held, SCTP_SO_NOT_LOCKED);
+ }
+ } else {
+ /* Can we do a PD-API for this un-ordered guy? */
+ if ((control->length >= pd_point) && (strm->pd_api_started == 0)) {
+ strm->pd_api_started = 1;
+ control->pdapi_started = 1;
+ sctp_add_to_readq(stcb->sctp_ep, stcb,
+ control,
+ &stcb->sctp_socket->so_rcv, control->end_added,
+ inp_read_lock_held, SCTP_SO_NOT_LOCKED);
+
+ break;
+ }
}
- *t_size += chk->send_size;
- if (chk->rec.data.rcv_flags & SCTP_DATA_LAST_FRAG) {
- return (1);
+ control = nctl;
+ }
+done_un:
+ control = TAILQ_FIRST(&strm->inqueue);
+ if (strm->pd_api_started) {
+ /* Can't add more */
+ return (0);
+ }
+ if (control == NULL) {
+ return (ret);
+ }
+ if (strm->last_sequence_delivered == control->sinfo_ssn) {
+ /*
+ * Ok the guy at the top was being partially delivered
+ * completed, so we remove it. Note the pd_api flag was
+ * taken off when the chunk was merged on in
+ * sctp_queue_data_for_reasm below.
+ */
+ nctl = TAILQ_NEXT(control, next_instrm);
+ SCTPDBG(SCTP_DEBUG_XXX,
+ "Looking at control: %p e(%d) ssn: %u top_fsn: %u inc_fsn: %u (lastdel: %u)- o\n",
+ control, control->end_added, control->sinfo_ssn,
+ control->top_fsn, control->fsn_included,
+ strm->last_sequence_delivered);
+ if (control->end_added) {
+ if (control->on_strm_q) {
+#ifdef INVARIANTS
+ if (control->on_strm_q != SCTP_ON_ORDERED) {
+ panic("Huh control: %p on_q: %d -- not ordered?",
+ control, control->on_strm_q);
+ }
+#endif
+ SCTP_STAT_INCR_COUNTER64(sctps_reasmusrmsgs);
+ TAILQ_REMOVE(&strm->inqueue, control, next_instrm);
+ control->on_strm_q = 0;
+ }
+ if (strm->pd_api_started && control->pdapi_started) {
+ control->pdapi_started = 0;
+ strm->pd_api_started = 0;
+ }
+ if (control->on_read_q == 0) {
+ sctp_add_to_readq(stcb->sctp_ep, stcb,
+ control,
+ &stcb->sctp_socket->so_rcv, control->end_added,
+ inp_read_lock_held, SCTP_SO_NOT_LOCKED);
+ }
+ control = nctl;
}
- tsn++;
}
- return (0);
-}
-
-static void
-sctp_deliver_reasm_check(struct sctp_tcb *stcb, struct sctp_association *asoc)
-{
- struct sctp_tmit_chunk *chk;
- uint16_t nxt_todel;
- uint32_t tsize, pd_point;
-
-doit_again:
- chk = TAILQ_FIRST(&asoc->reasmqueue);
- if (chk == NULL) {
- /* Huh? */
- asoc->size_on_reasm_queue = 0;
- asoc->cnt_on_reasm_queue = 0;
- return;
+ if (strm->pd_api_started) {
+ /*
+ * Can't add more must have gotten an un-ordered above being
+ * partially delivered.
+ */
+ return (0);
}
- if (asoc->fragmented_delivery_inprogress == 0) {
- nxt_todel =
- asoc->strmin[chk->rec.data.stream_number].last_sequence_delivered + 1;
- if ((chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) &&
- (nxt_todel == chk->rec.data.stream_seq ||
- (chk->rec.data.rcv_flags & SCTP_DATA_UNORDERED))) {
- /*
- * Yep the first one is here and its ok to deliver
- * but should we?
- */
- if (stcb->sctp_socket) {
- pd_point = min(SCTP_SB_LIMIT_RCV(stcb->sctp_socket) >> SCTP_PARTIAL_DELIVERY_SHIFT,
- stcb->sctp_ep->partial_delivery_point);
- } else {
- pd_point = stcb->sctp_ep->partial_delivery_point;
+deliver_more:
+ next_to_del = strm->last_sequence_delivered + 1;
+ if (control) {
+ SCTPDBG(SCTP_DEBUG_XXX,
+ "Looking at control: %p e(%d) ssn: %u top_fsn: %u inc_fsn: %u (nxtdel: %u)- o\n",
+ control, control->end_added, control->sinfo_ssn, control->top_fsn, control->fsn_included,
+ next_to_del);
+ nctl = TAILQ_NEXT(control, next_instrm);
+ if ((control->sinfo_ssn == next_to_del) &&
+ (control->first_frag_seen)) {
+ int done;
+
+ /* Ok we can deliver it onto the stream. */
+ if (control->end_added) {
+ /* We are done with it afterwards */
+ if (control->on_strm_q) {
+#ifdef INVARIANTS
+ if (control->on_strm_q != SCTP_ON_ORDERED) {
+ panic("Huh control: %p on_q: %d -- not ordered?",
+ control, control->on_strm_q);
+ }
+#endif
+ SCTP_STAT_INCR_COUNTER64(sctps_reasmusrmsgs);
+ TAILQ_REMOVE(&strm->inqueue, control, next_instrm);
+ control->on_strm_q = 0;
+ }
+ ret++;
}
- if (sctp_is_all_msg_on_reasm(asoc, &tsize) || (tsize >= pd_point)) {
+ if (((control->sinfo_flags >> 8) & SCTP_DATA_NOT_FRAG) == SCTP_DATA_NOT_FRAG) {
+ /*
+ * A singleton now slipping through - mark
+ * it non-revokable too
+ */
+ sctp_mark_non_revokable(asoc, control->sinfo_tsn);
+ } else if (control->end_added == 0) {
/*
- * Yes, we setup to start reception, by
- * backing down the TSN just in case we
- * can't deliver. If we
+ * Check if we can defer adding until its
+ * all there
*/
- asoc->fragmented_delivery_inprogress = 1;
- asoc->tsn_last_delivered =
- chk->rec.data.TSN_seq - 1;
- asoc->str_of_pdapi =
- chk->rec.data.stream_number;
- asoc->ssn_of_pdapi = chk->rec.data.stream_seq;
- asoc->pdapi_ppid = chk->rec.data.payloadtype;
- asoc->fragment_flags = chk->rec.data.rcv_flags;
- sctp_service_reassembly(stcb, asoc);
+ if ((control->length < pd_point) || (strm->pd_api_started)) {
+ /*
+ * Don't need it or cannot add more
+ * (one being delivered that way)
+ */
+ goto out;
+ }
+ }
+ done = (control->end_added) && (control->last_frag_seen);
+ if (control->on_read_q == 0) {
+ sctp_add_to_readq(stcb->sctp_ep, stcb,
+ control,
+ &stcb->sctp_socket->so_rcv, control->end_added,
+ inp_read_lock_held, SCTP_SO_NOT_LOCKED);
+ }
+ strm->last_sequence_delivered = next_to_del;
+ if (done) {
+ control = nctl;
+ goto deliver_more;
+ } else {
+ /* We are now doing PD API */
+ strm->pd_api_started = 1;
+ control->pdapi_started = 1;
}
}
- } else {
+ }
+out:
+ return (ret);
+}
+
+
+void
+sctp_add_chk_to_control(struct sctp_queued_to_read *control,
+ struct sctp_stream_in *strm,
+ struct sctp_tcb *stcb, struct sctp_association *asoc,
+ struct sctp_tmit_chunk *chk, int hold_rlock)
+{
+ /*
+ * Given a control and a chunk, merge the data from the chk onto the
+ * control and free up the chunk resources.
+ */
+ int i_locked = 0;
+
+ if (control->on_read_q && (hold_rlock == 0)) {
/*
- * Service re-assembly will deliver stream data queued at
- * the end of fragmented delivery.. but it wont know to go
- * back and call itself again... we do that here with the
- * got doit_again
+ * Its being pd-api'd so we must do some locks.
*/
- sctp_service_reassembly(stcb, asoc);
- if (asoc->fragmented_delivery_inprogress == 0) {
- /*
- * finished our Fragmented delivery, could be more
- * waiting?
- */
- goto doit_again;
+ SCTP_INP_READ_LOCK(stcb->sctp_ep);
+ i_locked = 1;
+ }
+ if (control->data == NULL) {
+ control->data = chk->data;
+ sctp_setup_tail_pointer(control);
+ } else {
+ sctp_add_to_tail_pointer(control, chk->data);
+ }
+ control->fsn_included = chk->rec.data.fsn_num;
+ asoc->size_on_reasm_queue -= chk->send_size;
+ sctp_ucount_decr(asoc->cnt_on_reasm_queue);
+ sctp_mark_non_revokable(asoc, chk->rec.data.TSN_seq);
+ chk->data = NULL;
+ if (chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) {
+ control->first_frag_seen = 1;
+ }
+ if (chk->rec.data.rcv_flags & SCTP_DATA_LAST_FRAG) {
+ /* Its complete */
+ if ((control->on_strm_q) && (control->on_read_q)) {
+ if (control->pdapi_started) {
+ control->pdapi_started = 0;
+ strm->pd_api_started = 0;
+ }
+ if (control->on_strm_q == SCTP_ON_UNORDERED) {
+ /* Unordered */
+ TAILQ_REMOVE(&strm->uno_inqueue, control, next_instrm);
+ control->on_strm_q = 0;
+ } else if (control->on_strm_q == SCTP_ON_ORDERED) {
+ /* Ordered */
+ TAILQ_REMOVE(&strm->inqueue, control, next_instrm);
+ control->on_strm_q = 0;
+#ifdef INVARIANTS
+ } else if (control->on_strm_q) {
+ panic("Unknown state on ctrl: %p on_strm_q: %d", control,
+ control->on_strm_q);
+#endif
+ }
}
+ control->end_added = 1;
+ control->last_frag_seen = 1;
+ }
+ if (i_locked) {
+ SCTP_INP_READ_UNLOCK(stcb->sctp_ep);
}
+ sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED);
}
/*
@@ -831,462 +1284,361 @@ doit_again:
*/
static void
sctp_queue_data_for_reasm(struct sctp_tcb *stcb, struct sctp_association *asoc,
- struct sctp_tmit_chunk *chk, int *abort_flag)
+ struct sctp_stream_in *strm,
+ struct sctp_queued_to_read *control,
+ struct sctp_tmit_chunk *chk,
+ int created_control,
+ int *abort_flag, uint32_t tsn)
{
- struct mbuf *op_err;
- char msg[SCTP_DIAG_INFO_LEN];
- uint32_t cum_ackp1, prev_tsn, post_tsn;
- struct sctp_tmit_chunk *at, *prev, *next;
-
- prev = next = NULL;
- cum_ackp1 = asoc->tsn_last_delivered + 1;
- if (TAILQ_EMPTY(&asoc->reasmqueue)) {
- /* This is the first one on the queue */
- TAILQ_INSERT_HEAD(&asoc->reasmqueue, chk, sctp_next);
- /*
- * we do not check for delivery of anything when only one
- * fragment is here
- */
- asoc->size_on_reasm_queue = chk->send_size;
- sctp_ucount_incr(asoc->cnt_on_reasm_queue);
- if (chk->rec.data.TSN_seq == cum_ackp1) {
- if (asoc->fragmented_delivery_inprogress == 0 &&
- (chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) !=
- SCTP_DATA_FIRST_FRAG) {
- /*
- * An empty queue, no delivery inprogress,
- * we hit the next one and it does NOT have
- * a FIRST fragment mark.
- */
- SCTPDBG(SCTP_DEBUG_INDATA1, "Gak, Evil plot, its not first, no fragmented delivery in progress\n");
- snprintf(msg, sizeof(msg),
- "Expected B-bit for TSN=%8.8x, SID=%4.4x, SSN=%4.4x",
- chk->rec.data.TSN_seq,
- chk->rec.data.stream_number,
- chk->rec.data.stream_seq);
- op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg);
- stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_2;
- sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED);
- *abort_flag = 1;
- } else if (asoc->fragmented_delivery_inprogress &&
- (chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) == SCTP_DATA_FIRST_FRAG) {
- /*
- * We are doing a partial delivery and the
- * NEXT chunk MUST be either the LAST or
- * MIDDLE fragment NOT a FIRST
- */
- SCTPDBG(SCTP_DEBUG_INDATA1, "Gak, Evil plot, it IS a first and fragmented delivery in progress\n");
- snprintf(msg, sizeof(msg),
- "Didn't expect B-bit for TSN=%8.8x, SID=%4.4x, SSN=%4.4x",
- chk->rec.data.TSN_seq,
- chk->rec.data.stream_number,
- chk->rec.data.stream_seq);
- op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg);
- stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_3;
- sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED);
- *abort_flag = 1;
- } else if (asoc->fragmented_delivery_inprogress) {
- /*
- * Here we are ok with a MIDDLE or LAST
- * piece
- */
- if (chk->rec.data.stream_number !=
- asoc->str_of_pdapi) {
- /* Got to be the right STR No */
- SCTPDBG(SCTP_DEBUG_INDATA1, "Gak, Evil plot, it IS not same stream number %d vs %d\n",
- chk->rec.data.stream_number,
- asoc->str_of_pdapi);
- snprintf(msg, sizeof(msg),
- "Expected SID=%4.4x, got TSN=%8.8x, SID=%4.4x, SSN=%4.4x",
- asoc->str_of_pdapi,
- chk->rec.data.TSN_seq,
- chk->rec.data.stream_number,
- chk->rec.data.stream_seq);
- op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg);
- stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_4;
- sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED);
- *abort_flag = 1;
- } else if ((asoc->fragment_flags & SCTP_DATA_UNORDERED) !=
- SCTP_DATA_UNORDERED &&
- chk->rec.data.stream_seq != asoc->ssn_of_pdapi) {
- /* Got to be the right STR Seq */
- SCTPDBG(SCTP_DEBUG_INDATA1, "Gak, Evil plot, it IS not same stream seq %d vs %d\n",
- chk->rec.data.stream_seq,
- asoc->ssn_of_pdapi);
- snprintf(msg, sizeof(msg),
- "Expected SSN=%4.4x, got TSN=%8.8x, SID=%4.4x, SSN=%4.4x",
- asoc->ssn_of_pdapi,
- chk->rec.data.TSN_seq,
- chk->rec.data.stream_number,
- chk->rec.data.stream_seq);
- op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg);
- stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_5;
- sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED);
- *abort_flag = 1;
- }
- }
+ uint32_t next_fsn;
+ struct sctp_tmit_chunk *at, *nat;
+ int do_wakeup, unordered;
+
+ /*
+ * For old un-ordered data chunks.
+ */
+ if ((control->sinfo_flags >> 8) & SCTP_DATA_UNORDERED) {
+ unordered = 1;
+ } else {
+ unordered = 0;
+ }
+ /* Must be added to the stream-in queue */
+ if (created_control) {
+ if (sctp_place_control_in_stream(strm, asoc, control)) {
+ /* Duplicate SSN? */
+ sctp_clean_up_control(stcb, control);
+ sctp_abort_in_reasm(stcb, control, chk,
+ abort_flag,
+ SCTP_FROM_SCTP_INDATA + SCTP_LOC_6);
+ return;
}
- return;
- }
- /* Find its place */
- TAILQ_FOREACH(at, &asoc->reasmqueue, sctp_next) {
- if (SCTP_TSN_GT(at->rec.data.TSN_seq, chk->rec.data.TSN_seq)) {
- /*
- * one in queue is bigger than the new one, insert
- * before this one
- */
- /* A check */
- asoc->size_on_reasm_queue += chk->send_size;
- sctp_ucount_incr(asoc->cnt_on_reasm_queue);
- next = at;
- TAILQ_INSERT_BEFORE(at, chk, sctp_next);
- break;
- } else if (at->rec.data.TSN_seq == chk->rec.data.TSN_seq) {
- /* Gak, He sent me a duplicate str seq number */
+ if ((tsn == (asoc->cumulative_tsn + 1) && (asoc->idata_supported == 0))) {
/*
- * foo bar, I guess I will just free this new guy,
- * should we abort too? FIX ME MAYBE? Or it COULD be
- * that the SSN's have wrapped. Maybe I should
- * compare to TSN somehow... sigh for now just blow
- * away the chunk!
+ * Ok we created this control and now lets validate
+ * that its legal i.e. there is a B bit set, if not
+ * and we have up to the cum-ack then its invalid.
*/
- if (chk->data) {
- sctp_m_freem(chk->data);
- chk->data = NULL;
- }
- sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED);
- return;
- } else {
- prev = at;
- if (TAILQ_NEXT(at, sctp_next) == NULL) {
- /*
- * We are at the end, insert it after this
- * one
- */
- /* check it first */
- asoc->size_on_reasm_queue += chk->send_size;
- sctp_ucount_incr(asoc->cnt_on_reasm_queue);
- TAILQ_INSERT_AFTER(&asoc->reasmqueue, at, chk, sctp_next);
- break;
+ if ((chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) == 0) {
+ sctp_abort_in_reasm(stcb, control, chk,
+ abort_flag,
+ SCTP_FROM_SCTP_INDATA + SCTP_LOC_7);
+ return;
}
}
}
- /* Now the audits */
- if (prev) {
- prev_tsn = chk->rec.data.TSN_seq - 1;
- if (prev_tsn == prev->rec.data.TSN_seq) {
+ if ((asoc->idata_supported == 0) && (unordered == 1)) {
+ sctp_inject_old_unordered_data(stcb, asoc, control, chk, abort_flag);
+ return;
+ }
+ /*
+ * Ok we must queue the chunk into the reasembly portion: o if its
+ * the first it goes to the control mbuf. o if its not first but the
+ * next in sequence it goes to the control, and each succeeding one
+ * in order also goes. o if its not in order we place it on the list
+ * in its place.
+ */
+ if (chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) {
+ /* Its the very first one. */
+ SCTPDBG(SCTP_DEBUG_XXX,
+ "chunk is a first fsn: %u becomes fsn_included\n",
+ chk->rec.data.fsn_num);
+ if (control->first_frag_seen) {
/*
- * Ok the one I am dropping onto the end is the
- * NEXT. A bit of valdiation here.
+ * Error on senders part, they either sent us two
+ * data chunks with FIRST, or they sent two
+ * un-ordered chunks that were fragmented at the
+ * same time in the same stream.
*/
- if ((prev->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK) ==
- SCTP_DATA_FIRST_FRAG ||
- (prev->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK) ==
- SCTP_DATA_MIDDLE_FRAG) {
+ sctp_abort_in_reasm(stcb, control, chk,
+ abort_flag,
+ SCTP_FROM_SCTP_INDATA + SCTP_LOC_8);
+ return;
+ }
+ control->first_frag_seen = 1;
+ control->fsn_included = chk->rec.data.fsn_num;
+ control->data = chk->data;
+ sctp_mark_non_revokable(asoc, chk->rec.data.TSN_seq);
+ chk->data = NULL;
+ sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED);
+ sctp_setup_tail_pointer(control);
+ } else {
+ /* Place the chunk in our list */
+ int inserted = 0;
+
+ if (control->last_frag_seen == 0) {
+ /* Still willing to raise highest FSN seen */
+ if (SCTP_TSN_GT(chk->rec.data.fsn_num, control->top_fsn)) {
+ SCTPDBG(SCTP_DEBUG_XXX,
+ "We have a new top_fsn: %u\n",
+ chk->rec.data.fsn_num);
+ control->top_fsn = chk->rec.data.fsn_num;
+ }
+ if (chk->rec.data.rcv_flags & SCTP_DATA_LAST_FRAG) {
+ SCTPDBG(SCTP_DEBUG_XXX,
+ "The last fsn is now in place fsn: %u\n",
+ chk->rec.data.fsn_num);
+ control->last_frag_seen = 1;
+ }
+ if (asoc->idata_supported || control->first_frag_seen) {
/*
- * Insert chk MUST be a MIDDLE or LAST
- * fragment
+ * For IDATA we always check since we know
+ * that the first fragment is 0. For old
+ * DATA we have to receive the first before
+ * we know the first FSN (which is the TSN).
*/
- if ((chk->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK) ==
- SCTP_DATA_FIRST_FRAG) {
- SCTPDBG(SCTP_DEBUG_INDATA1, "Prev check - It can be a midlle or last but not a first\n");
- SCTPDBG(SCTP_DEBUG_INDATA1, "Gak, Evil plot, it's a FIRST!\n");
- snprintf(msg, sizeof(msg),
- "Can't handle B-bit, got TSN=%8.8x, SID=%4.4x, SSN=%4.4x",
- chk->rec.data.TSN_seq,
- chk->rec.data.stream_number,
- chk->rec.data.stream_seq);
- op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg);
- stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_6;
- sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED);
- *abort_flag = 1;
- return;
- }
- if (chk->rec.data.stream_number !=
- prev->rec.data.stream_number) {
- /*
- * Huh, need the correct STR here,
- * they must be the same.
- */
- SCTPDBG(SCTP_DEBUG_INDATA1, "Prev check - Gak, Evil plot, sid:%d not the same as at:%d\n",
- chk->rec.data.stream_number,
- prev->rec.data.stream_number);
- snprintf(msg, sizeof(msg),
- "Expect SID=%4.4x, got TSN=%8.8x, SID=%4.4x, SSN=%4.4x",
- prev->rec.data.stream_number,
- chk->rec.data.TSN_seq,
- chk->rec.data.stream_number,
- chk->rec.data.stream_seq);
- op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg);
- stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_7;
- sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED);
- *abort_flag = 1;
- return;
- }
- if ((chk->rec.data.rcv_flags & SCTP_DATA_UNORDERED) !=
- (prev->rec.data.rcv_flags & SCTP_DATA_UNORDERED)) {
+ if (SCTP_TSN_GE(control->fsn_included, chk->rec.data.fsn_num)) {
/*
- * Huh, need the same ordering here,
- * they must be the same.
+ * We have already delivered up to
+ * this so its a dup
*/
- SCTPDBG(SCTP_DEBUG_INDATA1, "Prev check - Gak, Evil plot, U-bit not constant\n");
- snprintf(msg, sizeof(msg),
- "Expect U-bit=%d for TSN=%8.8x, got U-bit=%d",
- (prev->rec.data.rcv_flags & SCTP_DATA_UNORDERED) ? 1 : 0,
- chk->rec.data.TSN_seq,
- (chk->rec.data.rcv_flags & SCTP_DATA_UNORDERED) ? 1 : 0);
- op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg);
- stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_7;
- sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED);
- *abort_flag = 1;
+ sctp_abort_in_reasm(stcb, control, chk,
+ abort_flag,
+ SCTP_FROM_SCTP_INDATA + SCTP_LOC_9);
return;
}
- if ((prev->rec.data.rcv_flags & SCTP_DATA_UNORDERED) == 0 &&
- chk->rec.data.stream_seq !=
- prev->rec.data.stream_seq) {
+ }
+ } else {
+ if (chk->rec.data.rcv_flags & SCTP_DATA_LAST_FRAG) {
+ /* Second last? huh? */
+ SCTPDBG(SCTP_DEBUG_XXX,
+ "Duplicate last fsn: %u (top: %u) -- abort\n",
+ chk->rec.data.fsn_num, control->top_fsn);
+ sctp_abort_in_reasm(stcb, control,
+ chk, abort_flag,
+ SCTP_FROM_SCTP_INDATA + SCTP_LOC_10);
+ return;
+ }
+ if (asoc->idata_supported || control->first_frag_seen) {
+ /*
+ * For IDATA we always check since we know
+ * that the first fragment is 0. For old
+ * DATA we have to receive the first before
+ * we know the first FSN (which is the TSN).
+ */
+
+ if (SCTP_TSN_GE(control->fsn_included, chk->rec.data.fsn_num)) {
/*
- * Huh, need the correct STR here,
- * they must be the same.
+ * We have already delivered up to
+ * this so its a dup
*/
- SCTPDBG(SCTP_DEBUG_INDATA1, "Prev check - Gak, Evil plot, sseq:%d not the same as at:%d\n",
- chk->rec.data.stream_seq,
- prev->rec.data.stream_seq);
- snprintf(msg, sizeof(msg),
- "Expect SSN=%4.4x, got TSN=%8.8x, SID=%4.4x, SSN=%4.4x",
- prev->rec.data.stream_seq,
- chk->rec.data.TSN_seq,
- chk->rec.data.stream_number,
- chk->rec.data.stream_seq);
- op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg);
- stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_8;
- sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED);
- *abort_flag = 1;
- return;
- }
- } else if ((prev->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK) ==
- SCTP_DATA_LAST_FRAG) {
- /* Insert chk MUST be a FIRST */
- if ((chk->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK) !=
- SCTP_DATA_FIRST_FRAG) {
- SCTPDBG(SCTP_DEBUG_INDATA1, "Prev check - Gak, evil plot, its not FIRST and it must be!\n");
- snprintf(msg, sizeof(msg),
- "Expect B-bit, got TSN=%8.8x, SID=%4.4x, SSN=%4.4x",
- chk->rec.data.TSN_seq,
- chk->rec.data.stream_number,
- chk->rec.data.stream_seq);
- op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg);
- stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_9;
- sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED);
- *abort_flag = 1;
+ SCTPDBG(SCTP_DEBUG_XXX,
+ "New fsn: %u is already seen in included_fsn: %u -- abort\n",
+ chk->rec.data.fsn_num, control->fsn_included);
+ sctp_abort_in_reasm(stcb, control, chk,
+ abort_flag,
+ SCTP_FROM_SCTP_INDATA + SCTP_LOC_11);
return;
}
}
- }
- }
- if (next) {
- post_tsn = chk->rec.data.TSN_seq + 1;
- if (post_tsn == next->rec.data.TSN_seq) {
/*
- * Ok the one I am inserting ahead of is my NEXT
- * one. A bit of valdiation here.
+ * validate not beyond top FSN if we have seen last
+ * one
*/
- if (next->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) {
- /* Insert chk MUST be a last fragment */
- if ((chk->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK)
- != SCTP_DATA_LAST_FRAG) {
- SCTPDBG(SCTP_DEBUG_INDATA1, "Next chk - Next is FIRST, we must be LAST\n");
- SCTPDBG(SCTP_DEBUG_INDATA1, "Gak, Evil plot, its not a last!\n");
- snprintf(msg, sizeof(msg),
- "Expect only E-bit, got TSN=%8.8x, SID=%4.4x, SSN=%4.4x",
- chk->rec.data.TSN_seq,
- chk->rec.data.stream_number,
- chk->rec.data.stream_seq);
- op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg);
- stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_10;
- sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED);
- *abort_flag = 1;
- return;
- }
- } else if ((next->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK) ==
- SCTP_DATA_MIDDLE_FRAG ||
- (next->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK) ==
- SCTP_DATA_LAST_FRAG) {
+ if (SCTP_TSN_GT(chk->rec.data.fsn_num, control->top_fsn)) {
+ SCTPDBG(SCTP_DEBUG_XXX,
+ "New fsn: %u is beyond or at top_fsn: %u -- abort\n",
+ chk->rec.data.fsn_num,
+ control->top_fsn);
+ sctp_abort_in_reasm(stcb, control, chk,
+ abort_flag,
+ SCTP_FROM_SCTP_INDATA + SCTP_LOC_12);
+ return;
+ }
+ }
+ /*
+ * If we reach here, we need to place the new chunk in the
+ * reassembly for this control.
+ */
+ SCTPDBG(SCTP_DEBUG_XXX,
+ "chunk is a not first fsn: %u needs to be inserted\n",
+ chk->rec.data.fsn_num);
+ TAILQ_FOREACH(at, &control->reasm, sctp_next) {
+ if (SCTP_TSN_GT(at->rec.data.fsn_num, chk->rec.data.fsn_num)) {
/*
- * Insert chk CAN be MIDDLE or FIRST NOT
- * LAST
+ * This one in queue is bigger than the new
+ * one, insert the new one before at.
*/
- if ((chk->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK) ==
- SCTP_DATA_LAST_FRAG) {
- SCTPDBG(SCTP_DEBUG_INDATA1, "Next chk - Next is a MIDDLE/LAST\n");
- SCTPDBG(SCTP_DEBUG_INDATA1, "Gak, Evil plot, new prev chunk is a LAST\n");
- snprintf(msg, sizeof(msg),
- "Didn't expect E-bit, got TSN=%8.8x, SID=%4.4x, SSN=%4.4x",
- chk->rec.data.TSN_seq,
- chk->rec.data.stream_number,
- chk->rec.data.stream_seq);
- op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg);
- stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_11;
- sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED);
- *abort_flag = 1;
- return;
- }
- if (chk->rec.data.stream_number !=
- next->rec.data.stream_number) {
- /*
- * Huh, need the correct STR here,
- * they must be the same.
- */
- SCTPDBG(SCTP_DEBUG_INDATA1, "Next chk - Gak, Evil plot, ssn:%d not the same as at:%d\n",
- chk->rec.data.stream_number,
- next->rec.data.stream_number);
- snprintf(msg, sizeof(msg),
- "Required SID %4.4x, got TSN=%8.8x, SID=%4.4x, SSN=%4.4x",
- next->rec.data.stream_number,
- chk->rec.data.TSN_seq,
- chk->rec.data.stream_number,
- chk->rec.data.stream_seq);
- op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg);
- stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_12;
- sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED);
- *abort_flag = 1;
- return;
- }
- if ((chk->rec.data.rcv_flags & SCTP_DATA_UNORDERED) !=
- (next->rec.data.rcv_flags & SCTP_DATA_UNORDERED)) {
- /*
- * Huh, need the same ordering here,
- * they must be the same.
- */
- SCTPDBG(SCTP_DEBUG_INDATA1, "Next check - Gak, Evil plot, U-bit not constant\n");
- snprintf(msg, sizeof(msg),
- "Expect U-bit=%d for TSN=%8.8x, got U-bit=%d",
- (next->rec.data.rcv_flags & SCTP_DATA_UNORDERED) ? 1 : 0,
- chk->rec.data.TSN_seq,
- (chk->rec.data.rcv_flags & SCTP_DATA_UNORDERED) ? 1 : 0);
- op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg);
- stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_12;
- sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED);
- *abort_flag = 1;
- return;
+ SCTPDBG(SCTP_DEBUG_XXX,
+ "Insert it before fsn: %u\n",
+ at->rec.data.fsn_num);
+ asoc->size_on_reasm_queue += chk->send_size;
+ sctp_ucount_incr(asoc->cnt_on_reasm_queue);
+ TAILQ_INSERT_BEFORE(at, chk, sctp_next);
+ inserted = 1;
+ break;
+ } else if (at->rec.data.fsn_num == chk->rec.data.fsn_num) {
+ /*
+ * Gak, He sent me a duplicate str seq
+ * number
+ */
+ /*
+ * foo bar, I guess I will just free this
+ * new guy, should we abort too? FIX ME
+ * MAYBE? Or it COULD be that the SSN's have
+ * wrapped. Maybe I should compare to TSN
+ * somehow... sigh for now just blow away
+ * the chunk!
+ */
+ SCTPDBG(SCTP_DEBUG_XXX,
+ "Duplicate to fsn: %u -- abort\n",
+ at->rec.data.fsn_num);
+ sctp_abort_in_reasm(stcb, control,
+ chk, abort_flag,
+ SCTP_FROM_SCTP_INDATA + SCTP_LOC_13);
+ return;
+ }
+ }
+ if (inserted == 0) {
+ /* Goes on the end */
+ SCTPDBG(SCTP_DEBUG_XXX, "Inserting at tail of list fsn: %u\n",
+ chk->rec.data.fsn_num);
+ asoc->size_on_reasm_queue += chk->send_size;
+ sctp_ucount_incr(asoc->cnt_on_reasm_queue);
+ TAILQ_INSERT_TAIL(&control->reasm, chk, sctp_next);
+ }
+ }
+ /*
+ * Ok lets see if we can suck any up into the control structure that
+ * are in seq if it makes sense.
+ */
+ do_wakeup = 0;
+ /*
+ * If the first fragment has not been seen there is no sense in
+ * looking.
+ */
+ if (control->first_frag_seen) {
+ next_fsn = control->fsn_included + 1;
+ TAILQ_FOREACH_SAFE(at, &control->reasm, sctp_next, nat) {
+ if (at->rec.data.fsn_num == next_fsn) {
+ /* We can add this one now to the control */
+ SCTPDBG(SCTP_DEBUG_XXX,
+ "Adding more to control: %p at: %p fsn: %u next_fsn: %u included: %u\n",
+ control, at,
+ at->rec.data.fsn_num,
+ next_fsn, control->fsn_included);
+ TAILQ_REMOVE(&control->reasm, at, sctp_next);
+ sctp_add_chk_to_control(control, strm, stcb, asoc, at, SCTP_READ_LOCK_NOT_HELD);
+ if (control->on_read_q) {
+ do_wakeup = 1;
}
- if ((next->rec.data.rcv_flags & SCTP_DATA_UNORDERED) == 0 &&
- chk->rec.data.stream_seq !=
- next->rec.data.stream_seq) {
- /*
- * Huh, need the correct STR here,
- * they must be the same.
- */
- SCTPDBG(SCTP_DEBUG_INDATA1, "Next chk - Gak, Evil plot, sseq:%d not the same as at:%d\n",
- chk->rec.data.stream_seq,
- next->rec.data.stream_seq);
- snprintf(msg, sizeof(msg),
- "Required SSN %4.4x, got TSN=%8.8x, SID=%4.4x, SSN=%4.4x",
- next->rec.data.stream_seq,
- chk->rec.data.TSN_seq,
- chk->rec.data.stream_number,
- chk->rec.data.stream_seq);
- op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg);
- stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_13;
- sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED);
- *abort_flag = 1;
- return;
+ next_fsn++;
+ if (control->end_added && control->pdapi_started) {
+ if (strm->pd_api_started) {
+ strm->pd_api_started = 0;
+ control->pdapi_started = 0;
+ }
+ if (control->on_read_q == 0) {
+ sctp_add_to_readq(stcb->sctp_ep, stcb,
+ control,
+ &stcb->sctp_socket->so_rcv, control->end_added,
+ SCTP_READ_LOCK_NOT_HELD, SCTP_SO_NOT_LOCKED);
+ do_wakeup = 1;
+ }
+ break;
}
+ } else {
+ break;
}
}
}
- /* Do we need to do some delivery? check */
- sctp_deliver_reasm_check(stcb, asoc);
+ if (do_wakeup) {
+ /* Need to wakeup the reader */
+ sctp_wakeup_the_read_socket(stcb->sctp_ep, stcb, SCTP_SO_NOT_LOCKED);
+ }
}
-/*
- * This is an unfortunate routine. It checks to make sure a evil guy is not
- * stuffing us full of bad packet fragments. A broken peer could also do this
- * but this is doubtful. It is to bad I must worry about evil crackers sigh
- * :< more cycles.
- */
-static int
-sctp_does_tsn_belong_to_reasm(struct sctp_association *asoc,
- uint32_t TSN_seq)
+static struct sctp_queued_to_read *
+sctp_find_reasm_entry(struct sctp_stream_in *strm, uint32_t msg_id, int ordered, int old)
{
- struct sctp_tmit_chunk *at;
- uint32_t tsn_est;
-
- TAILQ_FOREACH(at, &asoc->reasmqueue, sctp_next) {
- if (SCTP_TSN_GT(TSN_seq, at->rec.data.TSN_seq)) {
- /* is it one bigger? */
- tsn_est = at->rec.data.TSN_seq + 1;
- if (tsn_est == TSN_seq) {
- /* yep. It better be a last then */
- if ((at->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK) !=
- SCTP_DATA_LAST_FRAG) {
- /*
- * Ok this guy belongs next to a guy
- * that is NOT last, it should be a
- * middle/last, not a complete
- * chunk.
- */
- return (1);
- } else {
- /*
- * This guy is ok since its a LAST
- * and the new chunk is a fully
- * self- contained one.
- */
- return (0);
- }
+ struct sctp_queued_to_read *control;
+
+ if (ordered) {
+ TAILQ_FOREACH(control, &strm->inqueue, next_instrm) {
+ if (control->msg_id == msg_id) {
+ break;
}
- } else if (TSN_seq == at->rec.data.TSN_seq) {
- /* Software error since I have a dup? */
- return (1);
- } else {
- /*
- * Ok, 'at' is larger than new chunk but does it
- * need to be right before it.
- */
- tsn_est = TSN_seq + 1;
- if (tsn_est == at->rec.data.TSN_seq) {
- /* Yep, It better be a first */
- if ((at->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK) !=
- SCTP_DATA_FIRST_FRAG) {
- return (1);
- } else {
- return (0);
- }
+ }
+ } else {
+ if (old) {
+ control = TAILQ_FIRST(&strm->uno_inqueue);
+ return (control);
+ }
+ TAILQ_FOREACH(control, &strm->uno_inqueue, next_instrm) {
+ if (control->msg_id == msg_id) {
+ break;
}
}
}
- return (0);
+ return (control);
}
static int
sctp_process_a_data_chunk(struct sctp_tcb *stcb, struct sctp_association *asoc,
- struct mbuf **m, int offset, struct sctp_data_chunk *ch, int chk_length,
+ struct mbuf **m, int offset, int chk_length,
struct sctp_nets *net, uint32_t * high_tsn, int *abort_flag,
- int *break_flag, int last_chunk)
+ int *break_flag, int last_chunk, uint8_t chtype)
{
/* Process a data chunk */
/* struct sctp_tmit_chunk *chk; */
+ struct sctp_data_chunk *ch;
+ struct sctp_idata_chunk *nch, chunk_buf;
struct sctp_tmit_chunk *chk;
- uint32_t tsn, gap;
+ uint32_t tsn, fsn, gap, msg_id;
struct mbuf *dmbuf;
int the_len;
int need_reasm_check = 0;
- uint16_t strmno, strmseq;
+ uint16_t strmno;
struct mbuf *op_err;
char msg[SCTP_DIAG_INFO_LEN];
- struct sctp_queued_to_read *control;
- int ordered;
+ struct sctp_queued_to_read *control = NULL;
uint32_t protocol_id;
uint8_t chunk_flags;
struct sctp_stream_reset_list *liste;
+ struct sctp_stream_in *strm;
+ int ordered;
+ size_t clen;
+ int created_control = 0;
+ uint8_t old_data;
chk = NULL;
- tsn = ntohl(ch->dp.tsn);
+ if (chtype == SCTP_IDATA) {
+ nch = (struct sctp_idata_chunk *)sctp_m_getptr(*m, offset,
+ sizeof(struct sctp_idata_chunk), (uint8_t *) & chunk_buf);
+ ch = (struct sctp_data_chunk *)nch;
+ clen = sizeof(struct sctp_idata_chunk);
+ tsn = ntohl(ch->dp.tsn);
+ msg_id = ntohl(nch->dp.msg_id);
+ protocol_id = nch->dp.ppid_fsn.protocol_id;
+ if (ch->ch.chunk_flags & SCTP_DATA_FIRST_FRAG)
+ fsn = 0;
+ else
+ fsn = ntohl(nch->dp.ppid_fsn.fsn);
+ old_data = 0;
+ } else {
+ ch = (struct sctp_data_chunk *)sctp_m_getptr(*m, offset,
+ sizeof(struct sctp_data_chunk), (uint8_t *) & chunk_buf);
+ tsn = ntohl(ch->dp.tsn);
+ protocol_id = ch->dp.protocol_id;
+ clen = sizeof(struct sctp_data_chunk);
+ fsn = tsn;
+ msg_id = (uint32_t) (ntohs(ch->dp.stream_sequence));
+ nch = NULL;
+ old_data = 1;
+ }
chunk_flags = ch->ch.chunk_flags;
+ if ((size_t)chk_length == clen) {
+ /*
+ * Need to send an abort since we had an empty data chunk.
+ */
+ op_err = sctp_generate_no_user_data_cause(ch->dp.tsn);
+ stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_14;
+ sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED);
+ *abort_flag = 1;
+ return (0);
+ }
if ((chunk_flags & SCTP_DATA_SACK_IMMEDIATELY) == SCTP_DATA_SACK_IMMEDIATELY) {
asoc->send_sack = 1;
}
- protocol_id = ch->dp.protocol_id;
ordered = ((chunk_flags & SCTP_DATA_UNORDERED) == 0);
if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MAP_LOGGING_ENABLE) {
sctp_log_map(tsn, asoc->cumulative_tsn, asoc->highest_tsn_inside_map, SCTP_MAP_TSN_ENTERS);
@@ -1356,6 +1708,117 @@ sctp_process_a_data_chunk(struct sctp_tcb *stcb, struct sctp_association *asoc,
* for on a partial delivery API.
*/
+ /* Is the stream valid? */
+ strmno = ntohs(ch->dp.stream_id);
+
+ if (strmno >= asoc->streamincnt) {
+ struct sctp_error_invalid_stream *cause;
+
+ op_err = sctp_get_mbuf_for_msg(sizeof(struct sctp_error_invalid_stream),
+ 0, M_NOWAIT, 1, MT_DATA);
+ if (op_err != NULL) {
+ /* add some space up front so prepend will work well */
+ SCTP_BUF_RESV_UF(op_err, sizeof(struct sctp_chunkhdr));
+ cause = mtod(op_err, struct sctp_error_invalid_stream *);
+ /*
+ * Error causes are just param's and this one has
+ * two back to back phdr, one with the error type
+ * and size, the other with the streamid and a rsvd
+ */
+ SCTP_BUF_LEN(op_err) = sizeof(struct sctp_error_invalid_stream);
+ cause->cause.code = htons(SCTP_CAUSE_INVALID_STREAM);
+ cause->cause.length = htons(sizeof(struct sctp_error_invalid_stream));
+ cause->stream_id = ch->dp.stream_id;
+ cause->reserved = htons(0);
+ sctp_queue_op_err(stcb, op_err);
+ }
+ SCTP_STAT_INCR(sctps_badsid);
+ SCTP_TCB_LOCK_ASSERT(stcb);
+ SCTP_SET_TSN_PRESENT(asoc->nr_mapping_array, gap);
+ if (SCTP_TSN_GT(tsn, asoc->highest_tsn_inside_nr_map)) {
+ asoc->highest_tsn_inside_nr_map = tsn;
+ }
+ if (tsn == (asoc->cumulative_tsn + 1)) {
+ /* Update cum-ack */
+ asoc->cumulative_tsn = tsn;
+ }
+ return (0);
+ }
+ strm = &asoc->strmin[strmno];
+ /*
+ * If it's a fragmented message, let's see if we can find the control
+ * on the reassembly queues.
+ */
+ if ((chtype == SCTP_IDATA) &&
+ ((chunk_flags & SCTP_DATA_FIRST_FRAG) == 0) &&
+ (fsn == 0)) {
+ /*
+ * The first *must* be fsn 0, and other (middle/end) pieces
+ * can *not* be fsn 0. XXX: This can happen in case of a
+ * wrap around. Ignore it for now.
+ */
+ snprintf(msg, sizeof(msg), "FSN zero for MID=%8.8x, but flags=%2.2x",
+ msg_id, chunk_flags);
+ goto err_out;
+ }
+ control = sctp_find_reasm_entry(strm, msg_id, ordered, old_data);
+ SCTPDBG(SCTP_DEBUG_XXX, "chunk_flags:0x%x look for control on queues %p\n",
+ chunk_flags, control);
+ if ((chunk_flags & SCTP_DATA_NOT_FRAG) != SCTP_DATA_NOT_FRAG) {
+ /* See if we can find the re-assembly entity */
+ if (control != NULL) {
+ /* We found something, does it belong? */
+ if (ordered && (msg_id != control->sinfo_ssn)) {
+ snprintf(msg, sizeof(msg), "Reassembly problem (MID=%8.8x)", msg_id);
+ err_out:
+ op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg);
+ stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_15;
+ sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED);
+ *abort_flag = 1;
+ return (0);
+ }
+ if (ordered && ((control->sinfo_flags >> 8) & SCTP_DATA_UNORDERED)) {
+ /*
+ * We can't have a switched order with an
+ * unordered chunk
+ */
+ snprintf(msg, sizeof(msg), "All fragments of a user message must be ordered or unordered (TSN=%8.8x)",
+ tsn);
+ goto err_out;
+ }
+ if (!ordered && (((control->sinfo_flags >> 8) & SCTP_DATA_UNORDERED) == 0)) {
+ /*
+ * We can't have a switched unordered with an
+ * ordered chunk
+ */
+ snprintf(msg, sizeof(msg), "All fragments of a user message must be ordered or unordered (TSN=%8.8x)",
+ tsn);
+ goto err_out;
+ }
+ }
+ } else {
+ /*
+ * It's a complete segment. Let's validate we don't have a
+ * re-assembly going on with the same Stream/Seq (for
+ * ordered) or in the same Stream for unordered.
+ */
+ if (control != NULL) {
+ if (ordered || (old_data == 0)) {
+ SCTPDBG(SCTP_DEBUG_XXX, "chunk_flags: 0x%x dup detected on msg_id: %u\n",
+ chunk_flags, msg_id);
+ snprintf(msg, sizeof(msg), "Duplicate MID=%8.8x detected.", msg_id);
+ goto err_out;
+ } else {
+ if ((tsn == control->fsn_included + 1) &&
+ (control->end_added == 0)) {
+ snprintf(msg, sizeof(msg), "Illegal message sequence, missing end for MID: %8.8x", control->fsn_included);
+ goto err_out;
+ } else {
+ control = NULL;
+ }
+ }
+ }
+ }
/* now do the tests */
if (((asoc->cnt_on_all_streams +
asoc->cnt_on_reasm_queue +
@@ -1388,68 +1851,31 @@ sctp_process_a_data_chunk(struct sctp_tcb *stcb, struct sctp_association *asoc,
#endif
}
/* now is it in the mapping array of what we have accepted? */
- if (SCTP_TSN_GT(tsn, asoc->highest_tsn_inside_map) &&
- SCTP_TSN_GT(tsn, asoc->highest_tsn_inside_nr_map)) {
- /* Nope not in the valid range dump it */
- sctp_set_rwnd(stcb, asoc);
- if ((asoc->cnt_on_all_streams +
- asoc->cnt_on_reasm_queue +
- asoc->cnt_msg_on_sb) >= SCTP_BASE_SYSCTL(sctp_max_chunks_on_queue)) {
- SCTP_STAT_INCR(sctps_datadropchklmt);
- } else {
- SCTP_STAT_INCR(sctps_datadroprwnd);
+ if (nch == NULL) {
+ if (SCTP_TSN_GT(tsn, asoc->highest_tsn_inside_map) &&
+ SCTP_TSN_GT(tsn, asoc->highest_tsn_inside_nr_map)) {
+ /* Nope not in the valid range dump it */
+ dump_packet:
+ sctp_set_rwnd(stcb, asoc);
+ if ((asoc->cnt_on_all_streams +
+ asoc->cnt_on_reasm_queue +
+ asoc->cnt_msg_on_sb) >= SCTP_BASE_SYSCTL(sctp_max_chunks_on_queue)) {
+ SCTP_STAT_INCR(sctps_datadropchklmt);
+ } else {
+ SCTP_STAT_INCR(sctps_datadroprwnd);
+ }
+ *break_flag = 1;
+ return (0);
+ }
+ } else {
+ if (control == NULL) {
+ goto dump_packet;
+ }
+ if (SCTP_TSN_GT(fsn, control->top_fsn)) {
+ goto dump_packet;
}
- *break_flag = 1;
- return (0);
- }
- }
- strmno = ntohs(ch->dp.stream_id);
- if (strmno >= asoc->streamincnt) {
- struct sctp_paramhdr *phdr;
- struct mbuf *mb;
-
- mb = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) * 2),
- 0, M_DONTWAIT, 1, MT_DATA);
- if (mb != NULL) {
- /* add some space up front so prepend will work well */
- SCTP_BUF_RESV_UF(mb, sizeof(struct sctp_chunkhdr));
- phdr = mtod(mb, struct sctp_paramhdr *);
- /*
- * Error causes are just param's and this one has
- * two back to back phdr, one with the error type
- * and size, the other with the streamid and a rsvd
- */
- SCTP_BUF_LEN(mb) = (sizeof(struct sctp_paramhdr) * 2);
- phdr->param_type = htons(SCTP_CAUSE_INVALID_STREAM);
- phdr->param_length =
- htons(sizeof(struct sctp_paramhdr) * 2);
- phdr++;
- /* We insert the stream in the type field */
- phdr->param_type = ch->dp.stream_id;
- /* And set the length to 0 for the rsvd field */
- phdr->param_length = 0;
- sctp_queue_op_err(stcb, mb);
- }
- SCTP_STAT_INCR(sctps_badsid);
- SCTP_TCB_LOCK_ASSERT(stcb);
- SCTP_SET_TSN_PRESENT(asoc->nr_mapping_array, gap);
- if (SCTP_TSN_GT(tsn, asoc->highest_tsn_inside_nr_map)) {
- asoc->highest_tsn_inside_nr_map = tsn;
- }
- if (tsn == (asoc->cumulative_tsn + 1)) {
- /* Update cum-ack */
- asoc->cumulative_tsn = tsn;
}
- return (0);
}
- /*
- * Before we continue lets validate that we are not being fooled by
- * an evil attacker. We can only have 4k chunks based on our TSN
- * spread allowed by the mapping array 512 * 8 bits, so there is no
- * way our stream sequence numbers could have wrapped. We of course
- * only validate the FIRST fragment so the bit must be set.
- */
- strmseq = ntohs(ch->dp.stream_sequence);
#ifdef SCTP_ASOCLOG_OF_TSNS
SCTP_TCB_LOCK_ASSERT(stcb);
if (asoc->tsn_in_at >= SCTP_TSN_LOG_SIZE) {
@@ -1458,7 +1884,7 @@ sctp_process_a_data_chunk(struct sctp_tcb *stcb, struct sctp_association *asoc,
}
asoc->in_tsnlog[asoc->tsn_in_at].tsn = tsn;
asoc->in_tsnlog[asoc->tsn_in_at].strm = strmno;
- asoc->in_tsnlog[asoc->tsn_in_at].seq = strmseq;
+ asoc->in_tsnlog[asoc->tsn_in_at].seq = msg_id;
asoc->in_tsnlog[asoc->tsn_in_at].sz = chk_length;
asoc->in_tsnlog[asoc->tsn_in_at].flgs = chunk_flags;
asoc->in_tsnlog[asoc->tsn_in_at].stcb = (void *)stcb;
@@ -1466,18 +1892,26 @@ sctp_process_a_data_chunk(struct sctp_tcb *stcb, struct sctp_association *asoc,
asoc->in_tsnlog[asoc->tsn_in_at].in_out = 1;
asoc->tsn_in_at++;
#endif
+ /*
+ * Before we continue let's validate that we are not being fooled by
+ * an evil attacker. We can only have Nk chunks based on our TSN
+ * spread allowed by the mapping array N * 8 bits, so there is no
+ * way our stream sequence numbers could have wrapped. We of course
+ * only validate the FIRST fragment so the bit must be set.
+ */
if ((chunk_flags & SCTP_DATA_FIRST_FRAG) &&
(TAILQ_EMPTY(&asoc->resetHead)) &&
(chunk_flags & SCTP_DATA_UNORDERED) == 0 &&
- SCTP_SSN_GE(asoc->strmin[strmno].last_sequence_delivered, strmseq)) {
+ SCTP_MSGID_GE(old_data, asoc->strmin[strmno].last_sequence_delivered, msg_id)) {
/* The incoming sseq is behind where we last delivered? */
- SCTPDBG(SCTP_DEBUG_INDATA1, "EVIL/Broken-Dup S-SEQ:%d delivered:%d from peer, Abort!\n",
- strmseq, asoc->strmin[strmno].last_sequence_delivered);
+ SCTPDBG(SCTP_DEBUG_INDATA1, "EVIL/Broken-Dup S-SEQ: %u delivered: %u from peer, Abort!\n",
+ msg_id, asoc->strmin[strmno].last_sequence_delivered);
+
snprintf(msg, sizeof(msg), "Delivered SSN=%4.4x, got TSN=%8.8x, SID=%4.4x, SSN=%4.4x",
asoc->strmin[strmno].last_sequence_delivered,
- tsn, strmno, strmseq);
+ tsn, strmno, msg_id);
op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg);
- stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_14;
+ stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_16;
sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED);
*abort_flag = 1;
return (0);
@@ -1486,21 +1920,24 @@ sctp_process_a_data_chunk(struct sctp_tcb *stcb, struct sctp_association *asoc,
* From here down we may find ch-> invalid
* so its a good idea NOT to use it.
*************************************/
-
- the_len = (chk_length - sizeof(struct sctp_data_chunk));
+ if (nch) {
+ the_len = (chk_length - sizeof(struct sctp_idata_chunk));
+ } else {
+ the_len = (chk_length - sizeof(struct sctp_data_chunk));
+ }
if (last_chunk == 0) {
- dmbuf = SCTP_M_COPYM(*m,
- (offset + sizeof(struct sctp_data_chunk)),
- the_len, M_DONTWAIT);
+ if (nch) {
+ dmbuf = SCTP_M_COPYM(*m,
+ (offset + sizeof(struct sctp_idata_chunk)),
+ the_len, M_NOWAIT);
+ } else {
+ dmbuf = SCTP_M_COPYM(*m,
+ (offset + sizeof(struct sctp_data_chunk)),
+ the_len, M_NOWAIT);
+ }
#ifdef SCTP_MBUF_LOGGING
if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) {
- struct mbuf *mat;
-
- for (mat = dmbuf; mat; mat = SCTP_BUF_NEXT(mat)) {
- if (SCTP_BUF_IS_EXTENDED(mat)) {
- sctp_log_mb(mat, SCTP_MBUF_ICOPY);
- }
- }
+ sctp_log_mbc(dmbuf, SCTP_MBUF_ICOPY);
}
#endif
} else {
@@ -1509,7 +1946,11 @@ sctp_process_a_data_chunk(struct sctp_tcb *stcb, struct sctp_association *asoc,
dmbuf = *m;
/* lop off the top part */
- m_adj(dmbuf, (offset + sizeof(struct sctp_data_chunk)));
+ if (nch) {
+ m_adj(dmbuf, (offset + sizeof(struct sctp_idata_chunk)));
+ } else {
+ m_adj(dmbuf, (offset + sizeof(struct sctp_data_chunk)));
+ }
if (SCTP_BUF_NEXT(dmbuf) == NULL) {
l_len = SCTP_BUF_LEN(dmbuf);
} else {
@@ -1533,11 +1974,36 @@ sctp_process_a_data_chunk(struct sctp_tcb *stcb, struct sctp_association *asoc,
SCTP_STAT_INCR(sctps_nomem);
return (0);
}
+ /*
+ * Now no matter what we need a control, get one if we don't have
+ * one (we may have gotten it above when we found the message was
+ * fragmented).
+ */
+ if (control == NULL) {
+ sctp_alloc_a_readq(stcb, control);
+ sctp_build_readq_entry_mac(control, stcb, asoc->context, net, tsn,
+ protocol_id,
+ strmno, msg_id,
+ chunk_flags,
+ NULL, fsn, msg_id);
+ if (control == NULL) {
+ SCTP_STAT_INCR(sctps_nomem);
+ return (0);
+ }
+ if ((chunk_flags & SCTP_DATA_NOT_FRAG) == SCTP_DATA_NOT_FRAG) {
+ control->data = dmbuf;
+ control->tail_mbuf = NULL;
+ control->end_added = control->last_frag_seen = control->first_frag_seen = 1;
+ control->top_fsn = control->fsn_included = fsn;
+ }
+ created_control = 1;
+ }
+ SCTPDBG(SCTP_DEBUG_XXX, "chunk_flags: 0x%x ordered: %d msgid: %u control: %p\n",
+ chunk_flags, ordered, msg_id, control);
if ((chunk_flags & SCTP_DATA_NOT_FRAG) == SCTP_DATA_NOT_FRAG &&
- asoc->fragmented_delivery_inprogress == 0 &&
TAILQ_EMPTY(&asoc->resetHead) &&
((ordered == 0) ||
- ((uint16_t) (asoc->strmin[strmno].last_sequence_delivered + 1) == strmseq &&
+ ((uint16_t) (asoc->strmin[strmno].last_sequence_delivered + 1) == msg_id &&
TAILQ_EMPTY(&asoc->strmin[strmno].inqueue)))) {
/* Candidate for express delivery */
/*
@@ -1547,109 +2013,30 @@ sctp_process_a_data_chunk(struct sctp_tcb *stcb, struct sctp_association *asoc,
* And there is room for it in the socket buffer. Lets just
* stuff it up the buffer....
*/
-
- /* It would be nice to avoid this copy if we could :< */
- sctp_alloc_a_readq(stcb, control);
- sctp_build_readq_entry_mac(control, stcb, asoc->context, net, tsn,
- protocol_id,
- strmno, strmseq,
- chunk_flags,
- dmbuf);
- if (control == NULL) {
- goto failed_express_del;
- }
SCTP_SET_TSN_PRESENT(asoc->nr_mapping_array, gap);
if (SCTP_TSN_GT(tsn, asoc->highest_tsn_inside_nr_map)) {
asoc->highest_tsn_inside_nr_map = tsn;
}
+ SCTPDBG(SCTP_DEBUG_XXX, "Injecting control: %p to be read (msg_id: %u)\n",
+ control, msg_id);
+
sctp_add_to_readq(stcb->sctp_ep, stcb,
control, &stcb->sctp_socket->so_rcv,
1, SCTP_READ_LOCK_NOT_HELD, SCTP_SO_NOT_LOCKED);
if ((chunk_flags & SCTP_DATA_UNORDERED) == 0) {
/* for ordered, bump what we delivered */
- asoc->strmin[strmno].last_sequence_delivered++;
+ strm->last_sequence_delivered++;
}
SCTP_STAT_INCR(sctps_recvexpress);
if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_STR_LOGGING_ENABLE) {
- sctp_log_strm_del_alt(stcb, tsn, strmseq, strmno,
+ sctp_log_strm_del_alt(stcb, tsn, msg_id, strmno,
SCTP_STR_LOG_FROM_EXPRS_DEL);
}
control = NULL;
-
goto finish_express_del;
}
-failed_express_del:
- /* If we reach here this is a new chunk */
- chk = NULL;
- control = NULL;
- /* Express for fragmented delivery? */
- if ((asoc->fragmented_delivery_inprogress) &&
- (stcb->asoc.control_pdapi) &&
- (asoc->str_of_pdapi == strmno) &&
- (asoc->ssn_of_pdapi == strmseq)
- ) {
- control = stcb->asoc.control_pdapi;
- if ((chunk_flags & SCTP_DATA_FIRST_FRAG) == SCTP_DATA_FIRST_FRAG) {
- /* Can't be another first? */
- goto failed_pdapi_express_del;
- }
- if (tsn == (control->sinfo_tsn + 1)) {
- /* Yep, we can add it on */
- int end = 0;
-
- if (chunk_flags & SCTP_DATA_LAST_FRAG) {
- end = 1;
- }
- if (sctp_append_to_readq(stcb->sctp_ep, stcb, control, dmbuf, end,
- tsn,
- &stcb->sctp_socket->so_rcv)) {
- SCTP_PRINTF("Append fails end:%d\n", end);
- goto failed_pdapi_express_del;
- }
- SCTP_SET_TSN_PRESENT(asoc->nr_mapping_array, gap);
- if (SCTP_TSN_GT(tsn, asoc->highest_tsn_inside_nr_map)) {
- asoc->highest_tsn_inside_nr_map = tsn;
- }
- SCTP_STAT_INCR(sctps_recvexpressm);
- asoc->tsn_last_delivered = tsn;
- asoc->fragment_flags = chunk_flags;
- asoc->tsn_of_pdapi_last_delivered = tsn;
- asoc->last_flags_delivered = chunk_flags;
- asoc->last_strm_seq_delivered = strmseq;
- asoc->last_strm_no_delivered = strmno;
- if (end) {
- /* clean up the flags and such */
- asoc->fragmented_delivery_inprogress = 0;
- if ((chunk_flags & SCTP_DATA_UNORDERED) == 0) {
- asoc->strmin[strmno].last_sequence_delivered++;
- }
- stcb->asoc.control_pdapi = NULL;
- if (TAILQ_EMPTY(&asoc->reasmqueue) == 0) {
- /*
- * There could be another message
- * ready
- */
- need_reasm_check = 1;
- }
- }
- control = NULL;
- goto finish_express_del;
- }
- }
-failed_pdapi_express_del:
- control = NULL;
- if (SCTP_BASE_SYSCTL(sctp_do_drain) == 0) {
- SCTP_SET_TSN_PRESENT(asoc->nr_mapping_array, gap);
- if (SCTP_TSN_GT(tsn, asoc->highest_tsn_inside_nr_map)) {
- asoc->highest_tsn_inside_nr_map = tsn;
- }
- } else {
- SCTP_SET_TSN_PRESENT(asoc->mapping_array, gap);
- if (SCTP_TSN_GT(tsn, asoc->highest_tsn_inside_map)) {
- asoc->highest_tsn_inside_map = tsn;
- }
- }
+ /* Now will we need a chunk too? */
if ((chunk_flags & SCTP_DATA_NOT_FRAG) != SCTP_DATA_NOT_FRAG) {
sctp_alloc_a_chunk(stcb, chk);
if (chk == NULL) {
@@ -1663,7 +2050,8 @@ failed_pdapi_express_del:
}
chk->rec.data.TSN_seq = tsn;
chk->no_fr_allowed = 0;
- chk->rec.data.stream_seq = strmseq;
+ chk->rec.data.fsn_num = fsn;
+ chk->rec.data.stream_seq = msg_id;
chk->rec.data.stream_number = strmno;
chk->rec.data.payloadtype = protocol_id;
chk->rec.data.context = stcb->asoc.context;
@@ -1672,193 +2060,110 @@ failed_pdapi_express_del:
chk->asoc = asoc;
chk->send_size = the_len;
chk->whoTo = net;
+ SCTPDBG(SCTP_DEBUG_XXX, "Building ck: %p for control: %p to be read (msg_id: %u)\n",
+ chk,
+ control, msg_id);
atomic_add_int(&net->ref_count, 1);
chk->data = dmbuf;
+ }
+ /* Set the appropriate TSN mark */
+ if (SCTP_BASE_SYSCTL(sctp_do_drain) == 0) {
+ SCTP_SET_TSN_PRESENT(asoc->nr_mapping_array, gap);
+ if (SCTP_TSN_GT(tsn, asoc->highest_tsn_inside_nr_map)) {
+ asoc->highest_tsn_inside_nr_map = tsn;
+ }
} else {
- sctp_alloc_a_readq(stcb, control);
- sctp_build_readq_entry_mac(control, stcb, asoc->context, net, tsn,
- protocol_id,
- strmno, strmseq,
- chunk_flags,
- dmbuf);
- if (control == NULL) {
- /* No memory so we drop the chunk */
- SCTP_STAT_INCR(sctps_nomem);
- if (last_chunk == 0) {
- /* we copied it, free the copy */
- sctp_m_freem(dmbuf);
- }
- return (0);
+ SCTP_SET_TSN_PRESENT(asoc->mapping_array, gap);
+ if (SCTP_TSN_GT(tsn, asoc->highest_tsn_inside_map)) {
+ asoc->highest_tsn_inside_map = tsn;
}
- control->length = the_len;
}
-
- /* Mark it as received */
- /* Now queue it where it belongs */
- if (control != NULL) {
- /* First a sanity check */
- if (asoc->fragmented_delivery_inprogress) {
+ /* Now is it complete (i.e. not fragmented)? */
+ if ((chunk_flags & SCTP_DATA_NOT_FRAG) == SCTP_DATA_NOT_FRAG) {
+ /*
+ * Special check for when streams are resetting. We could be
+ * smarter about this and check the actual stream to see
+ * if it is not being reset.. that way we would not create a
+ * HOLB when amongst streams being reset and those not being
+ * reset.
+ *
+ */
+ if (((liste = TAILQ_FIRST(&asoc->resetHead)) != NULL) &&
+ SCTP_TSN_GT(tsn, liste->tsn)) {
/*
- * Ok, we have a fragmented delivery in progress if
- * this chunk is next to deliver OR belongs in our
- * view to the reassembly, the peer is evil or
- * broken.
+ * yep, it's past where we need to reset... go ahead
+ * and queue it.
*/
- uint32_t estimate_tsn;
-
- estimate_tsn = asoc->tsn_last_delivered + 1;
- if (TAILQ_EMPTY(&asoc->reasmqueue) &&
- (estimate_tsn == control->sinfo_tsn)) {
- /* Evil/Broke peer */
- sctp_m_freem(control->data);
- control->data = NULL;
- if (control->whoFrom) {
- sctp_free_remote_addr(control->whoFrom);
- control->whoFrom = NULL;
- }
- sctp_free_a_readq(stcb, control);
- snprintf(msg, sizeof(msg), "Reas. queue emtpy, got TSN=%8.8x, SID=%4.4x, SSN=%4.4x",
- tsn, strmno, strmseq);
- op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg);
- stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_15;
- sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED);
- *abort_flag = 1;
- if (last_chunk) {
- *m = NULL;
- }
- return (0);
+ if (TAILQ_EMPTY(&asoc->pending_reply_queue)) {
+ /* first one on */
+ TAILQ_INSERT_TAIL(&asoc->pending_reply_queue, control, next);
} else {
- if (sctp_does_tsn_belong_to_reasm(asoc, control->sinfo_tsn)) {
- sctp_m_freem(control->data);
- control->data = NULL;
- if (control->whoFrom) {
- sctp_free_remote_addr(control->whoFrom);
- control->whoFrom = NULL;
- }
- sctp_free_a_readq(stcb, control);
- snprintf(msg, sizeof(msg), "PD ongoing, got TSN=%8.8x, SID=%4.4x, SSN=%4.4x",
- tsn, strmno, strmseq);
- op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg);
- stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_16;
- sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED);
- *abort_flag = 1;
- if (last_chunk) {
- *m = NULL;
+ struct sctp_queued_to_read *ctlOn, *nctlOn;
+ unsigned char inserted = 0;
+
+ TAILQ_FOREACH_SAFE(ctlOn, &asoc->pending_reply_queue, next, nctlOn) {
+ if (SCTP_TSN_GT(control->sinfo_tsn, ctlOn->sinfo_tsn)) {
+
+ continue;
+ } else {
+ /* found it */
+ TAILQ_INSERT_BEFORE(ctlOn, control, next);
+ inserted = 1;
+ break;
}
- return (0);
}
- }
- } else {
- /* No PDAPI running */
- if (!TAILQ_EMPTY(&asoc->reasmqueue)) {
- /*
- * Reassembly queue is NOT empty validate
- * that this tsn does not need to be in
- * reasembly queue. If it does then our peer
- * is broken or evil.
- */
- if (sctp_does_tsn_belong_to_reasm(asoc, control->sinfo_tsn)) {
- sctp_m_freem(control->data);
- control->data = NULL;
- if (control->whoFrom) {
- sctp_free_remote_addr(control->whoFrom);
- control->whoFrom = NULL;
- }
- sctp_free_a_readq(stcb, control);
- snprintf(msg, sizeof(msg), "No PD ongoing, got TSN=%8.8x, SID=%4.4x, SSN=%4.4x",
- tsn, strmno, strmseq);
- op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg);
- stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_17;
- sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED);
- *abort_flag = 1;
- if (last_chunk) {
- *m = NULL;
- }
- return (0);
+ if (inserted == 0) {
+ /*
+ * No queued entry has a TSN at or
+ * above this one, so it must be
+ * put at the end.
+ */
+ TAILQ_INSERT_TAIL(&asoc->pending_reply_queue, control, next);
}
}
+ goto finish_express_del;
}
- /* ok, if we reach here we have passed the sanity checks */
if (chunk_flags & SCTP_DATA_UNORDERED) {
/* queue directly into socket buffer */
+ SCTPDBG(SCTP_DEBUG_XXX, "Unordered data to be read control: %p msg_id: %u\n",
+ control, msg_id);
sctp_mark_non_revokable(asoc, control->sinfo_tsn);
sctp_add_to_readq(stcb->sctp_ep, stcb,
control,
- &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_NOT_HELD, SCTP_SO_NOT_LOCKED);
- } else {
- /*
- * Special check for when streams are resetting. We
- * could be more smart about this and check the
- * actual stream to see if it is not being reset..
- * that way we would not create a HOLB when amongst
- * streams being reset and those not being reset.
- *
- * We take complete messages that have a stream reset
- * intervening (aka the TSN is after where our
- * cum-ack needs to be) off and put them on a
- * pending_reply_queue. The reassembly ones we do
- * not have to worry about since they are all sorted
- * and proceessed by TSN order. It is only the
- * singletons I must worry about.
- */
- if (((liste = TAILQ_FIRST(&asoc->resetHead)) != NULL) &&
- SCTP_TSN_GT(tsn, liste->tsn)) {
- /*
- * yep its past where we need to reset... go
- * ahead and queue it.
- */
- if (TAILQ_EMPTY(&asoc->pending_reply_queue)) {
- /* first one on */
- TAILQ_INSERT_TAIL(&asoc->pending_reply_queue, control, next);
- } else {
- struct sctp_queued_to_read *ctlOn,
- *nctlOn;
- unsigned char inserted = 0;
+ &stcb->sctp_socket->so_rcv, 1,
+ SCTP_READ_LOCK_NOT_HELD, SCTP_SO_NOT_LOCKED);
- TAILQ_FOREACH_SAFE(ctlOn, &asoc->pending_reply_queue, next, nctlOn) {
- if (SCTP_TSN_GT(control->sinfo_tsn, ctlOn->sinfo_tsn)) {
- continue;
- } else {
- /* found it */
- TAILQ_INSERT_BEFORE(ctlOn, control, next);
- inserted = 1;
- break;
- }
- }
- if (inserted == 0) {
- /*
- * must be put at end, use
- * prevP (all setup from
- * loop) to setup nextP.
- */
- TAILQ_INSERT_TAIL(&asoc->pending_reply_queue, control, next);
- }
- }
- } else {
- sctp_queue_data_to_stream(stcb, asoc, control, abort_flag);
- if (*abort_flag) {
- if (last_chunk) {
- *m = NULL;
- }
- return (0);
+ } else {
+ SCTPDBG(SCTP_DEBUG_XXX, "Queue control: %p for reordering msg_id: %u\n", control,
+ msg_id);
+ sctp_queue_data_to_stream(stcb, strm, asoc, control, abort_flag, &need_reasm_check);
+ if (*abort_flag) {
+ if (last_chunk) {
+ *m = NULL;
}
+ return (0);
}
}
- } else {
- /* Into the re-assembly queue */
- sctp_queue_data_for_reasm(stcb, asoc, chk, abort_flag);
- if (*abort_flag) {
- /*
- * the assoc is now gone and chk was put onto the
- * reasm queue, which has all been freed.
- */
- if (last_chunk) {
- *m = NULL;
- }
- return (0);
+ goto finish_express_del;
+ }
+ /* If we reach here it's a reassembly */
+ need_reasm_check = 1;
+ SCTPDBG(SCTP_DEBUG_XXX,
+ "Queue data to stream for reasm control: %p msg_id: %u\n",
+ control, msg_id);
+ sctp_queue_data_for_reasm(stcb, asoc, strm, control, chk, created_control, abort_flag, tsn);
+ if (*abort_flag) {
+ /*
+ * the assoc is now gone and chk was put onto the reasm
+ * queue, which has all been freed.
+ */
+ if (last_chunk) {
+ *m = NULL;
}
+ return (0);
}
finish_express_del:
+ /* Here we tidy up things */
if (tsn == (asoc->cumulative_tsn + 1)) {
/* Update cum-ack */
asoc->cumulative_tsn = tsn;
@@ -1874,7 +2179,7 @@ finish_express_del:
SCTP_STAT_INCR(sctps_recvdata);
/* Set it present please */
if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_STR_LOGGING_ENABLE) {
- sctp_log_strm_del_alt(stcb, tsn, strmseq, strmno, SCTP_STR_LOG_FROM_MARK_TSN);
+ sctp_log_strm_del_alt(stcb, tsn, msg_id, strmno, SCTP_STR_LOG_FROM_MARK_TSN);
}
if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MAP_LOGGING_ENABLE) {
sctp_log_map(asoc->mapping_array_base_tsn, asoc->cumulative_tsn,
@@ -1893,6 +2198,7 @@ finish_express_del:
sctp_reset_in_stream(stcb, liste->number_entries, liste->list_of_streams);
TAILQ_REMOVE(&asoc->resetHead, liste, next_resp);
+ sctp_send_deferred_reset_response(stcb, liste, SCTP_STREAM_RESET_RESULT_PERFORMED);
SCTP_FREE(liste, SCTP_M_STRESET);
/* sa_ignore FREED_MEMORY */
liste = TAILQ_FIRST(&asoc->resetHead);
@@ -1900,7 +2206,7 @@ finish_express_del:
/* All can be removed */
TAILQ_FOREACH_SAFE(ctl, &asoc->pending_reply_queue, next, nctl) {
TAILQ_REMOVE(&asoc->pending_reply_queue, ctl, next);
- sctp_queue_data_to_stream(stcb, asoc, ctl, abort_flag);
+ sctp_queue_data_to_stream(stcb, strm, asoc, ctl, abort_flag, &need_reasm_check);
if (*abort_flag) {
return (0);
}
@@ -1916,7 +2222,7 @@ finish_express_del:
* ctl->sinfo_tsn > liste->tsn
*/
TAILQ_REMOVE(&asoc->pending_reply_queue, ctl, next);
- sctp_queue_data_to_stream(stcb, asoc, ctl, abort_flag);
+ sctp_queue_data_to_stream(stcb, strm, asoc, ctl, abort_flag, &need_reasm_check);
if (*abort_flag) {
return (0);
}
@@ -1926,17 +2232,17 @@ finish_express_del:
* Now service re-assembly to pick up anything that has been
* held on reassembly queue?
*/
- sctp_deliver_reasm_check(stcb, asoc);
+ (void)sctp_deliver_reasm_check(stcb, asoc, strm, SCTP_READ_LOCK_NOT_HELD);
need_reasm_check = 0;
}
if (need_reasm_check) {
/* Another one waits ? */
- sctp_deliver_reasm_check(stcb, asoc);
+ (void)sctp_deliver_reasm_check(stcb, asoc, strm, SCTP_READ_LOCK_NOT_HELD);
}
return (1);
}
-int8_t sctp_map_lookup_tab[256] = {
+static const int8_t sctp_map_lookup_tab[256] = {
0, 1, 0, 2, 0, 1, 0, 3,
0, 1, 0, 2, 0, 1, 0, 4,
0, 1, 0, 2, 0, 1, 0, 3,
@@ -1980,7 +2286,7 @@ sctp_slide_mapping_arrays(struct sctp_tcb *stcb)
* 1) Did we move the cum-ack point?
*
* When you first glance at this you might think that all entries that
- * make up the postion of the cum-ack would be in the nr-mapping
+ * make up the position of the cum-ack would be in the nr-mapping
* array only.. i.e. things up to the cum-ack are always
* deliverable. Thats true with one exception, when its a fragmented
* message we may not deliver the data until some threshold (or all
@@ -2078,7 +2384,7 @@ sctp_slide_mapping_arrays(struct sctp_tcb *stcb)
#ifdef INVARIANTS
panic("impossible slide");
#else
- SCTP_PRINTF("impossible slide lgap:%x slide_end:%x slide_from:%x? at:%d\n",
+ SCTP_PRINTF("impossible slide lgap: %x slide_end: %x slide_from: %x? at: %d\n",
lgap, slide_end, slide_from, at);
return;
#endif
@@ -2087,7 +2393,7 @@ sctp_slide_mapping_arrays(struct sctp_tcb *stcb)
#ifdef INVARIANTS
panic("would overrun buffer");
#else
- SCTP_PRINTF("Gak, would have overrun map end:%d slide_end:%d\n",
+ SCTP_PRINTF("Gak, would have overrun map end: %d slide_end: %d\n",
asoc->mapping_array_size, slide_end);
slide_end = asoc->mapping_array_size;
#endif
@@ -2166,7 +2472,8 @@ sctp_sack_check(struct sctp_tcb *stcb, int was_a_gap)
*/
if (SCTP_OS_TIMER_PENDING(&stcb->asoc.dack_timer.timer)) {
sctp_timer_stop(SCTP_TIMER_TYPE_RECV,
- stcb->sctp_ep, stcb, NULL, SCTP_FROM_SCTP_INDATA + SCTP_LOC_18);
+ stcb->sctp_ep, stcb, NULL,
+ SCTP_FROM_SCTP_INDATA + SCTP_LOC_17);
}
sctp_send_shutdown(stcb,
((stcb->asoc.alternate) ? stcb->asoc.alternate : stcb->asoc.primary_destination));
@@ -2231,76 +2538,12 @@ sctp_sack_check(struct sctp_tcb *stcb, int was_a_gap)
}
}
-void
-sctp_service_queues(struct sctp_tcb *stcb, struct sctp_association *asoc)
-{
- struct sctp_tmit_chunk *chk;
- uint32_t tsize, pd_point;
- uint16_t nxt_todel;
-
- if (asoc->fragmented_delivery_inprogress) {
- sctp_service_reassembly(stcb, asoc);
- }
- /* Can we proceed further, i.e. the PD-API is complete */
- if (asoc->fragmented_delivery_inprogress) {
- /* no */
- return;
- }
- /*
- * Now is there some other chunk I can deliver from the reassembly
- * queue.
- */
-doit_again:
- chk = TAILQ_FIRST(&asoc->reasmqueue);
- if (chk == NULL) {
- asoc->size_on_reasm_queue = 0;
- asoc->cnt_on_reasm_queue = 0;
- return;
- }
- nxt_todel = asoc->strmin[chk->rec.data.stream_number].last_sequence_delivered + 1;
- if ((chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) &&
- ((nxt_todel == chk->rec.data.stream_seq) ||
- (chk->rec.data.rcv_flags & SCTP_DATA_UNORDERED))) {
- /*
- * Yep the first one is here. We setup to start reception,
- * by backing down the TSN just in case we can't deliver.
- */
-
- /*
- * Before we start though either all of the message should
- * be here or the socket buffer max or nothing on the
- * delivery queue and something can be delivered.
- */
- if (stcb->sctp_socket) {
- pd_point = min(SCTP_SB_LIMIT_RCV(stcb->sctp_socket) >> SCTP_PARTIAL_DELIVERY_SHIFT,
- stcb->sctp_ep->partial_delivery_point);
- } else {
- pd_point = stcb->sctp_ep->partial_delivery_point;
- }
- if (sctp_is_all_msg_on_reasm(asoc, &tsize) || (tsize >= pd_point)) {
- asoc->fragmented_delivery_inprogress = 1;
- asoc->tsn_last_delivered = chk->rec.data.TSN_seq - 1;
- asoc->str_of_pdapi = chk->rec.data.stream_number;
- asoc->ssn_of_pdapi = chk->rec.data.stream_seq;
- asoc->pdapi_ppid = chk->rec.data.payloadtype;
- asoc->fragment_flags = chk->rec.data.rcv_flags;
- sctp_service_reassembly(stcb, asoc);
- if (asoc->fragmented_delivery_inprogress == 0) {
- goto doit_again;
- }
- }
- }
-}
-
int
sctp_process_data(struct mbuf **mm, int iphlen, int *offset, int length,
- struct sockaddr *src, struct sockaddr *dst,
- struct sctphdr *sh, struct sctp_inpcb *inp,
- struct sctp_tcb *stcb, struct sctp_nets *net, uint32_t * high_tsn,
- uint8_t use_mflowid, uint32_t mflowid,
- uint32_t vrf_id, uint16_t port)
+ struct sctp_inpcb *inp, struct sctp_tcb *stcb,
+ struct sctp_nets *net, uint32_t * high_tsn)
{
- struct sctp_data_chunk *ch, chunk_buf;
+ struct sctp_chunkhdr *ch, chunk_buf;
struct sctp_association *asoc;
int num_chunks = 0; /* number of control chunks processed */
int stop_proc = 0;
@@ -2338,7 +2581,7 @@ sctp_process_data(struct mbuf **mm, int iphlen, int *offset, int length,
*/
if (SCTP_BUF_LEN(m) < (long)MLEN && SCTP_BUF_NEXT(m) == NULL) {
/* we only handle mbufs that are singletons.. not chains */
- m = sctp_get_mbuf_for_msg(SCTP_BUF_LEN(m), 0, M_DONTWAIT, 1, MT_DATA);
+ m = sctp_get_mbuf_for_msg(SCTP_BUF_LEN(m), 0, M_NOWAIT, 1, MT_DATA);
if (m) {
/* ok lets see if we can copy the data up */
caddr_t *from, *to;
@@ -2350,7 +2593,7 @@ sctp_process_data(struct mbuf **mm, int iphlen, int *offset, int length,
/* copy the length and free up the old */
SCTP_BUF_LEN(m) = SCTP_BUF_LEN((*mm));
sctp_m_freem(*mm);
- /* sucess, back copy */
+ /* success, back copy */
*mm = m;
} else {
/* We are in trouble in the mbuf world .. yikes */
@@ -2358,8 +2601,8 @@ sctp_process_data(struct mbuf **mm, int iphlen, int *offset, int length,
}
}
/* get pointer to the first chunk header */
- ch = (struct sctp_data_chunk *)sctp_m_getptr(m, *offset,
- sizeof(struct sctp_data_chunk), (uint8_t *) & chunk_buf);
+ ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, *offset,
+ sizeof(struct sctp_chunkhdr), (uint8_t *) & chunk_buf);
if (ch == NULL) {
return (1);
}
@@ -2371,14 +2614,44 @@ sctp_process_data(struct mbuf **mm, int iphlen, int *offset, int length,
asoc->data_pkts_seen++;
while (stop_proc == 0) {
/* validate chunk length */
- chk_length = ntohs(ch->ch.chunk_length);
+ chk_length = ntohs(ch->chunk_length);
if (length - *offset < chk_length) {
/* all done, mutilated chunk */
stop_proc = 1;
continue;
}
- if (ch->ch.chunk_type == SCTP_DATA) {
- if ((size_t)chk_length < sizeof(struct sctp_data_chunk)) {
+ if ((asoc->idata_supported == 1) &&
+ (ch->chunk_type == SCTP_DATA)) {
+ struct mbuf *op_err;
+ char msg[SCTP_DIAG_INFO_LEN];
+
+ snprintf(msg, sizeof(msg), "%s", "I-DATA chunk received when DATA was negotiated");
+ op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg);
+ stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_18;
+ sctp_abort_an_association(inp, stcb, op_err, SCTP_SO_NOT_LOCKED);
+ return (2);
+ }
+ if ((asoc->idata_supported == 0) &&
+ (ch->chunk_type == SCTP_IDATA)) {
+ struct mbuf *op_err;
+ char msg[SCTP_DIAG_INFO_LEN];
+
+ snprintf(msg, sizeof(msg), "%s", "DATA chunk received when I-DATA was negotiated");
+ op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg);
+ stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_19;
+ sctp_abort_an_association(inp, stcb, op_err, SCTP_SO_NOT_LOCKED);
+ return (2);
+ }
+ if ((ch->chunk_type == SCTP_DATA) ||
+ (ch->chunk_type == SCTP_IDATA)) {
+ int clen;
+
+ if (ch->chunk_type == SCTP_DATA) {
+ clen = sizeof(struct sctp_data_chunk);
+ } else {
+ clen = sizeof(struct sctp_idata_chunk);
+ }
+ if (chk_length < clen) {
/*
* Need to send an abort since we had an
* invalid data chunk.
@@ -2389,26 +2662,8 @@ sctp_process_data(struct mbuf **mm, int iphlen, int *offset, int length,
snprintf(msg, sizeof(msg), "DATA chunk of length %d",
chk_length);
op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg);
- stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_19;
- sctp_abort_association(inp, stcb, m, iphlen,
- src, dst, sh, op_err,
- use_mflowid, mflowid,
- vrf_id, port);
- return (2);
- }
- if ((size_t)chk_length == sizeof(struct sctp_data_chunk)) {
- /*
- * Need to send an abort since we had an
- * empty data chunk.
- */
- struct mbuf *op_err;
-
- op_err = sctp_generate_no_user_data_cause(ch->dp.tsn);
- stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_19;
- sctp_abort_association(inp, stcb, m, iphlen,
- src, dst, sh, op_err,
- use_mflowid, mflowid,
- vrf_id, port);
+ stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_20;
+ sctp_abort_an_association(inp, stcb, op_err, SCTP_SO_NOT_LOCKED);
return (2);
}
#ifdef SCTP_AUDITING_ENABLED
@@ -2419,9 +2674,9 @@ sctp_process_data(struct mbuf **mm, int iphlen, int *offset, int length,
} else {
last_chunk = 0;
}
- if (sctp_process_a_data_chunk(stcb, asoc, mm, *offset, ch,
+ if (sctp_process_a_data_chunk(stcb, asoc, mm, *offset,
chk_length, net, high_tsn, &abort_flag, &break_flag,
- last_chunk)) {
+ last_chunk, ch->chunk_type)) {
num_chunks++;
}
if (abort_flag)
@@ -2437,7 +2692,7 @@ sctp_process_data(struct mbuf **mm, int iphlen, int *offset, int length,
}
} else {
/* not a data chunk in the data region */
- switch (ch->ch.chunk_type) {
+ switch (ch->chunk_type) {
case SCTP_INITIATION:
case SCTP_INITIATION_ACK:
case SCTP_SELECTIVE_ACK:
@@ -2459,64 +2714,50 @@ sctp_process_data(struct mbuf **mm, int iphlen, int *offset, int length,
case SCTP_STREAM_RESET:
case SCTP_FORWARD_CUM_TSN:
case SCTP_ASCONF:
- /*
- * Now, what do we do with KNOWN chunks that
- * are NOT in the right place?
- *
- * For now, I do nothing but ignore them. We
- * may later want to add sysctl stuff to
- * switch out and do either an ABORT() or
- * possibly process them.
- */
- if (SCTP_BASE_SYSCTL(sctp_strict_data_order)) {
+ {
+ /*
+ * Now, what do we do with KNOWN
+ * chunks that are NOT in the right
+ * place?
+ *
+ * They are treated as a protocol
+ * violation: an error cause naming
+ * the offending chunk type is built
+ * and the association is aborted.
+ */
struct mbuf *op_err;
+ char msg[SCTP_DIAG_INFO_LEN];
- op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, "");
- sctp_abort_association(inp, stcb,
- m, iphlen,
- src, dst,
- sh, op_err,
- use_mflowid, mflowid,
- vrf_id, port);
+ snprintf(msg, sizeof(msg), "DATA chunk followed by chunk of type %2.2x",
+ ch->chunk_type);
+ op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg);
+ sctp_abort_an_association(inp, stcb, op_err, SCTP_SO_NOT_LOCKED);
return (2);
}
- break;
default:
/* unknown chunk type, use bit rules */
- if (ch->ch.chunk_type & 0x40) {
+ if (ch->chunk_type & 0x40) {
/* Add an error report to the queue */
- struct mbuf *merr;
- struct sctp_paramhdr *phd;
-
- merr = sctp_get_mbuf_for_msg(sizeof(*phd), 0, M_DONTWAIT, 1, MT_DATA);
- if (merr) {
- phd = mtod(merr, struct sctp_paramhdr *);
- /*
- * We cheat and use param
- * type since we did not
- * bother to define a error
- * cause struct. They are
- * the same basic format
- * with different names.
- */
- phd->param_type =
- htons(SCTP_CAUSE_UNRECOG_CHUNK);
- phd->param_length =
- htons(chk_length + sizeof(*phd));
- SCTP_BUF_LEN(merr) = sizeof(*phd);
- SCTP_BUF_NEXT(merr) = SCTP_M_COPYM(m, *offset, chk_length, M_DONTWAIT);
- if (SCTP_BUF_NEXT(merr)) {
- if (sctp_pad_lastmbuf(SCTP_BUF_NEXT(merr), SCTP_SIZE32(chk_length) - chk_length, NULL)) {
- sctp_m_freem(merr);
- } else {
- sctp_queue_op_err(stcb, merr);
- }
+ struct mbuf *op_err;
+ struct sctp_gen_error_cause *cause;
+
+ op_err = sctp_get_mbuf_for_msg(sizeof(struct sctp_gen_error_cause),
+ 0, M_NOWAIT, 1, MT_DATA);
+ if (op_err != NULL) {
+ cause = mtod(op_err, struct sctp_gen_error_cause *);
+ cause->code = htons(SCTP_CAUSE_UNRECOG_CHUNK);
+ cause->length = htons((uint16_t) (chk_length + sizeof(struct sctp_gen_error_cause)));
+ SCTP_BUF_LEN(op_err) = sizeof(struct sctp_gen_error_cause);
+ SCTP_BUF_NEXT(op_err) = SCTP_M_COPYM(m, *offset, chk_length, M_NOWAIT);
+ if (SCTP_BUF_NEXT(op_err) != NULL) {
+ sctp_queue_op_err(stcb, op_err);
} else {
- sctp_m_freem(merr);
+ sctp_m_freem(op_err);
}
}
}
- if ((ch->ch.chunk_type & 0x80) == 0) {
+ if ((ch->chunk_type & 0x80) == 0) {
/* discard the rest of this packet */
stop_proc = 1;
} /* else skip this bad chunk and
@@ -2530,8 +2771,8 @@ sctp_process_data(struct mbuf **mm, int iphlen, int *offset, int length,
stop_proc = 1;
continue;
}
- ch = (struct sctp_data_chunk *)sctp_m_getptr(m, *offset,
- sizeof(struct sctp_data_chunk), (uint8_t *) & chunk_buf);
+ ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, *offset,
+ sizeof(struct sctp_chunkhdr), (uint8_t *) & chunk_buf);
if (ch == NULL) {
*offset = length;
stop_proc = 1;
@@ -2561,9 +2802,6 @@ sctp_process_data(struct mbuf **mm, int iphlen, int *offset, int length,
(void)SCTP_GETTIME_TIMEVAL(&stcb->asoc.time_last_rcvd);
}
/* now service all of the reassm queue if needed */
- if (!(TAILQ_EMPTY(&asoc->reasmqueue)))
- sctp_service_queues(stcb, asoc);
-
if (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_SENT) {
/* Assure that we ack right away */
stcb->asoc.send_sack = 1;
@@ -2604,12 +2842,14 @@ sctp_process_segment_range(struct sctp_tcb *stcb, struct sctp_tmit_chunk **p_tp1
* cumack trackers for first transmissions,
* and retransmissions.
*/
- if ((tp1->whoTo->find_pseudo_cumack == 1) && (tp1->sent < SCTP_DATAGRAM_RESEND) &&
+ if ((tp1->sent < SCTP_DATAGRAM_RESEND) &&
+ (tp1->whoTo->find_pseudo_cumack == 1) &&
(tp1->snd_count == 1)) {
tp1->whoTo->pseudo_cumack = tp1->rec.data.TSN_seq;
tp1->whoTo->find_pseudo_cumack = 0;
}
- if ((tp1->whoTo->find_rtx_pseudo_cumack == 1) && (tp1->sent < SCTP_DATAGRAM_RESEND) &&
+ if ((tp1->sent < SCTP_DATAGRAM_RESEND) &&
+ (tp1->whoTo->find_rtx_pseudo_cumack == 1) &&
(tp1->snd_count > 1)) {
tp1->whoTo->rtx_pseudo_cumack = tp1->rec.data.TSN_seq;
tp1->whoTo->find_rtx_pseudo_cumack = 0;
@@ -2697,7 +2937,7 @@ sctp_process_segment_range(struct sctp_tcb *stcb, struct sctp_tmit_chunk **p_tp1
sctp_misc_ints(SCTP_FLIGHT_LOG_DOWN_GAP,
tp1->whoTo->flight_size,
tp1->book_size,
- (uintptr_t) tp1->whoTo,
+ (uint32_t) (uintptr_t) tp1->whoTo,
tp1->rec.data.TSN_seq);
}
sctp_flight_size_decrease(tp1);
@@ -2772,6 +3012,11 @@ sctp_process_segment_range(struct sctp_tcb *stcb, struct sctp_tmit_chunk **p_tp1
panic("No chunks on the queues for sid %u.", tp1->rec.data.stream_number);
#endif
}
+ if ((stcb->asoc.strmout[tp1->rec.data.stream_number].chunks_on_queues == 0) &&
+ (stcb->asoc.strmout[tp1->rec.data.stream_number].state == SCTP_STREAM_RESET_PENDING) &&
+ TAILQ_EMPTY(&stcb->asoc.strmout[tp1->rec.data.stream_number].outqueue)) {
+ stcb->asoc.trigger_reset = 1;
+ }
tp1->sent = SCTP_DATAGRAM_NR_ACKED;
if (tp1->data) {
/*
@@ -2901,7 +3146,7 @@ sctp_check_for_revoked(struct sctp_tcb *stcb,
sctp_misc_ints(SCTP_FLIGHT_LOG_UP_REVOKE,
tp1->whoTo->flight_size,
tp1->book_size,
- (uintptr_t) tp1->whoTo,
+ (uint32_t) (uintptr_t) tp1->whoTo,
tp1->rec.data.TSN_seq);
}
sctp_flight_size_increase(tp1);
@@ -2961,7 +3206,7 @@ sctp_strike_gap_ack_chunks(struct sctp_tcb *stcb, struct sctp_association *asoc,
num_dests_sacked++;
}
}
- if (stcb->asoc.peer_supports_prsctp) {
+ if (stcb->asoc.prsctp_supported) {
(void)SCTP_GETTIME_TIMEVAL(&now);
}
TAILQ_FOREACH(tp1, &asoc->sent_queue, sctp_next) {
@@ -2982,7 +3227,7 @@ sctp_strike_gap_ack_chunks(struct sctp_tcb *stcb, struct sctp_association *asoc,
/* done */
break;
}
- if (stcb->asoc.peer_supports_prsctp) {
+ if (stcb->asoc.prsctp_supported) {
if ((PR_SCTP_TTL_ENABLED(tp1->flags)) && tp1->sent < SCTP_DATAGRAM_ACKED) {
/* Is it expired? */
if (timevalcmp(&now, &tp1->rec.data.timetodrop, >)) {
@@ -3215,7 +3460,7 @@ sctp_strike_gap_ack_chunks(struct sctp_tcb *stcb, struct sctp_association *asoc,
sctp_misc_ints(SCTP_FLIGHT_LOG_DOWN_RSND,
(tp1->whoTo ? (tp1->whoTo->flight_size) : 0),
tp1->book_size,
- (uintptr_t) tp1->whoTo,
+ (uint32_t) (uintptr_t) tp1->whoTo,
tp1->rec.data.TSN_seq);
}
if (tp1->whoTo) {
@@ -3236,7 +3481,7 @@ sctp_strike_gap_ack_chunks(struct sctp_tcb *stcb, struct sctp_association *asoc,
/* remove from the total flight */
sctp_total_flight_decrease(stcb, tp1);
- if ((stcb->asoc.peer_supports_prsctp) &&
+ if ((stcb->asoc.prsctp_supported) &&
(PR_SCTP_RTX_ENABLED(tp1->flags))) {
/*
* Has it been retransmitted tv_sec times? -
@@ -3381,7 +3626,7 @@ sctp_try_advance_peer_ack_point(struct sctp_tcb *stcb,
struct timeval now;
int now_filled = 0;
- if (asoc->peer_supports_prsctp == 0) {
+ if (asoc->prsctp_supported == 0) {
return (NULL);
}
TAILQ_FOREACH_SAFE(tp1, &asoc->sent_queue, sctp_next, tp2) {
@@ -3467,18 +3712,24 @@ sctp_fs_audit(struct sctp_association *asoc)
{
struct sctp_tmit_chunk *chk;
int inflight = 0, resend = 0, inbetween = 0, acked = 0, above = 0;
- int entry_flight, entry_cnt, ret;
+ int ret;
+
+#ifndef INVARIANTS
+ int entry_flight, entry_cnt;
+#endif
+
+ ret = 0;
+#ifndef INVARIANTS
entry_flight = asoc->total_flight;
entry_cnt = asoc->total_flight_count;
- ret = 0;
-
+#endif
if (asoc->pr_sctp_cnt >= asoc->sent_queue_cnt)
return (0);
TAILQ_FOREACH(chk, &asoc->sent_queue, sctp_next) {
if (chk->sent < SCTP_DATAGRAM_RESEND) {
- SCTP_PRINTF("Chk TSN:%u size:%d inflight cnt:%d\n",
+ SCTP_PRINTF("Chk TSN: %u size: %d inflight cnt: %d\n",
chk->rec.data.TSN_seq,
chk->send_size,
chk->snd_count);
@@ -3498,10 +3749,10 @@ sctp_fs_audit(struct sctp_association *asoc)
#ifdef INVARIANTS
panic("Flight size-express incorrect? \n");
#else
- SCTP_PRINTF("asoc->total_flight:%d cnt:%d\n",
+ SCTP_PRINTF("asoc->total_flight: %d cnt: %d\n",
entry_flight, entry_cnt);
- SCTP_PRINTF("Flight size-express incorrect F:%d I:%d R:%d Ab:%d ACK:%d\n",
+ SCTP_PRINTF("Flight size-express incorrect F: %d I: %d R: %d Ab: %d ACK: %d\n",
inflight, inbetween, resend, above, acked);
ret = 1;
#endif
@@ -3519,9 +3770,9 @@ sctp_window_probe_recovery(struct sctp_tcb *stcb,
if ((tp1->sent >= SCTP_DATAGRAM_ACKED) || (tp1->data == NULL)) {
/* TSN's skipped we do NOT move back. */
sctp_misc_ints(SCTP_FLIGHT_LOG_DWN_WP_FWD,
- tp1->whoTo->flight_size,
+ tp1->whoTo ? tp1->whoTo->flight_size : 0,
tp1->book_size,
- (uintptr_t) tp1->whoTo,
+ (uint32_t) (uintptr_t) tp1->whoTo,
tp1->rec.data.TSN_seq);
return;
}
@@ -3540,7 +3791,7 @@ sctp_window_probe_recovery(struct sctp_tcb *stcb,
sctp_misc_ints(SCTP_FLIGHT_LOG_DOWN_WP,
tp1->whoTo->flight_size,
tp1->book_size,
- (uintptr_t) tp1->whoTo,
+ (uint32_t) (uintptr_t) tp1->whoTo,
tp1->rec.data.TSN_seq);
}
}
@@ -3557,6 +3808,7 @@ sctp_express_handle_sack(struct sctp_tcb *stcb, uint32_t cumack,
int win_probe_recovered = 0;
int j, done_once = 0;
int rto_ok = 1;
+ uint32_t send_s;
if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_SACK_ARRIVALS_ENABLE) {
sctp_misc_ints(SCTP_SACK_LOG_EXPRESS, cumack,
@@ -3608,36 +3860,25 @@ sctp_express_handle_sack(struct sctp_tcb *stcb, uint32_t cumack,
(*stcb->asoc.cc_functions.sctp_cwnd_prepare_net_for_sack) (stcb, net);
}
}
- if (SCTP_BASE_SYSCTL(sctp_strict_sacks)) {
- uint32_t send_s;
-
- if (!TAILQ_EMPTY(&asoc->sent_queue)) {
- tp1 = TAILQ_LAST(&asoc->sent_queue,
- sctpchunk_listhead);
- send_s = tp1->rec.data.TSN_seq + 1;
- } else {
- send_s = asoc->sending_seq;
- }
- if (SCTP_TSN_GE(cumack, send_s)) {
-#ifndef INVARIANTS
- struct mbuf *op_err;
- char msg[SCTP_DIAG_INFO_LEN];
-
-#endif
-#ifdef INVARIANTS
- panic("Impossible sack 1");
-#else
+ if (!TAILQ_EMPTY(&asoc->sent_queue)) {
+ tp1 = TAILQ_LAST(&asoc->sent_queue,
+ sctpchunk_listhead);
+ send_s = tp1->rec.data.TSN_seq + 1;
+ } else {
+ send_s = asoc->sending_seq;
+ }
+ if (SCTP_TSN_GE(cumack, send_s)) {
+ struct mbuf *op_err;
+ char msg[SCTP_DIAG_INFO_LEN];
- *abort_now = 1;
- /* XXX */
- snprintf(msg, sizeof(msg), "Cum ack %8.8x greater or equal then TSN %8.8x",
- cumack, send_s);
- op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg);
- stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_25;
- sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED);
- return;
-#endif
- }
+ *abort_now = 1;
+ /* XXX */
+ snprintf(msg, sizeof(msg), "Cum ack %8.8x greater or equal than TSN %8.8x",
+ cumack, send_s);
+ op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg);
+ stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_21;
+ sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED);
+ return;
}
asoc->this_sack_highest_gap = cumack;
if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) {
@@ -3666,7 +3907,7 @@ sctp_express_handle_sack(struct sctp_tcb *stcb, uint32_t cumack,
sctp_misc_ints(SCTP_FLIGHT_LOG_DOWN_CA,
tp1->whoTo->flight_size,
tp1->book_size,
- (uintptr_t) tp1->whoTo,
+ (uint32_t) (uintptr_t) tp1->whoTo,
tp1->rec.data.TSN_seq);
}
sctp_flight_size_decrease(tp1);
@@ -3746,6 +3987,11 @@ sctp_express_handle_sack(struct sctp_tcb *stcb, uint32_t cumack,
#endif
}
}
+ if ((asoc->strmout[tp1->rec.data.stream_number].chunks_on_queues == 0) &&
+ (asoc->strmout[tp1->rec.data.stream_number].state == SCTP_STREAM_RESET_PENDING) &&
+ TAILQ_EMPTY(&asoc->strmout[tp1->rec.data.stream_number].outqueue)) {
+ asoc->trigger_reset = 1;
+ }
TAILQ_REMOVE(&asoc->sent_queue, tp1, sctp_next);
if (tp1->data) {
/* sa_ignore NO_NULL_CHK */
@@ -3830,7 +4076,9 @@ sctp_express_handle_sack(struct sctp_tcb *stcb, uint32_t cumack,
}
if (net->dest_state & SCTP_ADDR_PF) {
net->dest_state &= ~SCTP_ADDR_PF;
- sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_3);
+ sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT,
+ stcb->sctp_ep, stcb, net,
+ SCTP_FROM_SCTP_INDATA + SCTP_LOC_22);
sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net);
asoc->cc_functions.sctp_cwnd_update_exit_pf(stcb, net);
/* Done with this net */
@@ -3916,7 +4164,7 @@ again:
} else if (SCTP_OS_TIMER_PENDING(&net->rxt_timer.timer)) {
sctp_timer_stop(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep,
stcb, net,
- SCTP_FROM_SCTP_INDATA + SCTP_LOC_22);
+ SCTP_FROM_SCTP_INDATA + SCTP_LOC_23);
}
}
}
@@ -3957,28 +4205,8 @@ again:
if ((asoc->stream_queue_cnt == 1) &&
((asoc->state & SCTP_STATE_SHUTDOWN_PENDING) ||
(asoc->state & SCTP_STATE_SHUTDOWN_RECEIVED)) &&
- (asoc->locked_on_sending)
- ) {
- struct sctp_stream_queue_pending *sp;
-
- /*
- * I may be in a state where we got all across.. but
- * cannot write more due to a shutdown... we abort
- * since the user did not indicate EOR in this case.
- * The sp will be cleaned during free of the asoc.
- */
- sp = TAILQ_LAST(&((asoc->locked_on_sending)->outqueue),
- sctp_streamhead);
- if ((sp) && (sp->length == 0)) {
- /* Let cleanup code purge it */
- if (sp->msg_is_complete) {
- asoc->stream_queue_cnt--;
- } else {
- asoc->state |= SCTP_STATE_PARTIAL_MSG_LEFT;
- asoc->locked_on_sending = NULL;
- asoc->stream_queue_cnt--;
- }
- }
+ ((*asoc->ss_functions.sctp_ss_is_user_msgs_incomplete) (stcb, asoc))) {
+ asoc->state |= SCTP_STATE_PARTIAL_MSG_LEFT;
}
if ((asoc->state & SCTP_STATE_SHUTDOWN_PENDING) &&
(asoc->stream_queue_cnt == 0)) {
@@ -3992,6 +4220,7 @@ again:
op_err = sctp_generate_cause(SCTP_CAUSE_USER_INITIATED_ABT, "");
stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_24;
sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED);
+ return;
} else {
struct sctp_nets *netp;
@@ -4043,7 +4272,7 @@ again:
asoc->advanced_peer_ack_point = cumack;
}
/* PR-Sctp issues need to be addressed too */
- if ((asoc->peer_supports_prsctp) && (asoc->pr_sctp_cnt > 0)) {
+ if ((asoc->prsctp_supported) && (asoc->pr_sctp_cnt > 0)) {
struct sctp_tmit_chunk *lchk;
uint32_t old_adv_peer_ack_point;
@@ -4173,40 +4402,38 @@ sctp_handle_sack(struct mbuf *m, int offset_seg, int offset_dup,
sctp_log_fr(*dupdata, 0, 0, SCTP_FR_DUPED);
}
}
- if (SCTP_BASE_SYSCTL(sctp_strict_sacks)) {
- /* reality check */
- if (!TAILQ_EMPTY(&asoc->sent_queue)) {
- tp1 = TAILQ_LAST(&asoc->sent_queue,
- sctpchunk_listhead);
- send_s = tp1->rec.data.TSN_seq + 1;
- } else {
- tp1 = NULL;
- send_s = asoc->sending_seq;
- }
- if (SCTP_TSN_GE(cum_ack, send_s)) {
- struct mbuf *op_err;
- char msg[SCTP_DIAG_INFO_LEN];
+ /* reality check */
+ if (!TAILQ_EMPTY(&asoc->sent_queue)) {
+ tp1 = TAILQ_LAST(&asoc->sent_queue,
+ sctpchunk_listhead);
+ send_s = tp1->rec.data.TSN_seq + 1;
+ } else {
+ tp1 = NULL;
+ send_s = asoc->sending_seq;
+ }
+ if (SCTP_TSN_GE(cum_ack, send_s)) {
+ struct mbuf *op_err;
+ char msg[SCTP_DIAG_INFO_LEN];
- /*
- * no way, we have not even sent this TSN out yet.
- * Peer is hopelessly messed up with us.
- */
- SCTP_PRINTF("NEW cum_ack:%x send_s:%x is smaller or equal\n",
- cum_ack, send_s);
- if (tp1) {
- SCTP_PRINTF("Got send_s from tsn:%x + 1 of tp1:%p\n",
- tp1->rec.data.TSN_seq, (void *)tp1);
- }
- hopeless_peer:
- *abort_now = 1;
- /* XXX */
- snprintf(msg, sizeof(msg), "Cum ack %8.8x greater or equal then TSN %8.8x",
- cum_ack, send_s);
- op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg);
- stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_25;
- sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED);
- return;
- }
+ /*
+ * no way, we have not even sent this TSN out yet. Peer is
+ * hopelessly messed up with us.
+ */
+ SCTP_PRINTF("NEW cum_ack:%x send_s:%x is smaller or equal\n",
+ cum_ack, send_s);
+ if (tp1) {
+ SCTP_PRINTF("Got send_s from tsn:%x + 1 of tp1: %p\n",
+ tp1->rec.data.TSN_seq, (void *)tp1);
+ }
+hopeless_peer:
+ *abort_now = 1;
+ /* XXX */
+ snprintf(msg, sizeof(msg), "Cum ack %8.8x greater or equal than TSN %8.8x",
+ cum_ack, send_s);
+ op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg);
+ stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_25;
+ sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED);
+ return;
}
/**********************/
/* 1) check the range */
@@ -4299,7 +4526,7 @@ sctp_handle_sack(struct mbuf *m, int offset_seg, int offset_dup,
sctp_misc_ints(SCTP_FLIGHT_LOG_DOWN_CA,
tp1->whoTo->flight_size,
tp1->book_size,
- (uintptr_t) tp1->whoTo,
+ (uint32_t) (uintptr_t) tp1->whoTo,
tp1->rec.data.TSN_seq);
}
sctp_flight_size_decrease(tp1);
@@ -4416,20 +4643,18 @@ sctp_handle_sack(struct mbuf *m, int offset_seg, int offset_dup,
num_seg, num_nr_seg, &rto_ok)) {
wake_him++;
}
- if (SCTP_BASE_SYSCTL(sctp_strict_sacks)) {
+ /*
+ * validate the biggest_tsn_acked in the gap acks: it must be
+ * below send_s.
+ */
+ if (SCTP_TSN_GE(biggest_tsn_acked, send_s)) {
/*
- * validate the biggest_tsn_acked in the gap acks if
- * strict adherence is wanted.
+ * peer is either confused or we are under attack.
+ * We must abort.
*/
- if (SCTP_TSN_GE(biggest_tsn_acked, send_s)) {
- /*
- * peer is either confused or we are under
- * attack. We must abort.
- */
- SCTP_PRINTF("Hopeless peer! biggest_tsn_acked:%x largest seq:%x\n",
- biggest_tsn_acked, send_s);
- goto hopeless_peer;
- }
+ SCTP_PRINTF("Hopeless peer! biggest_tsn_acked:%x largest seq:%x\n",
+ biggest_tsn_acked, send_s);
+ goto hopeless_peer;
}
}
/*******************************************/
@@ -4469,6 +4694,11 @@ sctp_handle_sack(struct mbuf *m, int offset_seg, int offset_dup,
#endif
}
}
+ if ((asoc->strmout[tp1->rec.data.stream_number].chunks_on_queues == 0) &&
+ (asoc->strmout[tp1->rec.data.stream_number].state == SCTP_STREAM_RESET_PENDING) &&
+ TAILQ_EMPTY(&asoc->strmout[tp1->rec.data.stream_number].outqueue)) {
+ asoc->trigger_reset = 1;
+ }
TAILQ_REMOVE(&asoc->sent_queue, tp1, sctp_next);
if (PR_SCTP_ENABLED(tp1->flags)) {
if (asoc->pr_sctp_cnt != 0)
@@ -4480,7 +4710,7 @@ sctp_handle_sack(struct mbuf *m, int offset_seg, int offset_dup,
sctp_free_bufspace(stcb, asoc, tp1, 1);
sctp_m_freem(tp1->data);
tp1->data = NULL;
- if (asoc->peer_supports_prsctp && PR_SCTP_BUF_ENABLED(tp1->flags)) {
+ if (asoc->prsctp_supported && PR_SCTP_BUF_ENABLED(tp1->flags)) {
asoc->sent_queue_cnt_removeable--;
}
}
@@ -4497,7 +4727,7 @@ sctp_handle_sack(struct mbuf *m, int offset_seg, int offset_dup,
}
if (TAILQ_EMPTY(&asoc->sent_queue) && (asoc->total_flight > 0)) {
#ifdef INVARIANTS
- panic("Warning flight size is postive and should be 0");
+ panic("Warning flight size is positive and should be 0");
#else
SCTP_PRINTF("Warning flight size incorrect should be 0 is %d\n",
asoc->total_flight);
@@ -4567,7 +4797,7 @@ sctp_handle_sack(struct mbuf *m, int offset_seg, int offset_dup,
sctp_misc_ints(SCTP_FLIGHT_LOG_UP_REVOKE,
tp1->whoTo->flight_size,
tp1->book_size,
- (uintptr_t) tp1->whoTo,
+ (uint32_t) (uintptr_t) tp1->whoTo,
tp1->rec.data.TSN_seq);
}
sctp_flight_size_increase(tp1);
@@ -4620,7 +4850,9 @@ sctp_handle_sack(struct mbuf *m, int offset_seg, int offset_dup,
}
if (net->dest_state & SCTP_ADDR_PF) {
net->dest_state &= ~SCTP_ADDR_PF;
- sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_3);
+ sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT,
+ stcb->sctp_ep, stcb, net,
+ SCTP_FROM_SCTP_INDATA + SCTP_LOC_29);
sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net);
asoc->cc_functions.sctp_cwnd_update_exit_pf(stcb, net);
/* Done with this net */
@@ -4643,7 +4875,8 @@ sctp_handle_sack(struct mbuf *m, int offset_seg, int offset_dup,
TAILQ_FOREACH(net, &asoc->nets, sctp_next) {
/* stop all timers */
sctp_timer_stop(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep,
- stcb, net, SCTP_FROM_SCTP_INDATA + SCTP_LOC_30);
+ stcb, net,
+ SCTP_FROM_SCTP_INDATA + SCTP_LOC_30);
net->flight_size = 0;
net->partial_bytes_acked = 0;
}
@@ -4668,26 +4901,8 @@ sctp_handle_sack(struct mbuf *m, int offset_seg, int offset_dup,
if ((asoc->stream_queue_cnt == 1) &&
((asoc->state & SCTP_STATE_SHUTDOWN_PENDING) ||
(asoc->state & SCTP_STATE_SHUTDOWN_RECEIVED)) &&
- (asoc->locked_on_sending)
- ) {
- struct sctp_stream_queue_pending *sp;
-
- /*
- * I may be in a state where we got all across.. but
- * cannot write more due to a shutdown... we abort
- * since the user did not indicate EOR in this case.
- */
- sp = TAILQ_LAST(&((asoc->locked_on_sending)->outqueue),
- sctp_streamhead);
- if ((sp) && (sp->length == 0)) {
- asoc->locked_on_sending = NULL;
- if (sp->msg_is_complete) {
- asoc->stream_queue_cnt--;
- } else {
- asoc->state |= SCTP_STATE_PARTIAL_MSG_LEFT;
- asoc->stream_queue_cnt--;
- }
- }
+ ((*asoc->ss_functions.sctp_ss_is_user_msgs_incomplete) (stcb, asoc))) {
+ asoc->state |= SCTP_STATE_PARTIAL_MSG_LEFT;
}
if ((asoc->state & SCTP_STATE_SHUTDOWN_PENDING) &&
(asoc->stream_queue_cnt == 0)) {
@@ -4851,7 +5066,7 @@ again:
} else if (SCTP_OS_TIMER_PENDING(&net->rxt_timer.timer)) {
sctp_timer_stop(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep,
stcb, net,
- SCTP_FROM_SCTP_INDATA + SCTP_LOC_22);
+ SCTP_FROM_SCTP_INDATA + SCTP_LOC_32);
}
}
}
@@ -4892,7 +5107,7 @@ again:
asoc->advanced_peer_ack_point = cum_ack;
}
/* C2. try to further move advancedPeerAckPoint ahead */
- if ((asoc->peer_supports_prsctp) && (asoc->pr_sctp_cnt > 0)) {
+ if ((asoc->prsctp_supported) && (asoc->pr_sctp_cnt > 0)) {
struct sctp_tmit_chunk *lchk;
uint32_t old_adv_peer_ack_point;
@@ -4952,134 +5167,219 @@ sctp_kick_prsctp_reorder_queue(struct sctp_tcb *stcb,
{
struct sctp_queued_to_read *ctl, *nctl;
struct sctp_association *asoc;
- uint16_t tt;
+ uint32_t tt;
+ int need_reasm_check = 0, old;
asoc = &stcb->asoc;
tt = strmin->last_sequence_delivered;
+ if (asoc->idata_supported) {
+ old = 0;
+ } else {
+ old = 1;
+ }
/*
* First deliver anything prior to and including the stream no that
- * came in
+ * came in.
*/
- TAILQ_FOREACH_SAFE(ctl, &strmin->inqueue, next, nctl) {
- if (SCTP_SSN_GE(tt, ctl->sinfo_ssn)) {
+ TAILQ_FOREACH_SAFE(ctl, &strmin->inqueue, next_instrm, nctl) {
+ if (SCTP_MSGID_GE(old, tt, ctl->sinfo_ssn)) {
/* this is deliverable now */
- TAILQ_REMOVE(&strmin->inqueue, ctl, next);
- /* subtract pending on streams */
- asoc->size_on_all_streams -= ctl->length;
- sctp_ucount_decr(asoc->cnt_on_all_streams);
- /* deliver it to at least the delivery-q */
- if (stcb->sctp_socket) {
- sctp_mark_non_revokable(asoc, ctl->sinfo_tsn);
- sctp_add_to_readq(stcb->sctp_ep, stcb,
- ctl,
- &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_HELD, SCTP_SO_NOT_LOCKED);
+ if (((ctl->sinfo_flags >> 8) & SCTP_DATA_NOT_FRAG) == SCTP_DATA_NOT_FRAG) {
+ if (ctl->on_strm_q) {
+ if (ctl->on_strm_q == SCTP_ON_ORDERED) {
+ TAILQ_REMOVE(&strmin->inqueue, ctl, next_instrm);
+ } else if (ctl->on_strm_q == SCTP_ON_UNORDERED) {
+ TAILQ_REMOVE(&strmin->uno_inqueue, ctl, next_instrm);
+#ifdef INVARIANTS
+ } else {
+ panic("strmin: %p ctl: %p unknown %d",
+ strmin, ctl, ctl->on_strm_q);
+#endif
+ }
+ ctl->on_strm_q = 0;
+ }
+ /* subtract pending on streams */
+ asoc->size_on_all_streams -= ctl->length;
+ sctp_ucount_decr(asoc->cnt_on_all_streams);
+ /* deliver it to at least the delivery-q */
+ if (stcb->sctp_socket) {
+ sctp_mark_non_revokable(asoc, ctl->sinfo_tsn);
+ sctp_add_to_readq(stcb->sctp_ep, stcb,
+ ctl,
+ &stcb->sctp_socket->so_rcv,
+ 1, SCTP_READ_LOCK_HELD,
+ SCTP_SO_NOT_LOCKED);
+ }
+ } else {
+ /* It's a fragmented message */
+ if (ctl->first_frag_seen) {
+ /*
+ * Make it so this is next to
+ * deliver, we restore later
+ */
+ strmin->last_sequence_delivered = ctl->sinfo_ssn - 1;
+ need_reasm_check = 1;
+ break;
+ }
}
} else {
/* no more delivery now. */
break;
}
}
+ if (need_reasm_check) {
+ int ret;
+
+ ret = sctp_deliver_reasm_check(stcb, &stcb->asoc, strmin, SCTP_READ_LOCK_HELD);
+ if (SCTP_MSGID_GT(old, tt, strmin->last_sequence_delivered)) {
+ /* Restore the next to deliver unless we are ahead */
+ strmin->last_sequence_delivered = tt;
+ }
+ if (ret == 0) {
+ /* Left the front Partial one on */
+ return;
+ }
+ need_reasm_check = 0;
+ }
/*
* now we must deliver things in queue the normal way if any are
* now ready.
*/
tt = strmin->last_sequence_delivered + 1;
- TAILQ_FOREACH_SAFE(ctl, &strmin->inqueue, next, nctl) {
+ TAILQ_FOREACH_SAFE(ctl, &strmin->inqueue, next_instrm, nctl) {
if (tt == ctl->sinfo_ssn) {
- /* this is deliverable now */
- TAILQ_REMOVE(&strmin->inqueue, ctl, next);
- /* subtract pending on streams */
- asoc->size_on_all_streams -= ctl->length;
- sctp_ucount_decr(asoc->cnt_on_all_streams);
- /* deliver it to at least the delivery-q */
- strmin->last_sequence_delivered = ctl->sinfo_ssn;
- if (stcb->sctp_socket) {
- sctp_mark_non_revokable(asoc, ctl->sinfo_tsn);
- sctp_add_to_readq(stcb->sctp_ep, stcb,
- ctl,
- &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_HELD, SCTP_SO_NOT_LOCKED);
+ if (((ctl->sinfo_flags >> 8) & SCTP_DATA_NOT_FRAG) == SCTP_DATA_NOT_FRAG) {
+ /* this is deliverable now */
+ if (ctl->on_strm_q) {
+ if (ctl->on_strm_q == SCTP_ON_ORDERED) {
+ TAILQ_REMOVE(&strmin->inqueue, ctl, next_instrm);
+ } else if (ctl->on_strm_q == SCTP_ON_UNORDERED) {
+ TAILQ_REMOVE(&strmin->uno_inqueue, ctl, next_instrm);
+#ifdef INVARIANTS
+ } else {
+ panic("strmin: %p ctl: %p unknown %d",
+ strmin, ctl, ctl->on_strm_q);
+#endif
+ }
+ ctl->on_strm_q = 0;
+ }
+ /* subtract pending on streams */
+ asoc->size_on_all_streams -= ctl->length;
+ sctp_ucount_decr(asoc->cnt_on_all_streams);
+ /* deliver it to at least the delivery-q */
+ strmin->last_sequence_delivered = ctl->sinfo_ssn;
+ if (stcb->sctp_socket) {
+ sctp_mark_non_revokable(asoc, ctl->sinfo_tsn);
+ sctp_add_to_readq(stcb->sctp_ep, stcb,
+ ctl,
+ &stcb->sctp_socket->so_rcv, 1,
+ SCTP_READ_LOCK_HELD, SCTP_SO_NOT_LOCKED);
+ }
+ tt = strmin->last_sequence_delivered + 1;
+ } else {
+ /* It's a fragmented message */
+ if (ctl->first_frag_seen) {
+ /*
+ * Make it so this is next to
+ * deliver
+ */
+ strmin->last_sequence_delivered = ctl->sinfo_ssn - 1;
+ need_reasm_check = 1;
+ break;
+ }
}
- tt = strmin->last_sequence_delivered + 1;
} else {
break;
}
}
+ if (need_reasm_check) {
+ (void)sctp_deliver_reasm_check(stcb, &stcb->asoc, strmin, SCTP_READ_LOCK_HELD);
+ }
}
+
+
static void
sctp_flush_reassm_for_str_seq(struct sctp_tcb *stcb,
struct sctp_association *asoc,
- uint16_t stream, uint16_t seq)
+ uint16_t stream, uint32_t seq, int ordered, int old, uint32_t cumtsn)
{
+ struct sctp_queued_to_read *control;
+ struct sctp_stream_in *strm;
struct sctp_tmit_chunk *chk, *nchk;
+ int cnt_removed = 0;
- /* For each one on here see if we need to toss it */
/*
- * For now large messages held on the reasmqueue that are complete
+ * For now large messages held on the stream reasm that are complete
* will be tossed too. We could in theory do more work to spin
* through and stop after dumping one msg aka seeing the start of a
* new msg at the head, and call the delivery function... to see if
* it can be delivered... But for now we just dump everything on the
* queue.
*/
- TAILQ_FOREACH_SAFE(chk, &asoc->reasmqueue, sctp_next, nchk) {
- /*
- * Do not toss it if on a different stream or marked for
- * unordered delivery in which case the stream sequence
- * number has no meaning.
- */
- if ((chk->rec.data.stream_number != stream) ||
- ((chk->rec.data.rcv_flags & SCTP_DATA_UNORDERED) == SCTP_DATA_UNORDERED)) {
- continue;
- }
- if (chk->rec.data.stream_seq == seq) {
- /* It needs to be tossed */
- TAILQ_REMOVE(&asoc->reasmqueue, chk, sctp_next);
- if (SCTP_TSN_GT(chk->rec.data.TSN_seq, asoc->tsn_last_delivered)) {
- asoc->tsn_last_delivered = chk->rec.data.TSN_seq;
- asoc->str_of_pdapi = chk->rec.data.stream_number;
- asoc->ssn_of_pdapi = chk->rec.data.stream_seq;
- asoc->fragment_flags = chk->rec.data.rcv_flags;
- }
- asoc->size_on_reasm_queue -= chk->send_size;
- sctp_ucount_decr(asoc->cnt_on_reasm_queue);
-
- /* Clear up any stream problem */
- if ((chk->rec.data.rcv_flags & SCTP_DATA_UNORDERED) != SCTP_DATA_UNORDERED &&
- SCTP_SSN_GT(chk->rec.data.stream_seq, asoc->strmin[chk->rec.data.stream_number].last_sequence_delivered)) {
- /*
- * We must dump forward this streams
- * sequence number if the chunk is not
- * unordered that is being skipped. There is
- * a chance that if the peer does not
- * include the last fragment in its FWD-TSN
- * we WILL have a problem here since you
- * would have a partial chunk in queue that
- * may not be deliverable. Also if a Partial
- * delivery API as started the user may get
- * a partial chunk. The next read returning
- * a new chunk... really ugly but I see no
- * way around it! Maybe a notify??
- */
- asoc->strmin[chk->rec.data.stream_number].last_sequence_delivered = chk->rec.data.stream_seq;
- }
- if (chk->data) {
- sctp_m_freem(chk->data);
- chk->data = NULL;
+ strm = &asoc->strmin[stream];
+ control = sctp_find_reasm_entry(strm, (uint32_t) seq, ordered, old);
+ if (control == NULL) {
+ /* Not found */
+ return;
+ }
+ TAILQ_FOREACH_SAFE(chk, &control->reasm, sctp_next, nchk) {
+ /* Purge hanging chunks */
+ if (old && (ordered == 0)) {
+ if (SCTP_TSN_GT(chk->rec.data.TSN_seq, cumtsn)) {
+ break;
}
- sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED);
- } else if (SCTP_SSN_GT(chk->rec.data.stream_seq, seq)) {
- /*
- * If the stream_seq is > than the purging one, we
- * are done
- */
- break;
}
+ cnt_removed++;
+ TAILQ_REMOVE(&control->reasm, chk, sctp_next);
+ asoc->size_on_reasm_queue -= chk->send_size;
+ sctp_ucount_decr(asoc->cnt_on_reasm_queue);
+ if (chk->data) {
+ sctp_m_freem(chk->data);
+ chk->data = NULL;
+ }
+ sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED);
+ }
+ if (!TAILQ_EMPTY(&control->reasm)) {
+ /* This has to be old data, unordered */
+ if (control->data) {
+ sctp_m_freem(control->data);
+ control->data = NULL;
+ }
+ sctp_reset_a_control(control, stcb->sctp_ep, cumtsn);
+ chk = TAILQ_FIRST(&control->reasm);
+ if (chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) {
+ TAILQ_REMOVE(&control->reasm, chk, sctp_next);
+ sctp_add_chk_to_control(control, strm, stcb, asoc,
+ chk, SCTP_READ_LOCK_HELD);
+ }
+ sctp_deliver_reasm_check(stcb, asoc, strm, SCTP_READ_LOCK_HELD);
+ return;
+ }
+ if (control->on_strm_q == SCTP_ON_ORDERED) {
+ TAILQ_REMOVE(&strm->inqueue, control, next_instrm);
+ control->on_strm_q = 0;
+ } else if (control->on_strm_q == SCTP_ON_UNORDERED) {
+ TAILQ_REMOVE(&strm->uno_inqueue, control, next_instrm);
+ control->on_strm_q = 0;
+#ifdef INVARIANTS
+ } else if (control->on_strm_q) {
+ panic("strm: %p ctl: %p unknown %d",
+ strm, control, control->on_strm_q);
+#endif
+ }
+ control->on_strm_q = 0;
+ if (control->on_read_q == 0) {
+ sctp_free_remote_addr(control->whoFrom);
+ if (control->data) {
+ sctp_m_freem(control->data);
+ control->data = NULL;
+ }
+ sctp_free_a_readq(stcb, control);
}
}
-
void
sctp_handle_forward_tsn(struct sctp_tcb *stcb,
struct sctp_forward_tsn_chunk *fwd,
@@ -5102,7 +5402,6 @@ sctp_handle_forward_tsn(struct sctp_tcb *stcb,
unsigned int i, fwd_sz, m_size;
uint32_t str_seq;
struct sctp_stream_in *strm;
- struct sctp_tmit_chunk *chk, *nchk;
struct sctp_queued_to_read *ctl, *sv;
asoc = &stcb->asoc;
@@ -5172,66 +5471,17 @@ sctp_handle_forward_tsn(struct sctp_tcb *stcb,
/*************************************************************/
/* 2. Clear up re-assembly queue */
/*************************************************************/
- /*
- * First service it if pd-api is up, just in case we can progress it
- * forward
- */
- if (asoc->fragmented_delivery_inprogress) {
- sctp_service_reassembly(stcb, asoc);
- }
- /* For each one on here see if we need to toss it */
- /*
- * For now large messages held on the reasmqueue that are complete
- * will be tossed too. We could in theory do more work to spin
- * through and stop after dumping one msg aka seeing the start of a
- * new msg at the head, and call the delivery function... to see if
- * it can be delivered... But for now we just dump everything on the
- * queue.
- */
- TAILQ_FOREACH_SAFE(chk, &asoc->reasmqueue, sctp_next, nchk) {
- if (SCTP_TSN_GE(new_cum_tsn, chk->rec.data.TSN_seq)) {
- /* It needs to be tossed */
- TAILQ_REMOVE(&asoc->reasmqueue, chk, sctp_next);
- if (SCTP_TSN_GT(chk->rec.data.TSN_seq, asoc->tsn_last_delivered)) {
- asoc->tsn_last_delivered = chk->rec.data.TSN_seq;
- asoc->str_of_pdapi = chk->rec.data.stream_number;
- asoc->ssn_of_pdapi = chk->rec.data.stream_seq;
- asoc->fragment_flags = chk->rec.data.rcv_flags;
- }
- asoc->size_on_reasm_queue -= chk->send_size;
- sctp_ucount_decr(asoc->cnt_on_reasm_queue);
-
- /* Clear up any stream problem */
- if ((chk->rec.data.rcv_flags & SCTP_DATA_UNORDERED) != SCTP_DATA_UNORDERED &&
- SCTP_SSN_GT(chk->rec.data.stream_seq, asoc->strmin[chk->rec.data.stream_number].last_sequence_delivered)) {
- /*
- * We must dump forward this streams
- * sequence number if the chunk is not
- * unordered that is being skipped. There is
- * a chance that if the peer does not
- * include the last fragment in its FWD-TSN
- * we WILL have a problem here since you
- * would have a partial chunk in queue that
- * may not be deliverable. Also if a Partial
- * delivery API as started the user may get
- * a partial chunk. The next read returning
- * a new chunk... really ugly but I see no
- * way around it! Maybe a notify??
- */
- asoc->strmin[chk->rec.data.stream_number].last_sequence_delivered = chk->rec.data.stream_seq;
- }
- if (chk->data) {
- sctp_m_freem(chk->data);
- chk->data = NULL;
- }
- sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED);
- } else {
- /*
- * Ok we have gone beyond the end of the fwd-tsn's
- * mark.
- */
- break;
+
+ /* This is now done as part of clearing up the stream/seq */
+ if (asoc->idata_supported == 0) {
+ uint16_t sid;
+
+ /* Flush all the un-ordered data based on cum-tsn */
+ SCTP_INP_READ_LOCK(stcb->sctp_ep);
+ for (sid = 0; sid < asoc->streamincnt; sid++) {
+ sctp_flush_reassm_for_str_seq(stcb, asoc, sid, 0, 0, 1, new_cum_tsn);
}
+ SCTP_INP_READ_UNLOCK(stcb->sctp_ep);
}
/*******************************************************/
/* 3. Update the PR-stream re-ordering queues and fix */
@@ -5241,27 +5491,53 @@ sctp_handle_forward_tsn(struct sctp_tcb *stcb,
if (m && fwd_sz) {
/* New method. */
unsigned int num_str;
+ uint32_t sequence;
+ uint16_t stream;
+ uint16_t ordered, flags;
+ int old;
struct sctp_strseq *stseq, strseqbuf;
+ struct sctp_strseq_mid *stseq_m, strseqbuf_m;
offset += sizeof(*fwd);
SCTP_INP_READ_LOCK(stcb->sctp_ep);
- num_str = fwd_sz / sizeof(struct sctp_strseq);
+ if (asoc->idata_supported) {
+ num_str = fwd_sz / sizeof(struct sctp_strseq_mid);
+ old = 0;
+ } else {
+ num_str = fwd_sz / sizeof(struct sctp_strseq);
+ old = 1;
+ }
for (i = 0; i < num_str; i++) {
- uint16_t st;
-
- stseq = (struct sctp_strseq *)sctp_m_getptr(m, offset,
- sizeof(struct sctp_strseq),
- (uint8_t *) & strseqbuf);
- offset += sizeof(struct sctp_strseq);
- if (stseq == NULL) {
- break;
+ if (asoc->idata_supported) {
+ stseq_m = (struct sctp_strseq_mid *)sctp_m_getptr(m, offset,
+ sizeof(struct sctp_strseq_mid),
+ (uint8_t *) & strseqbuf_m);
+ offset += sizeof(struct sctp_strseq_mid);
+ if (stseq_m == NULL) {
+ break;
+ }
+ stream = ntohs(stseq_m->stream);
+ sequence = ntohl(stseq_m->msg_id);
+ flags = ntohs(stseq_m->flags);
+ if (flags & PR_SCTP_UNORDERED_FLAG) {
+ ordered = 0;
+ } else {
+ ordered = 1;
+ }
+ } else {
+ stseq = (struct sctp_strseq *)sctp_m_getptr(m, offset,
+ sizeof(struct sctp_strseq),
+ (uint8_t *) & strseqbuf);
+ offset += sizeof(struct sctp_strseq);
+ if (stseq == NULL) {
+ break;
+ }
+ stream = ntohs(stseq->stream);
+ sequence = (uint32_t) ntohs(stseq->sequence);
+ ordered = 1;
}
/* Convert */
- st = ntohs(stseq->stream);
- stseq->stream = st;
- st = ntohs(stseq->sequence);
- stseq->sequence = st;
/* now process */
@@ -5270,12 +5546,12 @@ sctp_handle_forward_tsn(struct sctp_tcb *stcb,
* queue where its not all delivered. If we find it
* we transmute the read entry into a PDI_ABORTED.
*/
- if (stseq->stream >= asoc->streamincnt) {
+ if (stream >= asoc->streamincnt) {
/* screwed up streams, stop! */
break;
}
- if ((asoc->str_of_pdapi == stseq->stream) &&
- (asoc->ssn_of_pdapi == stseq->sequence)) {
+ if ((asoc->str_of_pdapi == stream) &&
+ (asoc->ssn_of_pdapi == sequence)) {
/*
* If this is the one we were partially
* delivering now then we no longer are.
@@ -5284,14 +5560,38 @@ sctp_handle_forward_tsn(struct sctp_tcb *stcb,
*/
asoc->fragmented_delivery_inprogress = 0;
}
- sctp_flush_reassm_for_str_seq(stcb, asoc, stseq->stream, stseq->sequence);
+ strm = &asoc->strmin[stream];
+ if (asoc->idata_supported == 0) {
+ uint16_t strm_at;
+
+ for (strm_at = strm->last_sequence_delivered; SCTP_MSGID_GE(1, sequence, strm_at); strm_at++) {
+ sctp_flush_reassm_for_str_seq(stcb, asoc, stream, strm_at, ordered, old, new_cum_tsn);
+ }
+ } else {
+ uint32_t strm_at;
+
+ for (strm_at = strm->last_sequence_delivered; SCTP_MSGID_GE(0, sequence, strm_at); strm_at++) {
+ sctp_flush_reassm_for_str_seq(stcb, asoc, stream, strm_at, ordered, old, new_cum_tsn);
+ }
+ }
TAILQ_FOREACH(ctl, &stcb->sctp_ep->read_queue, next) {
- if ((ctl->sinfo_stream == stseq->stream) &&
- (ctl->sinfo_ssn == stseq->sequence)) {
- str_seq = (stseq->stream << 16) | stseq->sequence;
- ctl->end_added = 1;
+ if ((ctl->sinfo_stream == stream) &&
+ (ctl->sinfo_ssn == sequence)) {
+ str_seq = (stream << 16) | (0x0000ffff & sequence);
ctl->pdapi_aborted = 1;
sv = stcb->asoc.control_pdapi;
+ ctl->end_added = 1;
+ if (ctl->on_strm_q == SCTP_ON_ORDERED) {
+ TAILQ_REMOVE(&strm->inqueue, ctl, next_instrm);
+ } else if (ctl->on_strm_q == SCTP_ON_UNORDERED) {
+ TAILQ_REMOVE(&strm->uno_inqueue, ctl, next_instrm);
+#ifdef INVARIANTS
+ } else if (ctl->on_strm_q) {
+ panic("strm: %p ctl: %p unknown %d",
+ strm, ctl, ctl->on_strm_q);
+#endif
+ }
+ ctl->on_strm_q = 0;
stcb->asoc.control_pdapi = ctl;
sctp_ulp_notify(SCTP_NOTIFY_PARTIAL_DELVIERY_INDICATION,
stcb,
@@ -5300,16 +5600,15 @@ sctp_handle_forward_tsn(struct sctp_tcb *stcb,
SCTP_SO_NOT_LOCKED);
stcb->asoc.control_pdapi = sv;
break;
- } else if ((ctl->sinfo_stream == stseq->stream) &&
- SCTP_SSN_GT(ctl->sinfo_ssn, stseq->sequence)) {
+ } else if ((ctl->sinfo_stream == stream) &&
+ SCTP_MSGID_GT(old, ctl->sinfo_ssn, sequence)) {
/* We are past our victim SSN */
break;
}
}
- strm = &asoc->strmin[stseq->stream];
- if (SCTP_SSN_GT(stseq->sequence, strm->last_sequence_delivered)) {
+ if (SCTP_MSGID_GT(old, sequence, strm->last_sequence_delivered)) {
/* Update the sequence number */
- strm->last_sequence_delivered = stseq->sequence;
+ strm->last_sequence_delivered = sequence;
}
/* now kick the stream the new way */
/* sa_ignore NO_NULL_CHK */
@@ -5321,10 +5620,4 @@ sctp_handle_forward_tsn(struct sctp_tcb *stcb,
* Now slide thing forward.
*/
sctp_slide_mapping_arrays(stcb);
-
- if (!TAILQ_EMPTY(&asoc->reasmqueue)) {
- /* now lets kick out and check for more fragmented delivery */
- /* sa_ignore NO_NULL_CHK */
- sctp_deliver_reasm_check(stcb, &stcb->asoc);
- }
}
diff --git a/freebsd/sys/netinet/sctp_indata.h b/freebsd/sys/netinet/sctp_indata.h
index 79a86e2a..162ca905 100644
--- a/freebsd/sys/netinet/sctp_indata.h
+++ b/freebsd/sys/netinet/sctp_indata.h
@@ -43,35 +43,31 @@ sctp_build_readq_entry(struct sctp_tcb *stcb,
struct sctp_nets *net,
uint32_t tsn, uint32_t ppid,
uint32_t context, uint16_t stream_no,
- uint16_t stream_seq, uint8_t flags,
+ uint32_t stream_seq, uint8_t flags,
struct mbuf *dm);
-#define sctp_build_readq_entry_mac(_ctl, in_it, context, net, tsn, ppid, stream_no, stream_seq, flags, dm) do { \
+#define sctp_build_readq_entry_mac(_ctl, in_it, context, net, tsn, ppid, stream_no, stream_seq, flags, dm, tfsn, msgid) do { \
if (_ctl) { \
atomic_add_int(&((net)->ref_count), 1); \
+ memset(_ctl, 0, sizeof(struct sctp_queued_to_read)); \
(_ctl)->sinfo_stream = stream_no; \
(_ctl)->sinfo_ssn = stream_seq; \
+ TAILQ_INIT(&_ctl->reasm); \
+ (_ctl)->top_fsn = tfsn; \
+ (_ctl)->msg_id = msgid; \
(_ctl)->sinfo_flags = (flags << 8); \
(_ctl)->sinfo_ppid = ppid; \
(_ctl)->sinfo_context = context; \
- (_ctl)->sinfo_timetolive = 0; \
+ (_ctl)->fsn_included = 0xffffffff; \
+ (_ctl)->top_fsn = 0xffffffff; \
(_ctl)->sinfo_tsn = tsn; \
(_ctl)->sinfo_cumtsn = tsn; \
(_ctl)->sinfo_assoc_id = sctp_get_associd((in_it)); \
- (_ctl)->length = 0; \
- (_ctl)->held_length = 0; \
(_ctl)->whoFrom = net; \
(_ctl)->data = dm; \
- (_ctl)->tail_mbuf = NULL; \
- (_ctl)->aux_data = NULL; \
(_ctl)->stcb = (in_it); \
(_ctl)->port_from = (in_it)->rport; \
- (_ctl)->spec_flags = 0; \
- (_ctl)->do_not_ref_stcb = 0; \
- (_ctl)->end_added = 0; \
- (_ctl)->pdapi_aborted = 0; \
- (_ctl)->some_taken = 0; \
} \
} while (0)
@@ -112,12 +108,8 @@ void
int
sctp_process_data(struct mbuf **, int, int *, int,
- struct sockaddr *src, struct sockaddr *dst,
- struct sctphdr *,
struct sctp_inpcb *, struct sctp_tcb *,
- struct sctp_nets *, uint32_t *,
- uint8_t, uint32_t,
- uint32_t, uint16_t);
+ struct sctp_nets *, uint32_t *);
void sctp_slide_mapping_arrays(struct sctp_tcb *stcb);
diff --git a/freebsd/sys/netinet/sctp_input.c b/freebsd/sys/netinet/sctp_input.c
index 9e35c882..621784ea 100644
--- a/freebsd/sys/netinet/sctp_input.c
+++ b/freebsd/sys/netinet/sctp_input.c
@@ -49,7 +49,9 @@ __FBSDID("$FreeBSD$");
#include <netinet/sctp_bsd_addr.h>
#include <netinet/sctp_timer.h>
#include <netinet/sctp_crc32.h>
+#if defined(INET) || defined(INET6)
#include <netinet/udp.h>
+#endif
#include <sys/smp.h>
@@ -85,8 +87,8 @@ static void
sctp_handle_init(struct mbuf *m, int iphlen, int offset,
struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh,
struct sctp_init_chunk *cp, struct sctp_inpcb *inp,
- struct sctp_tcb *stcb, int *abort_no_unlock,
- uint8_t use_mflowid, uint32_t mflowid,
+ struct sctp_tcb *stcb, struct sctp_nets *net, int *abort_no_unlock,
+ uint8_t mflowtype, uint32_t mflowid,
uint32_t vrf_id, uint16_t port)
{
struct sctp_init *init;
@@ -101,7 +103,7 @@ sctp_handle_init(struct mbuf *m, int iphlen, int offset,
if (ntohs(cp->ch.chunk_length) < sizeof(struct sctp_init_chunk)) {
op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, "");
sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err,
- use_mflowid, mflowid,
+ mflowtype, mflowid,
vrf_id, port);
if (stcb)
*abort_no_unlock = 1;
@@ -113,7 +115,7 @@ sctp_handle_init(struct mbuf *m, int iphlen, int offset,
/* protocol error... send abort */
op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, "");
sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err,
- use_mflowid, mflowid,
+ mflowtype, mflowid,
vrf_id, port);
if (stcb)
*abort_no_unlock = 1;
@@ -123,7 +125,7 @@ sctp_handle_init(struct mbuf *m, int iphlen, int offset,
/* invalid parameter... send abort */
op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, "");
sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err,
- use_mflowid, mflowid,
+ mflowtype, mflowid,
vrf_id, port);
if (stcb)
*abort_no_unlock = 1;
@@ -133,7 +135,7 @@ sctp_handle_init(struct mbuf *m, int iphlen, int offset,
/* protocol error... send abort */
op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, "");
sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err,
- use_mflowid, mflowid,
+ mflowtype, mflowid,
vrf_id, port);
if (stcb)
*abort_no_unlock = 1;
@@ -143,7 +145,7 @@ sctp_handle_init(struct mbuf *m, int iphlen, int offset,
/* protocol error... send abort */
op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, "");
sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err,
- use_mflowid, mflowid,
+ mflowtype, mflowid,
vrf_id, port);
if (stcb)
*abort_no_unlock = 1;
@@ -155,7 +157,7 @@ sctp_handle_init(struct mbuf *m, int iphlen, int offset,
op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code),
"Problem with AUTH parameters");
sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err,
- use_mflowid, mflowid,
+ mflowtype, mflowid,
vrf_id, port);
if (stcb)
*abort_no_unlock = 1;
@@ -186,7 +188,7 @@ sctp_handle_init(struct mbuf *m, int iphlen, int offset,
op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code),
"No listener");
sctp_send_abort(m, iphlen, src, dst, sh, 0, op_err,
- use_mflowid, mflowid,
+ mflowtype, mflowid, inp->fibnum,
vrf_id, port);
}
goto outnow;
@@ -198,9 +200,9 @@ sctp_handle_init(struct mbuf *m, int iphlen, int offset,
sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_CONTROL_PROC, SCTP_SO_NOT_LOCKED);
} else {
SCTPDBG(SCTP_DEBUG_INPUT3, "sctp_handle_init: sending INIT-ACK\n");
- sctp_send_initiate_ack(inp, stcb, m, iphlen, offset, src, dst,
- sh, cp,
- use_mflowid, mflowid,
+ sctp_send_initiate_ack(inp, stcb, net, m, iphlen, offset,
+ src, dst, sh, cp,
+ mflowtype, mflowid,
vrf_id, port,
((stcb == NULL) ? SCTP_HOLDS_LOCK : SCTP_NOT_LOCKED));
}
@@ -221,18 +223,18 @@ sctp_is_there_unsent_data(struct sctp_tcb *stcb, int so_locked
#endif
)
{
- int unsent_data = 0;
+ int unsent_data;
unsigned int i;
struct sctp_stream_queue_pending *sp;
struct sctp_association *asoc;
/*
- * This function returns the number of streams that have true unsent
- * data on them. Note that as it looks through it will clean up any
- * places that have old data that has been sent but left at top of
- * stream queue.
+ * This function returns if any stream has true unsent data on it.
+ * Note that as it looks through it will clean up any places that
+ * have old data that has been sent but left at top of stream queue.
*/
asoc = &stcb->asoc;
+ unsent_data = 0;
SCTP_TCB_SEND_LOCK(stcb);
if (!stcb->asoc.ss_functions.sctp_ss_is_empty(stcb, asoc)) {
/* Check to see if some data queued */
@@ -260,6 +262,7 @@ sctp_is_there_unsent_data(struct sctp_tcb *stcb, int so_locked
}
atomic_subtract_int(&stcb->asoc.stream_queue_cnt, 1);
TAILQ_REMOVE(&stcb->asoc.strmout[i].outqueue, sp, next);
+ stcb->asoc.ss_functions.sctp_ss_remove_from_stream(stcb, asoc, &asoc->strmout[i], sp, 1);
if (sp->net) {
sctp_free_remote_addr(sp->net);
sp->net = NULL;
@@ -269,8 +272,13 @@ sctp_is_there_unsent_data(struct sctp_tcb *stcb, int so_locked
sp->data = NULL;
}
sctp_free_a_strmoq(stcb, sp, so_locked);
+ if (!TAILQ_EMPTY(&stcb->asoc.strmout[i].outqueue)) {
+ unsent_data++;
+ }
} else {
unsent_data++;
+ }
+ if (unsent_data > 0) {
break;
}
}
@@ -341,8 +349,9 @@ sctp_process_init(struct sctp_init_chunk *cp, struct sctp_tcb *stcb)
for (i = newcnt; i < asoc->pre_open_streams; i++) {
outs = &asoc->strmout[i];
TAILQ_FOREACH_SAFE(sp, &outs->outqueue, next, nsp) {
+ atomic_subtract_int(&stcb->asoc.stream_queue_cnt, 1);
TAILQ_REMOVE(&outs->outqueue, sp, next);
- asoc->stream_queue_cnt--;
+ stcb->asoc.ss_functions.sctp_ss_remove_from_stream(stcb, asoc, outs, sp, 1);
sctp_ulp_notify(SCTP_NOTIFY_SPECIAL_SP_FAIL,
stcb, 0, sp, SCTP_SO_NOT_LOCKED);
if (sp->data) {
@@ -357,14 +366,19 @@ sctp_process_init(struct sctp_init_chunk *cp, struct sctp_tcb *stcb)
sctp_free_a_strmoq(stcb, sp, SCTP_SO_NOT_LOCKED);
/* sa_ignore FREED_MEMORY */
}
+ outs->state = SCTP_STREAM_CLOSED;
}
}
/* cut back the count */
asoc->pre_open_streams = newcnt;
}
SCTP_TCB_SEND_UNLOCK(stcb);
- asoc->strm_realoutsize = asoc->streamoutcnt = asoc->pre_open_streams;
-
+ asoc->streamoutcnt = asoc->pre_open_streams;
+ if (asoc->strmout) {
+ for (i = 0; i < asoc->streamoutcnt; i++) {
+ asoc->strmout[i].state = SCTP_STREAM_OPEN;
+ }
+ }
/* EY - nr_sack: initialize highest tsn in nr_mapping_array */
asoc->highest_tsn_inside_nr_map = asoc->highest_tsn_inside_map;
if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MAP_LOGGING_ENABLE) {
@@ -381,17 +395,9 @@ sctp_process_init(struct sctp_init_chunk *cp, struct sctp_tcb *stcb)
if (asoc->strmin != NULL) {
/* Free the old ones */
- struct sctp_queued_to_read *ctl, *nctl;
-
for (i = 0; i < asoc->streamincnt; i++) {
- TAILQ_FOREACH_SAFE(ctl, &asoc->strmin[i].inqueue, next, nctl) {
- TAILQ_REMOVE(&asoc->strmin[i].inqueue, ctl, next);
- sctp_free_remote_addr(ctl->whoFrom);
- ctl->whoFrom = NULL;
- sctp_m_freem(ctl->data);
- ctl->data = NULL;
- sctp_free_a_readq(stcb, ctl);
- }
+ sctp_clean_up_stream(stcb, &asoc->strmin[i].inqueue);
+ sctp_clean_up_stream(stcb, &asoc->strmin[i].uno_inqueue);
}
SCTP_FREE(asoc->strmin, SCTP_M_STRMI);
}
@@ -409,8 +415,10 @@ sctp_process_init(struct sctp_init_chunk *cp, struct sctp_tcb *stcb)
}
for (i = 0; i < asoc->streamincnt; i++) {
asoc->strmin[i].stream_no = i;
- asoc->strmin[i].last_sequence_delivered = 0xffff;
+ asoc->strmin[i].last_sequence_delivered = 0xffffffff;
TAILQ_INIT(&asoc->strmin[i].inqueue);
+ TAILQ_INIT(&asoc->strmin[i].uno_inqueue);
+ asoc->strmin[i].pd_api_started = 0;
asoc->strmin[i].delivery_started = 0;
}
/*
@@ -434,7 +442,7 @@ sctp_process_init_ack(struct mbuf *m, int iphlen, int offset,
struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh,
struct sctp_init_ack_chunk *cp, struct sctp_tcb *stcb,
struct sctp_nets *net, int *abort_no_unlock,
- uint8_t use_mflowid, uint32_t mflowid,
+ uint8_t mflowtype, uint32_t mflowid,
uint32_t vrf_id)
{
struct sctp_association *asoc;
@@ -466,7 +474,7 @@ sctp_process_init_ack(struct mbuf *m, int iphlen, int offset,
/* load all addresses */
if ((retval = sctp_load_addresses_from_init(stcb, m,
(offset + sizeof(struct sctp_init_chunk)), initack_limit,
- src, dst, NULL))) {
+ src, dst, NULL, stcb->asoc.port))) {
op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code),
"Problem with address parameters");
SCTPDBG(SCTP_DEBUG_INPUT1,
@@ -474,13 +482,13 @@ sctp_process_init_ack(struct mbuf *m, int iphlen, int offset,
retval);
sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen,
src, dst, sh, op_err,
- use_mflowid, mflowid,
+ mflowtype, mflowid,
vrf_id, net->port);
*abort_no_unlock = 1;
return (-1);
}
/* if the peer doesn't support asconf, flush the asconf queue */
- if (asoc->peer_supports_asconf == 0) {
+ if (asoc->asconf_supported == 0) {
struct sctp_asconf_addr *param, *nparam;
TAILQ_FOREACH_SAFE(param, &asoc->asconf_queue, next, nparam) {
@@ -513,12 +521,11 @@ sctp_process_init_ack(struct mbuf *m, int iphlen, int offset,
* primary.
*/
sctp_timer_stop(SCTP_TIMER_TYPE_INIT, stcb->sctp_ep, stcb,
- asoc->primary_destination, SCTP_FROM_SCTP_INPUT + SCTP_LOC_4);
+ asoc->primary_destination, SCTP_FROM_SCTP_INPUT + SCTP_LOC_3);
/* calculate the RTO */
net->RTO = sctp_calculate_rto(stcb, asoc, net, &asoc->time_entered, sctp_align_safe_nocopy,
SCTP_RTT_FROM_NON_DATA);
-
retval = sctp_send_cookie_echo(m, offset, stcb, net);
if (retval < 0) {
/*
@@ -527,29 +534,25 @@ sctp_process_init_ack(struct mbuf *m, int iphlen, int offset,
* abandon the peer, its broke.
*/
if (retval == -3) {
+ uint16_t len;
+
+ len = (uint16_t) (sizeof(struct sctp_error_missing_param) + sizeof(uint16_t));
/* We abort with an error of missing mandatory param */
- op_err = sctp_generate_cause(SCTP_CAUSE_MISSING_PARAM, "");
- if (op_err) {
- /*
- * Expand beyond to include the mandatory
- * param cookie
- */
- struct sctp_inv_mandatory_param *mp;
+ op_err = sctp_get_mbuf_for_msg(len, 0, M_NOWAIT, 1, MT_DATA);
+ if (op_err != NULL) {
+ struct sctp_error_missing_param *cause;
- SCTP_BUF_LEN(op_err) =
- sizeof(struct sctp_inv_mandatory_param);
- mp = mtod(op_err,
- struct sctp_inv_mandatory_param *);
+ SCTP_BUF_LEN(op_err) = len;
+ cause = mtod(op_err, struct sctp_error_missing_param *);
/* Subtract the reserved param */
- mp->length =
- htons(sizeof(struct sctp_inv_mandatory_param) - 2);
- mp->num_param = htonl(1);
- mp->param = htons(SCTP_STATE_COOKIE);
- mp->resv = 0;
+ cause->cause.code = htons(SCTP_CAUSE_MISSING_PARAM);
+ cause->cause.length = htons(len);
+ cause->num_missing_params = htonl(1);
+ cause->type[0] = htons(SCTP_STATE_COOKIE);
}
sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen,
src, dst, sh, op_err,
- use_mflowid, mflowid,
+ mflowtype, mflowid,
vrf_id, net->port);
*abort_no_unlock = 1;
}
@@ -562,21 +565,12 @@ static void
sctp_handle_heartbeat_ack(struct sctp_heartbeat_chunk *cp,
struct sctp_tcb *stcb, struct sctp_nets *net)
{
- struct sockaddr_storage store;
+ union sctp_sockstore store;
struct sctp_nets *r_net, *f_net;
struct timeval tv;
int req_prim = 0;
uint16_t old_error_counter;
-#ifdef INET
- struct sockaddr_in *sin;
-
-#endif
-#ifdef INET6
- struct sockaddr_in6 *sin6;
-
-#endif
-
if (ntohs(cp->ch.chunk_length) != sizeof(struct sctp_heartbeat_chunk)) {
/* Invalid length */
return;
@@ -586,12 +580,11 @@ sctp_handle_heartbeat_ack(struct sctp_heartbeat_chunk *cp,
#ifdef INET
case AF_INET:
if (cp->heartbeat.hb_info.addr_len == sizeof(struct sockaddr_in)) {
- sin = (struct sockaddr_in *)&store;
- sin->sin_family = cp->heartbeat.hb_info.addr_family;
- sin->sin_len = cp->heartbeat.hb_info.addr_len;
- sin->sin_port = stcb->rport;
- memcpy(&sin->sin_addr, cp->heartbeat.hb_info.address,
- sizeof(sin->sin_addr));
+ store.sin.sin_family = cp->heartbeat.hb_info.addr_family;
+ store.sin.sin_len = cp->heartbeat.hb_info.addr_len;
+ store.sin.sin_port = stcb->rport;
+ memcpy(&store.sin.sin_addr, cp->heartbeat.hb_info.address,
+ sizeof(store.sin.sin_addr));
} else {
return;
}
@@ -600,12 +593,10 @@ sctp_handle_heartbeat_ack(struct sctp_heartbeat_chunk *cp,
#ifdef INET6
case AF_INET6:
if (cp->heartbeat.hb_info.addr_len == sizeof(struct sockaddr_in6)) {
- sin6 = (struct sockaddr_in6 *)&store;
- sin6->sin6_family = cp->heartbeat.hb_info.addr_family;
- sin6->sin6_len = cp->heartbeat.hb_info.addr_len;
- sin6->sin6_port = stcb->rport;
- memcpy(&sin6->sin6_addr, cp->heartbeat.hb_info.address,
- sizeof(sin6->sin6_addr));
+ store.sin6.sin6_family = cp->heartbeat.hb_info.addr_family;
+ store.sin6.sin6_len = cp->heartbeat.hb_info.addr_len;
+ store.sin6.sin6_port = stcb->rport;
+ memcpy(&store.sin6.sin6_addr, cp->heartbeat.hb_info.address, sizeof(struct in6_addr));
} else {
return;
}
@@ -614,7 +605,7 @@ sctp_handle_heartbeat_ack(struct sctp_heartbeat_chunk *cp,
default:
return;
}
- r_net = sctp_findnet(stcb, (struct sockaddr *)&store);
+ r_net = sctp_findnet(stcb, &store.sa);
if (r_net == NULL) {
SCTPDBG(SCTP_DEBUG_INPUT1, "Huh? I can't find the address I sent it to, discard\n");
return;
@@ -634,7 +625,7 @@ sctp_handle_heartbeat_ack(struct sctp_heartbeat_chunk *cp,
if (f_net != r_net) {
/*
* first one on the list is NOT the primary
- * sctp_cmpaddr() is much more efficent if
+ * sctp_cmpaddr() is much more efficient if
* the primary is the first on the list,
* make it so.
*/
@@ -645,7 +636,8 @@ sctp_handle_heartbeat_ack(struct sctp_heartbeat_chunk *cp,
}
sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_CONFIRMED,
stcb, 0, (void *)r_net, SCTP_SO_NOT_LOCKED);
- sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, r_net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_3);
+ sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb,
+ r_net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_4);
sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, r_net);
}
old_error_counter = r_net->error_count;
@@ -666,7 +658,8 @@ sctp_handle_heartbeat_ack(struct sctp_heartbeat_chunk *cp,
stcb->asoc.cc_functions.sctp_cwnd_update_exit_pf(stcb, net);
}
if (old_error_counter > 0) {
- sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, r_net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_3);
+ sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep,
+ stcb, r_net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_5);
sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, r_net);
}
if (r_net == stcb->asoc.primary_destination) {
@@ -685,7 +678,9 @@ sctp_handle_heartbeat_ack(struct sctp_heartbeat_chunk *cp,
sctp_is_mobility_feature_on(stcb->sctp_ep,
SCTP_MOBILITY_PRIM_DELETED)) {
- sctp_timer_stop(SCTP_TIMER_TYPE_PRIM_DELETED, stcb->sctp_ep, stcb, NULL, SCTP_FROM_SCTP_TIMER + SCTP_LOC_7);
+ sctp_timer_stop(SCTP_TIMER_TYPE_PRIM_DELETED,
+ stcb->sctp_ep, stcb, NULL,
+ SCTP_FROM_SCTP_INPUT + SCTP_LOC_6);
if (sctp_is_mobility_feature_on(stcb->sctp_ep,
SCTP_MOBILITY_FASTHANDOFF)) {
sctp_assoc_immediate_retrans(stcb,
@@ -756,7 +751,7 @@ sctp_handle_nat_missing_state(struct sctp_tcb *stcb,
* return 0 means we want you to proceed with the abort non-zero
* means no abort processing
*/
- if (stcb->asoc.peer_supports_auth == 0) {
+ if (stcb->asoc.auth_supported == 0) {
SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_nat_missing_state: Peer does not support AUTH, cannot send an asconf\n");
return (0);
}
@@ -786,10 +781,10 @@ sctp_handle_abort(struct sctp_abort_chunk *abort,
* Need to check the cause codes for our two magic nat
* aborts which don't kill the assoc necessarily.
*/
- struct sctp_missing_nat_state *natc;
+ struct sctp_gen_error_cause *cause;
- natc = (struct sctp_missing_nat_state *)(abort + 1);
- error = ntohs(natc->cause);
+ cause = (struct sctp_gen_error_cause *)(abort + 1);
+ error = ntohs(cause->code);
if (error == SCTP_CAUSE_NAT_COLLIDING_STATE) {
SCTPDBG(SCTP_DEBUG_INPUT2, "Received Colliding state abort flags:%x\n",
abort->ch.chunk_flags);
@@ -807,7 +802,8 @@ sctp_handle_abort(struct sctp_abort_chunk *abort,
error = 0;
}
/* stop any receive timers */
- sctp_timer_stop(SCTP_TIMER_TYPE_RECV, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_6);
+ sctp_timer_stop(SCTP_TIMER_TYPE_RECV, stcb->sctp_ep, stcb, net,
+ SCTP_FROM_SCTP_INPUT + SCTP_LOC_7);
/* notify user of the abort and clean up... */
sctp_abort_notification(stcb, 1, error, abort, SCTP_SO_NOT_LOCKED);
/* free the tcb */
@@ -829,7 +825,7 @@ sctp_handle_abort(struct sctp_abort_chunk *abort,
#endif
stcb->asoc.state |= SCTP_STATE_WAS_ABORTED;
(void)sctp_free_assoc(stcb->sctp_ep, stcb, SCTP_NORMAL_PROC,
- SCTP_FROM_SCTP_INPUT + SCTP_LOC_6);
+ SCTP_FROM_SCTP_INPUT + SCTP_LOC_8);
#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
SCTP_SOCKET_UNLOCK(so, 1);
#endif
@@ -871,6 +867,7 @@ sctp_handle_shutdown(struct sctp_shutdown_chunk *cp,
{
struct sctp_association *asoc;
int some_on_streamwheel;
+ int old_state;
#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
struct socket *so;
@@ -889,17 +886,37 @@ sctp_handle_shutdown(struct sctp_shutdown_chunk *cp,
if (ntohs(cp->ch.chunk_length) != sizeof(struct sctp_shutdown_chunk)) {
/* Shutdown NOT the expected size */
return;
- } else {
- sctp_update_acked(stcb, cp, abort_flag);
- if (*abort_flag) {
- return;
- }
+ }
+ old_state = SCTP_GET_STATE(asoc);
+ sctp_update_acked(stcb, cp, abort_flag);
+ if (*abort_flag) {
+ return;
}
if (asoc->control_pdapi) {
/*
* With a normal shutdown we assume the end of last record.
*/
SCTP_INP_READ_LOCK(stcb->sctp_ep);
+ if (asoc->control_pdapi->on_strm_q) {
+ struct sctp_stream_in *strm;
+
+ strm = &asoc->strmin[asoc->control_pdapi->sinfo_stream];
+ if (asoc->control_pdapi->on_strm_q == SCTP_ON_UNORDERED) {
+ /* Unordered */
+ TAILQ_REMOVE(&strm->uno_inqueue, asoc->control_pdapi, next_instrm);
+ asoc->control_pdapi->on_strm_q = 0;
+ } else if (asoc->control_pdapi->on_strm_q == SCTP_ON_ORDERED) {
+ /* Ordered */
+ TAILQ_REMOVE(&strm->inqueue, asoc->control_pdapi, next_instrm);
+ asoc->control_pdapi->on_strm_q = 0;
+#ifdef INVARIANTS
+ } else {
+ panic("Unknown state on ctrl:%p on_strm_q:%d",
+ asoc->control_pdapi,
+ asoc->control_pdapi->on_strm_q);
+#endif
+ }
+ }
asoc->control_pdapi->end_added = 1;
asoc->control_pdapi->pdapi_aborted = 1;
asoc->control_pdapi = NULL;
@@ -917,7 +934,9 @@ sctp_handle_shutdown(struct sctp_shutdown_chunk *cp,
return;
}
#endif
- sctp_sorwakeup(stcb->sctp_ep, stcb->sctp_socket);
+ if (stcb->sctp_socket) {
+ sctp_sorwakeup(stcb->sctp_ep, stcb->sctp_socket);
+ }
#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
SCTP_SOCKET_UNLOCK(so, 1);
#endif
@@ -944,7 +963,8 @@ sctp_handle_shutdown(struct sctp_shutdown_chunk *cp,
* stop the shutdown timer, since we WILL move to
* SHUTDOWN-ACK-SENT.
*/
- sctp_timer_stop(SCTP_TIMER_TYPE_SHUTDOWN, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_8);
+ sctp_timer_stop(SCTP_TIMER_TYPE_SHUTDOWN, stcb->sctp_ep, stcb,
+ net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_9);
}
/* Now is there unsent data on a stream somewhere? */
some_on_streamwheel = sctp_is_there_unsent_data(stcb, SCTP_SO_NOT_LOCKED);
@@ -962,12 +982,16 @@ sctp_handle_shutdown(struct sctp_shutdown_chunk *cp,
(SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) {
SCTP_STAT_DECR_GAUGE32(sctps_currestab);
}
- SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_ACK_SENT);
SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING);
- sctp_stop_timers_for_shutdown(stcb);
- sctp_send_shutdown_ack(stcb, net);
- sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNACK, stcb->sctp_ep,
- stcb, net);
+ if (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_ACK_SENT) {
+ SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_ACK_SENT);
+ sctp_stop_timers_for_shutdown(stcb);
+ sctp_send_shutdown_ack(stcb, net);
+ sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNACK,
+ stcb->sctp_ep, stcb, net);
+ } else if (old_state == SCTP_STATE_SHUTDOWN_ACK_SENT) {
+ sctp_send_shutdown_ack(stcb, net);
+ }
}
}
@@ -1032,12 +1056,13 @@ sctp_handle_shutdown_ack(struct sctp_shutdown_ack_chunk *cp SCTP_UNUSED,
#ifdef INVARIANTS
if (!TAILQ_EMPTY(&asoc->send_queue) ||
!TAILQ_EMPTY(&asoc->sent_queue) ||
- !stcb->asoc.ss_functions.sctp_ss_is_empty(stcb, asoc)) {
+ sctp_is_there_unsent_data(stcb, SCTP_SO_NOT_LOCKED)) {
panic("Queues are not empty when handling SHUTDOWN-ACK");
}
#endif
/* stop the timer */
- sctp_timer_stop(SCTP_TIMER_TYPE_SHUTDOWN, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_9);
+ sctp_timer_stop(SCTP_TIMER_TYPE_SHUTDOWN, stcb->sctp_ep, stcb, net,
+ SCTP_FROM_SCTP_INPUT + SCTP_LOC_10);
/* send SHUTDOWN-COMPLETE */
sctp_send_shutdown_complete(stcb, net, 0);
/* notify upper layer protocol */
@@ -1058,7 +1083,7 @@ sctp_handle_shutdown_ack(struct sctp_shutdown_ack_chunk *cp SCTP_UNUSED,
atomic_subtract_int(&stcb->asoc.refcnt, 1);
#endif
(void)sctp_free_assoc(stcb->sctp_ep, stcb, SCTP_NORMAL_PROC,
- SCTP_FROM_SCTP_INPUT + SCTP_LOC_10);
+ SCTP_FROM_SCTP_INPUT + SCTP_LOC_11);
#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
SCTP_SOCKET_UNLOCK(so, 1);
#endif
@@ -1066,7 +1091,7 @@ sctp_handle_shutdown_ack(struct sctp_shutdown_ack_chunk *cp SCTP_UNUSED,
/*
* Skip past the param header and then we will find the chunk that caused the
- * problem. There are two possiblities ASCONF or FWD-TSN other than that and
+ * problem. There are two possibilities ASCONF or FWD-TSN other than that and
* our peer must be broken.
*/
static void
@@ -1081,8 +1106,9 @@ sctp_process_unrecog_chunk(struct sctp_tcb *stcb, struct sctp_paramhdr *phdr,
case SCTP_ASCONF:
sctp_asconf_cleanup(stcb, net);
break;
+ case SCTP_IFORWARD_CUM_TSN:
case SCTP_FORWARD_CUM_TSN:
- stcb->asoc.peer_supports_prsctp = 0;
+ stcb->asoc.prsctp_supported = 0;
break;
default:
SCTPDBG(SCTP_DEBUG_INPUT2,
@@ -1096,6 +1122,7 @@ sctp_process_unrecog_chunk(struct sctp_tcb *stcb, struct sctp_paramhdr *phdr,
* Skip past the param header and then we will find the param that caused the
* problem. There are a number of param's in a ASCONF OR the prsctp param
* these will turn of specific features.
+ * XXX: Is this the right thing to do?
*/
static void
sctp_process_unrecog_param(struct sctp_tcb *stcb, struct sctp_paramhdr *phdr)
@@ -1106,7 +1133,7 @@ sctp_process_unrecog_param(struct sctp_tcb *stcb, struct sctp_paramhdr *phdr)
switch (ntohs(pbad->param_type)) {
/* pr-sctp draft */
case SCTP_PRSCTP_SUPPORTED:
- stcb->asoc.peer_supports_prsctp = 0;
+ stcb->asoc.prsctp_supported = 0;
break;
case SCTP_SUPPORTED_CHUNK_EXT:
break;
@@ -1117,14 +1144,14 @@ sctp_process_unrecog_param(struct sctp_tcb *stcb, struct sctp_paramhdr *phdr)
case SCTP_ADD_IP_ADDRESS:
case SCTP_DEL_IP_ADDRESS:
case SCTP_SET_PRIM_ADDR:
- stcb->asoc.peer_supports_asconf = 0;
+ stcb->asoc.asconf_supported = 0;
break;
case SCTP_SUCCESS_REPORT:
case SCTP_ERROR_CAUSE_IND:
SCTPDBG(SCTP_DEBUG_INPUT2, "Huh, the peer does not support success? or error cause?\n");
SCTPDBG(SCTP_DEBUG_INPUT2,
"Turning off ASCONF to this strange peer\n");
- stcb->asoc.peer_supports_asconf = 0;
+ stcb->asoc.asconf_supported = 0;
break;
default:
SCTPDBG(SCTP_DEBUG_INPUT2,
@@ -1217,7 +1244,7 @@ sctp_handle_error(struct sctp_chunkhdr *ch,
atomic_subtract_int(&stcb->asoc.refcnt, 1);
#endif
(void)sctp_free_assoc(stcb->sctp_ep, stcb, SCTP_NORMAL_PROC,
- SCTP_FROM_SCTP_INPUT + SCTP_LOC_11);
+ SCTP_FROM_SCTP_INPUT + SCTP_LOC_12);
#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
SCTP_SOCKET_UNLOCK(so, 1);
#endif
@@ -1238,7 +1265,7 @@ sctp_handle_error(struct sctp_chunkhdr *ch,
* (or IPv4 for that matter) it does not matter. If
* they don't support that type of address, they can
* NOT possibly get that packet type... i.e. with no
- * IPv6 you can't recieve a IPv6 packet. so we can
+ * IPv6 you can't receive a IPv6 packet. so we can
* safely ignore this one. If we ever added support
* for HOSTNAME Addresses, then we would need to do
* something here.
@@ -1295,7 +1322,7 @@ sctp_handle_init_ack(struct mbuf *m, int iphlen, int offset,
struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh,
struct sctp_init_ack_chunk *cp, struct sctp_tcb *stcb,
struct sctp_nets *net, int *abort_no_unlock,
- uint8_t use_mflowid, uint32_t mflowid,
+ uint8_t mflowtype, uint32_t mflowid,
uint32_t vrf_id)
{
struct sctp_init_ack *init_ack;
@@ -1314,7 +1341,7 @@ sctp_handle_init_ack(struct mbuf *m, int iphlen, int offset,
op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, "");
sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen,
src, dst, sh, op_err,
- use_mflowid, mflowid,
+ mflowtype, mflowid,
vrf_id, net->port);
*abort_no_unlock = 1;
return (-1);
@@ -1326,7 +1353,7 @@ sctp_handle_init_ack(struct mbuf *m, int iphlen, int offset,
op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, "");
sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen,
src, dst, sh, op_err,
- use_mflowid, mflowid,
+ mflowtype, mflowid,
vrf_id, net->port);
*abort_no_unlock = 1;
return (-1);
@@ -1336,7 +1363,7 @@ sctp_handle_init_ack(struct mbuf *m, int iphlen, int offset,
op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, "");
sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen,
src, dst, sh, op_err,
- use_mflowid, mflowid,
+ mflowtype, mflowid,
vrf_id, net->port);
*abort_no_unlock = 1;
return (-1);
@@ -1346,7 +1373,7 @@ sctp_handle_init_ack(struct mbuf *m, int iphlen, int offset,
op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, "");
sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen,
src, dst, sh, op_err,
- use_mflowid, mflowid,
+ mflowtype, mflowid,
vrf_id, net->port);
*abort_no_unlock = 1;
return (-1);
@@ -1356,7 +1383,7 @@ sctp_handle_init_ack(struct mbuf *m, int iphlen, int offset,
op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, "");
sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen,
src, dst, sh, op_err,
- use_mflowid, mflowid,
+ mflowtype, mflowid,
vrf_id, net->port);
*abort_no_unlock = 1;
return (-1);
@@ -1381,7 +1408,7 @@ sctp_handle_init_ack(struct mbuf *m, int iphlen, int offset,
}
if (sctp_process_init_ack(m, iphlen, offset, src, dst, sh, cp, stcb,
net, abort_no_unlock,
- use_mflowid, mflowid,
+ mflowtype, mflowid,
vrf_id) < 0) {
/* error in parsing parameters */
return (-1);
@@ -1438,7 +1465,7 @@ sctp_process_cookie_new(struct mbuf *m, int iphlen, int offset,
struct sctp_inpcb *inp, struct sctp_nets **netp,
struct sockaddr *init_src, int *notification,
int auth_skipped, uint32_t auth_offset, uint32_t auth_len,
- uint8_t use_mflowid, uint32_t mflowid,
+ uint8_t mflowtype, uint32_t mflowid,
uint32_t vrf_id, uint16_t port);
@@ -1455,7 +1482,7 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset,
struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct sctp_nets **netp,
struct sockaddr *init_src, int *notification,
int auth_skipped, uint32_t auth_offset, uint32_t auth_len,
- uint8_t use_mflowid, uint32_t mflowid,
+ uint8_t mflowtype, uint32_t mflowid,
uint32_t vrf_id, uint16_t port)
{
struct sctp_association *asoc;
@@ -1468,6 +1495,11 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset,
int spec_flag = 0;
uint32_t how_indx;
+#if defined(SCTP_DETAILED_STR_STATS)
+ int j;
+
+#endif
+
net = *netp;
/* I know that the TCB is non-NULL from the caller */
asoc = &stcb->asoc;
@@ -1483,7 +1515,7 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset,
sctp_send_shutdown_ack(stcb, stcb->asoc.primary_destination);
op_err = sctp_generate_cause(SCTP_CAUSE_COOKIE_IN_SHUTDOWN, "");
sctp_send_operr_to(src, dst, sh, cookie->peers_vtag, op_err,
- use_mflowid, mflowid,
+ mflowtype, mflowid, inp->fibnum,
vrf_id, net->port);
if (how_indx < sizeof(asoc->cookie_how))
asoc->cookie_how[how_indx] = 2;
@@ -1564,9 +1596,12 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset,
return (NULL);
}
/* we have already processed the INIT so no problem */
- sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb,
- net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_12);
- sctp_timer_stop(SCTP_TIMER_TYPE_INIT, inp, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_13);
+ sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp,
+ stcb, net,
+ SCTP_FROM_SCTP_INPUT + SCTP_LOC_13);
+ sctp_timer_stop(SCTP_TIMER_TYPE_INIT, inp,
+ stcb, net,
+ SCTP_FROM_SCTP_INPUT + SCTP_LOC_14);
/* update current state */
if (SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_ECHOED)
SCTP_STAT_INCR_COUNTER32(sctps_activeestab);
@@ -1646,7 +1681,7 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset,
*/
if (sctp_load_addresses_from_init(stcb, m,
init_offset + sizeof(struct sctp_init_chunk),
- initack_offset, src, dst, init_src)) {
+ initack_offset, src, dst, init_src, stcb->asoc.port)) {
if (how_indx < sizeof(asoc->cookie_how))
asoc->cookie_how[how_indx] = 4;
return (NULL);
@@ -1690,7 +1725,7 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset,
*/
op_err = sctp_generate_cause(SCTP_CAUSE_NAT_COLLIDING_STATE, "");
sctp_send_abort(m, iphlen, src, dst, sh, 0, op_err,
- use_mflowid, mflowid,
+ mflowtype, mflowid, inp->fibnum,
vrf_id, port);
return (NULL);
}
@@ -1726,7 +1761,8 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset,
}
if (how_indx < sizeof(asoc->cookie_how))
asoc->cookie_how[how_indx] = 8;
- sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_14);
+ sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net,
+ SCTP_FROM_SCTP_INPUT + SCTP_LOC_15);
sctp_stop_all_cookie_timers(stcb);
/*
* since we did not send a HB make sure we don't double
@@ -1772,7 +1808,7 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset,
}
if (sctp_load_addresses_from_init(stcb, m,
init_offset + sizeof(struct sctp_init_chunk),
- initack_offset, src, dst, init_src)) {
+ initack_offset, src, dst, init_src, stcb->asoc.port)) {
if (how_indx < sizeof(asoc->cookie_how))
asoc->cookie_how[how_indx] = 10;
return (NULL);
@@ -1862,7 +1898,7 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset,
sh, cookie, cookie_len,
inp, netp, init_src, notification,
auth_skipped, auth_offset, auth_len,
- use_mflowid, mflowid,
+ mflowtype, mflowid,
vrf_id, port));
}
/*
@@ -1871,8 +1907,10 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset,
/* temp code */
if (how_indx < sizeof(asoc->cookie_how))
asoc->cookie_how[how_indx] = 12;
- sctp_timer_stop(SCTP_TIMER_TYPE_INIT, inp, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_15);
- sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_16);
+ sctp_timer_stop(SCTP_TIMER_TYPE_INIT, inp, stcb, net,
+ SCTP_FROM_SCTP_INPUT + SCTP_LOC_16);
+ sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net,
+ SCTP_FROM_SCTP_INPUT + SCTP_LOC_17);
/* notify upper layer */
*notification = SCTP_NOTIFY_ASSOC_RESTART;
@@ -1930,8 +1968,18 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset,
sctp_report_all_outbound(stcb, 0, 1, SCTP_SO_LOCKED);
for (i = 0; i < stcb->asoc.streamoutcnt; i++) {
stcb->asoc.strmout[i].chunks_on_queues = 0;
+#if defined(SCTP_DETAILED_STR_STATS)
+ for (j = 0; j < SCTP_PR_SCTP_MAX + 1; j++) {
+ asoc->strmout[i].abandoned_sent[j] = 0;
+ asoc->strmout[i].abandoned_unsent[j] = 0;
+ }
+#else
+ asoc->strmout[i].abandoned_sent[0] = 0;
+ asoc->strmout[i].abandoned_unsent[0] = 0;
+#endif
stcb->asoc.strmout[i].stream_no = i;
- stcb->asoc.strmout[i].next_sequence_send = 0;
+ stcb->asoc.strmout[i].next_mid_ordered = 0;
+ stcb->asoc.strmout[i].next_mid_unordered = 0;
stcb->asoc.strmout[i].last_msg_incomplete = 0;
}
/* process the INIT-ACK info (my info) */
@@ -1973,7 +2021,7 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset,
if (sctp_load_addresses_from_init(stcb, m,
init_offset + sizeof(struct sctp_init_chunk),
- initack_offset, src, dst, init_src)) {
+ initack_offset, src, dst, init_src, stcb->asoc.port)) {
if (how_indx < sizeof(asoc->cookie_how))
asoc->cookie_how[how_indx] = 14;
@@ -2009,28 +2057,19 @@ sctp_process_cookie_new(struct mbuf *m, int iphlen, int offset,
struct sctp_inpcb *inp, struct sctp_nets **netp,
struct sockaddr *init_src, int *notification,
int auth_skipped, uint32_t auth_offset, uint32_t auth_len,
- uint8_t use_mflowid, uint32_t mflowid,
+ uint8_t mflowtype, uint32_t mflowid,
uint32_t vrf_id, uint16_t port)
{
struct sctp_tcb *stcb;
struct sctp_init_chunk *init_cp, init_buf;
struct sctp_init_ack_chunk *initack_cp, initack_buf;
- struct sockaddr_storage sa_store;
- struct sockaddr *initack_src = (struct sockaddr *)&sa_store;
+ union sctp_sockstore store;
struct sctp_association *asoc;
int init_offset, initack_offset, initack_limit;
int retval;
int error = 0;
uint8_t auth_chunk_buf[SCTP_PARAM_BUFFER_SIZE];
-#ifdef INET
- struct sockaddr_in *sin;
-
-#endif
-#ifdef INET6
- struct sockaddr_in6 *sin6;
-
-#endif
#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
struct socket *so;
@@ -2093,6 +2132,8 @@ sctp_process_cookie_new(struct mbuf *m, int iphlen, int offset,
*/
stcb = sctp_aloc_assoc(inp, init_src, &error,
ntohl(initack_cp->init.initiate_tag), vrf_id,
+ ntohs(initack_cp->init.num_outbound_streams),
+ port,
(struct thread *)NULL
);
if (stcb == NULL) {
@@ -2104,7 +2145,7 @@ sctp_process_cookie_new(struct mbuf *m, int iphlen, int offset,
op_err = sctp_generate_cause(SCTP_CAUSE_OUT_OF_RESC, "");
sctp_abort_association(inp, (struct sctp_tcb *)NULL, m, iphlen,
src, dst, sh, op_err,
- use_mflowid, mflowid,
+ mflowtype, mflowid,
vrf_id, port);
return (NULL);
}
@@ -2132,7 +2173,7 @@ sctp_process_cookie_new(struct mbuf *m, int iphlen, int offset,
op_err = sctp_generate_cause(SCTP_CAUSE_OUT_OF_RESC, "");
sctp_abort_association(inp, (struct sctp_tcb *)NULL, m, iphlen,
src, dst, sh, op_err,
- use_mflowid, mflowid,
+ mflowtype, mflowid,
vrf_id, port);
#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
SCTP_TCB_UNLOCK(stcb);
@@ -2140,7 +2181,7 @@ sctp_process_cookie_new(struct mbuf *m, int iphlen, int offset,
SCTP_TCB_LOCK(stcb);
#endif
(void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC,
- SCTP_FROM_SCTP_INPUT + SCTP_LOC_16);
+ SCTP_FROM_SCTP_INPUT + SCTP_LOC_18);
#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
SCTP_SOCKET_UNLOCK(so, 1);
#endif
@@ -2171,7 +2212,8 @@ sctp_process_cookie_new(struct mbuf *m, int iphlen, int offset,
SCTP_SOCKET_LOCK(so, 1);
SCTP_TCB_LOCK(stcb);
#endif
- (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_16);
+ (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC,
+ SCTP_FROM_SCTP_INPUT + SCTP_LOC_19);
#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
SCTP_SOCKET_UNLOCK(so, 1);
#endif
@@ -2181,14 +2223,15 @@ sctp_process_cookie_new(struct mbuf *m, int iphlen, int offset,
/* load all addresses */
if (sctp_load_addresses_from_init(stcb, m,
init_offset + sizeof(struct sctp_init_chunk), initack_offset,
- src, dst, init_src)) {
+ src, dst, init_src, port)) {
atomic_add_int(&stcb->asoc.refcnt, 1);
#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
SCTP_TCB_UNLOCK(stcb);
SCTP_SOCKET_LOCK(so, 1);
SCTP_TCB_LOCK(stcb);
#endif
- (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_17);
+ (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC,
+ SCTP_FROM_SCTP_INPUT + SCTP_LOC_20);
#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
SCTP_SOCKET_UNLOCK(so, 1);
#endif
@@ -2217,7 +2260,8 @@ sctp_process_cookie_new(struct mbuf *m, int iphlen, int offset,
SCTP_SOCKET_LOCK(so, 1);
SCTP_TCB_LOCK(stcb);
#endif
- (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_18);
+ (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC,
+ SCTP_FROM_SCTP_INPUT + SCTP_LOC_21);
#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
SCTP_SOCKET_UNLOCK(so, 1);
#endif
@@ -2254,23 +2298,20 @@ sctp_process_cookie_new(struct mbuf *m, int iphlen, int offset,
#ifdef INET
case SCTP_IPV4_ADDRESS:
/* source addr is IPv4 */
- sin = (struct sockaddr_in *)initack_src;
- memset(sin, 0, sizeof(*sin));
- sin->sin_family = AF_INET;
- sin->sin_len = sizeof(struct sockaddr_in);
- sin->sin_addr.s_addr = cookie->laddress[0];
+ memset(&store.sin, 0, sizeof(struct sockaddr_in));
+ store.sin.sin_family = AF_INET;
+ store.sin.sin_len = sizeof(struct sockaddr_in);
+ store.sin.sin_addr.s_addr = cookie->laddress[0];
break;
#endif
#ifdef INET6
case SCTP_IPV6_ADDRESS:
/* source addr is IPv6 */
- sin6 = (struct sockaddr_in6 *)initack_src;
- memset(sin6, 0, sizeof(*sin6));
- sin6->sin6_family = AF_INET6;
- sin6->sin6_len = sizeof(struct sockaddr_in6);
- sin6->sin6_scope_id = cookie->scope_id;
- memcpy(&sin6->sin6_addr, cookie->laddress,
- sizeof(sin6->sin6_addr));
+ memset(&store.sin6, 0, sizeof(struct sockaddr_in6));
+ store.sin6.sin6_family = AF_INET6;
+ store.sin6.sin6_len = sizeof(struct sockaddr_in6);
+ store.sin6.sin6_scope_id = cookie->scope_id;
+ memcpy(&store.sin6.sin6_addr, cookie->laddress, sizeof(struct in6_addr));
break;
#endif
default:
@@ -2280,7 +2321,8 @@ sctp_process_cookie_new(struct mbuf *m, int iphlen, int offset,
SCTP_SOCKET_LOCK(so, 1);
SCTP_TCB_LOCK(stcb);
#endif
- (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_19);
+ (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC,
+ SCTP_FROM_SCTP_INPUT + SCTP_LOC_22);
#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
SCTP_SOCKET_UNLOCK(so, 1);
#endif
@@ -2334,9 +2376,9 @@ sctp_process_cookie_new(struct mbuf *m, int iphlen, int offset,
sctp_is_feature_on(inp, SCTP_PCB_FLAGS_AUTOCLOSE)) {
sctp_timer_start(SCTP_TIMER_TYPE_AUTOCLOSE, inp, stcb, NULL);
}
- /* calculate the RTT */
(void)SCTP_GETTIME_TIMEVAL(&stcb->asoc.time_entered);
- if ((netp) && (*netp)) {
+ if ((netp != NULL) && (*netp != NULL)) {
+ /* calculate the RTT and set the encaps port */
(*netp)->RTO = sctp_calculate_rto(stcb, asoc, *netp,
&cookie->time_entered, sctp_align_unsafe_makecopy,
SCTP_RTT_FROM_NON_DATA);
@@ -2351,7 +2393,7 @@ sctp_process_cookie_new(struct mbuf *m, int iphlen, int offset,
sctp_check_address_list(stcb, m,
initack_offset + sizeof(struct sctp_init_ack_chunk),
initack_limit - (initack_offset + sizeof(struct sctp_init_ack_chunk)),
- initack_src, cookie->local_scope, cookie->site_scope,
+ &store.sa, cookie->local_scope, cookie->site_scope,
cookie->ipv4_scope, cookie->loopback_scope);
@@ -2382,7 +2424,7 @@ sctp_handle_cookie_echo(struct mbuf *m, int iphlen, int offset,
struct sctp_inpcb **inp_p, struct sctp_tcb **stcb, struct sctp_nets **netp,
int auth_skipped, uint32_t auth_offset, uint32_t auth_len,
struct sctp_tcb **locked_tcb,
- uint8_t use_mflowid, uint32_t mflowid,
+ uint8_t mflowtype, uint32_t mflowid,
uint32_t vrf_id, uint16_t port)
{
struct sctp_state_cookie *cookie;
@@ -2422,8 +2464,8 @@ sctp_handle_cookie_echo(struct mbuf *m, int iphlen, int offset,
cookie_offset = offset + sizeof(struct sctp_chunkhdr);
cookie_len = ntohs(cp->ch.chunk_length);
- if ((cookie->peerport != sh->src_port) &&
- (cookie->myport != sh->dest_port) &&
+ if ((cookie->peerport != sh->src_port) ||
+ (cookie->myport != sh->dest_port) ||
(cookie->my_vtag != sh->v_tag)) {
/*
* invalid ports or bad tag. Note that we always leave the
@@ -2445,20 +2487,14 @@ sctp_handle_cookie_echo(struct mbuf *m, int iphlen, int offset,
* calculated in the sctp_hmac_m() call).
*/
sig_offset = offset + cookie_len - SCTP_SIGNATURE_SIZE;
- m_sig = m_split(m, sig_offset, M_DONTWAIT);
+ m_sig = m_split(m, sig_offset, M_NOWAIT);
if (m_sig == NULL) {
/* out of memory or ?? */
return (NULL);
}
#ifdef SCTP_MBUF_LOGGING
if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) {
- struct mbuf *mat;
-
- for (mat = m_sig; mat; mat = SCTP_BUF_NEXT(mat)) {
- if (SCTP_BUF_IS_EXTENDED(mat)) {
- sctp_log_mb(mat, SCTP_MBUF_SPLIT);
- }
- }
+ sctp_log_mbc(m_sig, SCTP_MBUF_SPLIT);
}
#endif
@@ -2547,29 +2583,29 @@ sctp_handle_cookie_echo(struct mbuf *m, int iphlen, int offset,
if (timevalcmp(&now, &time_expires, >)) {
/* cookie is stale! */
struct mbuf *op_err;
- struct sctp_stale_cookie_msg *scm;
+ struct sctp_error_stale_cookie *cause;
uint32_t tim;
- op_err = sctp_get_mbuf_for_msg(sizeof(struct sctp_stale_cookie_msg),
- 0, M_DONTWAIT, 1, MT_DATA);
+ op_err = sctp_get_mbuf_for_msg(sizeof(struct sctp_error_stale_cookie),
+ 0, M_NOWAIT, 1, MT_DATA);
if (op_err == NULL) {
/* FOOBAR */
return (NULL);
}
/* Set the len */
- SCTP_BUF_LEN(op_err) = sizeof(struct sctp_stale_cookie_msg);
- scm = mtod(op_err, struct sctp_stale_cookie_msg *);
- scm->ph.param_type = htons(SCTP_CAUSE_STALE_COOKIE);
- scm->ph.param_length = htons((sizeof(struct sctp_paramhdr) +
+ SCTP_BUF_LEN(op_err) = sizeof(struct sctp_error_stale_cookie);
+ cause = mtod(op_err, struct sctp_error_stale_cookie *);
+ cause->cause.code = htons(SCTP_CAUSE_STALE_COOKIE);
+ cause->cause.length = htons((sizeof(struct sctp_paramhdr) +
(sizeof(uint32_t))));
/* seconds to usec */
tim = (now.tv_sec - time_expires.tv_sec) * 1000000;
/* add in usec */
if (tim == 0)
tim = now.tv_usec - cookie->time_entered.tv_usec;
- scm->time_usec = htonl(tim);
+ cause->stale_time = htonl(tim);
sctp_send_operr_to(src, dst, sh, cookie->peers_vtag, op_err,
- use_mflowid, mflowid,
+ mflowtype, mflowid, l_inp->fibnum,
vrf_id, port);
return (NULL);
}
@@ -2610,7 +2646,7 @@ sctp_handle_cookie_echo(struct mbuf *m, int iphlen, int offset,
/* This should not happen */
return (NULL);
}
- if ((*stcb == NULL) && to) {
+ if (*stcb == NULL) {
/* Yep, lets check */
*stcb = sctp_findassociation_ep_addr(inp_p, to, netp, dst, NULL);
if (*stcb == NULL) {
@@ -2649,9 +2685,6 @@ sctp_handle_cookie_echo(struct mbuf *m, int iphlen, int offset,
}
}
}
- if (to == NULL) {
- return (NULL);
- }
cookie_len -= SCTP_SIGNATURE_SIZE;
if (*stcb == NULL) {
/* this is the "normal" case... get a new TCB */
@@ -2659,7 +2692,7 @@ sctp_handle_cookie_echo(struct mbuf *m, int iphlen, int offset,
cookie, cookie_len, *inp_p,
netp, to, &notification,
auth_skipped, auth_offset, auth_len,
- use_mflowid, mflowid,
+ mflowtype, mflowid,
vrf_id, port);
} else {
/* this is abnormal... cookie-echo on existing TCB */
@@ -2668,7 +2701,7 @@ sctp_handle_cookie_echo(struct mbuf *m, int iphlen, int offset,
src, dst, sh,
cookie, cookie_len, *inp_p, *stcb, netp, to,
&notification, auth_skipped, auth_offset, auth_len,
- use_mflowid, mflowid,
+ mflowtype, mflowid,
vrf_id, port);
}
@@ -2676,11 +2709,9 @@ sctp_handle_cookie_echo(struct mbuf *m, int iphlen, int offset,
/* still no TCB... must be bad cookie-echo */
return (NULL);
}
- if ((*netp != NULL) && (use_mflowid != 0)) {
+ if (*netp != NULL) {
+ (*netp)->flowtype = mflowtype;
(*netp)->flowid = mflowid;
-#ifdef INVARIANTS
- (*netp)->flowidset = 1;
-#endif
}
/*
* Ok, we built an association so confirm the address we sent the
@@ -2692,7 +2723,8 @@ sctp_handle_cookie_echo(struct mbuf *m, int iphlen, int offset,
*/
if (netl == NULL) {
/* TSNH! Huh, why do I need to add this address here? */
- if (sctp_add_remote_addr(*stcb, to, NULL, SCTP_DONOT_SETSCOPE, SCTP_IN_COOKIE_PROC)) {
+ if (sctp_add_remote_addr(*stcb, to, NULL, port,
+ SCTP_DONOT_SETSCOPE, SCTP_IN_COOKIE_PROC)) {
return (NULL);
}
netl = sctp_findnet(*stcb, to);
@@ -2751,7 +2783,7 @@ sctp_handle_cookie_echo(struct mbuf *m, int iphlen, int offset,
op_err = sctp_generate_cause(SCTP_CAUSE_OUT_OF_RESC, "");
sctp_abort_association(*inp_p, NULL, m, iphlen,
src, dst, sh, op_err,
- use_mflowid, mflowid,
+ mflowtype, mflowid,
vrf_id, port);
#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
pcb_so = SCTP_INP_SO(*inp_p);
@@ -2761,7 +2793,8 @@ sctp_handle_cookie_echo(struct mbuf *m, int iphlen, int offset,
SCTP_TCB_LOCK((*stcb));
atomic_subtract_int(&(*stcb)->asoc.refcnt, 1);
#endif
- (void)sctp_free_assoc(*inp_p, *stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_20);
+ (void)sctp_free_assoc(*inp_p, *stcb, SCTP_NORMAL_PROC,
+ SCTP_FROM_SCTP_INPUT + SCTP_LOC_23);
#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
SCTP_SOCKET_UNLOCK(pcb_so, 1);
#endif
@@ -2784,11 +2817,19 @@ sctp_handle_cookie_echo(struct mbuf *m, int iphlen, int offset,
inp->sctp_mobility_features = (*inp_p)->sctp_mobility_features;
inp->sctp_socket = so;
inp->sctp_frag_point = (*inp_p)->sctp_frag_point;
+ inp->max_cwnd = (*inp_p)->max_cwnd;
inp->sctp_cmt_on_off = (*inp_p)->sctp_cmt_on_off;
- inp->sctp_ecn_enable = (*inp_p)->sctp_ecn_enable;
+ inp->ecn_supported = (*inp_p)->ecn_supported;
+ inp->prsctp_supported = (*inp_p)->prsctp_supported;
+ inp->auth_supported = (*inp_p)->auth_supported;
+ inp->asconf_supported = (*inp_p)->asconf_supported;
+ inp->reconfig_supported = (*inp_p)->reconfig_supported;
+ inp->nrsack_supported = (*inp_p)->nrsack_supported;
+ inp->pktdrop_supported = (*inp_p)->pktdrop_supported;
inp->partial_delivery_point = (*inp_p)->partial_delivery_point;
inp->sctp_context = (*inp_p)->sctp_context;
inp->local_strreset_support = (*inp_p)->local_strreset_support;
+ inp->fibnum = (*inp_p)->fibnum;
inp->inp_starting_point_for_iterator = NULL;
/*
* copy in the authentication parameters from the
@@ -2885,9 +2926,9 @@ sctp_handle_cookie_ack(struct sctp_cookie_ack_chunk *cp SCTP_UNUSED,
SCTPDBG(SCTP_DEBUG_INPUT2,
"sctp_handle_cookie_ack: handling COOKIE-ACK\n");
- if (stcb == NULL)
+ if ((stcb == NULL) || (net == NULL)) {
return;
-
+ }
asoc = &stcb->asoc;
sctp_stop_all_cookie_timers(stcb);
@@ -2962,7 +3003,7 @@ sctp_handle_cookie_ack(struct sctp_cookie_ack_chunk *cp SCTP_UNUSED,
* in flight)
*/
if ((sctp_is_feature_on(stcb->sctp_ep, SCTP_PCB_FLAGS_DO_ASCONF)) &&
- (stcb->asoc.peer_supports_asconf) &&
+ (stcb->asoc.asconf_supported == 1) &&
(!TAILQ_EMPTY(&stcb->asoc.asconf_queue))) {
#ifdef SCTP_TIMER_BASED_ASCONF
sctp_timer_start(SCTP_TIMER_TYPE_ASCONF,
@@ -3123,7 +3164,6 @@ sctp_handle_ecn_cwr(struct sctp_cwr_chunk *cp, struct sctp_tcb *stcb, struct sct
uint32_t cwr_tsn;
cwr_tsn = ntohl(cp->tsn);
-
override = cp->ch.chunk_flags & SCTP_CWR_REDUCE_OVERRIDE;
TAILQ_FOREACH(chk, &stcb->asoc.control_send_queue, sctp_next) {
if (chk->rec.chunk_id.id != SCTP_ECN_ECHO) {
@@ -3139,10 +3179,8 @@ sctp_handle_ecn_cwr(struct sctp_cwr_chunk *cp, struct sctp_tcb *stcb, struct sct
stcb->asoc.ecn_echo_cnt_onq--;
TAILQ_REMOVE(&stcb->asoc.control_send_queue, chk,
sctp_next);
- if (chk->data) {
- sctp_m_freem(chk->data);
- chk->data = NULL;
- }
+ sctp_m_freem(chk->data);
+ chk->data = NULL;
stcb->asoc.ctrl_queue_cnt--;
sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED);
if (override == 0) {
@@ -3184,12 +3222,13 @@ sctp_handle_shutdown_complete(struct sctp_shutdown_complete_chunk *cp SCTP_UNUSE
#ifdef INVARIANTS
if (!TAILQ_EMPTY(&asoc->send_queue) ||
!TAILQ_EMPTY(&asoc->sent_queue) ||
- !stcb->asoc.ss_functions.sctp_ss_is_empty(stcb, asoc)) {
+ sctp_is_there_unsent_data(stcb, SCTP_SO_NOT_LOCKED)) {
panic("Queues are not empty when handling SHUTDOWN-COMPLETE");
}
#endif
/* stop the timer */
- sctp_timer_stop(SCTP_TIMER_TYPE_SHUTDOWNACK, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_22);
+ sctp_timer_stop(SCTP_TIMER_TYPE_SHUTDOWNACK, stcb->sctp_ep, stcb, net,
+ SCTP_FROM_SCTP_INPUT + SCTP_LOC_24);
SCTP_STAT_INCR_COUNTER32(sctps_shutdown);
/* free the TCB */
SCTPDBG(SCTP_DEBUG_INPUT2,
@@ -3202,7 +3241,8 @@ sctp_handle_shutdown_complete(struct sctp_shutdown_complete_chunk *cp SCTP_UNUSE
SCTP_TCB_LOCK(stcb);
atomic_subtract_int(&stcb->asoc.refcnt, 1);
#endif
- (void)sctp_free_assoc(stcb->sctp_ep, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_23);
+ (void)sctp_free_assoc(stcb->sctp_ep, stcb, SCTP_NORMAL_PROC,
+ SCTP_FROM_SCTP_INPUT + SCTP_LOC_25);
#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
SCTP_SOCKET_UNLOCK(so, 1);
#endif
@@ -3310,7 +3350,8 @@ process_chunk_drop(struct sctp_tcb *stcb, struct sctp_chunk_desc *desc,
/* restart the timer */
sctp_timer_stop(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep,
- stcb, tp1->whoTo, SCTP_FROM_SCTP_INPUT + SCTP_LOC_24);
+ stcb, tp1->whoTo,
+ SCTP_FROM_SCTP_INPUT + SCTP_LOC_26);
sctp_timer_start(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep,
stcb, tp1->whoTo);
@@ -3319,7 +3360,7 @@ process_chunk_drop(struct sctp_tcb *stcb, struct sctp_chunk_desc *desc,
sctp_misc_ints(SCTP_FLIGHT_LOG_DOWN_PDRP,
tp1->whoTo->flight_size,
tp1->book_size,
- (uintptr_t) stcb,
+ (uint32_t) (uintptr_t) stcb,
tp1->rec.data.TSN_seq);
}
if (tp1->sent < SCTP_DATAGRAM_RESEND) {
@@ -3378,7 +3419,8 @@ process_chunk_drop(struct sctp_tcb *stcb, struct sctp_chunk_desc *desc,
* this, otherwise we let the timer fire.
*/
sctp_timer_stop(SCTP_TIMER_TYPE_INIT, stcb->sctp_ep,
- stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_25);
+ stcb, net,
+ SCTP_FROM_SCTP_INPUT + SCTP_LOC_27);
sctp_send_initiate(stcb->sctp_ep, stcb, SCTP_SO_NOT_LOCKED);
}
break;
@@ -3429,6 +3471,7 @@ process_chunk_drop(struct sctp_tcb *stcb, struct sctp_chunk_desc *desc,
/* resend last asconf ack */
sctp_send_asconf_ack(stcb);
break;
+ case SCTP_IFORWARD_CUM_TSN:
case SCTP_FORWARD_CUM_TSN:
send_forward_tsn(stcb, &stcb->asoc);
break;
@@ -3454,8 +3497,8 @@ sctp_reset_in_stream(struct sctp_tcb *stcb, uint32_t number_entries, uint16_t *
uint16_t temp;
/*
- * We set things to 0xffff since this is the last delivered sequence
- * and we will be sending in 0 after the reset.
+ * We set things to 0xffffffff since this is the last delivered
+ * sequence and we will be sending in 0 after the reset.
*/
if (number_entries) {
@@ -3464,12 +3507,12 @@ sctp_reset_in_stream(struct sctp_tcb *stcb, uint32_t number_entries, uint16_t *
if (temp >= stcb->asoc.streamincnt) {
continue;
}
- stcb->asoc.strmin[temp].last_sequence_delivered = 0xffff;
+ stcb->asoc.strmin[temp].last_sequence_delivered = 0xffffffff;
}
} else {
list = NULL;
for (i = 0; i < stcb->asoc.streamincnt; i++) {
- stcb->asoc.strmin[i].last_sequence_delivered = 0xffff;
+ stcb->asoc.strmin[i].last_sequence_delivered = 0xffffffff;
}
}
sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_RECV, stcb, number_entries, (void *)list, SCTP_SO_NOT_LOCKED);
@@ -3488,23 +3531,47 @@ sctp_reset_out_streams(struct sctp_tcb *stcb, uint32_t number_entries, uint16_t
/* no such stream */
continue;
}
- stcb->asoc.strmout[temp].next_sequence_send = 0;
+ stcb->asoc.strmout[temp].next_mid_ordered = 0;
+ stcb->asoc.strmout[temp].next_mid_unordered = 0;
}
} else {
for (i = 0; i < stcb->asoc.streamoutcnt; i++) {
- stcb->asoc.strmout[i].next_sequence_send = 0;
+ stcb->asoc.strmout[i].next_mid_ordered = 0;
+ stcb->asoc.strmout[i].next_mid_unordered = 0;
}
}
sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_SEND, stcb, number_entries, (void *)list, SCTP_SO_NOT_LOCKED);
}
+static void
+sctp_reset_clear_pending(struct sctp_tcb *stcb, uint32_t number_entries, uint16_t * list)
+{
+ uint32_t i;
+ uint16_t temp;
-struct sctp_stream_reset_out_request *
+ if (number_entries > 0) {
+ for (i = 0; i < number_entries; i++) {
+ temp = ntohs(list[i]);
+ if (temp >= stcb->asoc.streamoutcnt) {
+ /* no such stream */
+ continue;
+ }
+ stcb->asoc.strmout[temp].state = SCTP_STREAM_OPEN;
+ }
+ } else {
+ for (i = 0; i < stcb->asoc.streamoutcnt; i++) {
+ stcb->asoc.strmout[i].state = SCTP_STREAM_OPEN;
+ }
+ }
+}
+
+
+struct sctp_stream_reset_request *
sctp_find_stream_reset(struct sctp_tcb *stcb, uint32_t seq, struct sctp_tmit_chunk **bchk)
{
struct sctp_association *asoc;
struct sctp_chunkhdr *ch;
- struct sctp_stream_reset_out_request *r;
+ struct sctp_stream_reset_request *r;
struct sctp_tmit_chunk *chk;
int len, clen;
@@ -3527,7 +3594,7 @@ sctp_find_stream_reset(struct sctp_tcb *stcb, uint32_t seq, struct sctp_tmit_chu
}
clen = chk->send_size;
ch = mtod(chk->data, struct sctp_chunkhdr *);
- r = (struct sctp_stream_reset_out_request *)(ch + 1);
+ r = (struct sctp_stream_reset_request *)(ch + 1);
if (ntohl(r->request_seq) == seq) {
/* found it */
return (r);
@@ -3535,7 +3602,7 @@ sctp_find_stream_reset(struct sctp_tcb *stcb, uint32_t seq, struct sctp_tmit_chu
len = SCTP_SIZE32(ntohs(r->ph.param_length));
if (clen > (len + (int)sizeof(struct sctp_chunkhdr))) {
/* move to the next one, there can only be a max of two */
- r = (struct sctp_stream_reset_out_request *)((caddr_t)r + len);
+ r = (struct sctp_stream_reset_request *)((caddr_t)r + len);
if (ntohl(r->request_seq) == seq) {
return (r);
}
@@ -3555,7 +3622,8 @@ sctp_clean_up_stream_reset(struct sctp_tcb *stcb)
}
asoc = &stcb->asoc;
- sctp_timer_stop(SCTP_TIMER_TYPE_STRRESET, stcb->sctp_ep, stcb, chk->whoTo, SCTP_FROM_SCTP_INPUT + SCTP_LOC_26);
+ sctp_timer_stop(SCTP_TIMER_TYPE_STRRESET, stcb->sctp_ep, stcb,
+ chk->whoTo, SCTP_FROM_SCTP_INPUT + SCTP_LOC_28);
TAILQ_REMOVE(&asoc->control_send_queue,
chk,
sctp_next);
@@ -3579,7 +3647,9 @@ sctp_handle_stream_reset_response(struct sctp_tcb *stcb,
int lparm_len;
struct sctp_association *asoc = &stcb->asoc;
struct sctp_tmit_chunk *chk;
- struct sctp_stream_reset_out_request *srparam;
+ struct sctp_stream_reset_request *req_param;
+ struct sctp_stream_reset_out_request *req_out_param;
+ struct sctp_stream_reset_in_request *req_in_param;
uint32_t number_entries;
if (asoc->stream_reset_outstanding == 0) {
@@ -3587,35 +3657,50 @@ sctp_handle_stream_reset_response(struct sctp_tcb *stcb,
return (0);
}
if (seq == stcb->asoc.str_reset_seq_out) {
- srparam = sctp_find_stream_reset(stcb, seq, &chk);
- if (srparam) {
+ req_param = sctp_find_stream_reset(stcb, seq, &chk);
+ if (req_param != NULL) {
stcb->asoc.str_reset_seq_out++;
- type = ntohs(srparam->ph.param_type);
- lparm_len = ntohs(srparam->ph.param_length);
+ type = ntohs(req_param->ph.param_type);
+ lparm_len = ntohs(req_param->ph.param_length);
if (type == SCTP_STR_RESET_OUT_REQUEST) {
+ int no_clear = 0;
+
+ req_out_param = (struct sctp_stream_reset_out_request *)req_param;
number_entries = (lparm_len - sizeof(struct sctp_stream_reset_out_request)) / sizeof(uint16_t);
asoc->stream_reset_out_is_outstanding = 0;
if (asoc->stream_reset_outstanding)
asoc->stream_reset_outstanding--;
if (action == SCTP_STREAM_RESET_RESULT_PERFORMED) {
/* do it */
- sctp_reset_out_streams(stcb, number_entries, srparam->list_of_streams);
+ sctp_reset_out_streams(stcb, number_entries, req_out_param->list_of_streams);
} else if (action == SCTP_STREAM_RESET_RESULT_DENIED) {
- sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_DENIED_OUT, stcb, number_entries, srparam->list_of_streams, SCTP_SO_NOT_LOCKED);
+ sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_DENIED_OUT, stcb, number_entries, req_out_param->list_of_streams, SCTP_SO_NOT_LOCKED);
+ } else if (action == SCTP_STREAM_RESET_RESULT_IN_PROGRESS) {
+ /*
+ * Set it up so we don't stop
+ * retransmitting
+ */
+ asoc->stream_reset_outstanding++;
+ stcb->asoc.str_reset_seq_out--;
+ asoc->stream_reset_out_is_outstanding = 1;
+ no_clear = 1;
} else {
- sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_FAILED_OUT, stcb, number_entries, srparam->list_of_streams, SCTP_SO_NOT_LOCKED);
+ sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_FAILED_OUT, stcb, number_entries, req_out_param->list_of_streams, SCTP_SO_NOT_LOCKED);
+ }
+ if (no_clear == 0) {
+ sctp_reset_clear_pending(stcb, number_entries, req_out_param->list_of_streams);
}
} else if (type == SCTP_STR_RESET_IN_REQUEST) {
- /* Answered my request */
+ req_in_param = (struct sctp_stream_reset_in_request *)req_param;
number_entries = (lparm_len - sizeof(struct sctp_stream_reset_in_request)) / sizeof(uint16_t);
if (asoc->stream_reset_outstanding)
asoc->stream_reset_outstanding--;
if (action == SCTP_STREAM_RESET_RESULT_DENIED) {
sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_DENIED_IN, stcb,
- number_entries, srparam->list_of_streams, SCTP_SO_NOT_LOCKED);
+ number_entries, req_in_param->list_of_streams, SCTP_SO_NOT_LOCKED);
} else if (action != SCTP_STREAM_RESET_RESULT_PERFORMED) {
sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_FAILED_IN, stcb,
- number_entries, srparam->list_of_streams, SCTP_SO_NOT_LOCKED);
+ number_entries, req_in_param->list_of_streams, SCTP_SO_NOT_LOCKED);
}
} else if (type == SCTP_STR_RESET_ADD_OUT_STREAMS) {
/* Ok we now may have more streams */
@@ -3631,7 +3716,12 @@ sctp_handle_stream_reset_response(struct sctp_tcb *stcb,
asoc->stream_reset_outstanding--;
if (action == SCTP_STREAM_RESET_RESULT_PERFORMED) {
/* Put the new streams into effect */
- stcb->asoc.streamoutcnt += num_stream;
+ int i;
+
+ for (i = asoc->streamoutcnt; i < (asoc->streamoutcnt + num_stream); i++) {
+ asoc->strmout[i].state = SCTP_STREAM_OPEN;
+ }
+ asoc->streamoutcnt += num_stream;
sctp_notify_stream_reset_add(stcb, stcb->asoc.streamincnt, stcb->asoc.streamoutcnt, 0);
} else if (action == SCTP_STREAM_RESET_RESULT_DENIED) {
sctp_notify_stream_reset_add(stcb, stcb->asoc.streamincnt, stcb->asoc.streamoutcnt,
@@ -3708,6 +3798,9 @@ sctp_handle_stream_reset_response(struct sctp_tcb *stcb,
}
}
}
+ if (asoc->stream_reset_outstanding == 0) {
+ sctp_send_stream_reset_out_if_possible(stcb, SCTP_SO_NOT_LOCKED);
+ }
return (0);
}
@@ -3738,22 +3831,33 @@ sctp_handle_str_reset_request_in(struct sctp_tcb *stcb,
} else if (stcb->asoc.stream_reset_out_is_outstanding == 0) {
len = ntohs(req->ph.param_length);
number_entries = ((len - sizeof(struct sctp_stream_reset_in_request)) / sizeof(uint16_t));
- for (i = 0; i < number_entries; i++) {
- temp = ntohs(req->list_of_streams[i]);
- req->list_of_streams[i] = temp;
+ if (number_entries) {
+ for (i = 0; i < number_entries; i++) {
+ temp = ntohs(req->list_of_streams[i]);
+ if (temp >= stcb->asoc.streamoutcnt) {
+ asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_DENIED;
+ goto bad_boy;
+ }
+ req->list_of_streams[i] = temp;
+ }
+ for (i = 0; i < number_entries; i++) {
+ if (stcb->asoc.strmout[req->list_of_streams[i]].state == SCTP_STREAM_OPEN) {
+ stcb->asoc.strmout[req->list_of_streams[i]].state = SCTP_STREAM_RESET_PENDING;
+ }
+ }
+ } else {
+ /* Its all */
+ for (i = 0; i < stcb->asoc.streamoutcnt; i++) {
+ if (stcb->asoc.strmout[i].state == SCTP_STREAM_OPEN)
+ stcb->asoc.strmout[i].state = SCTP_STREAM_RESET_PENDING;
+ }
}
asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_PERFORMED;
- sctp_add_stream_reset_out(chk, number_entries, req->list_of_streams,
- asoc->str_reset_seq_out,
- seq, (asoc->sending_seq - 1));
- asoc->stream_reset_out_is_outstanding = 1;
- asoc->str_reset = chk;
- sctp_timer_start(SCTP_TIMER_TYPE_STRRESET, stcb->sctp_ep, stcb, chk->whoTo);
- stcb->asoc.stream_reset_outstanding++;
} else {
/* Can't do it, since we have sent one out */
asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_ERR_IN_PROGRESS;
}
+bad_boy:
sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[0]);
asoc->str_reset_seq_in++;
} else if (asoc->str_reset_seq_in - 1 == seq) {
@@ -3763,6 +3867,7 @@ sctp_handle_str_reset_request_in(struct sctp_tcb *stcb,
} else {
sctp_add_stream_reset_result(chk, seq, SCTP_STREAM_RESET_RESULT_ERR_BAD_SEQNO);
}
+ sctp_send_stream_reset_out_if_possible(stcb, SCTP_SO_NOT_LOCKED);
}
static int
@@ -3881,11 +3986,12 @@ sctp_handle_str_reset_request_out(struct sctp_tcb *stcb,
sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[0]);
return;
}
+ liste->seq = seq;
liste->tsn = tsn;
liste->number_entries = number_entries;
memcpy(&liste->list_of_streams, req->list_of_streams, number_entries * sizeof(uint16_t));
TAILQ_INSERT_TAIL(&asoc->resetHead, liste, next_resp);
- asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_PERFORMED;
+ asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_IN_PROGRESS;
}
sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[0]);
asoc->str_reset_seq_in++;
@@ -3949,20 +4055,28 @@ sctp_handle_str_reset_add_strm(struct sctp_tcb *stcb, struct sctp_tmit_chunk *ch
/* copy off the old data */
for (i = 0; i < stcb->asoc.streamincnt; i++) {
TAILQ_INIT(&stcb->asoc.strmin[i].inqueue);
+ TAILQ_INIT(&stcb->asoc.strmin[i].uno_inqueue);
stcb->asoc.strmin[i].stream_no = i;
stcb->asoc.strmin[i].last_sequence_delivered = oldstrm[i].last_sequence_delivered;
stcb->asoc.strmin[i].delivery_started = oldstrm[i].delivery_started;
+ stcb->asoc.strmin[i].pd_api_started = oldstrm[i].pd_api_started;
/* now anything on those queues? */
- TAILQ_FOREACH_SAFE(ctl, &oldstrm[i].inqueue, next, nctl) {
- TAILQ_REMOVE(&oldstrm[i].inqueue, ctl, next);
- TAILQ_INSERT_TAIL(&stcb->asoc.strmin[i].inqueue, ctl, next);
+ TAILQ_FOREACH_SAFE(ctl, &oldstrm[i].inqueue, next_instrm, nctl) {
+ TAILQ_REMOVE(&oldstrm[i].inqueue, ctl, next_instrm);
+ TAILQ_INSERT_TAIL(&stcb->asoc.strmin[i].inqueue, ctl, next_instrm);
+ }
+ TAILQ_FOREACH_SAFE(ctl, &oldstrm[i].uno_inqueue, next_instrm, nctl) {
+ TAILQ_REMOVE(&oldstrm[i].uno_inqueue, ctl, next_instrm);
+ TAILQ_INSERT_TAIL(&stcb->asoc.strmin[i].uno_inqueue, ctl, next_instrm);
}
}
/* Init the new streams */
for (i = stcb->asoc.streamincnt; i < num_stream; i++) {
TAILQ_INIT(&stcb->asoc.strmin[i].inqueue);
+ TAILQ_INIT(&stcb->asoc.strmin[i].uno_inqueue);
stcb->asoc.strmin[i].stream_no = i;
- stcb->asoc.strmin[i].last_sequence_delivered = 0xffff;
+ stcb->asoc.strmin[i].last_sequence_delivered = 0xffffffff;
+ stcb->asoc.strmin[i].pd_api_started = 0;
stcb->asoc.strmin[i].delivery_started = 0;
}
SCTP_FREE(oldstrm, SCTP_M_STRMI);
@@ -4022,7 +4136,7 @@ sctp_handle_str_reset_add_out_strm(struct sctp_tcb *stcb, struct sctp_tmit_chunk
mychk += num_stream;
if (mychk < 0x10000) {
stcb->asoc.last_reset_action[0] = SCTP_STREAM_RESET_RESULT_PERFORMED;
- if (sctp_send_str_reset_req(stcb, 0, NULL, 0, 0, 0, 1, num_stream, 0, 1)) {
+ if (sctp_send_str_reset_req(stcb, 0, NULL, 0, 0, 1, num_stream, 0, 1)) {
stcb->asoc.last_reset_action[0] = SCTP_STREAM_RESET_RESULT_DENIED;
}
} else {
@@ -4075,13 +4189,15 @@ __attribute__((noinline))
if (chk == NULL) {
return (ret_code);
}
+ chk->copy_by_ref = 0;
chk->rec.chunk_id.id = SCTP_STREAM_RESET;
chk->rec.chunk_id.can_take_data = 0;
+ chk->flags = 0;
chk->asoc = &stcb->asoc;
chk->no_fr_allowed = 0;
chk->book_size = chk->send_size = sizeof(struct sctp_chunkhdr);
chk->book_size_scale = 0;
- chk->data = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_DONTWAIT, 1, MT_DATA);
+ chk->data = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_DATA);
if (chk->data == NULL) {
strres_nochunk:
if (chk->data) {
@@ -4366,7 +4482,7 @@ sctp_handle_packet_dropped(struct sctp_pktdrop_chunk *cp,
(stcb->asoc.sat_t3_loss_recovery == 0) &&
(stcb->asoc.sat_network)) {
/*
- * This is debateable but for sat networks it makes sense
+ * This is debatable but for sat networks it makes sense
* Note if a T3 timer has went off, we will prohibit any
* changes to cwnd until we exit the t3 loss recovery.
*/
@@ -4392,7 +4508,7 @@ __attribute__((noinline))
struct sockaddr *src, struct sockaddr *dst,
struct sctphdr *sh, struct sctp_chunkhdr *ch, struct sctp_inpcb *inp,
struct sctp_tcb *stcb, struct sctp_nets **netp, int *fwd_tsn_seen,
- uint8_t use_mflowid, uint32_t mflowid,
+ uint8_t mflowtype, uint32_t mflowid, uint16_t fibnum,
uint32_t vrf_id, uint16_t port)
{
struct sctp_association *asoc;
@@ -4461,7 +4577,7 @@ __attribute__((noinline))
*/
if ((ch->chunk_type == SCTP_AUTHENTICATION) &&
(stcb == NULL) &&
- !SCTP_BASE_SYSCTL(sctp_auth_disable)) {
+ (inp->auth_supported == 1)) {
/* save this chunk for later processing */
auth_skipped = 1;
auth_offset = *offset;
@@ -4551,12 +4667,12 @@ __attribute__((noinline))
}
}
if (stcb == NULL) {
- snprintf(msg, sizeof(msg), "OOTB, %s:%d at %s\n", __FILE__, __LINE__, __FUNCTION__);
+ snprintf(msg, sizeof(msg), "OOTB, %s:%d at %s", __FILE__, __LINE__, __func__);
op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code),
msg);
/* no association, so it's out of the blue... */
sctp_handle_ootb(m, iphlen, *offset, src, dst, sh, inp, op_err,
- use_mflowid, mflowid,
+ mflowtype, mflowid, inp->fibnum,
vrf_id, port);
*offset = length;
if (locked_tcb) {
@@ -4595,12 +4711,12 @@ __attribute__((noinline))
if (locked_tcb) {
SCTP_TCB_UNLOCK(locked_tcb);
}
- snprintf(msg, sizeof(msg), "OOTB, %s:%d at %s\n", __FILE__, __LINE__, __FUNCTION__);
+ snprintf(msg, sizeof(msg), "OOTB, %s:%d at %s", __FILE__, __LINE__, __func__);
op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code),
msg);
sctp_handle_ootb(m, iphlen, *offset, src, dst,
sh, inp, op_err,
- use_mflowid, mflowid,
+ mflowtype, mflowid, fibnum,
vrf_id, port);
return (NULL);
}
@@ -4728,7 +4844,7 @@ process_control_chunks:
/* check to see if this chunk required auth, but isn't */
if ((stcb != NULL) &&
- !SCTP_BASE_SYSCTL(sctp_auth_disable) &&
+ (stcb->asoc.auth_supported == 1) &&
sctp_auth_is_required_chunk(ch->chunk_type, stcb->asoc.local_auth_chunks) &&
!stcb->asoc.authenticated) {
/* "silently" ignore */
@@ -4741,13 +4857,11 @@ process_control_chunks:
/* The INIT chunk must be the only chunk. */
if ((num_chunks > 1) ||
(length - *offset > (int)SCTP_SIZE32(chk_length))) {
- op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code),
- "INIT not the only chunk");
- sctp_abort_association(inp, stcb, m, iphlen,
- src, dst, sh, op_err,
- use_mflowid, mflowid,
- vrf_id, port);
+ /* RFC 4960 requires that no ABORT is sent */
*offset = length;
+ if (locked_tcb) {
+ SCTP_TCB_UNLOCK(locked_tcb);
+ }
return (NULL);
}
/* Honor our resource limit. */
@@ -4755,15 +4869,15 @@ process_control_chunks:
op_err = sctp_generate_cause(SCTP_CAUSE_OUT_OF_RESC, "");
sctp_abort_association(inp, stcb, m, iphlen,
src, dst, sh, op_err,
- use_mflowid, mflowid,
+ mflowtype, mflowid,
vrf_id, port);
*offset = length;
return (NULL);
}
sctp_handle_init(m, iphlen, *offset, src, dst, sh,
(struct sctp_init_chunk *)ch, inp,
- stcb, &abort_no_unlock,
- use_mflowid, mflowid,
+ stcb, *netp, &abort_no_unlock,
+ mflowtype, mflowid,
vrf_id, port);
*offset = length;
if ((!abort_no_unlock) && (locked_tcb)) {
@@ -4780,7 +4894,7 @@ process_control_chunks:
if ((stcb) && (stcb->asoc.total_output_queue_size)) {
;
} else {
- if (locked_tcb != stcb) {
+ if ((locked_tcb != NULL) && (locked_tcb != stcb)) {
/* Very unlikely */
SCTP_TCB_UNLOCK(locked_tcb);
}
@@ -4794,7 +4908,8 @@ process_control_chunks:
SCTP_TCB_LOCK(stcb);
atomic_subtract_int(&stcb->asoc.refcnt, 1);
#endif
- (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_27);
+ (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC,
+ SCTP_FROM_SCTP_INPUT + SCTP_LOC_29);
#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
SCTP_SOCKET_UNLOCK(so, 1);
#endif
@@ -4817,7 +4932,7 @@ process_control_chunks:
(struct sctp_init_ack_chunk *)ch,
stcb, *netp,
&abort_no_unlock,
- use_mflowid, mflowid,
+ mflowtype, mflowid,
vrf_id);
} else {
ret = -1;
@@ -4936,8 +5051,7 @@ process_control_chunks:
SCTPDBG(SCTP_DEBUG_INDATA1, "No stcb when processing NR-SACK chunk\n");
break;
}
- if ((stcb->asoc.sctp_nr_sack_on_off == 0) ||
- (stcb->asoc.peer_supports_nr_sack == 0)) {
+ if (stcb->asoc.nrsack_supported == 0) {
goto unknown_chunk;
}
if (chk_length < sizeof(struct sctp_nr_sack_chunk)) {
@@ -5123,7 +5237,7 @@ process_control_chunks:
op_err = sctp_generate_cause(SCTP_CAUSE_OUT_OF_RESC, "");
sctp_abort_association(inp, stcb, m, iphlen,
src, dst, sh, op_err,
- use_mflowid, mflowid,
+ mflowtype, mflowid,
vrf_id, port);
}
*offset = length;
@@ -5158,7 +5272,7 @@ process_control_chunks:
auth_offset,
auth_len,
&locked_tcb,
- use_mflowid,
+ mflowtype,
mflowid,
vrf_id,
port);
@@ -5215,7 +5329,8 @@ process_control_chunks:
SCTP_TCB_LOCK(stcb);
atomic_subtract_int(&stcb->asoc.refcnt, 1);
#endif
- (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_27);
+ (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC,
+ SCTP_FROM_SCTP_INPUT + SCTP_LOC_30);
#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
SCTP_SOCKET_UNLOCK(so, 1);
#endif
@@ -5248,6 +5363,9 @@ process_control_chunks:
return (NULL);
}
if (stcb) {
+ if (stcb->asoc.ecn_supported == 0) {
+ goto unknown_chunk;
+ }
if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) {
sctp_misc_ints(SCTP_THRESHOLD_CLEAR,
stcb->asoc.overall_error_count,
@@ -5273,6 +5391,9 @@ process_control_chunks:
return (NULL);
}
if (stcb) {
+ if (stcb->asoc.ecn_supported == 0) {
+ goto unknown_chunk;
+ }
if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) {
sctp_misc_ints(SCTP_THRESHOLD_CLEAR,
stcb->asoc.overall_error_count,
@@ -5306,6 +5427,9 @@ process_control_chunks:
SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_ASCONF\n");
/* He's alive so give him credit */
if (stcb) {
+ if (stcb->asoc.asconf_supported == 0) {
+ goto unknown_chunk;
+ }
if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) {
sctp_misc_ints(SCTP_THRESHOLD_CLEAR,
stcb->asoc.overall_error_count,
@@ -5330,6 +5454,9 @@ process_control_chunks:
return (NULL);
}
if ((stcb) && netp && *netp) {
+ if (stcb->asoc.asconf_supported == 0) {
+ goto unknown_chunk;
+ }
/* He's alive so give him credit */
if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) {
sctp_misc_ints(SCTP_THRESHOLD_CLEAR,
@@ -5346,6 +5473,7 @@ process_control_chunks:
}
break;
case SCTP_FORWARD_CUM_TSN:
+ case SCTP_IFORWARD_CUM_TSN:
SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_FWD-TSN\n");
if (chk_length < sizeof(struct sctp_forward_tsn_chunk)) {
/* Its not ours */
@@ -5359,6 +5487,9 @@ process_control_chunks:
if (stcb) {
int abort_flag = 0;
+ if (stcb->asoc.prsctp_supported == 0) {
+ goto unknown_chunk;
+ }
stcb->asoc.overall_error_count = 0;
if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) {
sctp_misc_ints(SCTP_THRESHOLD_CLEAR,
@@ -5378,7 +5509,8 @@ process_control_chunks:
SCTP_TCB_LOCK(stcb);
atomic_subtract_int(&stcb->asoc.refcnt, 1);
#endif
- (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_29);
+ (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC,
+ SCTP_FROM_SCTP_INPUT + SCTP_LOC_31);
#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
SCTP_SOCKET_UNLOCK(so, 1);
#endif
@@ -5413,13 +5545,8 @@ process_control_chunks:
*offset = length;
return (NULL);
}
- if (stcb->asoc.peer_supports_strreset == 0) {
- /*
- * hmm, peer should have announced this, but
- * we will turn it on since he is sending us
- * a stream reset.
- */
- stcb->asoc.peer_supports_strreset = 1;
+ if (stcb->asoc.reconfig_supported == 0) {
+ goto unknown_chunk;
}
if (sctp_handle_stream_reset(stcb, m, *offset, ch)) {
/* stop processing */
@@ -5439,18 +5566,17 @@ process_control_chunks:
return (NULL);
}
if (ch && (stcb) && netp && (*netp)) {
+ if (stcb->asoc.pktdrop_supported == 0) {
+ goto unknown_chunk;
+ }
sctp_handle_packet_dropped((struct sctp_pktdrop_chunk *)ch,
stcb, *netp,
min(chk_length, (sizeof(chunk_buf) - 4)));
}
break;
-
case SCTP_AUTHENTICATION:
SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_AUTHENTICATION\n");
- if (SCTP_BASE_SYSCTL(sctp_auth_disable))
- goto unknown_chunk;
-
if (stcb == NULL) {
/* save the first AUTH for later processing */
if (auth_skipped == 0) {
@@ -5461,6 +5587,9 @@ process_control_chunks:
/* skip this chunk (temporarily) */
goto next_chunk;
}
+ if (stcb->asoc.auth_supported == 0) {
+ goto unknown_chunk;
+ }
if ((chk_length < (sizeof(struct sctp_auth_chunk))) ||
(chk_length > (sizeof(struct sctp_auth_chunk) +
SCTP_AUTH_DIGEST_LEN_MAX))) {
@@ -5491,43 +5620,27 @@ process_control_chunks:
unknown_chunk:
/* it's an unknown chunk! */
if ((ch->chunk_type & 0x40) && (stcb != NULL)) {
- struct mbuf *mm;
- struct sctp_paramhdr *phd;
-
- mm = sctp_get_mbuf_for_msg(sizeof(struct sctp_paramhdr),
- 0, M_DONTWAIT, 1, MT_DATA);
- if (mm) {
- phd = mtod(mm, struct sctp_paramhdr *);
- /*
- * We cheat and use param type since
- * we did not bother to define a
- * error cause struct. They are the
- * same basic format with different
- * names.
- */
- phd->param_type = htons(SCTP_CAUSE_UNRECOG_CHUNK);
- phd->param_length = htons(chk_length + sizeof(*phd));
- SCTP_BUF_LEN(mm) = sizeof(*phd);
- SCTP_BUF_NEXT(mm) = SCTP_M_COPYM(m, *offset, chk_length, M_DONTWAIT);
- if (SCTP_BUF_NEXT(mm)) {
- if (sctp_pad_lastmbuf(SCTP_BUF_NEXT(mm), SCTP_SIZE32(chk_length) - chk_length, NULL)) {
- sctp_m_freem(mm);
- } else {
+ struct sctp_gen_error_cause *cause;
+ int len;
+
+ op_err = sctp_get_mbuf_for_msg(sizeof(struct sctp_gen_error_cause),
+ 0, M_NOWAIT, 1, MT_DATA);
+ if (op_err != NULL) {
+ len = min(SCTP_SIZE32(chk_length), (uint32_t) (length - *offset));
+ cause = mtod(op_err, struct sctp_gen_error_cause *);
+ cause->code = htons(SCTP_CAUSE_UNRECOG_CHUNK);
+ cause->length = htons((uint16_t) (len + sizeof(struct sctp_gen_error_cause)));
+ SCTP_BUF_LEN(op_err) = sizeof(struct sctp_gen_error_cause);
+ SCTP_BUF_NEXT(op_err) = SCTP_M_COPYM(m, *offset, len, M_NOWAIT);
+ if (SCTP_BUF_NEXT(op_err) != NULL) {
#ifdef SCTP_MBUF_LOGGING
- if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) {
- struct mbuf *mat;
-
- for (mat = SCTP_BUF_NEXT(mm); mat; mat = SCTP_BUF_NEXT(mat)) {
- if (SCTP_BUF_IS_EXTENDED(mat)) {
- sctp_log_mb(mat, SCTP_MBUF_ICOPY);
- }
- }
- }
-#endif
- sctp_queue_op_err(stcb, mm);
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) {
+ sctp_log_mbc(SCTP_BUF_NEXT(op_err), SCTP_MBUF_ICOPY);
}
+#endif
+ sctp_queue_op_err(stcb, op_err);
} else {
- sctp_m_freem(mm);
+ sctp_m_freem(op_err);
}
}
}
@@ -5565,30 +5678,6 @@ next_chunk:
}
-#ifdef INVARIANTS
-#ifdef __GNUC__
-__attribute__((noinline))
-#endif
- void
- sctp_validate_no_locks(struct sctp_inpcb *inp)
-{
- struct sctp_tcb *lstcb;
-
- LIST_FOREACH(lstcb, &inp->sctp_asoc_list, sctp_tcblist) {
- if (mtx_owned(&lstcb->tcb_mtx)) {
- panic("Own lock on stcb at return from input");
- }
- }
- if (mtx_owned(&inp->inp_create_mtx)) {
- panic("Own create lock on inp");
- }
- if (mtx_owned(&inp->inp_mtx)) {
- panic("Own inp lock on inp");
- }
-}
-
-#endif
-
/*
* common input chunk processing (v4 and v6)
*/
@@ -5600,7 +5689,7 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt
uint8_t compute_crc,
#endif
uint8_t ecn_bits,
- uint8_t use_mflowid, uint32_t mflowid,
+ uint8_t mflowtype, uint32_t mflowid, uint16_t fibnum,
uint32_t vrf_id, uint16_t port)
{
uint32_t high_tsn;
@@ -5631,17 +5720,26 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt
calc_check, check, (void *)m, length, iphlen);
stcb = sctp_findassociation_addr(m, offset, src, dst,
sh, ch, &inp, &net, vrf_id);
- if ((net != NULL) && (port != 0)) {
+#if defined(INET) || defined(INET6)
+ if ((ch->chunk_type != SCTP_INITIATION) &&
+ (net != NULL) && (net->port != port)) {
if (net->port == 0) {
- sctp_pathmtu_adjustment(stcb, net->mtu - sizeof(struct udphdr));
+ /* UDP encapsulation turned on. */
+ net->mtu -= sizeof(struct udphdr);
+ if (stcb->asoc.smallest_mtu > net->mtu) {
+ sctp_pathmtu_adjustment(stcb, net->mtu);
+ }
+ } else if (port == 0) {
+ /* UDP encapsulation turned off. */
+ net->mtu += sizeof(struct udphdr);
+ /* XXX Update smallest_mtu */
}
net->port = port;
}
- if ((net != NULL) && (use_mflowid != 0)) {
- net->flowid = mflowid;
-#ifdef INVARIANTS
- net->flowidset = 1;
#endif
+ if (net != NULL) {
+ net->flowtype = mflowtype;
+ net->flowid = mflowid;
}
if ((inp != NULL) && (stcb != NULL)) {
sctp_send_packet_dropped(stcb, net, m, length, iphlen, 1);
@@ -5662,17 +5760,26 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt
}
stcb = sctp_findassociation_addr(m, offset, src, dst,
sh, ch, &inp, &net, vrf_id);
- if ((net != NULL) && (port != 0)) {
+#if defined(INET) || defined(INET6)
+ if ((ch->chunk_type != SCTP_INITIATION) &&
+ (net != NULL) && (net->port != port)) {
if (net->port == 0) {
- sctp_pathmtu_adjustment(stcb, net->mtu - sizeof(struct udphdr));
+ /* UDP encapsulation turned on. */
+ net->mtu -= sizeof(struct udphdr);
+ if (stcb->asoc.smallest_mtu > net->mtu) {
+ sctp_pathmtu_adjustment(stcb, net->mtu);
+ }
+ } else if (port == 0) {
+ /* UDP encapsulation turned off. */
+ net->mtu += sizeof(struct udphdr);
+ /* XXX Update smallest_mtu */
}
net->port = port;
}
- if ((net != NULL) && (use_mflowid != 0)) {
- net->flowid = mflowid;
-#ifdef INVARIANTS
- net->flowidset = 1;
#endif
+ if (net != NULL) {
+ net->flowtype = mflowtype;
+ net->flowid = mflowid;
}
if (inp == NULL) {
SCTP_STAT_INCR(sctps_noport);
@@ -5681,7 +5788,7 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt
}
if (ch->chunk_type == SCTP_SHUTDOWN_ACK) {
sctp_send_shutdown_complete2(src, dst, sh,
- use_mflowid, mflowid,
+ mflowtype, mflowid, fibnum,
vrf_id, port);
goto out;
}
@@ -5696,7 +5803,7 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt
"Out of the blue");
sctp_send_abort(m, iphlen, src, dst,
sh, 0, op_err,
- use_mflowid, mflowid,
+ mflowtype, mflowid, fibnum,
vrf_id, port);
}
}
@@ -5714,7 +5821,6 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt
#ifdef INET
case AF_INET:
if (ipsec4_in_reject(m, &inp->ip_inp.inp)) {
- IPSECSTAT_INC(in_polvio);
SCTP_STAT_INCR(sctps_hdrops);
goto out;
}
@@ -5723,7 +5829,6 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt
#ifdef INET6
case AF_INET6:
if (ipsec6_in_reject(m, &inp->ip_inp.inp)) {
- IPSEC6STAT_INC(in_polvio);
SCTP_STAT_INCR(sctps_hdrops);
goto out;
}
@@ -5753,11 +5858,11 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt
*/
SCTP_TCB_UNLOCK(stcb);
stcb = NULL;
- snprintf(msg, sizeof(msg), "OOTB, %s:%d at %s\n", __FILE__, __LINE__, __FUNCTION__);
+ snprintf(msg, sizeof(msg), "OOTB, %s:%d at %s", __FILE__, __LINE__, __func__);
op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code),
msg);
sctp_handle_ootb(m, iphlen, offset, src, dst, sh, inp, op_err,
- use_mflowid, mflowid,
+ mflowtype, mflowid, inp->fibnum,
vrf_id, port);
goto out;
}
@@ -5768,7 +5873,7 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt
stcb = sctp_process_control(m, iphlen, &offset, length,
src, dst, sh, ch,
inp, stcb, &net, &fwd_tsn_seen,
- use_mflowid, mflowid,
+ mflowtype, mflowid, fibnum,
vrf_id, port);
if (stcb) {
/*
@@ -5776,12 +5881,23 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt
* it changes our INP.
*/
inp = stcb->sctp_ep;
- if ((net) && (port)) {
+#if defined(INET) || defined(INET6)
+ if ((ch->chunk_type != SCTP_INITIATION) &&
+ (net != NULL) && (net->port != port)) {
if (net->port == 0) {
- sctp_pathmtu_adjustment(stcb, net->mtu - sizeof(struct udphdr));
+ /* UDP encapsulation turned on. */
+ net->mtu -= sizeof(struct udphdr);
+ if (stcb->asoc.smallest_mtu > net->mtu) {
+ sctp_pathmtu_adjustment(stcb, net->mtu);
+ }
+ } else if (port == 0) {
+ /* UDP encapsulation turned off. */
+ net->mtu += sizeof(struct udphdr);
+ /* XXX Update smallest_mtu */
}
net->port = port;
}
+#endif
}
} else {
/*
@@ -5795,7 +5911,7 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt
* chunks
*/
if ((stcb != NULL) &&
- !SCTP_BASE_SYSCTL(sctp_auth_disable) &&
+ (stcb->asoc.auth_supported == 1) &&
sctp_auth_is_required_chunk(SCTP_DATA, stcb->asoc.local_auth_chunks)) {
/* "silently" ignore */
SCTP_STAT_INCR(sctps_recvauthmissing);
@@ -5803,11 +5919,11 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt
}
if (stcb == NULL) {
/* out of the blue DATA chunk */
- snprintf(msg, sizeof(msg), "OOTB, %s:%d at %s\n", __FILE__, __LINE__, __FUNCTION__);
+ snprintf(msg, sizeof(msg), "OOTB, %s:%d at %s", __FILE__, __LINE__, __func__);
op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code),
msg);
sctp_handle_ootb(m, iphlen, offset, src, dst, sh, inp, op_err,
- use_mflowid, mflowid,
+ mflowtype, mflowid, fibnum,
vrf_id, port);
goto out;
}
@@ -5837,7 +5953,7 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt
*/
if ((length > offset) &&
(stcb != NULL) &&
- !SCTP_BASE_SYSCTL(sctp_auth_disable) &&
+ (stcb->asoc.auth_supported == 1) &&
sctp_auth_is_required_chunk(SCTP_DATA, stcb->asoc.local_auth_chunks) &&
!stcb->asoc.authenticated) {
/* "silently" ignore */
@@ -5875,11 +5991,11 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt
/*
* We consider OOTB any data sent during asoc setup.
*/
- snprintf(msg, sizeof(msg), "OOTB, %s:%d at %s\n", __FILE__, __LINE__, __FUNCTION__);
+ snprintf(msg, sizeof(msg), "OOTB, %s:%d at %s", __FILE__, __LINE__, __func__);
op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code),
msg);
sctp_handle_ootb(m, iphlen, offset, src, dst, sh, inp, op_err,
- use_mflowid, mflowid,
+ mflowtype, mflowid, inp->fibnum,
vrf_id, port);
goto out;
/* sa_ignore NOTREACHED */
@@ -5898,10 +6014,7 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt
}
/* plow through the data chunks while length > offset */
retval = sctp_process_data(mm, iphlen, &offset, length,
- src, dst, sh,
- inp, stcb, net, &high_tsn,
- use_mflowid, mflowid,
- vrf_id, port);
+ inp, stcb, net, &high_tsn);
if (retval == 2) {
/*
* The association aborted, NO UNLOCK needed since
@@ -5918,7 +6031,7 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt
}
/* take care of ecn */
if ((data_processed == 1) &&
- (stcb->asoc.ecn_allowed == 1) &&
+ (stcb->asoc.ecn_supported == 1) &&
((ecn_bits & SCTP_CE_BITS) == SCTP_CE_BITS)) {
/* Yep, we need to add a ECNE */
sctp_send_ecn_echo(stcb, net, high_tsn);
@@ -5953,7 +6066,9 @@ trigger_send:
if (!TAILQ_EMPTY(&stcb->asoc.control_send_queue)) {
cnt_ctrl_ready = stcb->asoc.ctrl_queue_cnt - stcb->asoc.ecn_echo_cnt_onq;
}
- if (cnt_ctrl_ready ||
+ if (!TAILQ_EMPTY(&stcb->asoc.asconf_send_queue) ||
+ cnt_ctrl_ready ||
+ stcb->asoc.trigger_reset ||
((un_sent) &&
(stcb->asoc.peers_rwnd > 0 ||
(stcb->asoc.peers_rwnd <= 0 && stcb->asoc.total_flight == 0)))) {
@@ -5975,27 +6090,9 @@ out:
SCTP_INP_DECR_REF(inp_decr);
SCTP_INP_WUNLOCK(inp_decr);
}
-#ifdef INVARIANTS
- if (inp != NULL) {
- sctp_validate_no_locks(inp);
- }
-#endif
return;
}
-#if 0
-static void
-sctp_print_mbuf_chain(struct mbuf *m)
-{
- for (; m; m = SCTP_BUF_NEXT(m)) {
- SCTP_PRINTF("%p: m_len = %ld\n", (void *)m, SCTP_BUF_LEN(m));
- if (SCTP_BUF_IS_EXTENDED(m))
- SCTP_PRINTF("%p: extend_size = %d\n", (void *)m, SCTP_BUF_EXTEND_SIZE(m));
- }
-}
-
-#endif
-
#ifdef INET
void
sctp_input_with_port(struct mbuf *i_pak, int off, uint16_t port)
@@ -6015,7 +6112,8 @@ sctp_input_with_port(struct mbuf *i_pak, int off, uint16_t port)
#endif
uint32_t mflowid;
- uint8_t use_mflowid;
+ uint8_t mflowtype;
+ uint16_t fibnum;
iphlen = off;
if (SCTP_GET_PKT_VRFID(i_pak, vrf_id)) {
@@ -6026,13 +6124,7 @@ sctp_input_with_port(struct mbuf *i_pak, int off, uint16_t port)
#ifdef SCTP_MBUF_LOGGING
/* Log in any input mbufs */
if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) {
- struct mbuf *mat;
-
- for (mat = m; mat; mat = SCTP_BUF_NEXT(mat)) {
- if (SCTP_BUF_IS_EXTENDED(mat)) {
- sctp_log_mb(mat, SCTP_MBUF_INPUT);
- }
- }
+ sctp_log_mbc(m, SCTP_MBUF_INPUT);
}
#endif
#ifdef SCTP_PACKET_LOGGING
@@ -6041,17 +6133,13 @@ sctp_input_with_port(struct mbuf *i_pak, int off, uint16_t port)
}
#endif
SCTPDBG(SCTP_DEBUG_CRCOFFLOAD,
- "sctp_input(): Packet of length %d received on %s with csum_flags 0x%x.\n",
+ "sctp_input(): Packet of length %d received on %s with csum_flags 0x%b.\n",
m->m_pkthdr.len,
if_name(m->m_pkthdr.rcvif),
- m->m_pkthdr.csum_flags);
- if (m->m_flags & M_FLOWID) {
- mflowid = m->m_pkthdr.flowid;
- use_mflowid = 1;
- } else {
- mflowid = 0;
- use_mflowid = 0;
- }
+ (int)m->m_pkthdr.csum_flags, CSUM_BITS);
+ mflowid = m->m_pkthdr.flowid;
+ mflowtype = M_HASHTYPE_GET(m);
+ fibnum = M_GETFIB(m);
SCTP_STAT_INCR(sctps_recvpackets);
SCTP_STAT_INCR_COUNTER64(sctps_inpackets);
/* Get IP, SCTP, and first chunk header together in the first mbuf. */
@@ -6076,7 +6164,7 @@ sctp_input_with_port(struct mbuf *i_pak, int off, uint16_t port)
dst.sin_len = sizeof(struct sockaddr_in);
dst.sin_port = sh->dest_port;
dst.sin_addr = ip->ip_dst;
- length = ip->ip_len + iphlen;
+ length = ntohs(ip->ip_len);
/* Validate mbuf chain length with IP payload length. */
if (SCTP_HEADER_LEN(m) != length) {
SCTPDBG(SCTP_DEBUG_INPUT1,
@@ -6111,7 +6199,7 @@ sctp_input_with_port(struct mbuf *i_pak, int off, uint16_t port)
compute_crc,
#endif
ecn_bits,
- use_mflowid, mflowid,
+ mflowtype, mflowid, fibnum,
vrf_id, port);
out:
if (m) {
@@ -6125,18 +6213,23 @@ extern int *sctp_cpuarry;
#endif
-void
-sctp_input(struct mbuf *m, int off)
+int
+sctp_input(struct mbuf **mp, int *offp, int proto SCTP_UNUSED)
{
-#if defined(__FreeBSD__) && defined(SCTP_MCORE_INPUT) && defined(SMP)
- struct ip *ip;
- struct sctphdr *sh;
- int offset;
- int cpu_to_use;
- uint32_t flowid, tag;
+ struct mbuf *m;
+ int off;
+ m = *mp;
+ off = *offp;
+#if defined(__FreeBSD__) && defined(SCTP_MCORE_INPUT) && defined(SMP)
if (mp_ncpus > 1) {
- if (m->m_flags & M_FLOWID) {
+ struct ip *ip;
+ struct sctphdr *sh;
+ int offset;
+ int cpu_to_use;
+ uint32_t flowid, tag;
+
+ if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
flowid = m->m_pkthdr.flowid;
} else {
/*
@@ -6147,7 +6240,7 @@ sctp_input(struct mbuf *m, int off)
if (SCTP_BUF_LEN(m) < offset) {
if ((m = m_pullup(m, offset)) == NULL) {
SCTP_STAT_INCR(sctps_hdrops);
- return;
+ return (IPPROTO_DONE);
}
}
ip = mtod(m, struct ip *);
@@ -6155,14 +6248,15 @@ sctp_input(struct mbuf *m, int off)
tag = htonl(sh->v_tag);
flowid = tag ^ ntohs(sh->dest_port) ^ ntohs(sh->src_port);
m->m_pkthdr.flowid = flowid;
- m->m_flags |= M_FLOWID;
+ M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE_HASH);
}
cpu_to_use = sctp_cpuarry[flowid % mp_ncpus];
sctp_queue_to_mcore(m, off, cpu_to_use);
- return;
+ return (IPPROTO_DONE);
}
#endif
sctp_input_with_port(m, off, 0);
+ return (IPPROTO_DONE);
}
#endif
diff --git a/freebsd/sys/netinet/sctp_input.h b/freebsd/sys/netinet/sctp_input.h
index 95208032..148864b6 100644
--- a/freebsd/sys/netinet/sctp_input.h
+++ b/freebsd/sys/netinet/sctp_input.h
@@ -45,10 +45,10 @@ sctp_common_input_processing(struct mbuf **, int, int, int,
uint8_t,
#endif
uint8_t,
- uint8_t, uint32_t,
+ uint8_t, uint32_t, uint16_t,
uint32_t, uint16_t);
-struct sctp_stream_reset_out_request *
+struct sctp_stream_reset_request *
sctp_find_stream_reset(struct sctp_tcb *stcb, uint32_t seq,
struct sctp_tmit_chunk **bchk);
diff --git a/freebsd/sys/netinet/sctp_lock_bsd.h b/freebsd/sys/netinet/sctp_lock_bsd.h
index 35cdf5f8..96e35214 100644
--- a/freebsd/sys/netinet/sctp_lock_bsd.h
+++ b/freebsd/sys/netinet/sctp_lock_bsd.h
@@ -49,7 +49,7 @@ __FBSDID("$FreeBSD$");
* Most other locks (INP and INFO) attempt to localize the locking i.e. we try
* to contain the lock and unlock within the function that needs to lock it.
* This sometimes mean we do extra locks and unlocks and lose a bit of
- * efficency, but if the performance statements about non-recursive locks are
+ * efficiency, but if the performance statements about non-recursive locks are
* true this should not be a problem. One issue that arises with this only
* lock when needed is that if an implicit association setup is done we have
* a problem. If at the time I lookup an association I have NULL in the tcb
diff --git a/freebsd/sys/netinet/sctp_os_bsd.h b/freebsd/sys/netinet/sctp_os_bsd.h
index d33d1fd3..e87914e5 100644
--- a/freebsd/sys/netinet/sctp_os_bsd.h
+++ b/freebsd/sys/netinet/sctp_os_bsd.h
@@ -95,7 +95,6 @@ __FBSDID("$FreeBSD$");
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
-#include <netinet/icmp6.h>
#include <netinet6/ip6protosw.h>
#include <netinet6/nd6.h>
#include <netinet6/scope6_var.h>
@@ -105,7 +104,7 @@ __FBSDID("$FreeBSD$");
#include <netinet/ip_options.h>
#include <crypto/sha1.h>
-#include <crypto/sha2/sha2.h>
+#include <crypto/sha2/sha256.h>
#ifndef in6pcb
#define in6pcb inpcb
@@ -152,33 +151,27 @@ MALLOC_DECLARE(SCTP_M_MCORE);
#define V_system_base_info VNET(system_base_info)
#define SCTP_BASE_INFO(__m) V_system_base_info.sctppcbinfo.__m
#define SCTP_BASE_STATS V_system_base_info.sctpstat
-#define SCTP_BASE_STATS_SYSCTL VNET_NAME(system_base_info.sctpstat)
-#define SCTP_BASE_STAT(__m) V_system_base_info.sctpstat.__m
-#define SCTP_BASE_SYSCTL(__m) VNET_NAME(system_base_info.sctpsysctl.__m)
+#define SCTP_BASE_STAT(__m) V_system_base_info.sctpstat.__m
+#define SCTP_BASE_SYSCTL(__m) V_system_base_info.sctpsysctl.__m
#define SCTP_BASE_VAR(__m) V_system_base_info.__m
-/*
- *
- */
-#define USER_ADDR_NULL (NULL) /* FIX ME: temp */
-
#define SCTP_PRINTF(params...) printf(params)
#if defined(SCTP_DEBUG)
#define SCTPDBG(level, params...) \
{ \
- do { \
- if (SCTP_BASE_SYSCTL(sctp_debug_on) & level ) { \
- SCTP_PRINTF(params); \
- } \
- } while (0); \
+ do { \
+ if (SCTP_BASE_SYSCTL(sctp_debug_on) & level ) { \
+ SCTP_PRINTF(params); \
+ } \
+ } while (0); \
}
#define SCTPDBG_ADDR(level, addr) \
{ \
- do { \
- if (SCTP_BASE_SYSCTL(sctp_debug_on) & level ) { \
- sctp_print_address(addr); \
- } \
- } while (0); \
+ do { \
+ if (SCTP_BASE_SYSCTL(sctp_debug_on) & level ) { \
+ sctp_print_address(addr); \
+ } \
+ } while (0); \
}
#else
#define SCTPDBG(level, params...)
@@ -194,11 +187,11 @@ MALLOC_DECLARE(SCTP_M_MCORE);
#ifdef SCTP_LTRACE_ERRORS
#define SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, net, file, err) \
if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LTRACE_ERROR_ENABLE) \
- SCTP_PRINTF("mbuf:%p inp:%p stcb:%p net:%p file:%x line:%d error:%d\n", \
+ SCTP_PRINTF("mbuf:%p inp:%p stcb:%p net:%p file:%x line:%d error:%d\n", \
m, inp, stcb, net, file, __LINE__, err);
#define SCTP_LTRACE_ERR_RET(inp, stcb, net, file, err) \
if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LTRACE_ERROR_ENABLE) \
- SCTP_PRINTF("inp:%p stcb:%p net:%p file:%x line:%d error:%d\n", \
+ SCTP_PRINTF("inp:%p stcb:%p net:%p file:%x line:%d error:%d\n", \
inp, stcb, net, file, __LINE__, err);
#else
#define SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, net, file, err)
@@ -232,16 +225,16 @@ MALLOC_DECLARE(SCTP_M_MCORE);
* general memory allocation
*/
#define SCTP_MALLOC(var, type, size, name) \
- do { \
- var = (type)malloc(size, name, M_NOWAIT); \
- } while (0)
+ do { \
+ var = (type)malloc(size, name, M_NOWAIT); \
+ } while (0)
#define SCTP_FREE(var, type) free(var, type)
#define SCTP_MALLOC_SONAME(var, type, size) \
- do { \
- var = (type)malloc(size, M_SONAME, M_WAITOK | M_ZERO); \
- } while (0)
+ do { \
+ var = (type)malloc(size, M_SONAME, M_WAITOK | M_ZERO); \
+ } while (0)
#define SCTP_FREE_SONAME(var) free(var, M_SONAME)
@@ -305,16 +298,12 @@ typedef struct callout sctp_os_timer_t;
#define SCTP_BUF_RESV_UF(m, size) m->m_data += size
#define SCTP_BUF_AT(m, size) m->m_data + size
#define SCTP_BUF_IS_EXTENDED(m) (m->m_flags & M_EXT)
-#define SCTP_BUF_EXTEND_SIZE(m) (m->m_ext.ext_size)
+#define SCTP_BUF_SIZE M_SIZE
#define SCTP_BUF_TYPE(m) (m->m_type)
#define SCTP_BUF_RECVIF(m) (m->m_pkthdr.rcvif)
#define SCTP_BUF_PREPEND M_PREPEND
-#define SCTP_ALIGN_TO_END(m, len) if(m->m_flags & M_PKTHDR) { \
- MH_ALIGN(m, len); \
- } else if ((m->m_flags & M_EXT) == 0) { \
- M_ALIGN(m, len); \
- }
+#define SCTP_ALIGN_TO_END(m, len) M_ALIGN(m, len)
/* We make it so if you have up to 4 threads
* writing based on the default size of
@@ -328,11 +317,11 @@ typedef struct callout sctp_os_timer_t;
/* MTU */
/*************************/
#define SCTP_GATHER_MTU_FROM_IFN_INFO(ifn, ifn_index, af) ((struct ifnet *)ifn)->if_mtu
-#define SCTP_GATHER_MTU_FROM_ROUTE(sctp_ifa, sa, rt) ((rt != NULL) ? rt->rt_rmx.rmx_mtu : 0)
+#define SCTP_GATHER_MTU_FROM_ROUTE(sctp_ifa, sa, rt) ((uint32_t)((rt != NULL) ? rt->rt_mtu : 0))
#define SCTP_GATHER_MTU_FROM_INTFC(sctp_ifn) ((sctp_ifn->ifn_p != NULL) ? ((struct ifnet *)(sctp_ifn->ifn_p))->if_mtu : 0)
#define SCTP_SET_MTU_OF_ROUTE(sa, rt, mtu) do { \
if (rt != NULL) \
- rt->rt_rmx.rmx_mtu = mtu; \
+ rt->rt_mtu = mtu; \
} while(0)
/* (de-)register interface event notifications */
@@ -346,7 +335,7 @@ typedef struct callout sctp_os_timer_t;
/* return the base ext data pointer */
#define SCTP_BUF_EXTEND_BASE(m) (m->m_ext.ext_buf)
/* return the refcnt of the data pointer */
-#define SCTP_BUF_EXTEND_REFCNT(m) (*m->m_ext.ref_cnt)
+#define SCTP_BUF_EXTEND_REFCNT(m) (*m->m_ext.ext_cnt)
/* return any buffer related flags, this is
* used beyond logging for apple only.
*/
@@ -399,6 +388,11 @@ typedef struct callout sctp_os_timer_t;
#define SCTP_CLEAR_SO_NBIO(so) ((so)->so_state &= ~SS_NBIO)
/* get the socket type */
#define SCTP_SO_TYPE(so) ((so)->so_type)
+/* Use a macro for renaming sb_cc to sb_acc.
+ * Initially sb_ccc was used, but this broke select() when used
+ * with SCTP sockets.
+ */
+#define sb_cc sb_acc
/* reserve sb space for a socket */
#define SCTP_SORESERVE(so, send, recv) soreserve(so, send, recv)
/* wakeup a socket */
@@ -418,13 +412,8 @@ typedef struct callout sctp_os_timer_t;
typedef struct route sctp_route_t;
typedef struct rtentry sctp_rtentry_t;
-/*
- * XXX multi-FIB support was backed out in r179783 and it seems clear that the
- * VRF support as currently in FreeBSD is not ready to support multi-FIB.
- * It might be best to implement multi-FIB support for both v4 and v6 indepedent
- * of VRFs and leave those to a real MPLS stack.
- */
-#define SCTP_RTALLOC(ro, vrf_id) rtalloc_ign((struct route *)ro, 0UL)
+#define SCTP_RTALLOC(ro, vrf_id, fibnum) \
+ rtalloc_ign_fib((struct route *)ro, 0UL, fibnum)
/* Future zero copy wakeup/send function */
#define SCTP_ZERO_COPY_EVENT(inp, so)
@@ -432,6 +421,11 @@ typedef struct rtentry sctp_rtentry_t;
#define SCTP_ZERO_COPY_SENDQ_EVENT(inp, so)
/*
+ * SCTP protocol specific mbuf flags.
+ */
+#define M_NOTIFICATION M_PROTO1 /* SCTP notification */
+
+/*
* IP output routines
*/
#define SCTP_IP_OUTPUT(result, o_pak, ro, stcb, vrf_id) \
@@ -442,12 +436,14 @@ typedef struct rtentry sctp_rtentry_t;
local_stcb->sctp_ep && \
local_stcb->sctp_ep->sctp_socket) \
o_flgs |= local_stcb->sctp_ep->sctp_socket->so_options & SO_DONTROUTE; \
+ m_clrprotoflags(o_pak); \
result = ip_output(o_pak, NULL, ro, o_flgs, 0, NULL); \
}
#define SCTP_IP6_OUTPUT(result, o_pak, ro, ifp, stcb, vrf_id) \
{ \
struct sctp_tcb *local_stcb = stcb; \
+ m_clrprotoflags(o_pak); \
if (local_stcb && local_stcb->sctp_ep) \
result = ip6_output(o_pak, \
((struct in6pcb *)(local_stcb->sctp_ep))->in6p_outputopts, \
diff --git a/freebsd/sys/netinet/sctp_output.c b/freebsd/sys/netinet/sctp_output.c
index cbc25b9c..9e12e775 100644
--- a/freebsd/sys/netinet/sctp_output.c
+++ b/freebsd/sys/netinet/sctp_output.c
@@ -52,7 +52,9 @@ __FBSDID("$FreeBSD$");
#include <netinet/sctp_bsd_addr.h>
#include <netinet/sctp_input.h>
#include <netinet/sctp_crc32.h>
+#if defined(INET) || defined(INET6)
#include <netinet/udp.h>
+#endif
#include <netinet/udp_var.h>
#include <machine/in_cksum.h>
@@ -67,7 +69,7 @@ struct sack_track {
struct sctp_gap_ack_block gaps[SCTP_MAX_GAPS_INARRAY];
};
-struct sack_track sack_array[256] = {
+const struct sack_track sack_array[256] = {
{0, 0, 0, 0, /* 0x00 */
{{0, 0},
{0, 0},
@@ -1881,7 +1883,7 @@ sctp_is_address_in_scope(struct sctp_ifa *ifa,
if (scope->ipv4_addr_legal) {
struct sockaddr_in *sin;
- sin = (struct sockaddr_in *)&ifa->address.sin;
+ sin = &ifa->address.sin;
if (sin->sin_addr.s_addr == 0) {
/* not in scope , unspecified */
return (0);
@@ -1912,7 +1914,7 @@ sctp_is_address_in_scope(struct sctp_ifa *ifa,
return (0);
}
/* ok to use deprecated addresses? */
- sin6 = (struct sockaddr_in6 *)&ifa->address.sin6;
+ sin6 = &ifa->address.sin6;
if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
/* skip unspecifed addresses */
return (0);
@@ -1971,7 +1973,7 @@ sctp_add_addr_to_mbuf(struct mbuf *m, struct sctp_ifa *ifa, uint16_t * len)
while (SCTP_BUF_NEXT(mret) != NULL) {
mret = SCTP_BUF_NEXT(mret);
}
- SCTP_BUF_NEXT(mret) = sctp_get_mbuf_for_msg(plen, 0, M_DONTWAIT, 1, MT_DATA);
+ SCTP_BUF_NEXT(mret) = sctp_get_mbuf_for_msg(plen, 0, M_NOWAIT, 1, MT_DATA);
if (SCTP_BUF_NEXT(mret) == NULL) {
/* We are hosed, can't add more addresses */
return (m);
@@ -1987,7 +1989,7 @@ sctp_add_addr_to_mbuf(struct mbuf *m, struct sctp_ifa *ifa, uint16_t * len)
struct sctp_ipv4addr_param *ipv4p;
struct sockaddr_in *sin;
- sin = (struct sockaddr_in *)&ifa->address.sin;
+ sin = &ifa->address.sin;
ipv4p = (struct sctp_ipv4addr_param *)parmh;
parmh->param_type = htons(SCTP_IPV4_ADDRESS);
parmh->param_length = htons(plen);
@@ -2002,7 +2004,7 @@ sctp_add_addr_to_mbuf(struct mbuf *m, struct sctp_ifa *ifa, uint16_t * len)
struct sctp_ipv6addr_param *ipv6p;
struct sockaddr_in6 *sin6;
- sin6 = (struct sockaddr_in6 *)&ifa->address.sin6;
+ sin6 = &ifa->address.sin6;
ipv6p = (struct sctp_ipv6addr_param *)parmh;
parmh->param_type = htons(SCTP_IPV6_ADDRESS);
parmh->param_length = htons(plen);
@@ -2417,7 +2419,7 @@ sctp_is_addr_restricted(struct sctp_tcb *stcb, struct sctp_ifa *ifa)
LIST_FOREACH(laddr, &stcb->asoc.sctp_restricted_addrs, sctp_nxt_addr) {
if (laddr->ifa == NULL) {
SCTPDBG(SCTP_DEBUG_OUTPUT1, "%s: NULL ifa\n",
- __FUNCTION__);
+ __func__);
continue;
}
if (laddr->ifa == ifa) {
@@ -2439,7 +2441,7 @@ sctp_is_addr_in_ep(struct sctp_inpcb *inp, struct sctp_ifa *ifa)
LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) {
if (laddr->ifa == NULL) {
SCTPDBG(SCTP_DEBUG_OUTPUT1, "%s: NULL ifa\n",
- __FUNCTION__);
+ __func__);
continue;
}
if ((laddr->ifa == ifa) && laddr->action == 0)
@@ -3071,7 +3073,7 @@ bound_all_plan_b:
ifn, num_preferred);
if (num_preferred == 0) {
/* None on this interface. */
- SCTPDBG(SCTP_DEBUG_OUTPUT2, "No prefered -- skipping to next\n");
+ SCTPDBG(SCTP_DEBUG_OUTPUT2, "No preferred -- skipping to next\n");
continue;
}
SCTPDBG(SCTP_DEBUG_OUTPUT2,
@@ -3156,12 +3158,10 @@ again_with_private_addresses_allowed:
* It is restricted for some reason..
* probably not yet added.
*/
- SCTPDBG(SCTP_DEBUG_OUTPUT2, "Its resticted\n");
+ SCTPDBG(SCTP_DEBUG_OUTPUT2, "Its restricted\n");
sifa = NULL;
continue;
}
- } else {
- SCTP_PRINTF("Stcb is null - no print\n");
}
atomic_add_int(&sifa->refcount, 1);
goto out;
@@ -3224,12 +3224,14 @@ plan_d:
}
}
#ifdef INET
- if ((retried == 0) && (stcb->asoc.scope.ipv4_local_scope == 0)) {
- stcb->asoc.scope.ipv4_local_scope = 1;
- retried = 1;
- goto again_with_private_addresses_allowed;
- } else if (retried == 1) {
- stcb->asoc.scope.ipv4_local_scope = 0;
+ if (stcb) {
+ if ((retried == 0) && (stcb->asoc.scope.ipv4_local_scope == 0)) {
+ stcb->asoc.scope.ipv4_local_scope = 1;
+ retried = 1;
+ goto again_with_private_addresses_allowed;
+ } else if (retried == 1) {
+ stcb->asoc.scope.ipv4_local_scope = 0;
+ }
}
#endif
out:
@@ -3326,10 +3328,11 @@ sctp_source_address_selection(struct sctp_inpcb *inp,
#endif
/**
- * Rules: - Find the route if needed, cache if I can. - Look at
- * interface address in route, Is it in the bound list. If so we
- * have the best source. - If not we must rotate amongst the
- * addresses.
+ * Rules:
+ * - Find the route if needed, cache if I can.
+ * - Look at interface address in route, Is it in the bound list. If so we
+ * have the best source.
+ * - If not we must rotate amongst the addresses.
*
* Cavets and issues
*
@@ -3391,7 +3394,7 @@ sctp_source_address_selection(struct sctp_inpcb *inp,
/*
* Need a route to cache.
*/
- SCTP_RTALLOC(ro, vrf_id);
+ SCTP_RTALLOC(ro, vrf_id, inp->fibnum);
}
if (ro->ro_rt == NULL) {
return (NULL);
@@ -3508,7 +3511,7 @@ sctp_find_cmsg(int c_type, void *data, struct mbuf *control, size_t cpsize)
return (found);
}
/* It is exactly what we want. Copy it out. */
- m_copydata(control, at + CMSG_ALIGN(sizeof(cmh)), cpsize, (caddr_t)data);
+ m_copydata(control, at + CMSG_ALIGN(sizeof(cmh)), (int)cpsize, (caddr_t)data);
return (1);
} else {
struct sctp_sndrcvinfo *sndrcvinfo;
@@ -3618,6 +3621,11 @@ sctp_process_cmsgs_for_init(struct sctp_tcb *stcb, struct mbuf *control, int *er
struct sctp_stream_out *tmp_str;
unsigned int i;
+#if defined(SCTP_DETAILED_STR_STATS)
+ int j;
+
+#endif
+
/* Default is NOT correct */
SCTPDBG(SCTP_DEBUG_OUTPUT1, "Ok, default:%d pre_open:%d\n",
stcb->asoc.streamoutcnt, stcb->asoc.pre_open_streams);
@@ -3637,10 +3645,21 @@ sctp_process_cmsgs_for_init(struct sctp_tcb *stcb, struct mbuf *control, int *er
for (i = 0; i < stcb->asoc.streamoutcnt; i++) {
TAILQ_INIT(&stcb->asoc.strmout[i].outqueue);
stcb->asoc.strmout[i].chunks_on_queues = 0;
- stcb->asoc.strmout[i].next_sequence_send = 0;
+ stcb->asoc.strmout[i].next_mid_ordered = 0;
+ stcb->asoc.strmout[i].next_mid_unordered = 0;
+#if defined(SCTP_DETAILED_STR_STATS)
+ for (j = 0; j < SCTP_PR_SCTP_MAX + 1; j++) {
+ stcb->asoc.strmout[i].abandoned_sent[j] = 0;
+ stcb->asoc.strmout[i].abandoned_unsent[j] = 0;
+ }
+#else
+ stcb->asoc.strmout[i].abandoned_sent[0] = 0;
+ stcb->asoc.strmout[i].abandoned_unsent[0] = 0;
+#endif
stcb->asoc.strmout[i].stream_no = i;
stcb->asoc.strmout[i].last_msg_incomplete = 0;
- stcb->asoc.ss_functions.sctp_ss_init_stream(&stcb->asoc.strmout[i], NULL);
+ stcb->asoc.strmout[i].state = SCTP_STREAM_OPENING;
+ stcb->asoc.ss_functions.sctp_ss_init_stream(stcb, &stcb->asoc.strmout[i], NULL);
}
}
break;
@@ -3661,7 +3680,7 @@ sctp_process_cmsgs_for_init(struct sctp_tcb *stcb, struct mbuf *control, int *er
*error = EINVAL;
return (1);
}
- if (sctp_add_remote_addr(stcb, (struct sockaddr *)&sin, NULL,
+ if (sctp_add_remote_addr(stcb, (struct sockaddr *)&sin, NULL, stcb->asoc.port,
SCTP_DONOT_SETSCOPE, SCTP_ADDR_IS_CONFIRMED)) {
*error = ENOBUFS;
return (1);
@@ -3693,14 +3712,14 @@ sctp_process_cmsgs_for_init(struct sctp_tcb *stcb, struct mbuf *control, int *er
*error = EINVAL;
return (1);
}
- if (sctp_add_remote_addr(stcb, (struct sockaddr *)&sin, NULL,
+ if (sctp_add_remote_addr(stcb, (struct sockaddr *)&sin, NULL, stcb->asoc.port,
SCTP_DONOT_SETSCOPE, SCTP_ADDR_IS_CONFIRMED)) {
*error = ENOBUFS;
return (1);
}
} else
#endif
- if (sctp_add_remote_addr(stcb, (struct sockaddr *)&sin6, NULL,
+ if (sctp_add_remote_addr(stcb, (struct sockaddr *)&sin6, NULL, stcb->asoc.port,
SCTP_DONOT_SETSCOPE, SCTP_ADDR_IS_CONFIRMED)) {
*error = ENOBUFS;
return (1);
@@ -3821,28 +3840,22 @@ sctp_add_cookie(struct mbuf *init, int init_offset,
mret = sctp_get_mbuf_for_msg((sizeof(struct sctp_state_cookie) +
sizeof(struct sctp_paramhdr)), 0,
- M_DONTWAIT, 1, MT_DATA);
+ M_NOWAIT, 1, MT_DATA);
if (mret == NULL) {
return (NULL);
}
- copy_init = SCTP_M_COPYM(init, init_offset, M_COPYALL, M_DONTWAIT);
+ copy_init = SCTP_M_COPYM(init, init_offset, M_COPYALL, M_NOWAIT);
if (copy_init == NULL) {
sctp_m_freem(mret);
return (NULL);
}
#ifdef SCTP_MBUF_LOGGING
if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) {
- struct mbuf *mat;
-
- for (mat = copy_init; mat; mat = SCTP_BUF_NEXT(mat)) {
- if (SCTP_BUF_IS_EXTENDED(mat)) {
- sctp_log_mb(mat, SCTP_MBUF_ICOPY);
- }
- }
+ sctp_log_mbc(copy_init, SCTP_MBUF_ICOPY);
}
#endif
copy_initack = SCTP_M_COPYM(initack, initack_offset, M_COPYALL,
- M_DONTWAIT);
+ M_NOWAIT);
if (copy_initack == NULL) {
sctp_m_freem(mret);
sctp_m_freem(copy_init);
@@ -3850,13 +3863,7 @@ sctp_add_cookie(struct mbuf *init, int init_offset,
}
#ifdef SCTP_MBUF_LOGGING
if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) {
- struct mbuf *mat;
-
- for (mat = copy_initack; mat; mat = SCTP_BUF_NEXT(mat)) {
- if (SCTP_BUF_IS_EXTENDED(mat)) {
- sctp_log_mb(mat, SCTP_MBUF_ICOPY);
- }
- }
+ sctp_log_mbc(copy_initack, SCTP_MBUF_ICOPY);
}
#endif
/* easy side we just drop it on the end */
@@ -3892,7 +3899,7 @@ sctp_add_cookie(struct mbuf *init, int init_offset,
break;
}
}
- sig = sctp_get_mbuf_for_msg(SCTP_SECRET_SIZE, 0, M_DONTWAIT, 1, MT_DATA);
+ sig = sctp_get_mbuf_for_msg(SCTP_SECRET_SIZE, 0, M_NOWAIT, 1, MT_DATA);
if (sig == NULL) {
/* no space, so free the entire chain */
sctp_m_freem(mret);
@@ -3914,7 +3921,7 @@ sctp_add_cookie(struct mbuf *init, int init_offset,
static uint8_t
sctp_get_ect(struct sctp_tcb *stcb)
{
- if ((stcb != NULL) && (stcb->asoc.ecn_allowed == 1)) {
+ if ((stcb != NULL) && (stcb->asoc.ecn_supported == 1)) {
return (SCTP_ECT0_BIT);
} else {
return (0);
@@ -3985,7 +3992,7 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp,
uint32_t v_tag,
uint16_t port,
union sctp_sockstore *over_addr,
- uint8_t use_mflowid, uint32_t mflowid,
+ uint8_t mflowtype, uint32_t mflowid,
#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING)
int so_locked SCTP_UNUSED
#else
@@ -4061,11 +4068,11 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp,
sctp_route_t iproute;
int len;
- len = sizeof(struct ip) + sizeof(struct sctphdr);
+ len = SCTP_MIN_V4_OVERHEAD;
if (port) {
len += sizeof(struct udphdr);
}
- newm = sctp_get_mbuf_for_msg(len, 1, M_DONTWAIT, 1, MT_DATA);
+ newm = sctp_get_mbuf_for_msg(len, 1, M_NOWAIT, 1, MT_DATA);
if (newm == NULL) {
sctp_m_freem(m);
SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM);
@@ -4076,18 +4083,11 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp,
SCTP_BUF_NEXT(newm) = m;
m = newm;
if (net != NULL) {
-#ifdef INVARIANTS
- if (net->flowidset == 0) {
- panic("Flow ID not set");
- }
-#endif
m->m_pkthdr.flowid = net->flowid;
- m->m_flags |= M_FLOWID;
+ M_HASHTYPE_SET(m, net->flowtype);
} else {
- if (use_mflowid != 0) {
- m->m_pkthdr.flowid = mflowid;
- m->m_flags |= M_FLOWID;
- }
+ m->m_pkthdr.flowid = mflowid;
+ M_HASHTYPE_SET(m, mflowtype);
}
packet_length = sctp_calculate_len(m);
ip = mtod(m, struct ip *);
@@ -4106,15 +4106,15 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp,
tos_value |= sctp_get_ect(stcb);
}
if ((nofragment_flag) && (port == 0)) {
- ip->ip_off = IP_DF;
+ ip->ip_off = htons(IP_DF);
} else {
- ip->ip_off = 0;
+ ip->ip_off = htons(0);
}
/* FreeBSD has a function for ip_id's */
- ip->ip_id = ip_newid();
+ ip_fillid(ip);
ip->ip_ttl = inp->ip_inp.inp.inp_ip_ttl;
- ip->ip_len = packet_length;
+ ip->ip_len = htons(packet_length);
ip->ip_tos = tos_value;
if (port) {
ip->ip_p = IPPROTO_UDP;
@@ -4177,7 +4177,7 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp,
sctp_free_ifa(_lsrc);
} else {
ip->ip_src = over_addr->sin.sin_addr;
- SCTP_RTALLOC(ro, vrf_id);
+ SCTP_RTALLOC(ro, vrf_id, inp->fibnum);
}
}
if (port) {
@@ -4190,7 +4190,7 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp,
udp = (struct udphdr *)((caddr_t)ip + sizeof(struct ip));
udp->uh_sport = htons(SCTP_BASE_SYSCTL(sctp_udp_tunneling_port));
udp->uh_dport = port;
- udp->uh_ulen = htons(packet_length - sizeof(struct ip));
+ udp->uh_ulen = htons((uint16_t) (packet_length - sizeof(struct ip)));
if (V_udp_cksum) {
udp->uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, udp->uh_ulen + htons(IPPROTO_UDP));
} else {
@@ -4350,11 +4350,11 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp,
flowlabel = ntohl(((struct in6pcb *)inp)->in6p_flowinfo);
}
flowlabel &= 0x000fffff;
- len = sizeof(struct ip6_hdr) + sizeof(struct sctphdr);
+ len = SCTP_MIN_OVERHEAD;
if (port) {
len += sizeof(struct udphdr);
}
- newm = sctp_get_mbuf_for_msg(len, 1, M_DONTWAIT, 1, MT_DATA);
+ newm = sctp_get_mbuf_for_msg(len, 1, M_NOWAIT, 1, MT_DATA);
if (newm == NULL) {
sctp_m_freem(m);
SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM);
@@ -4365,18 +4365,11 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp,
SCTP_BUF_NEXT(newm) = m;
m = newm;
if (net != NULL) {
-#ifdef INVARIANTS
- if (net->flowidset == 0) {
- panic("Flow ID not set");
- }
-#endif
m->m_pkthdr.flowid = net->flowid;
- m->m_flags |= M_FLOWID;
+ M_HASHTYPE_SET(m, net->flowtype);
} else {
- if (use_mflowid != 0) {
- m->m_pkthdr.flowid = mflowid;
- m->m_flags |= M_FLOWID;
- }
+ m->m_pkthdr.flowid = mflowid;
+ M_HASHTYPE_SET(m, mflowtype);
}
packet_length = sctp_calculate_len(m);
@@ -4425,7 +4418,7 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp,
} else {
ip6h->ip6_nxt = IPPROTO_SCTP;
}
- ip6h->ip6_plen = (packet_length - sizeof(struct ip6_hdr));
+ ip6h->ip6_plen = (uint16_t) (packet_length - sizeof(struct ip6_hdr));
ip6h->ip6_dst = sin6->sin6_addr;
/*
@@ -4498,7 +4491,7 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp,
sctp_free_ifa(_lsrc);
} else {
lsa6->sin6_addr = over_addr->sin6.sin6_addr;
- SCTP_RTALLOC(ro, vrf_id);
+ SCTP_RTALLOC(ro, vrf_id, inp->fibnum);
}
(void)sa6_recoverscope(sin6);
}
@@ -4544,7 +4537,7 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp,
udp = (struct udphdr *)((caddr_t)ip6h + sizeof(struct ip6_hdr));
udp->uh_sport = htons(SCTP_BASE_SYSCTL(sctp_udp_tunneling_port));
udp->uh_dport = port;
- udp->uh_ulen = htons(packet_length - sizeof(struct ip6_hdr));
+ udp->uh_ulen = htons((uint16_t) (packet_length - sizeof(struct ip6_hdr)));
udp->uh_sum = 0;
sctphdr = (struct sctphdr *)((caddr_t)udp + sizeof(struct udphdr));
} else {
@@ -4700,7 +4693,7 @@ sctp_send_initiate(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int so_locked
#endif
)
{
- struct mbuf *m;
+ struct mbuf *m, *m_last;
struct sctp_nets *net;
struct sctp_init_chunk *init;
struct sctp_supported_addr_param *sup_addr;
@@ -4745,7 +4738,7 @@ sctp_send_initiate(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int so_locked
/* start the INIT timer */
sctp_timer_start(SCTP_TIMER_TYPE_INIT, inp, stcb, net);
- m = sctp_get_mbuf_for_msg(MCLBYTES, 1, M_DONTWAIT, 1, MT_DATA);
+ m = sctp_get_mbuf_for_msg(MCLBYTES, 1, M_NOWAIT, 1, MT_DATA);
if (m == NULL) {
/* No memory, INIT timer will re-attempt. */
SCTPDBG(SCTP_DEBUG_OUTPUT4, "Sending INIT - mbuf?\n");
@@ -4753,12 +4746,6 @@ sctp_send_initiate(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int so_locked
}
chunk_len = (uint16_t) sizeof(struct sctp_init_chunk);
padding_len = 0;
- /*
- * assume peer supports asconf in order to be able to queue local
- * address changes while an INIT is in flight and before the assoc
- * is established.
- */
- stcb->asoc.peer_supports_asconf = 1;
/* Now lets put the chunk header in place */
init = mtod(m, struct sctp_init_chunk *);
/* now the chunk header */
@@ -4775,120 +4762,76 @@ sctp_send_initiate(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int so_locked
init->init.num_inbound_streams = htons(stcb->asoc.max_inbound_streams);
init->init.initial_tsn = htonl(stcb->asoc.init_seq_number);
- if (stcb->asoc.scope.ipv4_addr_legal || stcb->asoc.scope.ipv6_addr_legal) {
- uint8_t i;
-
- parameter_len = (uint16_t) sizeof(struct sctp_paramhdr);
- if (stcb->asoc.scope.ipv4_addr_legal) {
- parameter_len += (uint16_t) sizeof(uint16_t);
- }
- if (stcb->asoc.scope.ipv6_addr_legal) {
- parameter_len += (uint16_t) sizeof(uint16_t);
- }
- sup_addr = (struct sctp_supported_addr_param *)(mtod(m, caddr_t)+chunk_len);
- sup_addr->ph.param_type = htons(SCTP_SUPPORTED_ADDRTYPE);
- sup_addr->ph.param_length = htons(parameter_len);
- i = 0;
- if (stcb->asoc.scope.ipv4_addr_legal) {
- sup_addr->addr_type[i++] = htons(SCTP_IPV4_ADDRESS);
- }
- if (stcb->asoc.scope.ipv6_addr_legal) {
- sup_addr->addr_type[i++] = htons(SCTP_IPV6_ADDRESS);
- }
- padding_len = 4 - 2 * i;
- chunk_len += parameter_len;
- }
/* Adaptation layer indication parameter */
if (inp->sctp_ep.adaptation_layer_indicator_provided) {
- if (padding_len > 0) {
- memset(mtod(m, caddr_t)+chunk_len, 0, padding_len);
- chunk_len += padding_len;
- padding_len = 0;
- }
parameter_len = (uint16_t) sizeof(struct sctp_adaptation_layer_indication);
ali = (struct sctp_adaptation_layer_indication *)(mtod(m, caddr_t)+chunk_len);
ali->ph.param_type = htons(SCTP_ULP_ADAPTATION);
ali->ph.param_length = htons(parameter_len);
- ali->indication = ntohl(inp->sctp_ep.adaptation_layer_indicator);
+ ali->indication = htonl(inp->sctp_ep.adaptation_layer_indicator);
chunk_len += parameter_len;
}
- if (SCTP_BASE_SYSCTL(sctp_inits_include_nat_friendly)) {
- /* Add NAT friendly parameter. */
- if (padding_len > 0) {
- memset(mtod(m, caddr_t)+chunk_len, 0, padding_len);
- chunk_len += padding_len;
- padding_len = 0;
- }
+ /* ECN parameter */
+ if (stcb->asoc.ecn_supported == 1) {
parameter_len = (uint16_t) sizeof(struct sctp_paramhdr);
ph = (struct sctp_paramhdr *)(mtod(m, caddr_t)+chunk_len);
- ph->param_type = htons(SCTP_HAS_NAT_SUPPORT);
+ ph->param_type = htons(SCTP_ECN_CAPABLE);
ph->param_length = htons(parameter_len);
chunk_len += parameter_len;
}
- /* now any cookie time extensions */
- if (stcb->asoc.cookie_preserve_req) {
- struct sctp_cookie_perserve_param *cookie_preserve;
-
- if (padding_len > 0) {
- memset(mtod(m, caddr_t)+chunk_len, 0, padding_len);
- chunk_len += padding_len;
- padding_len = 0;
- }
- parameter_len = (uint16_t) sizeof(struct sctp_cookie_perserve_param);
- cookie_preserve = (struct sctp_cookie_perserve_param *)(mtod(m, caddr_t)+chunk_len);
- cookie_preserve->ph.param_type = htons(SCTP_COOKIE_PRESERVE);
- cookie_preserve->ph.param_length = htons(parameter_len);
- cookie_preserve->time = htonl(stcb->asoc.cookie_preserve_req);
- stcb->asoc.cookie_preserve_req = 0;
+ /* PR-SCTP supported parameter */
+ if (stcb->asoc.prsctp_supported == 1) {
+ parameter_len = (uint16_t) sizeof(struct sctp_paramhdr);
+ ph = (struct sctp_paramhdr *)(mtod(m, caddr_t)+chunk_len);
+ ph->param_type = htons(SCTP_PRSCTP_SUPPORTED);
+ ph->param_length = htons(parameter_len);
chunk_len += parameter_len;
}
- /* ECN parameter */
- if (stcb->asoc.ecn_allowed == 1) {
- if (padding_len > 0) {
- memset(mtod(m, caddr_t)+chunk_len, 0, padding_len);
- chunk_len += padding_len;
- padding_len = 0;
- }
+ /* Add NAT friendly parameter. */
+ if (SCTP_BASE_SYSCTL(sctp_inits_include_nat_friendly)) {
parameter_len = (uint16_t) sizeof(struct sctp_paramhdr);
ph = (struct sctp_paramhdr *)(mtod(m, caddr_t)+chunk_len);
- ph->param_type = htons(SCTP_ECN_CAPABLE);
+ ph->param_type = htons(SCTP_HAS_NAT_SUPPORT);
ph->param_length = htons(parameter_len);
chunk_len += parameter_len;
}
- /* And now tell the peer we do support PR-SCTP. */
- if (padding_len > 0) {
- memset(mtod(m, caddr_t)+chunk_len, 0, padding_len);
- chunk_len += padding_len;
- padding_len = 0;
- }
- parameter_len = (uint16_t) sizeof(struct sctp_paramhdr);
- ph = (struct sctp_paramhdr *)(mtod(m, caddr_t)+chunk_len);
- ph->param_type = htons(SCTP_PRSCTP_SUPPORTED);
- ph->param_length = htons(parameter_len);
- chunk_len += parameter_len;
-
- /* And now tell the peer we do all the extensions */
- pr_supported = (struct sctp_supported_chunk_types_param *)(mtod(m, caddr_t)+chunk_len);
- pr_supported->ph.param_type = htons(SCTP_SUPPORTED_CHUNK_EXT);
+ /* And now tell the peer which extensions we support */
num_ext = 0;
- pr_supported->chunk_types[num_ext++] = SCTP_ASCONF;
- pr_supported->chunk_types[num_ext++] = SCTP_ASCONF_ACK;
- pr_supported->chunk_types[num_ext++] = SCTP_FORWARD_CUM_TSN;
- pr_supported->chunk_types[num_ext++] = SCTP_PACKET_DROPPED;
- pr_supported->chunk_types[num_ext++] = SCTP_STREAM_RESET;
- if (!SCTP_BASE_SYSCTL(sctp_auth_disable)) {
+ pr_supported = (struct sctp_supported_chunk_types_param *)(mtod(m, caddr_t)+chunk_len);
+ if (stcb->asoc.prsctp_supported == 1) {
+ pr_supported->chunk_types[num_ext++] = SCTP_FORWARD_CUM_TSN;
+ if (stcb->asoc.idata_supported) {
+ pr_supported->chunk_types[num_ext++] = SCTP_IFORWARD_CUM_TSN;
+ }
+ }
+ if (stcb->asoc.auth_supported == 1) {
pr_supported->chunk_types[num_ext++] = SCTP_AUTHENTICATION;
}
- if (stcb->asoc.sctp_nr_sack_on_off == 1) {
+ if (stcb->asoc.asconf_supported == 1) {
+ pr_supported->chunk_types[num_ext++] = SCTP_ASCONF;
+ pr_supported->chunk_types[num_ext++] = SCTP_ASCONF_ACK;
+ }
+ if (stcb->asoc.reconfig_supported == 1) {
+ pr_supported->chunk_types[num_ext++] = SCTP_STREAM_RESET;
+ }
+ if (stcb->asoc.idata_supported) {
+ pr_supported->chunk_types[num_ext++] = SCTP_IDATA;
+ }
+ if (stcb->asoc.nrsack_supported == 1) {
pr_supported->chunk_types[num_ext++] = SCTP_NR_SELECTIVE_ACK;
}
- parameter_len = (uint16_t) sizeof(struct sctp_supported_chunk_types_param) + num_ext;
- pr_supported->ph.param_length = htons(parameter_len);
- padding_len = SCTP_SIZE32(parameter_len) - parameter_len;
- chunk_len += parameter_len;
-
+ if (stcb->asoc.pktdrop_supported == 1) {
+ pr_supported->chunk_types[num_ext++] = SCTP_PACKET_DROPPED;
+ }
+ if (num_ext > 0) {
+ parameter_len = (uint16_t) sizeof(struct sctp_supported_chunk_types_param) + num_ext;
+ pr_supported->ph.param_type = htons(SCTP_SUPPORTED_CHUNK_EXT);
+ pr_supported->ph.param_length = htons(parameter_len);
+ padding_len = SCTP_SIZE32(parameter_len) - parameter_len;
+ chunk_len += parameter_len;
+ }
/* add authentication parameters */
- if (!SCTP_BASE_SYSCTL(sctp_auth_disable)) {
+ if (stcb->asoc.auth_supported) {
/* attach RANDOM parameter, if available */
if (stcb->asoc.authinfo.random != NULL) {
struct sctp_auth_random *randp;
@@ -4906,8 +4849,7 @@ sctp_send_initiate(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int so_locked
chunk_len += parameter_len;
}
/* add HMAC_ALGO parameter */
- if ((stcb->asoc.local_hmacs != NULL) &&
- (stcb->asoc.local_hmacs->num_algo > 0)) {
+ if (stcb->asoc.local_hmacs != NULL) {
struct sctp_auth_hmac_algo *hmacs;
if (padding_len > 0) {
@@ -4925,7 +4867,7 @@ sctp_send_initiate(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int so_locked
chunk_len += parameter_len;
}
/* add CHUNKS parameter */
- if (sctp_auth_get_chklist_size(stcb->asoc.local_auth_chunks) > 0) {
+ if (stcb->asoc.local_auth_chunks != NULL) {
struct sctp_auth_chunk_list *chunks;
if (padding_len > 0) {
@@ -4943,8 +4885,52 @@ sctp_send_initiate(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int so_locked
chunk_len += parameter_len;
}
}
- SCTP_BUF_LEN(m) = chunk_len;
+ /* now any cookie time extensions */
+ if (stcb->asoc.cookie_preserve_req) {
+ struct sctp_cookie_perserve_param *cookie_preserve;
+
+ if (padding_len > 0) {
+ memset(mtod(m, caddr_t)+chunk_len, 0, padding_len);
+ chunk_len += padding_len;
+ padding_len = 0;
+ }
+ parameter_len = (uint16_t) sizeof(struct sctp_cookie_perserve_param);
+ cookie_preserve = (struct sctp_cookie_perserve_param *)(mtod(m, caddr_t)+chunk_len);
+ cookie_preserve->ph.param_type = htons(SCTP_COOKIE_PRESERVE);
+ cookie_preserve->ph.param_length = htons(parameter_len);
+ cookie_preserve->time = htonl(stcb->asoc.cookie_preserve_req);
+ stcb->asoc.cookie_preserve_req = 0;
+ chunk_len += parameter_len;
+ }
+ if (stcb->asoc.scope.ipv4_addr_legal || stcb->asoc.scope.ipv6_addr_legal) {
+ uint8_t i;
+ if (padding_len > 0) {
+ memset(mtod(m, caddr_t)+chunk_len, 0, padding_len);
+ chunk_len += padding_len;
+ padding_len = 0;
+ }
+ parameter_len = (uint16_t) sizeof(struct sctp_paramhdr);
+ if (stcb->asoc.scope.ipv4_addr_legal) {
+ parameter_len += (uint16_t) sizeof(uint16_t);
+ }
+ if (stcb->asoc.scope.ipv6_addr_legal) {
+ parameter_len += (uint16_t) sizeof(uint16_t);
+ }
+ sup_addr = (struct sctp_supported_addr_param *)(mtod(m, caddr_t)+chunk_len);
+ sup_addr->ph.param_type = htons(SCTP_SUPPORTED_ADDRTYPE);
+ sup_addr->ph.param_length = htons(parameter_len);
+ i = 0;
+ if (stcb->asoc.scope.ipv4_addr_legal) {
+ sup_addr->addr_type[i++] = htons(SCTP_IPV4_ADDRESS);
+ }
+ if (stcb->asoc.scope.ipv6_addr_legal) {
+ sup_addr->addr_type[i++] = htons(SCTP_IPV6_ADDRESS);
+ }
+ padding_len = 4 - 2 * i;
+ chunk_len += parameter_len;
+ }
+ SCTP_BUF_LEN(m) = chunk_len;
/* now the addresses */
/*
* To optimize this we could put the scoping stuff into a structure
@@ -4952,18 +4938,13 @@ sctp_send_initiate(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int so_locked
* we could just sifa in the address within the stcb. But for now
* this is a quick hack to get the address stuff teased apart.
*/
- sctp_add_addresses_to_i_ia(inp, stcb, &stcb->asoc.scope, m, cnt_inits_to, &padding_len, &chunk_len);
+ m_last = sctp_add_addresses_to_i_ia(inp, stcb, &stcb->asoc.scope,
+ m, cnt_inits_to,
+ &padding_len, &chunk_len);
init->ch.chunk_length = htons(chunk_len);
if (padding_len > 0) {
- struct mbuf *m_at, *mp_last;
-
- mp_last = NULL;
- for (m_at = m; m_at; m_at = SCTP_BUF_NEXT(m_at)) {
- if (SCTP_BUF_NEXT(m_at) == NULL)
- mp_last = m_at;
- }
- if ((mp_last == NULL) || sctp_add_pad_tombuf(mp_last, padding_len)) {
+ if (sctp_add_pad_tombuf(m_last, padding_len) == NULL) {
sctp_m_freem(m);
return;
}
@@ -5100,7 +5081,6 @@ sctp_arethere_unrecognized_parameters(struct mbuf *in_initpkt,
*nat_friendly = 1;
/* fall through */
case SCTP_PRSCTP_SUPPORTED:
-
if (padded_size != sizeof(struct sctp_paramhdr)) {
SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error prsctp/nat support %d\n", plen);
goto invalid_size;
@@ -5108,7 +5088,7 @@ sctp_arethere_unrecognized_parameters(struct mbuf *in_initpkt,
at += padded_size;
break;
case SCTP_ECN_CAPABLE:
- if (padded_size != sizeof(struct sctp_ecn_supported_param)) {
+ if (padded_size != sizeof(struct sctp_paramhdr)) {
SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error ecn %d\n", plen);
goto invalid_size;
}
@@ -5138,13 +5118,14 @@ sctp_arethere_unrecognized_parameters(struct mbuf *in_initpkt,
if (op_err == NULL) {
/* Ok need to try to get a mbuf */
#ifdef INET6
- l_len = sizeof(struct ip6_hdr) + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr);
+ l_len = SCTP_MIN_OVERHEAD;
#else
- l_len = sizeof(struct ip) + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr);
+ l_len = SCTP_MIN_V4_OVERHEAD;
#endif
+ l_len += sizeof(struct sctp_chunkhdr);
l_len += plen;
l_len += sizeof(struct sctp_paramhdr);
- op_err = sctp_get_mbuf_for_msg(l_len, 0, M_DONTWAIT, 1, MT_DATA);
+ op_err = sctp_get_mbuf_for_msg(l_len, 0, M_NOWAIT, 1, MT_DATA);
if (op_err) {
SCTP_BUF_LEN(op_err) = 0;
/*
@@ -5207,13 +5188,14 @@ sctp_arethere_unrecognized_parameters(struct mbuf *in_initpkt,
/* Ok need to try to get an mbuf */
#ifdef INET6
- l_len = sizeof(struct ip6_hdr) + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr);
+ l_len = SCTP_MIN_OVERHEAD;
#else
- l_len = sizeof(struct ip) + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr);
+ l_len = SCTP_MIN_V4_OVERHEAD;
#endif
+ l_len += sizeof(struct sctp_chunkhdr);
l_len += plen;
l_len += sizeof(struct sctp_paramhdr);
- op_err = sctp_get_mbuf_for_msg(l_len, 0, M_DONTWAIT, 1, MT_DATA);
+ op_err = sctp_get_mbuf_for_msg(l_len, 0, M_NOWAIT, 1, MT_DATA);
if (op_err) {
SCTP_BUF_LEN(op_err) = 0;
#ifdef INET6
@@ -5282,12 +5264,13 @@ invalid_size:
int l_len;
#ifdef INET6
- l_len = sizeof(struct ip6_hdr) + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr);
+ l_len = SCTP_MIN_OVERHEAD;
#else
- l_len = sizeof(struct ip) + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr);
+ l_len = SCTP_MIN_V4_OVERHEAD;
#endif
+ l_len += sizeof(struct sctp_chunkhdr);
l_len += (2 * sizeof(struct sctp_paramhdr));
- op_err = sctp_get_mbuf_for_msg(l_len, 0, M_DONTWAIT, 1, MT_DATA);
+ op_err = sctp_get_mbuf_for_msg(l_len, 0, M_NOWAIT, 1, MT_DATA);
if (op_err) {
SCTP_BUF_LEN(op_err) = 0;
#ifdef INET6
@@ -5336,6 +5319,7 @@ sctp_are_there_new_addresses(struct sctp_association *asoc,
uint16_t ptype, plen;
uint8_t fnd;
struct sctp_nets *net;
+ int check_src;
#ifdef INET
struct sockaddr_in sin4, *sa4;
@@ -5357,39 +5341,61 @@ sctp_are_there_new_addresses(struct sctp_association *asoc,
sin6.sin6_len = sizeof(sin6);
#endif
/* First what about the src address of the pkt ? */
- fnd = 0;
- TAILQ_FOREACH(net, &asoc->nets, sctp_next) {
- sa = (struct sockaddr *)&net->ro._l_addr;
- if (sa->sa_family == src->sa_family) {
+ check_src = 0;
+ switch (src->sa_family) {
+#ifdef INET
+ case AF_INET:
+ if (asoc->scope.ipv4_addr_legal) {
+ check_src = 1;
+ }
+ break;
+#endif
+#ifdef INET6
+ case AF_INET6:
+ if (asoc->scope.ipv6_addr_legal) {
+ check_src = 1;
+ }
+ break;
+#endif
+ default:
+ /* TSNH */
+ break;
+ }
+ if (check_src) {
+ fnd = 0;
+ TAILQ_FOREACH(net, &asoc->nets, sctp_next) {
+ sa = (struct sockaddr *)&net->ro._l_addr;
+ if (sa->sa_family == src->sa_family) {
#ifdef INET
- if (sa->sa_family == AF_INET) {
- struct sockaddr_in *src4;
+ if (sa->sa_family == AF_INET) {
+ struct sockaddr_in *src4;
- sa4 = (struct sockaddr_in *)sa;
- src4 = (struct sockaddr_in *)src;
- if (sa4->sin_addr.s_addr == src4->sin_addr.s_addr) {
- fnd = 1;
- break;
+ sa4 = (struct sockaddr_in *)sa;
+ src4 = (struct sockaddr_in *)src;
+ if (sa4->sin_addr.s_addr == src4->sin_addr.s_addr) {
+ fnd = 1;
+ break;
+ }
}
- }
#endif
#ifdef INET6
- if (sa->sa_family == AF_INET6) {
- struct sockaddr_in6 *src6;
+ if (sa->sa_family == AF_INET6) {
+ struct sockaddr_in6 *src6;
- sa6 = (struct sockaddr_in6 *)sa;
- src6 = (struct sockaddr_in6 *)src;
- if (SCTP6_ARE_ADDR_EQUAL(sa6, src6)) {
- fnd = 1;
- break;
+ sa6 = (struct sockaddr_in6 *)sa;
+ src6 = (struct sockaddr_in6 *)src;
+ if (SCTP6_ARE_ADDR_EQUAL(sa6, src6)) {
+ fnd = 1;
+ break;
+ }
}
- }
#endif
+ }
+ }
+ if (fnd == 0) {
+ /* New address added! no need to look further. */
+ return (1);
}
- }
- if (fnd == 0) {
- /* New address added! no need to look futher. */
- return (1);
}
/* Ok so far lets munge through the rest of the packet */
offset += sizeof(struct sctp_init_chunk);
@@ -5410,9 +5416,11 @@ sctp_are_there_new_addresses(struct sctp_association *asoc,
phdr == NULL) {
return (1);
}
- p4 = (struct sctp_ipv4addr_param *)phdr;
- sin4.sin_addr.s_addr = p4->addr;
- sa_touse = (struct sockaddr *)&sin4;
+ if (asoc->scope.ipv4_addr_legal) {
+ p4 = (struct sctp_ipv4addr_param *)phdr;
+ sin4.sin_addr.s_addr = p4->addr;
+ sa_touse = (struct sockaddr *)&sin4;
+ }
break;
}
#endif
@@ -5427,10 +5435,12 @@ sctp_are_there_new_addresses(struct sctp_association *asoc,
phdr == NULL) {
return (1);
}
- p6 = (struct sctp_ipv6addr_param *)phdr;
- memcpy((caddr_t)&sin6.sin6_addr, p6->addr,
- sizeof(p6->addr));
- sa_touse = (struct sockaddr *)&sin6;
+ if (asoc->scope.ipv6_addr_legal) {
+ p6 = (struct sctp_ipv6addr_param *)phdr;
+ memcpy((caddr_t)&sin6.sin6_addr, p6->addr,
+ sizeof(p6->addr));
+ sa_touse = (struct sockaddr *)&sin6;
+ }
break;
}
#endif
@@ -5486,20 +5496,21 @@ sctp_are_there_new_addresses(struct sctp_association *asoc,
*/
void
sctp_send_initiate_ack(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
- struct mbuf *init_pkt, int iphlen, int offset,
+ struct sctp_nets *src_net, struct mbuf *init_pkt,
+ int iphlen, int offset,
struct sockaddr *src, struct sockaddr *dst,
struct sctphdr *sh, struct sctp_init_chunk *init_chk,
- uint8_t use_mflowid, uint32_t mflowid,
+ uint8_t mflowtype, uint32_t mflowid,
uint32_t vrf_id, uint16_t port, int hold_inp_lock)
{
struct sctp_association *asoc;
- struct mbuf *m, *m_at, *m_tmp, *m_cookie, *op_err, *mp_last;
+ struct mbuf *m, *m_tmp, *m_last, *m_cookie, *op_err;
struct sctp_init_ack_chunk *initack;
struct sctp_adaptation_layer_indication *ali;
- struct sctp_ecn_supported_param *ecn;
- struct sctp_prsctp_supported_param *prsctp;
struct sctp_supported_chunk_types_param *pr_supported;
+ struct sctp_paramhdr *ph;
union sctp_sockstore *over_addr;
+ struct sctp_scoping scp;
#ifdef INET
struct sockaddr_in *dst4 = (struct sockaddr_in *)dst;
@@ -5519,33 +5530,50 @@ sctp_send_initiate_ack(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
uint8_t *signature = NULL;
int cnt_inits_to = 0;
uint16_t his_limit, i_want;
- int abort_flag, padval;
- int num_ext;
- int p_len;
+ int abort_flag;
int nat_friendly = 0;
struct socket *so;
+ uint16_t num_ext, chunk_len, padding_len, parameter_len;
if (stcb) {
asoc = &stcb->asoc;
} else {
asoc = NULL;
}
- mp_last = NULL;
if ((asoc != NULL) &&
- (SCTP_GET_STATE(asoc) != SCTP_STATE_COOKIE_WAIT) &&
- (sctp_are_there_new_addresses(asoc, init_pkt, offset, src))) {
- /* new addresses, out of here in non-cookie-wait states */
- /*
- * Send a ABORT, we don't add the new address error clause
- * though we even set the T bit and copy in the 0 tag.. this
- * looks no different than if no listener was present.
- */
- op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code),
- "Address added");
- sctp_send_abort(init_pkt, iphlen, src, dst, sh, 0, op_err,
- use_mflowid, mflowid,
- vrf_id, port);
- return;
+ (SCTP_GET_STATE(asoc) != SCTP_STATE_COOKIE_WAIT)) {
+ if (sctp_are_there_new_addresses(asoc, init_pkt, offset, src)) {
+ /*
+ * new addresses, out of here in non-cookie-wait
+ * states
+ *
+ * Send an ABORT, without the new address error cause.
+ * This looks no different than if no listener was
+ * present.
+ */
+ op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code),
+ "Address added");
+ sctp_send_abort(init_pkt, iphlen, src, dst, sh, 0, op_err,
+ mflowtype, mflowid, inp->fibnum,
+ vrf_id, port);
+ return;
+ }
+ if (src_net != NULL && (src_net->port != port)) {
+ /*
+ * change of remote encapsulation port, out of here
+ * in non-cookie-wait states
+ *
+ * Send an ABORT, without a specific error cause. This
+ * looks no different than if no listener was
+ * present.
+ */
+ op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code),
+ "Remote encapsulation port changed");
+ sctp_send_abort(init_pkt, iphlen, src, dst, sh, 0, op_err,
+ mflowtype, mflowid, inp->fibnum,
+ vrf_id, port);
+ return;
+ }
}
abort_flag = 0;
op_err = sctp_arethere_unrecognized_parameters(init_pkt,
@@ -5556,24 +5584,25 @@ do_a_abort:
if (op_err == NULL) {
char msg[SCTP_DIAG_INFO_LEN];
- snprintf(msg, sizeof(msg), "%s:%d at %s\n", __FILE__, __LINE__, __FUNCTION__);
+ snprintf(msg, sizeof(msg), "%s:%d at %s", __FILE__, __LINE__, __func__);
op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code),
msg);
}
sctp_send_abort(init_pkt, iphlen, src, dst, sh,
init_chk->init.initiate_tag, op_err,
- use_mflowid, mflowid,
+ mflowtype, mflowid, inp->fibnum,
vrf_id, port);
return;
}
- m = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_DONTWAIT, 1, MT_DATA);
+ m = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_DATA);
if (m == NULL) {
/* No memory, INIT timer will re-attempt. */
if (op_err)
sctp_m_freem(op_err);
return;
}
- SCTP_BUF_LEN(m) = sizeof(struct sctp_init_chunk);
+ chunk_len = (uint16_t) sizeof(struct sctp_init_ack_chunk);
+ padding_len = 0;
/*
* We might not overwrite the identification[] completely and on
@@ -5605,7 +5634,7 @@ do_a_abort:
stc.peerport = sh->src_port;
/*
- * If we wanted to honor cookie life extentions, we would add to
+ * If we wanted to honor cookie life extensions, we would add to
* stc.cookie_life. For now we should NOT honor any extension
*/
stc.site_scope = stc.local_scope = stc.loopback_scope = 0;
@@ -5620,11 +5649,7 @@ do_a_abort:
stc.ipv6_addr_legal = 0;
stc.ipv4_addr_legal = 1;
}
-#ifdef SCTP_DONT_DO_PRIVADDR_SCOPE
- stc.ipv4_scope = 1;
-#else
stc.ipv4_scope = 0;
-#endif
if (net == NULL) {
to = src;
switch (dst->sa_family) {
@@ -5645,13 +5670,10 @@ do_a_abort:
stc.laddr_type = SCTP_IPV4_ADDRESS;
/* scope_id is only for v6 */
stc.scope_id = 0;
-#ifndef SCTP_DONT_DO_PRIVADDR_SCOPE
- if (IN4_ISPRIVATE_ADDRESS(&src4->sin_addr)) {
+ if ((IN4_ISPRIVATE_ADDRESS(&src4->sin_addr)) ||
+ (IN4_ISPRIVATE_ADDRESS(&dst4->sin_addr))) {
stc.ipv4_scope = 1;
}
-#else
- stc.ipv4_scope = 1;
-#endif /* SCTP_DONT_DO_PRIVADDR_SCOPE */
/* Must use the address in this case */
if (sctp_is_address_on_local_host(src, vrf_id)) {
stc.loopback_scope = 1;
@@ -5667,22 +5689,24 @@ do_a_abort:
{
stc.addr_type = SCTP_IPV6_ADDRESS;
memcpy(&stc.address, &src6->sin6_addr, sizeof(struct in6_addr));
- stc.scope_id = in6_getscope(&src6->sin6_addr);
+ stc.scope_id = ntohs(in6_getscope(&src6->sin6_addr));
if (sctp_is_address_on_local_host(src, vrf_id)) {
stc.loopback_scope = 1;
stc.local_scope = 0;
stc.site_scope = 1;
stc.ipv4_scope = 1;
- } else if (IN6_IS_ADDR_LINKLOCAL(&src6->sin6_addr)) {
+ } else if (IN6_IS_ADDR_LINKLOCAL(&src6->sin6_addr) ||
+ IN6_IS_ADDR_LINKLOCAL(&dst6->sin6_addr)) {
/*
- * If the new destination is a
- * LINK_LOCAL we must have common
- * both site and local scope. Don't
- * set local scope though since we
- * must depend on the source to be
- * added implicitly. We cannot
- * assure just because we share one
- * link that all links are common.
+ * If the new destination or source
+ * is a LINK_LOCAL we must have
+ * common both site and local scope.
+ * Don't set local scope though
+ * since we must depend on the
+ * source to be added implicitly. We
+ * cannot assure just because we
+ * share one link that all links are
+ * common.
*/
stc.local_scope = 0;
stc.site_scope = 1;
@@ -5698,11 +5722,12 @@ do_a_abort:
* pull out the scope_id from
* incoming pkt
*/
- } else if (IN6_IS_ADDR_SITELOCAL(&src6->sin6_addr)) {
+ } else if (IN6_IS_ADDR_SITELOCAL(&src6->sin6_addr) ||
+ IN6_IS_ADDR_SITELOCAL(&dst6->sin6_addr)) {
/*
- * If the new destination is
- * SITE_LOCAL then we must have site
- * scope in common.
+ * If the new destination or source
+ * is SITE_LOCAL then we must have
+ * site scope in common.
*/
stc.site_scope = 1;
}
@@ -5806,7 +5831,7 @@ do_a_abort:
/* Now lets put the SCTP header in place */
initack = mtod(m, struct sctp_init_ack_chunk *);
/* Save it off for quick ref */
- stc.peers_vtag = init_chk->init.initiate_tag;
+ stc.peers_vtag = ntohl(init_chk->init.initiate_tag);
/* who are we */
memcpy(stc.identification, SCTP_VERSION_STRING,
min(strlen(SCTP_VERSION_STRING), sizeof(stc.identification)));
@@ -5876,10 +5901,10 @@ do_a_abort:
his_limit = ntohs(init_chk->init.num_inbound_streams);
/* choose what I want */
if (asoc != NULL) {
- if (asoc->streamoutcnt > inp->sctp_ep.pre_open_stream_count) {
+ if (asoc->streamoutcnt > asoc->pre_open_streams) {
i_want = asoc->streamoutcnt;
} else {
- i_want = inp->sctp_ep.pre_open_stream_count;
+ i_want = asoc->pre_open_streams;
}
} else {
i_want = inp->sctp_ep.pre_open_stream_count;
@@ -5897,161 +5922,182 @@ do_a_abort:
/* adaptation layer indication parameter */
if (inp->sctp_ep.adaptation_layer_indicator_provided) {
- ali = (struct sctp_adaptation_layer_indication *)((caddr_t)initack + sizeof(*initack));
+ parameter_len = (uint16_t) sizeof(struct sctp_adaptation_layer_indication);
+ ali = (struct sctp_adaptation_layer_indication *)(mtod(m, caddr_t)+chunk_len);
ali->ph.param_type = htons(SCTP_ULP_ADAPTATION);
- ali->ph.param_length = htons(sizeof(*ali));
- ali->indication = ntohl(inp->sctp_ep.adaptation_layer_indicator);
- SCTP_BUF_LEN(m) += sizeof(*ali);
- ecn = (struct sctp_ecn_supported_param *)((caddr_t)ali + sizeof(*ali));
- } else {
- ecn = (struct sctp_ecn_supported_param *)((caddr_t)initack + sizeof(*initack));
+ ali->ph.param_length = htons(parameter_len);
+ ali->indication = htonl(inp->sctp_ep.adaptation_layer_indicator);
+ chunk_len += parameter_len;
}
-
/* ECN parameter */
- if (((asoc != NULL) && (asoc->ecn_allowed == 1)) ||
- (inp->sctp_ecn_enable == 1)) {
- ecn->ph.param_type = htons(SCTP_ECN_CAPABLE);
- ecn->ph.param_length = htons(sizeof(*ecn));
- SCTP_BUF_LEN(m) += sizeof(*ecn);
-
- prsctp = (struct sctp_prsctp_supported_param *)((caddr_t)ecn +
- sizeof(*ecn));
- } else {
- prsctp = (struct sctp_prsctp_supported_param *)((caddr_t)ecn);
+ if (((asoc != NULL) && (asoc->ecn_supported == 1)) ||
+ ((asoc == NULL) && (inp->ecn_supported == 1))) {
+ parameter_len = (uint16_t) sizeof(struct sctp_paramhdr);
+ ph = (struct sctp_paramhdr *)(mtod(m, caddr_t)+chunk_len);
+ ph->param_type = htons(SCTP_ECN_CAPABLE);
+ ph->param_length = htons(parameter_len);
+ chunk_len += parameter_len;
+ }
+ /* PR-SCTP supported parameter */
+ if (((asoc != NULL) && (asoc->prsctp_supported == 1)) ||
+ ((asoc == NULL) && (inp->prsctp_supported == 1))) {
+ parameter_len = (uint16_t) sizeof(struct sctp_paramhdr);
+ ph = (struct sctp_paramhdr *)(mtod(m, caddr_t)+chunk_len);
+ ph->param_type = htons(SCTP_PRSCTP_SUPPORTED);
+ ph->param_length = htons(parameter_len);
+ chunk_len += parameter_len;
}
- /* And now tell the peer we do pr-sctp */
- prsctp->ph.param_type = htons(SCTP_PRSCTP_SUPPORTED);
- prsctp->ph.param_length = htons(sizeof(*prsctp));
- SCTP_BUF_LEN(m) += sizeof(*prsctp);
+ /* Add NAT friendly parameter */
if (nat_friendly) {
- /* Add NAT friendly parameter */
- struct sctp_paramhdr *ph;
-
- ph = (struct sctp_paramhdr *)(mtod(m, caddr_t)+SCTP_BUF_LEN(m));
+ parameter_len = (uint16_t) sizeof(struct sctp_paramhdr);
+ ph = (struct sctp_paramhdr *)(mtod(m, caddr_t)+chunk_len);
ph->param_type = htons(SCTP_HAS_NAT_SUPPORT);
- ph->param_length = htons(sizeof(struct sctp_paramhdr));
- SCTP_BUF_LEN(m) += sizeof(struct sctp_paramhdr);
+ ph->param_length = htons(parameter_len);
+ chunk_len += parameter_len;
}
- /* And now tell the peer we do all the extensions */
- pr_supported = (struct sctp_supported_chunk_types_param *)(mtod(m, caddr_t)+SCTP_BUF_LEN(m));
- pr_supported->ph.param_type = htons(SCTP_SUPPORTED_CHUNK_EXT);
+ /* And now tell the peer which extensions we support */
num_ext = 0;
- pr_supported->chunk_types[num_ext++] = SCTP_ASCONF;
- pr_supported->chunk_types[num_ext++] = SCTP_ASCONF_ACK;
- pr_supported->chunk_types[num_ext++] = SCTP_FORWARD_CUM_TSN;
- pr_supported->chunk_types[num_ext++] = SCTP_PACKET_DROPPED;
- pr_supported->chunk_types[num_ext++] = SCTP_STREAM_RESET;
- if (!SCTP_BASE_SYSCTL(sctp_auth_disable))
+ pr_supported = (struct sctp_supported_chunk_types_param *)(mtod(m, caddr_t)+chunk_len);
+ if (((asoc != NULL) && (asoc->prsctp_supported == 1)) ||
+ ((asoc == NULL) && (inp->prsctp_supported == 1))) {
+ pr_supported->chunk_types[num_ext++] = SCTP_FORWARD_CUM_TSN;
+ if (((asoc != NULL) && (asoc->idata_supported == 1)) ||
+ ((asoc == NULL) && (inp->idata_supported == 1))) {
+ pr_supported->chunk_types[num_ext++] = SCTP_IFORWARD_CUM_TSN;
+ }
+ }
+ if (((asoc != NULL) && (asoc->auth_supported == 1)) ||
+ ((asoc == NULL) && (inp->auth_supported == 1))) {
pr_supported->chunk_types[num_ext++] = SCTP_AUTHENTICATION;
- if (SCTP_BASE_SYSCTL(sctp_nr_sack_on_off))
+ }
+ if (((asoc != NULL) && (asoc->asconf_supported == 1)) ||
+ ((asoc == NULL) && (inp->asconf_supported == 1))) {
+ pr_supported->chunk_types[num_ext++] = SCTP_ASCONF;
+ pr_supported->chunk_types[num_ext++] = SCTP_ASCONF_ACK;
+ }
+ if (((asoc != NULL) && (asoc->reconfig_supported == 1)) ||
+ ((asoc == NULL) && (inp->reconfig_supported == 1))) {
+ pr_supported->chunk_types[num_ext++] = SCTP_STREAM_RESET;
+ }
+ if (((asoc != NULL) && (asoc->idata_supported == 1)) ||
+ ((asoc == NULL) && (inp->idata_supported == 1))) {
+ pr_supported->chunk_types[num_ext++] = SCTP_IDATA;
+ }
+ if (((asoc != NULL) && (asoc->nrsack_supported == 1)) ||
+ ((asoc == NULL) && (inp->nrsack_supported == 1))) {
pr_supported->chunk_types[num_ext++] = SCTP_NR_SELECTIVE_ACK;
- p_len = sizeof(*pr_supported) + num_ext;
- pr_supported->ph.param_length = htons(p_len);
- bzero((caddr_t)pr_supported + p_len, SCTP_SIZE32(p_len) - p_len);
- SCTP_BUF_LEN(m) += SCTP_SIZE32(p_len);
-
+ }
+ if (((asoc != NULL) && (asoc->pktdrop_supported == 1)) ||
+ ((asoc == NULL) && (inp->pktdrop_supported == 1))) {
+ pr_supported->chunk_types[num_ext++] = SCTP_PACKET_DROPPED;
+ }
+ if (num_ext > 0) {
+ parameter_len = (uint16_t) sizeof(struct sctp_supported_chunk_types_param) + num_ext;
+ pr_supported->ph.param_type = htons(SCTP_SUPPORTED_CHUNK_EXT);
+ pr_supported->ph.param_length = htons(parameter_len);
+ padding_len = SCTP_SIZE32(parameter_len) - parameter_len;
+ chunk_len += parameter_len;
+ }
/* add authentication parameters */
- if (!SCTP_BASE_SYSCTL(sctp_auth_disable)) {
+ if (((asoc != NULL) && (asoc->auth_supported == 1)) ||
+ ((asoc == NULL) && (inp->auth_supported == 1))) {
struct sctp_auth_random *randp;
struct sctp_auth_hmac_algo *hmacs;
struct sctp_auth_chunk_list *chunks;
- uint16_t random_len;
+ if (padding_len > 0) {
+ memset(mtod(m, caddr_t)+chunk_len, 0, padding_len);
+ chunk_len += padding_len;
+ padding_len = 0;
+ }
/* generate and add RANDOM parameter */
- random_len = SCTP_AUTH_RANDOM_SIZE_DEFAULT;
- randp = (struct sctp_auth_random *)(mtod(m, caddr_t)+SCTP_BUF_LEN(m));
+ randp = (struct sctp_auth_random *)(mtod(m, caddr_t)+chunk_len);
+ parameter_len = (uint16_t) sizeof(struct sctp_auth_random) +
+ SCTP_AUTH_RANDOM_SIZE_DEFAULT;
randp->ph.param_type = htons(SCTP_RANDOM);
- p_len = sizeof(*randp) + random_len;
- randp->ph.param_length = htons(p_len);
- SCTP_READ_RANDOM(randp->random_data, random_len);
- /* zero out any padding required */
- bzero((caddr_t)randp + p_len, SCTP_SIZE32(p_len) - p_len);
- SCTP_BUF_LEN(m) += SCTP_SIZE32(p_len);
+ randp->ph.param_length = htons(parameter_len);
+ SCTP_READ_RANDOM(randp->random_data, SCTP_AUTH_RANDOM_SIZE_DEFAULT);
+ padding_len = SCTP_SIZE32(parameter_len) - parameter_len;
+ chunk_len += parameter_len;
+ if (padding_len > 0) {
+ memset(mtod(m, caddr_t)+chunk_len, 0, padding_len);
+ chunk_len += padding_len;
+ padding_len = 0;
+ }
/* add HMAC_ALGO parameter */
- hmacs = (struct sctp_auth_hmac_algo *)(mtod(m, caddr_t)+SCTP_BUF_LEN(m));
- p_len = sctp_serialize_hmaclist(inp->sctp_ep.local_hmacs,
+ hmacs = (struct sctp_auth_hmac_algo *)(mtod(m, caddr_t)+chunk_len);
+ parameter_len = (uint16_t) sizeof(struct sctp_auth_hmac_algo) +
+ sctp_serialize_hmaclist(inp->sctp_ep.local_hmacs,
(uint8_t *) hmacs->hmac_ids);
- if (p_len > 0) {
- p_len += sizeof(*hmacs);
- hmacs->ph.param_type = htons(SCTP_HMAC_LIST);
- hmacs->ph.param_length = htons(p_len);
- /* zero out any padding required */
- bzero((caddr_t)hmacs + p_len, SCTP_SIZE32(p_len) - p_len);
- SCTP_BUF_LEN(m) += SCTP_SIZE32(p_len);
+ hmacs->ph.param_type = htons(SCTP_HMAC_LIST);
+ hmacs->ph.param_length = htons(parameter_len);
+ padding_len = SCTP_SIZE32(parameter_len) - parameter_len;
+ chunk_len += parameter_len;
+
+ if (padding_len > 0) {
+ memset(mtod(m, caddr_t)+chunk_len, 0, padding_len);
+ chunk_len += padding_len;
+ padding_len = 0;
}
/* add CHUNKS parameter */
- chunks = (struct sctp_auth_chunk_list *)(mtod(m, caddr_t)+SCTP_BUF_LEN(m));
- p_len = sctp_serialize_auth_chunks(inp->sctp_ep.local_auth_chunks,
+ chunks = (struct sctp_auth_chunk_list *)(mtod(m, caddr_t)+chunk_len);
+ parameter_len = (uint16_t) sizeof(struct sctp_auth_chunk_list) +
+ sctp_serialize_auth_chunks(inp->sctp_ep.local_auth_chunks,
chunks->chunk_types);
- if (p_len > 0) {
- p_len += sizeof(*chunks);
- chunks->ph.param_type = htons(SCTP_CHUNK_LIST);
- chunks->ph.param_length = htons(p_len);
- /* zero out any padding required */
- bzero((caddr_t)chunks + p_len, SCTP_SIZE32(p_len) - p_len);
- SCTP_BUF_LEN(m) += SCTP_SIZE32(p_len);
- }
+ chunks->ph.param_type = htons(SCTP_CHUNK_LIST);
+ chunks->ph.param_length = htons(parameter_len);
+ padding_len = SCTP_SIZE32(parameter_len) - parameter_len;
+ chunk_len += parameter_len;
}
- m_at = m;
+ SCTP_BUF_LEN(m) = chunk_len;
+ m_last = m;
/* now the addresses */
- {
- struct sctp_scoping scp;
-
- /*
- * To optimize this we could put the scoping stuff into a
- * structure and remove the individual uint8's from the stc
- * structure. Then we could just sifa in the address within
- * the stc.. but for now this is a quick hack to get the
- * address stuff teased apart.
- */
- scp.ipv4_addr_legal = stc.ipv4_addr_legal;
- scp.ipv6_addr_legal = stc.ipv6_addr_legal;
- scp.loopback_scope = stc.loopback_scope;
- scp.ipv4_local_scope = stc.ipv4_scope;
- scp.local_scope = stc.local_scope;
- scp.site_scope = stc.site_scope;
- m_at = sctp_add_addresses_to_i_ia(inp, stcb, &scp, m_at, cnt_inits_to, NULL, NULL);
+ /*
+ * To optimize this we could put the scoping stuff into a structure
+ * and remove the individual uint8's from the stc structure. Then we
+ * could just pass in the address within the stc.. but for now this
+ * is a quick hack to get the address stuff teased apart.
+ */
+ scp.ipv4_addr_legal = stc.ipv4_addr_legal;
+ scp.ipv6_addr_legal = stc.ipv6_addr_legal;
+ scp.loopback_scope = stc.loopback_scope;
+ scp.ipv4_local_scope = stc.ipv4_scope;
+ scp.local_scope = stc.local_scope;
+ scp.site_scope = stc.site_scope;
+ m_last = sctp_add_addresses_to_i_ia(inp, stcb, &scp, m_last,
+ cnt_inits_to,
+ &padding_len, &chunk_len);
+ /* padding_len can only be positive, if no addresses have been added */
+ if (padding_len > 0) {
+ memset(mtod(m, caddr_t)+chunk_len, 0, padding_len);
+ chunk_len += padding_len;
+ SCTP_BUF_LEN(m) += padding_len;
+ padding_len = 0;
}
-
/* tack on the operational error if present */
if (op_err) {
- struct mbuf *ol;
- int llen;
-
- llen = 0;
- ol = op_err;
-
- while (ol) {
- llen += SCTP_BUF_LEN(ol);
- ol = SCTP_BUF_NEXT(ol);
- }
- if (llen % 4) {
- /* must add a pad to the param */
- uint32_t cpthis = 0;
- int padlen;
-
- padlen = 4 - (llen % 4);
- m_copyback(op_err, llen, padlen, (caddr_t)&cpthis);
+ parameter_len = 0;
+ for (m_tmp = op_err; m_tmp != NULL; m_tmp = SCTP_BUF_NEXT(m_tmp)) {
+ parameter_len += SCTP_BUF_LEN(m_tmp);
}
- while (SCTP_BUF_NEXT(m_at) != NULL) {
- m_at = SCTP_BUF_NEXT(m_at);
- }
- SCTP_BUF_NEXT(m_at) = op_err;
- while (SCTP_BUF_NEXT(m_at) != NULL) {
- m_at = SCTP_BUF_NEXT(m_at);
+ padding_len = SCTP_SIZE32(parameter_len) - parameter_len;
+ SCTP_BUF_NEXT(m_last) = op_err;
+ while (SCTP_BUF_NEXT(m_last) != NULL) {
+ m_last = SCTP_BUF_NEXT(m_last);
}
+ chunk_len += parameter_len;
}
- /* pre-calulate the size and update pkt header and chunk header */
- p_len = 0;
- for (m_tmp = m; m_tmp; m_tmp = SCTP_BUF_NEXT(m_tmp)) {
- p_len += SCTP_BUF_LEN(m_tmp);
- if (SCTP_BUF_NEXT(m_tmp) == NULL) {
- /* m_tmp should now point to last one */
- break;
+ if (padding_len > 0) {
+ m_last = sctp_add_pad_tombuf(m_last, padding_len);
+ if (m_last == NULL) {
+ /* Houston we have a problem, no space */
+ sctp_m_freem(m);
+ return;
}
+ chunk_len += padding_len;
+ padding_len = 0;
}
-
/* Now we must build a cookie */
m_cookie = sctp_add_cookie(init_pkt, offset, m, 0, &stc, &signature);
if (m_cookie == NULL) {
@@ -6060,21 +6106,22 @@ do_a_abort:
return;
}
/* Now append the cookie to the end and update the space/size */
- SCTP_BUF_NEXT(m_tmp) = m_cookie;
-
- for (m_tmp = m_cookie; m_tmp; m_tmp = SCTP_BUF_NEXT(m_tmp)) {
- p_len += SCTP_BUF_LEN(m_tmp);
+ SCTP_BUF_NEXT(m_last) = m_cookie;
+ parameter_len = 0;
+ for (m_tmp = m_cookie; m_tmp != NULL; m_tmp = SCTP_BUF_NEXT(m_tmp)) {
+ parameter_len += SCTP_BUF_LEN(m_tmp);
if (SCTP_BUF_NEXT(m_tmp) == NULL) {
- /* m_tmp should now point to last one */
- mp_last = m_tmp;
- break;
+ m_last = m_tmp;
}
}
+ padding_len = SCTP_SIZE32(parameter_len) - parameter_len;
+ chunk_len += parameter_len;
+
/*
* Place in the size, but we don't include the last pad (if any) in
* the INIT-ACK.
*/
- initack->ch.chunk_length = htons(p_len);
+ initack->ch.chunk_length = htons(chunk_len);
/*
* Time to sign the cookie, we don't sign over the cookie signature
@@ -6088,11 +6135,8 @@ do_a_abort:
* We sifa 0 here to NOT set IP_DF if its IPv4, we ignore the return
* here since the timer will drive a retranmission.
*/
- padval = p_len % 4;
- if ((padval) && (mp_last)) {
- /* see my previous comments on mp_last */
- if (sctp_add_pad_tombuf(mp_last, (4 - padval))) {
- /* Houston we have a problem, no space */
+ if (padding_len > 0) {
+ if (sctp_add_pad_tombuf(m_last, padding_len) == NULL) {
sctp_m_freem(m);
return;
}
@@ -6107,7 +6151,7 @@ do_a_abort:
0, 0,
inp->sctp_lport, sh->src_port, init_chk->init.initiate_tag,
port, over_addr,
- use_mflowid, mflowid,
+ mflowtype, mflowid,
SCTP_SO_NOT_LOCKED);
SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks);
}
@@ -6123,7 +6167,7 @@ sctp_prune_prsctp(struct sctp_tcb *stcb,
struct sctp_tmit_chunk *chk, *nchk;
SCTP_TCB_LOCK_ASSERT(stcb);
- if ((asoc->peer_supports_prsctp) &&
+ if ((asoc->prsctp_supported) &&
(asoc->sent_queue_cnt_removeable > 0)) {
TAILQ_FOREACH(chk, &asoc->sent_queue, sctp_next) {
/*
@@ -6165,7 +6209,7 @@ sctp_prune_prsctp(struct sctp_tcb *stcb,
return;
}
} /* if chunk was present */
- } /* if of sufficent priority */
+ } /* if of sufficient priority */
} /* if chunk has enabled */
} /* tailqforeach */
@@ -6206,11 +6250,15 @@ sctp_get_frag_point(struct sctp_tcb *stcb,
* we use a larger frag point.
*/
if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) {
- ovh = SCTP_MED_OVERHEAD;
+ ovh = SCTP_MIN_OVERHEAD;
} else {
- ovh = SCTP_MED_V4_OVERHEAD;
+ ovh = SCTP_MIN_V4_OVERHEAD;
+ }
+ if (stcb->asoc.idata_supported) {
+ ovh += sizeof(struct sctp_idata_chunk);
+ } else {
+ ovh += sizeof(struct sctp_data_chunk);
}
-
if (stcb->asoc.sctp_frag_point > asoc->smallest_mtu)
siz = asoc->smallest_mtu - ovh;
else
@@ -6335,6 +6383,7 @@ sctp_msg_append(struct sctp_tcb *stcb,
sp->timetolive = srcv->sinfo_timetolive;
sp->ppid = srcv->sinfo_ppid;
sp->context = srcv->sinfo_context;
+ sp->fsn = 0;
if (sp->sinfo_flags & SCTP_ADDR_OVER) {
sp->net = net;
atomic_add_int(&sp->net->ref_count, 1);
@@ -6420,7 +6469,7 @@ error_out:
if (outchain == NULL) {
/* This is the general case */
new_mbuf:
- outchain = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_DONTWAIT, 1, MT_HEADER);
+ outchain = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_HEADER);
if (outchain == NULL) {
goto error_out;
}
@@ -6453,10 +6502,10 @@ error_out:
}
}
/* get the new end of length */
- len = M_TRAILINGSPACE(*endofchain);
+ len = (int)M_TRAILINGSPACE(*endofchain);
} else {
/* how much is left at the end? */
- len = M_TRAILINGSPACE(*endofchain);
+ len = (int)M_TRAILINGSPACE(*endofchain);
}
/* Find the end of the data, for appending */
cp = (mtod((*endofchain), caddr_t)+SCTP_BUF_LEN((*endofchain)));
@@ -6474,7 +6523,7 @@ error_out:
/* now we need another one */
sizeofcpy -= len;
}
- m = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_DONTWAIT, 1, MT_HEADER);
+ m = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_HEADER);
if (m == NULL) {
/* We failed */
goto error_out;
@@ -6488,16 +6537,10 @@ error_out:
return (outchain);
} else {
/* copy the old fashion way */
- appendchain = SCTP_M_COPYM(clonechain, 0, M_COPYALL, M_DONTWAIT);
+ appendchain = SCTP_M_COPYM(clonechain, 0, M_COPYALL, M_NOWAIT);
#ifdef SCTP_MBUF_LOGGING
if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) {
- struct mbuf *mat;
-
- for (mat = appendchain; mat; mat = SCTP_BUF_NEXT(mat)) {
- if (SCTP_BUF_IS_EXTENDED(mat)) {
- sctp_log_mb(mat, SCTP_MBUF_ICOPY);
- }
- }
+ sctp_log_mbc(appendchain, SCTP_MBUF_ICOPY);
}
#endif
}
@@ -6523,7 +6566,7 @@ error_out:
}
}
/*
- * save off the end and update the end-chain postion
+ * save off the end and update the end-chain position
*/
m = appendchain;
while (m) {
@@ -6535,7 +6578,7 @@ error_out:
}
return (outchain);
} else {
- /* save off the end and update the end-chain postion */
+ /* save off the end and update the end-chain position */
m = appendchain;
while (m) {
if (SCTP_BUF_NEXT(m) == NULL) {
@@ -6582,7 +6625,7 @@ sctp_sendall_iterator(struct sctp_inpcb *inp, struct sctp_tcb *stcb, void *ptr,
return;
}
if (ca->sndlen > 0) {
- m = SCTP_M_COPYM(ca->m, 0, M_COPYALL, M_DONTWAIT);
+ m = SCTP_M_COPYM(ca->m, 0, M_COPYALL, M_NOWAIT);
if (m == NULL) {
/* can't copy so we are done */
ca->cnt_failed++;
@@ -6590,13 +6633,7 @@ sctp_sendall_iterator(struct sctp_inpcb *inp, struct sctp_tcb *stcb, void *ptr,
}
#ifdef SCTP_MBUF_LOGGING
if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) {
- struct mbuf *mat;
-
- for (mat = m; mat; mat = SCTP_BUF_NEXT(mat)) {
- if (SCTP_BUF_IS_EXTENDED(mat)) {
- sctp_log_mb(mat, SCTP_MBUF_ICOPY);
- }
- }
+ sctp_log_mbc(m, SCTP_MBUF_ICOPY);
}
#endif
} else {
@@ -6622,7 +6659,7 @@ sctp_sendall_iterator(struct sctp_inpcb *inp, struct sctp_tcb *stcb, void *ptr,
ph = mtod(m, struct sctp_paramhdr *);
ph->param_type = htons(SCTP_CAUSE_USER_INITIATED_ABT);
- ph->param_length = htons(sizeof(struct sctp_paramhdr) + ca->sndlen);
+ ph->param_length = htons((uint16_t) (sizeof(struct sctp_paramhdr) + ca->sndlen));
}
/*
* We add one here to keep the assoc from dis-appearing on
@@ -6652,14 +6689,10 @@ sctp_sendall_iterator(struct sctp_inpcb *inp, struct sctp_tcb *stcb, void *ptr,
asoc = &stcb->asoc;
if (ca->sndrcv.sinfo_flags & SCTP_EOF) {
/* shutdown this assoc */
- int cnt;
-
- cnt = sctp_is_there_unsent_data(stcb, SCTP_SO_NOT_LOCKED);
-
if (TAILQ_EMPTY(&asoc->send_queue) &&
TAILQ_EMPTY(&asoc->sent_queue) &&
- (cnt == 0)) {
- if (asoc->locked_on_sending) {
+ sctp_is_there_unsent_data(stcb, SCTP_SO_NOT_LOCKED) == 0) {
+ if ((*asoc->ss_functions.sctp_ss_is_user_msgs_incomplete) (stcb, asoc)) {
goto abort_anyway;
}
/*
@@ -6701,27 +6734,24 @@ sctp_sendall_iterator(struct sctp_inpcb *inp, struct sctp_tcb *stcb, void *ptr,
if ((SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT) &&
(SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_RECEIVED) &&
(SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_ACK_SENT)) {
- if (asoc->locked_on_sending) {
- /*
- * Locked to send out the
- * data
- */
- struct sctp_stream_queue_pending *sp;
-
- sp = TAILQ_LAST(&asoc->locked_on_sending->outqueue, sctp_streamhead);
- if (sp) {
- if ((sp->length == 0) && (sp->msg_is_complete == 0))
- asoc->state |= SCTP_STATE_PARTIAL_MSG_LEFT;
- }
+ if ((*asoc->ss_functions.sctp_ss_is_user_msgs_incomplete) (stcb, asoc)) {
+ asoc->state |= SCTP_STATE_PARTIAL_MSG_LEFT;
}
asoc->state |= SCTP_STATE_SHUTDOWN_PENDING;
if (TAILQ_EMPTY(&asoc->send_queue) &&
TAILQ_EMPTY(&asoc->sent_queue) &&
(asoc->state & SCTP_STATE_PARTIAL_MSG_LEFT)) {
+ struct mbuf *op_err;
+ char msg[SCTP_DIAG_INFO_LEN];
+
abort_anyway:
+ snprintf(msg, sizeof(msg),
+ "%s:%d at %s", __FILE__, __LINE__, __func__);
+ op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code),
+ msg);
atomic_add_int(&stcb->asoc.refcnt, 1);
sctp_abort_an_association(stcb->sctp_ep, stcb,
- NULL, SCTP_SO_NOT_LOCKED);
+ op_err, SCTP_SO_NOT_LOCKED);
atomic_add_int(&stcb->asoc.refcnt, -1);
goto no_chunk_output;
}
@@ -6743,7 +6773,7 @@ sctp_sendall_iterator(struct sctp_inpcb *inp, struct sctp_tcb *stcb, void *ptr,
if (do_chunk_output)
sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_USR_SEND, SCTP_SO_NOT_LOCKED);
else if (added_control) {
- int num_out = 0, reason = 0, now_filled = 0;
+ int num_out, reason, now_filled = 0;
struct timeval now;
int frag_point;
@@ -6768,7 +6798,7 @@ sctp_sendall_completes(void *ptr, uint32_t val SCTP_UNUSED)
/*
* Do a notify here? Kacheong suggests that the notify be done at
* the send time.. so you would push up a notification if any send
- * failed. Don't know if this is feasable since the only failures we
+ * failed. Don't know if this is feasible since the only failures we
* have is "memory" related and if you cannot get an mbuf to send
* the data you surely can't get an mbuf to send up to notify the
* user you can't send the data :->
@@ -6779,20 +6809,13 @@ sctp_sendall_completes(void *ptr, uint32_t val SCTP_UNUSED)
SCTP_FREE(ca, SCTP_M_COPYAL);
}
-
-#define MC_ALIGN(m, len) do { \
- SCTP_BUF_RESV_UF(m, ((MCLBYTES - (len)) & ~(sizeof(long) - 1)); \
-} while (0)
-
-
-
static struct mbuf *
sctp_copy_out_all(struct uio *uio, int len)
{
struct mbuf *ret, *at;
int left, willcpy, cancpy, error;
- ret = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_WAIT, 1, MT_DATA);
+ ret = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_WAITOK, 1, MT_DATA);
if (ret == NULL) {
/* TSNH */
return (NULL);
@@ -6800,7 +6823,7 @@ sctp_copy_out_all(struct uio *uio, int len)
left = len;
SCTP_BUF_LEN(ret) = 0;
/* save space for the data chunk header */
- cancpy = M_TRAILINGSPACE(ret);
+ cancpy = (int)M_TRAILINGSPACE(ret);
willcpy = min(cancpy, left);
at = ret;
while (left > 0) {
@@ -6815,13 +6838,13 @@ sctp_copy_out_all(struct uio *uio, int len)
SCTP_BUF_NEXT_PKT(at) = SCTP_BUF_NEXT(at) = 0;
left -= willcpy;
if (left > 0) {
- SCTP_BUF_NEXT(at) = sctp_get_mbuf_for_msg(left, 0, M_WAIT, 1, MT_DATA);
+ SCTP_BUF_NEXT(at) = sctp_get_mbuf_for_msg(left, 0, M_WAITOK, 1, MT_DATA);
if (SCTP_BUF_NEXT(at) == NULL) {
goto err_out_now;
}
at = SCTP_BUF_NEXT(at);
SCTP_BUF_LEN(at) = 0;
- cancpy = M_TRAILINGSPACE(at);
+ cancpy = (int)M_TRAILINGSPACE(at);
willcpy = min(cancpy, left);
}
}
@@ -6855,7 +6878,7 @@ sctp_sendall(struct sctp_inpcb *inp, struct uio *uio, struct mbuf *m,
ca->sndrcv.sinfo_flags &= ~SCTP_SENDALL;
/* get length and mbuf chain */
if (uio) {
- ca->sndlen = uio->uio_resid;
+ ca->sndlen = (int)uio->uio_resid;
ca->m = sctp_copy_out_all(uio, ca->sndlen);
if (ca->m == NULL) {
SCTP_FREE(ca, SCTP_M_COPYAL);
@@ -7005,7 +7028,7 @@ all_done:
sctp_misc_ints(SCTP_FLIGHT_LOG_UP,
data_list[i]->whoTo->flight_size,
data_list[i]->book_size,
- (uintptr_t) data_list[i]->whoTo,
+ (uint32_t) (uintptr_t) data_list[i]->whoTo,
data_list[i]->rec.data.TSN_seq);
}
sctp_flight_size_increase(data_list[i]);
@@ -7135,7 +7158,6 @@ sctp_move_to_outqueue(struct sctp_tcb *stcb,
struct sctp_stream_out *strq,
uint32_t goal_mtu,
uint32_t frag_point,
- int *locked,
int *giveup,
int eeor_mode,
int *bail,
@@ -7149,8 +7171,10 @@ sctp_move_to_outqueue(struct sctp_tcb *stcb,
struct sctp_association *asoc;
struct sctp_stream_queue_pending *sp;
struct sctp_tmit_chunk *chk;
- struct sctp_data_chunk *dchkh;
+ struct sctp_data_chunk *dchkh = NULL;
+ struct sctp_idata_chunk *ndchkh = NULL;
uint32_t to_move, length;
+ int leading;
uint8_t rcv_flags = 0;
uint8_t some_taken;
uint8_t send_lock_up = 0;
@@ -7161,7 +7185,6 @@ one_more_time:
/* sa_ignore FREED_MEMORY */
sp = TAILQ_FIRST(&strq->outqueue);
if (sp == NULL) {
- *locked = 0;
if (send_lock_up == 0) {
SCTP_TCB_SEND_LOCK(stcb);
send_lock_up = 1;
@@ -7170,7 +7193,9 @@ one_more_time:
if (sp) {
goto one_more_time;
}
- if (strq->last_msg_incomplete) {
+ if ((sctp_is_feature_on(stcb->sctp_ep, SCTP_PCB_FLAGS_EXPLICIT_EOR) == 0) &&
+ (stcb->asoc.idata_supported == 0) &&
+ (strq->last_msg_incomplete)) {
SCTP_PRINTF("Huh? Stream:%d lm_in_c=%d but queue is NULL\n",
strq->stream_no,
strq->last_msg_incomplete);
@@ -7206,6 +7231,11 @@ one_more_time:
atomic_subtract_int(&asoc->stream_queue_cnt, 1);
TAILQ_REMOVE(&strq->outqueue, sp, next);
stcb->asoc.ss_functions.sctp_ss_remove_from_stream(stcb, asoc, strq, sp, send_lock_up);
+ if ((strq->state == SCTP_STREAM_RESET_PENDING) &&
+ (strq->chunks_on_queues == 0) &&
+ TAILQ_EMPTY(&strq->outqueue)) {
+ stcb->asoc.trigger_reset = 1;
+ }
if (sp->net) {
sctp_free_remote_addr(sp->net);
sp->net = NULL;
@@ -7216,8 +7246,6 @@ one_more_time:
}
sctp_free_a_strmoq(stcb, sp, so_locked);
/* we can't be locked to it */
- *locked = 0;
- stcb->asoc.locked_on_sending = NULL;
if (send_lock_up) {
SCTP_TCB_SEND_UNLOCK(stcb);
send_lock_up = 0;
@@ -7229,7 +7257,6 @@ one_more_time:
* sender just finished this but still holds a
* reference
*/
- *locked = 1;
*giveup = 1;
to_move = 0;
goto out_of;
@@ -7238,7 +7265,6 @@ one_more_time:
/* is there some to get */
if (sp->length == 0) {
/* no */
- *locked = 1;
*giveup = 1;
to_move = 0;
goto out_of;
@@ -7249,7 +7275,7 @@ one_more_time:
}
/* Whack down the size */
atomic_subtract_int(&stcb->asoc.total_output_queue_size, sp->length);
- if ((stcb->sctp_socket != NULL) && \
+ if ((stcb->sctp_socket != NULL) &&
((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
(stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL))) {
atomic_subtract_int(&stcb->sctp_socket->so_snd.sb_cc, sp->length);
@@ -7261,16 +7287,12 @@ one_more_time:
}
sp->length = 0;
sp->some_taken = 1;
- *locked = 1;
*giveup = 1;
to_move = 0;
goto out_of;
}
}
some_taken = sp->some_taken;
- if (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) {
- sp->msg_is_complete = 1;
- }
re_look:
length = sp->length;
if (sp->msg_is_complete) {
@@ -7280,10 +7302,12 @@ re_look:
/* All of it fits in the MTU */
if (sp->some_taken) {
rcv_flags |= SCTP_DATA_LAST_FRAG;
- sp->put_last_out = 1;
} else {
rcv_flags |= SCTP_DATA_NOT_FRAG;
- sp->put_last_out = 1;
+ }
+ sp->put_last_out = 1;
+ if (sp->sinfo_flags & SCTP_SACK_IMMEDIATELY) {
+ rcv_flags |= SCTP_DATA_SACK_IMMEDIATELY;
}
} else {
/* Not all of it fits, we fragment */
@@ -7326,9 +7350,6 @@ re_look:
}
} else {
/* Nothing to take. */
- if (sp->some_taken) {
- *locked = 1;
- }
*giveup = 1;
to_move = 0;
goto out_of;
@@ -7350,8 +7371,8 @@ re_look:
if (sp->sinfo_flags & SCTP_UNORDERED) {
rcv_flags |= SCTP_DATA_UNORDERED;
}
- if ((SCTP_BASE_SYSCTL(sctp_enable_sack_immediately) && ((sp->sinfo_flags & SCTP_EOF) == SCTP_EOF)) ||
- ((sp->sinfo_flags & SCTP_SACK_IMMEDIATELY) == SCTP_SACK_IMMEDIATELY)) {
+ if (SCTP_BASE_SYSCTL(sctp_enable_sack_immediately) &&
+ (sp->sinfo_flags & SCTP_EOF) == SCTP_EOF) {
rcv_flags |= SCTP_DATA_SACK_IMMEDIATELY;
}
/* clear out the chunk before setting up */
@@ -7376,7 +7397,7 @@ re_look:
struct mbuf *m;
dont_do_it:
- chk->data = SCTP_M_COPYM(sp->data, 0, to_move, M_DONTWAIT);
+ chk->data = SCTP_M_COPYM(sp->data, 0, to_move, M_NOWAIT);
chk->last_mbuf = NULL;
if (chk->data == NULL) {
sp->some_taken = some_taken;
@@ -7387,13 +7408,7 @@ dont_do_it:
}
#ifdef SCTP_MBUF_LOGGING
if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) {
- struct mbuf *mat;
-
- for (mat = chk->data; mat; mat = SCTP_BUF_NEXT(mat)) {
- if (SCTP_BUF_IS_EXTENDED(mat)) {
- sctp_log_mb(mat, SCTP_MBUF_ICOPY);
- }
- }
+ sctp_log_mbc(chk->data, SCTP_MBUF_ICOPY);
}
#endif
/* Pull off the data */
@@ -7428,7 +7443,7 @@ dont_do_it:
chk->copy_by_ref = 0;
}
/*
- * get last_mbuf and counts of mb useage This is ugly but hopefully
+ * get last_mbuf and counts of mb usage This is ugly but hopefully
* its only one mbuf.
*/
if (chk->last_mbuf == NULL) {
@@ -7451,11 +7466,16 @@ dont_do_it:
} else {
atomic_subtract_int(&sp->length, to_move);
}
- if (M_LEADINGSPACE(chk->data) < (int)sizeof(struct sctp_data_chunk)) {
+ if (stcb->asoc.idata_supported == 0) {
+ leading = sizeof(struct sctp_data_chunk);
+ } else {
+ leading = sizeof(struct sctp_idata_chunk);
+ }
+ if (M_LEADINGSPACE(chk->data) < leading) {
/* Not enough room for a chunk header, get some */
struct mbuf *m;
- m = sctp_get_mbuf_for_msg(1, 0, M_DONTWAIT, 0, MT_DATA);
+ m = sctp_get_mbuf_for_msg(1, 0, M_NOWAIT, 0, MT_DATA);
if (m == NULL) {
/*
* we're in trouble here. _PREPEND below will free
@@ -7466,7 +7486,7 @@ dont_do_it:
SCTP_TCB_SEND_LOCK(stcb);
send_lock_up = 1;
}
- if (chk->data == NULL) {
+ if (sp->data == NULL) {
/* unsteal the data */
sp->data = chk->data;
sp->tail_mbuf = chk->last_mbuf;
@@ -7492,7 +7512,11 @@ dont_do_it:
M_ALIGN(chk->data, 4);
}
}
- SCTP_BUF_PREPEND(chk->data, sizeof(struct sctp_data_chunk), M_DONTWAIT);
+ if (stcb->asoc.idata_supported == 0) {
+ SCTP_BUF_PREPEND(chk->data, sizeof(struct sctp_data_chunk), M_NOWAIT);
+ } else {
+ SCTP_BUF_PREPEND(chk->data, sizeof(struct sctp_idata_chunk), M_NOWAIT);
+ }
if (chk->data == NULL) {
/* HELP, TSNH since we assured it would not above? */
#ifdef INVARIANTS
@@ -7505,8 +7529,13 @@ dont_do_it:
to_move = 0;
goto out_of;
}
- sctp_snd_sb_alloc(stcb, sizeof(struct sctp_data_chunk));
- chk->book_size = chk->send_size = (to_move + sizeof(struct sctp_data_chunk));
+ if (stcb->asoc.idata_supported == 0) {
+ sctp_snd_sb_alloc(stcb, sizeof(struct sctp_data_chunk));
+ chk->book_size = chk->send_size = (uint16_t) (to_move + sizeof(struct sctp_data_chunk));
+ } else {
+ sctp_snd_sb_alloc(stcb, sizeof(struct sctp_idata_chunk));
+ chk->book_size = chk->send_size = (uint16_t) (to_move + sizeof(struct sctp_idata_chunk));
+ }
chk->book_size_scale = 0;
chk->sent = SCTP_DATAGRAM_UNSENT;
@@ -7514,10 +7543,28 @@ dont_do_it:
chk->asoc = &stcb->asoc;
chk->pad_inplace = 0;
chk->no_fr_allowed = 0;
- chk->rec.data.stream_seq = strq->next_sequence_send;
- if ((rcv_flags & SCTP_DATA_LAST_FRAG) &&
- !(rcv_flags & SCTP_DATA_UNORDERED)) {
- strq->next_sequence_send++;
+ if (stcb->asoc.idata_supported == 0) {
+ if (rcv_flags & SCTP_DATA_UNORDERED) {
+ /* Just use 0. The receiver ignores the values. */
+ chk->rec.data.stream_seq = 0;
+ } else {
+ chk->rec.data.stream_seq = strq->next_mid_ordered;
+ if (rcv_flags & SCTP_DATA_LAST_FRAG) {
+ strq->next_mid_ordered++;
+ }
+ }
+ } else {
+ if (rcv_flags & SCTP_DATA_UNORDERED) {
+ chk->rec.data.stream_seq = strq->next_mid_unordered;
+ if (rcv_flags & SCTP_DATA_LAST_FRAG) {
+ strq->next_mid_unordered++;
+ }
+ } else {
+ chk->rec.data.stream_seq = strq->next_mid_ordered;
+ if (rcv_flags & SCTP_DATA_LAST_FRAG) {
+ strq->next_mid_ordered++;
+ }
+ }
}
chk->rec.data.stream_number = sp->stream;
chk->rec.data.payloadtype = sp->ppid;
@@ -7541,11 +7588,15 @@ dont_do_it:
chk->rec.data.TSN_seq = atomic_fetchadd_int(&asoc->sending_seq, 1);
if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_AT_SEND_2_OUTQ) {
sctp_misc_ints(SCTP_STRMOUT_LOG_SEND,
- (uintptr_t) stcb, sp->length,
+ (uint32_t) (uintptr_t) stcb, sp->length,
(uint32_t) ((chk->rec.data.stream_number << 16) | chk->rec.data.stream_seq),
chk->rec.data.TSN_seq);
}
- dchkh = mtod(chk->data, struct sctp_data_chunk *);
+ if (stcb->asoc.idata_supported == 0) {
+ dchkh = mtod(chk->data, struct sctp_data_chunk *);
+ } else {
+ ndchkh = mtod(chk->data, struct sctp_idata_chunk *);
+ }
/*
* Put the rest of the things in place now. Size was done earlier in
* previous loop prior to padding.
@@ -7567,14 +7618,28 @@ dont_do_it:
asoc->out_tsnlog[asoc->tsn_out_at].in_out = 2;
asoc->tsn_out_at++;
#endif
-
- dchkh->ch.chunk_type = SCTP_DATA;
- dchkh->ch.chunk_flags = chk->rec.data.rcv_flags;
- dchkh->dp.tsn = htonl(chk->rec.data.TSN_seq);
- dchkh->dp.stream_id = htons(strq->stream_no);
- dchkh->dp.stream_sequence = htons(chk->rec.data.stream_seq);
- dchkh->dp.protocol_id = chk->rec.data.payloadtype;
- dchkh->ch.chunk_length = htons(chk->send_size);
+ if (stcb->asoc.idata_supported == 0) {
+ dchkh->ch.chunk_type = SCTP_DATA;
+ dchkh->ch.chunk_flags = chk->rec.data.rcv_flags;
+ dchkh->dp.tsn = htonl(chk->rec.data.TSN_seq);
+ dchkh->dp.stream_id = htons((strq->stream_no & 0x0000ffff));
+ dchkh->dp.stream_sequence = htons((uint16_t) chk->rec.data.stream_seq);
+ dchkh->dp.protocol_id = chk->rec.data.payloadtype;
+ dchkh->ch.chunk_length = htons(chk->send_size);
+ } else {
+ ndchkh->ch.chunk_type = SCTP_IDATA;
+ ndchkh->ch.chunk_flags = chk->rec.data.rcv_flags;
+ ndchkh->dp.tsn = htonl(chk->rec.data.TSN_seq);
+ ndchkh->dp.stream_id = htons(strq->stream_no);
+ ndchkh->dp.reserved = htons(0);
+ ndchkh->dp.msg_id = htonl(chk->rec.data.stream_seq);
+ if (sp->fsn == 0)
+ ndchkh->dp.ppid_fsn.protocol_id = chk->rec.data.payloadtype;
+ else
+ ndchkh->dp.ppid_fsn.fsn = htonl(sp->fsn);
+ sp->fsn++;
+ ndchkh->ch.chunk_length = htons(chk->send_size);
+ }
/* Now advance the chk->send_size by the actual pad needed. */
if (chk->send_size < SCTP_SIZE32(chk->book_size)) {
/* need a pad */
@@ -7582,12 +7647,10 @@ dont_do_it:
int pads;
pads = SCTP_SIZE32(chk->book_size) - chk->send_size;
- if (sctp_pad_lastmbuf(chk->data, pads, chk->last_mbuf) == 0) {
- chk->pad_inplace = 1;
- }
- if ((lm = SCTP_BUF_NEXT(chk->last_mbuf)) != NULL) {
- /* pad added an mbuf */
+ lm = sctp_pad_lastmbuf(chk->data, pads, chk->last_mbuf);
+ if (lm != NULL) {
chk->last_mbuf = lm;
+ chk->pad_inplace = 1;
}
chk->send_size += pads;
}
@@ -7596,7 +7659,6 @@ dont_do_it:
}
if (sp->msg_is_complete && (sp->length == 0) && (sp->sender_all_done)) {
/* All done pull and kill the message */
- atomic_subtract_int(&asoc->stream_queue_cnt, 1);
if (sp->put_last_out == 0) {
SCTP_PRINTF("Gak, put out entire msg with NO end!-2\n");
SCTP_PRINTF("sender_done:%d len:%d msg_comp:%d put_last_out:%d send_lock:%d\n",
@@ -7610,8 +7672,14 @@ dont_do_it:
SCTP_TCB_SEND_LOCK(stcb);
send_lock_up = 1;
}
+ atomic_subtract_int(&asoc->stream_queue_cnt, 1);
TAILQ_REMOVE(&strq->outqueue, sp, next);
stcb->asoc.ss_functions.sctp_ss_remove_from_stream(stcb, asoc, strq, sp, send_lock_up);
+ if ((strq->state == SCTP_STREAM_RESET_PENDING) &&
+ (strq->chunks_on_queues == 0) &&
+ TAILQ_EMPTY(&strq->outqueue)) {
+ stcb->asoc.trigger_reset = 1;
+ }
if (sp->net) {
sctp_free_remote_addr(sp->net);
sp->net = NULL;
@@ -7621,13 +7689,6 @@ dont_do_it:
sp->data = NULL;
}
sctp_free_a_strmoq(stcb, sp, so_locked);
-
- /* we can't be locked to it */
- *locked = 0;
- stcb->asoc.locked_on_sending = NULL;
- } else {
- /* more to go, we are locked */
- *locked = 1;
}
asoc->chunks_on_out_queue++;
strq->chunks_on_queues++;
@@ -7652,7 +7713,7 @@ sctp_fill_outqueue(struct sctp_tcb *stcb,
struct sctp_association *asoc;
struct sctp_stream_out *strq;
int goal_mtu, moved_how_much, total_moved = 0, bail = 0;
- int locked, giveup;
+ int giveup;
SCTP_TCB_LOCK_ASSERT(stcb);
asoc = &stcb->asoc;
@@ -7673,40 +7734,28 @@ sctp_fill_outqueue(struct sctp_tcb *stcb,
break;
}
/* Need an allowance for the data chunk header too */
- goal_mtu -= sizeof(struct sctp_data_chunk);
+ if (stcb->asoc.idata_supported == 0) {
+ goal_mtu -= sizeof(struct sctp_data_chunk);
+ } else {
+ goal_mtu -= sizeof(struct sctp_idata_chunk);
+ }
/* must make even word boundary */
goal_mtu &= 0xfffffffc;
- if (asoc->locked_on_sending) {
- /* We are stuck on one stream until the message completes. */
- strq = asoc->locked_on_sending;
- locked = 1;
- } else {
- strq = stcb->asoc.ss_functions.sctp_ss_select_stream(stcb, net, asoc);
- locked = 0;
- }
+ strq = stcb->asoc.ss_functions.sctp_ss_select_stream(stcb, net, asoc);
while ((goal_mtu > 0) && strq) {
giveup = 0;
bail = 0;
- moved_how_much = sctp_move_to_outqueue(stcb, strq, goal_mtu, frag_point, &locked,
+ moved_how_much = sctp_move_to_outqueue(stcb, strq, goal_mtu, frag_point,
&giveup, eeor_mode, &bail, so_locked);
- if (moved_how_much)
- stcb->asoc.ss_functions.sctp_ss_scheduled(stcb, net, asoc, strq, moved_how_much);
+ stcb->asoc.ss_functions.sctp_ss_scheduled(stcb, net, asoc, strq, moved_how_much);
- if (locked) {
- asoc->locked_on_sending = strq;
- if ((moved_how_much == 0) || (giveup) || bail)
- /* no more to move for now */
- break;
- } else {
- asoc->locked_on_sending = NULL;
- if ((giveup) || bail) {
- break;
- }
- strq = stcb->asoc.ss_functions.sctp_ss_select_stream(stcb, net, asoc);
- if (strq == NULL) {
- break;
- }
+ if ((giveup) || bail) {
+ break;
+ }
+ strq = stcb->asoc.ss_functions.sctp_ss_select_stream(stcb, net, asoc);
+ if (strq == NULL) {
+ break;
}
total_moved += moved_how_much;
goal_mtu -= (moved_how_much + sizeof(struct sctp_data_chunk));
@@ -7784,12 +7833,15 @@ sctp_med_chunk_output(struct sctp_inpcb *inp,
{
/**
* Ok this is the generic chunk service queue. we must do the
- * following: - Service the stream queue that is next, moving any
- * message (note I must get a complete message i.e. FIRST/MIDDLE and
- * LAST to the out queue in one pass) and assigning TSN's - Check to
- * see if the cwnd/rwnd allows any output, if so we go ahead and
- * fomulate and send the low level chunks. Making sure to combine
- * any control in the control chunk queue also.
+ * following:
+ * - Service the stream queue that is next, moving any
+ * message (note I must get a complete message i.e. FIRST/MIDDLE and
+ * LAST to the out queue in one pass) and assigning TSN's. This
+ * only applies if the peer does not support I-DATA (NDATA). For I-DATA
+ * chunks it is OK to not send the entire message ;-)
+ * - Check to see if the cwnd/rwnd allows any output, if so we go ahead and
+ * formulate and send the low level chunks. Making sure to combine
+ * any control in the control chunk queue also.
*/
struct sctp_nets *net, *start_at, *sack_goes_to = NULL, *old_start_at = NULL;
struct mbuf *outchain, *endoutchain;
@@ -7818,8 +7870,8 @@ sctp_med_chunk_output(struct sctp_inpcb *inp,
int quit_now = 0;
*num_out = 0;
+ *reason_code = 0;
auth_keyid = stcb->asoc.authinfo.active_keyid;
-
if ((asoc->state & SCTP_STATE_SHUTDOWN_PENDING) ||
(asoc->state & SCTP_STATE_SHUTDOWN_RECEIVED) ||
(sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXPLICIT_EOR))) {
@@ -7838,7 +7890,7 @@ sctp_med_chunk_output(struct sctp_inpcb *inp,
#endif
SCTP_TCB_LOCK_ASSERT(stcb);
hbflag = 0;
- if ((control_only) || (asoc->stream_reset_outstanding))
+ if (control_only)
no_data_chunks = 1;
else
no_data_chunks = 0;
@@ -7848,7 +7900,7 @@ sctp_med_chunk_output(struct sctp_inpcb *inp,
(asoc->ctrl_queue_cnt == stcb->asoc.ecn_echo_cnt_onq)) &&
TAILQ_EMPTY(&asoc->asconf_send_queue) &&
TAILQ_EMPTY(&asoc->send_queue) &&
- stcb->asoc.ss_functions.sctp_ss_is_empty(stcb, asoc)) {
+ sctp_is_there_unsent_data(stcb, so_locked) == 0) {
nothing_to_send:
*reason_code = 9;
return (0);
@@ -8006,31 +8058,15 @@ again_one_more_time:
} else {
skip_data_for_this_net = 0;
}
- if ((net->ro.ro_rt) && (net->ro.ro_rt->rt_ifp)) {
- /*
- * if we have a route and an ifp check to see if we
- * have room to send to this guy
- */
- struct ifnet *ifp;
-
- ifp = net->ro.ro_rt->rt_ifp;
- if ((ifp->if_snd.ifq_len + 2) >= ifp->if_snd.ifq_maxlen) {
- SCTP_STAT_INCR(sctps_ifnomemqueued);
- if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_MAXBURST_ENABLE) {
- sctp_log_maxburst(stcb, net, ifp->if_snd.ifq_len, ifp->if_snd.ifq_maxlen, SCTP_MAX_IFP_APPLIED);
- }
- continue;
- }
- }
switch (((struct sockaddr *)&net->ro._l_addr)->sa_family) {
#ifdef INET
case AF_INET:
- mtu = net->mtu - (sizeof(struct ip) + sizeof(struct sctphdr));
+ mtu = net->mtu - SCTP_MIN_V4_OVERHEAD;
break;
#endif
#ifdef INET6
case AF_INET6:
- mtu = net->mtu - (sizeof(struct ip6_hdr) + sizeof(struct sctphdr));
+ mtu = net->mtu - SCTP_MIN_OVERHEAD;
break;
#endif
default:
@@ -8052,6 +8088,7 @@ again_one_more_time:
} else {
r_mtu = mtu;
}
+ error = 0;
/************************/
/* ASCONF transmission */
/************************/
@@ -8175,6 +8212,12 @@ again_one_more_time:
* it is used to do appropriate
* source address selection.
*/
+ if (*now_filled == 0) {
+ (void)SCTP_GETTIME_TIMEVAL(now);
+ *now_filled = 1;
+ }
+ net->last_sent_time = *now;
+ hbflag = 0;
if ((error = sctp_lowlevel_chunk_output(inp, stcb, net,
(struct sockaddr *)&net->ro._l_addr,
outchain, auth_offset, auth,
@@ -8185,21 +8228,18 @@ again_one_more_time:
net->port, NULL,
0, 0,
so_locked))) {
- if (error == ENOBUFS) {
- asoc->ifp_had_enobuf = 1;
- SCTP_STAT_INCR(sctps_lowlevelerr);
- }
+ /*
+ * error, we could not
+ * output
+ */
+ SCTPDBG(SCTP_DEBUG_OUTPUT3, "Gak send error %d\n", error);
if (from_where == 0) {
SCTP_STAT_INCR(sctps_lowlevelerrusr);
}
- if (*now_filled == 0) {
- (void)SCTP_GETTIME_TIMEVAL(&net->last_sent_time);
- *now_filled = 1;
- *now = net->last_sent_time;
- } else {
- net->last_sent_time = *now;
+ if (error == ENOBUFS) {
+ asoc->ifp_had_enobuf = 1;
+ SCTP_STAT_INCR(sctps_lowlevelerr);
}
- hbflag = 0;
/* error, could not output */
if (error == EHOSTUNREACH) {
/*
@@ -8210,17 +8250,10 @@ again_one_more_time:
sctp_move_chunks_from_net(stcb, net);
}
*reason_code = 7;
- continue;
- } else
- asoc->ifp_had_enobuf = 0;
- if (*now_filled == 0) {
- (void)SCTP_GETTIME_TIMEVAL(&net->last_sent_time);
- *now_filled = 1;
- *now = net->last_sent_time;
+ break;
} else {
- net->last_sent_time = *now;
+ asoc->ifp_had_enobuf = 0;
}
- hbflag = 0;
/*
* increase the number we sent, if a
* cookie is sent we don't tell them
@@ -8253,6 +8286,10 @@ again_one_more_time:
}
}
}
+ if (error != 0) {
+ /* try next net */
+ continue;
+ }
/************************/
/* Control transmission */
/************************/
@@ -8391,7 +8428,8 @@ again_one_more_time:
/* turn off the timer */
if (SCTP_OS_TIMER_PENDING(&stcb->asoc.dack_timer.timer)) {
sctp_timer_stop(SCTP_TIMER_TYPE_RECV,
- inp, stcb, net, SCTP_FROM_SCTP_OUTPUT + SCTP_LOC_1);
+ inp, stcb, net,
+ SCTP_FROM_SCTP_OUTPUT + SCTP_LOC_1);
}
}
ctl_cnt++;
@@ -8448,6 +8486,15 @@ again_one_more_time:
sctp_timer_start(SCTP_TIMER_TYPE_COOKIE, inp, stcb, net);
cookie = 0;
}
+ /* Only HB or ASCONF advances time */
+ if (hbflag) {
+ if (*now_filled == 0) {
+ (void)SCTP_GETTIME_TIMEVAL(now);
+ *now_filled = 1;
+ }
+ net->last_sent_time = *now;
+ hbflag = 0;
+ }
if ((error = sctp_lowlevel_chunk_output(inp, stcb, net,
(struct sockaddr *)&net->ro._l_addr,
outchain,
@@ -8459,23 +8506,17 @@ again_one_more_time:
net->port, NULL,
0, 0,
so_locked))) {
- if (error == ENOBUFS) {
- asoc->ifp_had_enobuf = 1;
- SCTP_STAT_INCR(sctps_lowlevelerr);
- }
+ /*
+ * error, we could not
+ * output
+ */
+ SCTPDBG(SCTP_DEBUG_OUTPUT3, "Gak send error %d\n", error);
if (from_where == 0) {
SCTP_STAT_INCR(sctps_lowlevelerrusr);
}
- /* error, could not output */
- if (hbflag) {
- if (*now_filled == 0) {
- (void)SCTP_GETTIME_TIMEVAL(&net->last_sent_time);
- *now_filled = 1;
- *now = net->last_sent_time;
- } else {
- net->last_sent_time = *now;
- }
- hbflag = 0;
+ if (error == ENOBUFS) {
+ asoc->ifp_had_enobuf = 1;
+ SCTP_STAT_INCR(sctps_lowlevelerr);
}
if (error == EHOSTUNREACH) {
/*
@@ -8486,19 +8527,9 @@ again_one_more_time:
sctp_move_chunks_from_net(stcb, net);
}
*reason_code = 7;
- continue;
- } else
+ break;
+ } else {
asoc->ifp_had_enobuf = 0;
- /* Only HB or ASCONF advances time */
- if (hbflag) {
- if (*now_filled == 0) {
- (void)SCTP_GETTIME_TIMEVAL(&net->last_sent_time);
- *now_filled = 1;
- *now = net->last_sent_time;
- } else {
- net->last_sent_time = *now;
- }
- hbflag = 0;
}
/*
* increase the number we sent, if a
@@ -8532,6 +8563,10 @@ again_one_more_time:
}
}
}
+ if (error != 0) {
+ /* try next net */
+ continue;
+ }
/* JRI: if dest is in PF state, do not send data to it */
if ((asoc->sctp_cmt_on_off > 0) &&
(net != stcb->asoc.alternate) &&
@@ -8576,16 +8611,16 @@ again_one_more_time:
switch (((struct sockaddr *)&net->ro._l_addr)->sa_family) {
#ifdef INET
case AF_INET:
- if (net->mtu > (sizeof(struct ip) + sizeof(struct sctphdr)))
- omtu = net->mtu - (sizeof(struct ip) + sizeof(struct sctphdr));
+ if (net->mtu > SCTP_MIN_V4_OVERHEAD)
+ omtu = net->mtu - SCTP_MIN_V4_OVERHEAD;
else
omtu = 0;
break;
#endif
#ifdef INET6
case AF_INET6:
- if (net->mtu > (sizeof(struct ip6_hdr) + sizeof(struct sctphdr)))
- omtu = net->mtu - (sizeof(struct ip6_hdr) + sizeof(struct sctphdr));
+ if (net->mtu > SCTP_MIN_OVERHEAD)
+ omtu = net->mtu - SCTP_MIN_OVERHEAD;
else
omtu = 0;
break;
@@ -8595,7 +8630,8 @@ again_one_more_time:
omtu = 0;
break;
}
- if ((((asoc->state & SCTP_STATE_OPEN) == SCTP_STATE_OPEN) &&
+ if ((((SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) ||
+ (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) &&
(skip_data_for_this_net == 0)) ||
(cookie)) {
TAILQ_FOREACH_SAFE(chk, &asoc->send_queue, sctp_next, nchk) {
@@ -8785,6 +8821,14 @@ no_data_fill:
*/
sctp_timer_start(SCTP_TIMER_TYPE_SEND, inp, stcb, net);
}
+ if (bundle_at || hbflag) {
+ /* For data/asconf and hb set time */
+ if (*now_filled == 0) {
+ (void)SCTP_GETTIME_TIMEVAL(now);
+ *now_filled = 1;
+ }
+ net->last_sent_time = *now;
+ }
/* Now send it, if there is anything to send :> */
if ((error = sctp_lowlevel_chunk_output(inp,
stcb,
@@ -8803,23 +8847,13 @@ no_data_fill:
0, 0,
so_locked))) {
/* error, we could not output */
- if (error == ENOBUFS) {
- SCTP_STAT_INCR(sctps_lowlevelerr);
- asoc->ifp_had_enobuf = 1;
- }
+ SCTPDBG(SCTP_DEBUG_OUTPUT3, "Gak send error %d\n", error);
if (from_where == 0) {
SCTP_STAT_INCR(sctps_lowlevelerrusr);
}
- SCTPDBG(SCTP_DEBUG_OUTPUT3, "Gak send error %d\n", error);
- if (hbflag) {
- if (*now_filled == 0) {
- (void)SCTP_GETTIME_TIMEVAL(&net->last_sent_time);
- *now_filled = 1;
- *now = net->last_sent_time;
- } else {
- net->last_sent_time = *now;
- }
- hbflag = 0;
+ if (error == ENOBUFS) {
+ SCTP_STAT_INCR(sctps_lowlevelerr);
+ asoc->ifp_had_enobuf = 1;
}
if (error == EHOSTUNREACH) {
/*
@@ -8844,16 +8878,6 @@ no_data_fill:
endoutchain = NULL;
auth = NULL;
auth_offset = 0;
- if (bundle_at || hbflag) {
- /* For data/asconf and hb set time */
- if (*now_filled == 0) {
- (void)SCTP_GETTIME_TIMEVAL(&net->last_sent_time);
- *now_filled = 1;
- *now = net->last_sent_time;
- } else {
- net->last_sent_time = *now;
- }
- }
if (!no_out_cnt) {
*num_out += (ctl_cnt + bundle_at);
}
@@ -8914,9 +8938,37 @@ sctp_queue_op_err(struct sctp_tcb *stcb, struct mbuf *op_err)
*/
struct sctp_chunkhdr *hdr;
struct sctp_tmit_chunk *chk;
- struct mbuf *mat;
+ struct mbuf *mat, *last_mbuf;
+ uint32_t chunk_length;
+ uint16_t padding_length;
SCTP_TCB_LOCK_ASSERT(stcb);
+ SCTP_BUF_PREPEND(op_err, sizeof(struct sctp_chunkhdr), M_NOWAIT);
+ if (op_err == NULL) {
+ return;
+ }
+ last_mbuf = NULL;
+ chunk_length = 0;
+ for (mat = op_err; mat != NULL; mat = SCTP_BUF_NEXT(mat)) {
+ chunk_length += SCTP_BUF_LEN(mat);
+ if (SCTP_BUF_NEXT(mat) == NULL) {
+ last_mbuf = mat;
+ }
+ }
+ if (chunk_length > SCTP_MAX_CHUNK_LENGTH) {
+ sctp_m_freem(op_err);
+ return;
+ }
+ padding_length = chunk_length % 4;
+ if (padding_length != 0) {
+ padding_length = 4 - padding_length;
+ }
+ if (padding_length != 0) {
+ if (sctp_add_pad_tombuf(last_mbuf, padding_length) == NULL) {
+ sctp_m_freem(op_err);
+ return;
+ }
+ }
sctp_alloc_a_chunk(stcb, chk);
if (chk == NULL) {
/* no memory */
@@ -8924,32 +8976,19 @@ sctp_queue_op_err(struct sctp_tcb *stcb, struct mbuf *op_err)
return;
}
chk->copy_by_ref = 0;
- SCTP_BUF_PREPEND(op_err, sizeof(struct sctp_chunkhdr), M_DONTWAIT);
- if (op_err == NULL) {
- sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED);
- return;
- }
- chk->send_size = 0;
- mat = op_err;
- while (mat != NULL) {
- chk->send_size += SCTP_BUF_LEN(mat);
- mat = SCTP_BUF_NEXT(mat);
- }
- chk->rec.chunk_id.id = SCTP_OPERATION_ERROR;
- chk->rec.chunk_id.can_take_data = 1;
+ chk->send_size = (uint16_t) chunk_length;
chk->sent = SCTP_DATAGRAM_UNSENT;
chk->snd_count = 0;
- chk->flags = 0;
chk->asoc = &stcb->asoc;
chk->data = op_err;
chk->whoTo = NULL;
+ chk->rec.chunk_id.id = SCTP_OPERATION_ERROR;
+ chk->rec.chunk_id.can_take_data = 0;
hdr = mtod(op_err, struct sctp_chunkhdr *);
hdr->chunk_type = SCTP_OPERATION_ERROR;
hdr->chunk_flags = 0;
hdr->chunk_length = htons(chk->send_size);
- TAILQ_INSERT_TAIL(&chk->asoc->control_send_queue,
- chk,
- sctp_next);
+ TAILQ_INSERT_TAIL(&chk->asoc->control_send_queue, chk, sctp_next);
chk->asoc->ctrl_queue_cnt++;
}
@@ -8970,12 +9009,11 @@ sctp_send_cookie_echo(struct mbuf *m,
struct sctp_tmit_chunk *chk;
uint16_t ptype, plen;
+ SCTP_TCB_LOCK_ASSERT(stcb);
/* First find the cookie in the param area */
cookie = NULL;
at = offset + sizeof(struct sctp_init_chunk);
-
- SCTP_TCB_LOCK_ASSERT(stcb);
- do {
+ for (;;) {
phdr = sctp_get_next_param(m, at, &parm, sizeof(parm));
if (phdr == NULL) {
return (-3);
@@ -8989,32 +9027,21 @@ sctp_send_cookie_echo(struct mbuf *m,
if ((pad = (plen % 4))) {
plen += 4 - pad;
}
- cookie = SCTP_M_COPYM(m, at, plen, M_DONTWAIT);
+ cookie = SCTP_M_COPYM(m, at, plen, M_NOWAIT);
if (cookie == NULL) {
/* No memory */
return (-2);
}
#ifdef SCTP_MBUF_LOGGING
if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) {
- struct mbuf *mat;
-
- for (mat = cookie; mat; mat = SCTP_BUF_NEXT(mat)) {
- if (SCTP_BUF_IS_EXTENDED(mat)) {
- sctp_log_mb(mat, SCTP_MBUF_ICOPY);
- }
- }
+ sctp_log_mbc(cookie, SCTP_MBUF_ICOPY);
}
#endif
break;
}
at += SCTP_SIZE32(plen);
- } while (phdr);
- if (cookie == NULL) {
- /* Did not find the cookie */
- return (-3);
}
/* ok, we got the cookie lets change it into a cookie echo chunk */
-
/* first the change from param to cookie */
hdr = mtod(cookie, struct sctp_chunkhdr *);
hdr->chunk_type = SCTP_COOKIE_ECHO;
@@ -9027,12 +9054,12 @@ sctp_send_cookie_echo(struct mbuf *m,
return (-5);
}
chk->copy_by_ref = 0;
- chk->send_size = plen;
chk->rec.chunk_id.id = SCTP_COOKIE_ECHO;
chk->rec.chunk_id.can_take_data = 0;
+ chk->flags = CHUNK_FLAGS_FRAGMENT_OK;
+ chk->send_size = plen;
chk->sent = SCTP_DATAGRAM_UNSENT;
chk->snd_count = 0;
- chk->flags = CHUNK_FLAGS_FRAGMENT_OK;
chk->asoc = &stcb->asoc;
chk->data = cookie;
chk->whoTo = net;
@@ -9061,20 +9088,14 @@ sctp_send_heartbeat_ack(struct sctp_tcb *stcb,
/* must have a net pointer */
return;
- outchain = SCTP_M_COPYM(m, offset, chk_length, M_DONTWAIT);
+ outchain = SCTP_M_COPYM(m, offset, chk_length, M_NOWAIT);
if (outchain == NULL) {
/* gak out of memory */
return;
}
#ifdef SCTP_MBUF_LOGGING
if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) {
- struct mbuf *mat;
-
- for (mat = outchain; mat; mat = SCTP_BUF_NEXT(mat)) {
- if (SCTP_BUF_IS_EXTENDED(mat)) {
- sctp_log_mb(mat, SCTP_MBUF_ICOPY);
- }
- }
+ sctp_log_mbc(outchain, SCTP_MBUF_ICOPY);
}
#endif
chdr = mtod(outchain, struct sctp_chunkhdr *);
@@ -9095,12 +9116,12 @@ sctp_send_heartbeat_ack(struct sctp_tcb *stcb,
return;
}
chk->copy_by_ref = 0;
- chk->send_size = chk_length;
chk->rec.chunk_id.id = SCTP_HEARTBEAT_ACK;
chk->rec.chunk_id.can_take_data = 1;
+ chk->flags = 0;
+ chk->send_size = chk_length;
chk->sent = SCTP_DATAGRAM_UNSENT;
chk->snd_count = 0;
- chk->flags = 0;
chk->asoc = &stcb->asoc;
chk->data = outchain;
chk->whoTo = net;
@@ -9119,7 +9140,7 @@ sctp_send_cookie_ack(struct sctp_tcb *stcb)
SCTP_TCB_LOCK_ASSERT(stcb);
- cookie_ack = sctp_get_mbuf_for_msg(sizeof(struct sctp_chunkhdr), 0, M_DONTWAIT, 1, MT_HEADER);
+ cookie_ack = sctp_get_mbuf_for_msg(sizeof(struct sctp_chunkhdr), 0, M_NOWAIT, 1, MT_HEADER);
if (cookie_ack == NULL) {
/* no mbuf's */
return;
@@ -9132,12 +9153,12 @@ sctp_send_cookie_ack(struct sctp_tcb *stcb)
return;
}
chk->copy_by_ref = 0;
- chk->send_size = sizeof(struct sctp_chunkhdr);
chk->rec.chunk_id.id = SCTP_COOKIE_ACK;
chk->rec.chunk_id.can_take_data = 1;
+ chk->flags = 0;
+ chk->send_size = sizeof(struct sctp_chunkhdr);
chk->sent = SCTP_DATAGRAM_UNSENT;
chk->snd_count = 0;
- chk->flags = 0;
chk->asoc = &stcb->asoc;
chk->data = cookie_ack;
if (chk->asoc->last_control_chunk_from != NULL) {
@@ -9165,7 +9186,7 @@ sctp_send_shutdown_ack(struct sctp_tcb *stcb, struct sctp_nets *net)
struct sctp_shutdown_ack_chunk *ack_cp;
struct sctp_tmit_chunk *chk;
- m_shutdown_ack = sctp_get_mbuf_for_msg(sizeof(struct sctp_shutdown_ack_chunk), 0, M_DONTWAIT, 1, MT_HEADER);
+ m_shutdown_ack = sctp_get_mbuf_for_msg(sizeof(struct sctp_shutdown_ack_chunk), 0, M_NOWAIT, 1, MT_HEADER);
if (m_shutdown_ack == NULL) {
/* no mbuf's */
return;
@@ -9178,9 +9199,10 @@ sctp_send_shutdown_ack(struct sctp_tcb *stcb, struct sctp_nets *net)
return;
}
chk->copy_by_ref = 0;
- chk->send_size = sizeof(struct sctp_chunkhdr);
chk->rec.chunk_id.id = SCTP_SHUTDOWN_ACK;
chk->rec.chunk_id.can_take_data = 1;
+ chk->flags = 0;
+ chk->send_size = sizeof(struct sctp_chunkhdr);
chk->sent = SCTP_DATAGRAM_UNSENT;
chk->snd_count = 0;
chk->flags = 0;
@@ -9208,7 +9230,7 @@ sctp_send_shutdown(struct sctp_tcb *stcb, struct sctp_nets *net)
struct sctp_shutdown_chunk *shutdown_cp;
struct sctp_tmit_chunk *chk;
- m_shutdown = sctp_get_mbuf_for_msg(sizeof(struct sctp_shutdown_chunk), 0, M_DONTWAIT, 1, MT_HEADER);
+ m_shutdown = sctp_get_mbuf_for_msg(sizeof(struct sctp_shutdown_chunk), 0, M_NOWAIT, 1, MT_HEADER);
if (m_shutdown == NULL) {
/* no mbuf's */
return;
@@ -9221,9 +9243,10 @@ sctp_send_shutdown(struct sctp_tcb *stcb, struct sctp_nets *net)
return;
}
chk->copy_by_ref = 0;
- chk->send_size = sizeof(struct sctp_shutdown_chunk);
chk->rec.chunk_id.id = SCTP_SHUTDOWN;
chk->rec.chunk_id.can_take_data = 1;
+ chk->flags = 0;
+ chk->send_size = sizeof(struct sctp_shutdown_chunk);
chk->sent = SCTP_DATAGRAM_UNSENT;
chk->snd_count = 0;
chk->flags = 0;
@@ -9274,13 +9297,13 @@ sctp_send_asconf(struct sctp_tcb *stcb, struct sctp_nets *net, int addr_locked)
return;
}
chk->copy_by_ref = 0;
- chk->data = m_asconf;
- chk->send_size = len;
chk->rec.chunk_id.id = SCTP_ASCONF;
chk->rec.chunk_id.can_take_data = 0;
+ chk->flags = CHUNK_FLAGS_FRAGMENT_OK;
+ chk->data = m_asconf;
+ chk->send_size = len;
chk->sent = SCTP_DATAGRAM_UNSENT;
chk->snd_count = 0;
- chk->flags = CHUNK_FLAGS_FRAGMENT_OK;
chk->asoc = &stcb->asoc;
chk->whoTo = net;
if (chk->whoTo) {
@@ -9344,20 +9367,14 @@ sctp_send_asconf_ack(struct sctp_tcb *stcb)
continue;
}
/* copy the asconf_ack */
- m_ack = SCTP_M_COPYM(ack->data, 0, M_COPYALL, M_DONTWAIT);
+ m_ack = SCTP_M_COPYM(ack->data, 0, M_COPYALL, M_NOWAIT);
if (m_ack == NULL) {
/* couldn't copy it */
return;
}
#ifdef SCTP_MBUF_LOGGING
if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) {
- struct mbuf *mat;
-
- for (mat = m_ack; mat; mat = SCTP_BUF_NEXT(mat)) {
- if (SCTP_BUF_IS_EXTENDED(mat)) {
- sctp_log_mb(mat, SCTP_MBUF_ICOPY);
- }
- }
+ sctp_log_mbc(m_ack, SCTP_MBUF_ICOPY);
}
#endif
@@ -9369,20 +9386,17 @@ sctp_send_asconf_ack(struct sctp_tcb *stcb)
return;
}
chk->copy_by_ref = 0;
-
+ chk->rec.chunk_id.id = SCTP_ASCONF_ACK;
+ chk->rec.chunk_id.can_take_data = 1;
+ chk->flags = CHUNK_FLAGS_FRAGMENT_OK;
chk->whoTo = net;
if (chk->whoTo) {
atomic_add_int(&chk->whoTo->ref_count, 1);
}
chk->data = m_ack;
- chk->send_size = 0;
- /* Get size */
chk->send_size = ack->len;
- chk->rec.chunk_id.id = SCTP_ASCONF_ACK;
- chk->rec.chunk_id.can_take_data = 1;
chk->sent = SCTP_DATAGRAM_UNSENT;
chk->snd_count = 0;
- chk->flags |= CHUNK_FLAGS_FRAGMENT_OK; /* XXX */
chk->asoc = &stcb->asoc;
TAILQ_INSERT_TAIL(&chk->asoc->control_send_queue, chk, sctp_next);
@@ -9491,7 +9505,7 @@ sctp_chunk_retransmission(struct sctp_inpcb *inp,
cnt_thru = 0;
/* do we have control chunks to retransmit? */
if (m != NULL) {
- /* Start a timer no matter if we suceed or fail */
+ /* Start a timer no matter if we succeed or fail */
if (chk->rec.chunk_id.id == SCTP_COOKIE_ECHO) {
sctp_timer_start(SCTP_TIMER_TYPE_COOKIE, inp, stcb, chk->whoTo);
} else if (chk->rec.chunk_id.id == SCTP_ASCONF)
@@ -9555,12 +9569,16 @@ sctp_chunk_retransmission(struct sctp_inpcb *inp,
}
if ((SCTP_BASE_SYSCTL(sctp_max_retran_chunk)) &&
(chk->snd_count >= SCTP_BASE_SYSCTL(sctp_max_retran_chunk))) {
- /* Gak, we have exceeded max unlucky retran, abort! */
- SCTP_PRINTF("Gak, chk->snd_count:%d >= max:%d - send abort\n",
- chk->snd_count,
- SCTP_BASE_SYSCTL(sctp_max_retran_chunk));
+ struct mbuf *op_err;
+ char msg[SCTP_DIAG_INFO_LEN];
+
+ snprintf(msg, sizeof(msg), "TSN %8.8x retransmitted %d times, giving up",
+ chk->rec.data.TSN_seq, chk->snd_count);
+ op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code),
+ msg);
atomic_add_int(&stcb->asoc.refcnt, 1);
- sctp_abort_an_association(stcb->sctp_ep, stcb, NULL, so_locked);
+ sctp_abort_an_association(stcb->sctp_ep, stcb, op_err,
+ so_locked);
SCTP_TCB_LOCK(stcb);
atomic_subtract_int(&stcb->asoc.refcnt, 1);
return (SCTP_RETRAN_EXIT);
@@ -9752,7 +9770,7 @@ one_chunk_around:
/* Is there something to send for this destination? */
if (m) {
/*
- * No matter if we fail/or suceed we should start a
+ * No matter if we fail/or succeed we should start a
* timer. A failure is like a lost IP packet :-)
*/
if (!SCTP_OS_TIMER_PENDING(&net->rxt_timer.timer)) {
@@ -9850,7 +9868,7 @@ one_chunk_around:
sctp_misc_ints(SCTP_FLIGHT_LOG_UP_RSND,
data_list[i]->whoTo->flight_size,
data_list[i]->book_size,
- (uintptr_t) data_list[i]->whoTo,
+ (uint32_t) (uintptr_t) data_list[i]->whoTo,
data_list[i]->rec.data.TSN_seq);
}
sctp_flight_size_increase(data_list[i]);
@@ -9874,7 +9892,7 @@ one_chunk_around:
* t3-expiring.
*/
sctp_timer_stop(SCTP_TIMER_TYPE_SEND, inp, stcb, net,
- SCTP_FROM_SCTP_OUTPUT + SCTP_LOC_4);
+ SCTP_FROM_SCTP_OUTPUT + SCTP_LOC_2);
sctp_timer_start(SCTP_TIMER_TYPE_SEND, inp, stcb, net);
}
}
@@ -9954,7 +9972,7 @@ sctp_chunk_output(struct sctp_inpcb *inp,
*/
struct sctp_association *asoc;
struct sctp_nets *net;
- int error = 0, num_out = 0, tot_out = 0, ret = 0, reason_code = 0;
+ int error = 0, num_out, tot_out = 0, ret = 0, reason_code;
unsigned int burst_cnt = 0;
struct timeval now;
int now_filled = 0;
@@ -9965,6 +9983,7 @@ sctp_chunk_output(struct sctp_inpcb *inp,
unsigned int tot_frs = 0;
asoc = &stcb->asoc;
+do_it_again:
/* The Nagle algorithm is only applied when handling a send call. */
if (from_where == SCTP_OUTPUT_FROM_USR_SEND) {
if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NODELAY)) {
@@ -9982,7 +10001,8 @@ sctp_chunk_output(struct sctp_inpcb *inp,
if ((un_sent <= 0) &&
(TAILQ_EMPTY(&asoc->control_send_queue)) &&
(TAILQ_EMPTY(&asoc->asconf_send_queue)) &&
- (asoc->sent_queue_retran_cnt == 0)) {
+ (asoc->sent_queue_retran_cnt == 0) &&
+ (asoc->trigger_reset == 0)) {
/* Nothing to do unless there is something to be sent left */
return;
}
@@ -10156,15 +10176,14 @@ sctp_chunk_output(struct sctp_inpcb *inp,
un_sent = ((stcb->asoc.total_output_queue_size - stcb->asoc.total_flight) +
(stcb->asoc.stream_queue_cnt * sizeof(struct sctp_data_chunk)));
if ((un_sent < (int)(stcb->asoc.smallest_mtu - SCTP_MIN_OVERHEAD)) &&
- (stcb->asoc.total_flight > 0) &&
- ((stcb->asoc.locked_on_sending == NULL) ||
- sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXPLICIT_EOR))) {
+ (stcb->asoc.total_flight > 0)) {
+/* && sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXPLICIT_EOR))) {*/
break;
}
}
if (TAILQ_EMPTY(&asoc->control_send_queue) &&
TAILQ_EMPTY(&asoc->send_queue) &&
- stcb->asoc.ss_functions.sctp_ss_is_empty(stcb, asoc)) {
+ sctp_is_there_unsent_data(stcb, so_locked) == 0) {
/* Nothing left to send */
break;
}
@@ -10201,6 +10220,12 @@ sctp_chunk_output(struct sctp_inpcb *inp,
*/
if (stcb->asoc.ecn_echo_cnt_onq)
sctp_fix_ecn_echo(asoc);
+
+ if (stcb->asoc.trigger_reset) {
+ if (sctp_send_stream_reset_out_if_possible(stcb, so_locked) == 0) {
+ goto do_it_again;
+ }
+ }
return;
}
@@ -10235,10 +10260,21 @@ void
send_forward_tsn(struct sctp_tcb *stcb,
struct sctp_association *asoc)
{
- struct sctp_tmit_chunk *chk;
+ struct sctp_tmit_chunk *chk, *at, *tp1, *last;
struct sctp_forward_tsn_chunk *fwdtsn;
+ struct sctp_strseq *strseq;
+ struct sctp_strseq_mid *strseq_m;
uint32_t advance_peer_ack_point;
+ unsigned int cnt_of_space, i, ovh;
+ unsigned int space_needed;
+ unsigned int cnt_of_skipped = 0;
+ int old;
+ if (asoc->idata_supported) {
+ old = 0;
+ } else {
+ old = 1;
+ }
SCTP_TCB_LOCK_ASSERT(stcb);
TAILQ_FOREACH(chk, &asoc->control_send_queue, sctp_next) {
if (chk->rec.chunk_id.id == SCTP_FORWARD_CUM_TSN) {
@@ -10260,11 +10296,17 @@ send_forward_tsn(struct sctp_tcb *stcb,
}
asoc->fwd_tsn_cnt++;
chk->copy_by_ref = 0;
+ /*
+ * We don't do the old thing here since this is used not for on-wire
+ * but to tell if we are sending a fwd-tsn by the stack during
+ * output. And if it's an IFORWARD or a FORWARD it is a fwd-tsn.
+ */
chk->rec.chunk_id.id = SCTP_FORWARD_CUM_TSN;
chk->rec.chunk_id.can_take_data = 0;
+ chk->flags = 0;
chk->asoc = asoc;
chk->whoTo = NULL;
- chk->data = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_DONTWAIT, 1, MT_DATA);
+ chk->data = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_DATA);
if (chk->data == NULL) {
sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED);
return;
@@ -10280,132 +10322,155 @@ sctp_fill_in_rest:
* stream/seq of the ones we skip.
*/
SCTP_BUF_LEN(chk->data) = 0;
- {
- struct sctp_tmit_chunk *at, *tp1, *last;
- struct sctp_strseq *strseq;
- unsigned int cnt_of_space, i, ovh;
- unsigned int space_needed;
- unsigned int cnt_of_skipped = 0;
-
- TAILQ_FOREACH(at, &asoc->sent_queue, sctp_next) {
- if ((at->sent != SCTP_FORWARD_TSN_SKIP) &&
- (at->sent != SCTP_DATAGRAM_NR_ACKED)) {
- /* no more to look at */
- break;
- }
- if (at->rec.data.rcv_flags & SCTP_DATA_UNORDERED) {
- /* We don't report these */
- continue;
- }
- cnt_of_skipped++;
+ TAILQ_FOREACH(at, &asoc->sent_queue, sctp_next) {
+ if ((at->sent != SCTP_FORWARD_TSN_SKIP) &&
+ (at->sent != SCTP_DATAGRAM_NR_ACKED)) {
+ /* no more to look at */
+ break;
+ }
+ if (old && (at->rec.data.rcv_flags & SCTP_DATA_UNORDERED)) {
+ /* We don't report these */
+ continue;
}
+ cnt_of_skipped++;
+ }
+ if (old) {
space_needed = (sizeof(struct sctp_forward_tsn_chunk) +
(cnt_of_skipped * sizeof(struct sctp_strseq)));
+ } else {
+ space_needed = (sizeof(struct sctp_forward_tsn_chunk) +
+ (cnt_of_skipped * sizeof(struct sctp_strseq_mid)));
+ }
+ cnt_of_space = (unsigned int)M_TRAILINGSPACE(chk->data);
- cnt_of_space = M_TRAILINGSPACE(chk->data);
-
- if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) {
- ovh = SCTP_MIN_OVERHEAD;
- } else {
- ovh = SCTP_MIN_V4_OVERHEAD;
- }
- if (cnt_of_space > (asoc->smallest_mtu - ovh)) {
- /* trim to a mtu size */
- cnt_of_space = asoc->smallest_mtu - ovh;
- }
+ if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) {
+ ovh = SCTP_MIN_OVERHEAD;
+ } else {
+ ovh = SCTP_MIN_V4_OVERHEAD;
+ }
+ if (cnt_of_space > (asoc->smallest_mtu - ovh)) {
+ /* trim to a mtu size */
+ cnt_of_space = asoc->smallest_mtu - ovh;
+ }
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_TRY_ADVANCE) {
+ sctp_misc_ints(SCTP_FWD_TSN_CHECK,
+ 0xff, 0, cnt_of_skipped,
+ asoc->advanced_peer_ack_point);
+ }
+ advance_peer_ack_point = asoc->advanced_peer_ack_point;
+ if (cnt_of_space < space_needed) {
+ /*-
+ * ok we must trim down the chunk by lowering the
+ * advance peer ack point.
+ */
if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_TRY_ADVANCE) {
sctp_misc_ints(SCTP_FWD_TSN_CHECK,
- 0xff, 0, cnt_of_skipped,
- asoc->advanced_peer_ack_point);
-
+ 0xff, 0xff, cnt_of_space,
+ space_needed);
}
- advance_peer_ack_point = asoc->advanced_peer_ack_point;
- if (cnt_of_space < space_needed) {
- /*-
- * ok we must trim down the chunk by lowering the
- * advance peer ack point.
- */
- if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_TRY_ADVANCE) {
- sctp_misc_ints(SCTP_FWD_TSN_CHECK,
- 0xff, 0xff, cnt_of_space,
- space_needed);
- }
+ if (old) {
cnt_of_skipped = cnt_of_space - sizeof(struct sctp_forward_tsn_chunk);
cnt_of_skipped /= sizeof(struct sctp_strseq);
- /*-
- * Go through and find the TSN that will be the one
- * we report.
- */
- at = TAILQ_FIRST(&asoc->sent_queue);
- if (at != NULL) {
- for (i = 0; i < cnt_of_skipped; i++) {
- tp1 = TAILQ_NEXT(at, sctp_next);
- if (tp1 == NULL) {
- break;
- }
- at = tp1;
+ } else {
+ cnt_of_skipped = cnt_of_space - sizeof(struct sctp_forward_tsn_chunk);
+ cnt_of_skipped /= sizeof(struct sctp_strseq_mid);
+ }
+ /*-
+ * Go through and find the TSN that will be the one
+ * we report.
+ */
+ at = TAILQ_FIRST(&asoc->sent_queue);
+ if (at != NULL) {
+ for (i = 0; i < cnt_of_skipped; i++) {
+ tp1 = TAILQ_NEXT(at, sctp_next);
+ if (tp1 == NULL) {
+ break;
}
+ at = tp1;
}
- if (at && SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_TRY_ADVANCE) {
- sctp_misc_ints(SCTP_FWD_TSN_CHECK,
- 0xff, cnt_of_skipped, at->rec.data.TSN_seq,
- asoc->advanced_peer_ack_point);
- }
- last = at;
- /*-
- * last now points to last one I can report, update
- * peer ack point
- */
- if (last)
- advance_peer_ack_point = last->rec.data.TSN_seq;
+ }
+ if (at && SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_TRY_ADVANCE) {
+ sctp_misc_ints(SCTP_FWD_TSN_CHECK,
+ 0xff, cnt_of_skipped, at->rec.data.TSN_seq,
+ asoc->advanced_peer_ack_point);
+ }
+ last = at;
+ /*-
+ * last now points to last one I can report, update
+ * peer ack point
+ */
+ if (last) {
+ advance_peer_ack_point = last->rec.data.TSN_seq;
+ }
+ if (old) {
space_needed = sizeof(struct sctp_forward_tsn_chunk) +
cnt_of_skipped * sizeof(struct sctp_strseq);
+ } else {
+ space_needed = sizeof(struct sctp_forward_tsn_chunk) +
+ cnt_of_skipped * sizeof(struct sctp_strseq_mid);
}
- chk->send_size = space_needed;
- /* Setup the chunk */
- fwdtsn = mtod(chk->data, struct sctp_forward_tsn_chunk *);
- fwdtsn->ch.chunk_length = htons(chk->send_size);
- fwdtsn->ch.chunk_flags = 0;
+ }
+ chk->send_size = space_needed;
+ /* Setup the chunk */
+ fwdtsn = mtod(chk->data, struct sctp_forward_tsn_chunk *);
+ fwdtsn->ch.chunk_length = htons(chk->send_size);
+ fwdtsn->ch.chunk_flags = 0;
+ if (old) {
fwdtsn->ch.chunk_type = SCTP_FORWARD_CUM_TSN;
- fwdtsn->new_cumulative_tsn = htonl(advance_peer_ack_point);
- SCTP_BUF_LEN(chk->data) = chk->send_size;
- fwdtsn++;
- /*-
- * Move pointer to after the fwdtsn and transfer to the
- * strseq pointer.
- */
+ } else {
+ fwdtsn->ch.chunk_type = SCTP_IFORWARD_CUM_TSN;
+ }
+ fwdtsn->new_cumulative_tsn = htonl(advance_peer_ack_point);
+ SCTP_BUF_LEN(chk->data) = chk->send_size;
+ fwdtsn++;
+ /*-
+ * Move pointer to after the fwdtsn and transfer to the
+ * strseq pointer.
+ */
+ if (old) {
strseq = (struct sctp_strseq *)fwdtsn;
- /*-
- * Now populate the strseq list. This is done blindly
- * without pulling out duplicate stream info. This is
- * inefficent but won't harm the process since the peer will
- * look at these in sequence and will thus release anything.
- * It could mean we exceed the PMTU and chop off some that
- * we could have included.. but this is unlikely (aka 1432/4
- * would mean 300+ stream seq's would have to be reported in
- * one FWD-TSN. With a bit of work we can later FIX this to
- * optimize and pull out duplcates.. but it does add more
- * overhead. So for now... not!
- */
- at = TAILQ_FIRST(&asoc->sent_queue);
- for (i = 0; i < cnt_of_skipped; i++) {
- tp1 = TAILQ_NEXT(at, sctp_next);
- if (tp1 == NULL)
- break;
+ } else {
+ strseq_m = (struct sctp_strseq_mid *)fwdtsn;
+ }
+ /*-
+ * Now populate the strseq list. This is done blindly
+ * without pulling out duplicate stream info. This is
+ * inefficient but won't harm the process since the peer will
+ * look at these in sequence and will thus release anything.
+ * It could mean we exceed the PMTU and chop off some that
+ * we could have included.. but this is unlikely (aka 1432/4
+ * would mean 300+ stream seq's would have to be reported in
+ * one FWD-TSN). With a bit of work we can later FIX this to
+ * optimize and pull out duplicates.. but it does add more
+ * overhead. So for now... not!
+ */
+ i = 0;
+ TAILQ_FOREACH(at, &asoc->sent_queue, sctp_next) {
+ if (i >= cnt_of_skipped) {
+ break;
+ }
+ if (old && (at->rec.data.rcv_flags & SCTP_DATA_UNORDERED)) {
+ /* We don't report these */
+ continue;
+ }
+ if (at->rec.data.TSN_seq == advance_peer_ack_point) {
+ at->rec.data.fwd_tsn_cnt = 0;
+ }
+ if (old) {
+ strseq->stream = htons(at->rec.data.stream_number);
+ strseq->sequence = htons((uint16_t) at->rec.data.stream_seq);
+ strseq++;
+ } else {
+ strseq_m->stream = htons(at->rec.data.stream_number);
if (at->rec.data.rcv_flags & SCTP_DATA_UNORDERED) {
- /* We don't report these */
- i--;
- at = tp1;
- continue;
- }
- if (at->rec.data.TSN_seq == advance_peer_ack_point) {
- at->rec.data.fwd_tsn_cnt = 0;
+ strseq_m->flags = htons(PR_SCTP_UNORDERED_FLAG);
+ } else {
+ strseq_m->flags = 0;
}
- strseq->stream = ntohs(at->rec.data.stream_number);
- strseq->sequence = ntohs(at->rec.data.stream_seq);
- strseq++;
- at = tp1;
+ strseq_m->msg_id = htonl(at->rec.data.stream_seq);
+ strseq_m++;
}
+ i++;
}
return;
}
@@ -10428,7 +10493,7 @@ sctp_send_sack(struct sctp_tcb *stcb, int so_locked
struct sctp_sack_chunk *sack;
struct sctp_nr_sack_chunk *nr_sack;
struct sctp_gap_ack_block *gap_descriptor;
- struct sack_track *selector;
+ const struct sack_track *selector;
int mergeable = 0;
int offset;
caddr_t limit;
@@ -10443,8 +10508,7 @@ sctp_send_sack(struct sctp_tcb *stcb, int so_locked
uint8_t type;
uint8_t tsn_map;
- if ((stcb->asoc.sctp_nr_sack_on_off == 1) &&
- (stcb->asoc.peer_supports_nr_sack == 1)) {
+ if (stcb->asoc.nrsack_supported == 1) {
type = SCTP_NR_SELECTIVE_ACK;
} else {
type = SCTP_SELECTIVE_ACK;
@@ -10481,7 +10545,8 @@ sctp_send_sack(struct sctp_tcb *stcb, int so_locked
/* No memory so we drop the idea, and set a timer */
if (stcb->asoc.delayed_ack) {
sctp_timer_stop(SCTP_TIMER_TYPE_RECV,
- stcb->sctp_ep, stcb, NULL, SCTP_FROM_SCTP_OUTPUT + SCTP_LOC_5);
+ stcb->sctp_ep, stcb, NULL,
+ SCTP_FROM_SCTP_OUTPUT + SCTP_LOC_3);
sctp_timer_start(SCTP_TIMER_TYPE_RECV,
stcb->sctp_ep, stcb, NULL);
} else {
@@ -10496,38 +10561,24 @@ sctp_send_sack(struct sctp_tcb *stcb, int so_locked
/* Clear our pkt counts */
asoc->data_pkts_seen = 0;
+ a_chk->flags = 0;
a_chk->asoc = asoc;
a_chk->snd_count = 0;
a_chk->send_size = 0; /* fill in later */
a_chk->sent = SCTP_DATAGRAM_UNSENT;
a_chk->whoTo = NULL;
- if ((asoc->numduptsns) ||
- (!(asoc->last_data_chunk_from->dest_state & SCTP_ADDR_REACHABLE))) {
+ if (!(asoc->last_data_chunk_from->dest_state & SCTP_ADDR_REACHABLE)) {
/*-
- * Ok, we have some duplicates or the destination for the
- * sack is unreachable, lets see if we can select an
- * alternate than asoc->last_data_chunk_from
+ * Ok, the destination for the SACK is unreachable, lets see if
+ * we can select an alternate to asoc->last_data_chunk_from
*/
- if ((asoc->last_data_chunk_from->dest_state & SCTP_ADDR_REACHABLE) &&
- (asoc->used_alt_onsack > asoc->numnets)) {
- /* We used an alt last time, don't this time */
- a_chk->whoTo = NULL;
- } else {
- asoc->used_alt_onsack++;
- a_chk->whoTo = sctp_find_alternate_net(stcb, asoc->last_data_chunk_from, 0);
- }
+ a_chk->whoTo = sctp_find_alternate_net(stcb, asoc->last_data_chunk_from, 0);
if (a_chk->whoTo == NULL) {
/* Nope, no alternate */
a_chk->whoTo = asoc->last_data_chunk_from;
- asoc->used_alt_onsack = 0;
}
} else {
- /*
- * No duplicates so we use the last place we received data
- * from.
- */
- asoc->used_alt_onsack = 0;
a_chk->whoTo = asoc->last_data_chunk_from;
}
if (a_chk->whoTo) {
@@ -10550,7 +10601,7 @@ sctp_send_sack(struct sctp_tcb *stcb, int so_locked
space_req = MCLBYTES;
}
/* Ok now lets formulate a MBUF with our sack */
- a_chk->data = sctp_get_mbuf_for_msg(space_req, 0, M_DONTWAIT, 1, MT_DATA);
+ a_chk->data = sctp_get_mbuf_for_msg(space_req, 0, M_NOWAIT, 1, MT_DATA);
if ((a_chk->data == NULL) ||
(a_chk->whoTo == NULL)) {
/* rats, no mbuf memory */
@@ -10563,7 +10614,8 @@ sctp_send_sack(struct sctp_tcb *stcb, int so_locked
/* sa_ignore NO_NULL_CHK */
if (stcb->asoc.delayed_ack) {
sctp_timer_stop(SCTP_TIMER_TYPE_RECV,
- stcb->sctp_ep, stcb, NULL, SCTP_FROM_SCTP_OUTPUT + SCTP_LOC_6);
+ stcb->sctp_ep, stcb, NULL,
+ SCTP_FROM_SCTP_OUTPUT + SCTP_LOC_4);
sctp_timer_start(SCTP_TIMER_TYPE_RECV,
stcb->sctp_ep, stcb, NULL);
} else {
@@ -10573,7 +10625,7 @@ sctp_send_sack(struct sctp_tcb *stcb, int so_locked
}
/* ok, lets go through and fill it in */
SCTP_BUF_RESV_UF(a_chk->data, SCTP_MIN_OVERHEAD);
- space = M_TRAILINGSPACE(a_chk->data);
+ space = (unsigned int)M_TRAILINGSPACE(a_chk->data);
if (space > (a_chk->whoTo->mtu - SCTP_MIN_OVERHEAD)) {
space = (a_chk->whoTo->mtu - SCTP_MIN_OVERHEAD);
}
@@ -10642,7 +10694,7 @@ sctp_send_sack(struct sctp_tcb *stcb, int so_locked
* Clear all bits corresponding to TSNs
* smaller or equal to the cumulative TSN.
*/
- tsn_map &= (~0 << (1 - offset));
+ tsn_map &= (~0U << (1 - offset));
}
selector = &sack_array[tsn_map];
if (mergeable && selector->right_edge) {
@@ -10717,7 +10769,7 @@ sctp_send_sack(struct sctp_tcb *stcb, int so_locked
* TSNs smaller or equal to the
* cumulative TSN.
*/
- tsn_map &= (~0 << (1 - offset));
+ tsn_map &= (~0U << (1 - offset));
}
selector = &sack_array[tsn_map];
if (mergeable && selector->right_edge) {
@@ -10787,9 +10839,9 @@ sctp_send_sack(struct sctp_tcb *stcb, int so_locked
* queue.
*/
if (type == SCTP_SELECTIVE_ACK) {
- a_chk->send_size = sizeof(struct sctp_sack_chunk) +
+ a_chk->send_size = (uint16_t) (sizeof(struct sctp_sack_chunk) +
(num_gap_blocks + num_nr_gap_blocks) * sizeof(struct sctp_gap_ack_block) +
- num_dups * sizeof(int32_t);
+ num_dups * sizeof(int32_t));
SCTP_BUF_LEN(a_chk->data) = a_chk->send_size;
sack->sack.cum_tsn_ack = htonl(asoc->cumulative_tsn);
sack->sack.a_rwnd = htonl(asoc->my_rwnd);
@@ -10799,9 +10851,9 @@ sctp_send_sack(struct sctp_tcb *stcb, int so_locked
sack->ch.chunk_flags = flags;
sack->ch.chunk_length = htons(a_chk->send_size);
} else {
- a_chk->send_size = sizeof(struct sctp_nr_sack_chunk) +
+ a_chk->send_size = (uint16_t) (sizeof(struct sctp_nr_sack_chunk) +
(num_gap_blocks + num_nr_gap_blocks) * sizeof(struct sctp_gap_ack_block) +
- num_dups * sizeof(int32_t);
+ num_dups * sizeof(int32_t));
SCTP_BUF_LEN(a_chk->data) = a_chk->send_size;
nr_sack->nr_sack.cum_tsn_ack = htonl(asoc->cumulative_tsn);
nr_sack->nr_sack.a_rwnd = htonl(asoc->my_rwnd);
@@ -10850,7 +10902,7 @@ sctp_send_abort_tcb(struct sctp_tcb *stcb, struct mbuf *operr, int so_locked
} else {
m_out = NULL;
}
- m_abort = sctp_get_mbuf_for_msg(sizeof(struct sctp_abort_chunk), 0, M_DONTWAIT, 1, MT_HEADER);
+ m_abort = sctp_get_mbuf_for_msg(sizeof(struct sctp_abort_chunk), 0, M_NOWAIT, 1, MT_HEADER);
if (m_abort == NULL) {
if (m_out) {
sctp_m_freem(m_out);
@@ -10900,7 +10952,8 @@ sctp_send_abort_tcb(struct sctp_tcb *stcb, struct mbuf *operr, int so_locked
abort->ch.chunk_length = htons(chunk_len);
/* Add padding, if necessary. */
if (padding_len > 0) {
- if ((m_last == NULL) || sctp_add_pad_tombuf(m_last, padding_len)) {
+ if ((m_last == NULL) ||
+ (sctp_add_pad_tombuf(m_last, padding_len) == NULL)) {
sctp_m_freem(m_out);
return;
}
@@ -10926,7 +10979,7 @@ sctp_send_shutdown_complete(struct sctp_tcb *stcb,
uint32_t vtag;
uint8_t flags;
- m_shutdown_comp = sctp_get_mbuf_for_msg(sizeof(struct sctp_chunkhdr), 0, M_DONTWAIT, 1, MT_HEADER);
+ m_shutdown_comp = sctp_get_mbuf_for_msg(sizeof(struct sctp_chunkhdr), 0, M_NOWAIT, 1, MT_HEADER);
if (m_shutdown_comp == NULL) {
/* no mbuf's */
return;
@@ -10959,20 +11012,21 @@ static void
sctp_send_resp_msg(struct sockaddr *src, struct sockaddr *dst,
struct sctphdr *sh, uint32_t vtag,
uint8_t type, struct mbuf *cause,
- uint8_t use_mflowid, uint32_t mflowid,
+ uint8_t mflowtype, uint32_t mflowid, uint16_t fibnum,
uint32_t vrf_id, uint16_t port)
{
struct mbuf *o_pak;
struct mbuf *mout;
struct sctphdr *shout;
struct sctp_chunkhdr *ch;
- struct udphdr *udp;
- int len, cause_len, padding_len;
#if defined(INET) || defined(INET6)
+ struct udphdr *udp;
int ret;
#endif
+ int len, cause_len, padding_len;
+
#ifdef INET
struct sockaddr_in *src_sin, *dst_sin;
struct ip *ip;
@@ -10999,7 +11053,7 @@ sctp_send_resp_msg(struct sockaddr *src, struct sockaddr *dst,
padding_len = 4 - padding_len;
}
if (padding_len != 0) {
- if (sctp_add_pad_tombuf(m_last, padding_len)) {
+ if (sctp_add_pad_tombuf(m_last, padding_len) == NULL) {
sctp_m_freem(cause);
return;
}
@@ -11023,10 +11077,12 @@ sctp_send_resp_msg(struct sockaddr *src, struct sockaddr *dst,
default:
break;
}
+#if defined(INET) || defined(INET6)
if (port) {
len += sizeof(struct udphdr);
}
- mout = sctp_get_mbuf_for_msg(len + max_linkhdr, 1, M_DONTWAIT, 1, MT_DATA);
+#endif
+ mout = sctp_get_mbuf_for_msg(len + max_linkhdr, 1, M_NOWAIT, 1, MT_DATA);
if (mout == NULL) {
if (cause) {
sctp_m_freem(cause);
@@ -11036,10 +11092,9 @@ sctp_send_resp_msg(struct sockaddr *src, struct sockaddr *dst,
SCTP_BUF_RESV_UF(mout, max_linkhdr);
SCTP_BUF_LEN(mout) = len;
SCTP_BUF_NEXT(mout) = cause;
- if (use_mflowid != 0) {
- mout->m_pkthdr.flowid = mflowid;
- mout->m_flags |= M_FLOWID;
- }
+ M_SETFIB(mout, fibnum);
+ mout->m_pkthdr.flowid = mflowid;
+ M_HASHTYPE_SET(mout, mflowtype);
#ifdef INET
ip = NULL;
#endif
@@ -11055,8 +11110,8 @@ sctp_send_resp_msg(struct sockaddr *src, struct sockaddr *dst,
ip->ip_v = IPVERSION;
ip->ip_hl = (sizeof(struct ip) >> 2);
ip->ip_tos = 0;
- ip->ip_id = ip_newid();
ip->ip_off = 0;
+ ip_fillid(ip);
ip->ip_ttl = MODULE_GLOBAL(ip_defttl);
if (port) {
ip->ip_p = IPPROTO_UDP;
@@ -11096,6 +11151,7 @@ sctp_send_resp_msg(struct sockaddr *src, struct sockaddr *dst,
shout = mtod(mout, struct sctphdr *);
break;
}
+#if defined(INET) || defined(INET6)
if (port) {
if (htons(SCTP_BASE_SYSCTL(sctp_udp_tunneling_port)) == 0) {
sctp_m_freem(mout);
@@ -11105,15 +11161,16 @@ sctp_send_resp_msg(struct sockaddr *src, struct sockaddr *dst,
udp->uh_sport = htons(SCTP_BASE_SYSCTL(sctp_udp_tunneling_port));
udp->uh_dport = port;
udp->uh_sum = 0;
- udp->uh_ulen = htons(sizeof(struct udphdr) +
+ udp->uh_ulen = htons((uint16_t) (sizeof(struct udphdr) +
sizeof(struct sctphdr) +
sizeof(struct sctp_chunkhdr) +
- cause_len + padding_len);
+ cause_len + padding_len));
len += sizeof(struct udphdr);
shout = (struct sctphdr *)((caddr_t)shout + sizeof(struct udphdr));
} else {
udp = NULL;
}
+#endif
shout->src_port = sh->dest_port;
shout->dest_port = sh->src_port;
shout->checksum = 0;
@@ -11130,7 +11187,7 @@ sctp_send_resp_msg(struct sockaddr *src, struct sockaddr *dst,
} else {
ch->chunk_flags = SCTP_HAD_NO_TCB;
}
- ch->chunk_length = htons(sizeof(struct sctp_chunkhdr) + cause_len);
+ ch->chunk_length = htons((uint16_t) (sizeof(struct sctp_chunkhdr) + cause_len));
len += sizeof(struct sctp_chunkhdr);
len += cause_len + padding_len;
@@ -11149,7 +11206,7 @@ sctp_send_resp_msg(struct sockaddr *src, struct sockaddr *dst,
udp->uh_sum = 0;
}
}
- ip->ip_len = len;
+ ip->ip_len = htons(len);
if (port) {
#if defined(SCTP_WITH_NO_CSUM)
SCTP_STAT_INCR(sctps_sendnocrc);
@@ -11179,7 +11236,7 @@ sctp_send_resp_msg(struct sockaddr *src, struct sockaddr *dst,
#endif
#ifdef INET6
case AF_INET6:
- ip6->ip6_plen = len - sizeof(struct ip6_hdr);
+ ip6->ip6_plen = (uint16_t) (len - sizeof(struct ip6_hdr));
if (port) {
#if defined(SCTP_WITH_NO_CSUM)
SCTP_STAT_INCR(sctps_sendnocrc);
@@ -11223,11 +11280,11 @@ sctp_send_resp_msg(struct sockaddr *src, struct sockaddr *dst,
void
sctp_send_shutdown_complete2(struct sockaddr *src, struct sockaddr *dst,
struct sctphdr *sh,
- uint8_t use_mflowid, uint32_t mflowid,
+ uint8_t mflowtype, uint32_t mflowid, uint16_t fibnum,
uint32_t vrf_id, uint16_t port)
{
sctp_send_resp_msg(src, dst, sh, 0, SCTP_SHUTDOWN_COMPLETE, NULL,
- use_mflowid, mflowid,
+ mflowtype, mflowid, fibnum,
vrf_id, port);
}
@@ -11267,10 +11324,11 @@ sctp_send_hb(struct sctp_tcb *stcb, struct sctp_nets *net, int so_locked
chk->copy_by_ref = 0;
chk->rec.chunk_id.id = SCTP_HEARTBEAT_REQUEST;
chk->rec.chunk_id.can_take_data = 1;
+ chk->flags = 0;
chk->asoc = &stcb->asoc;
chk->send_size = sizeof(struct sctp_heartbeat_chunk);
- chk->data = sctp_get_mbuf_for_msg(chk->send_size, 0, M_DONTWAIT, 1, MT_HEADER);
+ chk->data = sctp_get_mbuf_for_msg(chk->send_size, 0, M_NOWAIT, 1, MT_HEADER);
if (chk->data == NULL) {
sctp_free_a_chunk(stcb, chk, so_locked);
return;
@@ -11294,7 +11352,7 @@ sctp_send_hb(struct sctp_tcb *stcb, struct sctp_nets *net, int so_locked
hb->heartbeat.hb_info.time_value_1 = now.tv_sec;
hb->heartbeat.hb_info.time_value_2 = now.tv_usec;
/* Did our user request this one, put it in */
- hb->heartbeat.hb_info.addr_family = net->ro._l_addr.sa.sa_family;
+ hb->heartbeat.hb_info.addr_family = (uint8_t) net->ro._l_addr.sa.sa_family;
hb->heartbeat.hb_info.addr_len = net->ro._l_addr.sa.sa_len;
if (net->dest_state & SCTP_ADDR_UNCONFIRMED) {
/*
@@ -11323,6 +11381,11 @@ sctp_send_hb(struct sctp_tcb *stcb, struct sctp_nets *net, int so_locked
break;
#endif
default:
+ if (chk->data) {
+ sctp_m_freem(chk->data);
+ chk->data = NULL;
+ }
+ sctp_free_a_chunk(stcb, chk, so_locked);
return;
break;
}
@@ -11368,13 +11431,14 @@ sctp_send_ecn_echo(struct sctp_tcb *stcb, struct sctp_nets *net,
if (chk == NULL) {
return;
}
- chk->copy_by_ref = 0;
SCTP_STAT_INCR(sctps_queue_upd_ecne);
+ chk->copy_by_ref = 0;
chk->rec.chunk_id.id = SCTP_ECN_ECHO;
chk->rec.chunk_id.can_take_data = 0;
+ chk->flags = 0;
chk->asoc = &stcb->asoc;
chk->send_size = sizeof(struct sctp_ecne_chunk);
- chk->data = sctp_get_mbuf_for_msg(chk->send_size, 0, M_DONTWAIT, 1, MT_HEADER);
+ chk->data = sctp_get_mbuf_for_msg(chk->send_size, 0, M_NOWAIT, 1, MT_HEADER);
if (chk->data == NULL) {
sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED);
return;
@@ -11417,7 +11481,7 @@ sctp_send_packet_dropped(struct sctp_tcb *stcb, struct sctp_nets *net,
}
asoc = &stcb->asoc;
SCTP_TCB_LOCK_ASSERT(stcb);
- if (asoc->peer_supports_pktdrop == 0) {
+ if (asoc->pktdrop_supported == 0) {
/*-
* peer must declare support before I send one.
*/
@@ -11431,6 +11495,9 @@ sctp_send_packet_dropped(struct sctp_tcb *stcb, struct sctp_nets *net,
return;
}
chk->copy_by_ref = 0;
+ chk->rec.chunk_id.id = SCTP_PACKET_DROPPED;
+ chk->rec.chunk_id.can_take_data = 1;
+ chk->flags = 0;
len -= iphlen;
chk->send_size = len;
/* Validate that we do not have an ABORT in here. */
@@ -11473,7 +11540,7 @@ sctp_send_packet_dropped(struct sctp_tcb *stcb, struct sctp_nets *net,
was_trunc = 1;
}
chk->asoc = &stcb->asoc;
- chk->data = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_DONTWAIT, 1, MT_DATA);
+ chk->data = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_DATA);
if (chk->data == NULL) {
jump_out:
sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED);
@@ -11496,7 +11563,7 @@ jump_out:
* Len is already adjusted to size minus overhead above take
* out the pkt_drop chunk itself from it.
*/
- chk->send_size = len - sizeof(struct sctp_pktdrop_chunk);
+ chk->send_size = (uint16_t) (len - sizeof(struct sctp_pktdrop_chunk));
len = chk->send_size;
} else {
/* no truncation needed */
@@ -11517,8 +11584,6 @@ jump_out:
} else {
chk->whoTo = NULL;
}
- chk->rec.chunk_id.id = SCTP_PACKET_DROPPED;
- chk->rec.chunk_id.can_take_data = 1;
drp->ch.chunk_type = SCTP_PACKET_DROPPED;
drp->ch.chunk_length = htons(chk->send_size);
spc = SCTP_SB_LIMIT_RCV(stcb->sctp_socket);
@@ -11584,9 +11649,10 @@ sctp_send_cwr(struct sctp_tcb *stcb, struct sctp_nets *net, uint32_t high_tsn, u
chk->copy_by_ref = 0;
chk->rec.chunk_id.id = SCTP_ECN_CWR;
chk->rec.chunk_id.can_take_data = 1;
+ chk->flags = 0;
chk->asoc = &stcb->asoc;
chk->send_size = sizeof(struct sctp_cwr_chunk);
- chk->data = sctp_get_mbuf_for_msg(chk->send_size, 0, M_DONTWAIT, 1, MT_HEADER);
+ chk->data = sctp_get_mbuf_for_msg(chk->send_size, 0, M_NOWAIT, 1, MT_HEADER);
if (chk->data == NULL) {
sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED);
return;
@@ -11606,30 +11672,60 @@ sctp_send_cwr(struct sctp_tcb *stcb, struct sctp_nets *net, uint32_t high_tsn, u
asoc->ctrl_queue_cnt++;
}
-void
-sctp_add_stream_reset_out(struct sctp_tmit_chunk *chk,
- int number_entries, uint16_t * list,
+static int
+sctp_add_stream_reset_out(struct sctp_tcb *stcb, struct sctp_tmit_chunk *chk,
uint32_t seq, uint32_t resp_seq, uint32_t last_sent)
{
uint16_t len, old_len, i;
struct sctp_stream_reset_out_request *req_out;
struct sctp_chunkhdr *ch;
+ int at;
+ int number_entries = 0;
ch = mtod(chk->data, struct sctp_chunkhdr *);
old_len = len = SCTP_SIZE32(ntohs(ch->chunk_length));
-
/* get to new offset for the param. */
req_out = (struct sctp_stream_reset_out_request *)((caddr_t)ch + len);
/* now how long will this param be? */
- len = (sizeof(struct sctp_stream_reset_out_request) + (sizeof(uint16_t) * number_entries));
+ for (i = 0; i < stcb->asoc.streamoutcnt; i++) {
+ if ((stcb->asoc.strmout[i].state == SCTP_STREAM_RESET_PENDING) &&
+ (stcb->asoc.strmout[i].chunks_on_queues == 0) &&
+ TAILQ_EMPTY(&stcb->asoc.strmout[i].outqueue)) {
+ number_entries++;
+ }
+ }
+ if (number_entries == 0) {
+ return (0);
+ }
+ if (number_entries == stcb->asoc.streamoutcnt) {
+ number_entries = 0;
+ }
+ if (number_entries > SCTP_MAX_STREAMS_AT_ONCE_RESET) {
+ number_entries = SCTP_MAX_STREAMS_AT_ONCE_RESET;
+ }
+ len = (uint16_t) (sizeof(struct sctp_stream_reset_out_request) + (sizeof(uint16_t) * number_entries));
req_out->ph.param_type = htons(SCTP_STR_RESET_OUT_REQUEST);
req_out->ph.param_length = htons(len);
req_out->request_seq = htonl(seq);
req_out->response_seq = htonl(resp_seq);
req_out->send_reset_at_tsn = htonl(last_sent);
+ at = 0;
if (number_entries) {
- for (i = 0; i < number_entries; i++) {
- req_out->list_of_streams[i] = htons(list[i]);
+ for (i = 0; i < stcb->asoc.streamoutcnt; i++) {
+ if ((stcb->asoc.strmout[i].state == SCTP_STREAM_RESET_PENDING) &&
+ (stcb->asoc.strmout[i].chunks_on_queues == 0) &&
+ TAILQ_EMPTY(&stcb->asoc.strmout[i].outqueue)) {
+ req_out->list_of_streams[at] = htons(i);
+ at++;
+ stcb->asoc.strmout[i].state = SCTP_STREAM_RESET_IN_FLIGHT;
+ if (at >= number_entries) {
+ break;
+ }
+ }
+ }
+ } else {
+ for (i = 0; i < stcb->asoc.streamoutcnt; i++) {
+ stcb->asoc.strmout[i].state = SCTP_STREAM_RESET_IN_FLIGHT;
}
}
if (SCTP_SIZE32(len) > len) {
@@ -11646,7 +11742,7 @@ sctp_add_stream_reset_out(struct sctp_tmit_chunk *chk,
chk->book_size_scale = 0;
chk->send_size = SCTP_SIZE32(chk->book_size);
SCTP_BUF_LEN(chk->data) = chk->send_size;
- return;
+ return (1);
}
static void
@@ -11664,7 +11760,7 @@ sctp_add_stream_reset_in(struct sctp_tmit_chunk *chk,
/* get to new offset for the param. */
req_in = (struct sctp_stream_reset_in_request *)((caddr_t)ch + len);
/* now how long will this param be? */
- len = (sizeof(struct sctp_stream_reset_in_request) + (sizeof(uint16_t) * number_entries));
+ len = (uint16_t) (sizeof(struct sctp_stream_reset_in_request) + (sizeof(uint16_t) * number_entries));
req_in->ph.param_type = htons(SCTP_STR_RESET_IN_REQUEST);
req_in->ph.param_length = htons(len);
req_in->request_seq = htonl(seq);
@@ -11748,6 +11844,68 @@ sctp_add_stream_reset_result(struct sctp_tmit_chunk *chk,
}
void
+sctp_send_deferred_reset_response(struct sctp_tcb *stcb,
+ struct sctp_stream_reset_list *ent,
+ int response)
+{
+ struct sctp_association *asoc;
+ struct sctp_tmit_chunk *chk;
+ struct sctp_chunkhdr *ch;
+
+ asoc = &stcb->asoc;
+
+ /*
+ * Reset our last reset action to the new one IP -> response
+ * (PERFORMED probably). This assures that if we fail to send, a
+ * retran from the peer will get the new response.
+ */
+ asoc->last_reset_action[0] = response;
+ if (asoc->stream_reset_outstanding) {
+ return;
+ }
+ sctp_alloc_a_chunk(stcb, chk);
+ if (chk == NULL) {
+ SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM);
+ return;
+ }
+ chk->copy_by_ref = 0;
+ chk->rec.chunk_id.id = SCTP_STREAM_RESET;
+ chk->rec.chunk_id.can_take_data = 0;
+ chk->flags = 0;
+ chk->asoc = &stcb->asoc;
+ chk->book_size = sizeof(struct sctp_chunkhdr);
+ chk->send_size = SCTP_SIZE32(chk->book_size);
+ chk->book_size_scale = 0;
+ chk->data = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_DATA);
+ if (chk->data == NULL) {
+ sctp_free_a_chunk(stcb, chk, SCTP_SO_LOCKED);
+ SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM);
+ return;
+ }
+ SCTP_BUF_RESV_UF(chk->data, SCTP_MIN_OVERHEAD);
+ /* setup chunk parameters */
+ chk->sent = SCTP_DATAGRAM_UNSENT;
+ chk->snd_count = 0;
+ if (stcb->asoc.alternate) {
+ chk->whoTo = stcb->asoc.alternate;
+ } else {
+ chk->whoTo = stcb->asoc.primary_destination;
+ }
+ ch = mtod(chk->data, struct sctp_chunkhdr *);
+ ch->chunk_type = SCTP_STREAM_RESET;
+ ch->chunk_flags = 0;
+ ch->chunk_length = htons(chk->book_size);
+ atomic_add_int(&chk->whoTo->ref_count, 1);
+ SCTP_BUF_LEN(chk->data) = chk->send_size;
+ sctp_add_stream_reset_result(chk, ent->seq, response);
+ /* insert the chunk for sending */
+ TAILQ_INSERT_TAIL(&asoc->control_send_queue,
+ chk,
+ sctp_next);
+ asoc->ctrl_queue_cnt++;
+}
+
+void
sctp_add_stream_reset_result_tsn(struct sctp_tmit_chunk *chk,
uint32_t resp_seq, uint32_t result,
uint32_t send_una, uint32_t recv_next)
@@ -11845,19 +12003,90 @@ sctp_add_an_in_stream(struct sctp_tmit_chunk *chk,
}
int
+sctp_send_stream_reset_out_if_possible(struct sctp_tcb *stcb, int so_locked)
+{
+ struct sctp_association *asoc;
+ struct sctp_tmit_chunk *chk;
+ struct sctp_chunkhdr *ch;
+ uint32_t seq;
+
+ asoc = &stcb->asoc;
+ asoc->trigger_reset = 0;
+ if (asoc->stream_reset_outstanding) {
+ return (EALREADY);
+ }
+ sctp_alloc_a_chunk(stcb, chk);
+ if (chk == NULL) {
+ SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM);
+ return (ENOMEM);
+ }
+ chk->copy_by_ref = 0;
+ chk->rec.chunk_id.id = SCTP_STREAM_RESET;
+ chk->rec.chunk_id.can_take_data = 0;
+ chk->flags = 0;
+ chk->asoc = &stcb->asoc;
+ chk->book_size = sizeof(struct sctp_chunkhdr);
+ chk->send_size = SCTP_SIZE32(chk->book_size);
+ chk->book_size_scale = 0;
+ chk->data = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_DATA);
+ if (chk->data == NULL) {
+ sctp_free_a_chunk(stcb, chk, so_locked);
+ SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM);
+ return (ENOMEM);
+ }
+ SCTP_BUF_RESV_UF(chk->data, SCTP_MIN_OVERHEAD);
+
+ /* setup chunk parameters */
+ chk->sent = SCTP_DATAGRAM_UNSENT;
+ chk->snd_count = 0;
+ if (stcb->asoc.alternate) {
+ chk->whoTo = stcb->asoc.alternate;
+ } else {
+ chk->whoTo = stcb->asoc.primary_destination;
+ }
+ ch = mtod(chk->data, struct sctp_chunkhdr *);
+ ch->chunk_type = SCTP_STREAM_RESET;
+ ch->chunk_flags = 0;
+ ch->chunk_length = htons(chk->book_size);
+ atomic_add_int(&chk->whoTo->ref_count, 1);
+ SCTP_BUF_LEN(chk->data) = chk->send_size;
+ seq = stcb->asoc.str_reset_seq_out;
+ if (sctp_add_stream_reset_out(stcb, chk, seq, (stcb->asoc.str_reset_seq_in - 1), (stcb->asoc.sending_seq - 1))) {
+ seq++;
+ asoc->stream_reset_outstanding++;
+ } else {
+ m_freem(chk->data);
+ chk->data = NULL;
+ sctp_free_a_chunk(stcb, chk, so_locked);
+ return (ENOENT);
+ }
+ asoc->str_reset = chk;
+ /* insert the chunk for sending */
+ TAILQ_INSERT_TAIL(&asoc->control_send_queue,
+ chk,
+ sctp_next);
+ asoc->ctrl_queue_cnt++;
+
+ if (stcb->asoc.send_sack) {
+ sctp_send_sack(stcb, so_locked);
+ }
+ sctp_timer_start(SCTP_TIMER_TYPE_STRRESET, stcb->sctp_ep, stcb, chk->whoTo);
+ return (0);
+}
+
+int
sctp_send_str_reset_req(struct sctp_tcb *stcb,
- int number_entries, uint16_t * list,
- uint8_t send_out_req,
+ uint16_t number_entries, uint16_t * list,
uint8_t send_in_req,
uint8_t send_tsn_req,
uint8_t add_stream,
uint16_t adding_o,
uint16_t adding_i, uint8_t peer_asked)
{
-
struct sctp_association *asoc;
struct sctp_tmit_chunk *chk;
struct sctp_chunkhdr *ch;
+ int can_send_out_req = 0;
uint32_t seq;
asoc = &stcb->asoc;
@@ -11868,16 +12097,26 @@ sctp_send_str_reset_req(struct sctp_tcb *stcb,
SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, EBUSY);
return (EBUSY);
}
- if ((send_out_req == 0) && (send_in_req == 0) && (send_tsn_req == 0) &&
+ if ((send_in_req == 0) && (send_tsn_req == 0) &&
(add_stream == 0)) {
/* nothing to do */
SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, EINVAL);
return (EINVAL);
}
- if (send_tsn_req && (send_out_req || send_in_req)) {
+ if (send_tsn_req && send_in_req) {
/* error, can't do that */
SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, EINVAL);
return (EINVAL);
+ } else if (send_in_req) {
+ can_send_out_req = 1;
+ }
+ if (number_entries > (MCLBYTES -
+ SCTP_MIN_OVERHEAD -
+ sizeof(struct sctp_chunkhdr) -
+ sizeof(struct sctp_stream_reset_out_request)) /
+ sizeof(uint16_t)) {
+ SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM);
+ return (ENOMEM);
}
sctp_alloc_a_chunk(stcb, chk);
if (chk == NULL) {
@@ -11887,12 +12126,13 @@ sctp_send_str_reset_req(struct sctp_tcb *stcb,
chk->copy_by_ref = 0;
chk->rec.chunk_id.id = SCTP_STREAM_RESET;
chk->rec.chunk_id.can_take_data = 0;
+ chk->flags = 0;
chk->asoc = &stcb->asoc;
chk->book_size = sizeof(struct sctp_chunkhdr);
chk->send_size = SCTP_SIZE32(chk->book_size);
chk->book_size_scale = 0;
- chk->data = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_DONTWAIT, 1, MT_DATA);
+ chk->data = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_DATA);
if (chk->data == NULL) {
sctp_free_a_chunk(stcb, chk, SCTP_SO_LOCKED);
SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM);
@@ -11916,12 +12156,14 @@ sctp_send_str_reset_req(struct sctp_tcb *stcb,
SCTP_BUF_LEN(chk->data) = chk->send_size;
seq = stcb->asoc.str_reset_seq_out;
- if (send_out_req) {
- sctp_add_stream_reset_out(chk, number_entries, list,
- seq, (stcb->asoc.str_reset_seq_in - 1), (stcb->asoc.sending_seq - 1));
- asoc->stream_reset_out_is_outstanding = 1;
- seq++;
- asoc->stream_reset_outstanding++;
+ if (can_send_out_req) {
+ int ret;
+
+ ret = sctp_add_stream_reset_out(stcb, chk, seq, (stcb->asoc.str_reset_seq_in - 1), (stcb->asoc.sending_seq - 1));
+ if (ret) {
+ seq++;
+ asoc->stream_reset_outstanding++;
+ }
}
if ((add_stream & 1) &&
((stcb->asoc.strm_realoutsize - stcb->asoc.streamoutcnt) < adding_o)) {
@@ -11930,10 +12172,15 @@ sctp_send_str_reset_req(struct sctp_tcb *stcb,
struct sctp_stream_queue_pending *sp, *nsp;
int i;
+#if defined(SCTP_DETAILED_STR_STATS)
+ int j;
+
+#endif
+
oldstream = stcb->asoc.strmout;
/* get some more */
SCTP_MALLOC(stcb->asoc.strmout, struct sctp_stream_out *,
- ((stcb->asoc.streamoutcnt + adding_o) * sizeof(struct sctp_stream_out)),
+ (stcb->asoc.streamoutcnt + adding_o) * sizeof(struct sctp_stream_out),
SCTP_M_STRMO);
if (stcb->asoc.strmout == NULL) {
uint8_t x;
@@ -11953,32 +12200,44 @@ sctp_send_str_reset_req(struct sctp_tcb *stcb,
for (i = 0; i < stcb->asoc.streamoutcnt; i++) {
TAILQ_INIT(&stcb->asoc.strmout[i].outqueue);
stcb->asoc.strmout[i].chunks_on_queues = oldstream[i].chunks_on_queues;
- stcb->asoc.strmout[i].next_sequence_send = oldstream[i].next_sequence_send;
+ stcb->asoc.strmout[i].next_mid_ordered = oldstream[i].next_mid_ordered;
+ stcb->asoc.strmout[i].next_mid_unordered = oldstream[i].next_mid_unordered;
stcb->asoc.strmout[i].last_msg_incomplete = oldstream[i].last_msg_incomplete;
stcb->asoc.strmout[i].stream_no = i;
- stcb->asoc.ss_functions.sctp_ss_init_stream(&stcb->asoc.strmout[i], &oldstream[i]);
+ stcb->asoc.strmout[i].state = oldstream[i].state;
+ /* FIX ME FIX ME */
+ /*
+ * This should be a SS_COPY operation FIX ME STREAM
+ * SCHEDULER EXPERT
+ */
+ stcb->asoc.ss_functions.sctp_ss_init_stream(stcb, &stcb->asoc.strmout[i], &oldstream[i]);
/* now anything on those queues? */
TAILQ_FOREACH_SAFE(sp, &oldstream[i].outqueue, next, nsp) {
TAILQ_REMOVE(&oldstream[i].outqueue, sp, next);
TAILQ_INSERT_TAIL(&stcb->asoc.strmout[i].outqueue, sp, next);
}
- /* Now move assoc pointers too */
- if (stcb->asoc.last_out_stream == &oldstream[i]) {
- stcb->asoc.last_out_stream = &stcb->asoc.strmout[i];
- }
- if (stcb->asoc.locked_on_sending == &oldstream[i]) {
- stcb->asoc.locked_on_sending = &stcb->asoc.strmout[i];
- }
+
}
/* now the new streams */
stcb->asoc.ss_functions.sctp_ss_init(stcb, &stcb->asoc, 1);
for (i = stcb->asoc.streamoutcnt; i < (stcb->asoc.streamoutcnt + adding_o); i++) {
TAILQ_INIT(&stcb->asoc.strmout[i].outqueue);
stcb->asoc.strmout[i].chunks_on_queues = 0;
- stcb->asoc.strmout[i].next_sequence_send = 0x0;
+#if defined(SCTP_DETAILED_STR_STATS)
+ for (j = 0; j < SCTP_PR_SCTP_MAX + 1; j++) {
+ stcb->asoc.strmout[i].abandoned_sent[j] = 0;
+ stcb->asoc.strmout[i].abandoned_unsent[j] = 0;
+ }
+#else
+ stcb->asoc.strmout[i].abandoned_sent[0] = 0;
+ stcb->asoc.strmout[i].abandoned_unsent[0] = 0;
+#endif
+ stcb->asoc.strmout[i].next_mid_ordered = 0;
+ stcb->asoc.strmout[i].next_mid_unordered = 0;
stcb->asoc.strmout[i].stream_no = i;
stcb->asoc.strmout[i].last_msg_incomplete = 0;
- stcb->asoc.ss_functions.sctp_ss_init_stream(&stcb->asoc.strmout[i], NULL);
+ stcb->asoc.ss_functions.sctp_ss_init_stream(stcb, &stcb->asoc.strmout[i], NULL);
+ stcb->asoc.strmout[i].state = SCTP_STREAM_CLOSED;
}
stcb->asoc.strm_realoutsize = stcb->asoc.streamoutcnt + adding_o;
SCTP_FREE(oldstream, SCTP_M_STRMO);
@@ -12012,6 +12271,9 @@ skip_stuff:
chk,
sctp_next);
asoc->ctrl_queue_cnt++;
+ if (stcb->asoc.send_sack) {
+ sctp_send_sack(stcb, SCTP_SO_LOCKED);
+ }
sctp_timer_start(SCTP_TIMER_TYPE_STRRESET, stcb->sctp_ep, stcb, chk->whoTo);
return (0);
}
@@ -12019,7 +12281,7 @@ skip_stuff:
void
sctp_send_abort(struct mbuf *m, int iphlen, struct sockaddr *src, struct sockaddr *dst,
struct sctphdr *sh, uint32_t vtag, struct mbuf *cause,
- uint8_t use_mflowid, uint32_t mflowid,
+ uint8_t mflowtype, uint32_t mflowid, uint16_t fibnum,
uint32_t vrf_id, uint16_t port)
{
/* Don't respond to an ABORT with an ABORT. */
@@ -12029,7 +12291,7 @@ sctp_send_abort(struct mbuf *m, int iphlen, struct sockaddr *src, struct sockadd
return;
}
sctp_send_resp_msg(src, dst, sh, vtag, SCTP_ABORT_ASSOCIATION, cause,
- use_mflowid, mflowid,
+ mflowtype, mflowid, fibnum,
vrf_id, port);
return;
}
@@ -12037,11 +12299,11 @@ sctp_send_abort(struct mbuf *m, int iphlen, struct sockaddr *src, struct sockadd
void
sctp_send_operr_to(struct sockaddr *src, struct sockaddr *dst,
struct sctphdr *sh, uint32_t vtag, struct mbuf *cause,
- uint8_t use_mflowid, uint32_t mflowid,
+ uint8_t mflowtype, uint32_t mflowid, uint16_t fibnum,
uint32_t vrf_id, uint16_t port)
{
sctp_send_resp_msg(src, dst, sh, vtag, SCTP_OPERATION_ERROR, cause,
- use_mflowid, mflowid,
+ mflowtype, mflowid, fibnum,
vrf_id, port);
return;
}
@@ -12073,9 +12335,6 @@ sctp_copy_one(struct sctp_stream_queue_pending *sp,
struct uio *uio,
int resv_upfront)
{
- int left;
-
- left = sp->length;
sp->data = m_uiotombuf(uio, M_WAITOK, sp->length,
resv_upfront, 0);
if (sp->data == NULL) {
@@ -12131,10 +12390,11 @@ sctp_copy_it_in(struct sctp_tcb *stcb,
sp->timetolive = srcv->sinfo_timetolive;
sp->ppid = srcv->sinfo_ppid;
sp->context = srcv->sinfo_context;
+ sp->fsn = 0;
(void)SCTP_GETTIME_TIMEVAL(&sp->ts);
sp->stream = srcv->sinfo_stream;
- sp->length = min(uio->uio_resid, max_send_len);
+ sp->length = (uint32_t) min(uio->uio_resid, max_send_len);
if ((sp->length == (uint32_t) uio->uio_resid) &&
((user_marks_eor == 0) ||
(srcv->sinfo_flags & SCTP_EOF) ||
@@ -12293,7 +12553,7 @@ sctp_lower_sosend(struct socket *so,
SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL);
return (EINVAL);
}
- sndlen = uio->uio_resid;
+ sndlen = (unsigned int)uio->uio_resid;
} else {
top = SCTP_HEADER_TO_CHAIN(i_pak);
sndlen = SCTP_HEADER_LEN(i_pak);
@@ -12381,7 +12641,10 @@ sctp_lower_sosend(struct socket *so,
}
SCTP_INP_RUNLOCK(inp);
} else if (sinfo_assoc_id) {
- stcb = sctp_findassociation_ep_asocid(inp, sinfo_assoc_id, 0);
+ stcb = sctp_findassociation_ep_asocid(inp, sinfo_assoc_id, 1);
+ if (stcb != NULL) {
+ hold_tcblock = 1;
+ }
} else if (addr) {
/*-
* Since we did not use findep we must
@@ -12469,8 +12732,9 @@ sctp_lower_sosend(struct socket *so,
}
#endif
stcb = sctp_aloc_assoc(inp, addr, &error, 0, vrf_id,
- p
- );
+ inp->sctp_ep.pre_open_stream_count,
+ inp->sctp_ep.port,
+ p);
if (stcb == NULL) {
/* Error is setup for us in the call */
goto out_unlocked;
@@ -12504,7 +12768,8 @@ sctp_lower_sosend(struct socket *so,
if (control) {
if (sctp_process_cmsgs_for_init(stcb, control, &error)) {
- sctp_free_assoc(inp, stcb, SCTP_PCBFREE_FORCE, SCTP_FROM_SCTP_OUTPUT + SCTP_LOC_7);
+ sctp_free_assoc(inp, stcb, SCTP_PCBFREE_FORCE,
+ SCTP_FROM_SCTP_OUTPUT + SCTP_LOC_5);
hold_tcblock = 0;
stcb = NULL;
goto out_unlocked;
@@ -12590,12 +12855,24 @@ sctp_lower_sosend(struct socket *so,
SCTP_ASOC_CREATE_UNLOCK(inp);
create_lock_applied = 0;
}
- if (asoc->stream_reset_outstanding) {
+ /* Is the stream no. valid? */
+ if (srcv->sinfo_stream >= asoc->streamoutcnt) {
+ /* Invalid stream number */
+ SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL);
+ error = EINVAL;
+ goto out_unlocked;
+ }
+ if ((asoc->strmout[srcv->sinfo_stream].state != SCTP_STREAM_OPEN) &&
+ (asoc->strmout[srcv->sinfo_stream].state != SCTP_STREAM_OPENING)) {
/*
* Can't queue any data while stream reset is underway.
*/
- SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EAGAIN);
- error = EAGAIN;
+ if (asoc->strmout[srcv->sinfo_stream].state > SCTP_STREAM_OPEN) {
+ error = EAGAIN;
+ } else {
+ error = EINVAL;
+ }
+ SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, error);
goto out_unlocked;
}
if ((SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_WAIT) ||
@@ -12646,7 +12923,7 @@ sctp_lower_sosend(struct socket *so,
if (top) {
struct mbuf *cntm = NULL;
- mm = sctp_get_mbuf_for_msg(sizeof(struct sctp_paramhdr), 0, M_WAIT, 1, MT_DATA);
+ mm = sctp_get_mbuf_for_msg(sizeof(struct sctp_paramhdr), 0, M_WAITOK, 1, MT_DATA);
if (sndlen != 0) {
for (cntm = top; cntm; cntm = SCTP_BUF_NEXT(cntm)) {
tot_out += SCTP_BUF_LEN(cntm);
@@ -12662,7 +12939,7 @@ sctp_lower_sosend(struct socket *so,
error = EMSGSIZE;
goto out;
}
- mm = sctp_get_mbuf_for_msg(tot_demand, 0, M_WAIT, 1, MT_DATA);
+ mm = sctp_get_mbuf_for_msg(tot_demand, 0, M_WAITOK, 1, MT_DATA);
}
if (mm == NULL) {
SCTP_LTRACE_ERR_RET(NULL, stcb, net, SCTP_FROM_SCTP_OUTPUT, ENOMEM);
@@ -12680,7 +12957,7 @@ sctp_lower_sosend(struct socket *so,
/* now move forward the data pointer */
ph = mtod(mm, struct sctp_paramhdr *);
ph->param_type = htons(SCTP_CAUSE_USER_INITIATED_ABT);
- ph->param_length = htons(sizeof(struct sctp_paramhdr) + tot_out);
+ ph->param_length = htons((uint16_t) (sizeof(struct sctp_paramhdr) + tot_out));
ph++;
SCTP_BUF_LEN(mm) = tot_out + sizeof(struct sctp_paramhdr);
if (top == NULL) {
@@ -12736,13 +13013,6 @@ sctp_lower_sosend(struct socket *so,
SCTP_TCB_UNLOCK(stcb);
hold_tcblock = 0;
}
- /* Is the stream no. valid? */
- if (srcv->sinfo_stream >= asoc->streamoutcnt) {
- /* Invalid stream number */
- SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL);
- error = EINVAL;
- goto out_unlocked;
- }
if (asoc->strmout == NULL) {
/* huh? software error */
SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EFAULT);
@@ -12818,6 +13088,7 @@ sctp_lower_sosend(struct socket *so,
asoc, stcb->asoc.total_output_queue_size);
}
if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) {
+ SOCKBUF_UNLOCK(&so->so_snd);
goto out_unlocked;
}
inqueue_bytes = stcb->asoc.total_output_queue_size - (stcb->asoc.chunks_on_out_queue * sizeof(struct sctp_data_chunk));
@@ -12879,8 +13150,10 @@ skip_preblock:
* interrupt.
*/
strm->last_msg_incomplete = 1;
- asoc->stream_locked = 1;
- asoc->stream_locked_on = srcv->sinfo_stream;
+ if (stcb->asoc.idata_supported == 0) {
+ asoc->stream_locked = 1;
+ asoc->stream_locked_on = srcv->sinfo_stream;
+ }
sp->sender_all_done = 0;
}
sctp_snd_sb_alloc(stcb, sp->length);
@@ -12959,7 +13232,9 @@ skip_preblock:
sctp_snd_sb_alloc(stcb, sndout);
atomic_add_int(&sp->length, sndout);
len += sndout;
-
+ if (srcv->sinfo_flags & SCTP_SACK_IMMEDIATELY) {
+ sp->sinfo_flags |= SCTP_SACK_IMMEDIATELY;
+ }
/* Did we reach EOR? */
if ((uio->uio_resid == 0) &&
((user_marks_eor == 0) ||
@@ -12976,7 +13251,7 @@ skip_preblock:
continue;
}
/* PR-SCTP? */
- if ((asoc->peer_supports_prsctp) && (asoc->sent_queue_cnt_removeable > 0)) {
+ if ((asoc->prsctp_supported) && (asoc->sent_queue_cnt_removeable > 0)) {
/*
* This is ugly but we must assure locking
* order
@@ -13038,7 +13313,7 @@ skip_preblock:
/*-
* Ok, Nagle is set on and we have data outstanding.
* Don't send anything and let SACKs drive out the
- * data unless wen have a "full" segment to send.
+ * data unless we have a "full" segment to send.
*/
if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_NAGLE_LOGGING_ENABLE) {
sctp_log_nagle_event(stcb, SCTP_NAGLE_APPLIED);
@@ -13107,7 +13382,7 @@ skip_preblock:
min(SCTP_BASE_SYSCTL(sctp_add_more_threshold), SCTP_SB_LIMIT_SND(so)))) {
if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_BLK_LOGGING_ENABLE) {
sctp_log_block(SCTP_BLOCK_LOG_INTO_BLK,
- asoc, uio->uio_resid);
+ asoc, (size_t)uio->uio_resid);
}
be.error = 0;
stcb->block_entry = &be;
@@ -13136,11 +13411,17 @@ skip_preblock:
}
}
SCTP_TCB_SEND_LOCK(stcb);
+ if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) {
+ SCTP_TCB_SEND_UNLOCK(stcb);
+ goto out_unlocked;
+ }
if (sp) {
if (sp->msg_is_complete == 0) {
strm->last_msg_incomplete = 1;
- asoc->stream_locked = 1;
- asoc->stream_locked_on = srcv->sinfo_stream;
+ if (stcb->asoc.idata_supported == 0) {
+ asoc->stream_locked = 1;
+ asoc->stream_locked_on = srcv->sinfo_stream;
+ }
} else {
sp->sender_all_done = 1;
strm->last_msg_incomplete = 0;
@@ -13176,19 +13457,16 @@ dataless_eof:
/* EOF thing ? */
if ((srcv->sinfo_flags & SCTP_EOF) &&
(got_all_of_the_send == 1)) {
- int cnt;
-
SCTP_STAT_INCR(sctps_sends_with_eof);
error = 0;
if (hold_tcblock == 0) {
SCTP_TCB_LOCK(stcb);
hold_tcblock = 1;
}
- cnt = sctp_is_there_unsent_data(stcb, SCTP_SO_LOCKED);
if (TAILQ_EMPTY(&asoc->send_queue) &&
TAILQ_EMPTY(&asoc->sent_queue) &&
- (cnt == 0)) {
- if (asoc->locked_on_sending) {
+ sctp_is_there_unsent_data(stcb, SCTP_SO_LOCKED) == 0) {
+ if ((*asoc->ss_functions.sctp_ss_is_user_msgs_incomplete) (stcb, asoc)) {
goto abort_anyway;
}
/* there is nothing queued to send, so I'm done... */
@@ -13233,27 +13511,27 @@ dataless_eof:
SCTP_TCB_LOCK(stcb);
hold_tcblock = 1;
}
- if (asoc->locked_on_sending) {
- /* Locked to send out the data */
- struct sctp_stream_queue_pending *sp;
-
- sp = TAILQ_LAST(&asoc->locked_on_sending->outqueue, sctp_streamhead);
- if (sp) {
- if ((sp->length == 0) && (sp->msg_is_complete == 0))
- asoc->state |= SCTP_STATE_PARTIAL_MSG_LEFT;
- }
+ if ((*asoc->ss_functions.sctp_ss_is_user_msgs_incomplete) (stcb, asoc)) {
+ asoc->state |= SCTP_STATE_PARTIAL_MSG_LEFT;
}
asoc->state |= SCTP_STATE_SHUTDOWN_PENDING;
if (TAILQ_EMPTY(&asoc->send_queue) &&
TAILQ_EMPTY(&asoc->sent_queue) &&
(asoc->state & SCTP_STATE_PARTIAL_MSG_LEFT)) {
+ struct mbuf *op_err;
+ char msg[SCTP_DIAG_INFO_LEN];
+
abort_anyway:
if (free_cnt_applied) {
atomic_add_int(&stcb->asoc.refcnt, -1);
free_cnt_applied = 0;
}
+ snprintf(msg, sizeof(msg),
+ "%s:%d at %s", __FILE__, __LINE__, __func__);
+ op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code),
+ msg);
sctp_abort_an_association(stcb->sctp_ep, stcb,
- NULL, SCTP_SO_LOCKED);
+ op_err, SCTP_SO_LOCKED);
/*
* now relock the stcb so everything
* is sane
@@ -13393,13 +13671,6 @@ out_unlocked:
}
}
#endif
-#ifdef INVARIANTS
- if (inp) {
- sctp_validate_no_locks(inp);
- } else {
- SCTP_PRINTF("Warning - inp is NULL so cant validate locks\n");
- }
-#endif
if (top) {
sctp_m_freem(top);
}
@@ -13427,19 +13698,14 @@ sctp_add_auth_chunk(struct mbuf *m, struct mbuf **m_end,
(stcb == NULL))
return (m);
- /* sysctl disabled auth? */
- if (SCTP_BASE_SYSCTL(sctp_auth_disable))
- return (m);
-
- /* peer doesn't do auth... */
- if (!stcb->asoc.peer_supports_auth) {
+ if (stcb->asoc.auth_supported == 0) {
return (m);
}
/* does the requested chunk require auth? */
if (!sctp_auth_is_required_chunk(chunk, stcb->asoc.peer_auth_chunks)) {
return (m);
}
- m_auth = sctp_get_mbuf_for_msg(sizeof(*auth), 0, M_DONTWAIT, 1, MT_HEADER);
+ m_auth = sctp_get_mbuf_for_msg(sizeof(*auth), 0, M_NOWAIT, 1, MT_HEADER);
if (m_auth == NULL) {
/* no mbuf's */
return (m);
@@ -13538,7 +13804,7 @@ sctp_v4src_match_nexthop(struct sctp_ifa *sifa, sctp_route_t * ro)
}
ifa = (struct ifaddr *)sifa->ifa;
mask = (struct sockaddr_in *)(ifa->ifa_netmask);
- sin = (struct sockaddr_in *)&sifa->address.sin;
+ sin = &sifa->address.sin;
srcnetaddr.s_addr = (sin->sin_addr.s_addr & mask->sin_addr.s_addr);
SCTPDBG(SCTP_DEBUG_OUTPUT1, "match_nexthop4: src address is ");
SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, &sifa->address.sa);
diff --git a/freebsd/sys/netinet/sctp_output.h b/freebsd/sys/netinet/sctp_output.h
index 59af5af2..b2441a6f 100644
--- a/freebsd/sys/netinet/sctp_output.h
+++ b/freebsd/sys/netinet/sctp_output.h
@@ -80,7 +80,8 @@ sctp_send_initiate(struct sctp_inpcb *, struct sctp_tcb *, int
);
void
-sctp_send_initiate_ack(struct sctp_inpcb *, struct sctp_tcb *, struct mbuf *,
+sctp_send_initiate_ack(struct sctp_inpcb *, struct sctp_tcb *,
+ struct sctp_nets *, struct mbuf *,
int, int,
struct sockaddr *, struct sockaddr *,
struct sctphdr *, struct sctp_init_chunk *,
@@ -117,7 +118,7 @@ void sctp_send_shutdown_complete(struct sctp_tcb *, struct sctp_nets *, int);
void
sctp_send_shutdown_complete2(struct sockaddr *, struct sockaddr *,
struct sctphdr *,
- uint8_t, uint32_t,
+ uint8_t, uint32_t, uint16_t,
uint32_t, uint16_t);
void sctp_send_asconf(struct sctp_tcb *, struct sctp_nets *, int addr_locked);
@@ -170,30 +171,33 @@ void sctp_send_cwr(struct sctp_tcb *, struct sctp_nets *, uint32_t, uint8_t);
void
-sctp_add_stream_reset_out(struct sctp_tmit_chunk *,
- int, uint16_t *, uint32_t, uint32_t, uint32_t);
+ sctp_add_stream_reset_result(struct sctp_tmit_chunk *, uint32_t, uint32_t);
void
- sctp_add_stream_reset_result(struct sctp_tmit_chunk *, uint32_t, uint32_t);
+sctp_send_deferred_reset_response(struct sctp_tcb *,
+ struct sctp_stream_reset_list *,
+ int);
void
sctp_add_stream_reset_result_tsn(struct sctp_tmit_chunk *,
uint32_t, uint32_t, uint32_t, uint32_t);
+int
+ sctp_send_stream_reset_out_if_possible(struct sctp_tcb *, int);
int
-sctp_send_str_reset_req(struct sctp_tcb *, int, uint16_t *, uint8_t, uint8_t,
- uint8_t, uint8_t, uint16_t, uint16_t, uint8_t);
+sctp_send_str_reset_req(struct sctp_tcb *, uint16_t, uint16_t *,
+ uint8_t, uint8_t, uint8_t, uint16_t, uint16_t, uint8_t);
void
sctp_send_abort(struct mbuf *, int, struct sockaddr *, struct sockaddr *,
struct sctphdr *, uint32_t, struct mbuf *,
- uint8_t, uint32_t,
+ uint8_t, uint32_t, uint16_t,
uint32_t, uint16_t);
void
sctp_send_operr_to(struct sockaddr *, struct sockaddr *,
struct sctphdr *, uint32_t, struct mbuf *,
- uint8_t, uint32_t,
+ uint8_t, uint32_t, uint16_t,
uint32_t, uint16_t);
#endif /* _KERNEL || __Userspace__ */
diff --git a/freebsd/sys/netinet/sctp_pcb.c b/freebsd/sys/netinet/sctp_pcb.c
index 16dc231f..62ef1e3d 100644
--- a/freebsd/sys/netinet/sctp_pcb.c
+++ b/freebsd/sys/netinet/sctp_pcb.c
@@ -48,7 +48,9 @@ __FBSDID("$FreeBSD$");
#include <netinet/sctp_timer.h>
#include <netinet/sctp_bsd_addr.h>
#include <netinet/sctp_dtrace_define.h>
+#if defined(INET) || defined(INET6)
#include <netinet/udp.h>
+#endif
#ifdef INET6
#include <netinet6/ip6_var.h>
#endif
@@ -330,7 +332,7 @@ sctp_mark_ifa_addr_down(uint32_t vrf_id, struct sockaddr *addr,
goto out;
}
if (sctp_ifap->ifn_p == NULL) {
- SCTPDBG(SCTP_DEBUG_PCB4, "IFA has no IFN - can't mark unuseable\n");
+ SCTPDBG(SCTP_DEBUG_PCB4, "IFA has no IFN - can't mark unusable\n");
goto out;
}
if (if_name) {
@@ -374,7 +376,7 @@ sctp_mark_ifa_addr_up(uint32_t vrf_id, struct sockaddr *addr,
goto out;
}
if (sctp_ifap->ifn_p == NULL) {
- SCTPDBG(SCTP_DEBUG_PCB4, "IFA has no IFN - can't mark unuseable\n");
+ SCTPDBG(SCTP_DEBUG_PCB4, "IFA has no IFN - can't mark unusable\n");
goto out;
}
if (if_name) {
@@ -625,7 +627,7 @@ sctp_add_addr_to_vrf(uint32_t vrf_id, void *ifn, uint32_t ifn_index,
{
struct sockaddr_in *sin;
- sin = (struct sockaddr_in *)&sctp_ifap->address.sin;
+ sin = &sctp_ifap->address.sin;
if (SCTP_IFN_IS_IFT_LOOP(sctp_ifap->ifn_p) ||
(IN4_ISLOOPBACK_ADDRESS(&sin->sin_addr))) {
sctp_ifap->src_is_loop = 1;
@@ -645,7 +647,7 @@ sctp_add_addr_to_vrf(uint32_t vrf_id, void *ifn, uint32_t ifn_index,
/* ok to use deprecated addresses? */
struct sockaddr_in6 *sin6;
- sin6 = (struct sockaddr_in6 *)&sctp_ifap->address.sin6;
+ sin6 = &sctp_ifap->address.sin6;
if (SCTP_IFN_IS_IFT_LOOP(sctp_ifap->ifn_p) ||
(IN6_IS_ADDR_LOOPBACK(&sin6->sin6_addr))) {
sctp_ifap->src_is_loop = 1;
@@ -974,7 +976,7 @@ sctp_does_stcb_own_this_addr(struct sctp_tcb *stcb, struct sockaddr *to)
{
struct sockaddr_in *sin, *rsin;
- sin = (struct sockaddr_in *)&laddr->ifa->address.sin;
+ sin = &laddr->ifa->address.sin;
rsin = (struct sockaddr_in *)to;
if (sin->sin_addr.s_addr == rsin->sin_addr.s_addr) {
SCTP_IPI_ADDR_RUNLOCK();
@@ -988,7 +990,7 @@ sctp_does_stcb_own_this_addr(struct sctp_tcb *stcb, struct sockaddr *to)
{
struct sockaddr_in6 *sin6, *rsin6;
- sin6 = (struct sockaddr_in6 *)&laddr->ifa->address.sin6;
+ sin6 = &laddr->ifa->address.sin6;
rsin6 = (struct sockaddr_in6 *)to;
if (SCTP6_ARE_ADDR_EQUAL(sin6, rsin6)) {
SCTP_IPI_ADDR_RUNLOCK();
@@ -1115,7 +1117,7 @@ sctp_tcb_special_locate(struct sctp_inpcb **inp_p, struct sockaddr *from,
LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) {
if (laddr->ifa == NULL) {
- SCTPDBG(SCTP_DEBUG_PCB1, "%s: NULL ifa\n", __FUNCTION__);
+ SCTPDBG(SCTP_DEBUG_PCB1, "%s: NULL ifa\n", __func__);
continue;
}
if (laddr->ifa->localifa_flags & SCTP_BEING_DELETED) {
@@ -1441,9 +1443,6 @@ sctp_findassociation_ep_addr(struct sctp_inpcb **inp_p, struct sockaddr *remote,
}
head = &inp->sctp_tcbhash[SCTP_PCBHASH_ALLADDR(rport,
inp->sctp_hashmark)];
- if (head == NULL) {
- goto null_return;
- }
LIST_FOREACH(stcb, head, sctp_tcbhash) {
if (stcb->rport != rport) {
/* remote port does not match */
@@ -1776,7 +1775,7 @@ sctp_endpoint_probe(struct sockaddr *nam, struct sctppcbhead *head,
LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) {
if (laddr->ifa == NULL) {
SCTPDBG(SCTP_DEBUG_PCB1, "%s: NULL ifa\n",
- __FUNCTION__);
+ __func__);
continue;
}
SCTPDBG(SCTP_DEBUG_PCB1, "Ok laddr->ifa:%p is possible, ",
@@ -1870,7 +1869,7 @@ sctp_swap_inpcb_for_listen(struct sctp_inpcb *inp)
{
/* For 1-2-1 with port reuse */
struct sctppcbhead *head;
- struct sctp_inpcb *tinp;
+ struct sctp_inpcb *tinp, *ninp;
if (sctp_is_feature_off(inp, SCTP_PCB_FLAGS_PORTREUSE)) {
/* only works with port reuse on */
@@ -1880,10 +1879,11 @@ sctp_swap_inpcb_for_listen(struct sctp_inpcb *inp)
return (0);
}
SCTP_INP_RUNLOCK(inp);
+ SCTP_INP_INFO_WLOCK();
head = &SCTP_BASE_INFO(sctp_ephash)[SCTP_PCBHASH_ALLADDR(inp->sctp_lport,
SCTP_BASE_INFO(hashmark))];
/* Kick out all non-listeners to the TCP hash */
- LIST_FOREACH(tinp, head, sctp_hash) {
+ LIST_FOREACH_SAFE(tinp, head, sctp_hash, ninp) {
if (tinp->sctp_lport != inp->sctp_lport) {
continue;
}
@@ -1911,6 +1911,7 @@ sctp_swap_inpcb_for_listen(struct sctp_inpcb *inp)
LIST_INSERT_HEAD(head, inp, sctp_hash);
SCTP_INP_WUNLOCK(inp);
SCTP_INP_RLOCK(inp);
+ SCTP_INP_INFO_WUNLOCK();
return (0);
}
@@ -2166,11 +2167,6 @@ sctp_findassoc_by_vtag(struct sockaddr *from, struct sockaddr *to, uint32_t vtag
SCTP_INP_INFO_RLOCK();
head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(vtag,
SCTP_BASE_INFO(hashasocmark))];
- if (head == NULL) {
- /* invalid vtag */
- SCTP_INP_INFO_RUNLOCK();
- return (NULL);
- }
LIST_FOREACH(stcb, head, sctp_asocs) {
SCTP_INP_RLOCK(stcb->sctp_ep);
if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) {
@@ -2262,7 +2258,6 @@ sctp_findassociation_addr(struct mbuf *m, int offset,
struct sctphdr *sh, struct sctp_chunkhdr *ch,
struct sctp_inpcb **inp_p, struct sctp_nets **netp, uint32_t vrf_id)
{
- int find_tcp_pool;
struct sctp_tcb *stcb;
struct sctp_inpcb *inp;
@@ -2274,21 +2269,13 @@ sctp_findassociation_addr(struct mbuf *m, int offset,
return (stcb);
}
}
- find_tcp_pool = 0;
- if ((ch->chunk_type != SCTP_INITIATION) &&
- (ch->chunk_type != SCTP_INITIATION_ACK) &&
- (ch->chunk_type != SCTP_COOKIE_ACK) &&
- (ch->chunk_type != SCTP_COOKIE_ECHO)) {
- /* Other chunk types go to the tcp pool. */
- find_tcp_pool = 1;
- }
if (inp_p) {
stcb = sctp_findassociation_addr_sa(src, dst, inp_p, netp,
- find_tcp_pool, vrf_id);
+ 1, vrf_id);
inp = *inp_p;
} else {
stcb = sctp_findassociation_addr_sa(src, dst, &inp, netp,
- find_tcp_pool, vrf_id);
+ 1, vrf_id);
}
SCTPDBG(SCTP_DEBUG_PCB1, "stcb:%p inp:%p\n", (void *)stcb, (void *)inp);
if (stcb == NULL && inp) {
@@ -2330,7 +2317,7 @@ sctp_findassociation_ep_asconf(struct mbuf *m, int offset,
struct sctp_inpcb **inp_p, struct sctp_nets **netp, uint32_t vrf_id)
{
struct sctp_tcb *stcb;
- struct sockaddr_storage remote_store;
+ union sctp_sockstore remote_store;
struct sctp_paramhdr parm_buf, *phdr;
int ptype;
int zero_address = 0;
@@ -2349,7 +2336,7 @@ sctp_findassociation_ep_asconf(struct mbuf *m, int offset,
&parm_buf, sizeof(struct sctp_paramhdr));
if (phdr == NULL) {
SCTPDBG(SCTP_DEBUG_INPUT3, "%s: failed to get asconf lookup addr\n",
- __FUNCTION__);
+ __func__);
return NULL;
}
ptype = (int)((uint32_t) ntohs(phdr->param_type));
@@ -2369,10 +2356,10 @@ sctp_findassociation_ep_asconf(struct mbuf *m, int offset,
&p6_buf.ph, sizeof(*p6));
if (p6 == NULL) {
SCTPDBG(SCTP_DEBUG_INPUT3, "%s: failed to get asconf v6 lookup addr\n",
- __FUNCTION__);
+ __func__);
return (NULL);
}
- sin6 = (struct sockaddr_in6 *)&remote_store;
+ sin6 = &remote_store.sin6;
sin6->sin6_family = AF_INET6;
sin6->sin6_len = sizeof(*sin6);
sin6->sin6_port = sh->src_port;
@@ -2396,10 +2383,10 @@ sctp_findassociation_ep_asconf(struct mbuf *m, int offset,
&p4_buf.ph, sizeof(*p4));
if (p4 == NULL) {
SCTPDBG(SCTP_DEBUG_INPUT3, "%s: failed to get asconf v4 lookup addr\n",
- __FUNCTION__);
+ __func__);
return (NULL);
}
- sin = (struct sockaddr_in *)&remote_store;
+ sin = &remote_store.sin;
sin->sin_family = AF_INET;
sin->sin_len = sizeof(*sin);
sin->sin_port = sh->src_port;
@@ -2422,7 +2409,7 @@ sctp_findassociation_ep_asconf(struct mbuf *m, int offset,
}
} else {
stcb = sctp_findassociation_ep_addr(inp_p,
- (struct sockaddr *)&remote_store, netp,
+ &remote_store.sa, netp,
dst, NULL);
}
return (stcb);
@@ -2482,8 +2469,18 @@ sctp_inpcb_alloc(struct socket *so, uint32_t vrf_id)
inp->sctp_associd_counter = 1;
inp->partial_delivery_point = SCTP_SB_LIMIT_RCV(so) >> SCTP_PARTIAL_DELIVERY_SHIFT;
inp->sctp_frag_point = SCTP_DEFAULT_MAXSEGMENT;
+ inp->max_cwnd = 0;
inp->sctp_cmt_on_off = SCTP_BASE_SYSCTL(sctp_cmt_on_off);
- inp->sctp_ecn_enable = SCTP_BASE_SYSCTL(sctp_ecn_enable);
+ inp->ecn_supported = (uint8_t) SCTP_BASE_SYSCTL(sctp_ecn_enable);
+ inp->prsctp_supported = (uint8_t) SCTP_BASE_SYSCTL(sctp_pr_enable);
+ inp->auth_supported = (uint8_t) SCTP_BASE_SYSCTL(sctp_auth_enable);
+ inp->asconf_supported = (uint8_t) SCTP_BASE_SYSCTL(sctp_asconf_enable);
+ inp->reconfig_supported = (uint8_t) SCTP_BASE_SYSCTL(sctp_reconfig_enable);
+ inp->nrsack_supported = (uint8_t) SCTP_BASE_SYSCTL(sctp_nrsack_enable);
+ inp->pktdrop_supported = (uint8_t) SCTP_BASE_SYSCTL(sctp_pktdrop_enable);
+ inp->idata_supported = 0;
+
+ inp->fibnum = so->so_fibnum;
/* init the small hash table we use to track asocid <-> tcb */
inp->sctp_asocidhash = SCTP_HASH_INIT(SCTP_STACK_VTAG_HASH_SIZE, &inp->hashasocidmark);
if (inp->sctp_asocidhash == NULL) {
@@ -2493,14 +2490,7 @@ sctp_inpcb_alloc(struct socket *so, uint32_t vrf_id)
return (ENOBUFS);
}
#ifdef IPSEC
- {
- struct inpcbpolicy *pcb_sp = NULL;
-
- error = ipsec_init_policy(so, &pcb_sp);
- /* Arrange to share the policy */
- inp->ip_inp.inp.inp_sp = pcb_sp;
- ((struct in6pcb *)(&inp->ip_inp.inp))->in6p_sp = pcb_sp;
- }
+ error = ipsec_init_policy(so, &inp->ip_inp.inp.inp_sp);
if (error != 0) {
crfree(inp->ip_inp.inp.inp_cred);
SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_ep), inp);
@@ -2534,6 +2524,9 @@ sctp_inpcb_alloc(struct socket *so, uint32_t vrf_id)
SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EOPNOTSUPP);
so->so_pcb = NULL;
crfree(inp->ip_inp.inp.inp_cred);
+#ifdef IPSEC
+ ipsec_delete_pcbpolicy(&inp->ip_inp.inp);
+#endif
SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_ep), inp);
return (EOPNOTSUPP);
}
@@ -2554,6 +2547,9 @@ sctp_inpcb_alloc(struct socket *so, uint32_t vrf_id)
SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, ENOBUFS);
so->so_pcb = NULL;
crfree(inp->ip_inp.inp.inp_cred);
+#ifdef IPSEC
+ ipsec_delete_pcbpolicy(&inp->ip_inp.inp);
+#endif
SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_ep), inp);
return (ENOBUFS);
}
@@ -2647,12 +2643,15 @@ sctp_inpcb_alloc(struct socket *so, uint32_t vrf_id)
*/
m->local_hmacs = sctp_default_supported_hmaclist();
m->local_auth_chunks = sctp_alloc_chunklist();
+ if (inp->asconf_supported) {
+ sctp_auth_add_chunk(SCTP_ASCONF, m->local_auth_chunks);
+ sctp_auth_add_chunk(SCTP_ASCONF_ACK, m->local_auth_chunks);
+ }
m->default_dscp = 0;
#ifdef INET6
m->default_flowlabel = 0;
#endif
m->port = 0; /* encapsulation disabled by default */
- sctp_auth_set_default_chunks(m->local_auth_chunks);
LIST_INIT(&m->shared_keys);
/* add default NULL key as key id 0 */
null_key = sctp_alloc_sharedkey();
@@ -2786,6 +2785,45 @@ sctp_move_pcb_and_assoc(struct sctp_inpcb *old_inp, struct sctp_inpcb *new_inp,
SCTP_INP_WUNLOCK(old_inp);
}
+/* Obtain an laddr entry from the laddr zone, record ifa and the action code
+ * act in it, bump ifa's refcount, and link the entry at the head of list.
+ * Returns 0 on success, or EINVAL when the zone hands back no entry. */
+static int
+sctp_insert_laddr(struct sctpladdr *list, struct sctp_ifa *ifa, uint32_t act)
+{
+ struct sctp_laddr *laddr;
+
+ laddr = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_laddr), struct sctp_laddr);
+ if (laddr == NULL) {
+ /* no entry available (out of memory?); note the code reported is EINVAL */
+ SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL);
+ return (EINVAL);
+ }
+ SCTP_INCR_LADDR_COUNT();
+ bzero(laddr, sizeof(*laddr));
+ (void)SCTP_GETTIME_TIMEVAL(&laddr->start_time); /* stamp the entry; result deliberately discarded */
+ laddr->ifa = ifa;
+ laddr->action = act;
+ atomic_add_int(&ifa->refcount, 1); /* the pointer stored in the entry is counted on ifa */
+ /* link the new entry at the head of the caller's list */
+ LIST_INSERT_HEAD(list, laddr, sctp_nxt_addr);
+
+ return (0);
+}
+
+/* Unlink an laddr entry from the local address list it is linked on (on an
+ * assoc), call sctp_free_ifa() on the ifa it points to, and hand the entry
+ * back to the laddr zone.  The entry must not be used after this returns. */
+static void
+sctp_remove_laddr(struct sctp_laddr *laddr)
+{
+
+ /* unlink first, then release what the entry refers to */
+ LIST_REMOVE(laddr, sctp_nxt_addr);
+ sctp_free_ifa(laddr->ifa); /* presumably drops the ifa reference -- confirm in sctp_free_ifa() */
+ SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_laddr), laddr);
+ SCTP_DECR_LADDR_COUNT();
+}
@@ -3117,31 +3155,21 @@ continue_anyway:
* too (before adding).
*/
struct sctp_ifa *ifa;
- struct sockaddr_storage store_sa;
+ union sctp_sockstore store;
- memset(&store_sa, 0, sizeof(store_sa));
+ memset(&store, 0, sizeof(store));
switch (addr->sa_family) {
#ifdef INET
case AF_INET:
- {
- struct sockaddr_in *sin;
-
- sin = (struct sockaddr_in *)&store_sa;
- memcpy(sin, addr, sizeof(struct sockaddr_in));
- sin->sin_port = 0;
- break;
- }
+ memcpy(&store.sin, addr, sizeof(struct sockaddr_in));
+ store.sin.sin_port = 0;
+ break;
#endif
#ifdef INET6
case AF_INET6:
- {
- struct sockaddr_in6 *sin6;
-
- sin6 = (struct sockaddr_in6 *)&store_sa;
- memcpy(sin6, addr, sizeof(struct sockaddr_in6));
- sin6->sin6_port = 0;
- break;
- }
+ memcpy(&store.sin6, addr, sizeof(struct sockaddr_in6));
+ store.sin6.sin6_port = 0;
+ break;
#endif
default:
break;
@@ -3159,7 +3187,7 @@ continue_anyway:
* pass things in via the sctp_ifap argument
* (Panda).
*/
- ifa = sctp_find_ifa_by_addr((struct sockaddr *)&store_sa,
+ ifa = sctp_find_ifa_by_addr(&store.sa,
vrf_id, SCTP_ADDR_NOT_LOCKED);
}
if (ifa == NULL) {
@@ -3418,7 +3446,7 @@ sctp_inpcb_free(struct sctp_inpcb *inp, int immediate, int from)
} else if (TAILQ_EMPTY(&asoc->asoc.send_queue) &&
TAILQ_EMPTY(&asoc->asoc.sent_queue) &&
(asoc->asoc.stream_queue_cnt == 0)) {
- if (asoc->asoc.locked_on_sending) {
+ if ((*asoc->asoc.ss_functions.sctp_ss_is_user_msgs_incomplete) (asoc, &asoc->asoc)) {
goto abort_anyway;
}
if ((SCTP_GET_STATE(&asoc->asoc) != SCTP_STATE_SHUTDOWN_SENT) &&
@@ -3450,22 +3478,11 @@ sctp_inpcb_free(struct sctp_inpcb *inp, int immediate, int from)
}
} else {
/* mark into shutdown pending */
- struct sctp_stream_queue_pending *sp;
-
asoc->asoc.state |= SCTP_STATE_SHUTDOWN_PENDING;
sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, asoc->sctp_ep, asoc,
asoc->asoc.primary_destination);
- if (asoc->asoc.locked_on_sending) {
- sp = TAILQ_LAST(&((asoc->asoc.locked_on_sending)->outqueue),
- sctp_streamhead);
- if (sp == NULL) {
- SCTP_PRINTF("Error, sp is NULL, locked on sending is %p strm:%d\n",
- (void *)asoc->asoc.locked_on_sending,
- asoc->asoc.locked_on_sending->stream_no);
- } else {
- if ((sp->length == 0) && (sp->msg_is_complete == 0))
- asoc->asoc.state |= SCTP_STATE_PARTIAL_MSG_LEFT;
- }
+ if ((*asoc->asoc.ss_functions.sctp_ss_is_user_msgs_incomplete) (asoc, &asoc->asoc)) {
+ asoc->asoc.state |= SCTP_STATE_PARTIAL_MSG_LEFT;
}
if (TAILQ_EMPTY(&asoc->asoc.send_queue) &&
TAILQ_EMPTY(&asoc->asoc.sent_queue) &&
@@ -3550,7 +3567,8 @@ sctp_inpcb_free(struct sctp_inpcb *inp, int immediate, int from)
(SCTP_GET_STATE(&asoc->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) {
SCTP_STAT_DECR_GAUGE32(sctps_currestab);
}
- if (sctp_free_assoc(inp, asoc, SCTP_PCBFREE_FORCE, SCTP_FROM_SCTP_PCB + SCTP_LOC_8) == 0) {
+ if (sctp_free_assoc(inp, asoc, SCTP_PCBFREE_FORCE,
+ SCTP_FROM_SCTP_PCB + SCTP_LOC_8) == 0) {
cnt++;
}
}
@@ -3637,8 +3655,7 @@ sctp_inpcb_free(struct sctp_inpcb *inp, int immediate, int from)
* no need to free the net count, since at this point all
* assoc's are gone.
*/
- SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_readq), sq);
- SCTP_DECR_READQ_COUNT();
+ sctp_free_a_readq(NULL, sq);
}
/* Now the sctp_pcb things */
/*
@@ -3646,13 +3663,9 @@ sctp_inpcb_free(struct sctp_inpcb *inp, int immediate, int from)
* macro here since le_next will get freed as part of the
* sctp_free_assoc() call.
*/
- if (so) {
#ifdef IPSEC
- ipsec_delete_pcbpolicy(ip_pcb);
-#endif /* IPSEC */
-
- /* Unlocks not needed since the socket is gone now */
- }
+ ipsec_delete_pcbpolicy(ip_pcb);
+#endif
if (ip_pcb->inp_options) {
(void)sctp_m_free(ip_pcb->inp_options);
ip_pcb->inp_options = 0;
@@ -3746,7 +3759,7 @@ sctp_is_address_on_local_host(struct sockaddr *addr, uint32_t vrf_id)
*/
int
sctp_add_remote_addr(struct sctp_tcb *stcb, struct sockaddr *newaddr,
- struct sctp_nets **netp, int set_scope, int from)
+ struct sctp_nets **netp, uint16_t port, int set_scope, int from)
{
/*
* The following is redundant to the same lines in the
@@ -3799,13 +3812,9 @@ sctp_add_remote_addr(struct sctp_tcb *stcb, struct sockaddr *newaddr,
/* assure len is set */
sin->sin_len = sizeof(struct sockaddr_in);
if (set_scope) {
-#ifdef SCTP_DONT_DO_PRIVADDR_SCOPE
- stcb->asoc.scope.ipv4_local_scope = 1;
-#else
if (IN4_ISPRIVATE_ADDRESS(&sin->sin_addr)) {
stcb->asoc.scope.ipv4_local_scope = 1;
}
-#endif /* SCTP_DONT_DO_PRIVADDR_SCOPE */
} else {
/* Validate the address is in scope */
if ((IN4_ISPRIVATE_ADDRESS(&sin->sin_addr)) &&
@@ -3928,7 +3937,7 @@ sctp_add_remote_addr(struct sctp_tcb *stcb, struct sockaddr *newaddr,
stcb->asoc.numnets++;
net->ref_count = 1;
net->cwr_window_tsn = net->last_cwr_tsn = stcb->asoc.sending_seq - 1;
- net->port = stcb->asoc.port;
+ net->port = port;
net->dscp = stcb->asoc.default_dscp;
#ifdef INET6
net->flowlabel = stcb->asoc.default_flowlabel;
@@ -3960,7 +3969,9 @@ sctp_add_remote_addr(struct sctp_tcb *stcb, struct sockaddr *newaddr,
sin6->sin6_scope_id = 0;
}
#endif
- SCTP_RTALLOC((sctp_route_t *) & net->ro, stcb->asoc.vrf_id);
+ SCTP_RTALLOC((sctp_route_t *) & net->ro,
+ stcb->asoc.vrf_id,
+ stcb->sctp_ep->fibnum);
if (SCTP_ROUTE_HAS_VALID_IFN(&net->ro)) {
/* Get source address */
@@ -3970,9 +3981,14 @@ sctp_add_remote_addr(struct sctp_tcb *stcb, struct sockaddr *newaddr,
net,
0,
stcb->asoc.vrf_id);
- /* Now get the interface MTU */
- if (net->ro._s_addr && net->ro._s_addr->ifn_p) {
- net->mtu = SCTP_GATHER_MTU_FROM_INTFC(net->ro._s_addr->ifn_p);
+ if (net->ro._s_addr != NULL) {
+ net->src_addr_selected = 1;
+ /* Now get the interface MTU */
+ if (net->ro._s_addr->ifn_p != NULL) {
+ net->mtu = SCTP_GATHER_MTU_FROM_INTFC(net->ro._s_addr->ifn_p);
+ }
+ } else {
+ net->src_addr_selected = 0;
}
if (net->mtu > 0) {
uint32_t rmtu;
@@ -3994,6 +4010,8 @@ sctp_add_remote_addr(struct sctp_tcb *stcb, struct sockaddr *newaddr,
net->mtu = rmtu;
}
}
+ } else {
+ net->src_addr_selected = 0;
}
if (net->mtu == 0) {
switch (newaddr->sa_family) {
@@ -4011,14 +4029,16 @@ sctp_add_remote_addr(struct sctp_tcb *stcb, struct sockaddr *newaddr,
break;
}
}
+#if defined(INET) || defined(INET6)
if (net->port) {
net->mtu -= (uint32_t) sizeof(struct udphdr);
}
+#endif
if (from == SCTP_ALLOC_ASOC) {
stcb->asoc.smallest_mtu = net->mtu;
}
if (stcb->asoc.smallest_mtu > net->mtu) {
- stcb->asoc.smallest_mtu = net->mtu;
+ sctp_pathmtu_adjustment(stcb, net->mtu);
}
#ifdef INET6
if (newaddr->sa_family == AF_INET6) {
@@ -4039,14 +4059,11 @@ sctp_add_remote_addr(struct sctp_tcb *stcb, struct sockaddr *newaddr,
*/
net->find_pseudo_cumack = 1;
net->find_rtx_pseudo_cumack = 1;
- net->src_addr_selected = 0;
/* Choose an initial flowid. */
net->flowid = stcb->asoc.my_vtag ^
ntohs(stcb->rport) ^
ntohs(stcb->sctp_ep->sctp_lport);
-#ifdef INVARIANTS
- net->flowidset = 1;
-#endif
+ net->flowtype = M_HASHTYPE_OPAQUE_HASH;
if (netp) {
*netp = net;
}
@@ -4167,6 +4184,7 @@ try_again:
struct sctp_tcb *
sctp_aloc_assoc(struct sctp_inpcb *inp, struct sockaddr *firstaddr,
int *error, uint32_t override_tag, uint32_t vrf_id,
+ uint16_t o_streams, uint16_t port,
struct thread *p
)
{
@@ -4325,7 +4343,7 @@ sctp_aloc_assoc(struct sctp_inpcb *inp, struct sockaddr *firstaddr,
/* setup back pointer's */
stcb->sctp_ep = inp;
stcb->sctp_socket = inp->sctp_socket;
- if ((err = sctp_init_asoc(inp, stcb, override_tag, vrf_id))) {
+ if ((err = sctp_init_asoc(inp, stcb, override_tag, vrf_id, o_streams))) {
/* failed */
SCTP_TCB_LOCK_DESTROY(stcb);
SCTP_TCB_SEND_LOCK_DESTROY(stcb);
@@ -4359,7 +4377,7 @@ sctp_aloc_assoc(struct sctp_inpcb *inp, struct sockaddr *firstaddr,
LIST_INSERT_HEAD(head, stcb, sctp_asocs);
SCTP_INP_INFO_WUNLOCK();
- if ((err = sctp_add_remote_addr(stcb, firstaddr, NULL, SCTP_DO_SETSCOPE, SCTP_ALLOC_ASOC))) {
+ if ((err = sctp_add_remote_addr(stcb, firstaddr, NULL, port, SCTP_DO_SETSCOPE, SCTP_ALLOC_ASOC))) {
/* failure.. memory error? */
if (asoc->strmout) {
SCTP_FREE(asoc->strmout, SCTP_M_STRMO);
@@ -4625,6 +4643,45 @@ sctp_add_vtag_to_timewait(uint32_t tag, uint32_t time, uint16_t lport, uint16_t
}
}
+void
+sctp_clean_up_stream(struct sctp_tcb *stcb, struct sctp_readhead *rh)
+{
+ struct sctp_tmit_chunk *chk, *nchk;
+ struct sctp_queued_to_read *ctl, *nctl;
+ /* Empty the queue rh: unlink each entry, free its pending reassembly chunks, and free the entry unless on_read_q is set. */
+ TAILQ_FOREACH_SAFE(ctl, rh, next_instrm, nctl) {
+ TAILQ_REMOVE(rh, ctl, next_instrm);
+ ctl->on_strm_q = 0;
+ if (ctl->on_read_q == 0) { /* only then are the source-address ref and data mbufs released here */
+ sctp_free_remote_addr(ctl->whoFrom);
+ if (ctl->data) {
+ sctp_m_freem(ctl->data);
+ ctl->data = NULL;
+ }
+ }
+ /* Free every chunk still queued for reassembly on this entry. */
+ TAILQ_FOREACH_SAFE(chk, &ctl->reasm, sctp_next, nchk) {
+ TAILQ_REMOVE(&ctl->reasm, chk, sctp_next);
+ if (chk->data) {
+ sctp_m_freem(chk->data);
+ chk->data = NULL;
+ }
+ if (chk->holds_key_ref)
+ sctp_auth_key_release(stcb, chk->auth_keyid, SCTP_SO_LOCKED);
+ sctp_free_remote_addr(chk->whoTo);
+ SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), chk);
+ SCTP_DECR_CHK_COUNT();
+ /* sa_ignore FREED_MEMORY */
+ }
+ /*
+ * Release the entry itself only when on_read_q is clear; otherwise
+ * it is left allocated (presumably still owned by the read queue).
+ */
+ if (ctl->on_read_q == 0) {
+ sctp_free_a_readq(stcb, ctl);
+ }
+ }
+}
/*-
@@ -4925,7 +4982,9 @@ sctp_free_assoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int from_inpcbfre
outs = &asoc->strmout[i];
/* now clean up any chunks here */
TAILQ_FOREACH_SAFE(sp, &outs->outqueue, next, nsp) {
+ atomic_subtract_int(&asoc->stream_queue_cnt, 1);
TAILQ_REMOVE(&outs->outqueue, sp, next);
+ stcb->asoc.ss_functions.sctp_ss_remove_from_stream(stcb, asoc, outs, sp, 0);
sctp_free_spbufspace(stcb, asoc, sp);
if (sp->data) {
if (so) {
@@ -4962,8 +5021,7 @@ sctp_free_assoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int from_inpcbfre
sq->whoFrom = NULL;
sq->stcb = NULL;
/* Free the ctl entry */
- SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_readq), sq);
- SCTP_DECR_READQ_COUNT();
+ sctp_free_a_readq(stcb, sq);
/* sa_ignore FREED_MEMORY */
}
TAILQ_FOREACH_SAFE(chk, &asoc->free_chunks, sctp_next, nchk) {
@@ -5076,20 +5134,6 @@ sctp_free_assoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int from_inpcbfre
SCTP_DECR_CHK_COUNT();
/* sa_ignore FREED_MEMORY */
}
- TAILQ_FOREACH_SAFE(chk, &asoc->reasmqueue, sctp_next, nchk) {
- TAILQ_REMOVE(&asoc->reasmqueue, chk, sctp_next);
- if (chk->data) {
- sctp_m_freem(chk->data);
- chk->data = NULL;
- }
- if (chk->holds_key_ref)
- sctp_auth_key_release(stcb, chk->auth_keyid, SCTP_SO_LOCKED);
- sctp_free_remote_addr(chk->whoTo);
- SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), chk);
- SCTP_DECR_CHK_COUNT();
- /* sa_ignore FREED_MEMORY */
- }
-
if (asoc->mapping_array) {
SCTP_FREE(asoc->mapping_array, SCTP_M_MAP);
asoc->mapping_array = NULL;
@@ -5105,23 +5149,9 @@ sctp_free_assoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int from_inpcbfre
}
asoc->strm_realoutsize = asoc->streamoutcnt = 0;
if (asoc->strmin) {
- struct sctp_queued_to_read *ctl, *nctl;
-
for (i = 0; i < asoc->streamincnt; i++) {
- TAILQ_FOREACH_SAFE(ctl, &asoc->strmin[i].inqueue, next, nctl) {
- TAILQ_REMOVE(&asoc->strmin[i].inqueue, ctl, next);
- sctp_free_remote_addr(ctl->whoFrom);
- if (ctl->data) {
- sctp_m_freem(ctl->data);
- ctl->data = NULL;
- }
- /*
- * We don't free the address here since all
- * the net's were freed above.
- */
- SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_readq), ctl);
- SCTP_DECR_READQ_COUNT();
- }
+ sctp_clean_up_stream(stcb, &asoc->strmin[i].inqueue);
+ sctp_clean_up_stream(stcb, &asoc->strmin[i].uno_inqueue);
}
SCTP_FREE(asoc->strmin, SCTP_M_STRMI);
asoc->strmin = NULL;
@@ -5302,7 +5332,7 @@ sctp_update_ep_vflag(struct sctp_inpcb *inp)
LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) {
if (laddr->ifa == NULL) {
SCTPDBG(SCTP_DEBUG_PCB1, "%s: NULL ifa\n",
- __FUNCTION__);
+ __func__);
continue;
}
if (laddr->ifa->localifa_flags & SCTP_BEING_DELETED) {
@@ -5333,6 +5363,7 @@ void
sctp_add_local_addr_ep(struct sctp_inpcb *inp, struct sctp_ifa *ifa, uint32_t action)
{
struct sctp_laddr *laddr;
+ struct sctp_tcb *stcb;
int fnd, error = 0;
fnd = 0;
@@ -5378,6 +5409,9 @@ sctp_add_local_addr_ep(struct sctp_inpcb *inp, struct sctp_ifa *ifa, uint32_t ac
default:
break;
}
+ LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) {
+ sctp_add_local_addr_restricted(stcb, ifa);
+ }
}
return;
}
@@ -5408,7 +5442,7 @@ sctp_select_primary_destination(struct sctp_tcb *stcb)
/*
- * Delete the address from the endpoint local address list There is nothing
+ * Delete the address from the endpoint local address list. There is nothing
* to be done if we are bound to all addresses
*/
void
@@ -5459,8 +5493,7 @@ sctp_del_local_addr_ep(struct sctp_inpcb *inp, struct sctp_ifa *ifa)
* to laddr
*/
TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
- if (net->ro._s_addr &&
- (net->ro._s_addr->ifa == laddr->ifa)) {
+ if (net->ro._s_addr == laddr->ifa) {
/* Yep, purge src address selected */
sctp_rtentry_t *rt;
@@ -5524,46 +5557,6 @@ sctp_add_local_addr_restricted(struct sctp_tcb *stcb, struct sctp_ifa *ifa)
}
/*
- * insert an laddr entry with the given ifa for the desired list
- */
-int
-sctp_insert_laddr(struct sctpladdr *list, struct sctp_ifa *ifa, uint32_t act)
-{
- struct sctp_laddr *laddr;
-
- laddr = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_laddr), struct sctp_laddr);
- if (laddr == NULL) {
- /* out of memory? */
- SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL);
- return (EINVAL);
- }
- SCTP_INCR_LADDR_COUNT();
- bzero(laddr, sizeof(*laddr));
- (void)SCTP_GETTIME_TIMEVAL(&laddr->start_time);
- laddr->ifa = ifa;
- laddr->action = act;
- atomic_add_int(&ifa->refcount, 1);
- /* insert it */
- LIST_INSERT_HEAD(list, laddr, sctp_nxt_addr);
-
- return (0);
-}
-
-/*
- * Remove an laddr entry from the local address list (on an assoc)
- */
-void
-sctp_remove_laddr(struct sctp_laddr *laddr)
-{
-
- /* remove from the list */
- LIST_REMOVE(laddr, sctp_nxt_addr);
- sctp_free_ifa(laddr->ifa);
- SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_laddr), laddr);
- SCTP_DECR_LADDR_COUNT();
-}
-
-/*
* Remove a local address from the TCB local address restricted list
*/
void
@@ -5774,7 +5767,7 @@ sctp_pcb_init()
{
/*
* SCTP initialization for the PCB structures should be called by
- * the sctp_init() funciton.
+ * the sctp_init() function.
*/
int i;
struct timeval tv;
@@ -5933,12 +5926,32 @@ sctp_pcb_finish(void)
int i;
struct sctp_iterator *it, *nit;
+ if (SCTP_BASE_VAR(sctp_pcb_initialized) == 0) {
+ SCTP_PRINTF("%s: race condition on teardown.\n", __func__);
+ return;
+ }
+ SCTP_BASE_VAR(sctp_pcb_initialized) = 0;
/*
* In FreeBSD the iterator thread never exits but we do clean up.
* The only way FreeBSD reaches here is if we have VRF's but we
* still add the ifdef to make it compile on old versions.
*/
+retry:
SCTP_IPI_ITERATOR_WQ_LOCK();
+ /*
+ * sctp_iterator_worker() might be working on an it entry without
+ * holding the lock. We won't find it on the list either and
+ * continue and free/destroy it. While holding the lock, spin, to
+ * avoid the race condition as sctp_iterator_worker() will have to
+ * wait to re-acquire the lock.
+ */
+ if (sctp_it_ctl.iterator_running != 0 || sctp_it_ctl.cur_it != NULL) {
+ SCTP_IPI_ITERATOR_WQ_UNLOCK();
+ SCTP_PRINTF("%s: Iterator running while we held the lock. Retry. "
+ "cur_it=%p\n", __func__, sctp_it_ctl.cur_it);
+ DELAY(10);
+ goto retry;
+ }
TAILQ_FOREACH_SAFE(it, &sctp_it_ctl.iteratorhead, sctp_nxt_itr, nit) {
if (it->vn != curvnet) {
continue;
@@ -5956,11 +5969,14 @@ sctp_pcb_finish(void)
sctp_it_ctl.iterator_flags |= SCTP_ITERATOR_STOP_CUR_IT;
}
SCTP_ITERATOR_UNLOCK();
- SCTP_OS_TIMER_STOP(&SCTP_BASE_INFO(addr_wq_timer.timer));
+ SCTP_OS_TIMER_STOP_DRAIN(&SCTP_BASE_INFO(addr_wq_timer.timer));
SCTP_WQ_ADDR_LOCK();
LIST_FOREACH_SAFE(wi, &SCTP_BASE_INFO(addr_wq), sctp_nxt_addr, nwi) {
LIST_REMOVE(wi, sctp_nxt_addr);
SCTP_DECR_LADDR_COUNT();
+ if (wi->action == SCTP_DEL_IP_ADDRESS) {
+ SCTP_FREE(wi->ifa, SCTP_M_IFA);
+ }
SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_laddr), wi);
}
SCTP_WQ_ADDR_UNLOCK();
@@ -6020,6 +6036,14 @@ sctp_pcb_finish(void)
SCTP_WQ_ADDR_DESTROY();
+ /* Get rid of other stuff too. */
+ if (SCTP_BASE_INFO(sctp_asochash) != NULL)
+ SCTP_HASH_FREE(SCTP_BASE_INFO(sctp_asochash), SCTP_BASE_INFO(hashasocmark));
+ if (SCTP_BASE_INFO(sctp_ephash) != NULL)
+ SCTP_HASH_FREE(SCTP_BASE_INFO(sctp_ephash), SCTP_BASE_INFO(hashmark));
+ if (SCTP_BASE_INFO(sctp_tcpephash) != NULL)
+ SCTP_HASH_FREE(SCTP_BASE_INFO(sctp_tcpephash), SCTP_BASE_INFO(hashtcpmark));
+
SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_ep));
SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_asoc));
SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_laddr));
@@ -6029,13 +6053,6 @@ sctp_pcb_finish(void)
SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_strmoq));
SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_asconf));
SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_asconf_ack));
- /* Get rid of other stuff to */
- if (SCTP_BASE_INFO(sctp_asochash) != NULL)
- SCTP_HASH_FREE(SCTP_BASE_INFO(sctp_asochash), SCTP_BASE_INFO(hashasocmark));
- if (SCTP_BASE_INFO(sctp_ephash) != NULL)
- SCTP_HASH_FREE(SCTP_BASE_INFO(sctp_ephash), SCTP_BASE_INFO(hashmark));
- if (SCTP_BASE_INFO(sctp_tcpephash) != NULL)
- SCTP_HASH_FREE(SCTP_BASE_INFO(sctp_tcpephash), SCTP_BASE_INFO(hashtcpmark));
#if defined(__FreeBSD__) && defined(SMP) && defined(SCTP_USE_PERCPU_STAT)
SCTP_FREE(SCTP_BASE_STATS, SCTP_M_MCORE);
#endif
@@ -6046,7 +6063,7 @@ int
sctp_load_addresses_from_init(struct sctp_tcb *stcb, struct mbuf *m,
int offset, int limit,
struct sockaddr *src, struct sockaddr *dst,
- struct sockaddr *altsa)
+ struct sockaddr *altsa, uint16_t port)
{
/*
* grub through the INIT pulling addresses and loading them to the
@@ -6075,7 +6092,15 @@ sctp_load_addresses_from_init(struct sctp_tcb *stcb, struct mbuf *m,
sctp_key_t *new_key;
uint32_t keylen;
int got_random = 0, got_hmacs = 0, got_chklist = 0;
- uint8_t ecn_allowed;
+ uint8_t peer_supports_ecn;
+ uint8_t peer_supports_prsctp;
+ uint8_t peer_supports_auth;
+ uint8_t peer_supports_asconf;
+ uint8_t peer_supports_asconf_ack;
+ uint8_t peer_supports_reconfig;
+ uint8_t peer_supports_nrsack;
+ uint8_t peer_supports_pktdrop;
+ uint8_t peer_supports_idata;
#ifdef INET
struct sockaddr_in sin;
@@ -6104,8 +6129,14 @@ sctp_load_addresses_from_init(struct sctp_tcb *stcb, struct mbuf *m,
} else {
sa = src;
}
- /* Turn off ECN until we get through all params */
- ecn_allowed = 0;
+ peer_supports_idata = 0;
+ peer_supports_ecn = 0;
+ peer_supports_prsctp = 0;
+ peer_supports_auth = 0;
+ peer_supports_asconf = 0;
+ peer_supports_reconfig = 0;
+ peer_supports_nrsack = 0;
+ peer_supports_pktdrop = 0;
TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
/* mark all addresses that we have currently on the list */
net->dest_state |= SCTP_ADDR_NOT_IN_ASSOC;
@@ -6123,7 +6154,7 @@ sctp_load_addresses_from_init(struct sctp_tcb *stcb, struct mbuf *m,
#ifdef INET
case AF_INET:
if (stcb->asoc.scope.ipv4_addr_legal) {
- if (sctp_add_remote_addr(stcb, sa, NULL, SCTP_DONOT_SETSCOPE, SCTP_LOAD_ADDR_2)) {
+ if (sctp_add_remote_addr(stcb, sa, NULL, port, SCTP_DONOT_SETSCOPE, SCTP_LOAD_ADDR_2)) {
return (-1);
}
}
@@ -6132,7 +6163,7 @@ sctp_load_addresses_from_init(struct sctp_tcb *stcb, struct mbuf *m,
#ifdef INET6
case AF_INET6:
if (stcb->asoc.scope.ipv6_addr_legal) {
- if (sctp_add_remote_addr(stcb, sa, NULL, SCTP_DONOT_SETSCOPE, SCTP_LOAD_ADDR_3)) {
+ if (sctp_add_remote_addr(stcb, sa, NULL, port, SCTP_DONOT_SETSCOPE, SCTP_LOAD_ADDR_3)) {
return (-2);
}
}
@@ -6155,12 +6186,6 @@ sctp_load_addresses_from_init(struct sctp_tcb *stcb, struct mbuf *m,
/* the assoc was freed? */
return (-4);
}
- /*
- * peer must explicitly turn this on. This may have been initialized
- * to be "on" in order to allow local addr changes while INIT's are
- * in flight.
- */
- stcb->asoc.peer_supports_asconf = 0;
/* now we must go through each of the params. */
phdr = sctp_get_next_param(m, offset, &parm_buf, sizeof(parm_buf));
while (phdr) {
@@ -6223,7 +6248,7 @@ sctp_load_addresses_from_init(struct sctp_tcb *stcb, struct mbuf *m,
/* the assoc was freed? */
return (-7);
}
- if (sctp_add_remote_addr(stcb, sa, NULL, SCTP_DONOT_SETSCOPE, SCTP_LOAD_ADDR_4)) {
+ if (sctp_add_remote_addr(stcb, sa, NULL, port, SCTP_DONOT_SETSCOPE, SCTP_LOAD_ADDR_4)) {
return (-8);
}
} else if (stcb_tmp == stcb) {
@@ -6243,12 +6268,20 @@ sctp_load_addresses_from_init(struct sctp_tcb *stcb, struct mbuf *m,
*/
if (stcb_tmp) {
if (SCTP_GET_STATE(&stcb_tmp->asoc) & SCTP_STATE_COOKIE_WAIT) {
+ struct mbuf *op_err;
+ char msg[SCTP_DIAG_INFO_LEN];
+
/*
* in setup state we
* abort this guy
*/
+ snprintf(msg, sizeof(msg),
+ "%s:%d at %s", __FILE__, __LINE__, __func__);
+ op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code),
+ msg);
sctp_abort_an_association(stcb_tmp->sctp_ep,
- stcb_tmp, NULL, SCTP_SO_NOT_LOCKED);
+ stcb_tmp, op_err,
+ SCTP_SO_NOT_LOCKED);
goto add_it_now;
}
SCTP_TCB_UNLOCK(stcb_tmp);
@@ -6310,7 +6343,7 @@ sctp_load_addresses_from_init(struct sctp_tcb *stcb, struct mbuf *m,
* we must add the address, no scope
* set
*/
- if (sctp_add_remote_addr(stcb, sa, NULL, SCTP_DONOT_SETSCOPE, SCTP_LOAD_ADDR_5)) {
+ if (sctp_add_remote_addr(stcb, sa, NULL, port, SCTP_DONOT_SETSCOPE, SCTP_LOAD_ADDR_5)) {
return (-17);
}
} else if (stcb_tmp == stcb) {
@@ -6332,18 +6365,26 @@ sctp_load_addresses_from_init(struct sctp_tcb *stcb, struct mbuf *m,
* strange, address is in another
* assoc? straighten out locks.
*/
- if (stcb_tmp)
+ if (stcb_tmp) {
if (SCTP_GET_STATE(&stcb_tmp->asoc) & SCTP_STATE_COOKIE_WAIT) {
+ struct mbuf *op_err;
+ char msg[SCTP_DIAG_INFO_LEN];
+
/*
* in setup state we
* abort this guy
*/
+ snprintf(msg, sizeof(msg),
+ "%s:%d at %s", __FILE__, __LINE__, __func__);
+ op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code),
+ msg);
sctp_abort_an_association(stcb_tmp->sctp_ep,
- stcb_tmp, NULL, SCTP_SO_NOT_LOCKED);
+ stcb_tmp, op_err,
+ SCTP_SO_NOT_LOCKED);
goto add_it_now6;
}
- SCTP_TCB_UNLOCK(stcb_tmp);
-
+ SCTP_TCB_UNLOCK(stcb_tmp);
+ }
if (stcb->asoc.state == 0) {
/* the assoc was freed? */
return (-21);
@@ -6354,7 +6395,7 @@ sctp_load_addresses_from_init(struct sctp_tcb *stcb, struct mbuf *m,
} else
#endif
if (ptype == SCTP_ECN_CAPABLE) {
- ecn_allowed = 1;
+ peer_supports_ecn = 1;
} else if (ptype == SCTP_ULP_ADAPTATION) {
if (stcb->asoc.state != SCTP_STATE_OPEN) {
struct sctp_adaptation_layer_indication ai,
@@ -6378,7 +6419,9 @@ sctp_load_addresses_from_init(struct sctp_tcb *stcb, struct mbuf *m,
#endif
- stcb->asoc.peer_supports_asconf = 1;
+ if (stcb->asoc.asconf_supported == 0) {
+ return (-100);
+ }
if (plen > sizeof(lstore)) {
return (-23);
}
@@ -6430,7 +6473,7 @@ sctp_load_addresses_from_init(struct sctp_tcb *stcb, struct mbuf *m,
stcb->asoc.peer_supports_nat = 1;
} else if (ptype == SCTP_PRSCTP_SUPPORTED) {
/* Peer supports pr-sctp */
- stcb->asoc.peer_supports_prsctp = 1;
+ peer_supports_prsctp = 1;
} else if (ptype == SCTP_SUPPORTED_CHUNK_EXT) {
/* A supported extension chunk */
struct sctp_supported_chunk_types_param *pr_supported;
@@ -6442,34 +6485,33 @@ sctp_load_addresses_from_init(struct sctp_tcb *stcb, struct mbuf *m,
if (phdr == NULL) {
return (-25);
}
- stcb->asoc.peer_supports_asconf = 0;
- stcb->asoc.peer_supports_prsctp = 0;
- stcb->asoc.peer_supports_pktdrop = 0;
- stcb->asoc.peer_supports_strreset = 0;
- stcb->asoc.peer_supports_nr_sack = 0;
- stcb->asoc.peer_supports_auth = 0;
pr_supported = (struct sctp_supported_chunk_types_param *)phdr;
num_ent = plen - sizeof(struct sctp_paramhdr);
for (i = 0; i < num_ent; i++) {
switch (pr_supported->chunk_types[i]) {
case SCTP_ASCONF:
+ peer_supports_asconf = 1;
+ break;
case SCTP_ASCONF_ACK:
- stcb->asoc.peer_supports_asconf = 1;
+ peer_supports_asconf_ack = 1;
break;
case SCTP_FORWARD_CUM_TSN:
- stcb->asoc.peer_supports_prsctp = 1;
+ peer_supports_prsctp = 1;
break;
case SCTP_PACKET_DROPPED:
- stcb->asoc.peer_supports_pktdrop = 1;
+ peer_supports_pktdrop = 1;
break;
case SCTP_NR_SELECTIVE_ACK:
- stcb->asoc.peer_supports_nr_sack = 1;
+ peer_supports_nrsack = 1;
break;
case SCTP_STREAM_RESET:
- stcb->asoc.peer_supports_strreset = 1;
+ peer_supports_reconfig = 1;
break;
case SCTP_AUTHENTICATION:
- stcb->asoc.peer_supports_auth = 1;
+ peer_supports_auth = 1;
+ break;
+ case SCTP_IDATA:
+ peer_supports_idata = 1;
break;
default:
/* one I have not learned yet */
@@ -6498,8 +6540,8 @@ sctp_load_addresses_from_init(struct sctp_tcb *stcb, struct mbuf *m,
}
got_random = 1;
} else if (ptype == SCTP_HMAC_LIST) {
- int num_hmacs;
- int i;
+ uint16_t num_hmacs;
+ uint16_t i;
if (plen > sizeof(hmacs_store))
break;
@@ -6606,24 +6648,51 @@ next_param:
}
}
}
- if (ecn_allowed == 0) {
- stcb->asoc.ecn_allowed = 0;
+ if ((stcb->asoc.ecn_supported == 1) &&
+ (peer_supports_ecn == 0)) {
+ stcb->asoc.ecn_supported = 0;
}
- /* validate authentication required parameters */
- if (got_random && got_hmacs) {
- stcb->asoc.peer_supports_auth = 1;
- } else {
- stcb->asoc.peer_supports_auth = 0;
+ if ((stcb->asoc.prsctp_supported == 1) &&
+ (peer_supports_prsctp == 0)) {
+ stcb->asoc.prsctp_supported = 0;
+ }
+ if ((stcb->asoc.auth_supported == 1) &&
+ ((peer_supports_auth == 0) ||
+ (got_random == 0) || (got_hmacs == 0))) {
+ stcb->asoc.auth_supported = 0;
+ }
+ if ((stcb->asoc.asconf_supported == 1) &&
+ ((peer_supports_asconf == 0) || (peer_supports_asconf_ack == 0) ||
+ (stcb->asoc.auth_supported == 0) ||
+ (saw_asconf == 0) || (saw_asconf_ack == 0))) {
+ stcb->asoc.asconf_supported = 0;
+ }
+ if ((stcb->asoc.reconfig_supported == 1) &&
+ (peer_supports_reconfig == 0)) {
+ stcb->asoc.reconfig_supported = 0;
}
- if (!stcb->asoc.peer_supports_auth && got_chklist) {
+ if ((stcb->asoc.idata_supported == 1) &&
+ (peer_supports_idata == 0)) {
+ stcb->asoc.idata_supported = 0;
+ }
+ if ((stcb->asoc.nrsack_supported == 1) &&
+ (peer_supports_nrsack == 0)) {
+ stcb->asoc.nrsack_supported = 0;
+ }
+ if ((stcb->asoc.pktdrop_supported == 1) &&
+ (peer_supports_pktdrop == 0)) {
+ stcb->asoc.pktdrop_supported = 0;
+ }
+ /* validate authentication required parameters */
+ if ((peer_supports_auth == 0) && (got_chklist == 1)) {
/* peer does not support auth but sent a chunks list? */
return (-31);
}
- if (!SCTP_BASE_SYSCTL(sctp_asconf_auth_nochk) && stcb->asoc.peer_supports_asconf &&
- !stcb->asoc.peer_supports_auth) {
+ if ((peer_supports_asconf == 1) && (peer_supports_auth == 0)) {
/* peer supports asconf but not auth? */
return (-32);
- } else if ((stcb->asoc.peer_supports_asconf) && (stcb->asoc.peer_supports_auth) &&
+ } else if ((peer_supports_asconf == 1) &&
+ (peer_supports_auth == 1) &&
((saw_asconf == 0) || (saw_asconf_ack == 0))) {
return (-33);
}
@@ -6718,10 +6787,6 @@ sctp_is_vtag_good(uint32_t tag, uint16_t lport, uint16_t rport, struct timeval *
SCTP_INP_INFO_RLOCK();
head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(tag,
SCTP_BASE_INFO(hashasocmark))];
- if (head == NULL) {
- /* invalid vtag */
- goto skip_vtag_check;
- }
LIST_FOREACH(stcb, head, sctp_asocs) {
/*
* We choose not to lock anything here. TCB's can't be
@@ -6745,8 +6810,6 @@ sctp_is_vtag_good(uint32_t tag, uint16_t lport, uint16_t rport, struct timeval *
return (0);
}
}
-skip_vtag_check:
-
chain = &SCTP_BASE_INFO(vtag_timewait)[(tag % SCTP_STACK_VTAG_HASH_SIZE)];
/* Now what about timed wait ? */
LIST_FOREACH(twait_block, chain, sctp_nxt_tagblock) {
@@ -6803,26 +6866,15 @@ sctp_drain_mbufs(struct sctp_tcb *stcb)
SCTP_STAT_INCR(sctps_protocol_drains_done);
cumulative_tsn_p1 = asoc->cumulative_tsn + 1;
cnt = 0;
- /* First look in the re-assembly queue */
- TAILQ_FOREACH_SAFE(chk, &asoc->reasmqueue, sctp_next, nchk) {
- if (SCTP_TSN_GT(chk->rec.data.TSN_seq, cumulative_tsn_p1)) {
- /* Yep it is above cum-ack */
- cnt++;
- SCTP_CALC_TSN_TO_GAP(gap, chk->rec.data.TSN_seq, asoc->mapping_array_base_tsn);
- asoc->size_on_reasm_queue = sctp_sbspace_sub(asoc->size_on_reasm_queue, chk->send_size);
- sctp_ucount_decr(asoc->cnt_on_reasm_queue);
- SCTP_UNSET_TSN_PRESENT(asoc->mapping_array, gap);
- TAILQ_REMOVE(&asoc->reasmqueue, chk, sctp_next);
- if (chk->data) {
- sctp_m_freem(chk->data);
- chk->data = NULL;
- }
- sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED);
- }
- }
/* Ok that was fun, now we will drain all the inbound streams? */
for (strmat = 0; strmat < asoc->streamincnt; strmat++) {
- TAILQ_FOREACH_SAFE(ctl, &asoc->strmin[strmat].inqueue, next, nctl) {
+ TAILQ_FOREACH_SAFE(ctl, &asoc->strmin[strmat].inqueue, next_instrm, nctl) {
+#ifdef INVARIANTS
+ if (ctl->on_strm_q != SCTP_ON_ORDERED) {
+ panic("Huh control: %p on_q: %d -- not ordered?",
+ ctl, ctl->on_strm_q);
+ }
+#endif
if (SCTP_TSN_GT(ctl->sinfo_tsn, cumulative_tsn_p1)) {
/* Yep it is above cum-ack */
cnt++;
@@ -6830,14 +6882,74 @@ sctp_drain_mbufs(struct sctp_tcb *stcb)
asoc->size_on_all_streams = sctp_sbspace_sub(asoc->size_on_all_streams, ctl->length);
sctp_ucount_decr(asoc->cnt_on_all_streams);
SCTP_UNSET_TSN_PRESENT(asoc->mapping_array, gap);
- TAILQ_REMOVE(&asoc->strmin[strmat].inqueue, ctl, next);
+ if (ctl->on_read_q) {
+ TAILQ_REMOVE(&stcb->sctp_ep->read_queue, ctl, next);
+ ctl->on_read_q = 0;
+ }
+ TAILQ_REMOVE(&asoc->strmin[strmat].inqueue, ctl, next_instrm);
+ ctl->on_strm_q = 0;
if (ctl->data) {
sctp_m_freem(ctl->data);
ctl->data = NULL;
}
sctp_free_remote_addr(ctl->whoFrom);
- SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_readq), ctl);
- SCTP_DECR_READQ_COUNT();
+ /* Now its reasm? */
+ TAILQ_FOREACH_SAFE(chk, &ctl->reasm, sctp_next, nchk) {
+ cnt++;
+ SCTP_CALC_TSN_TO_GAP(gap, chk->rec.data.TSN_seq, asoc->mapping_array_base_tsn);
+ asoc->size_on_reasm_queue = sctp_sbspace_sub(asoc->size_on_reasm_queue, chk->send_size);
+ sctp_ucount_decr(asoc->cnt_on_reasm_queue);
+ SCTP_UNSET_TSN_PRESENT(asoc->mapping_array, gap);
+ TAILQ_REMOVE(&ctl->reasm, chk, sctp_next);
+ if (chk->data) {
+ sctp_m_freem(chk->data);
+ chk->data = NULL;
+ }
+ sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED);
+ }
+ sctp_free_a_readq(stcb, ctl);
+ }
+ }
+ TAILQ_FOREACH_SAFE(ctl, &asoc->strmin[strmat].uno_inqueue, next_instrm, nctl) {
+#ifdef INVARIANTS
+ if (ctl->on_strm_q != SCTP_ON_UNORDERED) {
+ panic("Huh control: %p on_q: %d -- not unordered?",
+ ctl, ctl->on_strm_q);
+ }
+#endif
+ if (SCTP_TSN_GT(ctl->sinfo_tsn, cumulative_tsn_p1)) {
+ /* Yep it is above cum-ack */
+ cnt++;
+ SCTP_CALC_TSN_TO_GAP(gap, ctl->sinfo_tsn, asoc->mapping_array_base_tsn);
+ asoc->size_on_all_streams = sctp_sbspace_sub(asoc->size_on_all_streams, ctl->length);
+ sctp_ucount_decr(asoc->cnt_on_all_streams);
+ SCTP_UNSET_TSN_PRESENT(asoc->mapping_array, gap);
+ if (ctl->on_read_q) {
+ TAILQ_REMOVE(&stcb->sctp_ep->read_queue, ctl, next);
+ ctl->on_read_q = 0;
+ }
+ TAILQ_REMOVE(&asoc->strmin[strmat].uno_inqueue, ctl, next_instrm);
+ ctl->on_strm_q = 0;
+ if (ctl->data) {
+ sctp_m_freem(ctl->data);
+ ctl->data = NULL;
+ }
+ sctp_free_remote_addr(ctl->whoFrom);
+ /* Now its reasm? */
+ TAILQ_FOREACH_SAFE(chk, &ctl->reasm, sctp_next, nchk) {
+ cnt++;
+ SCTP_CALC_TSN_TO_GAP(gap, chk->rec.data.TSN_seq, asoc->mapping_array_base_tsn);
+ asoc->size_on_reasm_queue = sctp_sbspace_sub(asoc->size_on_reasm_queue, chk->send_size);
+ sctp_ucount_decr(asoc->cnt_on_reasm_queue);
+ SCTP_UNSET_TSN_PRESENT(asoc->mapping_array, gap);
+ TAILQ_REMOVE(&ctl->reasm, chk, sctp_next);
+ if (chk->data) {
+ sctp_m_freem(chk->data);
+ chk->data = NULL;
+ }
+ sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED);
+ }
+ sctp_free_a_readq(stcb, ctl);
}
}
}
@@ -6962,6 +7074,11 @@ sctp_initiate_iterator(inp_func inpf,
if (af == NULL) {
return (-1);
}
+ if (SCTP_BASE_VAR(sctp_pcb_initialized) == 0) {
+ SCTP_PRINTF("%s: abort on initialize being %d\n", __func__,
+ SCTP_BASE_VAR(sctp_pcb_initialized));
+ return (-1);
+ }
SCTP_MALLOC(it, struct sctp_iterator *, sizeof(struct sctp_iterator),
SCTP_M_ITER);
if (it == NULL) {
@@ -7000,7 +7117,13 @@ sctp_initiate_iterator(inp_func inpf,
}
SCTP_IPI_ITERATOR_WQ_LOCK();
-
+ if (SCTP_BASE_VAR(sctp_pcb_initialized) == 0) {
+ SCTP_IPI_ITERATOR_WQ_UNLOCK();
+ SCTP_PRINTF("%s: rollback on initialize being %d it=%p\n", __func__,
+ SCTP_BASE_VAR(sctp_pcb_initialized), it);
+ SCTP_FREE(it, SCTP_M_ITER);
+ return (-1);
+ }
TAILQ_INSERT_TAIL(&sctp_it_ctl.iteratorhead, it, sctp_nxt_itr);
if (sctp_it_ctl.iterator_running == 0) {
sctp_wakeup_iterator();
diff --git a/freebsd/sys/netinet/sctp_pcb.h b/freebsd/sys/netinet/sctp_pcb.h
index 8045765c..98204096 100644
--- a/freebsd/sys/netinet/sctp_pcb.h
+++ b/freebsd/sys/netinet/sctp_pcb.h
@@ -107,7 +107,7 @@ struct sctp_ifa {
* that we MUST lock appropriate locks. This
* is for V6. */
union sctp_sockstore address;
- uint32_t refcount; /* number of folks refering to this */
+ uint32_t refcount; /* number of folks referring to this */
uint32_t flags;
uint32_t localifa_flags;
uint32_t vrf_id; /* vrf_id of this addr (for deleting) */
@@ -360,7 +360,7 @@ struct sctp_pcbtsn_rlog {
struct sctp_inpcb {
/*-
* put an inpcb in front of it all, kind of a waste but we need to
- * for compatability with all the other stuff.
+ * for compatibility with all the other stuff.
*/
union {
struct inpcb inp;
@@ -404,9 +404,17 @@ struct sctp_inpcb {
uint32_t sctp_frag_point;
uint32_t partial_delivery_point;
uint32_t sctp_context;
+ uint32_t max_cwnd;
uint8_t local_strreset_support;
uint32_t sctp_cmt_on_off;
- uint32_t sctp_ecn_enable;
+ uint8_t ecn_supported;
+ uint8_t prsctp_supported;
+ uint8_t auth_supported;
+ uint8_t idata_supported;
+ uint8_t asconf_supported;
+ uint8_t reconfig_supported;
+ uint8_t nrsack_supported;
+ uint8_t pktdrop_supported;
struct sctp_nonpad_sndrcvinfo def_send;
/*-
* These three are here for the sosend_dgram
@@ -423,6 +431,7 @@ struct sctp_inpcb {
struct mtx inp_rdata_mtx;
int32_t refcount;
uint32_t def_vrf_id;
+ uint16_t fibnum;
uint32_t total_sends;
uint32_t total_recvs;
uint32_t last_abort_code;
@@ -576,7 +585,7 @@ void sctp_inpcb_free(struct sctp_inpcb *, int, int);
struct sctp_tcb *
sctp_aloc_assoc(struct sctp_inpcb *, struct sockaddr *,
- int *, uint32_t, uint32_t, struct thread *);
+ int *, uint32_t, uint32_t, uint16_t, uint16_t, struct thread *);
int sctp_free_assoc(struct sctp_inpcb *, struct sctp_tcb *, int, int);
@@ -590,13 +599,9 @@ void
void sctp_add_local_addr_ep(struct sctp_inpcb *, struct sctp_ifa *, uint32_t);
-int sctp_insert_laddr(struct sctpladdr *, struct sctp_ifa *, uint32_t);
-
-void sctp_remove_laddr(struct sctp_laddr *);
-
void sctp_del_local_addr_ep(struct sctp_inpcb *, struct sctp_ifa *);
-int sctp_add_remote_addr(struct sctp_tcb *, struct sockaddr *, struct sctp_nets **, int, int);
+int sctp_add_remote_addr(struct sctp_tcb *, struct sockaddr *, struct sctp_nets **, uint16_t, int, int);
void sctp_remove_net(struct sctp_tcb *, struct sctp_nets *);
@@ -611,7 +616,7 @@ void sctp_del_local_addr_restricted(struct sctp_tcb *, struct sctp_ifa *);
int
sctp_load_addresses_from_init(struct sctp_tcb *, struct mbuf *, int, int,
- struct sockaddr *, struct sockaddr *, struct sockaddr *);
+ struct sockaddr *, struct sockaddr *, struct sockaddr *, uint16_t);
int
sctp_set_primary_addr(struct sctp_tcb *, struct sockaddr *,
@@ -625,6 +630,8 @@ int sctp_destination_is_reachable(struct sctp_tcb *, struct sockaddr *);
int sctp_swap_inpcb_for_listen(struct sctp_inpcb *inp);
+void sctp_clean_up_stream(struct sctp_tcb *stcb, struct sctp_readhead *rh);
+
/*-
* Null in last arg inpcb indicate run on ALL ep's. Specific inp in last arg
* indicates run on ONLY assoc's of the specified endpoint.
@@ -646,11 +653,5 @@ void
#endif
-#ifdef INVARIANTS
-void
- sctp_validate_no_locks(struct sctp_inpcb *inp);
-
-#endif
-
#endif /* _KERNEL */
#endif /* !__sctp_pcb_h__ */
diff --git a/freebsd/sys/netinet/sctp_peeloff.c b/freebsd/sys/netinet/sctp_peeloff.c
index e8bb0444..3603e41a 100644
--- a/freebsd/sys/netinet/sctp_peeloff.c
+++ b/freebsd/sys/netinet/sctp_peeloff.c
@@ -120,9 +120,16 @@ sctp_do_peeloff(struct socket *head, struct socket *so, sctp_assoc_t assoc_id)
n_inp->sctp_mobility_features = inp->sctp_mobility_features;
n_inp->sctp_frag_point = inp->sctp_frag_point;
n_inp->sctp_cmt_on_off = inp->sctp_cmt_on_off;
- n_inp->sctp_ecn_enable = inp->sctp_ecn_enable;
+ n_inp->ecn_supported = inp->ecn_supported;
+ n_inp->prsctp_supported = inp->prsctp_supported;
+ n_inp->auth_supported = inp->auth_supported;
+ n_inp->asconf_supported = inp->asconf_supported;
+ n_inp->reconfig_supported = inp->reconfig_supported;
+ n_inp->nrsack_supported = inp->nrsack_supported;
+ n_inp->pktdrop_supported = inp->pktdrop_supported;
n_inp->partial_delivery_point = inp->partial_delivery_point;
n_inp->sctp_context = inp->sctp_context;
+ n_inp->max_cwnd = inp->max_cwnd;
n_inp->local_strreset_support = inp->local_strreset_support;
n_inp->inp_starting_point_for_iterator = NULL;
/* copy in the authentication parameters from the original endpoint */
diff --git a/freebsd/sys/netinet/sctp_structs.h b/freebsd/sys/netinet/sctp_structs.h
index a8b86c62..280100bb 100644
--- a/freebsd/sys/netinet/sctp_structs.h
+++ b/freebsd/sys/netinet/sctp_structs.h
@@ -76,6 +76,7 @@ TAILQ_HEAD(sctpnetlisthead, sctp_nets);
struct sctp_stream_reset_list {
TAILQ_ENTRY(sctp_stream_reset_list) next_resp;
+ uint32_t seq;
uint32_t tsn;
uint32_t number_entries;
uint16_t list_of_streams[];
@@ -188,9 +189,12 @@ struct iterator_control {
struct sctp_net_route {
sctp_rtentry_t *ro_rt;
- void *ro_lle;
- void *ro_ia;
- int ro_flags;
+ struct llentry *ro_lle;
+ char *ro_prepend;
+ uint16_t ro_plen;
+ uint16_t ro_flags;
+ uint16_t ro_mtu;
+ uint16_t spare;
union sctp_sockstore _l_addr; /* remote peer addr */
struct sctp_ifa *_s_addr; /* our selected src addr */
};
@@ -380,15 +384,13 @@ struct sctp_nets {
uint8_t lan_type;
uint8_t rto_needed;
uint32_t flowid;
-#ifdef INVARIANTS
- uint8_t flowidset;
-#endif
+ uint8_t flowtype;
};
struct sctp_data_chunkrec {
uint32_t TSN_seq; /* the TSN of this transmit */
- uint16_t stream_seq; /* the stream sequence number of this transmit */
+ uint32_t stream_seq; /* the stream sequence number of this transmit */
uint16_t stream_number; /* the stream number of this guy */
uint32_t payloadtype;
uint32_t context; /* from send */
@@ -399,6 +401,7 @@ struct sctp_data_chunkrec {
*/
uint32_t fast_retran_tsn; /* sending_seq at the time of FR */
struct timeval timetodrop; /* time we drop it from queue */
+ uint32_t fsn_num; /* Fragment Sequence Number */
uint8_t doing_fast_retransmit;
uint8_t rcv_flags; /* flags pulled from data chunk on inbound for
* outbound holds sending flags for PR-SCTP. */
@@ -418,8 +421,8 @@ TAILQ_HEAD(sctpchunk_listhead, sctp_tmit_chunk);
#define CHUNK_FLAGS_FRAGMENT_OK 0x0100
struct chk_id {
- uint16_t id;
- uint16_t can_take_data;
+ uint8_t id;
+ uint8_t can_take_data;
};
@@ -450,14 +453,9 @@ struct sctp_tmit_chunk {
uint8_t window_probe;
};
-/*
- * The first part of this structure MUST be the entire sinfo structure. Maybe
- * I should have made it a sub structure... we can circle back later and do
- * that if we want.
- */
struct sctp_queued_to_read { /* sinfo structure Pluse more */
uint16_t sinfo_stream; /* off the wire */
- uint16_t sinfo_ssn; /* off the wire */
+ uint32_t sinfo_ssn; /* off the wire */
uint16_t sinfo_flags; /* SCTP_UNORDERED from wire use SCTP_EOF for
* EOR */
uint32_t sinfo_ppid; /* off the wire */
@@ -467,8 +465,11 @@ struct sctp_queued_to_read { /* sinfo structure Pluse more */
uint32_t sinfo_cumtsn; /* Use this in reassembly as last TSN */
sctp_assoc_t sinfo_assoc_id; /* our assoc id */
/* Non sinfo stuff */
+ uint32_t msg_id; /* Fragment Index */
uint32_t length; /* length of data */
uint32_t held_length; /* length held in sb */
+ uint32_t top_fsn; /* Highest FSN in queue */
+ uint32_t fsn_included; /* Highest FSN in *data portion */
struct sctp_nets *whoFrom; /* where it came from */
struct mbuf *data; /* front of the mbuf chain of data with
* PKT_HDR */
@@ -477,14 +478,24 @@ struct sctp_queued_to_read { /* sinfo structure Pluse more */
* take it from us */
struct sctp_tcb *stcb; /* assoc, used for window update */
TAILQ_ENTRY(sctp_queued_to_read) next;
+ TAILQ_ENTRY(sctp_queued_to_read) next_instrm;
+ struct sctpchunk_listhead reasm;
uint16_t port_from;
uint16_t spec_flags; /* Flags to hold the notification field */
uint8_t do_not_ref_stcb;
uint8_t end_added;
uint8_t pdapi_aborted;
+ uint8_t pdapi_started;
uint8_t some_taken;
+ uint8_t last_frag_seen;
+ uint8_t first_frag_seen;
+ uint8_t on_read_q;
+ uint8_t on_strm_q;
};
+#define SCTP_ON_ORDERED 1
+#define SCTP_ON_UNORDERED 2
+
/* This data structure will be on the outbound
* stream queues. Data will be pulled off from
* the front of the mbuf data and chunk-ified
@@ -510,6 +521,7 @@ struct sctp_stream_queue_pending {
struct sctp_nets *net;
TAILQ_ENTRY(sctp_stream_queue_pending) next;
TAILQ_ENTRY(sctp_stream_queue_pending) ss_next;
+ uint32_t fsn;
uint32_t length;
uint32_t timetolive;
uint32_t ppid;
@@ -533,14 +545,17 @@ struct sctp_stream_queue_pending {
TAILQ_HEAD(sctpwheelunrel_listhead, sctp_stream_in);
struct sctp_stream_in {
struct sctp_readhead inqueue;
+ struct sctp_readhead uno_inqueue;
+ uint32_t last_sequence_delivered; /* used for re-order */
uint16_t stream_no;
- uint16_t last_sequence_delivered; /* used for re-order */
uint8_t delivery_started;
+ uint8_t pd_api_started;
};
TAILQ_HEAD(sctpwheel_listhead, sctp_stream_out);
TAILQ_HEAD(sctplist_listhead, sctp_stream_queue_pending);
+
/* Round-robin schedulers */
struct ss_rr {
/* next link in wheel */
@@ -567,9 +582,14 @@ struct ss_fb {
* This union holds all data necessary for
* different stream schedulers.
*/
-union scheduling_data {
- struct sctpwheel_listhead out_wheel;
- struct sctplist_listhead out_list;
+struct scheduling_data {
+ struct sctp_stream_out *locked_on_sending;
+ /* circular looking for output selection */
+ struct sctp_stream_out *last_out_stream;
+ union {
+ struct sctpwheel_listhead wheel;
+ struct sctplist_listhead list;
+ } out;
};
/*
@@ -582,14 +602,37 @@ union scheduling_parameters {
struct ss_fb fb;
};
+/* States for outgoing streams */
+#define SCTP_STREAM_CLOSED 0x00
+#define SCTP_STREAM_OPENING 0x01
+#define SCTP_STREAM_OPEN 0x02
+#define SCTP_STREAM_RESET_PENDING 0x03
+#define SCTP_STREAM_RESET_IN_FLIGHT 0x04
+
+#define SCTP_MAX_STREAMS_AT_ONCE_RESET 200
+
/* This struct is used to track the traffic on outbound streams */
struct sctp_stream_out {
struct sctp_streamhead outqueue;
union scheduling_parameters ss_params;
- uint32_t chunks_on_queues;
+ uint32_t chunks_on_queues; /* send queue and sent queue */
+#if defined(SCTP_DETAILED_STR_STATS)
+ uint32_t abandoned_unsent[SCTP_PR_SCTP_MAX + 1];
+ uint32_t abandoned_sent[SCTP_PR_SCTP_MAX + 1];
+#else
+ /* Only the aggregation */
+ uint32_t abandoned_unsent[1];
+ uint32_t abandoned_sent[1];
+#endif
+ /*
+ * For associations using DATA chunks, the lower 16-bit of
+ * next_mid_ordered are used as the next SSN.
+ */
+ uint32_t next_mid_ordered;
+ uint32_t next_mid_unordered;
uint16_t stream_no;
- uint16_t next_sequence_send; /* next one I expect to send out */
uint8_t last_msg_incomplete;
+ uint8_t state;
};
/* used to keep track of the addresses yet to try to add/delete */
@@ -616,12 +659,13 @@ struct sctp_scoping {
struct sctp_tsn_log {
void *stcb;
uint32_t tsn;
+ uint32_t seq;
uint16_t strm;
- uint16_t seq;
uint16_t sz;
uint16_t flgs;
uint16_t in_pos;
uint16_t in_out;
+ uint16_t resv;
};
#define SCTP_FS_SPEC_LOG_SIZE 200
@@ -697,7 +741,7 @@ struct sctp_ss_functions {
int holds_lock);
void (*sctp_ss_clear) (struct sctp_tcb *stcb, struct sctp_association *asoc,
int clear_values, int holds_lock);
- void (*sctp_ss_init_stream) (struct sctp_stream_out *strq, struct sctp_stream_out *with_strq);
+ void (*sctp_ss_init_stream) (struct sctp_tcb *stcb, struct sctp_stream_out *strq, struct sctp_stream_out *with_strq);
void (*sctp_ss_add_to_stream) (struct sctp_tcb *stcb, struct sctp_association *asoc,
struct sctp_stream_out *strq, struct sctp_stream_queue_pending *sp, int holds_lock);
int (*sctp_ss_is_empty) (struct sctp_tcb *stcb, struct sctp_association *asoc);
@@ -713,6 +757,7 @@ struct sctp_ss_functions {
struct sctp_stream_out *strq, uint16_t * value);
int (*sctp_ss_set_value) (struct sctp_tcb *stcb, struct sctp_association *asoc,
struct sctp_stream_out *strq, uint16_t value);
+ int (*sctp_ss_is_user_msgs_incomplete) (struct sctp_tcb *stcb, struct sctp_association *asoc);
};
/* used to save ASCONF chunks for retransmission */
@@ -792,19 +837,8 @@ struct sctp_association {
struct sctpchunk_listhead sent_queue;
struct sctpchunk_listhead send_queue;
- /* re-assembly queue for fragmented chunks on the inbound path */
- struct sctpchunk_listhead reasmqueue;
-
/* Scheduling queues */
- union scheduling_data ss_data;
-
- /*
- * This pointer will be set to NULL most of the time. But when we
- * have a fragmented message, where we could not get out all of the
- * message at the last send then this will point to the stream to go
- * get data from.
- */
- struct sctp_stream_out *locked_on_sending;
+ struct scheduling_data ss_data;
/* If an iterator is looking at me, this is it */
struct sctp_iterator *stcb_starting_point_for_iterator;
@@ -837,8 +871,6 @@ struct sctp_association {
/* last place I got a control from */
struct sctp_nets *last_control_chunk_from;
- /* circular looking for output selection */
- struct sctp_stream_out *last_out_stream;
/*
* wait to the point the cum-ack passes req->send_reset_at_tsn for
@@ -862,7 +894,6 @@ struct sctp_association {
uint32_t stream_scheduling_module;
uint32_t vrf_id;
-
uint32_t cookie_preserve_req;
/* ASCONF next seq I am sending out, inits at init-tsn */
uint32_t asconf_seq_out;
@@ -936,7 +967,7 @@ struct sctp_association {
uint32_t sat_t3_recovery_tsn;
uint32_t tsn_last_delivered;
/*
- * For the pd-api we should re-write this a bit more efficent. We
+ * For the pd-api we should re-write this a bit more efficient. We
* could have multiple sctp_queued_to_read's that we are building at
* once. Now we only do this when we get ready to deliver to the
* socket buffer. Note that we depend on the fact that the struct is
@@ -1142,7 +1173,7 @@ struct sctp_association {
uint8_t hb_random_idx;
uint8_t default_dscp;
uint8_t asconf_del_pending; /* asconf delete last addr pending */
-
+ uint8_t trigger_reset;
/*
* This value, plus all other ack'd but above cum-ack is added
* together to cross check against the bit that we have yet to
@@ -1150,34 +1181,24 @@ struct sctp_association {
* sum is updated as well.
*/
- /* Flag to tell if ECN is allowed */
- uint8_t ecn_allowed;
+ /* Flags whether an extension is supported or not */
+ uint8_t ecn_supported;
+ uint8_t prsctp_supported;
+ uint8_t auth_supported;
+ uint8_t asconf_supported;
+ uint8_t reconfig_supported;
+ uint8_t nrsack_supported;
+ uint8_t pktdrop_supported;
+ uint8_t idata_supported;
/* Did the peer make the stream config (add out) request */
uint8_t peer_req_out;
- /* flag to indicate if peer can do asconf */
- uint8_t peer_supports_asconf;
- /* EY - flag to indicate if peer can do nr_sack */
- uint8_t peer_supports_nr_sack;
- /* pr-sctp support flag */
- uint8_t peer_supports_prsctp;
- /* peer authentication support flag */
- uint8_t peer_supports_auth;
- /* stream resets are supported by the peer */
- uint8_t peer_supports_strreset;
uint8_t local_strreset_support;
-
uint8_t peer_supports_nat;
- /*
- * packet drop's are supported by the peer, we don't really care
- * about this but we bookkeep it anyway.
- */
- uint8_t peer_supports_pktdrop;
struct sctp_scoping scope;
/* flags to handle send alternate net tracking */
- uint8_t used_alt_onsack;
uint8_t used_alt_asconfack;
uint8_t fast_retran_loss_recovery;
uint8_t sat_t3_loss_recovery;
@@ -1198,12 +1219,11 @@ struct sctp_association {
uint8_t sctp_cmt_on_off;
uint8_t iam_blocking;
uint8_t cookie_how[8];
- /* EY 05/05/08 - NR_SACK variable */
- uint8_t sctp_nr_sack_on_off;
/* JRS 5/21/07 - CMT PF variable */
uint8_t sctp_cmt_pf;
uint8_t use_precise_time;
uint64_t sctp_features;
+ uint32_t max_cwnd;
uint16_t port; /* remote UDP encapsulation port */
/*
* The mapping array is used to track out of order sequences above
@@ -1222,6 +1242,8 @@ struct sctp_association {
uint32_t timoshutdownack;
struct timeval start_time;
struct timeval discontinuity_time;
+ uint64_t abandoned_unsent[SCTP_PR_SCTP_MAX + 1];
+ uint64_t abandoned_sent[SCTP_PR_SCTP_MAX + 1];
};
#endif
diff --git a/freebsd/sys/netinet/sctp_sysctl.c b/freebsd/sys/netinet/sctp_sysctl.c
index d0da7a6f..8715c69b 100644
--- a/freebsd/sys/netinet/sctp_sysctl.c
+++ b/freebsd/sys/netinet/sctp_sysctl.c
@@ -43,6 +43,9 @@ __FBSDID("$FreeBSD$");
#include <netinet/sctputil.h>
#include <netinet/sctp_output.h>
#include <sys/smp.h>
+#include <sys/sysctl.h>
+
+FEATURE(sctp, "Stream Control Transmission Protocol");
/*
* sysctl tunable variables
@@ -56,7 +59,12 @@ sctp_init_sysctls()
SCTP_BASE_SYSCTL(sctp_auto_asconf) = SCTPCTL_AUTOASCONF_DEFAULT;
SCTP_BASE_SYSCTL(sctp_multiple_asconfs) = SCTPCTL_MULTIPLEASCONFS_DEFAULT;
SCTP_BASE_SYSCTL(sctp_ecn_enable) = SCTPCTL_ECN_ENABLE_DEFAULT;
- SCTP_BASE_SYSCTL(sctp_strict_sacks) = SCTPCTL_STRICT_SACKS_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_pr_enable) = SCTPCTL_PR_ENABLE_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_auth_enable) = SCTPCTL_AUTH_ENABLE_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_asconf_enable) = SCTPCTL_ASCONF_ENABLE_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_reconfig_enable) = SCTPCTL_RECONFIG_ENABLE_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_nrsack_enable) = SCTPCTL_NRSACK_ENABLE_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_pktdrop_enable) = SCTPCTL_PKTDROP_ENABLE_DEFAULT;
SCTP_BASE_SYSCTL(sctp_peer_chunk_oh) = SCTPCTL_PEER_CHKOH_DEFAULT;
SCTP_BASE_SYSCTL(sctp_max_burst_default) = SCTPCTL_MAXBURST_DEFAULT;
SCTP_BASE_SYSCTL(sctp_fr_max_burst_default) = SCTPCTL_FRMAXBURST_DEFAULT;
@@ -86,25 +94,18 @@ sctp_init_sysctls()
SCTP_BASE_SYSCTL(sctp_nr_incoming_streams_default) = SCTPCTL_INCOMING_STREAMS_DEFAULT;
SCTP_BASE_SYSCTL(sctp_nr_outgoing_streams_default) = SCTPCTL_OUTGOING_STREAMS_DEFAULT;
SCTP_BASE_SYSCTL(sctp_cmt_on_off) = SCTPCTL_CMT_ON_OFF_DEFAULT;
- /* EY */
- SCTP_BASE_SYSCTL(sctp_nr_sack_on_off) = SCTPCTL_NR_SACK_ON_OFF_DEFAULT;
SCTP_BASE_SYSCTL(sctp_cmt_use_dac) = SCTPCTL_CMT_USE_DAC_DEFAULT;
SCTP_BASE_SYSCTL(sctp_use_cwnd_based_maxburst) = SCTPCTL_CWND_MAXBURST_DEFAULT;
- SCTP_BASE_SYSCTL(sctp_asconf_auth_nochk) = SCTPCTL_ASCONF_AUTH_NOCHK_DEFAULT;
- SCTP_BASE_SYSCTL(sctp_auth_disable) = SCTPCTL_AUTH_DISABLE_DEFAULT;
SCTP_BASE_SYSCTL(sctp_nat_friendly) = SCTPCTL_NAT_FRIENDLY_DEFAULT;
SCTP_BASE_SYSCTL(sctp_L2_abc_variable) = SCTPCTL_ABC_L_VAR_DEFAULT;
SCTP_BASE_SYSCTL(sctp_mbuf_threshold_count) = SCTPCTL_MAX_CHAINED_MBUFS_DEFAULT;
SCTP_BASE_SYSCTL(sctp_do_drain) = SCTPCTL_DO_SCTP_DRAIN_DEFAULT;
SCTP_BASE_SYSCTL(sctp_hb_maxburst) = SCTPCTL_HB_MAX_BURST_DEFAULT;
SCTP_BASE_SYSCTL(sctp_abort_if_one_2_one_hits_limit) = SCTPCTL_ABORT_AT_LIMIT_DEFAULT;
- SCTP_BASE_SYSCTL(sctp_strict_data_order) = SCTPCTL_STRICT_DATA_ORDER_DEFAULT;
SCTP_BASE_SYSCTL(sctp_min_residual) = SCTPCTL_MIN_RESIDUAL_DEFAULT;
SCTP_BASE_SYSCTL(sctp_max_retran_chunk) = SCTPCTL_MAX_RETRAN_CHUNK_DEFAULT;
SCTP_BASE_SYSCTL(sctp_logging_level) = SCTPCTL_LOGGING_LEVEL_DEFAULT;
- /* JRS - Variable for default congestion control module */
SCTP_BASE_SYSCTL(sctp_default_cc_module) = SCTPCTL_DEFAULT_CC_MODULE_DEFAULT;
- /* RS - Variable for default stream scheduling module */
SCTP_BASE_SYSCTL(sctp_default_ss_module) = SCTPCTL_DEFAULT_SS_MODULE_DEFAULT;
SCTP_BASE_SYSCTL(sctp_default_frag_interleave) = SCTPCTL_DEFAULT_FRAG_INTERLEAVE_DEFAULT;
SCTP_BASE_SYSCTL(sctp_mobility_base) = SCTPCTL_MOBILITY_BASE_DEFAULT;
@@ -136,7 +137,7 @@ sctp_init_sysctls()
/* It returns an upper limit. No filtering is done here */
static unsigned int
-number_of_addresses(struct sctp_inpcb *inp)
+sctp_sysctl_number_of_addresses(struct sctp_inpcb *inp)
{
unsigned int cnt;
struct sctp_vrf *vrf;
@@ -186,7 +187,7 @@ number_of_addresses(struct sctp_inpcb *inp)
}
static int
-copy_out_local_addresses(struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct sysctl_req *req)
+sctp_sysctl_copy_out_local_addresses(struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct sysctl_req *req)
{
struct sctp_ifn *sctp_ifn;
struct sctp_ifa *sctp_ifa;
@@ -251,7 +252,7 @@ copy_out_local_addresses(struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct s
if (ipv4_addr_legal) {
struct sockaddr_in *sin;
- sin = (struct sockaddr_in *)&sctp_ifa->address.sa;
+ sin = &sctp_ifa->address.sin;
if (sin->sin_addr.s_addr == 0)
continue;
if (prison_check_ip4(inp->ip_inp.inp.inp_cred,
@@ -270,7 +271,7 @@ copy_out_local_addresses(struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct s
if (ipv6_addr_legal) {
struct sockaddr_in6 *sin6;
- sin6 = (struct sockaddr_in6 *)&sctp_ifa->address.sa;
+ sin6 = &sctp_ifa->address.sin6;
if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
continue;
if (prison_check_ip6(inp->ip_inp.inp.inp_cred,
@@ -280,15 +281,6 @@ copy_out_local_addresses(struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct s
if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) {
if (local_scope == 0)
continue;
- if (sin6->sin6_scope_id == 0) {
- /*
- * bad link
- * local
- * address
- */
- if (sa6_recoverscope(sin6) != 0)
- continue;
- }
}
if ((site_scope == 0) && (IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr)))
continue;
@@ -352,7 +344,7 @@ copy_out_local_addresses(struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct s
* sysctl functions
*/
static int
-sctp_assoclist(SYSCTL_HANDLER_ARGS)
+sctp_sysctl_handle_assoclist(SYSCTL_HANDLER_ARGS)
{
unsigned int number_of_endpoints;
unsigned int number_of_local_addresses;
@@ -374,14 +366,14 @@ sctp_assoclist(SYSCTL_HANDLER_ARGS)
number_of_remote_addresses = 0;
SCTP_INP_INFO_RLOCK();
- if (req->oldptr == USER_ADDR_NULL) {
+ if (req->oldptr == NULL) {
LIST_FOREACH(inp, &SCTP_BASE_INFO(listhead), sctp_list) {
SCTP_INP_RLOCK(inp);
number_of_endpoints++;
- number_of_local_addresses += number_of_addresses(inp);
+ number_of_local_addresses += sctp_sysctl_number_of_addresses(inp);
LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) {
number_of_associations++;
- number_of_local_addresses += number_of_addresses(inp);
+ number_of_local_addresses += sctp_sysctl_number_of_addresses(inp);
TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
number_of_remote_addresses++;
}
@@ -398,7 +390,7 @@ sctp_assoclist(SYSCTL_HANDLER_ARGS)
req->oldidx = (n + n / 8);
return (0);
}
- if (req->newptr != USER_ADDR_NULL) {
+ if (req->newptr != NULL) {
SCTP_INP_INFO_RUNLOCK();
SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_SYSCTL, EPERM);
return (EPERM);
@@ -412,11 +404,12 @@ sctp_assoclist(SYSCTL_HANDLER_ARGS)
xinpcb.last = 0;
xinpcb.local_port = ntohs(inp->sctp_lport);
xinpcb.flags = inp->sctp_flags;
- xinpcb.features = (uint32_t) inp->sctp_features;
+ xinpcb.features = inp->sctp_features;
xinpcb.total_sends = inp->total_sends;
xinpcb.total_recvs = inp->total_recvs;
xinpcb.total_nospaces = inp->total_nospaces;
xinpcb.fragmentation_point = inp->sctp_frag_point;
+ xinpcb.socket = inp->sctp_socket;
so = inp->sctp_socket;
if ((so == NULL) ||
(inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE)) {
@@ -424,7 +417,11 @@ sctp_assoclist(SYSCTL_HANDLER_ARGS)
xinpcb.maxqlen = 0;
} else {
xinpcb.qlen = so->so_qlen;
+ xinpcb.qlen_old = so->so_qlen > USHRT_MAX ?
+ USHRT_MAX : (uint16_t) so->so_qlen;
xinpcb.maxqlen = so->so_qlimit;
+ xinpcb.maxqlen_old = so->so_qlimit > USHRT_MAX ?
+ USHRT_MAX : (uint16_t) so->so_qlimit;
}
SCTP_INP_INCR_REF(inp);
SCTP_INP_RUNLOCK(inp);
@@ -436,7 +433,7 @@ sctp_assoclist(SYSCTL_HANDLER_ARGS)
}
SCTP_INP_INFO_RLOCK();
SCTP_INP_RLOCK(inp);
- error = copy_out_local_addresses(inp, NULL, req);
+ error = sctp_sysctl_copy_out_local_addresses(inp, NULL, req);
if (error) {
SCTP_INP_DECR_REF(inp);
return (error);
@@ -451,7 +448,7 @@ sctp_assoclist(SYSCTL_HANDLER_ARGS)
if (stcb->asoc.primary_destination != NULL)
xstcb.primary_addr = stcb->asoc.primary_destination->ro._l_addr;
xstcb.heartbeat_interval = stcb->asoc.heart_beat_delay;
- xstcb.state = SCTP_GET_STATE(&stcb->asoc); /* FIXME */
+ xstcb.state = (uint32_t) sctp_map_assoc_state(stcb->asoc.state);
/* 7.0 does not support these */
xstcb.assoc_id = sctp_get_associd(stcb);
xstcb.peers_rwnd = stcb->asoc.peers_rwnd;
@@ -487,7 +484,7 @@ sctp_assoclist(SYSCTL_HANDLER_ARGS)
}
SCTP_INP_INFO_RLOCK();
SCTP_INP_RLOCK(inp);
- error = copy_out_local_addresses(inp, stcb, req);
+ error = sctp_sysctl_copy_out_local_addresses(inp, stcb, req);
if (error) {
SCTP_INP_DECR_REF(inp);
atomic_subtract_int(&stcb->asoc.refcnt, 1);
@@ -509,6 +506,7 @@ sctp_assoclist(SYSCTL_HANDLER_ARGS)
xraddr.mtu = net->mtu;
xraddr.rtt = net->rtt / 1000;
xraddr.heartbeat_interval = net->heart_beat_delay;
+ xraddr.ssthresh = net->ssthresh;
xraddr.start_time.tv_sec = (uint32_t) net->start_time.tv_sec;
xraddr.start_time.tv_usec = (uint32_t) net->start_time.tv_usec;
SCTP_INP_RUNLOCK(inp);
@@ -555,153 +553,120 @@ skip:
return (error);
}
-
-#define RANGECHK(var, min, max) \
- if ((var) < (min)) { (var) = (min); } \
- else if ((var) > (max)) { (var) = (max); }
-
static int
-sysctl_sctp_udp_tunneling_check(SYSCTL_HANDLER_ARGS)
+sctp_sysctl_handle_udp_tunneling(SYSCTL_HANDLER_ARGS)
{
int error;
- uint32_t old_sctp_udp_tunneling_port;
+ uint32_t old, new;
SCTP_INP_INFO_RLOCK();
- old_sctp_udp_tunneling_port = SCTP_BASE_SYSCTL(sctp_udp_tunneling_port);
+ old = SCTP_BASE_SYSCTL(sctp_udp_tunneling_port);
SCTP_INP_INFO_RUNLOCK();
- error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
- if (error == 0) {
- RANGECHK(SCTP_BASE_SYSCTL(sctp_udp_tunneling_port), SCTPCTL_UDP_TUNNELING_PORT_MIN, SCTPCTL_UDP_TUNNELING_PORT_MAX);
- if (old_sctp_udp_tunneling_port == SCTP_BASE_SYSCTL(sctp_udp_tunneling_port)) {
- error = 0;
- goto out;
- }
- SCTP_INP_INFO_WLOCK();
- if (old_sctp_udp_tunneling_port) {
- sctp_over_udp_stop();
- }
- if (SCTP_BASE_SYSCTL(sctp_udp_tunneling_port)) {
- if (sctp_over_udp_start()) {
- SCTP_BASE_SYSCTL(sctp_udp_tunneling_port) = 0;
+ new = old;
+ error = sysctl_handle_int(oidp, &new, 0, req);
+ if ((error == 0) &&
+ (req->newptr != NULL)) {
+#if (SCTPCTL_UDP_TUNNELING_PORT_MIN == 0)
+ if (new > SCTPCTL_UDP_TUNNELING_PORT_MAX) {
+#else
+ if ((new < SCTPCTL_UDP_TUNNELING_PORT_MIN) ||
+ (new > SCTPCTL_UDP_TUNNELING_PORT_MAX)) {
+#endif
+ error = EINVAL;
+ } else {
+ SCTP_INP_INFO_WLOCK();
+ SCTP_BASE_SYSCTL(sctp_udp_tunneling_port) = new;
+ if (old != 0) {
+ sctp_over_udp_stop();
}
+ if (new != 0) {
+ error = sctp_over_udp_start();
+ }
+ SCTP_INP_INFO_WUNLOCK();
}
- SCTP_INP_INFO_WUNLOCK();
}
-out:
return (error);
}
static int
-sysctl_sctp_check(SYSCTL_HANDLER_ARGS)
+sctp_sysctl_handle_auth(SYSCTL_HANDLER_ARGS)
{
int error;
-
-#ifdef VIMAGE
- error = vnet_sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
+ uint32_t new;
+
+ new = SCTP_BASE_SYSCTL(sctp_auth_enable);
+ error = sysctl_handle_int(oidp, &new, 0, req);
+ if ((error == 0) &&
+ (req->newptr != NULL)) {
+#if (SCTPCTL_AUTH_ENABLE_MIN == 0)
+ if ((new > SCTPCTL_AUTH_ENABLE_MAX) ||
+ ((new == 0) && (SCTP_BASE_SYSCTL(sctp_asconf_enable) == 1))) {
#else
- error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
+ if ((new < SCTPCTL_AUTH_ENABLE_MIN) ||
+ (new > SCTPCTL_AUTH_ENABLE_MAX) ||
+ ((new == 0) && (SCTP_BASE_SYSCTL(sctp_asconf_enable) == 1))) {
#endif
- if (error == 0) {
- RANGECHK(SCTP_BASE_SYSCTL(sctp_sendspace), SCTPCTL_MAXDGRAM_MIN, SCTPCTL_MAXDGRAM_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_recvspace), SCTPCTL_RECVSPACE_MIN, SCTPCTL_RECVSPACE_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_auto_asconf), SCTPCTL_AUTOASCONF_MIN, SCTPCTL_AUTOASCONF_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_ecn_enable), SCTPCTL_ECN_ENABLE_MIN, SCTPCTL_ECN_ENABLE_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_strict_sacks), SCTPCTL_STRICT_SACKS_MIN, SCTPCTL_STRICT_SACKS_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_peer_chunk_oh), SCTPCTL_PEER_CHKOH_MIN, SCTPCTL_PEER_CHKOH_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_max_burst_default), SCTPCTL_MAXBURST_MIN, SCTPCTL_MAXBURST_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_fr_max_burst_default), SCTPCTL_FRMAXBURST_MIN, SCTPCTL_FRMAXBURST_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_max_chunks_on_queue), SCTPCTL_MAXCHUNKS_MIN, SCTPCTL_MAXCHUNKS_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_hashtblsize), SCTPCTL_TCBHASHSIZE_MIN, SCTPCTL_TCBHASHSIZE_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_pcbtblsize), SCTPCTL_PCBHASHSIZE_MIN, SCTPCTL_PCBHASHSIZE_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_min_split_point), SCTPCTL_MIN_SPLIT_POINT_MIN, SCTPCTL_MIN_SPLIT_POINT_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_chunkscale), SCTPCTL_CHUNKSCALE_MIN, SCTPCTL_CHUNKSCALE_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_delayed_sack_time_default), SCTPCTL_DELAYED_SACK_TIME_MIN, SCTPCTL_DELAYED_SACK_TIME_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_sack_freq_default), SCTPCTL_SACK_FREQ_MIN, SCTPCTL_SACK_FREQ_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_system_free_resc_limit), SCTPCTL_SYS_RESOURCE_MIN, SCTPCTL_SYS_RESOURCE_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_asoc_free_resc_limit), SCTPCTL_ASOC_RESOURCE_MIN, SCTPCTL_ASOC_RESOURCE_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_heartbeat_interval_default), SCTPCTL_HEARTBEAT_INTERVAL_MIN, SCTPCTL_HEARTBEAT_INTERVAL_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_pmtu_raise_time_default), SCTPCTL_PMTU_RAISE_TIME_MIN, SCTPCTL_PMTU_RAISE_TIME_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_shutdown_guard_time_default), SCTPCTL_SHUTDOWN_GUARD_TIME_MIN, SCTPCTL_SHUTDOWN_GUARD_TIME_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_secret_lifetime_default), SCTPCTL_SECRET_LIFETIME_MIN, SCTPCTL_SECRET_LIFETIME_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_rto_max_default), SCTPCTL_RTO_MAX_MIN, SCTPCTL_RTO_MAX_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_rto_min_default), SCTPCTL_RTO_MIN_MIN, SCTPCTL_RTO_MIN_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_rto_initial_default), SCTPCTL_RTO_INITIAL_MIN, SCTPCTL_RTO_INITIAL_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_init_rto_max_default), SCTPCTL_INIT_RTO_MAX_MIN, SCTPCTL_INIT_RTO_MAX_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_valid_cookie_life_default), SCTPCTL_VALID_COOKIE_LIFE_MIN, SCTPCTL_VALID_COOKIE_LIFE_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_init_rtx_max_default), SCTPCTL_INIT_RTX_MAX_MIN, SCTPCTL_INIT_RTX_MAX_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_assoc_rtx_max_default), SCTPCTL_ASSOC_RTX_MAX_MIN, SCTPCTL_ASSOC_RTX_MAX_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_path_rtx_max_default), SCTPCTL_PATH_RTX_MAX_MIN, SCTPCTL_PATH_RTX_MAX_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_path_pf_threshold), SCTPCTL_PATH_PF_THRESHOLD_MIN, SCTPCTL_PATH_PF_THRESHOLD_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_add_more_threshold), SCTPCTL_ADD_MORE_ON_OUTPUT_MIN, SCTPCTL_ADD_MORE_ON_OUTPUT_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_nr_incoming_streams_default), SCTPCTL_INCOMING_STREAMS_MIN, SCTPCTL_INCOMING_STREAMS_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_nr_outgoing_streams_default), SCTPCTL_OUTGOING_STREAMS_MIN, SCTPCTL_OUTGOING_STREAMS_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_cmt_on_off), SCTPCTL_CMT_ON_OFF_MIN, SCTPCTL_CMT_ON_OFF_MAX);
- /* EY */
- RANGECHK(SCTP_BASE_SYSCTL(sctp_nr_sack_on_off), SCTPCTL_NR_SACK_ON_OFF_MIN, SCTPCTL_NR_SACK_ON_OFF_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_cmt_use_dac), SCTPCTL_CMT_USE_DAC_MIN, SCTPCTL_CMT_USE_DAC_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_use_cwnd_based_maxburst), SCTPCTL_CWND_MAXBURST_MIN, SCTPCTL_CWND_MAXBURST_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_asconf_auth_nochk), SCTPCTL_ASCONF_AUTH_NOCHK_MIN, SCTPCTL_ASCONF_AUTH_NOCHK_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_auth_disable), SCTPCTL_AUTH_DISABLE_MIN, SCTPCTL_AUTH_DISABLE_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_nat_friendly), SCTPCTL_NAT_FRIENDLY_MIN, SCTPCTL_NAT_FRIENDLY_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_L2_abc_variable), SCTPCTL_ABC_L_VAR_MIN, SCTPCTL_ABC_L_VAR_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_mbuf_threshold_count), SCTPCTL_MAX_CHAINED_MBUFS_MIN, SCTPCTL_MAX_CHAINED_MBUFS_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_do_drain), SCTPCTL_DO_SCTP_DRAIN_MIN, SCTPCTL_DO_SCTP_DRAIN_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_hb_maxburst), SCTPCTL_HB_MAX_BURST_MIN, SCTPCTL_HB_MAX_BURST_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_abort_if_one_2_one_hits_limit), SCTPCTL_ABORT_AT_LIMIT_MIN, SCTPCTL_ABORT_AT_LIMIT_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_strict_data_order), SCTPCTL_STRICT_DATA_ORDER_MIN, SCTPCTL_STRICT_DATA_ORDER_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_min_residual), SCTPCTL_MIN_RESIDUAL_MIN, SCTPCTL_MIN_RESIDUAL_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_max_retran_chunk), SCTPCTL_MAX_RETRAN_CHUNK_MIN, SCTPCTL_MAX_RETRAN_CHUNK_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_logging_level), SCTPCTL_LOGGING_LEVEL_MIN, SCTPCTL_LOGGING_LEVEL_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_default_cc_module), SCTPCTL_DEFAULT_CC_MODULE_MIN, SCTPCTL_DEFAULT_CC_MODULE_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_default_ss_module), SCTPCTL_DEFAULT_SS_MODULE_MIN, SCTPCTL_DEFAULT_SS_MODULE_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_default_frag_interleave), SCTPCTL_DEFAULT_FRAG_INTERLEAVE_MIN, SCTPCTL_DEFAULT_FRAG_INTERLEAVE_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_vtag_time_wait), SCTPCTL_TIME_WAIT_MIN, SCTPCTL_TIME_WAIT_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_buffer_splitting), SCTPCTL_BUFFER_SPLITTING_MIN, SCTPCTL_BUFFER_SPLITTING_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_initial_cwnd), SCTPCTL_INITIAL_CWND_MIN, SCTPCTL_INITIAL_CWND_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_rttvar_bw), SCTPCTL_RTTVAR_BW_MIN, SCTPCTL_RTTVAR_BW_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_rttvar_rtt), SCTPCTL_RTTVAR_RTT_MIN, SCTPCTL_RTTVAR_RTT_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_rttvar_eqret), SCTPCTL_RTTVAR_EQRET_MIN, SCTPCTL_RTTVAR_EQRET_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_steady_step), SCTPCTL_RTTVAR_STEADYS_MIN, SCTPCTL_RTTVAR_STEADYS_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_use_dccc_ecn), SCTPCTL_RTTVAR_DCCCECN_MIN, SCTPCTL_RTTVAR_DCCCECN_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_mobility_base), SCTPCTL_MOBILITY_BASE_MIN, SCTPCTL_MOBILITY_BASE_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_mobility_fasthandoff), SCTPCTL_MOBILITY_FASTHANDOFF_MIN, SCTPCTL_MOBILITY_FASTHANDOFF_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_enable_sack_immediately), SCTPCTL_SACK_IMMEDIATELY_ENABLE_MIN, SCTPCTL_SACK_IMMEDIATELY_ENABLE_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_inits_include_nat_friendly), SCTPCTL_NAT_FRIENDLY_INITS_MIN, SCTPCTL_NAT_FRIENDLY_INITS_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_blackhole), SCTPCTL_BLACKHOLE_MIN, SCTPCTL_BLACKHOLE_MAX);
- RANGECHK(SCTP_BASE_SYSCTL(sctp_diag_info_code), SCTPCTL_DIAG_INFO_CODE_MIN, SCTPCTL_DIAG_INFO_CODE_MAX);
+ error = EINVAL;
+ } else {
+ SCTP_BASE_SYSCTL(sctp_auth_enable) = new;
+ }
+ }
+ return (error);
+}
-#ifdef SCTP_DEBUG
- RANGECHK(SCTP_BASE_SYSCTL(sctp_debug_on), SCTPCTL_DEBUG_MIN, SCTPCTL_DEBUG_MAX);
-#endif
-#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
- RANGECHK(SCTP_BASE_SYSCTL(sctp_output_unlocked), SCTPCTL_OUTPUT_UNLOCKED_MIN, SCTPCTL_OUTPUT_UNLOCKED_MAX);
+static int
+sctp_sysctl_handle_asconf(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ uint32_t new;
+
+ new = SCTP_BASE_SYSCTL(sctp_asconf_enable);
+ error = sysctl_handle_int(oidp, &new, 0, req);
+ if ((error == 0) &&
+ (req->newptr != NULL)) {
+#if (SCTPCTL_ASCONF_ENABLE_MIN == 0)
+ if ((new > SCTPCTL_ASCONF_ENABLE_MAX) ||
+ ((new == 1) && (SCTP_BASE_SYSCTL(sctp_auth_enable) == 0))) {
+#else
+ if ((new < SCTPCTL_ASCONF_ENABLE_MIN) ||
+ (new > SCTPCTL_ASCONF_ENABLE_MAX) ||
+ ((new == 1) && (SCTP_BASE_SYSCTL(sctp_auth_enable) == 0))) {
#endif
+ error = EINVAL;
+ } else {
+ SCTP_BASE_SYSCTL(sctp_asconf_enable) = new;
+ }
}
return (error);
}
-#if defined(__FreeBSD__) && defined(SMP) && defined(SCTP_USE_PERCPU_STAT)
static int
-sysctl_stat_get(SYSCTL_HANDLER_ARGS)
+sctp_sysctl_handle_stats(SYSCTL_HANDLER_ARGS)
{
- int cpu, error;
- struct sctpstat sb, sb_temp, *sarry, *cpin = NULL;
+ int error;
- if ((req->newptr) && (req->newlen == sizeof(struct sctpstat))) {
- /*
- * User wants us to clear or at least reset the counters to
- * the specified values.
- */
- cpin = &sb_temp;
- memset(&sb_temp, 0, sizeof(sb_temp));
- error = SYSCTL_IN(req, &sb_temp, sizeof(sb_temp));
- if (error != 0)
- return (error);
- } else if (req->newptr) {
- /* Must be a stat structure */
+#if defined(SMP) && defined(SCTP_USE_PERCPU_STAT)
+ struct sctpstat *sarry;
+ struct sctpstat sb;
+ int cpu;
+
+#endif
+ struct sctpstat sb_temp;
+
+ if ((req->newptr != NULL) &&
+ (req->newlen != sizeof(struct sctpstat))) {
return (EINVAL);
}
+ memset(&sb_temp, 0, sizeof(struct sctpstat));
+
+ if (req->newptr != NULL) {
+ error = SYSCTL_IN(req, &sb_temp, sizeof(struct sctpstat));
+ if (error != 0) {
+ return (error);
+ }
+ }
+#if defined(SMP) && defined(SCTP_USE_PERCPU_STAT)
memset(&sb, 0, sizeof(sb));
for (cpu = 0; cpu < mp_maxid; cpu++) {
sarry = &SCTP_BASE_STATS[cpu];
@@ -830,19 +795,35 @@ sysctl_stat_get(SYSCTL_HANDLER_ARGS)
sb.sctps_send_burst_avoid += sarry->sctps_send_burst_avoid;
sb.sctps_send_cwnd_avoid += sarry->sctps_send_cwnd_avoid;
sb.sctps_fwdtsn_map_over += sarry->sctps_fwdtsn_map_over;
- if (cpin) {
- memcpy(sarry, cpin, sizeof(struct sctpstat));
+ if (req->newptr != NULL) {
+ memcpy(sarry, &sb_temp, sizeof(struct sctpstat));
}
}
- error = SYSCTL_OUT(req, &sb, sizeof(sb));
+ error = SYSCTL_OUT(req, &sb, sizeof(struct sctpstat));
+#else
+ error = SYSCTL_OUT(req, &SCTP_BASE_STATS, sizeof(struct sctpstat));
+ if (error != 0) {
+ return (error);
+ }
+ if (req->newptr != NULL) {
+ memcpy(&SCTP_BASE_STATS, &sb_temp, sizeof(struct sctpstat));
+ }
+#endif
return (error);
}
-#endif
-
#if defined(SCTP_LOCAL_TRACE_BUF)
static int
-sysctl_sctp_cleartrace(SYSCTL_HANDLER_ARGS)
+sctp_sysctl_handle_trace_log(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+
+ error = SYSCTL_OUT(req, &SCTP_BASE_SYSCTL(sctp_log), sizeof(struct sctp_log));
+ return (error);
+}
+
+static int
+sctp_sysctl_handle_trace_log_clear(SYSCTL_HANDLER_ARGS)
{
int error = 0;
@@ -852,314 +833,117 @@ sysctl_sctp_cleartrace(SYSCTL_HANDLER_ARGS)
#endif
+#define SCTP_UINT_SYSCTL(mib_name, var_name, prefix) \
+ static int \
+ sctp_sysctl_handle_##mib_name(SYSCTL_HANDLER_ARGS) \
+ { \
+ int error; \
+ uint32_t new; \
+ \
+ new = SCTP_BASE_SYSCTL(var_name); \
+ error = sysctl_handle_int(oidp, &new, 0, req); \
+ if ((error == 0) && (req->newptr != NULL)) { \
+ if ((new < prefix##_MIN) || \
+ (new > prefix##_MAX)) { \
+ error = EINVAL; \
+ } else { \
+ SCTP_BASE_SYSCTL(var_name) = new; \
+ } \
+ } \
+ return (error); \
+ } \
+ SYSCTL_PROC(_net_inet_sctp, OID_AUTO, mib_name, \
+ CTLFLAG_VNET|CTLTYPE_UINT|CTLFLAG_RW, NULL, 0, \
+ sctp_sysctl_handle_##mib_name, "UI", prefix##_DESC);
/*
* sysctl definitions
*/
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, sendspace, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_sendspace), 0, sysctl_sctp_check, "IU",
- SCTPCTL_MAXDGRAM_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, recvspace, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_recvspace), 0, sysctl_sctp_check, "IU",
- SCTPCTL_RECVSPACE_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, auto_asconf, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_auto_asconf), 0, sysctl_sctp_check, "IU",
- SCTPCTL_AUTOASCONF_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, ecn_enable, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_ecn_enable), 0, sysctl_sctp_check, "IU",
- SCTPCTL_ECN_ENABLE_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, strict_sacks, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_strict_sacks), 0, sysctl_sctp_check, "IU",
- SCTPCTL_STRICT_SACKS_DESC);
-
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, peer_chkoh, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_peer_chunk_oh), 0, sysctl_sctp_check, "IU",
- SCTPCTL_PEER_CHKOH_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, maxburst, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_max_burst_default), 0, sysctl_sctp_check, "IU",
- SCTPCTL_MAXBURST_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, fr_maxburst, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_fr_max_burst_default), 0, sysctl_sctp_check, "IU",
- SCTPCTL_FRMAXBURST_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, maxchunks, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_max_chunks_on_queue), 0, sysctl_sctp_check, "IU",
- SCTPCTL_MAXCHUNKS_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, tcbhashsize, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_hashtblsize), 0, sysctl_sctp_check, "IU",
- SCTPCTL_TCBHASHSIZE_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, pcbhashsize, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_pcbtblsize), 0, sysctl_sctp_check, "IU",
- SCTPCTL_PCBHASHSIZE_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, min_split_point, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_min_split_point), 0, sysctl_sctp_check, "IU",
- SCTPCTL_MIN_SPLIT_POINT_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, chunkscale, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_chunkscale), 0, sysctl_sctp_check, "IU",
- SCTPCTL_CHUNKSCALE_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, delayed_sack_time, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_delayed_sack_time_default), 0, sysctl_sctp_check, "IU",
- SCTPCTL_DELAYED_SACK_TIME_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, sack_freq, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_sack_freq_default), 0, sysctl_sctp_check, "IU",
- SCTPCTL_SACK_FREQ_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, sys_resource, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_system_free_resc_limit), 0, sysctl_sctp_check, "IU",
- SCTPCTL_SYS_RESOURCE_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, asoc_resource, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_asoc_free_resc_limit), 0, sysctl_sctp_check, "IU",
- SCTPCTL_ASOC_RESOURCE_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, heartbeat_interval, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_heartbeat_interval_default), 0, sysctl_sctp_check, "IU",
- SCTPCTL_HEARTBEAT_INTERVAL_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, pmtu_raise_time, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_pmtu_raise_time_default), 0, sysctl_sctp_check, "IU",
- SCTPCTL_PMTU_RAISE_TIME_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, shutdown_guard_time, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_shutdown_guard_time_default), 0, sysctl_sctp_check, "IU",
- SCTPCTL_SHUTDOWN_GUARD_TIME_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, secret_lifetime, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_secret_lifetime_default), 0, sysctl_sctp_check, "IU",
- SCTPCTL_SECRET_LIFETIME_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, rto_max, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_rto_max_default), 0, sysctl_sctp_check, "IU",
- SCTPCTL_RTO_MAX_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, rto_min, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_rto_min_default), 0, sysctl_sctp_check, "IU",
- SCTPCTL_RTO_MIN_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, rto_initial, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_rto_initial_default), 0, sysctl_sctp_check, "IU",
- SCTPCTL_RTO_INITIAL_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, init_rto_max, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_init_rto_max_default), 0, sysctl_sctp_check, "IU",
- SCTPCTL_INIT_RTO_MAX_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, valid_cookie_life, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_valid_cookie_life_default), 0, sysctl_sctp_check, "IU",
- SCTPCTL_VALID_COOKIE_LIFE_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, init_rtx_max, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_init_rtx_max_default), 0, sysctl_sctp_check, "IU",
- SCTPCTL_INIT_RTX_MAX_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, assoc_rtx_max, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_assoc_rtx_max_default), 0, sysctl_sctp_check, "IU",
- SCTPCTL_ASSOC_RTX_MAX_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, path_rtx_max, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_path_rtx_max_default), 0, sysctl_sctp_check, "IU",
- SCTPCTL_PATH_RTX_MAX_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, path_pf_threshold, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_path_pf_threshold), 0, sysctl_sctp_check, "IU",
- SCTPCTL_PATH_PF_THRESHOLD_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, add_more_on_output, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_add_more_threshold), 0, sysctl_sctp_check, "IU",
- SCTPCTL_ADD_MORE_ON_OUTPUT_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, incoming_streams, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_nr_incoming_streams_default), 0, sysctl_sctp_check, "IU",
- SCTPCTL_INCOMING_STREAMS_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, outgoing_streams, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_nr_outgoing_streams_default), 0, sysctl_sctp_check, "IU",
- SCTPCTL_OUTGOING_STREAMS_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, cmt_on_off, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_cmt_on_off), 0, sysctl_sctp_check, "IU",
- SCTPCTL_CMT_ON_OFF_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, nr_sack_on_off, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_nr_sack_on_off), 0, sysctl_sctp_check, "IU",
- SCTPCTL_NR_SACK_ON_OFF_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, cmt_use_dac, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_cmt_use_dac), 0, sysctl_sctp_check, "IU",
- SCTPCTL_CMT_USE_DAC_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, cwnd_maxburst, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_use_cwnd_based_maxburst), 0, sysctl_sctp_check, "IU",
- SCTPCTL_CWND_MAXBURST_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, asconf_auth_nochk, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_asconf_auth_nochk), 0, sysctl_sctp_check, "IU",
- SCTPCTL_ASCONF_AUTH_NOCHK_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, auth_disable, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_auth_disable), 0, sysctl_sctp_check, "IU",
- SCTPCTL_AUTH_DISABLE_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, nat_friendly, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_nat_friendly), 0, sysctl_sctp_check, "IU",
- SCTPCTL_NAT_FRIENDLY_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, abc_l_var, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_L2_abc_variable), 0, sysctl_sctp_check, "IU",
- SCTPCTL_ABC_L_VAR_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, max_chained_mbufs, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_mbuf_threshold_count), 0, sysctl_sctp_check, "IU",
- SCTPCTL_MAX_CHAINED_MBUFS_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, do_sctp_drain, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_do_drain), 0, sysctl_sctp_check, "IU",
- SCTPCTL_DO_SCTP_DRAIN_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, hb_max_burst, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_hb_maxburst), 0, sysctl_sctp_check, "IU",
- SCTPCTL_HB_MAX_BURST_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, abort_at_limit, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_abort_if_one_2_one_hits_limit), 0, sysctl_sctp_check, "IU",
- SCTPCTL_ABORT_AT_LIMIT_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, strict_data_order, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_strict_data_order), 0, sysctl_sctp_check, "IU",
- SCTPCTL_STRICT_DATA_ORDER_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, min_residual, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_min_residual), 0, sysctl_sctp_check, "IU",
- SCTPCTL_MIN_RESIDUAL_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, max_retran_chunk, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_max_retran_chunk), 0, sysctl_sctp_check, "IU",
- SCTPCTL_MAX_RETRAN_CHUNK_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, log_level, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_logging_level), 0, sysctl_sctp_check, "IU",
- SCTPCTL_LOGGING_LEVEL_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, default_cc_module, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_default_cc_module), 0, sysctl_sctp_check, "IU",
- SCTPCTL_DEFAULT_CC_MODULE_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, default_ss_module, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_default_ss_module), 0, sysctl_sctp_check, "IU",
- SCTPCTL_DEFAULT_SS_MODULE_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, default_frag_interleave, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_default_frag_interleave), 0, sysctl_sctp_check, "IU",
- SCTPCTL_DEFAULT_FRAG_INTERLEAVE_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, mobility_base, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_mobility_base), 0, sysctl_sctp_check, "IU",
- SCTPCTL_MOBILITY_BASE_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, mobility_fasthandoff, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_mobility_fasthandoff), 0, sysctl_sctp_check, "IU",
- SCTPCTL_MOBILITY_FASTHANDOFF_DESC);
-
+SCTP_UINT_SYSCTL(sendspace, sctp_sendspace, SCTPCTL_MAXDGRAM)
+SCTP_UINT_SYSCTL(recvspace, sctp_recvspace, SCTPCTL_RECVSPACE)
+SCTP_UINT_SYSCTL(auto_asconf, sctp_auto_asconf, SCTPCTL_AUTOASCONF)
+SCTP_UINT_SYSCTL(ecn_enable, sctp_ecn_enable, SCTPCTL_ECN_ENABLE)
+SCTP_UINT_SYSCTL(pr_enable, sctp_pr_enable, SCTPCTL_PR_ENABLE)
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, auth_enable, CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW,
+ NULL, 0, sctp_sysctl_handle_auth, "IU", SCTPCTL_AUTH_ENABLE_DESC);
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, asconf_enable, CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW,
+ NULL, 0, sctp_sysctl_handle_asconf, "IU", SCTPCTL_ASCONF_ENABLE_DESC);
+SCTP_UINT_SYSCTL(reconfig_enable, sctp_reconfig_enable, SCTPCTL_RECONFIG_ENABLE)
+SCTP_UINT_SYSCTL(nrsack_enable, sctp_nrsack_enable, SCTPCTL_NRSACK_ENABLE)
+SCTP_UINT_SYSCTL(pktdrop_enable, sctp_pktdrop_enable, SCTPCTL_PKTDROP_ENABLE)
+SCTP_UINT_SYSCTL(peer_chkoh, sctp_peer_chunk_oh, SCTPCTL_PEER_CHKOH)
+SCTP_UINT_SYSCTL(maxburst, sctp_max_burst_default, SCTPCTL_MAXBURST)
+SCTP_UINT_SYSCTL(fr_maxburst, sctp_fr_max_burst_default, SCTPCTL_FRMAXBURST)
+SCTP_UINT_SYSCTL(maxchunks, sctp_max_chunks_on_queue, SCTPCTL_MAXCHUNKS)
+SCTP_UINT_SYSCTL(tcbhashsize, sctp_hashtblsize, SCTPCTL_TCBHASHSIZE)
+SCTP_UINT_SYSCTL(pcbhashsize, sctp_pcbtblsize, SCTPCTL_PCBHASHSIZE)
+SCTP_UINT_SYSCTL(min_split_point, sctp_min_split_point, SCTPCTL_MIN_SPLIT_POINT)
+SCTP_UINT_SYSCTL(chunkscale, sctp_chunkscale, SCTPCTL_CHUNKSCALE)
+SCTP_UINT_SYSCTL(delayed_sack_time, sctp_delayed_sack_time_default, SCTPCTL_DELAYED_SACK_TIME)
+SCTP_UINT_SYSCTL(sack_freq, sctp_sack_freq_default, SCTPCTL_SACK_FREQ)
+SCTP_UINT_SYSCTL(sys_resource, sctp_system_free_resc_limit, SCTPCTL_SYS_RESOURCE)
+SCTP_UINT_SYSCTL(asoc_resource, sctp_asoc_free_resc_limit, SCTPCTL_ASOC_RESOURCE)
+SCTP_UINT_SYSCTL(heartbeat_interval, sctp_heartbeat_interval_default, SCTPCTL_HEARTBEAT_INTERVAL)
+SCTP_UINT_SYSCTL(pmtu_raise_time, sctp_pmtu_raise_time_default, SCTPCTL_PMTU_RAISE_TIME)
+SCTP_UINT_SYSCTL(shutdown_guard_time, sctp_shutdown_guard_time_default, SCTPCTL_SHUTDOWN_GUARD_TIME)
+SCTP_UINT_SYSCTL(secret_lifetime, sctp_secret_lifetime_default, SCTPCTL_SECRET_LIFETIME)
+SCTP_UINT_SYSCTL(rto_max, sctp_rto_max_default, SCTPCTL_RTO_MAX)
+SCTP_UINT_SYSCTL(rto_min, sctp_rto_min_default, SCTPCTL_RTO_MIN)
+SCTP_UINT_SYSCTL(rto_initial, sctp_rto_initial_default, SCTPCTL_RTO_INITIAL)
+SCTP_UINT_SYSCTL(init_rto_max, sctp_init_rto_max_default, SCTPCTL_INIT_RTO_MAX)
+SCTP_UINT_SYSCTL(valid_cookie_life, sctp_valid_cookie_life_default, SCTPCTL_VALID_COOKIE_LIFE)
+SCTP_UINT_SYSCTL(init_rtx_max, sctp_init_rtx_max_default, SCTPCTL_INIT_RTX_MAX)
+SCTP_UINT_SYSCTL(assoc_rtx_max, sctp_assoc_rtx_max_default, SCTPCTL_ASSOC_RTX_MAX)
+SCTP_UINT_SYSCTL(path_rtx_max, sctp_path_rtx_max_default, SCTPCTL_PATH_RTX_MAX)
+SCTP_UINT_SYSCTL(path_pf_threshold, sctp_path_pf_threshold, SCTPCTL_PATH_PF_THRESHOLD)
+SCTP_UINT_SYSCTL(add_more_on_output, sctp_add_more_threshold, SCTPCTL_ADD_MORE_ON_OUTPUT)
+SCTP_UINT_SYSCTL(incoming_streams, sctp_nr_incoming_streams_default, SCTPCTL_INCOMING_STREAMS)
+SCTP_UINT_SYSCTL(outgoing_streams, sctp_nr_outgoing_streams_default, SCTPCTL_OUTGOING_STREAMS)
+SCTP_UINT_SYSCTL(cmt_on_off, sctp_cmt_on_off, SCTPCTL_CMT_ON_OFF)
+SCTP_UINT_SYSCTL(cmt_use_dac, sctp_cmt_use_dac, SCTPCTL_CMT_USE_DAC)
+SCTP_UINT_SYSCTL(cwnd_maxburst, sctp_use_cwnd_based_maxburst, SCTPCTL_CWND_MAXBURST)
+SCTP_UINT_SYSCTL(nat_friendly, sctp_nat_friendly, SCTPCTL_NAT_FRIENDLY)
+SCTP_UINT_SYSCTL(abc_l_var, sctp_L2_abc_variable, SCTPCTL_ABC_L_VAR)
+SCTP_UINT_SYSCTL(max_chained_mbufs, sctp_mbuf_threshold_count, SCTPCTL_MAX_CHAINED_MBUFS)
+SCTP_UINT_SYSCTL(do_sctp_drain, sctp_do_drain, SCTPCTL_DO_SCTP_DRAIN)
+SCTP_UINT_SYSCTL(hb_max_burst, sctp_hb_maxburst, SCTPCTL_HB_MAX_BURST)
+SCTP_UINT_SYSCTL(abort_at_limit, sctp_abort_if_one_2_one_hits_limit, SCTPCTL_ABORT_AT_LIMIT)
+SCTP_UINT_SYSCTL(min_residual, sctp_min_residual, SCTPCTL_MIN_RESIDUAL)
+SCTP_UINT_SYSCTL(max_retran_chunk, sctp_max_retran_chunk, SCTPCTL_MAX_RETRAN_CHUNK)
+SCTP_UINT_SYSCTL(log_level, sctp_logging_level, SCTPCTL_LOGGING_LEVEL)
+SCTP_UINT_SYSCTL(default_cc_module, sctp_default_cc_module, SCTPCTL_DEFAULT_CC_MODULE)
+SCTP_UINT_SYSCTL(default_ss_module, sctp_default_ss_module, SCTPCTL_DEFAULT_SS_MODULE)
+SCTP_UINT_SYSCTL(default_frag_interleave, sctp_default_frag_interleave, SCTPCTL_DEFAULT_FRAG_INTERLEAVE)
+SCTP_UINT_SYSCTL(mobility_base, sctp_mobility_base, SCTPCTL_MOBILITY_BASE)
+SCTP_UINT_SYSCTL(mobility_fasthandoff, sctp_mobility_fasthandoff, SCTPCTL_MOBILITY_FASTHANDOFF)
#if defined(SCTP_LOCAL_TRACE_BUF)
-SYSCTL_VNET_STRUCT(_net_inet_sctp, OID_AUTO, log, CTLFLAG_RD,
- &SCTP_BASE_SYSCTL(sctp_log), sctp_log,
- "SCTP logging (struct sctp_log)");
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, clear_trace, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_log), 0, sysctl_sctp_cleartrace, "IU",
- "Clear SCTP Logging buffer");
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, log, CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_RD,
+ NULL, 0, sctp_sysctl_handle_trace_log, "S,sctplog", "SCTP logging (struct sctp_log)");
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, clear_trace, CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW,
+ NULL, 0, sctp_sysctl_handle_trace_log_clear, "IU", "Clear SCTP Logging buffer");
#endif
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, udp_tunneling_port, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_udp_tunneling_port), 0, sysctl_sctp_udp_tunneling_check, "IU",
- SCTPCTL_UDP_TUNNELING_PORT_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, enable_sack_immediately, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_enable_sack_immediately), 0, sysctl_sctp_check, "IU",
- SCTPCTL_SACK_IMMEDIATELY_ENABLE_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, nat_friendly_init, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_inits_include_nat_friendly), 0, sysctl_sctp_check, "IU",
- SCTPCTL_NAT_FRIENDLY_INITS_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, vtag_time_wait, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_vtag_time_wait), 0, sysctl_sctp_check, "IU",
- SCTPCTL_TIME_WAIT_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, buffer_splitting, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_buffer_splitting), 0, sysctl_sctp_check, "IU",
- SCTPCTL_BUFFER_SPLITTING_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, initial_cwnd, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_initial_cwnd), 0, sysctl_sctp_check, "IU",
- SCTPCTL_INITIAL_CWND_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, rttvar_bw, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_rttvar_bw), 0, sysctl_sctp_check, "IU",
- SCTPCTL_RTTVAR_BW_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, rttvar_rtt, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_rttvar_rtt), 0, sysctl_sctp_check, "IU",
- SCTPCTL_RTTVAR_RTT_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, rttvar_eqret, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_rttvar_eqret), 0, sysctl_sctp_check, "IU",
- SCTPCTL_RTTVAR_EQRET_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, rttvar_steady_step, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_steady_step), 0, sysctl_sctp_check, "IU",
- SCTPCTL_RTTVAR_STEADYS_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, use_dcccecn, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_use_dccc_ecn), 0, sysctl_sctp_check, "IU",
- SCTPCTL_RTTVAR_DCCCECN_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, blackhole, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_blackhole), 0, sysctl_sctp_check, "IU",
- SCTPCTL_BLACKHOLE_DESC);
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, diag_info_code, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_diag_info_code), 0, sysctl_sctp_check, "IU",
- SCTPCTL_DIAG_INFO_CODE_DESC);
-
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, udp_tunneling_port, CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW,
+ NULL, 0, sctp_sysctl_handle_udp_tunneling, "IU", SCTPCTL_UDP_TUNNELING_PORT_DESC);
+SCTP_UINT_SYSCTL(enable_sack_immediately, sctp_enable_sack_immediately, SCTPCTL_SACK_IMMEDIATELY_ENABLE)
+SCTP_UINT_SYSCTL(nat_friendly_init, sctp_inits_include_nat_friendly, SCTPCTL_NAT_FRIENDLY_INITS)
+SCTP_UINT_SYSCTL(vtag_time_wait, sctp_vtag_time_wait, SCTPCTL_TIME_WAIT)
+SCTP_UINT_SYSCTL(buffer_splitting, sctp_buffer_splitting, SCTPCTL_BUFFER_SPLITTING)
+SCTP_UINT_SYSCTL(initial_cwnd, sctp_initial_cwnd, SCTPCTL_INITIAL_CWND)
+SCTP_UINT_SYSCTL(rttvar_bw, sctp_rttvar_bw, SCTPCTL_RTTVAR_BW)
+SCTP_UINT_SYSCTL(rttvar_rtt, sctp_rttvar_rtt, SCTPCTL_RTTVAR_RTT)
+SCTP_UINT_SYSCTL(rttvar_eqret, sctp_rttvar_eqret, SCTPCTL_RTTVAR_EQRET)
+SCTP_UINT_SYSCTL(rttvar_steady_step, sctp_steady_step, SCTPCTL_RTTVAR_STEADYS)
+SCTP_UINT_SYSCTL(use_dcccecn, sctp_use_dccc_ecn, SCTPCTL_RTTVAR_DCCCECN)
+SCTP_UINT_SYSCTL(blackhole, sctp_blackhole, SCTPCTL_BLACKHOLE)
+SCTP_UINT_SYSCTL(diag_info_code, sctp_diag_info_code, SCTPCTL_DIAG_INFO_CODE)
#ifdef SCTP_DEBUG
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, debug, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_debug_on), 0, sysctl_sctp_check, "IU",
- SCTPCTL_DEBUG_DESC);
+SCTP_UINT_SYSCTL(debug, sctp_debug_on, SCTPCTL_DEBUG)
#endif
-
-
#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, output_unlocked, CTLTYPE_UINT | CTLFLAG_RW,
- &SCTP_BASE_SYSCTL(sctp_output_unlocked), 0, sysctl_sctp_check, "IU",
- SCTPCTL_OUTPUT_UNLOCKED_DESC);
+SCTP_UINT_SYSCTL(output_unlocked, sctp_output_unlocked, SCTPCTL_OUTPUT_UNLOCKED)
#endif
-
-#if defined(__FreeBSD__) && defined(SMP) && defined(SCTP_USE_PERCPU_STAT)
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, stats,
- CTLTYPE_STRUCT | CTLFLAG_RW,
- 0, 0, sysctl_stat_get, "S,sctpstat",
- "SCTP statistics (struct sctp_stat)");
-#else
-SYSCTL_VNET_STRUCT(_net_inet_sctp, OID_AUTO, stats, CTLFLAG_RW,
- &SCTP_BASE_STATS_SYSCTL, sctpstat,
- "SCTP statistics (struct sctp_stat)");
-#endif
-
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, assoclist, CTLTYPE_OPAQUE | CTLFLAG_RD,
- 0, 0, sctp_assoclist,
- "S,xassoc", "List of active SCTP associations");
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, stats, CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_RW,
+ NULL, 0, sctp_sysctl_handle_stats, "S,sctpstat", "SCTP statistics (struct sctp_stat)");
+SYSCTL_PROC(_net_inet_sctp, OID_AUTO, assoclist, CTLFLAG_VNET | CTLTYPE_OPAQUE | CTLFLAG_RD,
+ NULL, 0, sctp_sysctl_handle_assoclist, "S,xassoc", "List of active SCTP associations");
diff --git a/freebsd/sys/netinet/sctp_sysctl.h b/freebsd/sys/netinet/sctp_sysctl.h
index 432d36a4..959bd1e4 100644
--- a/freebsd/sys/netinet/sctp_sysctl.h
+++ b/freebsd/sys/netinet/sctp_sysctl.h
@@ -45,8 +45,13 @@ struct sctp_sysctl {
uint32_t sctp_auto_asconf;
uint32_t sctp_multiple_asconfs;
uint32_t sctp_ecn_enable;
+ uint32_t sctp_pr_enable;
+ uint32_t sctp_auth_enable;
+ uint32_t sctp_asconf_enable;
+ uint32_t sctp_reconfig_enable;
+ uint32_t sctp_nrsack_enable;
+ uint32_t sctp_pktdrop_enable;
uint32_t sctp_fr_max_burst_default;
- uint32_t sctp_strict_sacks;
uint32_t sctp_peer_chunk_oh;
uint32_t sctp_max_burst_default;
uint32_t sctp_max_chunks_on_queue;
@@ -76,18 +81,13 @@ struct sctp_sysctl {
uint32_t sctp_nr_outgoing_streams_default;
uint32_t sctp_cmt_on_off;
uint32_t sctp_cmt_use_dac;
- /* EY 5/5/08 - nr_sack flag variable */
- uint32_t sctp_nr_sack_on_off;
uint32_t sctp_use_cwnd_based_maxburst;
- uint32_t sctp_asconf_auth_nochk;
- uint32_t sctp_auth_disable;
uint32_t sctp_nat_friendly;
uint32_t sctp_L2_abc_variable;
uint32_t sctp_mbuf_threshold_count;
uint32_t sctp_do_drain;
uint32_t sctp_hb_maxburst;
uint32_t sctp_abort_if_one_2_one_hits_limit;
- uint32_t sctp_strict_data_order;
uint32_t sctp_min_residual;
uint32_t sctp_max_retran_chunk;
uint32_t sctp_logging_level;
@@ -141,7 +141,7 @@ struct sctp_sysctl {
#define SCTPCTL_AUTOASCONF_DESC "Enable SCTP Auto-ASCONF"
#define SCTPCTL_AUTOASCONF_MIN 0
#define SCTPCTL_AUTOASCONF_MAX 1
-#define SCTPCTL_AUTOASCONF_DEFAULT SCTP_DEFAULT_AUTO_ASCONF
+#define SCTPCTL_AUTOASCONF_DEFAULT 1
/* autoasconf: Enable SCTP Auto-ASCONF */
#define SCTPCTL_MULTIPLEASCONFS_DESC "Enable SCTP Muliple-ASCONFs"
@@ -155,11 +155,41 @@ struct sctp_sysctl {
#define SCTPCTL_ECN_ENABLE_MAX 1
#define SCTPCTL_ECN_ENABLE_DEFAULT 1
-/* strict_sacks: Enable SCTP Strict SACK checking */
-#define SCTPCTL_STRICT_SACKS_DESC "Enable SCTP Strict SACK checking"
-#define SCTPCTL_STRICT_SACKS_MIN 0
-#define SCTPCTL_STRICT_SACKS_MAX 1
-#define SCTPCTL_STRICT_SACKS_DEFAULT 1
+/* pr_enable: Enable PR-SCTP */
+#define SCTPCTL_PR_ENABLE_DESC "Enable PR-SCTP"
+#define SCTPCTL_PR_ENABLE_MIN 0
+#define SCTPCTL_PR_ENABLE_MAX 1
+#define SCTPCTL_PR_ENABLE_DEFAULT 1
+
+/* auth_enable: Enable SCTP AUTH function */
+#define SCTPCTL_AUTH_ENABLE_DESC "Enable SCTP AUTH function"
+#define SCTPCTL_AUTH_ENABLE_MIN 0
+#define SCTPCTL_AUTH_ENABLE_MAX 1
+#define SCTPCTL_AUTH_ENABLE_DEFAULT 1
+
+/* asconf_enable: Enable SCTP ASCONF */
+#define SCTPCTL_ASCONF_ENABLE_DESC "Enable SCTP ASCONF"
+#define SCTPCTL_ASCONF_ENABLE_MIN 0
+#define SCTPCTL_ASCONF_ENABLE_MAX 1
+#define SCTPCTL_ASCONF_ENABLE_DEFAULT 1
+
+/* reconfig_enable: Enable SCTP RE-CONFIG */
+#define SCTPCTL_RECONFIG_ENABLE_DESC "Enable SCTP RE-CONFIG"
+#define SCTPCTL_RECONFIG_ENABLE_MIN 0
+#define SCTPCTL_RECONFIG_ENABLE_MAX 1
+#define SCTPCTL_RECONFIG_ENABLE_DEFAULT 1
+
+/* nrsack_enable: Enable NR_SACK */
+#define SCTPCTL_NRSACK_ENABLE_DESC "Enable SCTP NR-SACK"
+#define SCTPCTL_NRSACK_ENABLE_MIN 0
+#define SCTPCTL_NRSACK_ENABLE_MAX 1
+#define SCTPCTL_NRSACK_ENABLE_DEFAULT 0
+
+/* pktdrop_enable: Enable SCTP Packet Drop Reports */
+#define SCTPCTL_PKTDROP_ENABLE_DESC "Enable SCTP PKTDROP"
+#define SCTPCTL_PKTDROP_ENABLE_MIN 0
+#define SCTPCTL_PKTDROP_ENABLE_MAX 1
+#define SCTPCTL_PKTDROP_ENABLE_DEFAULT 0
/* loopback_nocsum: Enable NO Csum on packets sent on loopback */
#define SCTPCTL_LOOPBACK_NOCSUM_DESC "Enable NO Csum on packets sent on loopback"
@@ -253,10 +283,10 @@ struct sctp_sysctl {
#define SCTPCTL_PMTU_RAISE_TIME_DEFAULT SCTP_DEF_PMTU_RAISE_SEC
/* shutdown_guard_time: Default shutdown guard timer in seconds */
-#define SCTPCTL_SHUTDOWN_GUARD_TIME_DESC "Default shutdown guard timer in seconds"
+#define SCTPCTL_SHUTDOWN_GUARD_TIME_DESC "Shutdown guard timer in seconds (0 means 5 times RTO.Max)"
#define SCTPCTL_SHUTDOWN_GUARD_TIME_MIN 0
#define SCTPCTL_SHUTDOWN_GUARD_TIME_MAX 0xFFFFFFFF
-#define SCTPCTL_SHUTDOWN_GUARD_TIME_DEFAULT SCTP_DEF_MAX_SHUTDOWN_SEC
+#define SCTPCTL_SHUTDOWN_GUARD_TIME_DEFAULT 0
/* secret_lifetime: Default secret lifetime in seconds */
#define SCTPCTL_SECRET_LIFETIME_DESC "Default secret lifetime in seconds"
@@ -342,12 +372,6 @@ struct sctp_sysctl {
#define SCTPCTL_CMT_ON_OFF_MAX SCTP_CMT_MAX
#define SCTPCTL_CMT_ON_OFF_DEFAULT SCTP_CMT_OFF
-/* EY - nr_sack_on_off: NR_SACK on/off flag */
-#define SCTPCTL_NR_SACK_ON_OFF_DESC "NR_SACK on/off flag"
-#define SCTPCTL_NR_SACK_ON_OFF_MIN 0
-#define SCTPCTL_NR_SACK_ON_OFF_MAX 1
-#define SCTPCTL_NR_SACK_ON_OFF_DEFAULT 0
-
/* cmt_use_dac: CMT DAC on/off flag */
#define SCTPCTL_CMT_USE_DAC_DESC "CMT DAC on/off flag"
#define SCTPCTL_CMT_USE_DAC_MIN 0
@@ -360,18 +384,6 @@ struct sctp_sysctl {
#define SCTPCTL_CWND_MAXBURST_MAX 1
#define SCTPCTL_CWND_MAXBURST_DEFAULT 1
-/* asconf_auth_nochk: Disable SCTP ASCONF AUTH requirement */
-#define SCTPCTL_ASCONF_AUTH_NOCHK_DESC "Disable SCTP ASCONF AUTH requirement"
-#define SCTPCTL_ASCONF_AUTH_NOCHK_MIN 0
-#define SCTPCTL_ASCONF_AUTH_NOCHK_MAX 1
-#define SCTPCTL_ASCONF_AUTH_NOCHK_DEFAULT 0
-
-/* auth_disable: Disable SCTP AUTH function */
-#define SCTPCTL_AUTH_DISABLE_DESC "Disable SCTP AUTH function"
-#define SCTPCTL_AUTH_DISABLE_MIN 0
-#define SCTPCTL_AUTH_DISABLE_MAX 1
-#define SCTPCTL_AUTH_DISABLE_DEFAULT 0
-
/* nat_friendly: SCTP NAT friendly operation */
#define SCTPCTL_NAT_FRIENDLY_DESC "SCTP NAT friendly operation"
#define SCTPCTL_NAT_FRIENDLY_MIN 0
@@ -408,12 +420,6 @@ struct sctp_sysctl {
#define SCTPCTL_ABORT_AT_LIMIT_MAX 1
#define SCTPCTL_ABORT_AT_LIMIT_DEFAULT 0
-/* strict_data_order: Enforce strict data ordering, abort if control inside data */
-#define SCTPCTL_STRICT_DATA_ORDER_DESC "Enforce strict data ordering, abort if control inside data"
-#define SCTPCTL_STRICT_DATA_ORDER_MIN 0
-#define SCTPCTL_STRICT_DATA_ORDER_MAX 1
-#define SCTPCTL_STRICT_DATA_ORDER_DEFAULT 0
-
/* min_residual: min residual in a data fragment leftover */
#define SCTPCTL_MIN_RESIDUAL_DESC "Minimum residual data chunk in second part of split"
#define SCTPCTL_MIN_RESIDUAL_MIN 20
@@ -454,13 +460,13 @@ struct sctp_sysctl {
#define SCTPCTL_MOBILITY_BASE_DESC "Enable SCTP base mobility"
#define SCTPCTL_MOBILITY_BASE_MIN 0
#define SCTPCTL_MOBILITY_BASE_MAX 1
-#define SCTPCTL_MOBILITY_BASE_DEFAULT SCTP_DEFAULT_MOBILITY_BASE
+#define SCTPCTL_MOBILITY_BASE_DEFAULT 0
/* mobility_fasthandoff: Enable SCTP fast handoff support */
#define SCTPCTL_MOBILITY_FASTHANDOFF_DESC "Enable SCTP fast handoff"
#define SCTPCTL_MOBILITY_FASTHANDOFF_MIN 0
#define SCTPCTL_MOBILITY_FASTHANDOFF_MAX 1
-#define SCTPCTL_MOBILITY_FASTHANDOFF_DEFAULT SCTP_DEFAULT_MOBILITY_FASTHANDOFF
+#define SCTPCTL_MOBILITY_FASTHANDOFF_DEFAULT 0
/* Enable SCTP/UDP tunneling port */
#define SCTPCTL_UDP_TUNNELING_PORT_DESC "Set the SCTP/UDP tunneling port"
@@ -472,7 +478,7 @@ struct sctp_sysctl {
#define SCTPCTL_SACK_IMMEDIATELY_ENABLE_DESC "Enable sending of the SACK-IMMEDIATELY-bit."
#define SCTPCTL_SACK_IMMEDIATELY_ENABLE_MIN 0
#define SCTPCTL_SACK_IMMEDIATELY_ENABLE_MAX 1
-#define SCTPCTL_SACK_IMMEDIATELY_ENABLE_DEFAULT SCTPCTL_SACK_IMMEDIATELY_ENABLE_MIN
+#define SCTPCTL_SACK_IMMEDIATELY_ENABLE_DEFAULT SCTPCTL_SACK_IMMEDIATELY_ENABLE_MAX
/* Enable sending of the NAT-FRIENDLY message */
#define SCTPCTL_NAT_FRIENDLY_INITS_DESC "Enable sending of the nat-friendly SCTP option on INITs."
@@ -525,7 +531,7 @@ struct sctp_sysctl {
#define SCTPCTL_RTTVAR_DCCCECN_MAX 1
#define SCTPCTL_RTTVAR_DCCCECN_DEFAULT 1 /* 0 means disable feature */
-#define SCTPCTL_BLACKHOLE_DESC "Enable SCTP blackholing"
+#define SCTPCTL_BLACKHOLE_DESC "Enable SCTP blackholing. See blackhole(4) for more details."
#define SCTPCTL_BLACKHOLE_MIN 0
#define SCTPCTL_BLACKHOLE_MAX 2
#define SCTPCTL_BLACKHOLE_DEFAULT SCTPCTL_BLACKHOLE_MIN
diff --git a/freebsd/sys/netinet/sctp_timer.c b/freebsd/sys/netinet/sctp_timer.c
index 7d010c7b..c851317b 100644
--- a/freebsd/sys/netinet/sctp_timer.c
+++ b/freebsd/sys/netinet/sctp_timer.c
@@ -51,7 +51,9 @@ __FBSDID("$FreeBSD$");
#include <netinet/sctp_input.h>
#include <netinet/sctp.h>
#include <netinet/sctp_uio.h>
+#if defined(INET) || defined(INET6)
#include <netinet/udp.h>
+#endif
void
@@ -85,7 +87,7 @@ sctp_audit_retranmission_queue(struct sctp_association *asoc)
asoc->sent_queue_cnt);
}
-int
+static int
sctp_threshold_management(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
struct sctp_nets *net, uint16_t threshold)
{
@@ -110,8 +112,10 @@ sctp_threshold_management(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
net->dest_state |= SCTP_ADDR_PF;
net->last_active = sctp_get_tick_count();
sctp_send_hb(stcb, net, SCTP_SO_NOT_LOCKED);
- sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_TIMER + SCTP_LOC_3);
- sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net);
+ sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT,
+ inp, stcb, net,
+ SCTP_FROM_SCTP_TIMER + SCTP_LOC_1);
+ sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net);
}
}
}
@@ -151,9 +155,9 @@ sctp_threshold_management(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
/* Abort notification sends a ULP notify */
struct mbuf *op_err;
- op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION,
- "Association error couter exceeded");
- inp->last_abort_code = SCTP_FROM_SCTP_TIMER + SCTP_LOC_1;
+ op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code),
+ "Association error counter exceeded");
+ inp->last_abort_code = SCTP_FROM_SCTP_TIMER + SCTP_LOC_2;
sctp_abort_an_association(inp, stcb, op_err, SCTP_SO_NOT_LOCKED);
return (1);
}
@@ -337,7 +341,7 @@ sctp_find_alternate_net(struct sctp_tcb *stcb,
return (NULL);
}
}
- do {
+ for (;;) {
alt = TAILQ_NEXT(mnet, sctp_next);
if (alt == NULL) {
once++;
@@ -356,7 +360,6 @@ sctp_find_alternate_net(struct sctp_tcb *stcb,
}
alt->src_addr_selected = 0;
}
- /* sa_ignore NO_NULL_CHK */
if (((alt->dest_state & SCTP_ADDR_REACHABLE) == SCTP_ADDR_REACHABLE) &&
(alt->ro.ro_rt != NULL) &&
(!(alt->dest_state & SCTP_ADDR_UNCONFIRMED))) {
@@ -364,14 +367,14 @@ sctp_find_alternate_net(struct sctp_tcb *stcb,
break;
}
mnet = alt;
- } while (alt != NULL);
+ }
if (alt == NULL) {
/* Case where NO insv network exists (dormant state) */
/* we rotate destinations */
once = 0;
mnet = net;
- do {
+ for (;;) {
if (mnet == NULL) {
return (TAILQ_FIRST(&stcb->asoc.nets));
}
@@ -382,15 +385,17 @@ sctp_find_alternate_net(struct sctp_tcb *stcb,
break;
}
alt = TAILQ_FIRST(&stcb->asoc.nets);
+ if (alt == NULL) {
+ break;
+ }
}
- /* sa_ignore NO_NULL_CHK */
if ((!(alt->dest_state & SCTP_ADDR_UNCONFIRMED)) &&
(alt != net)) {
/* Found an alternate address */
break;
}
mnet = alt;
- } while (alt != NULL);
+ }
}
if (alt == NULL) {
return (net);
@@ -405,7 +410,11 @@ sctp_backoff_on_timeout(struct sctp_tcb *stcb,
int num_marked, int num_abandoned)
{
if (net->RTO == 0) {
- net->RTO = stcb->asoc.minrto;
+ if (net->RTO_measured) {
+ net->RTO = stcb->asoc.minrto;
+ } else {
+ net->RTO = stcb->asoc.initial_rto;
+ }
}
net->RTO <<= 1;
if (net->RTO > stcb->asoc.maxrto) {
@@ -435,6 +444,11 @@ sctp_recover_sent_list(struct sctp_tcb *stcb)
asoc->strmout[chk->rec.data.stream_number].chunks_on_queues--;
}
}
+ if ((asoc->strmout[chk->rec.data.stream_number].chunks_on_queues == 0) &&
+ (asoc->strmout[chk->rec.data.stream_number].state == SCTP_STREAM_RESET_PENDING) &&
+ TAILQ_EMPTY(&asoc->strmout[chk->rec.data.stream_number].outqueue)) {
+ asoc->trigger_reset = 1;
+ }
TAILQ_REMOVE(&asoc->sent_queue, chk, sctp_next);
if (PR_SCTP_ENABLED(chk->flags)) {
if (asoc->pr_sctp_cnt != 0)
@@ -445,7 +459,7 @@ sctp_recover_sent_list(struct sctp_tcb *stcb)
sctp_free_bufspace(stcb, asoc, chk, 1);
sctp_m_freem(chk->data);
chk->data = NULL;
- if (asoc->peer_supports_prsctp && PR_SCTP_BUF_ENABLED(chk->flags)) {
+ if (asoc->prsctp_supported && PR_SCTP_BUF_ENABLED(chk->flags)) {
asoc->sent_queue_cnt_removeable--;
}
}
@@ -600,7 +614,7 @@ start_again:
continue;
}
}
- if (stcb->asoc.peer_supports_prsctp && PR_SCTP_TTL_ENABLED(chk->flags)) {
+ if (stcb->asoc.prsctp_supported && PR_SCTP_TTL_ENABLED(chk->flags)) {
/* Is it expired? */
if (timevalcmp(&now, &chk->rec.data.timetodrop, >)) {
/* Yes so drop it */
@@ -614,7 +628,7 @@ start_again:
continue;
}
}
- if (stcb->asoc.peer_supports_prsctp && PR_SCTP_RTX_ENABLED(chk->flags)) {
+ if (stcb->asoc.prsctp_supported && PR_SCTP_RTX_ENABLED(chk->flags)) {
/* Has it been retransmitted tv_sec times? */
if (chk->snd_count > chk->rec.data.timetodrop.tv_sec) {
if (chk->data) {
@@ -650,7 +664,7 @@ start_again:
sctp_misc_ints(SCTP_FLIGHT_LOG_DOWN_RSND_TO,
chk->whoTo->flight_size,
chk->book_size,
- (uintptr_t) chk->whoTo,
+ (uint32_t) (uintptr_t) chk->whoTo,
chk->rec.data.TSN_seq);
}
sctp_flight_size_decrease(chk);
@@ -778,7 +792,7 @@ start_again:
sctp_misc_ints(SCTP_FLIGHT_LOG_UP,
chk->whoTo->flight_size,
chk->book_size,
- (uintptr_t) chk->whoTo,
+ (uint32_t) (uintptr_t) chk->whoTo,
chk->rec.data.TSN_seq);
}
sctp_flight_size_increase(chk);
@@ -957,7 +971,7 @@ sctp_t3rxt_timer(struct sctp_inpcb *inp,
sctp_timer_start(SCTP_TIMER_TYPE_SEND, inp, stcb, net);
return (0);
}
- if (stcb->asoc.peer_supports_prsctp) {
+ if (stcb->asoc.prsctp_supported) {
struct sctp_tmit_chunk *lchk;
lchk = sctp_try_advance_peer_ack_point(stcb, &stcb->asoc);
@@ -1043,9 +1057,9 @@ sctp_cookie_timer(struct sctp_inpcb *inp,
/* FOOBAR! */
struct mbuf *op_err;
- op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION,
+ op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code),
"Cookie timer expired, but no cookie");
- inp->last_abort_code = SCTP_FROM_SCTP_TIMER + SCTP_LOC_4;
+ inp->last_abort_code = SCTP_FROM_SCTP_TIMER + SCTP_LOC_3;
sctp_abort_an_association(inp, stcb, op_err, SCTP_SO_NOT_LOCKED);
} else {
#ifdef INVARIANTS
@@ -1064,8 +1078,8 @@ sctp_cookie_timer(struct sctp_inpcb *inp,
return (1);
}
/*
- * cleared theshold management now lets backoff the address & select
- * an alternate
+ * Cleared threshold management, now lets backoff the address and
+ * select an alternate
*/
stcb->asoc.dropped_special_cnt = 0;
sctp_backoff_on_timeout(stcb, cookie->whoTo, 1, 0, 0);
@@ -1110,8 +1124,8 @@ sctp_strreset_timer(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
return (1);
}
/*
- * cleared theshold management now lets backoff the address & select
- * an alternate
+ * Cleared threshold management, now lets backoff the address and
+ * select an alternate
*/
sctp_backoff_on_timeout(stcb, strrst->whoTo, 1, 0, 0);
alt = sctp_find_alternate_net(stcb, strrst->whoTo, 0);
@@ -1270,7 +1284,7 @@ sctp_shutdown_timer(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
{
struct sctp_nets *alt;
- /* first threshold managment */
+ /* first threshold management */
if (sctp_threshold_management(inp, stcb, net, stcb->asoc.max_send_times)) {
/* Assoc is over */
return (1);
@@ -1293,7 +1307,7 @@ sctp_shutdownack_timer(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
{
struct sctp_nets *alt;
- /* first threshold managment */
+ /* first threshold management */
if (sctp_threshold_management(inp, stcb, net, stcb->asoc.max_send_times)) {
/* Assoc is over */
return (1);
@@ -1482,11 +1496,15 @@ sctp_pathmtu_timer(struct sctp_inpcb *inp,
}
if (net->ro._s_addr) {
mtu = SCTP_GATHER_MTU_FROM_ROUTE(net->ro._s_addr, &net->ro._s_addr.sa, net->ro.ro_rt);
+#if defined(INET) || defined(INET6)
if (net->port) {
mtu -= sizeof(struct udphdr);
}
+#endif
if (mtu > next_mtu) {
net->mtu = next_mtu;
+ } else {
+ net->mtu = mtu;
}
}
}
diff --git a/freebsd/sys/netinet/sctp_timer.h b/freebsd/sys/netinet/sctp_timer.h
index fd9df804..6d409cdc 100644
--- a/freebsd/sys/netinet/sctp_timer.h
+++ b/freebsd/sys/netinet/sctp_timer.h
@@ -46,10 +46,6 @@ sctp_find_alternate_net(struct sctp_tcb *,
struct sctp_nets *, int mode);
int
-sctp_threshold_management(struct sctp_inpcb *, struct sctp_tcb *,
- struct sctp_nets *, uint16_t);
-
-int
sctp_t3rxt_timer(struct sctp_inpcb *, struct sctp_tcb *,
struct sctp_nets *);
int
diff --git a/freebsd/sys/netinet/sctp_uio.h b/freebsd/sys/netinet/sctp_uio.h
index 968fc980..e65b7b5e 100644
--- a/freebsd/sys/netinet/sctp_uio.h
+++ b/freebsd/sys/netinet/sctp_uio.h
@@ -134,20 +134,27 @@ struct sctp_extrcvinfo {
uint16_t sinfo_flags;
uint32_t sinfo_ppid;
uint32_t sinfo_context;
- uint32_t sinfo_timetolive;
+ uint32_t sinfo_timetolive; /* should have been sinfo_pr_value */
uint32_t sinfo_tsn;
uint32_t sinfo_cumtsn;
sctp_assoc_t sinfo_assoc_id;
- uint16_t sreinfo_next_flags;
- uint16_t sreinfo_next_stream;
- uint32_t sreinfo_next_aid;
- uint32_t sreinfo_next_length;
- uint32_t sreinfo_next_ppid;
+ uint16_t serinfo_next_flags;
+ uint16_t serinfo_next_stream;
+ uint32_t serinfo_next_aid;
+ uint32_t serinfo_next_length;
+ uint32_t serinfo_next_ppid;
uint16_t sinfo_keynumber;
uint16_t sinfo_keynumber_valid;
uint8_t __reserve_pad[SCTP_ALIGN_RESV_PAD_SHORT];
};
+#define sinfo_pr_value sinfo_timetolive
+#define sreinfo_next_flags serinfo_next_flags
+#define sreinfo_next_stream serinfo_next_stream
+#define sreinfo_next_aid serinfo_next_aid
+#define sreinfo_next_length serinfo_next_length
+#define sreinfo_next_ppid serinfo_next_ppid
+
struct sctp_sndinfo {
uint16_t snd_sid;
uint16_t snd_flags;
@@ -249,18 +256,24 @@ struct sctp_snd_all_completes {
SCTP_SACK_IMMEDIATELY)) != 0)
/* for the endpoint */
-/* The lower byte is an enumeration of PR-SCTP policies */
+/* The lower four bits is an enumeration of PR-SCTP policies */
#define SCTP_PR_SCTP_NONE 0x0000/* Reliable transfer */
#define SCTP_PR_SCTP_TTL 0x0001/* Time based PR-SCTP */
-#define SCTP_PR_SCTP_BUF 0x0002/* Buffer based PR-SCTP */
+#define SCTP_PR_SCTP_PRIO 0x0002/* Buffer based PR-SCTP */
+#define SCTP_PR_SCTP_BUF SCTP_PR_SCTP_PRIO /* For backwards compatibility */
#define SCTP_PR_SCTP_RTX 0x0003/* Number of retransmissions based PR-SCTP */
+#define SCTP_PR_SCTP_MAX SCTP_PR_SCTP_RTX
+#define SCTP_PR_SCTP_ALL 0x000f/* Used for aggregated stats */
#define PR_SCTP_POLICY(x) ((x) & 0x0f)
-#define PR_SCTP_ENABLED(x) (PR_SCTP_POLICY(x) != SCTP_PR_SCTP_NONE)
+#define PR_SCTP_ENABLED(x) ((PR_SCTP_POLICY(x) != SCTP_PR_SCTP_NONE) && \
+ (PR_SCTP_POLICY(x) != SCTP_PR_SCTP_ALL))
#define PR_SCTP_TTL_ENABLED(x) (PR_SCTP_POLICY(x) == SCTP_PR_SCTP_TTL)
#define PR_SCTP_BUF_ENABLED(x) (PR_SCTP_POLICY(x) == SCTP_PR_SCTP_BUF)
#define PR_SCTP_RTX_ENABLED(x) (PR_SCTP_POLICY(x) == SCTP_PR_SCTP_RTX)
-#define PR_SCTP_INVALID_POLICY(x) (PR_SCTP_POLICY(x) > SCTP_PR_SCTP_RTX)
+#define PR_SCTP_INVALID_POLICY(x) (PR_SCTP_POLICY(x) > SCTP_PR_SCTP_MAX)
+#define PR_SCTP_VALID_POLICY(x) (PR_SCTP_POLICY(x) <= SCTP_PR_SCTP_MAX)
+
/* Stat's */
struct sctp_pcbinfo {
uint32_t ep_count;
@@ -306,12 +319,13 @@ struct sctp_assoc_change {
#define SCTP_CANT_STR_ASSOC 0x0005
/* sac_info values */
-#define SCTP_ASSOC_SUPPORTS_PR 0x01
-#define SCTP_ASSOC_SUPPORTS_AUTH 0x02
-#define SCTP_ASSOC_SUPPORTS_ASCONF 0x03
-#define SCTP_ASSOC_SUPPORTS_MULTIBUF 0x04
-#define SCTP_ASSOC_SUPPORTS_RE_CONFIG 0x05
-#define SCTP_ASSOC_SUPPORTS_MAX 0x05
+#define SCTP_ASSOC_SUPPORTS_PR 0x01
+#define SCTP_ASSOC_SUPPORTS_AUTH 0x02
+#define SCTP_ASSOC_SUPPORTS_ASCONF 0x03
+#define SCTP_ASSOC_SUPPORTS_MULTIBUF 0x04
+#define SCTP_ASSOC_SUPPORTS_RE_CONFIG 0x05
+#define SCTP_ASSOC_SUPPORTS_INTERLEAVING 0x06
+#define SCTP_ASSOC_SUPPORTS_MAX 0x06
/*
* Address event
*/
@@ -323,7 +337,6 @@ struct sctp_paddr_change {
uint32_t spc_state;
uint32_t spc_error;
sctp_assoc_t spc_assoc_id;
- uint8_t spc_padding[4];
};
/* paddr state values */
@@ -346,7 +359,7 @@ struct sctp_remote_error {
uint32_t sre_length;
uint16_t sre_error;
sctp_assoc_t sre_assoc_id;
- uint8_t sre_data[4];
+ uint8_t sre_data[];
};
/* data send failure event (deprecated) */
@@ -578,6 +591,7 @@ struct sctp_paddrthlds {
sctp_assoc_t spt_assoc_id;
uint16_t spt_pathmaxrxt;
uint16_t spt_pathpfthld;
+ uint16_t spt_pathcpthld;
};
struct sctp_paddrinfo {
@@ -720,6 +734,14 @@ struct sctp_udpencaps {
uint16_t sue_port;
};
+struct sctp_prstatus {
+ sctp_assoc_t sprstat_assoc_id;
+ uint16_t sprstat_sid;
+ uint16_t sprstat_policy;
+ uint64_t sprstat_abandoned_unsent;
+ uint64_t sprstat_abandoned_sent;
+};
+
struct sctp_cwnd_args {
struct sctp_nets *net; /* network to *//* FIXME: LP64 issue */
uint32_t cwnd_new_value;/* cwnd in k */
@@ -1145,15 +1167,22 @@ union sctp_sockstore {
struct xsctp_inpcb {
uint32_t last;
uint32_t flags;
- uint32_t features;
+ uint64_t features;
uint32_t total_sends;
uint32_t total_recvs;
uint32_t total_nospaces;
uint32_t fragmentation_point;
uint16_t local_port;
- uint16_t qlen;
- uint16_t maxqlen;
- uint32_t extra_padding[32]; /* future */
+ uint16_t qlen_old;
+ uint16_t maxqlen_old;
+ void *socket;
+ uint32_t qlen;
+ uint32_t maxqlen;
+#if defined(__LP64__)
+ uint32_t extra_padding[27]; /* future */
+#else
+ uint32_t extra_padding[28]; /* future */
+#endif
};
struct xsctp_tcb {
@@ -1211,7 +1240,8 @@ struct xsctp_raddr {
struct sctp_timeval start_time; /* sctpAssocLocalRemEntry 8 */
uint32_t rtt;
uint32_t heartbeat_interval;
- uint32_t extra_padding[31]; /* future */
+ uint32_t ssthresh;
+ uint32_t extra_padding[30]; /* future */
};
#define SCTP_MAX_LOGGING_SIZE 30000
diff --git a/freebsd/sys/netinet/sctp_usrreq.c b/freebsd/sys/netinet/sctp_usrreq.c
index b19a7499..1cbb7076 100644
--- a/freebsd/sys/netinet/sctp_usrreq.c
+++ b/freebsd/sys/netinet/sctp_usrreq.c
@@ -41,6 +41,7 @@ __FBSDID("$FreeBSD$");
#include <netinet/sctp_header.h>
#include <netinet/sctp_var.h>
#ifdef INET6
+#include <netinet6/sctp6_var.h>
#endif
#include <netinet/sctp_sysctl.h>
#include <netinet/sctp_output.h>
@@ -55,8 +56,8 @@ __FBSDID("$FreeBSD$");
-extern struct sctp_cc_functions sctp_cc_functions[];
-extern struct sctp_ss_functions sctp_ss_functions[];
+extern const struct sctp_cc_functions sctp_cc_functions[];
+extern const struct sctp_ss_functions sctp_ss_functions[];
void
sctp_init(void)
@@ -90,13 +91,15 @@ sctp_init(void)
#endif
}
-void
-sctp_finish(void)
+#ifdef VIMAGE
+static void
+sctp_finish(void *unused __unused)
{
sctp_pcb_finish();
}
-
+VNET_SYSUNINIT(sctp, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, sctp_finish, NULL);
+#endif
void
sctp_pathmtu_adjustment(struct sctp_tcb *stcb, uint16_t nxtsz)
@@ -126,148 +129,55 @@ sctp_pathmtu_adjustment(struct sctp_tcb *stcb, uint16_t nxtsz)
if (chk->sent < SCTP_DATAGRAM_RESEND) {
sctp_flight_size_decrease(chk);
sctp_total_flight_decrease(stcb, chk);
- }
- if (chk->sent != SCTP_DATAGRAM_RESEND) {
+ chk->sent = SCTP_DATAGRAM_RESEND;
sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt);
+ chk->rec.data.doing_fast_retransmit = 0;
+ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FLIGHT_LOGGING_ENABLE) {
+ sctp_misc_ints(SCTP_FLIGHT_LOG_DOWN_PMTU,
+ chk->whoTo->flight_size,
+ chk->book_size,
+ (uint32_t) (uintptr_t) chk->whoTo,
+ chk->rec.data.TSN_seq);
+ }
+ /* Clear any time so NO RTT is being done */
+ chk->do_rtt = 0;
}
- chk->sent = SCTP_DATAGRAM_RESEND;
- chk->rec.data.doing_fast_retransmit = 0;
- if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FLIGHT_LOGGING_ENABLE) {
- sctp_misc_ints(SCTP_FLIGHT_LOG_DOWN_PMTU,
- chk->whoTo->flight_size,
- chk->book_size,
- (uintptr_t) chk->whoTo,
- chk->rec.data.TSN_seq);
- }
- /* Clear any time so NO RTT is being done */
- chk->do_rtt = 0;
}
}
}
#ifdef INET
-static void
-sctp_notify_mbuf(struct sctp_inpcb *inp,
- struct sctp_tcb *stcb,
- struct sctp_nets *net,
- struct ip *ip,
- struct sctphdr *sh)
-{
- struct icmp *icmph;
- int totsz, tmr_stopped = 0;
- uint16_t nxtsz;
-
- /* protection */
- if ((inp == NULL) || (stcb == NULL) || (net == NULL) ||
- (ip == NULL) || (sh == NULL)) {
- if (stcb != NULL) {
- SCTP_TCB_UNLOCK(stcb);
- }
- return;
- }
- /* First job is to verify the vtag matches what I would send */
- if (ntohl(sh->v_tag) != (stcb->asoc.peer_vtag)) {
- SCTP_TCB_UNLOCK(stcb);
- return;
- }
- icmph = (struct icmp *)((caddr_t)ip - (sizeof(struct icmp) -
- sizeof(struct ip)));
- if (icmph->icmp_type != ICMP_UNREACH) {
- /* We only care about unreachable */
- SCTP_TCB_UNLOCK(stcb);
- return;
- }
- if (icmph->icmp_code != ICMP_UNREACH_NEEDFRAG) {
- /* not a unreachable message due to frag. */
- SCTP_TCB_UNLOCK(stcb);
- return;
- }
- totsz = ip->ip_len;
-
- nxtsz = ntohs(icmph->icmp_nextmtu);
- if (nxtsz == 0) {
- /*
- * old type router that does not tell us what the next size
- * mtu is. Rats we will have to guess (in a educated fashion
- * of course)
- */
- nxtsz = sctp_get_prev_mtu(totsz);
- }
- /* Stop any PMTU timer */
- if (SCTP_OS_TIMER_PENDING(&net->pmtu_timer.timer)) {
- tmr_stopped = 1;
- sctp_timer_stop(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net,
- SCTP_FROM_SCTP_USRREQ + SCTP_LOC_1);
- }
- /* Adjust destination size limit */
- if (net->mtu > nxtsz) {
- net->mtu = nxtsz;
- if (net->port) {
- net->mtu -= sizeof(struct udphdr);
- }
- }
- /* now what about the ep? */
- if (stcb->asoc.smallest_mtu > nxtsz) {
- sctp_pathmtu_adjustment(stcb, nxtsz);
- }
- if (tmr_stopped)
- sctp_timer_start(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net);
-
- SCTP_TCB_UNLOCK(stcb);
-}
-
-#endif
-
void
sctp_notify(struct sctp_inpcb *inp,
- struct ip *ip,
- struct sctphdr *sh,
- struct sockaddr *to,
struct sctp_tcb *stcb,
- struct sctp_nets *net)
+ struct sctp_nets *net,
+ uint8_t icmp_type,
+ uint8_t icmp_code,
+ uint16_t ip_len,
+ uint16_t next_mtu)
{
#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
struct socket *so;
#endif
- struct icmp *icmph;
+ int timer_stopped;
- /* protection */
- if ((inp == NULL) || (stcb == NULL) || (net == NULL) ||
- (sh == NULL) || (to == NULL)) {
- if (stcb)
- SCTP_TCB_UNLOCK(stcb);
- return;
- }
- /* First job is to verify the vtag matches what I would send */
- if (ntohl(sh->v_tag) != (stcb->asoc.peer_vtag)) {
- SCTP_TCB_UNLOCK(stcb);
- return;
- }
- icmph = (struct icmp *)((caddr_t)ip - (sizeof(struct icmp) -
- sizeof(struct ip)));
- if (icmph->icmp_type != ICMP_UNREACH) {
+ if (icmp_type != ICMP_UNREACH) {
/* We only care about unreachable */
SCTP_TCB_UNLOCK(stcb);
return;
}
- if ((icmph->icmp_code == ICMP_UNREACH_NET) ||
- (icmph->icmp_code == ICMP_UNREACH_HOST) ||
- (icmph->icmp_code == ICMP_UNREACH_NET_UNKNOWN) ||
- (icmph->icmp_code == ICMP_UNREACH_HOST_UNKNOWN) ||
- (icmph->icmp_code == ICMP_UNREACH_ISOLATED) ||
- (icmph->icmp_code == ICMP_UNREACH_NET_PROHIB) ||
- (icmph->icmp_code == ICMP_UNREACH_HOST_PROHIB) ||
- (icmph->icmp_code == ICMP_UNREACH_FILTER_PROHIB)) {
-
- /*
- * Hmm reachablity problems we must examine closely. If its
- * not reachable, we may have lost a network. Or if there is
- * NO protocol at the other end named SCTP. well we consider
- * it a OOTB abort.
- */
+ if ((icmp_code == ICMP_UNREACH_NET) ||
+ (icmp_code == ICMP_UNREACH_HOST) ||
+ (icmp_code == ICMP_UNREACH_NET_UNKNOWN) ||
+ (icmp_code == ICMP_UNREACH_HOST_UNKNOWN) ||
+ (icmp_code == ICMP_UNREACH_ISOLATED) ||
+ (icmp_code == ICMP_UNREACH_NET_PROHIB) ||
+ (icmp_code == ICMP_UNREACH_HOST_PROHIB) ||
+ (icmp_code == ICMP_UNREACH_FILTER_PROHIB)) {
+ /* Mark the net unreachable. */
if (net->dest_state & SCTP_ADDR_REACHABLE) {
- /* Ok that destination is NOT reachable */
+ /* OK, that destination is NOT reachable. */
net->dest_state &= ~SCTP_ADDR_REACHABLE;
net->dest_state &= ~SCTP_ADDR_PF;
sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_DOWN,
@@ -275,15 +185,9 @@ sctp_notify(struct sctp_inpcb *inp,
(void *)net, SCTP_SO_NOT_LOCKED);
}
SCTP_TCB_UNLOCK(stcb);
- } else if ((icmph->icmp_code == ICMP_UNREACH_PROTOCOL) ||
- (icmph->icmp_code == ICMP_UNREACH_PORT)) {
- /*
- * Here the peer is either playing tricks on us, including
- * an address that belongs to someone who does not support
- * SCTP OR was a userland implementation that shutdown and
- * now is dead. In either case treat it like a OOTB abort
- * with no TCB
- */
+ } else if ((icmp_code == ICMP_UNREACH_PROTOCOL) ||
+ (icmp_code == ICMP_UNREACH_PORT)) {
+ /* Treat it like an ABORT. */
sctp_abort_notification(stcb, 1, 0, NULL, SCTP_SO_NOT_LOCKED);
#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
so = SCTP_INP_SO(inp);
@@ -293,72 +197,141 @@ sctp_notify(struct sctp_inpcb *inp,
SCTP_TCB_LOCK(stcb);
atomic_subtract_int(&stcb->asoc.refcnt, 1);
#endif
- (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_2);
+ (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC,
+ SCTP_FROM_SCTP_USRREQ + SCTP_LOC_2);
#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
SCTP_SOCKET_UNLOCK(so, 1);
/* SCTP_TCB_UNLOCK(stcb); MT: I think this is not needed. */
#endif
/* no need to unlock here, since the TCB is gone */
+ } else if (icmp_code == ICMP_UNREACH_NEEDFRAG) {
+ /* Find the next (smaller) MTU */
+ if (next_mtu == 0) {
+ /*
+ * Old type router that does not tell us what the
+ * next MTU is. Rats we will have to guess (in a
+ * educated fashion of course).
+ */
+ next_mtu = sctp_get_prev_mtu(ip_len);
+ }
+ /* Stop the PMTU timer. */
+ if (SCTP_OS_TIMER_PENDING(&net->pmtu_timer.timer)) {
+ timer_stopped = 1;
+ sctp_timer_stop(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net,
+ SCTP_FROM_SCTP_USRREQ + SCTP_LOC_1);
+ } else {
+ timer_stopped = 0;
+ }
+ /* Update the path MTU. */
+ if (net->mtu > next_mtu) {
+ net->mtu = next_mtu;
+ if (net->port) {
+ net->mtu -= sizeof(struct udphdr);
+ }
+ }
+ /* Update the association MTU */
+ if (stcb->asoc.smallest_mtu > next_mtu) {
+ sctp_pathmtu_adjustment(stcb, next_mtu);
+ }
+ /* Finally, start the PMTU timer if it was running before. */
+ if (timer_stopped) {
+ sctp_timer_start(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net);
+ }
+ SCTP_TCB_UNLOCK(stcb);
} else {
SCTP_TCB_UNLOCK(stcb);
}
}
-#ifdef INET
void
-sctp_ctlinput(cmd, sa, vip)
- int cmd;
- struct sockaddr *sa;
- void *vip;
+sctp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
{
- struct ip *ip = vip;
+ struct ip *outer_ip;
+ struct ip *inner_ip;
struct sctphdr *sh;
- uint32_t vrf_id;
+ struct icmp *icmp;
+ struct sctp_inpcb *inp;
+ struct sctp_tcb *stcb;
+ struct sctp_nets *net;
+ struct sctp_init_chunk *ch;
+ struct sockaddr_in src, dst;
- /* FIX, for non-bsd is this right? */
- vrf_id = SCTP_DEFAULT_VRFID;
if (sa->sa_family != AF_INET ||
((struct sockaddr_in *)sa)->sin_addr.s_addr == INADDR_ANY) {
return;
}
if (PRC_IS_REDIRECT(cmd)) {
- ip = 0;
+ vip = NULL;
} else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) {
return;
}
- if (ip) {
- struct sctp_inpcb *inp = NULL;
- struct sctp_tcb *stcb = NULL;
- struct sctp_nets *net = NULL;
- struct sockaddr_in to, from;
-
- sh = (struct sctphdr *)((caddr_t)ip + (ip->ip_hl << 2));
- bzero(&to, sizeof(to));
- bzero(&from, sizeof(from));
- from.sin_family = to.sin_family = AF_INET;
- from.sin_len = to.sin_len = sizeof(to);
- from.sin_port = sh->src_port;
- from.sin_addr = ip->ip_src;
- to.sin_port = sh->dest_port;
- to.sin_addr = ip->ip_dst;
-
+ if (vip != NULL) {
+ inner_ip = (struct ip *)vip;
+ icmp = (struct icmp *)((caddr_t)inner_ip -
+ (sizeof(struct icmp) - sizeof(struct ip)));
+ outer_ip = (struct ip *)((caddr_t)icmp - sizeof(struct ip));
+ sh = (struct sctphdr *)((caddr_t)inner_ip + (inner_ip->ip_hl << 2));
+ memset(&src, 0, sizeof(struct sockaddr_in));
+ src.sin_family = AF_INET;
+ src.sin_len = sizeof(struct sockaddr_in);
+ src.sin_port = sh->src_port;
+ src.sin_addr = inner_ip->ip_src;
+ memset(&dst, 0, sizeof(struct sockaddr_in));
+ dst.sin_family = AF_INET;
+ dst.sin_len = sizeof(struct sockaddr_in);
+ dst.sin_port = sh->dest_port;
+ dst.sin_addr = inner_ip->ip_dst;
/*
- * 'to' holds the dest of the packet that failed to be sent.
- * 'from' holds our local endpoint address. Thus we reverse
- * the to and the from in the lookup.
+ * 'dst' holds the dest of the packet that failed to be
+ * sent. 'src' holds our local endpoint address. Thus we
+ * reverse the dst and the src in the lookup.
*/
- stcb = sctp_findassociation_addr_sa((struct sockaddr *)&to,
- (struct sockaddr *)&from,
- &inp, &net, 1, vrf_id);
- if (stcb != NULL && inp && (inp->sctp_socket != NULL)) {
- if (cmd != PRC_MSGSIZE) {
- sctp_notify(inp, ip, sh,
- (struct sockaddr *)&to, stcb,
- net);
+ inp = NULL;
+ net = NULL;
+ stcb = sctp_findassociation_addr_sa((struct sockaddr *)&dst,
+ (struct sockaddr *)&src,
+ &inp, &net, 1,
+ SCTP_DEFAULT_VRFID);
+ if ((stcb != NULL) &&
+ (net != NULL) &&
+ (inp != NULL)) {
+ /* Check the verification tag */
+ if (ntohl(sh->v_tag) != 0) {
+ /*
+ * This must be the verification tag used
+ * for sending out packets. We don't
+ * consider packets reflecting the
+ * verification tag.
+ */
+ if (ntohl(sh->v_tag) != stcb->asoc.peer_vtag) {
+ SCTP_TCB_UNLOCK(stcb);
+ return;
+ }
} else {
- /* handle possible ICMP size messages */
- sctp_notify_mbuf(inp, stcb, net, ip, sh);
+ if (ntohs(outer_ip->ip_len) >=
+ sizeof(struct ip) +
+ 8 + (inner_ip->ip_hl << 2) + 20) {
+ /*
+ * In this case we can check if we
+ * got an INIT chunk and if the
+ * initiate tag matches.
+ */
+ ch = (struct sctp_init_chunk *)(sh + 1);
+ if ((ch->ch.chunk_type != SCTP_INITIATION) ||
+ (ntohl(ch->init.initiate_tag) != stcb->asoc.my_vtag)) {
+ SCTP_TCB_UNLOCK(stcb);
+ return;
+ }
+ } else {
+ SCTP_TCB_UNLOCK(stcb);
+ return;
+ }
}
+ sctp_notify(inp, stcb, net,
+ icmp->icmp_type,
+ icmp->icmp_code,
+ ntohs(inner_ip->ip_len),
+ ntohs(icmp->icmp_nextmtu));
} else {
if ((stcb == NULL) && (inp != NULL)) {
/* reduce ref-count */
@@ -489,13 +462,8 @@ sctp_attach(struct socket *so, int proto SCTP_UNUSED, struct thread *p SCTP_UNUS
int error;
uint32_t vrf_id = SCTP_DEFAULT_VRFID;
-#ifdef IPSEC
- uint32_t flags;
-
-#endif
-
inp = (struct sctp_inpcb *)so->so_pcb;
- if (inp != 0) {
+ if (inp != NULL) {
SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
return (EINVAL);
}
@@ -515,33 +483,6 @@ sctp_attach(struct socket *so, int proto SCTP_UNUSED, struct thread *p SCTP_UNUS
ip_inp = &inp->ip_inp.inp;
ip_inp->inp_vflag |= INP_IPV4;
ip_inp->inp_ip_ttl = MODULE_GLOBAL(ip_defttl);
-#ifdef IPSEC
- error = ipsec_init_policy(so, &ip_inp->inp_sp);
-#ifdef SCTP_LOG_CLOSING
- sctp_log_closing(inp, NULL, 17);
-#endif
- if (error != 0) {
-try_again:
- flags = inp->sctp_flags;
- if (((flags & SCTP_PCB_FLAGS_SOCKET_GONE) == 0) &&
- (atomic_cmpset_int(&inp->sctp_flags, flags, (flags | SCTP_PCB_FLAGS_SOCKET_GONE | SCTP_PCB_FLAGS_CLOSE_IP)))) {
-#ifdef SCTP_LOG_CLOSING
- sctp_log_closing(inp, NULL, 15);
-#endif
- SCTP_INP_WUNLOCK(inp);
- sctp_inpcb_free(inp, SCTP_FREE_SHOULD_USE_ABORT,
- SCTP_CALLED_AFTER_CMPSET_OFCLOSE);
- } else {
- flags = inp->sctp_flags;
- if ((flags & SCTP_PCB_FLAGS_SOCKET_GONE) == 0) {
- goto try_again;
- } else {
- SCTP_INP_WUNLOCK(inp);
- }
- }
- return (error);
- }
-#endif /* IPSEC */
SCTP_INP_WUNLOCK(inp);
return (0);
}
@@ -759,7 +700,7 @@ sctp_disconnect(struct socket *so)
/* Left with Data unread */
struct mbuf *err;
- err = sctp_get_mbuf_for_msg(sizeof(struct sctp_paramhdr), 0, M_DONTWAIT, 1, MT_DATA);
+ err = sctp_get_mbuf_for_msg(sizeof(struct sctp_paramhdr), 0, M_NOWAIT, 1, MT_DATA);
if (err) {
/*
* Fill in the user
@@ -780,7 +721,8 @@ sctp_disconnect(struct socket *so)
(SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) {
SCTP_STAT_DECR_GAUGE32(sctps_currestab);
}
- (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_3);
+ (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC,
+ SCTP_FROM_SCTP_USRREQ + SCTP_LOC_3);
/* No unlock tcb assoc is gone */
return (0);
}
@@ -788,7 +730,7 @@ sctp_disconnect(struct socket *so)
TAILQ_EMPTY(&asoc->sent_queue) &&
(asoc->stream_queue_cnt == 0)) {
/* there is nothing queued to send, so done */
- if (asoc->locked_on_sending) {
+ if ((*asoc->ss_functions.sctp_ss_is_user_msgs_incomplete) (stcb, asoc)) {
goto abort_anyway;
}
if ((SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT) &&
@@ -837,18 +779,8 @@ sctp_disconnect(struct socket *so)
asoc->state |= SCTP_STATE_SHUTDOWN_PENDING;
sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb,
netp);
- if (asoc->locked_on_sending) {
- /* Locked to send out the data */
- struct sctp_stream_queue_pending *sp;
-
- sp = TAILQ_LAST(&asoc->locked_on_sending->outqueue, sctp_streamhead);
- if (sp == NULL) {
- SCTP_PRINTF("Error, sp is NULL, locked on sending is non-null strm:%d\n",
- asoc->locked_on_sending->stream_no);
- } else {
- if ((sp->length == 0) && (sp->msg_is_complete == 0))
- asoc->state |= SCTP_STATE_PARTIAL_MSG_LEFT;
- }
+ if ((*asoc->ss_functions.sctp_ss_is_user_msgs_incomplete) (stcb, asoc)) {
+ asoc->state |= SCTP_STATE_PARTIAL_MSG_LEFT;
}
if (TAILQ_EMPTY(&asoc->send_queue) &&
TAILQ_EMPTY(&asoc->sent_queue) &&
@@ -865,7 +797,8 @@ sctp_disconnect(struct socket *so)
SCTP_STAT_DECR_GAUGE32(sctps_currestab);
}
SCTP_INP_RUNLOCK(inp);
- (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_5);
+ (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC,
+ SCTP_FROM_SCTP_USRREQ + SCTP_LOC_5);
return (0);
} else {
sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_CLOSING, SCTP_SO_LOCKED);
@@ -957,14 +890,15 @@ sctp_shutdown(struct socket *so)
SCTP_INP_RUNLOCK(inp);
SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP);
return (EOPNOTSUPP);
- }
- /*
- * Ok if we reach here its the TCP model and it is either a SHUT_WR
- * or SHUT_RDWR. This means we put the shutdown flag against it.
- */
- {
+ } else {
+ /*
+ * Ok, if we reach here its the TCP model and it is either a
+ * SHUT_WR or SHUT_RDWR. This means we put the shutdown flag
+ * against it.
+ */
struct sctp_tcb *stcb;
struct sctp_association *asoc;
+ struct sctp_nets *netp;
if ((so->so_state &
(SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
@@ -976,7 +910,7 @@ sctp_shutdown(struct socket *so)
stcb = LIST_FIRST(&inp->sctp_asoc_list);
if (stcb == NULL) {
/*
- * Ok we hit the case that the shutdown call was
+ * Ok, we hit the case that the shutdown call was
* made after an abort or something. Nothing to do
* now.
*/
@@ -985,66 +919,50 @@ sctp_shutdown(struct socket *so)
}
SCTP_TCB_LOCK(stcb);
asoc = &stcb->asoc;
- if (TAILQ_EMPTY(&asoc->send_queue) &&
+ if (asoc->state & SCTP_STATE_ABOUT_TO_BE_FREED) {
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_INP_RUNLOCK(inp);
+ return (0);
+ }
+ if ((SCTP_GET_STATE(asoc) != SCTP_STATE_COOKIE_WAIT) &&
+ (SCTP_GET_STATE(asoc) != SCTP_STATE_COOKIE_ECHOED) &&
+ (SCTP_GET_STATE(asoc) != SCTP_STATE_OPEN)) {
+ /*
+ * If we are not in or before ESTABLISHED, there is
+ * no protocol action required.
+ */
+ SCTP_TCB_UNLOCK(stcb);
+ SCTP_INP_RUNLOCK(inp);
+ return (0);
+ }
+ if (stcb->asoc.alternate) {
+ netp = stcb->asoc.alternate;
+ } else {
+ netp = stcb->asoc.primary_destination;
+ }
+ if ((SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) &&
+ TAILQ_EMPTY(&asoc->send_queue) &&
TAILQ_EMPTY(&asoc->sent_queue) &&
(asoc->stream_queue_cnt == 0)) {
- if (asoc->locked_on_sending) {
+ if ((*asoc->ss_functions.sctp_ss_is_user_msgs_incomplete) (stcb, asoc)) {
goto abort_anyway;
}
/* there is nothing queued to send, so I'm done... */
- if (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT) {
- /* only send SHUTDOWN the first time through */
- struct sctp_nets *netp;
-
- if ((SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) ||
- (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) {
- SCTP_STAT_DECR_GAUGE32(sctps_currestab);
- }
- SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_SENT);
- SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING);
- sctp_stop_timers_for_shutdown(stcb);
- if (stcb->asoc.alternate) {
- netp = stcb->asoc.alternate;
- } else {
- netp = stcb->asoc.primary_destination;
- }
- sctp_send_shutdown(stcb, netp);
- sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWN,
- stcb->sctp_ep, stcb, netp);
- sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD,
- stcb->sctp_ep, stcb, netp);
- sctp_chunk_output(stcb->sctp_ep, stcb, SCTP_OUTPUT_FROM_T3, SCTP_SO_LOCKED);
- }
+ SCTP_STAT_DECR_GAUGE32(sctps_currestab);
+ SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_SENT);
+ SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING);
+ sctp_stop_timers_for_shutdown(stcb);
+ sctp_send_shutdown(stcb, netp);
+ sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWN,
+ stcb->sctp_ep, stcb, netp);
} else {
/*
- * we still got (or just got) data to send, so set
- * SHUTDOWN_PENDING
+ * We still got (or just got) data to send, so set
+ * SHUTDOWN_PENDING.
*/
- struct sctp_nets *netp;
-
- if (stcb->asoc.alternate) {
- netp = stcb->asoc.alternate;
- } else {
- netp = stcb->asoc.primary_destination;
- }
-
- asoc->state |= SCTP_STATE_SHUTDOWN_PENDING;
- sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb,
- netp);
-
- if (asoc->locked_on_sending) {
- /* Locked to send out the data */
- struct sctp_stream_queue_pending *sp;
-
- sp = TAILQ_LAST(&asoc->locked_on_sending->outqueue, sctp_streamhead);
- if (sp == NULL) {
- SCTP_PRINTF("Error, sp is NULL, locked on sending is non-null strm:%d\n",
- asoc->locked_on_sending->stream_no);
- } else {
- if ((sp->length == 0) && (sp->msg_is_complete == 0)) {
- asoc->state |= SCTP_STATE_PARTIAL_MSG_LEFT;
- }
- }
+ SCTP_ADD_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING);
+ if ((*asoc->ss_functions.sctp_ss_is_user_msgs_incomplete) (stcb, asoc)) {
+ SCTP_ADD_SUBSTATE(asoc, SCTP_STATE_PARTIAL_MSG_LEFT);
}
if (TAILQ_EMPTY(&asoc->send_queue) &&
TAILQ_EMPTY(&asoc->sent_queue) &&
@@ -1056,16 +974,20 @@ sctp_shutdown(struct socket *so)
stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_USRREQ + SCTP_LOC_6;
sctp_abort_an_association(stcb->sctp_ep, stcb,
op_err, SCTP_SO_LOCKED);
- goto skip_unlock;
- } else {
- sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_CLOSING, SCTP_SO_LOCKED);
+ SCTP_INP_RUNLOCK(inp);
+ return (0);
}
}
+ sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb, netp);
+ /*
+ * XXX: Why do this in the case where we have still data
+ * queued?
+ */
+ sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_CLOSING, SCTP_SO_LOCKED);
SCTP_TCB_UNLOCK(stcb);
+ SCTP_INP_RUNLOCK(inp);
+ return (0);
}
-skip_unlock:
- SCTP_INP_RUNLOCK(inp);
- return (0);
}
/*
@@ -1190,7 +1112,7 @@ sctp_fill_up_addresses_vrf(struct sctp_inpcb *inp,
if (ipv4_addr_legal) {
struct sockaddr_in *sin;
- sin = (struct sockaddr_in *)&sctp_ifa->address.sa;
+ sin = &sctp_ifa->address.sin;
if (sin->sin_addr.s_addr == 0) {
/*
* we skip
@@ -1235,7 +1157,7 @@ sctp_fill_up_addresses_vrf(struct sctp_inpcb *inp,
if (ipv6_addr_legal) {
struct sockaddr_in6 *sin6;
- sin6 = (struct sockaddr_in6 *)&sctp_ifa->address.sa;
+ sin6 = &sctp_ifa->address.sin6;
if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
/*
* we skip
@@ -1375,10 +1297,14 @@ sctp_count_max_addresses_vrf(struct sctp_inpcb *inp, uint32_t vrf_id)
switch (sctp_ifa->address.sa.sa_family) {
#ifdef INET
case AF_INET:
+#ifdef INET6
if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4))
cnt += sizeof(struct sockaddr_in6);
else
cnt += sizeof(struct sockaddr_in);
+#else
+ cnt += sizeof(struct sockaddr_in);
+#endif
break;
#endif
#ifdef INET6
@@ -1398,10 +1324,14 @@ sctp_count_max_addresses_vrf(struct sctp_inpcb *inp, uint32_t vrf_id)
switch (laddr->ifa->address.sa.sa_family) {
#ifdef INET
case AF_INET:
+#ifdef INET6
if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4))
cnt += sizeof(struct sockaddr_in6);
else
cnt += sizeof(struct sockaddr_in);
+#else
+ cnt += sizeof(struct sockaddr_in);
+#endif
break;
#endif
#ifdef INET6
@@ -1437,7 +1367,7 @@ sctp_do_connect_x(struct socket *so, struct sctp_inpcb *inp, void *optval,
int creat_lock_on = 0;
struct sctp_tcb *stcb = NULL;
struct sockaddr *sa;
- int num_v6 = 0, num_v4 = 0, *totaddrp, totaddr;
+ unsigned int num_v6 = 0, num_v4 = 0, *totaddrp, totaddr;
uint32_t vrf_id;
int bad_addresses = 0;
sctp_assoc_t *a_id;
@@ -1473,10 +1403,10 @@ sctp_do_connect_x(struct socket *so, struct sctp_inpcb *inp, void *optval,
error = EFAULT;
goto out_now;
}
- totaddrp = (int *)optval;
+ totaddrp = (unsigned int *)optval;
totaddr = *totaddrp;
sa = (struct sockaddr *)(totaddrp + 1);
- stcb = sctp_connectx_helper_find(inp, sa, &totaddr, &num_v4, &num_v6, &error, (optsize - sizeof(int)), &bad_addresses);
+ stcb = sctp_connectx_helper_find(inp, sa, &totaddr, &num_v4, &num_v6, &error, (unsigned int)(optsize - sizeof(int)), &bad_addresses);
if ((stcb != NULL) || bad_addresses) {
/* Already have or am bring up an association */
SCTP_ASOC_CREATE_UNLOCK(inp);
@@ -1525,6 +1455,8 @@ sctp_do_connect_x(struct socket *so, struct sctp_inpcb *inp, void *optval,
/* We are GOOD to go */
stcb = sctp_aloc_assoc(inp, sa, &error, 0, vrf_id,
+ inp->sctp_ep.pre_open_stream_count,
+ inp->sctp_ep.port,
(struct thread *)p
);
if (stcb == NULL) {
@@ -1557,7 +1489,8 @@ sctp_do_connect_x(struct socket *so, struct sctp_inpcb *inp, void *optval,
sctp_connectx_helper_add(stcb, sa, (totaddr - 1), &error);
/* Fill in the return id */
if (error) {
- (void)sctp_free_assoc(inp, stcb, SCTP_PCBFREE_FORCE, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_6);
+ (void)sctp_free_assoc(inp, stcb, SCTP_PCBFREE_FORCE,
+ SCTP_FROM_SCTP_USRREQ + SCTP_LOC_7);
goto out_now;
}
a_id = (sctp_assoc_t *) optval;
@@ -1575,11 +1508,6 @@ sctp_do_connect_x(struct socket *so, struct sctp_inpcb *inp, void *optval,
sctp_send_initiate(inp, stcb, SCTP_SO_LOCKED);
}
SCTP_TCB_UNLOCK(stcb);
- if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) {
- stcb->sctp_ep->sctp_flags |= SCTP_PCB_FLAGS_CONNECTED;
- /* Set the connected flag so we can queue data */
- soisconnecting(so);
- }
out_now:
if (creat_lock_on) {
SCTP_ASOC_CREATE_UNLOCK(inp);
@@ -1752,6 +1680,37 @@ flags_out:
*optsize = sizeof(uint32_t);
break;
}
+ case SCTP_INTERLEAVING_SUPPORTED:
+ {
+ struct sctp_assoc_value *av;
+
+ SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize);
+ SCTP_FIND_STCB(inp, stcb, av->assoc_id);
+
+ if (stcb) {
+ av->assoc_value = stcb->asoc.idata_supported;
+ SCTP_TCB_UNLOCK(stcb);
+ } else {
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
+ (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
+ (av->assoc_id == SCTP_FUTURE_ASSOC)) {
+ SCTP_INP_RLOCK(inp);
+ if (inp->idata_supported) {
+ av->assoc_value = 1;
+ } else {
+ av->assoc_value = 0;
+ }
+ SCTP_INP_RUNLOCK(inp);
+ } else {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ }
+ }
+ if (error == 0) {
+ *optsize = sizeof(struct sctp_assoc_value);
+ }
+ break;
+ }
case SCTP_CMT_ON_OFF:
{
struct sctp_assoc_value *av;
@@ -1905,8 +1864,15 @@ flags_out:
uint32_t *value, cnt;
SCTP_CHECK_AND_CAST(value, optval, uint32_t, *optsize);
- cnt = 0;
SCTP_INP_RLOCK(inp);
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
+ (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) {
+ /* Can't do this for a 1-1 socket */
+ error = EINVAL;
+ SCTP_INP_RUNLOCK(inp);
+ break;
+ }
+ cnt = 0;
LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) {
cnt++;
}
@@ -1918,15 +1884,28 @@ flags_out:
case SCTP_GET_ASSOC_ID_LIST:
{
struct sctp_assoc_ids *ids;
- unsigned int at, limit;
+ uint32_t at;
+ size_t limit;
SCTP_CHECK_AND_CAST(ids, optval, struct sctp_assoc_ids, *optsize);
+ SCTP_INP_RLOCK(inp);
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
+ (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) {
+ /* Can't do this for a 1-1 socket */
+ error = EINVAL;
+ SCTP_INP_RUNLOCK(inp);
+ break;
+ }
at = 0;
limit = (*optsize - sizeof(uint32_t)) / sizeof(sctp_assoc_t);
- SCTP_INP_RLOCK(inp);
LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) {
if (at < limit) {
ids->gaids_assoc_id[at++] = sctp_get_associd(stcb);
+ if (at == 0) {
+ error = EINVAL;
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error);
+ break;
+ }
} else {
error = EINVAL;
SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error);
@@ -2219,23 +2198,27 @@ flags_out:
size = 0;
/* Count the sizes */
TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
- if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) {
- size += sizeof(struct sockaddr_in6);
- } else {
- switch (((struct sockaddr *)&net->ro._l_addr)->sa_family) {
+ switch (net->ro._l_addr.sa.sa_family) {
#ifdef INET
- case AF_INET:
+ case AF_INET:
+#ifdef INET6
+ if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) {
+ size += sizeof(struct sockaddr_in6);
+ } else {
size += sizeof(struct sockaddr_in);
- break;
+ }
+#else
+ size += sizeof(struct sockaddr_in);
+#endif
+ break;
#endif
#ifdef INET6
- case AF_INET6:
- size += sizeof(struct sockaddr_in6);
- break;
+ case AF_INET6:
+ size += sizeof(struct sockaddr_in6);
+ break;
#endif
- default:
- break;
- }
+ default:
+ break;
}
}
SCTP_TCB_UNLOCK(stcb);
@@ -2267,24 +2250,28 @@ flags_out:
sas = (struct sockaddr_storage *)&saddr->addr[0];
TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
- if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) {
- cpsz = sizeof(struct sockaddr_in6);
- } else {
- switch (((struct sockaddr *)&net->ro._l_addr)->sa_family) {
+ switch (net->ro._l_addr.sa.sa_family) {
#ifdef INET
- case AF_INET:
+ case AF_INET:
+#ifdef INET6
+ if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) {
+ cpsz = sizeof(struct sockaddr_in6);
+ } else {
cpsz = sizeof(struct sockaddr_in);
- break;
+ }
+#else
+ cpsz = sizeof(struct sockaddr_in);
+#endif
+ break;
#endif
#ifdef INET6
- case AF_INET6:
- cpsz = sizeof(struct sockaddr_in6);
- break;
+ case AF_INET6:
+ cpsz = sizeof(struct sockaddr_in6);
+ break;
#endif
- default:
- cpsz = 0;
- break;
- }
+ default:
+ cpsz = 0;
+ break;
}
if (cpsz == 0) {
break;
@@ -2295,15 +2282,15 @@ flags_out:
}
#if defined(INET) && defined(INET6)
if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) &&
- (((struct sockaddr *)&net->ro._l_addr)->sa_family == AF_INET)) {
+ (net->ro._l_addr.sa.sa_family == AF_INET)) {
/* Must map the address */
- in6_sin_2_v4mapsin6((struct sockaddr_in *)&net->ro._l_addr,
+ in6_sin_2_v4mapsin6(&net->ro._l_addr.sin,
(struct sockaddr_in6 *)sas);
} else {
-#endif
memcpy(sas, &net->ro._l_addr, cpsz);
-#if defined(INET) && defined(INET6)
}
+#else
+ memcpy(sas, &net->ro._l_addr, cpsz);
#endif
((struct sockaddr_in *)sas)->sin_port = stcb->rport;
@@ -2340,13 +2327,35 @@ flags_out:
{
struct sctp_paddrparams *paddrp;
struct sctp_nets *net;
+ struct sockaddr *addr;
+
+#if defined(INET) && defined(INET6)
+ struct sockaddr_in sin_store;
+
+#endif
SCTP_CHECK_AND_CAST(paddrp, optval, struct sctp_paddrparams, *optsize);
SCTP_FIND_STCB(inp, stcb, paddrp->spp_assoc_id);
- net = NULL;
- if (stcb) {
- net = sctp_findnet(stcb, (struct sockaddr *)&paddrp->spp_address);
+#if defined(INET) && defined(INET6)
+ if (paddrp->spp_address.ss_family == AF_INET6) {
+ struct sockaddr_in6 *sin6;
+
+ sin6 = (struct sockaddr_in6 *)&paddrp->spp_address;
+ if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
+ in6_sin6_2_sin(&sin_store, sin6);
+ addr = (struct sockaddr *)&sin_store;
+ } else {
+ addr = (struct sockaddr *)&paddrp->spp_address;
+ }
+ } else {
+ addr = (struct sockaddr *)&paddrp->spp_address;
+ }
+#else
+ addr = (struct sockaddr *)&paddrp->spp_address;
+#endif
+ if (stcb != NULL) {
+ net = sctp_findnet(stcb, addr);
} else {
/*
* We increment here since
@@ -2355,22 +2364,20 @@ flags_out:
* the locked tcb (last argument) is NOT a
* TCB.. aka NULL.
*/
+ net = NULL;
SCTP_INP_INCR_REF(inp);
- stcb = sctp_findassociation_ep_addr(&inp, (struct sockaddr *)&paddrp->spp_address, &net, NULL, NULL);
+ stcb = sctp_findassociation_ep_addr(&inp, addr, &net, NULL, NULL);
if (stcb == NULL) {
SCTP_INP_DECR_REF(inp);
}
}
- if (stcb && (net == NULL)) {
- struct sockaddr *sa;
-
- sa = (struct sockaddr *)&paddrp->spp_address;
+ if ((stcb != NULL) && (net == NULL)) {
#ifdef INET
- if (sa->sa_family == AF_INET) {
+ if (addr->sa_family == AF_INET) {
struct sockaddr_in *sin;
- sin = (struct sockaddr_in *)sa;
- if (sin->sin_addr.s_addr) {
+ sin = (struct sockaddr_in *)addr;
+ if (sin->sin_addr.s_addr != INADDR_ANY) {
error = EINVAL;
SCTP_TCB_UNLOCK(stcb);
SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error);
@@ -2379,10 +2386,10 @@ flags_out:
} else
#endif
#ifdef INET6
- if (sa->sa_family == AF_INET6) {
+ if (addr->sa_family == AF_INET6) {
struct sockaddr_in6 *sin6;
- sin6 = (struct sockaddr_in6 *)sa;
+ sin6 = (struct sockaddr_in6 *)addr;
if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
error = EINVAL;
SCTP_TCB_UNLOCK(stcb);
@@ -2398,21 +2405,27 @@ flags_out:
break;
}
}
- if (stcb) {
+ if (stcb != NULL) {
/* Applies to the specific association */
paddrp->spp_flags = 0;
- if (net) {
- int ovh;
-
- if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) {
- ovh = SCTP_MED_OVERHEAD;
- } else {
- ovh = SCTP_MED_V4_OVERHEAD;
- }
-
+ if (net != NULL) {
paddrp->spp_hbinterval = net->heart_beat_delay;
paddrp->spp_pathmaxrxt = net->failure_threshold;
- paddrp->spp_pathmtu = net->mtu - ovh;
+ paddrp->spp_pathmtu = net->mtu;
+ switch (net->ro._l_addr.sa.sa_family) {
+#ifdef INET
+ case AF_INET:
+ paddrp->spp_pathmtu -= SCTP_MIN_V4_OVERHEAD;
+ break;
+#endif
+#ifdef INET6
+ case AF_INET6:
+ paddrp->spp_pathmtu -= SCTP_MIN_OVERHEAD;
+ break;
+#endif
+ default:
+ break;
+ }
/* get flags for HB */
if (net->dest_state & SCTP_ADDR_NOHB) {
paddrp->spp_flags |= SPP_HB_DISABLE;
@@ -2421,9 +2434,9 @@ flags_out:
}
/* get flags for PMTU */
if (net->dest_state & SCTP_ADDR_NO_PMTUD) {
- paddrp->spp_flags |= SPP_PMTUD_ENABLE;
- } else {
paddrp->spp_flags |= SPP_PMTUD_DISABLE;
+ } else {
+ paddrp->spp_flags |= SPP_PMTUD_ENABLE;
}
if (net->dscp & 0x01) {
paddrp->spp_dscp = net->dscp & 0xfc;
@@ -2442,7 +2455,7 @@ flags_out:
* value
*/
paddrp->spp_pathmaxrxt = stcb->asoc.def_net_failure;
- paddrp->spp_pathmtu = sctp_get_frag_point(stcb, &stcb->asoc);
+ paddrp->spp_pathmtu = 0;
if (stcb->asoc.default_dscp & 0x01) {
paddrp->spp_dscp = stcb->asoc.default_dscp & 0xfc;
paddrp->spp_flags |= SPP_DSCP;
@@ -2517,13 +2530,35 @@ flags_out:
{
struct sctp_paddrinfo *paddri;
struct sctp_nets *net;
+ struct sockaddr *addr;
+
+#if defined(INET) && defined(INET6)
+ struct sockaddr_in sin_store;
+
+#endif
SCTP_CHECK_AND_CAST(paddri, optval, struct sctp_paddrinfo, *optsize);
SCTP_FIND_STCB(inp, stcb, paddri->spinfo_assoc_id);
- net = NULL;
- if (stcb) {
- net = sctp_findnet(stcb, (struct sockaddr *)&paddri->spinfo_address);
+#if defined(INET) && defined(INET6)
+ if (paddri->spinfo_address.ss_family == AF_INET6) {
+ struct sockaddr_in6 *sin6;
+
+ sin6 = (struct sockaddr_in6 *)&paddri->spinfo_address;
+ if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
+ in6_sin6_2_sin(&sin_store, sin6);
+ addr = (struct sockaddr *)&sin_store;
+ } else {
+ addr = (struct sockaddr *)&paddri->spinfo_address;
+ }
+ } else {
+ addr = (struct sockaddr *)&paddri->spinfo_address;
+ }
+#else
+ addr = (struct sockaddr *)&paddri->spinfo_address;
+#endif
+ if (stcb != NULL) {
+ net = sctp_findnet(stcb, addr);
} else {
/*
* We increment here since
@@ -2532,14 +2567,15 @@ flags_out:
* the locked tcb (last argument) is NOT a
* TCB.. aka NULL.
*/
+ net = NULL;
SCTP_INP_INCR_REF(inp);
- stcb = sctp_findassociation_ep_addr(&inp, (struct sockaddr *)&paddri->spinfo_address, &net, NULL, NULL);
+ stcb = sctp_findassociation_ep_addr(&inp, addr, &net, NULL, NULL);
if (stcb == NULL) {
SCTP_INP_DECR_REF(inp);
}
}
- if ((stcb) && (net)) {
+ if ((stcb != NULL) && (net != NULL)) {
if (net->dest_state & SCTP_ADDR_UNCONFIRMED) {
/* It's unconfirmed */
paddri->spinfo_state = SCTP_UNCONFIRMED;
@@ -2555,10 +2591,24 @@ flags_out:
paddri->spinfo_rto = net->RTO;
paddri->spinfo_assoc_id = sctp_get_associd(stcb);
paddri->spinfo_mtu = net->mtu;
+ switch (addr->sa_family) {
+#if defined(INET)
+ case AF_INET:
+ paddri->spinfo_mtu -= SCTP_MIN_V4_OVERHEAD;
+ break;
+#endif
+#if defined(INET6)
+ case AF_INET6:
+ paddri->spinfo_mtu -= SCTP_MIN_OVERHEAD;
+ break;
+#endif
+ default:
+ break;
+ }
SCTP_TCB_UNLOCK(stcb);
*optsize = sizeof(struct sctp_paddrinfo);
} else {
- if (stcb) {
+ if (stcb != NULL) {
SCTP_TCB_UNLOCK(stcb);
}
SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT);
@@ -2588,12 +2638,7 @@ flags_out:
error = EINVAL;
break;
}
- /*
- * I think passing the state is fine since
- * sctp_constants.h will be available to the user
- * land.
- */
- sstat->sstat_state = stcb->asoc.state;
+ sstat->sstat_state = sctp_map_assoc_state(stcb->asoc.state);
sstat->sstat_assoc_id = sctp_get_associd(stcb);
sstat->sstat_rwnd = stcb->asoc.peers_rwnd;
sstat->sstat_unackdata = stcb->asoc.sent_queue_cnt;
@@ -2631,6 +2676,20 @@ flags_out:
sstat->sstat_primary.spinfo_srtt = net->lastsa >> SCTP_RTT_SHIFT;
sstat->sstat_primary.spinfo_rto = net->RTO;
sstat->sstat_primary.spinfo_mtu = net->mtu;
+ switch (stcb->asoc.primary_destination->ro._l_addr.sa.sa_family) {
+#if defined(INET)
+ case AF_INET:
+ sstat->sstat_primary.spinfo_mtu -= SCTP_MIN_V4_OVERHEAD;
+ break;
+#endif
+#if defined(INET6)
+ case AF_INET6:
+ sstat->sstat_primary.spinfo_mtu -= SCTP_MIN_OVERHEAD;
+ break;
+#endif
+ default:
+ break;
+ }
sstat->sstat_primary.spinfo_assoc_id = sctp_get_associd(stcb);
SCTP_TCB_UNLOCK(stcb);
*optsize = sizeof(struct sctp_status);
@@ -2775,16 +2834,32 @@ flags_out:
SCTP_FIND_STCB(inp, stcb, ssp->ssp_assoc_id);
if (stcb) {
- /* simply copy out the sockaddr_storage... */
- size_t len;
+ union sctp_sockstore *addr;
- len = *optsize;
- if (len > stcb->asoc.primary_destination->ro._l_addr.sa.sa_len)
- len = stcb->asoc.primary_destination->ro._l_addr.sa.sa_len;
-
- memcpy(&ssp->ssp_addr,
- &stcb->asoc.primary_destination->ro._l_addr,
- len);
+ addr = &stcb->asoc.primary_destination->ro._l_addr;
+ switch (addr->sa.sa_family) {
+#ifdef INET
+ case AF_INET:
+#ifdef INET6
+ if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) {
+ in6_sin_2_v4mapsin6(&addr->sin,
+ (struct sockaddr_in6 *)&ssp->ssp_addr);
+ } else {
+ memcpy(&ssp->ssp_addr, &addr->sin, sizeof(struct sockaddr_in));
+ }
+#else
+ memcpy(&ssp->ssp_addr, &addr->sin, sizeof(struct sockaddr_in));
+#endif
+ break;
+#endif
+#ifdef INET6
+ case AF_INET6:
+ memcpy(&ssp->ssp_addr, &addr->sin6, sizeof(struct sockaddr_in6));
+ break;
+#endif
+ default:
+ break;
+ }
SCTP_TCB_UNLOCK(stcb);
*optsize = sizeof(struct sctp_setprim);
} else {
@@ -3124,13 +3199,35 @@ flags_out:
{
struct sctp_paddrthlds *thlds;
struct sctp_nets *net;
+ struct sockaddr *addr;
+
+#if defined(INET) && defined(INET6)
+ struct sockaddr_in sin_store;
+
+#endif
SCTP_CHECK_AND_CAST(thlds, optval, struct sctp_paddrthlds, *optsize);
SCTP_FIND_STCB(inp, stcb, thlds->spt_assoc_id);
- net = NULL;
- if (stcb) {
- net = sctp_findnet(stcb, (struct sockaddr *)&thlds->spt_address);
+#if defined(INET) && defined(INET6)
+ if (thlds->spt_address.ss_family == AF_INET6) {
+ struct sockaddr_in6 *sin6;
+
+ sin6 = (struct sockaddr_in6 *)&thlds->spt_address;
+ if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
+ in6_sin6_2_sin(&sin_store, sin6);
+ addr = (struct sockaddr *)&sin_store;
+ } else {
+ addr = (struct sockaddr *)&thlds->spt_address;
+ }
+ } else {
+ addr = (struct sockaddr *)&thlds->spt_address;
+ }
+#else
+ addr = (struct sockaddr *)&thlds->spt_address;
+#endif
+ if (stcb != NULL) {
+ net = sctp_findnet(stcb, addr);
} else {
/*
* We increment here since
@@ -3139,22 +3236,20 @@ flags_out:
* the locked tcb (last argument) is NOT a
* TCB.. aka NULL.
*/
+ net = NULL;
SCTP_INP_INCR_REF(inp);
- stcb = sctp_findassociation_ep_addr(&inp, (struct sockaddr *)&thlds->spt_address, &net, NULL, NULL);
+ stcb = sctp_findassociation_ep_addr(&inp, addr, &net, NULL, NULL);
if (stcb == NULL) {
SCTP_INP_DECR_REF(inp);
}
}
- if (stcb && (net == NULL)) {
- struct sockaddr *sa;
-
- sa = (struct sockaddr *)&thlds->spt_address;
+ if ((stcb != NULL) && (net == NULL)) {
#ifdef INET
- if (sa->sa_family == AF_INET) {
+ if (addr->sa_family == AF_INET) {
struct sockaddr_in *sin;
- sin = (struct sockaddr_in *)sa;
- if (sin->sin_addr.s_addr) {
+ sin = (struct sockaddr_in *)addr;
+ if (sin->sin_addr.s_addr != INADDR_ANY) {
error = EINVAL;
SCTP_TCB_UNLOCK(stcb);
SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error);
@@ -3163,10 +3258,10 @@ flags_out:
} else
#endif
#ifdef INET6
- if (sa->sa_family == AF_INET6) {
+ if (addr->sa_family == AF_INET6) {
struct sockaddr_in6 *sin6;
- sin6 = (struct sockaddr_in6 *)sa;
+ sin6 = (struct sockaddr_in6 *)addr;
if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
error = EINVAL;
SCTP_TCB_UNLOCK(stcb);
@@ -3182,13 +3277,15 @@ flags_out:
break;
}
}
- if (stcb) {
- if (net) {
+ if (stcb != NULL) {
+ if (net != NULL) {
thlds->spt_pathmaxrxt = net->failure_threshold;
thlds->spt_pathpfthld = net->pf_threshold;
+ thlds->spt_pathcpthld = 0xffff;
} else {
thlds->spt_pathmaxrxt = stcb->asoc.def_net_failure;
thlds->spt_pathpfthld = stcb->asoc.def_net_pf_threshold;
+ thlds->spt_pathcpthld = 0xffff;
}
thlds->spt_assoc_id = sctp_get_associd(stcb);
SCTP_TCB_UNLOCK(stcb);
@@ -3200,6 +3297,7 @@ flags_out:
SCTP_INP_RLOCK(inp);
thlds->spt_pathmaxrxt = inp->sctp_ep.def_net_failure;
thlds->spt_pathpfthld = inp->sctp_ep.def_net_pf_threshold;
+ thlds->spt_pathcpthld = 0xffff;
SCTP_INP_RUNLOCK(inp);
} else {
SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
@@ -3215,12 +3313,35 @@ flags_out:
{
struct sctp_udpencaps *encaps;
struct sctp_nets *net;
+ struct sockaddr *addr;
+
+#if defined(INET) && defined(INET6)
+ struct sockaddr_in sin_store;
+
+#endif
SCTP_CHECK_AND_CAST(encaps, optval, struct sctp_udpencaps, *optsize);
SCTP_FIND_STCB(inp, stcb, encaps->sue_assoc_id);
+#if defined(INET) && defined(INET6)
+ if (encaps->sue_address.ss_family == AF_INET6) {
+ struct sockaddr_in6 *sin6;
+
+ sin6 = (struct sockaddr_in6 *)&encaps->sue_address;
+ if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
+ in6_sin6_2_sin(&sin_store, sin6);
+ addr = (struct sockaddr *)&sin_store;
+ } else {
+ addr = (struct sockaddr *)&encaps->sue_address;
+ }
+ } else {
+ addr = (struct sockaddr *)&encaps->sue_address;
+ }
+#else
+ addr = (struct sockaddr *)&encaps->sue_address;
+#endif
if (stcb) {
- net = sctp_findnet(stcb, (struct sockaddr *)&encaps->sue_address);
+ net = sctp_findnet(stcb, addr);
} else {
/*
* We increment here since
@@ -3231,21 +3352,18 @@ flags_out:
*/
net = NULL;
SCTP_INP_INCR_REF(inp);
- stcb = sctp_findassociation_ep_addr(&inp, (struct sockaddr *)&encaps->sue_address, &net, NULL, NULL);
+ stcb = sctp_findassociation_ep_addr(&inp, addr, &net, NULL, NULL);
if (stcb == NULL) {
SCTP_INP_DECR_REF(inp);
}
}
- if (stcb && (net == NULL)) {
- struct sockaddr *sa;
-
- sa = (struct sockaddr *)&encaps->sue_address;
+ if ((stcb != NULL) && (net == NULL)) {
#ifdef INET
- if (sa->sa_family == AF_INET) {
+ if (addr->sa_family == AF_INET) {
struct sockaddr_in *sin;
- sin = (struct sockaddr_in *)sa;
- if (sin->sin_addr.s_addr) {
+ sin = (struct sockaddr_in *)addr;
+ if (sin->sin_addr.s_addr != INADDR_ANY) {
error = EINVAL;
SCTP_TCB_UNLOCK(stcb);
SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error);
@@ -3254,10 +3372,10 @@ flags_out:
} else
#endif
#ifdef INET6
- if (sa->sa_family == AF_INET6) {
+ if (addr->sa_family == AF_INET6) {
struct sockaddr_in6 *sin6;
- sin6 = (struct sockaddr_in6 *)sa;
+ sin6 = (struct sockaddr_in6 *)addr;
if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
error = EINVAL;
SCTP_TCB_UNLOCK(stcb);
@@ -3273,7 +3391,7 @@ flags_out:
break;
}
}
- if (stcb) {
+ if (stcb != NULL) {
if (net) {
encaps->sue_port = net->port;
} else {
@@ -3297,6 +3415,195 @@ flags_out:
}
break;
}
+ case SCTP_ECN_SUPPORTED:
+ {
+ struct sctp_assoc_value *av;
+
+ SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize);
+ SCTP_FIND_STCB(inp, stcb, av->assoc_id);
+
+ if (stcb) {
+ av->assoc_value = stcb->asoc.ecn_supported;
+ SCTP_TCB_UNLOCK(stcb);
+ } else {
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
+ (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
+ (av->assoc_id == SCTP_FUTURE_ASSOC)) {
+ SCTP_INP_RLOCK(inp);
+ av->assoc_value = inp->ecn_supported;
+ SCTP_INP_RUNLOCK(inp);
+ } else {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ }
+ }
+ if (error == 0) {
+ *optsize = sizeof(struct sctp_assoc_value);
+ }
+ break;
+ }
+ case SCTP_PR_SUPPORTED:
+ {
+ struct sctp_assoc_value *av;
+
+ SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize);
+ SCTP_FIND_STCB(inp, stcb, av->assoc_id);
+
+ if (stcb) {
+ av->assoc_value = stcb->asoc.prsctp_supported;
+ SCTP_TCB_UNLOCK(stcb);
+ } else {
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
+ (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
+ (av->assoc_id == SCTP_FUTURE_ASSOC)) {
+ SCTP_INP_RLOCK(inp);
+ av->assoc_value = inp->prsctp_supported;
+ SCTP_INP_RUNLOCK(inp);
+ } else {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ }
+ }
+ if (error == 0) {
+ *optsize = sizeof(struct sctp_assoc_value);
+ }
+ break;
+ }
+ case SCTP_AUTH_SUPPORTED:
+ {
+ struct sctp_assoc_value *av;
+
+ SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize);
+ SCTP_FIND_STCB(inp, stcb, av->assoc_id);
+
+ if (stcb) {
+ av->assoc_value = stcb->asoc.auth_supported;
+ SCTP_TCB_UNLOCK(stcb);
+ } else {
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
+ (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
+ (av->assoc_id == SCTP_FUTURE_ASSOC)) {
+ SCTP_INP_RLOCK(inp);
+ av->assoc_value = inp->auth_supported;
+ SCTP_INP_RUNLOCK(inp);
+ } else {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ }
+ }
+ if (error == 0) {
+ *optsize = sizeof(struct sctp_assoc_value);
+ }
+ break;
+ }
+ case SCTP_ASCONF_SUPPORTED:
+ {
+ struct sctp_assoc_value *av;
+
+ SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize);
+ SCTP_FIND_STCB(inp, stcb, av->assoc_id);
+
+ if (stcb) {
+ av->assoc_value = stcb->asoc.asconf_supported;
+ SCTP_TCB_UNLOCK(stcb);
+ } else {
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
+ (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
+ (av->assoc_id == SCTP_FUTURE_ASSOC)) {
+ SCTP_INP_RLOCK(inp);
+ av->assoc_value = inp->asconf_supported;
+ SCTP_INP_RUNLOCK(inp);
+ } else {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ }
+ }
+ if (error == 0) {
+ *optsize = sizeof(struct sctp_assoc_value);
+ }
+ break;
+ }
+ case SCTP_RECONFIG_SUPPORTED:
+ {
+ struct sctp_assoc_value *av;
+
+ SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize);
+ SCTP_FIND_STCB(inp, stcb, av->assoc_id);
+
+ if (stcb) {
+ av->assoc_value = stcb->asoc.reconfig_supported;
+ SCTP_TCB_UNLOCK(stcb);
+ } else {
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
+ (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
+ (av->assoc_id == SCTP_FUTURE_ASSOC)) {
+ SCTP_INP_RLOCK(inp);
+ av->assoc_value = inp->reconfig_supported;
+ SCTP_INP_RUNLOCK(inp);
+ } else {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ }
+ }
+ if (error == 0) {
+ *optsize = sizeof(struct sctp_assoc_value);
+ }
+ break;
+ }
+ case SCTP_NRSACK_SUPPORTED:
+ {
+ struct sctp_assoc_value *av;
+
+ SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize);
+ SCTP_FIND_STCB(inp, stcb, av->assoc_id);
+
+ if (stcb) {
+ av->assoc_value = stcb->asoc.nrsack_supported;
+ SCTP_TCB_UNLOCK(stcb);
+ } else {
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
+ (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
+ (av->assoc_id == SCTP_FUTURE_ASSOC)) {
+ SCTP_INP_RLOCK(inp);
+ av->assoc_value = inp->nrsack_supported;
+ SCTP_INP_RUNLOCK(inp);
+ } else {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ }
+ }
+ if (error == 0) {
+ *optsize = sizeof(struct sctp_assoc_value);
+ }
+ break;
+ }
+ case SCTP_PKTDROP_SUPPORTED:
+ {
+ struct sctp_assoc_value *av;
+
+ SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize);
+ SCTP_FIND_STCB(inp, stcb, av->assoc_id);
+
+ if (stcb) {
+ av->assoc_value = stcb->asoc.pktdrop_supported;
+ SCTP_TCB_UNLOCK(stcb);
+ } else {
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
+ (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
+ (av->assoc_id == SCTP_FUTURE_ASSOC)) {
+ SCTP_INP_RLOCK(inp);
+ av->assoc_value = inp->pktdrop_supported;
+ SCTP_INP_RUNLOCK(inp);
+ } else {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ }
+ }
+ if (error == 0) {
+ *optsize = sizeof(struct sctp_assoc_value);
+ }
+ break;
+ }
case SCTP_ENABLE_STREAM_RESET:
{
struct sctp_assoc_value *av;
@@ -3324,6 +3631,100 @@ flags_out:
}
break;
}
+ case SCTP_PR_STREAM_STATUS:
+ {
+ struct sctp_prstatus *sprstat;
+ uint16_t sid;
+ uint16_t policy;
+
+ SCTP_CHECK_AND_CAST(sprstat, optval, struct sctp_prstatus, *optsize);
+ SCTP_FIND_STCB(inp, stcb, sprstat->sprstat_assoc_id);
+
+ sid = sprstat->sprstat_sid;
+ policy = sprstat->sprstat_policy;
+#if defined(SCTP_DETAILED_STR_STATS)
+ if ((stcb != NULL) &&
+ (sid < stcb->asoc.streamoutcnt) &&
+ (policy != SCTP_PR_SCTP_NONE) &&
+ ((policy <= SCTP_PR_SCTP_MAX) ||
+ (policy == SCTP_PR_SCTP_ALL))) {
+ if (policy == SCTP_PR_SCTP_ALL) {
+ sprstat->sprstat_abandoned_unsent = stcb->asoc.strmout[sid].abandoned_unsent[0];
+ sprstat->sprstat_abandoned_sent = stcb->asoc.strmout[sid].abandoned_sent[0];
+ } else {
+ sprstat->sprstat_abandoned_unsent = stcb->asoc.strmout[sid].abandoned_unsent[policy];
+ sprstat->sprstat_abandoned_sent = stcb->asoc.strmout[sid].abandoned_sent[policy];
+ }
+#else
+ if ((stcb != NULL) &&
+ (sid < stcb->asoc.streamoutcnt) &&
+ (policy == SCTP_PR_SCTP_ALL)) {
+ sprstat->sprstat_abandoned_unsent = stcb->asoc.strmout[sid].abandoned_unsent[0];
+ sprstat->sprstat_abandoned_sent = stcb->asoc.strmout[sid].abandoned_sent[0];
+#endif
+ SCTP_TCB_UNLOCK(stcb);
+ *optsize = sizeof(struct sctp_prstatus);
+ } else {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ }
+ break;
+ }
+ case SCTP_PR_ASSOC_STATUS:
+ {
+ struct sctp_prstatus *sprstat;
+ uint16_t policy;
+
+ SCTP_CHECK_AND_CAST(sprstat, optval, struct sctp_prstatus, *optsize);
+ SCTP_FIND_STCB(inp, stcb, sprstat->sprstat_assoc_id);
+
+ policy = sprstat->sprstat_policy;
+ if ((stcb != NULL) &&
+ (policy != SCTP_PR_SCTP_NONE) &&
+ ((policy <= SCTP_PR_SCTP_MAX) ||
+ (policy == SCTP_PR_SCTP_ALL))) {
+ if (policy == SCTP_PR_SCTP_ALL) {
+ sprstat->sprstat_abandoned_unsent = stcb->asoc.abandoned_unsent[0];
+ sprstat->sprstat_abandoned_sent = stcb->asoc.abandoned_sent[0];
+ } else {
+ sprstat->sprstat_abandoned_unsent = stcb->asoc.abandoned_unsent[policy];
+ sprstat->sprstat_abandoned_sent = stcb->asoc.abandoned_sent[policy];
+ }
+ SCTP_TCB_UNLOCK(stcb);
+ *optsize = sizeof(struct sctp_prstatus);
+ } else {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ }
+ break;
+ }
+ case SCTP_MAX_CWND:
+ {
+ struct sctp_assoc_value *av;
+
+ SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize);
+ SCTP_FIND_STCB(inp, stcb, av->assoc_id);
+
+ if (stcb) {
+ av->assoc_value = stcb->asoc.max_cwnd;
+ SCTP_TCB_UNLOCK(stcb);
+ } else {
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
+ (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
+ (av->assoc_id == SCTP_FUTURE_ASSOC)) {
+ SCTP_INP_RLOCK(inp);
+ av->assoc_value = inp->max_cwnd;
+ SCTP_INP_RUNLOCK(inp);
+ } else {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ }
+ }
+ if (error == 0) {
+ *optsize = sizeof(struct sctp_assoc_value);
+ }
+ break;
+ }
default:
SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOPROTOOPT);
error = ENOPROTOOPT;
@@ -3487,6 +3888,47 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize,
}
break;
}
+ case SCTP_INTERLEAVING_SUPPORTED:
+ {
+ struct sctp_assoc_value *av;
+
+ SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize);
+ SCTP_FIND_STCB(inp, stcb, av->assoc_id);
+
+ if (stcb) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ SCTP_TCB_UNLOCK(stcb);
+ } else {
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
+ (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
+ (av->assoc_id == SCTP_FUTURE_ASSOC)) {
+ SCTP_INP_WLOCK(inp);
+ if (av->assoc_value == 0) {
+ inp->idata_supported = 0;
+ } else {
+ if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE)) &&
+ (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_INTERLEAVE_STRMS))) {
+ inp->idata_supported = 1;
+ } else {
+ /*
+ * Must have Frag
+ * interleave and
+ * stream interleave
+ * on
+ */
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ }
+ }
+ SCTP_INP_WUNLOCK(inp);
+ } else {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ }
+ }
+ break;
+ }
case SCTP_CMT_ON_OFF:
if (SCTP_BASE_SYSCTL(sctp_cmt_on_off)) {
struct sctp_assoc_value *av;
@@ -3684,7 +4126,6 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize,
SCTP_TCB_UNLOCK(stcb);
}
SCTP_INP_RUNLOCK(inp);
-
} else {
/*
* Can't set stream value without
@@ -3959,12 +4400,13 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize,
uint32_t i;
SCTP_CHECK_AND_CAST(shmac, optval, struct sctp_hmacalgo, optsize);
- if (optsize < sizeof(struct sctp_hmacalgo) + shmac->shmac_number_of_idents * sizeof(uint16_t)) {
+ if ((optsize < sizeof(struct sctp_hmacalgo) + shmac->shmac_number_of_idents * sizeof(uint16_t)) ||
+ (shmac->shmac_number_of_idents > 0xffff)) {
SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
error = EINVAL;
break;
}
- hmaclist = sctp_alloc_hmaclist(shmac->shmac_number_of_idents);
+ hmaclist = sctp_alloc_hmaclist((uint16_t) shmac->shmac_number_of_idents);
if (hmaclist == NULL) {
SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOMEM);
error = ENOMEM;
@@ -4172,7 +4614,7 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize,
error = ENOENT;
break;
}
- if (stcb->asoc.peer_supports_strreset == 0) {
+ if (stcb->asoc.reconfig_supported == 0) {
/*
* Peer does not support the chunk type.
*/
@@ -4181,18 +4623,30 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize,
SCTP_TCB_UNLOCK(stcb);
break;
}
- if (stcb->asoc.stream_reset_outstanding) {
- SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EALREADY);
- error = EALREADY;
+ if (sizeof(struct sctp_reset_streams) +
+ strrst->srs_number_streams * sizeof(uint16_t) > optsize) {
+ error = EINVAL;
SCTP_TCB_UNLOCK(stcb);
break;
}
if (strrst->srs_flags & SCTP_STREAM_RESET_INCOMING) {
send_in = 1;
+ if (stcb->asoc.stream_reset_outstanding) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EALREADY);
+ error = EALREADY;
+ SCTP_TCB_UNLOCK(stcb);
+ break;
+ }
}
if (strrst->srs_flags & SCTP_STREAM_RESET_OUTGOING) {
send_out = 1;
}
+ if ((strrst->srs_number_streams > SCTP_MAX_STREAMS_AT_ONCE_RESET) && send_in) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOMEM);
+ error = ENOMEM;
+ SCTP_TCB_UNLOCK(stcb);
+ break;
+ }
if ((send_in == 0) && (send_out == 0)) {
SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
error = EINVAL;
@@ -4217,11 +4671,46 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize,
SCTP_TCB_UNLOCK(stcb);
break;
}
- error = sctp_send_str_reset_req(stcb, strrst->srs_number_streams,
- strrst->srs_stream_list,
- send_out, send_in, 0, 0, 0, 0, 0);
-
- sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_STRRST_REQ, SCTP_SO_LOCKED);
+ if (send_out) {
+ int cnt;
+ uint16_t strm;
+
+ if (strrst->srs_number_streams) {
+ for (i = 0, cnt = 0; i < strrst->srs_number_streams; i++) {
+ strm = strrst->srs_stream_list[i];
+ if (stcb->asoc.strmout[strm].state == SCTP_STREAM_OPEN) {
+ stcb->asoc.strmout[strm].state = SCTP_STREAM_RESET_PENDING;
+ cnt++;
+ }
+ }
+ } else {
+ /* Its all */
+ for (i = 0, cnt = 0; i < stcb->asoc.streamoutcnt; i++) {
+ if (stcb->asoc.strmout[i].state == SCTP_STREAM_OPEN) {
+ stcb->asoc.strmout[i].state = SCTP_STREAM_RESET_PENDING;
+ cnt++;
+ }
+ }
+ }
+ }
+ if (send_in) {
+ error = sctp_send_str_reset_req(stcb, strrst->srs_number_streams,
+ strrst->srs_stream_list,
+ send_in, 0, 0, 0, 0, 0);
+ } else {
+ error = sctp_send_stream_reset_out_if_possible(stcb, SCTP_SO_LOCKED);
+ }
+ if (error == 0) {
+ sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_STRRST_REQ, SCTP_SO_LOCKED);
+ } else {
+ /*
+ * For outgoing streams don't report any
+ * problems in sending the request to the
+ * application. XXX: Double check resetting
+ * incoming streams.
+ */
+ error = 0;
+ }
SCTP_TCB_UNLOCK(stcb);
break;
}
@@ -4239,7 +4728,7 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize,
error = ENOENT;
break;
}
- if (stcb->asoc.peer_supports_strreset == 0) {
+ if (stcb->asoc.reconfig_supported == 0) {
/*
* Peer does not support the chunk type.
*/
@@ -4291,7 +4780,7 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize,
goto skip_stuff;
}
}
- error = sctp_send_str_reset_req(stcb, 0, NULL, 0, 0, 0, addstream, add_o_strmcnt, add_i_strmcnt, 0);
+ error = sctp_send_str_reset_req(stcb, 0, NULL, 0, 0, addstream, add_o_strmcnt, add_i_strmcnt, 0);
sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_STRRST_REQ, SCTP_SO_LOCKED);
skip_stuff:
SCTP_TCB_UNLOCK(stcb);
@@ -4299,6 +4788,7 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize,
}
case SCTP_RESET_ASSOC:
{
+ int i;
uint32_t *value;
SCTP_CHECK_AND_CAST(value, optval, uint32_t, optsize);
@@ -4308,7 +4798,7 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize,
error = ENOENT;
break;
}
- if (stcb->asoc.peer_supports_strreset == 0) {
+ if (stcb->asoc.reconfig_supported == 0) {
/*
* Peer does not support the chunk type.
*/
@@ -4323,7 +4813,25 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize,
SCTP_TCB_UNLOCK(stcb);
break;
}
- error = sctp_send_str_reset_req(stcb, 0, NULL, 0, 0, 1, 0, 0, 0, 0);
+ /*
+ * Is there any data pending in the send or sent
+ * queues?
+ */
+ if (!TAILQ_EMPTY(&stcb->asoc.send_queue) ||
+ !TAILQ_EMPTY(&stcb->asoc.sent_queue)) {
+ busy_out:
+ error = EBUSY;
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error);
+ SCTP_TCB_UNLOCK(stcb);
+ break;
+ }
+ /* Do any streams have data queued? */
+ for (i = 0; i < stcb->asoc.streamoutcnt; i++) {
+ if (!TAILQ_EMPTY(&stcb->asoc.strmout[i].outqueue)) {
+ goto busy_out;
+ }
+ }
+ error = sctp_send_str_reset_req(stcb, 0, NULL, 0, 1, 0, 0, 0, 0);
sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_STRRST_REQ, SCTP_SO_LOCKED);
SCTP_TCB_UNLOCK(stcb);
break;
@@ -4347,7 +4855,6 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize,
case SCTP_CONNECT_X_COMPLETE:
{
struct sockaddr *sa;
- struct sctp_nets *net;
/* FIXME MT: check correct? */
SCTP_CHECK_AND_CAST(sa, optval, struct sockaddr, optsize);
@@ -4358,7 +4865,6 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize,
stcb = LIST_FIRST(&inp->sctp_asoc_list);
if (stcb) {
SCTP_TCB_LOCK(stcb);
- net = sctp_findnet(stcb, sa);
}
SCTP_INP_RUNLOCK(inp);
} else {
@@ -4370,7 +4876,7 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize,
* TCB.. aka NULL.
*/
SCTP_INP_INCR_REF(inp);
- stcb = sctp_findassociation_ep_addr(&inp, sa, &net, NULL, NULL);
+ stcb = sctp_findassociation_ep_addr(&inp, sa, NULL, NULL, NULL);
if (stcb == NULL) {
SCTP_INP_DECR_REF(inp);
}
@@ -4386,7 +4892,7 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize,
(void)SCTP_GETTIME_TIMEVAL(&stcb->asoc.time_entered);
sctp_timer_stop(SCTP_TIMER_TYPE_INIT, inp, stcb,
stcb->asoc.primary_destination,
- SCTP_FROM_SCTP_USRREQ + SCTP_LOC_9);
+ SCTP_FROM_SCTP_USRREQ + SCTP_LOC_8);
sctp_send_initiate(inp, stcb, SCTP_SO_LOCKED);
} else {
/*
@@ -4690,12 +5196,35 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize,
{
struct sctp_paddrparams *paddrp;
struct sctp_nets *net;
+ struct sockaddr *addr;
+
+#if defined(INET) && defined(INET6)
+ struct sockaddr_in sin_store;
+
+#endif
SCTP_CHECK_AND_CAST(paddrp, optval, struct sctp_paddrparams, optsize);
SCTP_FIND_STCB(inp, stcb, paddrp->spp_assoc_id);
- net = NULL;
- if (stcb) {
- net = sctp_findnet(stcb, (struct sockaddr *)&paddrp->spp_address);
+
+#if defined(INET) && defined(INET6)
+ if (paddrp->spp_address.ss_family == AF_INET6) {
+ struct sockaddr_in6 *sin6;
+
+ sin6 = (struct sockaddr_in6 *)&paddrp->spp_address;
+ if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
+ in6_sin6_2_sin(&sin_store, sin6);
+ addr = (struct sockaddr *)&sin_store;
+ } else {
+ addr = (struct sockaddr *)&paddrp->spp_address;
+ }
+ } else {
+ addr = (struct sockaddr *)&paddrp->spp_address;
+ }
+#else
+ addr = (struct sockaddr *)&paddrp->spp_address;
+#endif
+ if (stcb != NULL) {
+ net = sctp_findnet(stcb, addr);
} else {
/*
* We increment here since
@@ -4704,25 +5233,22 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize,
* the locked tcb (last argument) is NOT a
* TCB.. aka NULL.
*/
+ net = NULL;
SCTP_INP_INCR_REF(inp);
- stcb = sctp_findassociation_ep_addr(&inp,
- (struct sockaddr *)&paddrp->spp_address,
+ stcb = sctp_findassociation_ep_addr(&inp, addr,
&net, NULL, NULL);
if (stcb == NULL) {
SCTP_INP_DECR_REF(inp);
}
}
- if (stcb && (net == NULL)) {
- struct sockaddr *sa;
-
- sa = (struct sockaddr *)&paddrp->spp_address;
+ if ((stcb != NULL) && (net == NULL)) {
#ifdef INET
- if (sa->sa_family == AF_INET) {
+ if (addr->sa_family == AF_INET) {
struct sockaddr_in *sin;
- sin = (struct sockaddr_in *)sa;
- if (sin->sin_addr.s_addr) {
+ sin = (struct sockaddr_in *)addr;
+ if (sin->sin_addr.s_addr != INADDR_ANY) {
SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
SCTP_TCB_UNLOCK(stcb);
error = EINVAL;
@@ -4731,10 +5257,10 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize,
} else
#endif
#ifdef INET6
- if (sa->sa_family == AF_INET6) {
+ if (addr->sa_family == AF_INET6) {
struct sockaddr_in6 *sin6;
- sin6 = (struct sockaddr_in6 *)sa;
+ sin6 = (struct sockaddr_in6 *)addr;
if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
SCTP_TCB_UNLOCK(stcb);
@@ -4763,28 +5289,15 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize,
SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
return (EINVAL);
}
- if (stcb) {
+ if (stcb != NULL) {
/************************TCB SPECIFIC SET ******************/
- /*
- * do we change the timer for HB, we run
- * only one?
- */
- int ovh = 0;
-
- if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) {
- ovh = SCTP_MED_OVERHEAD;
- } else {
- ovh = SCTP_MED_V4_OVERHEAD;
- }
-
- /* network sets ? */
- if (net) {
+ if (net != NULL) {
/************************NET SPECIFIC SET ******************/
if (paddrp->spp_flags & SPP_HB_DISABLE) {
if (!(net->dest_state & SCTP_ADDR_UNCONFIRMED) &&
!(net->dest_state & SCTP_ADDR_NOHB)) {
sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net,
- SCTP_FROM_SCTP_USRREQ + SCTP_LOC_10);
+ SCTP_FROM_SCTP_USRREQ + SCTP_LOC_9);
}
net->dest_state |= SCTP_ADDR_NOHB;
}
@@ -4808,10 +5321,24 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize,
if ((paddrp->spp_flags & SPP_PMTUD_DISABLE) && (paddrp->spp_pathmtu >= SCTP_SMALLEST_PMTU)) {
if (SCTP_OS_TIMER_PENDING(&net->pmtu_timer.timer)) {
sctp_timer_stop(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net,
- SCTP_FROM_SCTP_USRREQ + SCTP_LOC_10);
+ SCTP_FROM_SCTP_USRREQ + SCTP_LOC_11);
}
net->dest_state |= SCTP_ADDR_NO_PMTUD;
- net->mtu = paddrp->spp_pathmtu + ovh;
+ net->mtu = paddrp->spp_pathmtu;
+ switch (net->ro._l_addr.sa.sa_family) {
+#ifdef INET
+ case AF_INET:
+ net->mtu += SCTP_MIN_V4_OVERHEAD;
+ break;
+#endif
+#ifdef INET6
+ case AF_INET6:
+ net->mtu += SCTP_MIN_OVERHEAD;
+ break;
+#endif
+ default:
+ break;
+ }
if (net->mtu < stcb->asoc.smallest_mtu) {
sctp_pathmtu_adjustment(stcb, net->mtu);
}
@@ -4832,7 +5359,9 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize,
(net->error_count > net->pf_threshold)) {
net->dest_state |= SCTP_ADDR_PF;
sctp_send_hb(stcb, net, SCTP_SO_LOCKED);
- sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_TIMER + SCTP_LOC_3);
+ sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT,
+ stcb->sctp_ep, stcb, net,
+ SCTP_FROM_SCTP_USRREQ + SCTP_LOC_12);
sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net);
}
}
@@ -4863,7 +5392,7 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize,
#endif
} else {
/************************ASSOC ONLY -- NO NET SPECIFIC SET ******************/
- if (paddrp->spp_pathmaxrxt) {
+ if (paddrp->spp_pathmaxrxt != 0) {
stcb->asoc.def_net_failure = paddrp->spp_pathmaxrxt;
TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
if (net->dest_state & SCTP_ADDR_PF) {
@@ -4875,7 +5404,9 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize,
(net->error_count > net->pf_threshold)) {
net->dest_state |= SCTP_ADDR_PF;
sctp_send_hb(stcb, net, SCTP_SO_LOCKED);
- sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_TIMER + SCTP_LOC_3);
+ sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT,
+ stcb->sctp_ep, stcb, net,
+ SCTP_FROM_SCTP_USRREQ + SCTP_LOC_13);
sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net);
}
}
@@ -4894,14 +5425,14 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize,
}
}
if (paddrp->spp_flags & SPP_HB_ENABLE) {
- if (paddrp->spp_hbinterval) {
+ if (paddrp->spp_hbinterval != 0) {
stcb->asoc.heart_beat_delay = paddrp->spp_hbinterval;
} else if (paddrp->spp_flags & SPP_HB_TIME_IS_ZERO) {
stcb->asoc.heart_beat_delay = 0;
}
/* Turn back on the timer */
TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
- if (paddrp->spp_hbinterval) {
+ if (paddrp->spp_hbinterval != 0) {
net->heart_beat_delay = paddrp->spp_hbinterval;
} else if (paddrp->spp_flags & SPP_HB_TIME_IS_ZERO) {
net->heart_beat_delay = 0;
@@ -4910,7 +5441,7 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize,
net->dest_state &= ~SCTP_ADDR_NOHB;
}
sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net,
- SCTP_FROM_SCTP_USRREQ + SCTP_LOC_10);
+ SCTP_FROM_SCTP_USRREQ + SCTP_LOC_14);
sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net);
}
sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_DONOT_HEARTBEAT);
@@ -4920,7 +5451,9 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize,
if (!(net->dest_state & SCTP_ADDR_NOHB)) {
net->dest_state |= SCTP_ADDR_NOHB;
if (!(net->dest_state & SCTP_ADDR_UNCONFIRMED)) {
- sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_10);
+ sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT,
+ inp, stcb, net,
+ SCTP_FROM_SCTP_USRREQ + SCTP_LOC_15);
}
}
}
@@ -4930,10 +5463,24 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize,
TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
if (SCTP_OS_TIMER_PENDING(&net->pmtu_timer.timer)) {
sctp_timer_stop(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net,
- SCTP_FROM_SCTP_USRREQ + SCTP_LOC_10);
+ SCTP_FROM_SCTP_USRREQ + SCTP_LOC_16);
}
net->dest_state |= SCTP_ADDR_NO_PMTUD;
- net->mtu = paddrp->spp_pathmtu + ovh;
+ net->mtu = paddrp->spp_pathmtu;
+ switch (net->ro._l_addr.sa.sa_family) {
+#ifdef INET
+ case AF_INET:
+ net->mtu += SCTP_MIN_V4_OVERHEAD;
+ break;
+#endif
+#ifdef INET6
+ case AF_INET6:
+ net->mtu += SCTP_MIN_OVERHEAD;
+ break;
+#endif
+ default:
+ break;
+ }
if (net->mtu < stcb->asoc.smallest_mtu) {
sctp_pathmtu_adjustment(stcb, net->mtu);
}
@@ -4982,12 +5529,12 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize,
* set it with the options on the
* socket
*/
- if (paddrp->spp_pathmaxrxt) {
+ if (paddrp->spp_pathmaxrxt != 0) {
inp->sctp_ep.def_net_failure = paddrp->spp_pathmaxrxt;
}
if (paddrp->spp_flags & SPP_HB_TIME_IS_ZERO)
inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_HEARTBEAT] = 0;
- else if (paddrp->spp_hbinterval) {
+ else if (paddrp->spp_hbinterval != 0) {
if (paddrp->spp_hbinterval > SCTP_MAX_HB_INTERVAL)
paddrp->spp_hbinterval = SCTP_MAX_HB_INTERVAL;
inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_HEARTBEAT] = MSEC_TO_TICKS(paddrp->spp_hbinterval);
@@ -5153,13 +5700,35 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize,
{
struct sctp_setprim *spa;
struct sctp_nets *net;
+ struct sockaddr *addr;
+
+#if defined(INET) && defined(INET6)
+ struct sockaddr_in sin_store;
+
+#endif
SCTP_CHECK_AND_CAST(spa, optval, struct sctp_setprim, optsize);
SCTP_FIND_STCB(inp, stcb, spa->ssp_assoc_id);
- net = NULL;
- if (stcb) {
- net = sctp_findnet(stcb, (struct sockaddr *)&spa->ssp_addr);
+#if defined(INET) && defined(INET6)
+ if (spa->ssp_addr.ss_family == AF_INET6) {
+ struct sockaddr_in6 *sin6;
+
+ sin6 = (struct sockaddr_in6 *)&spa->ssp_addr;
+ if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
+ in6_sin6_2_sin(&sin_store, sin6);
+ addr = (struct sockaddr *)&sin_store;
+ } else {
+ addr = (struct sockaddr *)&spa->ssp_addr;
+ }
+ } else {
+ addr = (struct sockaddr *)&spa->ssp_addr;
+ }
+#else
+ addr = (struct sockaddr *)&spa->ssp_addr;
+#endif
+ if (stcb != NULL) {
+ net = sctp_findnet(stcb, addr);
} else {
/*
* We increment here since
@@ -5168,33 +5737,40 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize,
* the locked tcb (last argument) is NOT a
* TCB.. aka NULL.
*/
+ net = NULL;
SCTP_INP_INCR_REF(inp);
- stcb = sctp_findassociation_ep_addr(&inp,
- (struct sockaddr *)&spa->ssp_addr,
+ stcb = sctp_findassociation_ep_addr(&inp, addr,
&net, NULL, NULL);
if (stcb == NULL) {
SCTP_INP_DECR_REF(inp);
}
}
- if ((stcb) && (net)) {
- if ((net != stcb->asoc.primary_destination) &&
- (!(net->dest_state & SCTP_ADDR_UNCONFIRMED))) {
- /* Ok we need to set it */
- if (sctp_set_primary_addr(stcb, (struct sockaddr *)NULL, net) == 0) {
- if ((stcb->asoc.alternate) &&
- (!(net->dest_state & SCTP_ADDR_PF)) &&
- (net->dest_state & SCTP_ADDR_REACHABLE)) {
- sctp_free_remote_addr(stcb->asoc.alternate);
- stcb->asoc.alternate = NULL;
+ if ((stcb != NULL) && (net != NULL)) {
+ if (net != stcb->asoc.primary_destination) {
+ if (!(net->dest_state & SCTP_ADDR_UNCONFIRMED)) {
+ /* Ok we need to set it */
+ if (sctp_set_primary_addr(stcb, (struct sockaddr *)NULL, net) == 0) {
+ if ((stcb->asoc.alternate) &&
+ (!(net->dest_state & SCTP_ADDR_PF)) &&
+ (net->dest_state & SCTP_ADDR_REACHABLE)) {
+ sctp_free_remote_addr(stcb->asoc.alternate);
+ stcb->asoc.alternate = NULL;
+ }
+ } else {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
}
+ } else {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
}
}
} else {
SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
error = EINVAL;
}
- if (stcb) {
+ if (stcb != NULL) {
SCTP_TCB_UNLOCK(stcb);
}
break;
@@ -5216,14 +5792,36 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize,
case SCTP_SET_PEER_PRIMARY_ADDR:
{
struct sctp_setpeerprim *sspp;
+ struct sockaddr *addr;
+
+#if defined(INET) && defined(INET6)
+ struct sockaddr_in sin_store;
+
+#endif
SCTP_CHECK_AND_CAST(sspp, optval, struct sctp_setpeerprim, optsize);
SCTP_FIND_STCB(inp, stcb, sspp->sspp_assoc_id);
if (stcb != NULL) {
struct sctp_ifa *ifa;
- ifa = sctp_find_ifa_by_addr((struct sockaddr *)&sspp->sspp_addr,
- stcb->asoc.vrf_id, SCTP_ADDR_NOT_LOCKED);
+#if defined(INET) && defined(INET6)
+ if (sspp->sspp_addr.ss_family == AF_INET6) {
+ struct sockaddr_in6 *sin6;
+
+ sin6 = (struct sockaddr_in6 *)&sspp->sspp_addr;
+ if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
+ in6_sin6_2_sin(&sin_store, sin6);
+ addr = (struct sockaddr *)&sin_store;
+ } else {
+ addr = (struct sockaddr *)&sspp->sspp_addr;
+ }
+ } else {
+ addr = (struct sockaddr *)&sspp->sspp_addr;
+ }
+#else
+ addr = (struct sockaddr *)&sspp->sspp_addr;
+#endif
+ ifa = sctp_find_ifa_by_addr(addr, stcb->asoc.vrf_id, SCTP_ADDR_NOT_LOCKED);
if (ifa == NULL) {
SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
error = EINVAL;
@@ -5240,7 +5838,11 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize,
LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) {
if (laddr->ifa == NULL) {
SCTPDBG(SCTP_DEBUG_OUTPUT1, "%s: NULL ifa\n",
- __FUNCTION__);
+ __func__);
+ continue;
+ }
+ if ((sctp_is_addr_restricted(stcb, laddr->ifa)) &&
+ (!sctp_is_addr_pending(stcb, laddr->ifa))) {
continue;
}
if (laddr->ifa == ifa) {
@@ -5254,13 +5856,13 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize,
goto out_of_it;
}
} else {
- switch (sspp->sspp_addr.ss_family) {
+ switch (addr->sa_family) {
#ifdef INET
case AF_INET:
{
struct sockaddr_in *sin;
- sin = (struct sockaddr_in *)&sspp->sspp_addr;
+ sin = (struct sockaddr_in *)addr;
if (prison_check_ip4(inp->ip_inp.inp.inp_cred,
&sin->sin_addr) != 0) {
SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
@@ -5275,7 +5877,7 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize,
{
struct sockaddr_in6 *sin6;
- sin6 = (struct sockaddr_in6 *)&sspp->sspp_addr;
+ sin6 = (struct sockaddr_in6 *)addr;
if (prison_check_ip6(inp->ip_inp.inp.inp_cred,
&sin6->sin6_addr) != 0) {
SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
@@ -5291,11 +5893,11 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize,
goto out_of_it;
}
}
- if (sctp_set_primary_ip_address_sa(stcb,
- (struct sockaddr *)&sspp->sspp_addr) != 0) {
+ if (sctp_set_primary_ip_address_sa(stcb, addr) != 0) {
SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
error = EINVAL;
}
+ sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_SOCKOPT, SCTP_SO_LOCKED);
out_of_it:
SCTP_TCB_UNLOCK(stcb);
} else {
@@ -5602,7 +6204,7 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize,
SCTP_CHECK_AND_CAST(info, optval, struct sctp_default_prinfo, optsize);
SCTP_FIND_STCB(inp, stcb, info->pr_assoc_id);
- if (PR_SCTP_INVALID_POLICY(info->pr_policy)) {
+ if (info->pr_policy > SCTP_PR_SCTP_MAX) {
SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
error = EINVAL;
break;
@@ -5643,12 +6245,35 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize,
{
struct sctp_paddrthlds *thlds;
struct sctp_nets *net;
+ struct sockaddr *addr;
+
+#if defined(INET) && defined(INET6)
+ struct sockaddr_in sin_store;
+
+#endif
SCTP_CHECK_AND_CAST(thlds, optval, struct sctp_paddrthlds, optsize);
SCTP_FIND_STCB(inp, stcb, thlds->spt_assoc_id);
- net = NULL;
- if (stcb) {
- net = sctp_findnet(stcb, (struct sockaddr *)&thlds->spt_address);
+
+#if defined(INET) && defined(INET6)
+ if (thlds->spt_address.ss_family == AF_INET6) {
+ struct sockaddr_in6 *sin6;
+
+ sin6 = (struct sockaddr_in6 *)&thlds->spt_address;
+ if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
+ in6_sin6_2_sin(&sin_store, sin6);
+ addr = (struct sockaddr *)&sin_store;
+ } else {
+ addr = (struct sockaddr *)&thlds->spt_address;
+ }
+ } else {
+ addr = (struct sockaddr *)&thlds->spt_address;
+ }
+#else
+ addr = (struct sockaddr *)&thlds->spt_address;
+#endif
+ if (stcb != NULL) {
+ net = sctp_findnet(stcb, addr);
} else {
/*
* We increment here since
@@ -5657,25 +6282,22 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize,
* the locked tcb (last argument) is NOT a
* TCB.. aka NULL.
*/
+ net = NULL;
SCTP_INP_INCR_REF(inp);
- stcb = sctp_findassociation_ep_addr(&inp,
- (struct sockaddr *)&thlds->spt_address,
+ stcb = sctp_findassociation_ep_addr(&inp, addr,
&net, NULL, NULL);
if (stcb == NULL) {
SCTP_INP_DECR_REF(inp);
}
}
- if (stcb && (net == NULL)) {
- struct sockaddr *sa;
-
- sa = (struct sockaddr *)&thlds->spt_address;
+ if ((stcb != NULL) && (net == NULL)) {
#ifdef INET
- if (sa->sa_family == AF_INET) {
+ if (addr->sa_family == AF_INET) {
struct sockaddr_in *sin;
- sin = (struct sockaddr_in *)sa;
- if (sin->sin_addr.s_addr) {
+ sin = (struct sockaddr_in *)addr;
+ if (sin->sin_addr.s_addr != INADDR_ANY) {
SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
SCTP_TCB_UNLOCK(stcb);
error = EINVAL;
@@ -5684,10 +6306,10 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize,
} else
#endif
#ifdef INET6
- if (sa->sa_family == AF_INET6) {
+ if (addr->sa_family == AF_INET6) {
struct sockaddr_in6 *sin6;
- sin6 = (struct sockaddr_in6 *)sa;
+ sin6 = (struct sockaddr_in6 *)addr;
if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
SCTP_TCB_UNLOCK(stcb);
@@ -5703,68 +6325,87 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize,
break;
}
}
- if (stcb) {
- if (net) {
+ if (thlds->spt_pathcpthld != 0xffff) {
+ /*
+ * stcb, when non-NULL, is still locked at this point: the
+ * error exits above and the success path below each call
+ * SCTP_TCB_UNLOCK(stcb). Drop the lock here as well so this
+ * early exit does not leave the TCB locked.
+ */
+ if (stcb != NULL) {
+ SCTP_TCB_UNLOCK(stcb);
+ }
+ error = EINVAL;
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error);
+ break;
+ }
+ if (stcb != NULL) {
+ if (net != NULL) {
+ net->failure_threshold = thlds->spt_pathmaxrxt;
+ net->pf_threshold = thlds->spt_pathpfthld;
if (net->dest_state & SCTP_ADDR_PF) {
- if ((net->failure_threshold > thlds->spt_pathmaxrxt) ||
- (net->failure_threshold <= thlds->spt_pathpfthld)) {
+ if ((net->error_count > net->failure_threshold) ||
+ (net->error_count <= net->pf_threshold)) {
net->dest_state &= ~SCTP_ADDR_PF;
}
} else {
- if ((net->failure_threshold > thlds->spt_pathpfthld) &&
- (net->failure_threshold <= thlds->spt_pathmaxrxt)) {
+ if ((net->error_count > net->pf_threshold) &&
+ (net->error_count <= net->failure_threshold)) {
net->dest_state |= SCTP_ADDR_PF;
sctp_send_hb(stcb, net, SCTP_SO_LOCKED);
- sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_TIMER + SCTP_LOC_3);
+ sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT,
+ stcb->sctp_ep, stcb, net,
+ SCTP_FROM_SCTP_USRREQ + SCTP_LOC_17);
sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net);
}
}
if (net->dest_state & SCTP_ADDR_REACHABLE) {
- if (net->failure_threshold > thlds->spt_pathmaxrxt) {
+ if (net->error_count > net->failure_threshold) {
net->dest_state &= ~SCTP_ADDR_REACHABLE;
sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_DOWN, stcb, 0, net, SCTP_SO_LOCKED);
}
} else {
- if (net->failure_threshold <= thlds->spt_pathmaxrxt) {
+ if (net->error_count <= net->failure_threshold) {
net->dest_state |= SCTP_ADDR_REACHABLE;
sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_UP, stcb, 0, net, SCTP_SO_LOCKED);
}
}
- net->failure_threshold = thlds->spt_pathmaxrxt;
- net->pf_threshold = thlds->spt_pathpfthld;
} else {
TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
+ net->failure_threshold = thlds->spt_pathmaxrxt;
+ net->pf_threshold = thlds->spt_pathpfthld;
if (net->dest_state & SCTP_ADDR_PF) {
- if ((net->failure_threshold > thlds->spt_pathmaxrxt) ||
- (net->failure_threshold <= thlds->spt_pathpfthld)) {
+ if ((net->error_count > net->failure_threshold) ||
+ (net->error_count <= net->pf_threshold)) {
net->dest_state &= ~SCTP_ADDR_PF;
}
} else {
- if ((net->failure_threshold > thlds->spt_pathpfthld) &&
- (net->failure_threshold <= thlds->spt_pathmaxrxt)) {
+ if ((net->error_count > net->pf_threshold) &&
+ (net->error_count <= net->failure_threshold)) {
net->dest_state |= SCTP_ADDR_PF;
sctp_send_hb(stcb, net, SCTP_SO_LOCKED);
- sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_TIMER + SCTP_LOC_3);
+ sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT,
+ stcb->sctp_ep, stcb, net,
+ SCTP_FROM_SCTP_USRREQ + SCTP_LOC_18);
sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net);
}
}
if (net->dest_state & SCTP_ADDR_REACHABLE) {
- if (net->failure_threshold > thlds->spt_pathmaxrxt) {
+ if (net->error_count > net->failure_threshold) {
net->dest_state &= ~SCTP_ADDR_REACHABLE;
sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_DOWN, stcb, 0, net, SCTP_SO_LOCKED);
}
} else {
- if (net->failure_threshold <= thlds->spt_pathmaxrxt) {
+ if (net->error_count <= net->failure_threshold) {
net->dest_state |= SCTP_ADDR_REACHABLE;
sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_UP, stcb, 0, net, SCTP_SO_LOCKED);
}
}
- net->failure_threshold = thlds->spt_pathmaxrxt;
- net->pf_threshold = thlds->spt_pathpfthld;
}
stcb->asoc.def_net_failure = thlds->spt_pathmaxrxt;
stcb->asoc.def_net_pf_threshold = thlds->spt_pathpfthld;
}
+ SCTP_TCB_UNLOCK(stcb);
} else {
if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
(inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
@@ -5784,11 +6416,35 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize,
{
struct sctp_udpencaps *encaps;
struct sctp_nets *net;
+ struct sockaddr *addr;
+
+#if defined(INET) && defined(INET6)
+ struct sockaddr_in sin_store;
+
+#endif
SCTP_CHECK_AND_CAST(encaps, optval, struct sctp_udpencaps, optsize);
SCTP_FIND_STCB(inp, stcb, encaps->sue_assoc_id);
- if (stcb) {
- net = sctp_findnet(stcb, (struct sockaddr *)&encaps->sue_address);
+
+#if defined(INET) && defined(INET6)
+ if (encaps->sue_address.ss_family == AF_INET6) {
+ struct sockaddr_in6 *sin6;
+
+ sin6 = (struct sockaddr_in6 *)&encaps->sue_address;
+ if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
+ in6_sin6_2_sin(&sin_store, sin6);
+ addr = (struct sockaddr *)&sin_store;
+ } else {
+ addr = (struct sockaddr *)&encaps->sue_address;
+ }
+ } else {
+ addr = (struct sockaddr *)&encaps->sue_address;
+ }
+#else
+ addr = (struct sockaddr *)&encaps->sue_address;
+#endif
+ if (stcb != NULL) {
+ net = sctp_findnet(stcb, addr);
} else {
/*
* We increment here since
@@ -5799,22 +6455,19 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize,
*/
net = NULL;
SCTP_INP_INCR_REF(inp);
- stcb = sctp_findassociation_ep_addr(&inp, (struct sockaddr *)&encaps->sue_address, &net, NULL, NULL);
+ stcb = sctp_findassociation_ep_addr(&inp, addr, &net, NULL, NULL);
if (stcb == NULL) {
SCTP_INP_DECR_REF(inp);
}
}
- if (stcb && (net == NULL)) {
- struct sockaddr *sa;
-
- sa = (struct sockaddr *)&encaps->sue_address;
+ if ((stcb != NULL) && (net == NULL)) {
#ifdef INET
- if (sa->sa_family == AF_INET) {
+ if (addr->sa_family == AF_INET) {
struct sockaddr_in *sin;
- sin = (struct sockaddr_in *)sa;
- if (sin->sin_addr.s_addr) {
+ sin = (struct sockaddr_in *)addr;
+ if (sin->sin_addr.s_addr != INADDR_ANY) {
SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
SCTP_TCB_UNLOCK(stcb);
error = EINVAL;
@@ -5823,10 +6476,10 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize,
} else
#endif
#ifdef INET6
- if (sa->sa_family == AF_INET6) {
+ if (addr->sa_family == AF_INET6) {
struct sockaddr_in6 *sin6;
- sin6 = (struct sockaddr_in6 *)sa;
+ sin6 = (struct sockaddr_in6 *)addr;
if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
SCTP_TCB_UNLOCK(stcb);
@@ -5842,8 +6495,8 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize,
break;
}
}
- if (stcb) {
- if (net) {
+ if (stcb != NULL) {
+ if (net != NULL) {
net->port = encaps->sue_port;
} else {
stcb->asoc.port = encaps->sue_port;
@@ -5863,6 +6516,273 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize,
}
break;
}
+ case SCTP_ECN_SUPPORTED:
+ {
+ struct sctp_assoc_value *av;
+
+ SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize);
+ SCTP_FIND_STCB(inp, stcb, av->assoc_id);
+
+ if (stcb) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ SCTP_TCB_UNLOCK(stcb);
+ } else {
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
+ (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
+ (av->assoc_id == SCTP_FUTURE_ASSOC)) {
+ SCTP_INP_WLOCK(inp);
+ if (av->assoc_value == 0) {
+ inp->ecn_supported = 0;
+ } else {
+ inp->ecn_supported = 1;
+ }
+ SCTP_INP_WUNLOCK(inp);
+ } else {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ }
+ }
+ break;
+ }
+ case SCTP_PR_SUPPORTED:
+ {
+ struct sctp_assoc_value *av;
+
+ SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize);
+ SCTP_FIND_STCB(inp, stcb, av->assoc_id);
+
+ if (stcb) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ SCTP_TCB_UNLOCK(stcb);
+ } else {
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
+ (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
+ (av->assoc_id == SCTP_FUTURE_ASSOC)) {
+ SCTP_INP_WLOCK(inp);
+ if (av->assoc_value == 0) {
+ inp->prsctp_supported = 0;
+ } else {
+ inp->prsctp_supported = 1;
+ }
+ SCTP_INP_WUNLOCK(inp);
+ } else {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ }
+ }
+ break;
+ }
+ case SCTP_AUTH_SUPPORTED:
+ {
+ struct sctp_assoc_value *av;
+
+ SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize);
+ SCTP_FIND_STCB(inp, stcb, av->assoc_id);
+
+ if (stcb) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ SCTP_TCB_UNLOCK(stcb);
+ } else {
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
+ (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
+ (av->assoc_id == SCTP_FUTURE_ASSOC)) {
+ if ((av->assoc_value == 0) &&
+ (inp->asconf_supported == 1)) {
+ /*
+ * AUTH is required for
+ * ASCONF
+ */
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ } else {
+ SCTP_INP_WLOCK(inp);
+ if (av->assoc_value == 0) {
+ inp->auth_supported = 0;
+ } else {
+ inp->auth_supported = 1;
+ }
+ SCTP_INP_WUNLOCK(inp);
+ }
+ } else {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ }
+ }
+ break;
+ }
+ case SCTP_ASCONF_SUPPORTED:
+ {
+ struct sctp_assoc_value *av;
+
+ SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize);
+ SCTP_FIND_STCB(inp, stcb, av->assoc_id);
+
+ if (stcb) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ SCTP_TCB_UNLOCK(stcb);
+ } else {
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
+ (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
+ (av->assoc_id == SCTP_FUTURE_ASSOC)) {
+ if ((av->assoc_value != 0) &&
+ (inp->auth_supported == 0)) {
+ /*
+ * AUTH is required for
+ * ASCONF
+ */
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ } else {
+ SCTP_INP_WLOCK(inp);
+ if (av->assoc_value == 0) {
+ inp->asconf_supported = 0;
+ sctp_auth_delete_chunk(SCTP_ASCONF,
+ inp->sctp_ep.local_auth_chunks);
+ sctp_auth_delete_chunk(SCTP_ASCONF_ACK,
+ inp->sctp_ep.local_auth_chunks);
+ } else {
+ inp->asconf_supported = 1;
+ sctp_auth_add_chunk(SCTP_ASCONF,
+ inp->sctp_ep.local_auth_chunks);
+ sctp_auth_add_chunk(SCTP_ASCONF_ACK,
+ inp->sctp_ep.local_auth_chunks);
+ }
+ SCTP_INP_WUNLOCK(inp);
+ }
+ } else {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ }
+ }
+ break;
+ }
+ case SCTP_RECONFIG_SUPPORTED:
+ {
+ struct sctp_assoc_value *av;
+
+ SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize);
+ SCTP_FIND_STCB(inp, stcb, av->assoc_id);
+
+ if (stcb) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ SCTP_TCB_UNLOCK(stcb);
+ } else {
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
+ (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
+ (av->assoc_id == SCTP_FUTURE_ASSOC)) {
+ SCTP_INP_WLOCK(inp);
+ if (av->assoc_value == 0) {
+ inp->reconfig_supported = 0;
+ } else {
+ inp->reconfig_supported = 1;
+ }
+ SCTP_INP_WUNLOCK(inp);
+ } else {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ }
+ }
+ break;
+ }
+ case SCTP_NRSACK_SUPPORTED:
+ {
+ struct sctp_assoc_value *av;
+
+ SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize);
+ SCTP_FIND_STCB(inp, stcb, av->assoc_id);
+
+ if (stcb) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ SCTP_TCB_UNLOCK(stcb);
+ } else {
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
+ (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
+ (av->assoc_id == SCTP_FUTURE_ASSOC)) {
+ SCTP_INP_WLOCK(inp);
+ if (av->assoc_value == 0) {
+ inp->nrsack_supported = 0;
+ } else {
+ inp->nrsack_supported = 1;
+ }
+ SCTP_INP_WUNLOCK(inp);
+ } else {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ }
+ }
+ break;
+ }
+ case SCTP_PKTDROP_SUPPORTED:
+ {
+ struct sctp_assoc_value *av;
+
+ SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize);
+ SCTP_FIND_STCB(inp, stcb, av->assoc_id);
+
+ if (stcb) {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ SCTP_TCB_UNLOCK(stcb);
+ } else {
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
+ (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
+ (av->assoc_id == SCTP_FUTURE_ASSOC)) {
+ SCTP_INP_WLOCK(inp);
+ if (av->assoc_value == 0) {
+ inp->pktdrop_supported = 0;
+ } else {
+ inp->pktdrop_supported = 1;
+ }
+ SCTP_INP_WUNLOCK(inp);
+ } else {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ }
+ }
+ break;
+ }
+ case SCTP_MAX_CWND:
+ {
+ struct sctp_assoc_value *av;
+ struct sctp_nets *net;
+
+ SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize);
+ SCTP_FIND_STCB(inp, stcb, av->assoc_id);
+
+ if (stcb) {
+ stcb->asoc.max_cwnd = av->assoc_value;
+ if (stcb->asoc.max_cwnd > 0) {
+ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
+ if ((net->cwnd > stcb->asoc.max_cwnd) &&
+ (net->cwnd > (net->mtu - sizeof(struct sctphdr)))) {
+ net->cwnd = stcb->asoc.max_cwnd;
+ if (net->cwnd < (net->mtu - sizeof(struct sctphdr))) {
+ net->cwnd = net->mtu - sizeof(struct sctphdr);
+ }
+ }
+ }
+ }
+ SCTP_TCB_UNLOCK(stcb);
+ } else {
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
+ (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
+ (av->assoc_id == SCTP_FUTURE_ASSOC)) {
+ SCTP_INP_WLOCK(inp);
+ inp->max_cwnd = av->assoc_value;
+ SCTP_INP_WUNLOCK(inp);
+ } else {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ }
+ }
+ break;
+ }
default:
SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOPROTOOPT);
error = ENOPROTOOPT;
@@ -5878,7 +6798,25 @@ sctp_ctloutput(struct socket *so, struct sockopt *sopt)
size_t optsize = 0;
void *p;
int error = 0;
+ struct sctp_inpcb *inp;
+ /*
+ * SOL_SOCKET/SO_SETFIB is handled here: copy the socket's
+ * so_fibnum into the endpoint while holding the INP write lock.
+ */
+ if ((sopt->sopt_level == SOL_SOCKET) &&
+ (sopt->sopt_name == SO_SETFIB)) {
+ inp = (struct sctp_inpcb *)so->so_pcb;
+ if (inp == NULL) {
+ /* Trace the same errno that is returned to the caller. */
+ SCTP_LTRACE_ERR_RET(so->so_pcb, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ return (EINVAL);
+ }
+ SCTP_INP_WLOCK(inp);
+ inp->fibnum = so->so_fibnum;
+ SCTP_INP_WUNLOCK(inp);
+ return (0);
+ }
if (sopt->sopt_level != IPPROTO_SCTP) {
/* wrong proto level... send back up to IP */
#ifdef INET6
@@ -6052,7 +6985,9 @@ sctp_connect(struct socket *so, struct sockaddr *addr, struct thread *p)
}
vrf_id = inp->def_vrf_id;
/* We are GOOD to go */
- stcb = sctp_aloc_assoc(inp, addr, &error, 0, vrf_id, p);
+ stcb = sctp_aloc_assoc(inp, addr, &error, 0, vrf_id,
+ inp->sctp_ep.pre_open_stream_count,
+ inp->sctp_ep.port, p);
if (stcb == NULL) {
/* Gak! no memory */
goto out_now;
@@ -6182,7 +7117,7 @@ sctp_listen(struct socket *so, int backlog, struct thread *p)
SCTP_INP_DECR_REF(tinp);
return (EADDRINUSE);
} else if (tinp) {
- SCTP_INP_DECR_REF(inp);
+ SCTP_INP_DECR_REF(tinp);
}
}
}
@@ -6194,8 +7129,8 @@ sctp_listen(struct socket *so, int backlog, struct thread *p)
#endif
SOCK_LOCK(so);
error = solisten_proto_check(so);
+ SOCK_UNLOCK(so);
if (error) {
- SOCK_UNLOCK(so);
SCTP_INP_RUNLOCK(inp);
return (error);
}
@@ -6208,28 +7143,27 @@ sctp_listen(struct socket *so, int backlog, struct thread *p)
* move the guy that was listener to the TCP Pool.
*/
if (sctp_swap_inpcb_for_listen(inp)) {
- goto in_use;
+ SCTP_INP_RUNLOCK(inp);
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EADDRINUSE);
+ return (EADDRINUSE);
}
}
if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) &&
(inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED)) {
/* We are already connected AND the TCP model */
-in_use:
SCTP_INP_RUNLOCK(inp);
- SOCK_UNLOCK(so);
SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EADDRINUSE);
return (EADDRINUSE);
}
SCTP_INP_RUNLOCK(inp);
if (inp->sctp_flags & SCTP_PCB_FLAGS_UNBOUND) {
/* We must do a bind. */
- SOCK_UNLOCK(so);
if ((error = sctp_inpcb_bind(so, NULL, NULL, p))) {
/* bind error, probably perm */
return (error);
}
- SOCK_LOCK(so);
}
+ SOCK_LOCK(so);
/* It appears for 7.0 and on, we must always call this. */
solisten_proto(so, backlog);
if (inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) {
@@ -6357,7 +7291,8 @@ sctp_accept(struct socket *so, struct sockaddr **addr)
}
if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) {
SCTP_TCB_LOCK(stcb);
- sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_7);
+ sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC,
+ SCTP_FROM_SCTP_USRREQ + SCTP_LOC_19);
}
return (0);
}
@@ -6442,7 +7377,7 @@ sctp_ingetaddr(struct socket *so, struct sockaddr **addr)
if (laddr->ifa->address.sa.sa_family == AF_INET) {
struct sockaddr_in *sin_a;
- sin_a = (struct sockaddr_in *)&laddr->ifa->address.sa;
+ sin_a = &laddr->ifa->address.sin;
sin->sin_addr = sin_a->sin_addr;
fnd = 1;
break;
diff --git a/freebsd/sys/netinet/sctp_var.h b/freebsd/sys/netinet/sctp_var.h
index d88a2376..a4d2b998 100644
--- a/freebsd/sys/netinet/sctp_var.h
+++ b/freebsd/sys/netinet/sctp_var.h
@@ -72,7 +72,7 @@ extern struct pr_usrreqs sctp_usrreqs;
((stcb->asoc.sctp_features & feature) == 0)) || \
((stcb == NULL) && (inp != NULL) && \
((inp->sctp_features & feature) == 0)) || \
- ((stcb == NULL) && (inp == NULL)))
+ ((stcb == NULL) && (inp == NULL)))
/* managing mobility_feature in inpcb (by micchie) */
#define sctp_mobility_feature_on(inp, feature) (inp->sctp_mobility_features |= feature)
@@ -86,7 +86,7 @@ extern struct pr_usrreqs sctp_usrreqs;
#define sctp_sbspace_failedmsgs(sb) ((long) ((sctp_maxspace(sb) > (sb)->sb_cc) ? (sctp_maxspace(sb) - (sb)->sb_cc) : 0))
-#define sctp_sbspace_sub(a,b) ((a > b) ? (a - b) : 0)
+#define sctp_sbspace_sub(a,b) (((a) > (b)) ? ((a) - (b)) : 0)
/*
* I tried to cache the readq entries at one point. But the reality
@@ -97,16 +97,24 @@ extern struct pr_usrreqs sctp_usrreqs;
* an mbuf cache as well so it is not really worth doing, at least
* right now :-D
*/
-
+#ifdef INVARIANTS
#define sctp_free_a_readq(_stcb, _readq) { \
+ if ((_readq)->on_strm_q) \
+ panic("On strm q stcb:%p readq:%p", (_stcb), (_readq)); \
SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_readq), (_readq)); \
SCTP_DECR_READQ_COUNT(); \
}
+#else
+#define sctp_free_a_readq(_stcb, _readq) { \
+ SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_readq), (_readq)); \
+ SCTP_DECR_READQ_COUNT(); \
+}
+#endif
#define sctp_alloc_a_readq(_stcb, _readq) { \
(_readq) = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_readq), struct sctp_queued_to_read); \
if ((_readq)) { \
- SCTP_INCR_READQ_COUNT(); \
+ SCTP_INCR_READQ_COUNT(); \
} \
}
@@ -121,11 +129,11 @@ extern struct pr_usrreqs sctp_usrreqs;
#define sctp_alloc_a_strmoq(_stcb, _strmoq) { \
(_strmoq) = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_strmoq), struct sctp_stream_queue_pending); \
- if ((_strmoq)) { \
+ if ((_strmoq)) { \
memset(_strmoq, 0, sizeof(struct sctp_stream_queue_pending)); \
SCTP_INCR_STRMOQ_COUNT(); \
(_strmoq)->holds_key_ref = 0; \
- } \
+ } \
}
#define sctp_free_a_chunk(_stcb, _chk, _so_locked) { \
@@ -133,22 +141,22 @@ extern struct pr_usrreqs sctp_usrreqs;
sctp_auth_key_release((_stcb), (_chk)->auth_keyid, _so_locked); \
(_chk)->holds_key_ref = 0; \
} \
- if (_stcb) { \
- SCTP_TCB_LOCK_ASSERT((_stcb)); \
- if ((_chk)->whoTo) { \
- sctp_free_remote_addr((_chk)->whoTo); \
- (_chk)->whoTo = NULL; \
- } \
- if (((_stcb)->asoc.free_chunk_cnt > SCTP_BASE_SYSCTL(sctp_asoc_free_resc_limit)) || \
- (SCTP_BASE_INFO(ipi_free_chunks) > SCTP_BASE_SYSCTL(sctp_system_free_resc_limit))) { \
- SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), (_chk)); \
- SCTP_DECR_CHK_COUNT(); \
- } else { \
- TAILQ_INSERT_TAIL(&(_stcb)->asoc.free_chunks, (_chk), sctp_next); \
- (_stcb)->asoc.free_chunk_cnt++; \
- atomic_add_int(&SCTP_BASE_INFO(ipi_free_chunks), 1); \
- } \
- } else { \
+ if (_stcb) { \
+ SCTP_TCB_LOCK_ASSERT((_stcb)); \
+ if ((_chk)->whoTo) { \
+ sctp_free_remote_addr((_chk)->whoTo); \
+ (_chk)->whoTo = NULL; \
+ } \
+ if (((_stcb)->asoc.free_chunk_cnt > SCTP_BASE_SYSCTL(sctp_asoc_free_resc_limit)) || \
+ (SCTP_BASE_INFO(ipi_free_chunks) > SCTP_BASE_SYSCTL(sctp_system_free_resc_limit))) { \
+ SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), (_chk)); \
+ SCTP_DECR_CHK_COUNT(); \
+ } else { \
+ TAILQ_INSERT_TAIL(&(_stcb)->asoc.free_chunks, (_chk), sctp_next); \
+ (_stcb)->asoc.free_chunk_cnt++; \
+ atomic_add_int(&SCTP_BASE_INFO(ipi_free_chunks), 1); \
+ } \
+ } else { \
SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), (_chk)); \
SCTP_DECR_CHK_COUNT(); \
} \
@@ -159,7 +167,7 @@ extern struct pr_usrreqs sctp_usrreqs;
(_chk) = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_chunk), struct sctp_tmit_chunk); \
if ((_chk)) { \
SCTP_INCR_CHK_COUNT(); \
- (_chk)->whoTo = NULL; \
+ (_chk)->whoTo = NULL; \
(_chk)->holds_key_ref = 0; \
} \
} else { \
@@ -167,7 +175,7 @@ extern struct pr_usrreqs sctp_usrreqs;
TAILQ_REMOVE(&(_stcb)->asoc.free_chunks, (_chk), sctp_next); \
atomic_subtract_int(&SCTP_BASE_INFO(ipi_free_chunks), 1); \
(_chk)->holds_key_ref = 0; \
- SCTP_STAT_INCR(sctps_cached_chk); \
+ SCTP_STAT_INCR(sctps_cached_chk); \
(_stcb)->asoc.free_chunk_cnt--; \
} \
}
@@ -178,15 +186,16 @@ extern struct pr_usrreqs sctp_usrreqs;
if (SCTP_DECREMENT_AND_CHECK_REFCOUNT(&(__net)->ref_count)) { \
(void)SCTP_OS_TIMER_STOP(&(__net)->rxt_timer.timer); \
(void)SCTP_OS_TIMER_STOP(&(__net)->pmtu_timer.timer); \
- if ((__net)->ro.ro_rt) { \
+ (void)SCTP_OS_TIMER_STOP(&(__net)->hb_timer.timer); \
+ if ((__net)->ro.ro_rt) { \
RTFREE((__net)->ro.ro_rt); \
(__net)->ro.ro_rt = NULL; \
- } \
+ } \
if ((__net)->src_addr_selected) { \
sctp_free_ifa((__net)->ro._s_addr); \
(__net)->ro._s_addr = NULL; \
} \
- (__net)->src_addr_selected = 0; \
+ (__net)->src_addr_selected = 0; \
(__net)->dest_state &= ~SCTP_ADDR_REACHABLE; \
SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_net), (__net)); \
SCTP_DECR_RADDR_COUNT(); \
@@ -210,7 +219,7 @@ extern struct pr_usrreqs sctp_usrreqs;
atomic_add_int(&(sb)->sb_cc,SCTP_BUF_LEN((m))); \
atomic_add_int(&(sb)->sb_mbcnt, MSIZE); \
if (stcb) { \
- atomic_add_int(&(stcb)->asoc.sb_cc,SCTP_BUF_LEN((m))); \
+ atomic_add_int(&(stcb)->asoc.sb_cc, SCTP_BUF_LEN((m))); \
atomic_add_int(&(stcb)->asoc.my_rwnd_control_len, MSIZE); \
} \
if (SCTP_BUF_TYPE(m) != MT_DATA && SCTP_BUF_TYPE(m) != MT_HEADER && \
@@ -250,12 +259,12 @@ extern struct pr_usrreqs sctp_usrreqs;
} while (0)
#define sctp_flight_size_increase(tp1) do { \
- (tp1)->whoTo->flight_size += (tp1)->book_size; \
+ (tp1)->whoTo->flight_size += (tp1)->book_size; \
} while (0)
#ifdef SCTP_FS_SPEC_LOG
#define sctp_total_flight_decrease(stcb, tp1) do { \
- if (stcb->asoc.fs_index > SCTP_FS_SPEC_LOG_SIZE) \
+ if (stcb->asoc.fs_index > SCTP_FS_SPEC_LOG_SIZE) \
stcb->asoc.fs_index = 0;\
stcb->asoc.fslog[stcb->asoc.fs_index].total_flight = stcb->asoc.total_flight; \
stcb->asoc.fslog[stcb->asoc.fs_index].tsn = tp1->rec.data.TSN_seq; \
@@ -264,7 +273,7 @@ extern struct pr_usrreqs sctp_usrreqs;
stcb->asoc.fslog[stcb->asoc.fs_index].incr = 0; \
stcb->asoc.fslog[stcb->asoc.fs_index].decr = 1; \
stcb->asoc.fs_index++; \
- tp1->window_probe = 0; \
+ tp1->window_probe = 0; \
if (stcb->asoc.total_flight >= tp1->book_size) { \
stcb->asoc.total_flight -= tp1->book_size; \
if (stcb->asoc.total_flight_count > 0) \
@@ -276,7 +285,7 @@ extern struct pr_usrreqs sctp_usrreqs;
} while (0)
#define sctp_total_flight_increase(stcb, tp1) do { \
- if (stcb->asoc.fs_index > SCTP_FS_SPEC_LOG_SIZE) \
+ if (stcb->asoc.fs_index > SCTP_FS_SPEC_LOG_SIZE) \
stcb->asoc.fs_index = 0;\
stcb->asoc.fslog[stcb->asoc.fs_index].total_flight = stcb->asoc.total_flight; \
stcb->asoc.fslog[stcb->asoc.fs_index].tsn = tp1->rec.data.TSN_seq; \
@@ -285,14 +294,14 @@ extern struct pr_usrreqs sctp_usrreqs;
stcb->asoc.fslog[stcb->asoc.fs_index].incr = 1; \
stcb->asoc.fslog[stcb->asoc.fs_index].decr = 0; \
stcb->asoc.fs_index++; \
- (stcb)->asoc.total_flight_count++; \
- (stcb)->asoc.total_flight += (tp1)->book_size; \
+ (stcb)->asoc.total_flight_count++; \
+ (stcb)->asoc.total_flight += (tp1)->book_size; \
} while (0)
#else
#define sctp_total_flight_decrease(stcb, tp1) do { \
- tp1->window_probe = 0; \
+ tp1->window_probe = 0; \
if (stcb->asoc.total_flight >= tp1->book_size) { \
stcb->asoc.total_flight -= tp1->book_size; \
if (stcb->asoc.total_flight_count > 0) \
@@ -304,8 +313,8 @@ extern struct pr_usrreqs sctp_usrreqs;
} while (0)
#define sctp_total_flight_increase(stcb, tp1) do { \
- (stcb)->asoc.total_flight_count++; \
- (stcb)->asoc.total_flight += (tp1)->book_size; \
+ (stcb)->asoc.total_flight_count++; \
+ (stcb)->asoc.total_flight += (tp1)->book_size; \
} while (0)
#endif
@@ -326,19 +335,17 @@ int sctp_ctloutput(struct socket *, struct sockopt *);
#ifdef INET
void sctp_input_with_port(struct mbuf *, int, uint16_t);
-void sctp_input(struct mbuf *, int);
+int sctp_input(struct mbuf **, int *, int);
#endif
void sctp_pathmtu_adjustment(struct sctp_tcb *, uint16_t);
void sctp_drain(void);
void sctp_init(void);
-void sctp_finish(void);
+void
+sctp_notify(struct sctp_inpcb *, struct sctp_tcb *, struct sctp_nets *,
+ uint8_t, uint8_t, uint16_t, uint16_t);
int sctp_flush(struct socket *, int);
int sctp_shutdown(struct socket *);
-void
-sctp_notify(struct sctp_inpcb *, struct ip *ip, struct sctphdr *,
- struct sockaddr *, struct sctp_tcb *,
- struct sctp_nets *);
int
sctp_bindx(struct socket *, int, struct sockaddr_storage *,
int, int, struct proc *);
diff --git a/freebsd/sys/netinet/sctputil.c b/freebsd/sys/netinet/sctputil.c
index 6cd82739..36a9c2ce 100644
--- a/freebsd/sys/netinet/sctputil.c
+++ b/freebsd/sys/netinet/sctputil.c
@@ -54,14 +54,17 @@ __FBSDID("$FreeBSD$");
#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <sys/proc.h>
+#ifdef INET6
+#include <netinet/icmp6.h>
+#endif
#ifndef KTR_SCTP
#define KTR_SCTP KTR_SUBSYS
#endif
-extern struct sctp_cc_functions sctp_cc_functions[];
-extern struct sctp_ss_functions sctp_ss_functions[];
+extern const struct sctp_cc_functions sctp_cc_functions[];
+extern const struct sctp_ss_functions sctp_ss_functions[];
void
sctp_sblog(struct sockbuf *sb, struct sctp_tcb *stcb, int from, int incr)
@@ -219,6 +222,7 @@ sctp_log_fr(uint32_t biggest_tsn, uint32_t biggest_new_tsn, uint32_t tsn, int fr
sctp_clog.x.misc.log4);
}
+#ifdef SCTP_MBUF_LOGGING
void
sctp_log_mb(struct mbuf *m, int from)
{
@@ -245,6 +249,18 @@ sctp_log_mb(struct mbuf *m, int from)
}
void
+sctp_log_mbc(struct mbuf *m, int from)
+{
+ struct mbuf *mat;
+
+ for (mat = m; mat; mat = SCTP_BUF_NEXT(mat)) {
+ sctp_log_mb(mat, from);
+ }
+}
+
+#endif
+
+void
sctp_log_strm_del(struct sctp_queued_to_read *control, struct sctp_queued_to_read *poschk, int from)
{
struct sctp_cwnd_log sctp_clog;
@@ -415,7 +431,8 @@ sctp_log_rwnd_set(uint8_t from, uint32_t peers_rwnd, uint32_t flight_size, uint3
sctp_clog.x.misc.log4);
}
-void
+#ifdef SCTP_MBCNT_LOGGING
+static void
sctp_log_mbcnt(uint8_t from, uint32_t total_oq, uint32_t book, uint32_t total_mbcnt_q, uint32_t mbcnt)
{
struct sctp_cwnd_log sctp_clog;
@@ -433,6 +450,8 @@ sctp_log_mbcnt(uint8_t from, uint32_t total_oq, uint32_t book, uint32_t total_mb
sctp_clog.x.misc.log4);
}
+#endif
+
void
sctp_misc_ints(uint8_t from, uint32_t a, uint32_t b, uint32_t c, uint32_t d)
{
@@ -489,7 +508,7 @@ sctp_wakeup_log(struct sctp_tcb *stcb, uint32_t wake_cnt, int from)
}
void
-sctp_log_block(uint8_t from, struct sctp_association *asoc, int sendlen)
+sctp_log_block(uint8_t from, struct sctp_association *asoc, size_t sendlen)
{
struct sctp_cwnd_log sctp_clog;
@@ -499,7 +518,7 @@ sctp_log_block(uint8_t from, struct sctp_association *asoc, int sendlen)
sctp_clog.x.blk.stream_qcnt = (uint16_t) asoc->stream_queue_cnt;
sctp_clog.x.blk.chunks_on_oque = (uint16_t) asoc->chunks_on_out_queue;
sctp_clog.x.blk.flight_size = (uint16_t) (asoc->total_flight / 1024);
- sctp_clog.x.blk.sndlen = sendlen;
+ sctp_clog.x.blk.sndlen = (uint32_t) sendlen;
SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x",
SCTP_LOG_EVENT_BLOCK,
from,
@@ -879,9 +898,52 @@ sctp_select_a_tag(struct sctp_inpcb *inp, uint16_t lport, uint16_t rport, int ch
return (x);
}
+int32_t
+sctp_map_assoc_state(int kernel_state)
+{ /* Translate the kernel_state word into one of the SCTP_CLOSED / SCTP_COOKIE_* / SCTP_ESTABLISHED / SCTP_SHUTDOWN_* values and return it. */
+ int32_t user_state;
+ /* The WAS_ABORTED and SHUTDOWN_PENDING bits are tested first, in that order; only if neither is set is the SCTP_STATE_MASK-ed base state examined. */
+ if (kernel_state & SCTP_STATE_WAS_ABORTED) {
+ user_state = SCTP_CLOSED;
+ } else if (kernel_state & SCTP_STATE_SHUTDOWN_PENDING) {
+ user_state = SCTP_SHUTDOWN_PENDING;
+ } else {
+ switch (kernel_state & SCTP_STATE_MASK) {
+ case SCTP_STATE_EMPTY:
+ user_state = SCTP_CLOSED;
+ break;
+ case SCTP_STATE_INUSE: /* like EMPTY, reported as SCTP_CLOSED */
+ user_state = SCTP_CLOSED;
+ break;
+ case SCTP_STATE_COOKIE_WAIT:
+ user_state = SCTP_COOKIE_WAIT;
+ break;
+ case SCTP_STATE_COOKIE_ECHOED:
+ user_state = SCTP_COOKIE_ECHOED;
+ break;
+ case SCTP_STATE_OPEN: /* the only case whose result name differs: OPEN -> ESTABLISHED */
+ user_state = SCTP_ESTABLISHED;
+ break;
+ case SCTP_STATE_SHUTDOWN_SENT:
+ user_state = SCTP_SHUTDOWN_SENT;
+ break;
+ case SCTP_STATE_SHUTDOWN_RECEIVED:
+ user_state = SCTP_SHUTDOWN_RECEIVED;
+ break;
+ case SCTP_STATE_SHUTDOWN_ACK_SENT:
+ user_state = SCTP_SHUTDOWN_ACK_SENT;
+ break;
+ default: /* any masked value not listed above is reported as SCTP_CLOSED */
+ user_state = SCTP_CLOSED;
+ break;
+ }
+ }
+ return (user_state);
+}
+
int
sctp_init_asoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
- uint32_t override_tag, uint32_t vrf_id)
+ uint32_t override_tag, uint32_t vrf_id, uint16_t o_strms)
{
struct sctp_association *asoc;
@@ -898,6 +960,11 @@ sctp_init_asoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
*/
int i;
+#if defined(SCTP_DETAILED_STR_STATS)
+ int j;
+
+#endif
+
asoc = &stcb->asoc;
/* init all variables to a known value. */
SCTP_SET_STATE(&stcb->asoc, SCTP_STATE_INUSE);
@@ -906,12 +973,20 @@ sctp_init_asoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
asoc->heart_beat_delay = TICKS_TO_MSEC(inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_HEARTBEAT]);
asoc->cookie_life = inp->sctp_ep.def_cookie_life;
asoc->sctp_cmt_on_off = inp->sctp_cmt_on_off;
- asoc->ecn_allowed = inp->sctp_ecn_enable;
- asoc->sctp_nr_sack_on_off = (uint8_t) SCTP_BASE_SYSCTL(sctp_nr_sack_on_off);
+ asoc->ecn_supported = inp->ecn_supported;
+ asoc->prsctp_supported = inp->prsctp_supported;
+ asoc->idata_supported = inp->idata_supported;
+ asoc->auth_supported = inp->auth_supported;
+ asoc->asconf_supported = inp->asconf_supported;
+ asoc->reconfig_supported = inp->reconfig_supported;
+ asoc->nrsack_supported = inp->nrsack_supported;
+ asoc->pktdrop_supported = inp->pktdrop_supported;
+ asoc->idata_supported = inp->idata_supported;
asoc->sctp_cmt_pf = (uint8_t) 0;
asoc->sctp_frag_point = inp->sctp_frag_point;
asoc->sctp_features = inp->sctp_features;
asoc->default_dscp = inp->sctp_ep.default_dscp;
+ asoc->max_cwnd = inp->max_cwnd;
#ifdef INET6
if (inp->sctp_ep.default_flowlabel) {
asoc->default_flowlabel = inp->sctp_ep.default_flowlabel;
@@ -953,7 +1028,6 @@ sctp_init_asoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
sctp_select_initial_TSN(&inp->sctp_ep);
asoc->asconf_seq_out_acked = asoc->asconf_seq_out - 1;
/* we are optimisitic here */
- asoc->peer_supports_pktdrop = 1;
asoc->peer_supports_nat = 0;
asoc->sent_queue_retran_cnt = 0;
@@ -1005,7 +1079,6 @@ sctp_init_asoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
asoc->minrto = inp->sctp_ep.sctp_minrto;
asoc->maxrto = inp->sctp_ep.sctp_maxrto;
- asoc->locked_on_sending = NULL;
asoc->stream_locked_on = 0;
asoc->ecn_echo_cnt_onq = 0;
asoc->stream_locked = 0;
@@ -1033,7 +1106,7 @@ sctp_init_asoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
* that we request by default.
*/
asoc->strm_realoutsize = asoc->streamoutcnt = asoc->pre_open_streams =
- inp->sctp_ep.pre_open_stream_count;
+ o_strms;
SCTP_MALLOC(asoc->strmout, struct sctp_stream_out *,
asoc->streamoutcnt * sizeof(struct sctp_stream_out),
SCTP_M_STRMO);
@@ -1051,12 +1124,23 @@ sctp_init_asoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
* that were dropped must be notified to the upper layer as
* failed to send.
*/
- asoc->strmout[i].next_sequence_send = 0x0;
+ asoc->strmout[i].next_mid_ordered = 0;
+ asoc->strmout[i].next_mid_unordered = 0;
TAILQ_INIT(&asoc->strmout[i].outqueue);
asoc->strmout[i].chunks_on_queues = 0;
+#if defined(SCTP_DETAILED_STR_STATS)
+ for (j = 0; j < SCTP_PR_SCTP_MAX + 1; j++) {
+ asoc->strmout[i].abandoned_sent[j] = 0;
+ asoc->strmout[i].abandoned_unsent[j] = 0;
+ }
+#else
+ asoc->strmout[i].abandoned_sent[0] = 0;
+ asoc->strmout[i].abandoned_unsent[0] = 0;
+#endif
asoc->strmout[i].stream_no = i;
asoc->strmout[i].last_msg_incomplete = 0;
- asoc->ss_functions.sctp_ss_init_stream(&asoc->strmout[i], NULL);
+ asoc->strmout[i].state = SCTP_STREAM_OPENING;
+ asoc->ss_functions.sctp_ss_init_stream(stcb, &asoc->strmout[i], NULL);
}
asoc->ss_functions.sctp_ss_init(stcb, asoc, 0);
@@ -1086,7 +1170,6 @@ sctp_init_asoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
TAILQ_INIT(&asoc->asconf_send_queue);
TAILQ_INIT(&asoc->send_queue);
TAILQ_INIT(&asoc->sent_queue);
- TAILQ_INIT(&asoc->reasmqueue);
TAILQ_INIT(&asoc->resetHead);
asoc->max_inbound_streams = inp->sctp_ep.max_open_streams_intome;
TAILQ_INIT(&asoc->asconf_queue);
@@ -1109,6 +1192,10 @@ sctp_init_asoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
asoc->timoshutdownack = 0;
(void)SCTP_GETTIME_TIMEVAL(&asoc->start_time);
asoc->discontinuity_time = asoc->start_time;
+ for (i = 0; i < SCTP_PR_SCTP_MAX + 1; i++) {
+ asoc->abandoned_unsent[i] = 0;
+ asoc->abandoned_sent[i] = 0;
+ }
/*
* sa_ignore MEMLEAK {memory is put in the assoc mapping array and
* freed later when the association is freed.
@@ -1195,6 +1282,7 @@ sctp_iterator_work(struct sctp_iterator *it)
SCTP_INP_INFO_RLOCK();
SCTP_ITERATOR_LOCK();
+ sctp_it_ctl.cur_it = it;
if (it->inp) {
SCTP_INP_RLOCK(it->inp);
SCTP_INP_DECR_REF(it->inp);
@@ -1202,6 +1290,7 @@ sctp_iterator_work(struct sctp_iterator *it)
if (it->inp == NULL) {
/* iterator is complete */
done_with_iterator:
+ sctp_it_ctl.cur_it = NULL;
SCTP_ITERATOR_UNLOCK();
SCTP_INP_INFO_RUNLOCK();
if (it->function_atend != NULL) {
@@ -1342,13 +1431,11 @@ sctp_iterator_worker(void)
sctp_it_ctl.iterator_running = 1;
TAILQ_FOREACH_SAFE(it, &sctp_it_ctl.iteratorhead, sctp_nxt_itr, nit) {
- sctp_it_ctl.cur_it = it;
/* now lets work on this one */
TAILQ_REMOVE(&sctp_it_ctl.iteratorhead, it, sctp_nxt_itr);
SCTP_IPI_ITERATOR_WQ_UNLOCK();
CURVNET_SET(it->vn);
sctp_iterator_work(it);
- sctp_it_ctl.cur_it = NULL;
CURVNET_RESTORE();
SCTP_IPI_ITERATOR_WQ_LOCK();
/* sa_ignore FREED_MEMORY */
@@ -1389,7 +1476,9 @@ sctp_handle_addr_wq(void)
if (asc->cnt == 0) {
SCTP_FREE(asc, SCTP_M_ASC_IT);
} else {
- (void)sctp_initiate_iterator(sctp_asconf_iterator_ep,
+ int ret;
+
+ ret = sctp_initiate_iterator(sctp_asconf_iterator_ep,
sctp_asconf_iterator_stcb,
NULL, /* No ep end for boundall */
SCTP_PCB_FLAGS_BOUNDALL,
@@ -1397,6 +1486,23 @@ sctp_handle_addr_wq(void)
SCTP_ASOC_ANY_STATE,
(void *)asc, 0,
sctp_asconf_iterator_end, NULL, 0);
+ if (ret) {
+ SCTP_PRINTF("Failed to initiate iterator for handle_addr_wq\n");
+ /*
+ * If we are stopping, free the work; otherwise put
+ * it back on the addr_wq.
+ */
+ if (SCTP_BASE_VAR(sctp_pcb_initialized) == 0) {
+ sctp_asconf_iterator_end(asc, 0);
+ } else {
+ SCTP_WQ_ADDR_LOCK();
+ LIST_FOREACH(wi, &asc->list_of_work, sctp_nxt_addr) {
+ LIST_INSERT_HEAD(&SCTP_BASE_INFO(addr_wq), wi, sctp_nxt_addr);
+ }
+ SCTP_WQ_ADDR_UNLOCK();
+ SCTP_FREE(asc, SCTP_M_ASC_IT);
+ }
+ }
}
}
@@ -1407,12 +1513,14 @@ sctp_timeout_handler(void *t)
struct sctp_tcb *stcb;
struct sctp_nets *net;
struct sctp_timer *tmr;
+ struct mbuf *op_err;
#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
struct socket *so;
#endif
- int did_output, type;
+ int did_output;
+ int type;
tmr = (struct sctp_timer *)t;
inp = (struct sctp_inpcb *)tmr->ep;
@@ -1451,7 +1559,6 @@ sctp_timeout_handler(void *t)
}
/* if this is an iterator timeout, get the struct and clear inp */
tmr->stopped_from = 0xa003;
- type = tmr->type;
if (inp) {
SCTP_INP_INCR_REF(inp);
if ((inp->sctp_socket == NULL) &&
@@ -1482,8 +1589,9 @@ sctp_timeout_handler(void *t)
return;
}
}
+ type = tmr->type;
tmr->stopped_from = 0xa005;
- SCTPDBG(SCTP_DEBUG_TIMER1, "Timer type %d goes off\n", tmr->type);
+ SCTPDBG(SCTP_DEBUG_TIMER1, "Timer type %d goes off\n", type);
if (!SCTP_OS_TIMER_ACTIVE(&tmr->timer)) {
if (inp) {
SCTP_INP_DECR_REF(inp);
@@ -1499,7 +1607,7 @@ sctp_timeout_handler(void *t)
if (stcb) {
SCTP_TCB_LOCK(stcb);
atomic_add_int(&stcb->asoc.refcnt, -1);
- if ((tmr->type != SCTP_TIMER_TYPE_ASOCKILL) &&
+ if ((type != SCTP_TIMER_TYPE_ASOCKILL) &&
((stcb->asoc.state == 0) ||
(stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED))) {
SCTP_TCB_UNLOCK(stcb);
@@ -1510,8 +1618,8 @@ sctp_timeout_handler(void *t)
return;
}
}
- /* record in stopped what t-o occured */
- tmr->stopped_from = tmr->type;
+ /* record in stopped what t-o occurred */
+ tmr->stopped_from = type;
/* mark as being serviced now */
if (SCTP_OS_TIMER_PENDING(&tmr->timer)) {
@@ -1529,7 +1637,7 @@ sctp_timeout_handler(void *t)
SCTP_OS_TIMER_DEACTIVATE(&tmr->timer);
/* call the handler for the appropriate timer type */
- switch (tmr->type) {
+ switch (type) {
case SCTP_TIMER_TYPE_ZERO_COPY:
if (inp == NULL) {
break;
@@ -1719,7 +1827,9 @@ sctp_timeout_handler(void *t)
break;
}
SCTP_STAT_INCR(sctps_timoshutdownguard);
- sctp_abort_an_association(inp, stcb, NULL, SCTP_SO_NOT_LOCKED);
+ op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code),
+ "Shutdown guard timer expired");
+ sctp_abort_an_association(inp, stcb, op_err, SCTP_SO_NOT_LOCKED);
/* no need to unlock on tcb its gone */
goto out_decr;
@@ -1772,7 +1882,8 @@ sctp_timeout_handler(void *t)
SCTP_STAT_INCR(sctps_timoassockill);
/* Can we free it yet? */
SCTP_INP_DECR_REF(inp);
- sctp_timer_stop(SCTP_TIMER_TYPE_ASOCKILL, inp, stcb, NULL, SCTP_FROM_SCTPUTIL + SCTP_LOC_1);
+ sctp_timer_stop(SCTP_TIMER_TYPE_ASOCKILL, inp, stcb, NULL,
+ SCTP_FROM_SCTPUTIL + SCTP_LOC_1);
#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
so = SCTP_INP_SO(inp);
atomic_add_int(&stcb->asoc.refcnt, 1);
@@ -1781,7 +1892,8 @@ sctp_timeout_handler(void *t)
SCTP_TCB_LOCK(stcb);
atomic_subtract_int(&stcb->asoc.refcnt, 1);
#endif
- (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTPUTIL + SCTP_LOC_2);
+ (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC,
+ SCTP_FROM_SCTPUTIL + SCTP_LOC_2);
#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
SCTP_SOCKET_UNLOCK(so, 1);
#endif
@@ -1801,18 +1913,19 @@ sctp_timeout_handler(void *t)
* killer
*/
SCTP_INP_DECR_REF(inp);
- sctp_timer_stop(SCTP_TIMER_TYPE_INPKILL, inp, NULL, NULL, SCTP_FROM_SCTPUTIL + SCTP_LOC_3);
+ sctp_timer_stop(SCTP_TIMER_TYPE_INPKILL, inp, NULL, NULL,
+ SCTP_FROM_SCTPUTIL + SCTP_LOC_3);
sctp_inpcb_free(inp, SCTP_FREE_SHOULD_USE_ABORT,
SCTP_CALLED_FROM_INPKILL_TIMER);
inp = NULL;
goto out_no_decr;
default:
SCTPDBG(SCTP_DEBUG_TIMER1, "sctp_timeout_handler:unknown timer %d\n",
- tmr->type);
+ type);
break;
}
#ifdef SCTP_AUDITING_ENABLED
- sctp_audit_log(0xF1, (uint8_t) tmr->type);
+ sctp_audit_log(0xF1, (uint8_t) type);
if (inp)
sctp_auditing(5, inp, stcb, net);
#endif
@@ -1835,8 +1948,7 @@ out_decr:
SCTP_INP_DECR_REF(inp);
}
out_no_decr:
- SCTPDBG(SCTP_DEBUG_TIMER1, "Timer now complete (type %d)\n",
- type);
+ SCTPDBG(SCTP_DEBUG_TIMER1, "Timer now complete (type = %d)\n", type);
CURVNET_RESTORE();
}
@@ -1929,7 +2041,7 @@ sctp_timer_start(int t_type, struct sctp_inpcb *inp, struct sctp_tcb *stcb,
* though we use a different timer. We also add the HB timer
* PLUS a random jitter.
*/
- if ((inp == NULL) || (stcb == NULL) || (net == NULL)) {
+ if ((stcb == NULL) || (net == NULL)) {
return;
} else {
uint32_t rndval;
@@ -1984,9 +2096,6 @@ sctp_timer_start(int t_type, struct sctp_inpcb *inp, struct sctp_tcb *stcb,
* nothing needed but the endpoint here ususually about 60
* minutes.
*/
- if (inp == NULL) {
- return;
- }
tmr = &inp->sctp_ep.signature_change;
to_ticks = inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_SIGNATURE];
break;
@@ -2003,9 +2112,6 @@ sctp_timer_start(int t_type, struct sctp_inpcb *inp, struct sctp_tcb *stcb,
* timer since that has stopped and we are in the GONE
* state.
*/
- if (inp == NULL) {
- return;
- }
tmr = &inp->sctp_ep.signature_change;
to_ticks = MSEC_TO_TICKS(SCTP_INP_KILL_TIMEOUT);
break;
@@ -2014,10 +2120,7 @@ sctp_timer_start(int t_type, struct sctp_inpcb *inp, struct sctp_tcb *stcb,
* Here we use the value found in the EP for PMTU ususually
* about 10 minutes.
*/
- if ((stcb == NULL) || (inp == NULL)) {
- return;
- }
- if (net == NULL) {
+ if ((stcb == NULL) || (net == NULL)) {
return;
}
if (net->dest_state & SCTP_ADDR_NO_PMTUD) {
@@ -2043,10 +2146,14 @@ sctp_timer_start(int t_type, struct sctp_inpcb *inp, struct sctp_tcb *stcb,
* Here we use the endpoints shutdown guard timer usually
* about 3 minutes.
*/
- if ((inp == NULL) || (stcb == NULL)) {
+ if (stcb == NULL) {
return;
}
- to_ticks = inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_MAXSHUTDOWN];
+ if (inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_MAXSHUTDOWN] == 0) {
+ to_ticks = 5 * MSEC_TO_TICKS(stcb->asoc.maxrto);
+ } else {
+ to_ticks = inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_MAXSHUTDOWN];
+ }
tmr = &stcb->asoc.shut_guard_timer;
break;
case SCTP_TIMER_TYPE_STRRESET:
@@ -2102,13 +2209,13 @@ sctp_timer_start(int t_type, struct sctp_inpcb *inp, struct sctp_tcb *stcb,
break;
default:
SCTPDBG(SCTP_DEBUG_TIMER1, "%s: Unknown timer type %d\n",
- __FUNCTION__, t_type);
+ __func__, t_type);
return;
break;
}
if ((to_ticks <= 0) || (tmr == NULL)) {
SCTPDBG(SCTP_DEBUG_TIMER1, "%s: %d:software error to_ticks:%d tmr:%p not set ??\n",
- __FUNCTION__, t_type, to_ticks, (void *)tmr);
+ __func__, t_type, to_ticks, (void *)tmr);
return;
}
if (SCTP_OS_TIMER_PENDING(&tmr->timer)) {
@@ -2264,7 +2371,7 @@ sctp_timer_stop(int t_type, struct sctp_inpcb *inp, struct sctp_tcb *stcb,
break;
default:
SCTPDBG(SCTP_DEBUG_TIMER1, "%s: Unknown timer type %d\n",
- __FUNCTION__, t_type);
+ __func__, t_type);
break;
}
if (tmr == NULL) {
@@ -2383,8 +2490,8 @@ sctp_calculate_rto(struct sctp_tcb *stcb,
net->rtt = (uint64_t) 1000000 *(uint64_t) now.tv_sec +
(uint64_t) now.tv_usec;
- /* computer rtt in ms */
- rtt = net->rtt / 1000;
+ /* compute rtt in ms */
+ rtt = (int32_t) (net->rtt / 1000);
if ((asoc->cc_functions.sctp_rtt_calculated) && (rtt_from_sack == SCTP_RTT_FROM_DATA)) {
/*
* Tell the CC module that a new update has just occurred
@@ -2518,58 +2625,44 @@ sctp_get_next_param(struct mbuf *m,
}
-int
+struct mbuf *
sctp_add_pad_tombuf(struct mbuf *m, int padlen)
{
- /*
- * add padlen bytes of 0 filled padding to the end of the mbuf. If
- * padlen is > 3 this routine will fail.
- */
- uint8_t *dp;
- int i;
+ struct mbuf *m_last;
+ caddr_t dp;
if (padlen > 3) {
- SCTP_LTRACE_ERR_RET_PKT(m, NULL, NULL, NULL, SCTP_FROM_SCTPUTIL, ENOBUFS);
- return (ENOBUFS);
+ return (NULL);
}
if (padlen <= M_TRAILINGSPACE(m)) {
/*
* The easy way. We hope the majority of the time we hit
* here :)
*/
- dp = (uint8_t *) (mtod(m, caddr_t)+SCTP_BUF_LEN(m));
- SCTP_BUF_LEN(m) += padlen;
+ m_last = m;
} else {
- /* Hard way we must grow the mbuf */
- struct mbuf *tmp;
-
- tmp = sctp_get_mbuf_for_msg(padlen, 0, M_DONTWAIT, 1, MT_DATA);
- if (tmp == NULL) {
- /* Out of space GAK! we are in big trouble. */
- SCTP_LTRACE_ERR_RET_PKT(m, NULL, NULL, NULL, SCTP_FROM_SCTPUTIL, ENOBUFS);
- return (ENOBUFS);
- }
- /* setup and insert in middle */
- SCTP_BUF_LEN(tmp) = padlen;
- SCTP_BUF_NEXT(tmp) = NULL;
- SCTP_BUF_NEXT(m) = tmp;
- dp = mtod(tmp, uint8_t *);
- }
- /* zero out the pad */
- for (i = 0; i < padlen; i++) {
- *dp = 0;
- dp++;
+ /* Hard way we must grow the mbuf chain */
+ m_last = sctp_get_mbuf_for_msg(padlen, 0, M_NOWAIT, 1, MT_DATA);
+ if (m_last == NULL) {
+ return (NULL);
+ }
+ SCTP_BUF_LEN(m_last) = 0;
+ SCTP_BUF_NEXT(m_last) = NULL;
+ SCTP_BUF_NEXT(m) = m_last;
}
- return (0);
+ dp = mtod(m_last, caddr_t)+SCTP_BUF_LEN(m_last);
+ SCTP_BUF_LEN(m_last) += padlen;
+ memset(dp, 0, padlen);
+ return (m_last);
}
-int
+struct mbuf *
sctp_pad_lastmbuf(struct mbuf *m, int padval, struct mbuf *last_mbuf)
{
/* find the last mbuf in chain and pad it */
struct mbuf *m_at;
- if (last_mbuf) {
+ if (last_mbuf != NULL) {
return (sctp_add_pad_tombuf(last_mbuf, padval));
} else {
for (m_at = m; m_at; m_at = SCTP_BUF_NEXT(m_at)) {
@@ -2578,8 +2671,7 @@ sctp_pad_lastmbuf(struct mbuf *m, int padval, struct mbuf *last_mbuf)
}
}
}
- SCTP_LTRACE_ERR_RET_PKT(m, NULL, NULL, NULL, SCTP_FROM_SCTPUTIL, EFAULT);
- return (EFAULT);
+ return (NULL);
}
static void
@@ -2593,7 +2685,8 @@ sctp_notify_assoc_change(uint16_t state, struct sctp_tcb *stcb,
struct mbuf *m_notify;
struct sctp_assoc_change *sac;
struct sctp_queued_to_read *control;
- size_t notif_len, abort_len;
+ unsigned int notif_len;
+ uint16_t abort_len;
unsigned int i;
#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
@@ -2601,8 +2694,11 @@ sctp_notify_assoc_change(uint16_t state, struct sctp_tcb *stcb,
#endif
+ if (stcb == NULL) {
+ return;
+ }
if (sctp_stcb_is_feature_on(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_RECVASSOCEVNT)) {
- notif_len = sizeof(struct sctp_assoc_change);
+ notif_len = (unsigned int)sizeof(struct sctp_assoc_change);
if (abort != NULL) {
abort_len = ntohs(abort->ch.chunk_length);
} else {
@@ -2613,11 +2709,11 @@ sctp_notify_assoc_change(uint16_t state, struct sctp_tcb *stcb,
} else if ((state == SCTP_COMM_LOST) || (state == SCTP_CANT_STR_ASSOC)) {
notif_len += abort_len;
}
- m_notify = sctp_get_mbuf_for_msg(notif_len, 0, M_DONTWAIT, 1, MT_DATA);
+ m_notify = sctp_get_mbuf_for_msg(notif_len, 0, M_NOWAIT, 1, MT_DATA);
if (m_notify == NULL) {
/* Retry with smaller value. */
- notif_len = sizeof(struct sctp_assoc_change);
- m_notify = sctp_get_mbuf_for_msg(notif_len, 0, M_DONTWAIT, 1, MT_DATA);
+ notif_len = (unsigned int)sizeof(struct sctp_assoc_change);
+ m_notify = sctp_get_mbuf_for_msg(notif_len, 0, M_NOWAIT, 1, MT_DATA);
if (m_notify == NULL) {
goto set_error;
}
@@ -2637,17 +2733,20 @@ sctp_notify_assoc_change(uint16_t state, struct sctp_tcb *stcb,
if (notif_len > sizeof(struct sctp_assoc_change)) {
if ((state == SCTP_COMM_UP) || (state == SCTP_RESTART)) {
i = 0;
- if (stcb->asoc.peer_supports_prsctp) {
+ if (stcb->asoc.prsctp_supported == 1) {
sac->sac_info[i++] = SCTP_ASSOC_SUPPORTS_PR;
}
- if (stcb->asoc.peer_supports_auth) {
+ if (stcb->asoc.auth_supported == 1) {
sac->sac_info[i++] = SCTP_ASSOC_SUPPORTS_AUTH;
}
- if (stcb->asoc.peer_supports_asconf) {
+ if (stcb->asoc.asconf_supported == 1) {
sac->sac_info[i++] = SCTP_ASSOC_SUPPORTS_ASCONF;
}
+ if (stcb->asoc.idata_supported == 1) {
+ sac->sac_info[i++] = SCTP_ASSOC_SUPPORTS_INTERLEAVING;
+ }
sac->sac_info[i++] = SCTP_ASSOC_SUPPORTS_MULTIBUF;
- if (stcb->asoc.peer_supports_strreset) {
+ if (stcb->asoc.reconfig_supported == 1) {
sac->sac_info[i++] = SCTP_ASSOC_SUPPORTS_RE_CONFIG;
}
sac->sac_length += i;
@@ -2732,7 +2831,11 @@ set_error:
static void
sctp_notify_peer_addr_change(struct sctp_tcb *stcb, uint32_t state,
- struct sockaddr *sa, uint32_t error)
+ struct sockaddr *sa, uint32_t error, int so_locked
+#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING)
+ SCTP_UNUSED
+#endif
+)
{
struct mbuf *m_notify;
struct sctp_paddr_change *spc;
@@ -2743,18 +2846,28 @@ sctp_notify_peer_addr_change(struct sctp_tcb *stcb, uint32_t state,
/* event not enabled */
return;
}
- m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_paddr_change), 0, M_DONTWAIT, 1, MT_DATA);
+ m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_paddr_change), 0, M_NOWAIT, 1, MT_DATA);
if (m_notify == NULL)
return;
SCTP_BUF_LEN(m_notify) = 0;
spc = mtod(m_notify, struct sctp_paddr_change *);
+ memset(spc, 0, sizeof(struct sctp_paddr_change));
spc->spc_type = SCTP_PEER_ADDR_CHANGE;
spc->spc_flags = 0;
spc->spc_length = sizeof(struct sctp_paddr_change);
switch (sa->sa_family) {
#ifdef INET
case AF_INET:
+#ifdef INET6
+ if (sctp_is_feature_on(stcb->sctp_ep, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) {
+ in6_sin_2_v4mapsin6((struct sockaddr_in *)sa,
+ (struct sockaddr_in6 *)&spc->spc_aaddr);
+ } else {
+ memcpy(&spc->spc_aaddr, sa, sizeof(struct sockaddr_in));
+ }
+#else
memcpy(&spc->spc_aaddr, sa, sizeof(struct sockaddr_in));
+#endif
break;
#endif
#ifdef INET6
@@ -2805,7 +2918,7 @@ sctp_notify_peer_addr_change(struct sctp_tcb *stcb, uint32_t state,
control,
&stcb->sctp_socket->so_rcv, 1,
SCTP_READ_LOCK_NOT_HELD,
- SCTP_SO_NOT_LOCKED);
+ so_locked);
}
@@ -2821,7 +2934,8 @@ sctp_notify_send_failed(struct sctp_tcb *stcb, uint8_t sent, uint32_t error,
struct sctp_send_failed *ssf;
struct sctp_send_failed_event *ssfe;
struct sctp_queued_to_read *control;
- int length;
+ struct sctp_chunkhdr *chkhdr;
+ int notifhdr_len, chk_len, chkhdr_len, padding_len, payload_len;
if ((stcb == NULL) ||
(sctp_stcb_is_feature_off(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_RECVSENDFAILEVNT) &&
@@ -2830,27 +2944,49 @@ sctp_notify_send_failed(struct sctp_tcb *stcb, uint8_t sent, uint32_t error,
return;
}
if (sctp_stcb_is_feature_on(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_RECVNSENDFAILEVNT)) {
- length = sizeof(struct sctp_send_failed_event);
+ notifhdr_len = sizeof(struct sctp_send_failed_event);
} else {
- length = sizeof(struct sctp_send_failed);
+ notifhdr_len = sizeof(struct sctp_send_failed);
}
- m_notify = sctp_get_mbuf_for_msg(length, 0, M_DONTWAIT, 1, MT_DATA);
+ m_notify = sctp_get_mbuf_for_msg(notifhdr_len, 0, M_NOWAIT, 1, MT_DATA);
if (m_notify == NULL)
/* no space left */
return;
- SCTP_BUF_LEN(m_notify) = 0;
+ SCTP_BUF_LEN(m_notify) = notifhdr_len;
+ if (stcb->asoc.idata_supported) {
+ chkhdr_len = sizeof(struct sctp_idata_chunk);
+ } else {
+ chkhdr_len = sizeof(struct sctp_data_chunk);
+ }
+ /* Use some defaults in case we can't access the chunk header */
+ if (chk->send_size >= chkhdr_len) {
+ payload_len = chk->send_size - chkhdr_len;
+ } else {
+ payload_len = 0;
+ }
+ padding_len = 0;
+ if (chk->data != NULL) {
+ chkhdr = mtod(chk->data, struct sctp_chunkhdr *);
+ if (chkhdr != NULL) {
+ chk_len = ntohs(chkhdr->chunk_length);
+ if ((chk_len >= chkhdr_len) &&
+ (chk->send_size >= chk_len) &&
+ (chk->send_size - chk_len < 4)) {
+ padding_len = chk->send_size - chk_len;
+ payload_len = chk->send_size - chkhdr_len - padding_len;
+ }
+ }
+ }
if (sctp_stcb_is_feature_on(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_RECVNSENDFAILEVNT)) {
ssfe = mtod(m_notify, struct sctp_send_failed_event *);
- memset(ssfe, 0, length);
+ memset(ssfe, 0, notifhdr_len);
ssfe->ssfe_type = SCTP_SEND_FAILED_EVENT;
if (sent) {
ssfe->ssfe_flags = SCTP_DATA_SENT;
} else {
ssfe->ssfe_flags = SCTP_DATA_UNSENT;
}
- length += chk->send_size;
- length -= sizeof(struct sctp_data_chunk);
- ssfe->ssfe_length = length;
+ ssfe->ssfe_length = (uint32_t) (notifhdr_len + payload_len);
ssfe->ssfe_error = error;
/* not exactly what the user sent in, but should be close :) */
ssfe->ssfe_info.snd_sid = chk->rec.data.stream_number;
@@ -2859,39 +2995,33 @@ sctp_notify_send_failed(struct sctp_tcb *stcb, uint8_t sent, uint32_t error,
ssfe->ssfe_info.snd_context = chk->rec.data.context;
ssfe->ssfe_info.snd_assoc_id = sctp_get_associd(stcb);
ssfe->ssfe_assoc_id = sctp_get_associd(stcb);
- SCTP_BUF_LEN(m_notify) = sizeof(struct sctp_send_failed_event);
} else {
ssf = mtod(m_notify, struct sctp_send_failed *);
- memset(ssf, 0, length);
+ memset(ssf, 0, notifhdr_len);
ssf->ssf_type = SCTP_SEND_FAILED;
if (sent) {
ssf->ssf_flags = SCTP_DATA_SENT;
} else {
ssf->ssf_flags = SCTP_DATA_UNSENT;
}
- length += chk->send_size;
- length -= sizeof(struct sctp_data_chunk);
- ssf->ssf_length = length;
+ ssf->ssf_length = (uint32_t) (notifhdr_len + payload_len);
ssf->ssf_error = error;
/* not exactly what the user sent in, but should be close :) */
- bzero(&ssf->ssf_info, sizeof(ssf->ssf_info));
ssf->ssf_info.sinfo_stream = chk->rec.data.stream_number;
- ssf->ssf_info.sinfo_ssn = chk->rec.data.stream_seq;
+ ssf->ssf_info.sinfo_ssn = (uint16_t) chk->rec.data.stream_seq;
ssf->ssf_info.sinfo_flags = chk->rec.data.rcv_flags;
ssf->ssf_info.sinfo_ppid = chk->rec.data.payloadtype;
ssf->ssf_info.sinfo_context = chk->rec.data.context;
ssf->ssf_info.sinfo_assoc_id = sctp_get_associd(stcb);
ssf->ssf_assoc_id = sctp_get_associd(stcb);
- SCTP_BUF_LEN(m_notify) = sizeof(struct sctp_send_failed);
}
- if (chk->data) {
- /*
- * trim off the sctp chunk header(it should be there)
- */
- if (chk->send_size >= sizeof(struct sctp_data_chunk)) {
- m_adj(chk->data, sizeof(struct sctp_data_chunk));
+ if (chk->data != NULL) {
+ /* Trim off the sctp chunk header (it should be there) */
+ if (chk->send_size == chkhdr_len + payload_len + padding_len) {
+ m_adj(chk->data, chkhdr_len);
+ m_adj(chk->data, -padding_len);
sctp_mbuf_crush(chk->data);
- chk->send_size -= sizeof(struct sctp_data_chunk);
+ chk->send_size -= (chkhdr_len + padding_len);
}
}
SCTP_BUF_NEXT(m_notify) = chk->data;
@@ -2936,7 +3066,7 @@ sctp_notify_send_failed2(struct sctp_tcb *stcb, uint32_t error,
struct sctp_send_failed *ssf;
struct sctp_send_failed_event *ssfe;
struct sctp_queued_to_read *control;
- int length;
+ int notifhdr_len;
if ((stcb == NULL) ||
(sctp_stcb_is_feature_off(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_RECVSENDFAILEVNT) &&
@@ -2945,23 +3075,22 @@ sctp_notify_send_failed2(struct sctp_tcb *stcb, uint32_t error,
return;
}
if (sctp_stcb_is_feature_on(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_RECVNSENDFAILEVNT)) {
- length = sizeof(struct sctp_send_failed_event);
+ notifhdr_len = sizeof(struct sctp_send_failed_event);
} else {
- length = sizeof(struct sctp_send_failed);
+ notifhdr_len = sizeof(struct sctp_send_failed);
}
- m_notify = sctp_get_mbuf_for_msg(length, 0, M_DONTWAIT, 1, MT_DATA);
+ m_notify = sctp_get_mbuf_for_msg(notifhdr_len, 0, M_NOWAIT, 1, MT_DATA);
if (m_notify == NULL) {
/* no space left */
return;
}
- SCTP_BUF_LEN(m_notify) = 0;
+ SCTP_BUF_LEN(m_notify) = notifhdr_len;
if (sctp_stcb_is_feature_on(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_RECVNSENDFAILEVNT)) {
ssfe = mtod(m_notify, struct sctp_send_failed_event *);
- memset(ssfe, 0, length);
+ memset(ssfe, 0, notifhdr_len);
ssfe->ssfe_type = SCTP_SEND_FAILED_EVENT;
ssfe->ssfe_flags = SCTP_DATA_UNSENT;
- length += sp->length;
- ssfe->ssfe_length = length;
+ ssfe->ssfe_length = (uint32_t) (notifhdr_len + sp->length);
ssfe->ssfe_error = error;
/* not exactly what the user sent in, but should be close :) */
ssfe->ssfe_info.snd_sid = sp->stream;
@@ -2974,14 +3103,12 @@ sctp_notify_send_failed2(struct sctp_tcb *stcb, uint32_t error,
ssfe->ssfe_info.snd_context = sp->context;
ssfe->ssfe_info.snd_assoc_id = sctp_get_associd(stcb);
ssfe->ssfe_assoc_id = sctp_get_associd(stcb);
- SCTP_BUF_LEN(m_notify) = sizeof(struct sctp_send_failed_event);
} else {
ssf = mtod(m_notify, struct sctp_send_failed *);
- memset(ssf, 0, length);
+ memset(ssf, 0, notifhdr_len);
ssf->ssf_type = SCTP_SEND_FAILED;
ssf->ssf_flags = SCTP_DATA_UNSENT;
- length += sp->length;
- ssf->ssf_length = length;
+ ssf->ssf_length = (uint32_t) (notifhdr_len + sp->length);
ssf->ssf_error = error;
/* not exactly what the user sent in, but should be close :) */
ssf->ssf_info.sinfo_stream = sp->stream;
@@ -2995,7 +3122,6 @@ sctp_notify_send_failed2(struct sctp_tcb *stcb, uint32_t error,
ssf->ssf_info.sinfo_context = sp->context;
ssf->ssf_info.sinfo_assoc_id = sctp_get_associd(stcb);
ssf->ssf_assoc_id = sctp_get_associd(stcb);
- SCTP_BUF_LEN(m_notify) = sizeof(struct sctp_send_failed);
}
SCTP_BUF_NEXT(m_notify) = sp->data;
@@ -3039,7 +3165,7 @@ sctp_notify_adaptation_layer(struct sctp_tcb *stcb)
/* event not enabled */
return;
}
- m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_adaption_event), 0, M_DONTWAIT, 1, MT_DATA);
+ m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_adaption_event), 0, M_NOWAIT, 1, MT_DATA);
if (m_notify == NULL)
/* no space left */
return;
@@ -3095,7 +3221,7 @@ sctp_notify_partial_delivery_indication(struct sctp_tcb *stcb, uint32_t error,
if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_CANT_READ) {
return;
}
- m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_pdapi_event), 0, M_DONTWAIT, 1, MT_DATA);
+ m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_pdapi_event), 0, M_NOWAIT, 1, MT_DATA);
if (m_notify == NULL)
/* no space left */
return;
@@ -3206,7 +3332,7 @@ sctp_notify_shutdown_event(struct sctp_tcb *stcb)
/* event not enabled */
return;
}
- m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_shutdown_event), 0, M_DONTWAIT, 1, MT_DATA);
+ m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_shutdown_event), 0, M_NOWAIT, 1, MT_DATA);
if (m_notify == NULL)
/* no space left */
return;
@@ -3255,7 +3381,7 @@ sctp_notify_sender_dry_event(struct sctp_tcb *stcb,
/* event not enabled */
return;
}
- m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_sender_dry_event), 0, M_DONTWAIT, 1, MT_DATA);
+ m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_sender_dry_event), 0, M_NOWAIT, 1, MT_DATA);
if (m_notify == NULL) {
/* no space left */
return;
@@ -3307,7 +3433,7 @@ sctp_notify_stream_reset_add(struct sctp_tcb *stcb, uint16_t numberin, uint16_t
return;
}
stcb->asoc.peer_req_out = 0;
- m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_stream_change_event), 0, M_DONTWAIT, 1, MT_DATA);
+ m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_stream_change_event), 0, M_NOWAIT, 1, MT_DATA);
if (m_notify == NULL)
/* no space left */
return;
@@ -3357,7 +3483,7 @@ sctp_notify_stream_reset_tsn(struct sctp_tcb *stcb, uint32_t sending_tsn, uint32
/* event not enabled */
return;
}
- m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_assoc_reset_event), 0, M_DONTWAIT, 1, MT_DATA);
+ m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_assoc_reset_event), 0, M_NOWAIT, 1, MT_DATA);
if (m_notify == NULL)
/* no space left */
return;
@@ -3411,7 +3537,7 @@ sctp_notify_stream_reset(struct sctp_tcb *stcb,
/* event not enabled */
return;
}
- m_notify = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_DONTWAIT, 1, MT_DATA);
+ m_notify = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_DATA);
if (m_notify == NULL)
/* no space left */
return;
@@ -3467,7 +3593,8 @@ sctp_notify_remote_error(struct sctp_tcb *stcb, uint16_t error, struct sctp_erro
struct mbuf *m_notify;
struct sctp_remote_error *sre;
struct sctp_queued_to_read *control;
- size_t notif_len, chunk_len;
+ unsigned int notif_len;
+ uint16_t chunk_len;
if ((stcb == NULL) ||
sctp_stcb_is_feature_off(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_RECVPEERERR)) {
@@ -3478,18 +3605,19 @@ sctp_notify_remote_error(struct sctp_tcb *stcb, uint16_t error, struct sctp_erro
} else {
chunk_len = 0;
}
- notif_len = sizeof(struct sctp_remote_error) + chunk_len;
- m_notify = sctp_get_mbuf_for_msg(notif_len, 0, M_DONTWAIT, 1, MT_DATA);
+ notif_len = (unsigned int)(sizeof(struct sctp_remote_error) + chunk_len);
+ m_notify = sctp_get_mbuf_for_msg(notif_len, 0, M_NOWAIT, 1, MT_DATA);
if (m_notify == NULL) {
/* Retry with smaller value. */
- notif_len = sizeof(struct sctp_remote_error);
- m_notify = sctp_get_mbuf_for_msg(notif_len, 0, M_DONTWAIT, 1, MT_DATA);
+ notif_len = (unsigned int)sizeof(struct sctp_remote_error);
+ m_notify = sctp_get_mbuf_for_msg(notif_len, 0, M_NOWAIT, 1, MT_DATA);
if (m_notify == NULL) {
return;
}
}
SCTP_BUF_NEXT(m_notify) = NULL;
sre = mtod(m_notify, struct sctp_remote_error *);
+ memset(sre, 0, notif_len);
sre->sre_type = SCTP_REMOTE_ERROR;
sre->sre_flags = 0;
sre->sre_length = sizeof(struct sctp_remote_error);
@@ -3554,7 +3682,7 @@ sctp_ulp_notify(uint32_t notification, struct sctp_tcb *stcb,
if (stcb->asoc.adaptation_needed && (stcb->asoc.adaptation_sent == 0)) {
sctp_notify_adaptation_layer(stcb);
}
- if (stcb->asoc.peer_supports_auth == 0) {
+ if (stcb->asoc.auth_supported == 0) {
sctp_ulp_notify(SCTP_NOTIFY_NO_PEER_AUTH, stcb, 0,
NULL, so_locked);
}
@@ -3568,7 +3696,7 @@ sctp_ulp_notify(uint32_t notification, struct sctp_tcb *stcb,
net = (struct sctp_nets *)data;
sctp_notify_peer_addr_change(stcb, SCTP_ADDR_UNREACHABLE,
- (struct sockaddr *)&net->ro._l_addr, error);
+ (struct sockaddr *)&net->ro._l_addr, error, so_locked);
break;
}
case SCTP_NOTIFY_INTERFACE_UP:
@@ -3577,7 +3705,7 @@ sctp_ulp_notify(uint32_t notification, struct sctp_tcb *stcb,
net = (struct sctp_nets *)data;
sctp_notify_peer_addr_change(stcb, SCTP_ADDR_AVAILABLE,
- (struct sockaddr *)&net->ro._l_addr, error);
+ (struct sockaddr *)&net->ro._l_addr, error, so_locked);
break;
}
case SCTP_NOTIFY_INTERFACE_CONFIRMED:
@@ -3586,7 +3714,7 @@ sctp_ulp_notify(uint32_t notification, struct sctp_tcb *stcb,
net = (struct sctp_nets *)data;
sctp_notify_peer_addr_change(stcb, SCTP_ADDR_CONFIRMED,
- (struct sockaddr *)&net->ro._l_addr, error);
+ (struct sockaddr *)&net->ro._l_addr, error, so_locked);
break;
}
case SCTP_NOTIFY_SPECIAL_SP_FAIL:
@@ -3628,7 +3756,7 @@ sctp_ulp_notify(uint32_t notification, struct sctp_tcb *stcb,
break;
case SCTP_NOTIFY_ASSOC_RESTART:
sctp_notify_assoc_change(SCTP_RESTART, stcb, error, NULL, 0, so_locked);
- if (stcb->asoc.peer_supports_auth == 0) {
+ if (stcb->asoc.auth_supported == 0) {
sctp_ulp_notify(SCTP_NOTIFY_NO_PEER_AUTH, stcb, 0,
NULL, so_locked);
}
@@ -3657,15 +3785,15 @@ sctp_ulp_notify(uint32_t notification, struct sctp_tcb *stcb,
break;
case SCTP_NOTIFY_ASCONF_ADD_IP:
sctp_notify_peer_addr_change(stcb, SCTP_ADDR_ADDED, data,
- error);
+ error, so_locked);
break;
case SCTP_NOTIFY_ASCONF_DELETE_IP:
sctp_notify_peer_addr_change(stcb, SCTP_ADDR_REMOVED, data,
- error);
+ error, so_locked);
break;
case SCTP_NOTIFY_ASCONF_SET_PRIMARY:
sctp_notify_peer_addr_change(stcb, SCTP_ADDR_MADE_PRIM, data,
- error);
+ error, so_locked);
break;
case SCTP_NOTIFY_PEER_SHUTDOWN:
sctp_notify_shutdown_event(stcb);
@@ -3693,7 +3821,7 @@ sctp_ulp_notify(uint32_t notification, struct sctp_tcb *stcb,
break;
default:
SCTPDBG(SCTP_DEBUG_UTIL1, "%s: unknown notification %xh (%u)\n",
- __FUNCTION__, notification, notification);
+ __func__, notification, notification);
break;
} /* end switch */
}
@@ -3780,10 +3908,10 @@ sctp_report_all_outbound(struct sctp_tcb *stcb, uint16_t error, int holds_lock,
/* For each stream */
outs = &asoc->strmout[i];
/* clean up any sends there */
- asoc->locked_on_sending = NULL;
TAILQ_FOREACH_SAFE(sp, &outs->outqueue, next, nsp) {
- asoc->stream_queue_cnt--;
+ atomic_subtract_int(&asoc->stream_queue_cnt, 1);
TAILQ_REMOVE(&outs->outqueue, sp, next);
+ stcb->asoc.ss_functions.sctp_ss_remove_from_stream(stcb, asoc, outs, sp, holds_lock);
sctp_free_spbufspace(stcb, asoc, sp);
if (sp->data) {
sctp_ulp_notify(SCTP_NOTIFY_SPECIAL_SP_FAIL, stcb,
@@ -3845,7 +3973,7 @@ sctp_abort_association(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
struct mbuf *m, int iphlen,
struct sockaddr *src, struct sockaddr *dst,
struct sctphdr *sh, struct mbuf *op_err,
- uint8_t use_mflowid, uint32_t mflowid,
+ uint8_t mflowtype, uint32_t mflowid,
uint32_t vrf_id, uint16_t port)
{
uint32_t vtag;
@@ -3865,7 +3993,7 @@ sctp_abort_association(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
stcb->asoc.state |= SCTP_STATE_WAS_ABORTED;
}
sctp_send_abort(m, iphlen, src, dst, sh, vtag, op_err,
- use_mflowid, mflowid,
+ mflowtype, mflowid, inp->fibnum,
vrf_id, port);
if (stcb != NULL) {
/* Ok, now lets free it */
@@ -3882,7 +4010,8 @@ sctp_abort_association(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
(SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) {
SCTP_STAT_DECR_GAUGE32(sctps_currestab);
}
- (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTPUTIL + SCTP_LOC_4);
+ (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC,
+ SCTP_FROM_SCTPUTIL + SCTP_LOC_4);
#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
SCTP_SOCKET_UNLOCK(so, 1);
#endif
@@ -4006,7 +4135,8 @@ sctp_abort_an_association(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
atomic_subtract_int(&stcb->asoc.refcnt, 1);
}
#endif
- (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTPUTIL + SCTP_LOC_5);
+ (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC,
+ SCTP_FROM_SCTPUTIL + SCTP_LOC_5);
#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
if (!so_locked) {
SCTP_SOCKET_UNLOCK(so, 1);
@@ -4019,7 +4149,7 @@ sctp_handle_ootb(struct mbuf *m, int iphlen, int offset,
struct sockaddr *src, struct sockaddr *dst,
struct sctphdr *sh, struct sctp_inpcb *inp,
struct mbuf *cause,
- uint8_t use_mflowid, uint32_t mflowid,
+ uint8_t mflowtype, uint32_t mflowid, uint16_t fibnum,
uint32_t vrf_id, uint16_t port)
{
struct sctp_chunkhdr *ch, chunk_buf;
@@ -4061,7 +4191,7 @@ sctp_handle_ootb(struct mbuf *m, int iphlen, int offset,
return;
case SCTP_SHUTDOWN_ACK:
sctp_send_shutdown_complete2(src, dst, sh,
- use_mflowid, mflowid,
+ mflowtype, mflowid, fibnum,
vrf_id, port);
return;
default:
@@ -4075,7 +4205,7 @@ sctp_handle_ootb(struct mbuf *m, int iphlen, int offset,
((SCTP_BASE_SYSCTL(sctp_blackhole) == 1) &&
(contains_init_chunk == 0))) {
sctp_send_abort(m, iphlen, src, dst, sh, 0, cause,
- use_mflowid, mflowid,
+ mflowtype, mflowid, fibnum,
vrf_id, port);
}
}
@@ -4342,6 +4472,49 @@ sctp_pull_off_control_to_new_inp(struct sctp_inpcb *old_inp,
}
void
+sctp_wakeup_the_read_socket(struct sctp_inpcb *inp,
+ struct sctp_tcb *stcb,
+ int so_locked
+#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING)
+ SCTP_UNUSED
+#endif
+)
+{
+ if ((inp != NULL) && (inp->sctp_socket != NULL)) {
+ if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_ZERO_COPY_ACTIVE)) {
+ SCTP_ZERO_COPY_EVENT(inp, inp->sctp_socket);
+ } else {
+#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ struct socket *so;
+
+ so = SCTP_INP_SO(inp);
+ if (!so_locked) {
+ if (stcb) {
+ atomic_add_int(&stcb->asoc.refcnt, 1);
+ SCTP_TCB_UNLOCK(stcb);
+ }
+ SCTP_SOCKET_LOCK(so, 1);
+ if (stcb) {
+ SCTP_TCB_LOCK(stcb);
+ atomic_subtract_int(&stcb->asoc.refcnt, 1);
+ }
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) {
+ SCTP_SOCKET_UNLOCK(so, 1);
+ return;
+ }
+ }
+#endif
+ sctp_sorwakeup(inp, inp->sctp_socket);
+#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ if (!so_locked) {
+ SCTP_SOCKET_UNLOCK(so, 1);
+ }
+#endif
+ }
+ }
+}
+
+void
sctp_add_to_readq(struct sctp_inpcb *inp,
struct sctp_tcb *stcb,
struct sctp_queued_to_read *control,
@@ -4376,7 +4549,7 @@ sctp_add_to_readq(struct sctp_inpcb *inp,
sctp_m_freem(control->data);
control->data = NULL;
}
- SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_readq), control);
+ sctp_free_a_readq(stcb, control);
if (inp_read_lock_held == 0)
SCTP_INP_READ_UNLOCK(inp);
return;
@@ -4422,7 +4595,7 @@ sctp_add_to_readq(struct sctp_inpcb *inp,
} else {
/* Everything got collapsed out?? */
sctp_free_remote_addr(control->whoFrom);
- SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_readq), control);
+ sctp_free_a_readq(stcb, control);
if (inp_read_lock_held == 0)
SCTP_INP_READ_UNLOCK(inp);
return;
@@ -4431,195 +4604,14 @@ sctp_add_to_readq(struct sctp_inpcb *inp,
control->end_added = 1;
}
TAILQ_INSERT_TAIL(&inp->read_queue, control, next);
+ control->on_read_q = 1;
if (inp_read_lock_held == 0)
SCTP_INP_READ_UNLOCK(inp);
if (inp && inp->sctp_socket) {
- if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_ZERO_COPY_ACTIVE)) {
- SCTP_ZERO_COPY_EVENT(inp, inp->sctp_socket);
- } else {
-#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
- struct socket *so;
-
- so = SCTP_INP_SO(inp);
- if (!so_locked) {
- if (stcb) {
- atomic_add_int(&stcb->asoc.refcnt, 1);
- SCTP_TCB_UNLOCK(stcb);
- }
- SCTP_SOCKET_LOCK(so, 1);
- if (stcb) {
- SCTP_TCB_LOCK(stcb);
- atomic_subtract_int(&stcb->asoc.refcnt, 1);
- }
- if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) {
- SCTP_SOCKET_UNLOCK(so, 1);
- return;
- }
- }
-#endif
- sctp_sorwakeup(inp, inp->sctp_socket);
-#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
- if (!so_locked) {
- SCTP_SOCKET_UNLOCK(so, 1);
- }
-#endif
- }
+ sctp_wakeup_the_read_socket(inp, stcb, so_locked);
}
}
-
-int
-sctp_append_to_readq(struct sctp_inpcb *inp,
- struct sctp_tcb *stcb,
- struct sctp_queued_to_read *control,
- struct mbuf *m,
- int end,
- int ctls_cumack,
- struct sockbuf *sb)
-{
- /*
- * A partial delivery API event is underway. OR we are appending on
- * the reassembly queue.
- *
- * If PDAPI this means we need to add m to the end of the data.
- * Increase the length in the control AND increment the sb_cc.
- * Otherwise sb is NULL and all we need to do is put it at the end
- * of the mbuf chain.
- */
- int len = 0;
- struct mbuf *mm, *tail = NULL, *prev = NULL;
-
- if (inp) {
- SCTP_INP_READ_LOCK(inp);
- }
- if (control == NULL) {
-get_out:
- if (inp) {
- SCTP_INP_READ_UNLOCK(inp);
- }
- return (-1);
- }
- if (inp && (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_CANT_READ)) {
- SCTP_INP_READ_UNLOCK(inp);
- return (0);
- }
- if (control->end_added) {
- /* huh this one is complete? */
- goto get_out;
- }
- mm = m;
- if (mm == NULL) {
- goto get_out;
- }
- while (mm) {
- if (SCTP_BUF_LEN(mm) == 0) {
- /* Skip mbufs with NO lenght */
- if (prev == NULL) {
- /* First one */
- m = sctp_m_free(mm);
- mm = m;
- } else {
- SCTP_BUF_NEXT(prev) = sctp_m_free(mm);
- mm = SCTP_BUF_NEXT(prev);
- }
- continue;
- }
- prev = mm;
- len += SCTP_BUF_LEN(mm);
- if (sb) {
- if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) {
- sctp_sblog(sb, control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBALLOC, SCTP_BUF_LEN(mm));
- }
- sctp_sballoc(stcb, sb, mm);
- if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) {
- sctp_sblog(sb, control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBRESULT, 0);
- }
- }
- mm = SCTP_BUF_NEXT(mm);
- }
- if (prev) {
- tail = prev;
- } else {
- /* Really there should always be a prev */
- if (m == NULL) {
- /* Huh nothing left? */
-#ifdef INVARIANTS
- panic("Nothing left to add?");
-#else
- goto get_out;
-#endif
- }
- tail = m;
- }
- if (control->tail_mbuf) {
- /* append */
- SCTP_BUF_NEXT(control->tail_mbuf) = m;
- control->tail_mbuf = tail;
- } else {
- /* nothing there */
-#ifdef INVARIANTS
- if (control->data != NULL) {
- panic("This should NOT happen");
- }
-#endif
- control->data = m;
- control->tail_mbuf = tail;
- }
- atomic_add_int(&control->length, len);
- if (end) {
- /* message is complete */
- if (stcb && (control == stcb->asoc.control_pdapi)) {
- stcb->asoc.control_pdapi = NULL;
- }
- control->held_length = 0;
- control->end_added = 1;
- }
- if (stcb == NULL) {
- control->do_not_ref_stcb = 1;
- }
- /*
- * When we are appending in partial delivery, the cum-ack is used
- * for the actual pd-api highest tsn on this mbuf. The true cum-ack
- * is populated in the outbound sinfo structure from the true cumack
- * if the association exists...
- */
- control->sinfo_tsn = control->sinfo_cumtsn = ctls_cumack;
- if (inp) {
- SCTP_INP_READ_UNLOCK(inp);
- }
- if (inp && inp->sctp_socket) {
- if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_ZERO_COPY_ACTIVE)) {
- SCTP_ZERO_COPY_EVENT(inp, inp->sctp_socket);
- } else {
-#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
- struct socket *so;
-
- so = SCTP_INP_SO(inp);
- if (stcb) {
- atomic_add_int(&stcb->asoc.refcnt, 1);
- SCTP_TCB_UNLOCK(stcb);
- }
- SCTP_SOCKET_LOCK(so, 1);
- if (stcb) {
- SCTP_TCB_LOCK(stcb);
- atomic_subtract_int(&stcb->asoc.refcnt, 1);
- }
- if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) {
- SCTP_SOCKET_UNLOCK(so, 1);
- return (0);
- }
-#endif
- sctp_sorwakeup(inp, inp->sctp_socket);
-#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
- SCTP_SOCKET_UNLOCK(so, 1);
-#endif
- }
- }
- return (0);
-}
-
-
-
/*************HOLD THIS COMMENT FOR PATCH FILE OF
*************ALTERNATE ROUTING CODE
*/
@@ -4633,19 +4625,23 @@ sctp_generate_cause(uint16_t code, char *info)
{
struct mbuf *m;
struct sctp_gen_error_cause *cause;
- size_t info_len, len;
+ size_t info_len;
+ uint16_t len;
if ((code == 0) || (info == NULL)) {
return (NULL);
}
info_len = strlen(info);
- len = sizeof(struct sctp_paramhdr) + info_len;
+ if (info_len > (SCTP_MAX_CAUSE_LENGTH - sizeof(struct sctp_paramhdr))) {
+ return (NULL);
+ }
+ len = (uint16_t) (sizeof(struct sctp_paramhdr) + info_len);
m = sctp_get_mbuf_for_msg(len, 0, M_NOWAIT, 1, MT_DATA);
if (m != NULL) {
SCTP_BUF_LEN(m) = len;
cause = mtod(m, struct sctp_gen_error_cause *);
cause->code = htons(code);
- cause->length = htons((uint16_t) len);
+ cause->length = htons(len);
memcpy(cause->info, info, info_len);
}
return (m);
@@ -4656,15 +4652,15 @@ sctp_generate_no_user_data_cause(uint32_t tsn)
{
struct mbuf *m;
struct sctp_error_no_user_data *no_user_data_cause;
- size_t len;
+ uint16_t len;
- len = sizeof(struct sctp_error_no_user_data);
+ len = (uint16_t) sizeof(struct sctp_error_no_user_data);
m = sctp_get_mbuf_for_msg(len, 0, M_NOWAIT, 1, MT_DATA);
if (m != NULL) {
SCTP_BUF_LEN(m) = len;
no_user_data_cause = mtod(m, struct sctp_error_no_user_data *);
no_user_data_cause->cause.code = htons(SCTP_CAUSE_NO_USER_DATA);
- no_user_data_cause->cause.length = htons((uint16_t) len);
+ no_user_data_cause->cause.length = htons(len);
no_user_data_cause->tsn = tsn; /* tsn is passed in as NBO */
}
return (m);
@@ -4724,6 +4720,21 @@ sctp_release_pr_sctp_chunk(struct sctp_tcb *stcb, struct sctp_tmit_chunk *tp1,
stream = tp1->rec.data.stream_number;
seq = tp1->rec.data.stream_seq;
+ if (sent || !(tp1->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG)) {
+ stcb->asoc.abandoned_sent[0]++;
+ stcb->asoc.abandoned_sent[PR_SCTP_POLICY(tp1->flags)]++;
+ stcb->asoc.strmout[stream].abandoned_sent[0]++;
+#if defined(SCTP_DETAILED_STR_STATS)
+ stcb->asoc.strmout[stream].abandoned_sent[PR_SCTP_POLICY(tp1->flags)]++;
+#endif
+ } else {
+ stcb->asoc.abandoned_unsent[0]++;
+ stcb->asoc.abandoned_unsent[PR_SCTP_POLICY(tp1->flags)]++;
+ stcb->asoc.strmout[stream].abandoned_unsent[0]++;
+#if defined(SCTP_DETAILED_STR_STATS)
+ stcb->asoc.strmout[stream].abandoned_unsent[PR_SCTP_POLICY(tp1->flags)]++;
+#endif
+ }
do {
ret_sz += tp1->book_size;
if (tp1->data != NULL) {
@@ -4840,28 +4851,48 @@ sctp_release_pr_sctp_chunk(struct sctp_tcb *stcb, struct sctp_tmit_chunk *tp1,
goto oh_well;
}
memset(chk, 0, sizeof(*chk));
- chk->rec.data.rcv_flags = SCTP_DATA_LAST_FRAG;
+ chk->rec.data.rcv_flags = 0;
chk->sent = SCTP_FORWARD_TSN_SKIP;
chk->asoc = &stcb->asoc;
- chk->rec.data.stream_seq = strq->next_sequence_send;
+ if (stcb->asoc.idata_supported == 0) {
+ if (sp->sinfo_flags & SCTP_UNORDERED) {
+ chk->rec.data.stream_seq = 0;
+ } else {
+ chk->rec.data.stream_seq = strq->next_mid_ordered;
+ }
+ } else {
+ if (sp->sinfo_flags & SCTP_UNORDERED) {
+ chk->rec.data.stream_seq = strq->next_mid_unordered;
+ } else {
+ chk->rec.data.stream_seq = strq->next_mid_ordered;
+ }
+ }
chk->rec.data.stream_number = sp->stream;
chk->rec.data.payloadtype = sp->ppid;
chk->rec.data.context = sp->context;
chk->flags = sp->act_flags;
- if (sp->net)
- chk->whoTo = sp->net;
- else
- chk->whoTo = stcb->asoc.primary_destination;
- atomic_add_int(&chk->whoTo->ref_count, 1);
+ chk->whoTo = NULL;
chk->rec.data.TSN_seq = atomic_fetchadd_int(&stcb->asoc.sending_seq, 1);
- stcb->asoc.pr_sctp_cnt++;
+ strq->chunks_on_queues++;
TAILQ_INSERT_TAIL(&stcb->asoc.sent_queue, chk, sctp_next);
stcb->asoc.sent_queue_cnt++;
stcb->asoc.pr_sctp_cnt++;
+ }
+ chk->rec.data.rcv_flags |= SCTP_DATA_LAST_FRAG;
+ if (sp->sinfo_flags & SCTP_UNORDERED) {
+ chk->rec.data.rcv_flags |= SCTP_DATA_UNORDERED;
+ }
+ if (stcb->asoc.idata_supported == 0) {
+ if ((sp->sinfo_flags & SCTP_UNORDERED) == 0) {
+ strq->next_mid_ordered++;
+ }
} else {
- chk->rec.data.rcv_flags |= SCTP_DATA_LAST_FRAG;
+ if (sp->sinfo_flags & SCTP_UNORDERED) {
+ strq->next_mid_unordered++;
+ } else {
+ strq->next_mid_ordered++;
+ }
}
- strq->next_sequence_send++;
oh_well:
if (sp->data) {
/*
@@ -5009,7 +5040,6 @@ sctp_find_ifa_by_addr(struct sockaddr *addr, uint32_t vrf_id, int holds_lock)
vrf = sctp_find_vrf(vrf_id);
if (vrf == NULL) {
-stage_right:
if (holds_lock == 0)
SCTP_IPI_ADDR_RUNLOCK();
return (NULL);
@@ -5029,15 +5059,6 @@ stage_right:
return (NULL);
}
LIST_FOREACH(sctp_ifap, hash_head, next_bucket) {
- if (sctp_ifap == NULL) {
-#ifdef INVARIANTS
- panic("Huh LIST_FOREACH corrupt");
- goto stage_right;
-#else
- SCTP_PRINTF("LIST corrupt of sctp_ifap's?\n");
- goto stage_right;
-#endif
- }
if (addr->sa_family != sctp_ifap->address.sa.sa_family)
continue;
#ifdef INET
@@ -5136,7 +5157,8 @@ sctp_user_rcvd(struct sctp_tcb *stcb, uint32_t * freed_so_far, int hold_rlock,
sctp_chunk_output(stcb->sctp_ep, stcb,
SCTP_OUTPUT_FROM_USR_RCVD, SCTP_SO_LOCKED);
/* make sure no timer is running */
- sctp_timer_stop(SCTP_TIMER_TYPE_RECV, stcb->sctp_ep, stcb, NULL, SCTP_FROM_SCTPUTIL + SCTP_LOC_6);
+ sctp_timer_stop(SCTP_TIMER_TYPE_RECV, stcb->sctp_ep, stcb, NULL,
+ SCTP_FROM_SCTPUTIL + SCTP_LOC_6);
SCTP_TCB_UNLOCK(stcb);
} else {
/* Update how much we have pending */
@@ -5187,7 +5209,7 @@ sctp_sorecvmsg(struct socket *so,
uint32_t rwnd_req = 0;
int hold_sblock = 0;
int hold_rlock = 0;
- int slen = 0;
+ ssize_t slen = 0;
uint32_t held_length = 0;
int sockbuf_lock = 0;
@@ -5232,11 +5254,11 @@ sctp_sorecvmsg(struct socket *so,
in_eeor_mode = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXPLICIT_EOR);
if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_RECV_RWND_LOGGING_ENABLE) {
sctp_misc_ints(SCTP_SORECV_ENTER,
- rwnd_req, in_eeor_mode, so->so_rcv.sb_cc, uio->uio_resid);
+ rwnd_req, in_eeor_mode, so->so_rcv.sb_cc, (uint32_t) uio->uio_resid);
}
if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_RECV_RWND_LOGGING_ENABLE) {
sctp_misc_ints(SCTP_SORECV_ENTERPL,
- rwnd_req, block_allowed, so->so_rcv.sb_cc, uio->uio_resid);
+ rwnd_req, block_allowed, so->so_rcv.sb_cc, (uint32_t) uio->uio_resid);
}
error = sblock(&so->so_rcv, (block_allowed ? SBL_WAIT : 0));
if (error) {
@@ -5269,8 +5291,14 @@ restart_nosblocks:
}
}
}
- if ((so->so_rcv.sb_cc <= held_length) && block_allowed) {
- /* we need to wait for data */
+ if (so->so_rcv.sb_cc <= held_length) {
+ if (so->so_error) {
+ error = so->so_error;
+ if ((in_flags & MSG_PEEK) == 0) {
+ so->so_error = 0;
+ }
+ goto out;
+ }
if ((so->so_rcv.sb_cc == 0) &&
((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
(inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL))) {
@@ -5301,51 +5329,18 @@ restart_nosblocks:
goto out;
}
}
- error = sbwait(&so->so_rcv);
- if (error) {
- goto out;
- }
- held_length = 0;
- goto restart_nosblocks;
- } else if (so->so_rcv.sb_cc == 0) {
- if (so->so_error) {
- error = so->so_error;
- if ((in_flags & MSG_PEEK) == 0)
- so->so_error = 0;
- } else {
- if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
- (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) {
- if ((inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) == 0) {
- /*
- * For active open side clear flags
- * for re-use passive open is
- * blocked by connect.
- */
- if (inp->sctp_flags & SCTP_PCB_FLAGS_WAS_ABORTED) {
- /*
- * You were aborted, passive
- * side always hits here
- */
- SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, ECONNRESET);
- error = ECONNRESET;
- }
- so->so_state &= ~(SS_ISCONNECTING |
- SS_ISDISCONNECTING |
- SS_ISCONFIRMING |
- SS_ISCONNECTED);
- if (error == 0) {
- if ((inp->sctp_flags & SCTP_PCB_FLAGS_WAS_CONNECTED) == 0) {
- SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, ENOTCONN);
- error = ENOTCONN;
- }
- }
- goto out;
- }
+ if (block_allowed) {
+ error = sbwait(&so->so_rcv);
+ if (error) {
+ goto out;
}
+ held_length = 0;
+ goto restart_nosblocks;
+ } else {
SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EWOULDBLOCK);
error = EWOULDBLOCK;
+ goto out;
}
- goto out;
}
if (hold_sblock == 1) {
SOCKBUF_UNLOCK(&so->so_rcv);
@@ -5438,6 +5433,12 @@ restart_nosblocks:
sctp_m_free(control->aux_data);
control->aux_data = NULL;
}
+#ifdef INVARIANTS
+ if (control->on_strm_q) {
+ panic("About to free ctl:%p so:%p and its in %d",
+ control, so, control->on_strm_q);
+ }
+#endif
sctp_free_remote_addr(control->whoFrom);
sctp_free_a_readq(stcb, control);
if (hold_rlock) {
@@ -5498,20 +5499,16 @@ restart_nosblocks:
}
/* Clear the held length since there is something to read */
control->held_length = 0;
- if (hold_rlock) {
- SCTP_INP_READ_UNLOCK(inp);
- hold_rlock = 0;
- }
found_one:
/*
* If we reach here, control has a some data for us to read off.
* Note that stcb COULD be NULL.
*/
- control->some_taken++;
- if (hold_sblock) {
- SOCKBUF_UNLOCK(&so->so_rcv);
- hold_sblock = 0;
+ if (hold_rlock == 0) {
+ hold_rlock = 1;
+ SCTP_INP_READ_LOCK(inp);
}
+ control->some_taken++;
stcb = control->stcb;
if (stcb) {
if ((control->do_not_ref_stcb == 0) &&
@@ -5556,8 +5553,16 @@ found_one:
stcb->asoc.strmin[control->sinfo_stream].delivery_started = 1;
}
/* First lets get off the sinfo and sockaddr info */
- if ((sinfo) && filling_sinfo) {
- memcpy(sinfo, control, sizeof(struct sctp_nonpad_sndrcvinfo));
+ if ((sinfo != NULL) && (filling_sinfo != 0)) {
+ sinfo->sinfo_stream = control->sinfo_stream;
+ sinfo->sinfo_ssn = (uint16_t) control->sinfo_ssn;
+ sinfo->sinfo_flags = control->sinfo_flags;
+ sinfo->sinfo_ppid = control->sinfo_ppid;
+ sinfo->sinfo_context = control->sinfo_context;
+ sinfo->sinfo_timetolive = control->sinfo_timetolive;
+ sinfo->sinfo_tsn = control->sinfo_tsn;
+ sinfo->sinfo_cumtsn = control->sinfo_cumtsn;
+ sinfo->sinfo_assoc_id = control->sinfo_assoc_id;
nxt = TAILQ_NEXT(control, next);
if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXT_RCVINFO) ||
sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVNXTINFO)) {
@@ -5566,20 +5571,20 @@ found_one:
s_extra = (struct sctp_extrcvinfo *)sinfo;
if ((nxt) &&
(nxt->length)) {
- s_extra->sreinfo_next_flags = SCTP_NEXT_MSG_AVAIL;
+ s_extra->serinfo_next_flags = SCTP_NEXT_MSG_AVAIL;
if (nxt->sinfo_flags & SCTP_UNORDERED) {
- s_extra->sreinfo_next_flags |= SCTP_NEXT_MSG_IS_UNORDERED;
+ s_extra->serinfo_next_flags |= SCTP_NEXT_MSG_IS_UNORDERED;
}
if (nxt->spec_flags & M_NOTIFICATION) {
- s_extra->sreinfo_next_flags |= SCTP_NEXT_MSG_IS_NOTIFICATION;
+ s_extra->serinfo_next_flags |= SCTP_NEXT_MSG_IS_NOTIFICATION;
}
- s_extra->sreinfo_next_aid = nxt->sinfo_assoc_id;
- s_extra->sreinfo_next_length = nxt->length;
- s_extra->sreinfo_next_ppid = nxt->sinfo_ppid;
- s_extra->sreinfo_next_stream = nxt->sinfo_stream;
+ s_extra->serinfo_next_aid = nxt->sinfo_assoc_id;
+ s_extra->serinfo_next_length = nxt->length;
+ s_extra->serinfo_next_ppid = nxt->sinfo_ppid;
+ s_extra->serinfo_next_stream = nxt->sinfo_stream;
if (nxt->tail_mbuf != NULL) {
if (nxt->end_added) {
- s_extra->sreinfo_next_flags |= SCTP_NEXT_MSG_ISCOMPLETE;
+ s_extra->serinfo_next_flags |= SCTP_NEXT_MSG_ISCOMPLETE;
}
}
} else {
@@ -5590,11 +5595,11 @@ found_one:
* :-D
*/
nxt = NULL;
- s_extra->sreinfo_next_flags = SCTP_NO_NEXT_MSG;
- s_extra->sreinfo_next_aid = 0;
- s_extra->sreinfo_next_length = 0;
- s_extra->sreinfo_next_ppid = 0;
- s_extra->sreinfo_next_stream = 0;
+ s_extra->serinfo_next_flags = SCTP_NO_NEXT_MSG;
+ s_extra->serinfo_next_aid = 0;
+ s_extra->serinfo_next_length = 0;
+ s_extra->serinfo_next_ppid = 0;
+ s_extra->serinfo_next_stream = 0;
}
}
/*
@@ -5631,43 +5636,43 @@ found_one:
entry->flgs = control->sinfo_flags;
}
#endif
- if (fromlen && from) {
- cp_len = min((size_t)fromlen, (size_t)control->whoFrom->ro._l_addr.sa.sa_len);
+ if ((fromlen > 0) && (from != NULL)) {
+ union sctp_sockstore store;
+ size_t len;
+
switch (control->whoFrom->ro._l_addr.sa.sa_family) {
#ifdef INET6
case AF_INET6:
- ((struct sockaddr_in6 *)from)->sin6_port = control->port_from;
+ len = sizeof(struct sockaddr_in6);
+ store.sin6 = control->whoFrom->ro._l_addr.sin6;
+ store.sin6.sin6_port = control->port_from;
break;
#endif
#ifdef INET
case AF_INET:
- ((struct sockaddr_in *)from)->sin_port = control->port_from;
+#ifdef INET6
+ if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) {
+ len = sizeof(struct sockaddr_in6);
+ in6_sin_2_v4mapsin6(&control->whoFrom->ro._l_addr.sin,
+ &store.sin6);
+ store.sin6.sin6_port = control->port_from;
+ } else {
+ len = sizeof(struct sockaddr_in);
+ store.sin = control->whoFrom->ro._l_addr.sin;
+ store.sin.sin_port = control->port_from;
+ }
+#else
+ len = sizeof(struct sockaddr_in);
+ store.sin = control->whoFrom->ro._l_addr.sin;
+ store.sin.sin_port = control->port_from;
+#endif
break;
#endif
default:
+ len = 0;
break;
}
- memcpy(from, &control->whoFrom->ro._l_addr, cp_len);
-
-#if defined(INET) && defined(INET6)
- if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) &&
- (from->sa_family == AF_INET) &&
- ((size_t)fromlen >= sizeof(struct sockaddr_in6))) {
- struct sockaddr_in *sin;
- struct sockaddr_in6 sin6;
-
- sin = (struct sockaddr_in *)from;
- bzero(&sin6, sizeof(sin6));
- sin6.sin6_family = AF_INET6;
- sin6.sin6_len = sizeof(struct sockaddr_in6);
- sin6.sin6_addr.s6_addr32[2] = htonl(0xffff);
- bcopy(&sin->sin_addr,
- &sin6.sin6_addr.s6_addr32[3],
- sizeof(sin6.sin6_addr.s6_addr32[3]));
- sin6.sin6_port = sin->sin_port;
- memcpy(from, &sin6, sizeof(struct sockaddr_in6));
- }
-#endif
+ memcpy(from, &store, min((size_t)fromlen, len));
#ifdef INET6
{
struct sockaddr_in6 lsa6, *from6;
@@ -5677,6 +5682,14 @@ found_one:
}
#endif
}
+ if (hold_rlock) {
+ SCTP_INP_READ_UNLOCK(inp);
+ hold_rlock = 0;
+ }
+ if (hold_sblock) {
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ hold_sblock = 0;
+ }
/* now copy out what data we can */
if (mp == NULL) {
/* copy out each mbuf in the chain up to length */
@@ -5708,15 +5721,8 @@ get_more_data:
/* error we are out of here */
goto release;
}
- if ((SCTP_BUF_NEXT(m) == NULL) &&
- (cp_len >= SCTP_BUF_LEN(m)) &&
- ((control->end_added == 0) ||
- (control->end_added &&
- (TAILQ_NEXT(control, next) == NULL)))
- ) {
- SCTP_INP_READ_LOCK(inp);
- hold_rlock = 1;
- }
+ SCTP_INP_READ_LOCK(inp);
+ hold_rlock = 1;
if (cp_len == SCTP_BUF_LEN(m)) {
if ((SCTP_BUF_NEXT(m) == NULL) &&
(control->end_added)) {
@@ -5834,19 +5840,9 @@ get_more_data:
#endif
}
done_with_control:
- if (TAILQ_NEXT(control, next) == NULL) {
- /*
- * If we don't have a next we need a
- * lock, if there is a next
- * interrupt is filling ahead of us
- * and we don't need a lock to
- * remove this guy (which is the
- * head of the queue).
- */
- if (hold_rlock == 0) {
- SCTP_INP_READ_LOCK(inp);
- hold_rlock = 1;
- }
+ if (hold_rlock == 0) {
+ SCTP_INP_READ_LOCK(inp);
+ hold_rlock = 1;
}
TAILQ_REMOVE(&inp->read_queue, control, next);
/* Add back any hiddend data */
@@ -5862,6 +5858,12 @@ get_more_data:
no_rcv_needed = control->do_not_ref_stcb;
sctp_free_remote_addr(control->whoFrom);
control->data = NULL;
+#ifdef INVARIANTS
+ if (control->on_strm_q) {
+ panic("About to free ctl:%p so:%p and its in %d",
+ control, so, control->on_strm_q);
+ }
+#endif
sctp_free_a_readq(stcb, control);
control = NULL;
if ((freed_so_far >= rwnd_req) &&
@@ -6077,7 +6079,7 @@ out:
struct sctp_extrcvinfo *s_extra;
s_extra = (struct sctp_extrcvinfo *)sinfo;
- s_extra->sreinfo_next_flags = SCTP_NO_NEXT_MSG;
+ s_extra->serinfo_next_flags = SCTP_NO_NEXT_MSG;
}
if (hold_rlock == 1) {
SCTP_INP_READ_UNLOCK(inp);
@@ -6103,21 +6105,21 @@ out:
goto stage_left;
#endif
}
- atomic_add_int(&stcb->asoc.refcnt, -1);
/* Save the value back for next time */
stcb->freed_by_sorcv_sincelast = freed_so_far;
+ atomic_add_int(&stcb->asoc.refcnt, -1);
}
if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_RECV_RWND_LOGGING_ENABLE) {
if (stcb) {
sctp_misc_ints(SCTP_SORECV_DONE,
freed_so_far,
- ((uio) ? (slen - uio->uio_resid) : slen),
+ (uint32_t) ((uio) ? (slen - uio->uio_resid) : slen),
stcb->asoc.my_rwnd,
so->so_rcv.sb_cc);
} else {
sctp_misc_ints(SCTP_SORECV_DONE,
freed_so_far,
- ((uio) ? (slen - uio->uio_resid) : slen),
+ (uint32_t) ((uio) ? (slen - uio->uio_resid) : slen),
0,
so->so_rcv.sb_cc);
}
@@ -6135,9 +6137,7 @@ struct mbuf *
sctp_m_free(struct mbuf *m)
{
if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) {
- if (SCTP_BUF_IS_EXTENDED(m)) {
- sctp_log_mb(m, SCTP_MBUF_IFREE);
- }
+ sctp_log_mb(m, SCTP_MBUF_IFREE);
}
return (m_free(m));
}
@@ -6296,14 +6296,18 @@ sctp_connectx_helper_add(struct sctp_tcb *stcb, struct sockaddr *addr,
(sin->sin_addr.s_addr == INADDR_BROADCAST) ||
IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
- (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_7);
+ (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC,
+ SCTP_FROM_SCTPUTIL + SCTP_LOC_7);
*error = EINVAL;
goto out_now;
}
- if (sctp_add_remote_addr(stcb, sa, NULL, SCTP_DONOT_SETSCOPE, SCTP_ADDR_IS_CONFIRMED)) {
+ if (sctp_add_remote_addr(stcb, sa, NULL, stcb->asoc.port,
+ SCTP_DONOT_SETSCOPE,
+ SCTP_ADDR_IS_CONFIRMED)) {
/* assoc gone no un-lock */
SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTPUTIL, ENOBUFS);
- (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_7);
+ (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC,
+ SCTP_FROM_SCTPUTIL + SCTP_LOC_8);
*error = ENOBUFS;
goto out_now;
}
@@ -6317,14 +6321,18 @@ sctp_connectx_helper_add(struct sctp_tcb *stcb, struct sockaddr *addr,
if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) ||
IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
- (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_8);
+ (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC,
+ SCTP_FROM_SCTPUTIL + SCTP_LOC_9);
*error = EINVAL;
goto out_now;
}
- if (sctp_add_remote_addr(stcb, sa, NULL, SCTP_DONOT_SETSCOPE, SCTP_ADDR_IS_CONFIRMED)) {
+ if (sctp_add_remote_addr(stcb, sa, NULL, stcb->asoc.port,
+ SCTP_DONOT_SETSCOPE,
+ SCTP_ADDR_IS_CONFIRMED)) {
/* assoc gone no un-lock */
SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTPUTIL, ENOBUFS);
- (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_8);
+ (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC,
+ SCTP_FROM_SCTPUTIL + SCTP_LOC_10);
*error = ENOBUFS;
goto out_now;
}
@@ -6342,30 +6350,30 @@ out_now:
struct sctp_tcb *
sctp_connectx_helper_find(struct sctp_inpcb *inp, struct sockaddr *addr,
- int *totaddr, int *num_v4, int *num_v6, int *error,
- int limit, int *bad_addr)
+ unsigned int *totaddr,
+ unsigned int *num_v4, unsigned int *num_v6, int *error,
+ unsigned int limit, int *bad_addr)
{
struct sockaddr *sa;
struct sctp_tcb *stcb = NULL;
- size_t incr, at, i;
+ unsigned int incr, at, i;
- at = incr = 0;
+ at = 0;
sa = addr;
-
*error = *num_v6 = *num_v4 = 0;
/* account and validate addresses */
- for (i = 0; i < (size_t)*totaddr; i++) {
+ for (i = 0; i < *totaddr; i++) {
switch (sa->sa_family) {
#ifdef INET
case AF_INET:
- (*num_v4) += 1;
- incr = sizeof(struct sockaddr_in);
+ incr = (unsigned int)sizeof(struct sockaddr_in);
if (sa->sa_len != incr) {
SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
*error = EINVAL;
*bad_addr = 1;
return (NULL);
}
+ (*num_v4) += 1;
break;
#endif
#ifdef INET6
@@ -6381,14 +6389,14 @@ sctp_connectx_helper_find(struct sctp_inpcb *inp, struct sockaddr *addr,
*bad_addr = 1;
return (NULL);
}
- (*num_v6) += 1;
- incr = sizeof(struct sockaddr_in6);
+ incr = (unsigned int)sizeof(struct sockaddr_in6);
if (sa->sa_len != incr) {
SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
*error = EINVAL;
*bad_addr = 1;
return (NULL);
}
+ (*num_v6) += 1;
break;
}
#endif
@@ -6397,7 +6405,7 @@ sctp_connectx_helper_find(struct sctp_inpcb *inp, struct sockaddr *addr,
/* we are done */
break;
}
- if (i == (size_t)*totaddr) {
+ if (i == *totaddr) {
break;
}
SCTP_INP_INCR_REF(inp);
@@ -6408,7 +6416,7 @@ sctp_connectx_helper_find(struct sctp_inpcb *inp, struct sockaddr *addr,
} else {
SCTP_INP_DECR_REF(inp);
}
- if ((at + incr) > (size_t)limit) {
+ if ((at + incr) > limit) {
*totaddr = i;
break;
}
@@ -6428,7 +6436,7 @@ sctp_bindx_add_address(struct socket *so, struct sctp_inpcb *inp,
{
struct sockaddr *addr_touse;
-#ifdef INET6
+#if defined(INET) && defined(INET6)
struct sockaddr_in sin;
#endif
@@ -6442,8 +6450,10 @@ sctp_bindx_add_address(struct socket *so, struct sctp_inpcb *inp,
addr_touse = sa;
#ifdef INET6
if (sa->sa_family == AF_INET6) {
+#ifdef INET
struct sockaddr_in6 *sin6;
+#endif
if (sa->sa_len != sizeof(struct sockaddr_in6)) {
SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
*error = EINVAL;
@@ -6455,6 +6465,7 @@ sctp_bindx_add_address(struct socket *so, struct sctp_inpcb *inp,
*error = EINVAL;
return;
}
+#ifdef INET
sin6 = (struct sockaddr_in6 *)addr_touse;
if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) &&
@@ -6467,6 +6478,7 @@ sctp_bindx_add_address(struct socket *so, struct sctp_inpcb *inp,
in6_sin6_2_sin(&sin, sin6);
addr_touse = (struct sockaddr *)&sin;
}
+#endif
}
#endif
#ifdef INET
@@ -6556,7 +6568,7 @@ sctp_bindx_delete_address(struct sctp_inpcb *inp,
{
struct sockaddr *addr_touse;
-#ifdef INET6
+#if defined(INET) && defined(INET6)
struct sockaddr_in sin;
#endif
@@ -6570,8 +6582,11 @@ sctp_bindx_delete_address(struct sctp_inpcb *inp,
addr_touse = sa;
#ifdef INET6
if (sa->sa_family == AF_INET6) {
+#ifdef INET
struct sockaddr_in6 *sin6;
+#endif
+
if (sa->sa_len != sizeof(struct sockaddr_in6)) {
SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
*error = EINVAL;
@@ -6583,6 +6598,7 @@ sctp_bindx_delete_address(struct sctp_inpcb *inp,
*error = EINVAL;
return;
}
+#ifdef INET
sin6 = (struct sockaddr_in6 *)addr_touse;
if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) &&
@@ -6595,6 +6611,7 @@ sctp_bindx_delete_address(struct sctp_inpcb *inp,
in6_sin6_2_sin(&sin, sin6);
addr_touse = (struct sockaddr *)&sin;
}
+#endif
}
#endif
#ifdef INET
@@ -6688,7 +6705,7 @@ sctp_local_addr_count(struct sctp_tcb *stcb)
if (ipv4_addr_legal) {
struct sockaddr_in *sin;
- sin = (struct sockaddr_in *)&sctp_ifa->address.sa;
+ sin = &sctp_ifa->address.sin;
if (sin->sin_addr.s_addr == 0) {
/*
* skip unspecified
@@ -6716,7 +6733,7 @@ sctp_local_addr_count(struct sctp_tcb *stcb)
if (ipv6_addr_legal) {
struct sockaddr_in6 *sin6;
- sin6 = (struct sockaddr_in6 *)&sctp_ifa->address.sa;
+ sin6 = &sctp_ifa->address.sin6;
if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
continue;
}
@@ -6810,7 +6827,8 @@ sctp_log_trace(uint32_t subsys, const char *str SCTP_UNUSED, uint32_t a, uint32_
#endif
static void
-sctp_recv_udp_tunneled_packet(struct mbuf *m, int off, struct inpcb *ignored)
+sctp_recv_udp_tunneled_packet(struct mbuf *m, int off, struct inpcb *inp,
+ const struct sockaddr *sa SCTP_UNUSED, void *ctx SCTP_UNUSED)
{
struct ip *iph;
@@ -6834,7 +6852,7 @@ sctp_recv_udp_tunneled_packet(struct mbuf *m, int off, struct inpcb *ignored)
* Split out the mbuf chain. Leave the IP header in m, place the
* rest in the sp.
*/
- sp = m_split(m, off, M_DONTWAIT);
+ sp = m_split(m, off, M_NOWAIT);
if (sp == NULL) {
/* Gak, drop packet, we can't do a split */
goto out;
@@ -6857,11 +6875,23 @@ sctp_recv_udp_tunneled_packet(struct mbuf *m, int off, struct inpcb *ignored)
for (last = m; last->m_next; last = last->m_next);
last->m_next = sp;
m->m_pkthdr.len += sp->m_pkthdr.len;
+ /*
+ * The CSUM_DATA_VALID flag indicates that the HW checked the UDP
+ * checksum and it was valid. Since CSUM_DATA_VALID ==
+ * CSUM_SCTP_VALID this would imply that the HW also verified the
+ * SCTP checksum. Therefore, clear the bit.
+ */
+ SCTPDBG(SCTP_DEBUG_CRCOFFLOAD,
+ "sctp_recv_udp_tunneled_packet(): Packet of length %d received on %s with csum_flags 0x%b.\n",
+ m->m_pkthdr.len,
+ if_name(m->m_pkthdr.rcvif),
+ (int)m->m_pkthdr.csum_flags, CSUM_BITS);
+ m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
iph = mtod(m, struct ip *);
switch (iph->ip_v) {
#ifdef INET
case IPVERSION:
- iph->ip_len -= sizeof(struct udphdr);
+ iph->ip_len = htons(ntohs(iph->ip_len) - sizeof(struct udphdr));
sctp_input_with_port(m, off, port);
break;
#endif
@@ -6881,6 +6911,259 @@ out:
m_freem(m);
}
+#ifdef INET
+static void
+sctp_recv_icmp_tunneled_packet(int cmd, struct sockaddr *sa, void *vip, void *ctx SCTP_UNUSED)
+{
+ struct ip *outer_ip, *inner_ip;
+ struct sctphdr *sh;
+ struct icmp *icmp;
+ struct udphdr *udp;
+ struct sctp_inpcb *inp;
+ struct sctp_tcb *stcb;
+ struct sctp_nets *net;
+ struct sctp_init_chunk *ch;
+ struct sockaddr_in src, dst;
+ uint8_t type, code;
+
+ inner_ip = (struct ip *)vip;
+ icmp = (struct icmp *)((caddr_t)inner_ip -
+ (sizeof(struct icmp) - sizeof(struct ip)));
+ outer_ip = (struct ip *)((caddr_t)icmp - sizeof(struct ip));
+ if (ntohs(outer_ip->ip_len) <
+ sizeof(struct ip) + 8 + (inner_ip->ip_hl << 2) + sizeof(struct udphdr) + 8) {
+ return;
+ }
+ udp = (struct udphdr *)((caddr_t)inner_ip + (inner_ip->ip_hl << 2));
+ sh = (struct sctphdr *)(udp + 1);
+ memset(&src, 0, sizeof(struct sockaddr_in));
+ src.sin_family = AF_INET;
+ src.sin_len = sizeof(struct sockaddr_in);
+ src.sin_port = sh->src_port;
+ src.sin_addr = inner_ip->ip_src;
+ memset(&dst, 0, sizeof(struct sockaddr_in));
+ dst.sin_family = AF_INET;
+ dst.sin_len = sizeof(struct sockaddr_in);
+ dst.sin_port = sh->dest_port;
+ dst.sin_addr = inner_ip->ip_dst;
+ /*
+ * 'dst' holds the dest of the packet that failed to be sent. 'src'
+ * holds our local endpoint address. Thus we reverse the dst and the
+ * src in the lookup.
+ */
+ inp = NULL;
+ net = NULL;
+ stcb = sctp_findassociation_addr_sa((struct sockaddr *)&dst,
+ (struct sockaddr *)&src,
+ &inp, &net, 1,
+ SCTP_DEFAULT_VRFID);
+ if ((stcb != NULL) &&
+ (net != NULL) &&
+ (inp != NULL)) {
+ /* Check the UDP port numbers */
+ if ((udp->uh_dport != net->port) ||
+ (udp->uh_sport != htons(SCTP_BASE_SYSCTL(sctp_udp_tunneling_port)))) {
+ SCTP_TCB_UNLOCK(stcb);
+ return;
+ }
+ /* Check the verification tag */
+ if (ntohl(sh->v_tag) != 0) {
+ /*
+ * This must be the verification tag used for
+ * sending out packets. We don't consider packets
+ * reflecting the verification tag.
+ */
+ if (ntohl(sh->v_tag) != stcb->asoc.peer_vtag) {
+ SCTP_TCB_UNLOCK(stcb);
+ return;
+ }
+ } else {
+ if (ntohs(outer_ip->ip_len) >=
+ sizeof(struct ip) +
+ 8 + (inner_ip->ip_hl << 2) + 8 + 20) {
+ /*
+ * In this case we can check if we got an
+ * INIT chunk and if the initiate tag
+ * matches.
+ */
+ ch = (struct sctp_init_chunk *)(sh + 1);
+ if ((ch->ch.chunk_type != SCTP_INITIATION) ||
+ (ntohl(ch->init.initiate_tag) != stcb->asoc.my_vtag)) {
+ SCTP_TCB_UNLOCK(stcb);
+ return;
+ }
+ } else {
+ SCTP_TCB_UNLOCK(stcb);
+ return;
+ }
+ }
+ type = icmp->icmp_type;
+ code = icmp->icmp_code;
+ if ((type == ICMP_UNREACH) &&
+ (code == ICMP_UNREACH_PORT)) {
+ code = ICMP_UNREACH_PROTOCOL;
+ }
+ sctp_notify(inp, stcb, net, type, code,
+ ntohs(inner_ip->ip_len),
+ ntohs(icmp->icmp_nextmtu));
+ } else {
+ if ((stcb == NULL) && (inp != NULL)) {
+ /* reduce ref-count */
+ SCTP_INP_WLOCK(inp);
+ SCTP_INP_DECR_REF(inp);
+ SCTP_INP_WUNLOCK(inp);
+ }
+ if (stcb) {
+ SCTP_TCB_UNLOCK(stcb);
+ }
+ }
+ return;
+}
+
+#endif
+
+#ifdef INET6
+static void
+sctp_recv_icmp6_tunneled_packet(int cmd, struct sockaddr *sa, void *d, void *ctx SCTP_UNUSED)
+{
+ struct ip6ctlparam *ip6cp;
+ struct sctp_inpcb *inp;
+ struct sctp_tcb *stcb;
+ struct sctp_nets *net;
+ struct sctphdr sh;
+ struct udphdr udp;
+ struct sockaddr_in6 src, dst;
+ uint8_t type, code;
+
+ ip6cp = (struct ip6ctlparam *)d;
+ /*
+ * XXX: We assume that if ip6c_m is non-NULL, ip6c_ip6 and ip6c_off are valid.
+ */
+ if (ip6cp->ip6c_m == NULL) {
+ return;
+ }
+ /*
+ * Check if we can safely examine the ports and the verification tag
+ * of the SCTP common header.
+ */
+ if (ip6cp->ip6c_m->m_pkthdr.len <
+ ip6cp->ip6c_off + sizeof(struct udphdr) + offsetof(struct sctphdr, checksum)) {
+ return;
+ }
+ /* Copy out the UDP header. */
+ memset(&udp, 0, sizeof(struct udphdr));
+ m_copydata(ip6cp->ip6c_m,
+ ip6cp->ip6c_off,
+ sizeof(struct udphdr),
+ (caddr_t)&udp);
+ /* Copy out the port numbers and the verification tag. */
+ memset(&sh, 0, sizeof(struct sctphdr));
+ m_copydata(ip6cp->ip6c_m,
+ ip6cp->ip6c_off + sizeof(struct udphdr),
+ sizeof(uint16_t) + sizeof(uint16_t) + sizeof(uint32_t),
+ (caddr_t)&sh);
+ memset(&src, 0, sizeof(struct sockaddr_in6));
+ src.sin6_family = AF_INET6;
+ src.sin6_len = sizeof(struct sockaddr_in6);
+ src.sin6_port = sh.src_port;
+ src.sin6_addr = ip6cp->ip6c_ip6->ip6_src;
+ if (in6_setscope(&src.sin6_addr, ip6cp->ip6c_m->m_pkthdr.rcvif, NULL) != 0) {
+ return;
+ }
+ memset(&dst, 0, sizeof(struct sockaddr_in6));
+ dst.sin6_family = AF_INET6;
+ dst.sin6_len = sizeof(struct sockaddr_in6);
+ dst.sin6_port = sh.dest_port;
+ dst.sin6_addr = ip6cp->ip6c_ip6->ip6_dst;
+ if (in6_setscope(&dst.sin6_addr, ip6cp->ip6c_m->m_pkthdr.rcvif, NULL) != 0) {
+ return;
+ }
+ inp = NULL;
+ net = NULL;
+ stcb = sctp_findassociation_addr_sa((struct sockaddr *)&dst,
+ (struct sockaddr *)&src,
+ &inp, &net, 1, SCTP_DEFAULT_VRFID);
+ if ((stcb != NULL) &&
+ (net != NULL) &&
+ (inp != NULL)) {
+ /* Check the UDP port numbers */
+ if ((udp.uh_dport != net->port) ||
+ (udp.uh_sport != htons(SCTP_BASE_SYSCTL(sctp_udp_tunneling_port)))) {
+ SCTP_TCB_UNLOCK(stcb);
+ return;
+ }
+ /* Check the verification tag */
+ if (ntohl(sh.v_tag) != 0) {
+ /*
+ * This must be the verification tag used for
+ * sending out packets. We don't consider packets
+ * reflecting the verification tag.
+ */
+ if (ntohl(sh.v_tag) != stcb->asoc.peer_vtag) {
+ SCTP_TCB_UNLOCK(stcb);
+ return;
+ }
+ } else {
+ if (ip6cp->ip6c_m->m_pkthdr.len >=
+ ip6cp->ip6c_off + sizeof(struct udphdr) +
+ sizeof(struct sctphdr) +
+ sizeof(struct sctp_chunkhdr) +
+ offsetof(struct sctp_init, a_rwnd)) {
+ /*
+ * In this case we can check if we got an
+ * INIT chunk and if the initiate tag
+ * matches.
+ */
+ uint32_t initiate_tag;
+ uint8_t chunk_type;
+
+ m_copydata(ip6cp->ip6c_m,
+ ip6cp->ip6c_off +
+ sizeof(struct udphdr) +
+ sizeof(struct sctphdr),
+ sizeof(uint8_t),
+ (caddr_t)&chunk_type);
+ m_copydata(ip6cp->ip6c_m,
+ ip6cp->ip6c_off +
+ sizeof(struct udphdr) +
+ sizeof(struct sctphdr) +
+ sizeof(struct sctp_chunkhdr),
+ sizeof(uint32_t),
+ (caddr_t)&initiate_tag);
+ if ((chunk_type != SCTP_INITIATION) ||
+ (ntohl(initiate_tag) != stcb->asoc.my_vtag)) {
+ SCTP_TCB_UNLOCK(stcb);
+ return;
+ }
+ } else {
+ SCTP_TCB_UNLOCK(stcb);
+ return;
+ }
+ }
+ type = ip6cp->ip6c_icmp6->icmp6_type;
+ code = ip6cp->ip6c_icmp6->icmp6_code;
+ if ((type == ICMP6_DST_UNREACH) &&
+ (code == ICMP6_DST_UNREACH_NOPORT)) {
+ type = ICMP6_PARAM_PROB;
+ code = ICMP6_PARAMPROB_NEXTHEADER;
+ }
+ sctp6_notify(inp, stcb, net, type, code,
+ (uint16_t) ntohl(ip6cp->ip6c_icmp6->icmp6_mtu));
+ } else {
+ if ((stcb == NULL) && (inp != NULL)) {
+ /* reduce inp's ref-count */
+ SCTP_INP_WLOCK(inp);
+ SCTP_INP_DECR_REF(inp);
+ SCTP_INP_WUNLOCK(inp);
+ }
+ if (stcb) {
+ SCTP_TCB_UNLOCK(stcb);
+ }
+ }
+}
+
+#endif
+
void
sctp_over_udp_stop(void)
{
@@ -6946,7 +7229,9 @@ sctp_over_udp_start(void)
}
/* Call the special UDP hook. */
if ((ret = udp_set_kernel_tunneling(SCTP_BASE_INFO(udp4_tun_socket),
- sctp_recv_udp_tunneled_packet))) {
+ sctp_recv_udp_tunneled_packet,
+ sctp_recv_icmp_tunneled_packet,
+ NULL))) {
sctp_over_udp_stop();
return (ret);
}
@@ -6970,7 +7255,9 @@ sctp_over_udp_start(void)
}
/* Call the special UDP hook. */
if ((ret = udp_set_kernel_tunneling(SCTP_BASE_INFO(udp6_tun_socket),
- sctp_recv_udp_tunneled_packet))) {
+ sctp_recv_udp_tunneled_packet,
+ sctp_recv_icmp6_tunneled_packet,
+ NULL))) {
sctp_over_udp_stop();
return (ret);
}
diff --git a/freebsd/sys/netinet/sctputil.h b/freebsd/sys/netinet/sctputil.h
index af5a0f29..292068af 100644
--- a/freebsd/sys/netinet/sctputil.h
+++ b/freebsd/sys/netinet/sctputil.h
@@ -67,6 +67,9 @@ void
/*
* Function prototypes
*/
+int32_t
+sctp_map_assoc_state(int);
+
uint32_t
sctp_get_ifa_hash_val(struct sockaddr *addr);
@@ -80,7 +83,7 @@ uint32_t sctp_select_initial_TSN(struct sctp_pcb *);
uint32_t sctp_select_a_tag(struct sctp_inpcb *, uint16_t lport, uint16_t rport, int);
-int sctp_init_asoc(struct sctp_inpcb *, struct sctp_tcb *, uint32_t, uint32_t);
+int sctp_init_asoc(struct sctp_inpcb *, struct sctp_tcb *, uint32_t, uint32_t, uint16_t);
void sctp_fill_random_store(struct sctp_pcb *);
@@ -105,6 +108,14 @@ void
sctp_mtu_size_reset(struct sctp_inpcb *, struct sctp_association *, uint32_t);
void
+sctp_wakeup_the_read_socket(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
+ int so_locked
+#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING)
+ SCTP_UNUSED
+#endif
+);
+
+void
sctp_add_to_readq(struct sctp_inpcb *inp,
struct sctp_tcb *stcb,
struct sctp_queued_to_read *control,
@@ -117,16 +128,6 @@ sctp_add_to_readq(struct sctp_inpcb *inp,
#endif
);
-int
-sctp_append_to_readq(struct sctp_inpcb *inp,
- struct sctp_tcb *stcb,
- struct sctp_queued_to_read *control,
- struct mbuf *m,
- int end,
- int new_cumack,
- struct sockbuf *sb);
-
-
void sctp_iterator_worker(void);
uint32_t sctp_get_prev_mtu(uint32_t);
@@ -147,9 +148,11 @@ struct sctp_paramhdr *
sctp_get_next_param(struct mbuf *, int,
struct sctp_paramhdr *, int);
-int sctp_add_pad_tombuf(struct mbuf *, int);
+struct mbuf *
+ sctp_add_pad_tombuf(struct mbuf *, int);
-int sctp_pad_lastmbuf(struct mbuf *, int, struct mbuf *);
+struct mbuf *
+ sctp_pad_lastmbuf(struct mbuf *, int, struct mbuf *);
void
sctp_ulp_notify(uint32_t, struct sctp_tcb *, uint32_t, void *, int
@@ -206,7 +209,7 @@ sctp_handle_ootb(struct mbuf *, int, int,
struct sockaddr *, struct sockaddr *,
struct sctphdr *, struct sctp_inpcb *,
struct mbuf *,
- uint8_t, uint32_t,
+ uint8_t, uint32_t, uint16_t,
uint32_t, uint16_t);
int
@@ -215,7 +218,8 @@ sctp_connectx_helper_add(struct sctp_tcb *stcb, struct sockaddr *addr,
struct sctp_tcb *
sctp_connectx_helper_find(struct sctp_inpcb *inp, struct sockaddr *addr,
- int *totaddr, int *num_v4, int *num_v6, int *error, int limit, int *bad_addr);
+ unsigned int *totaddr, unsigned int *num_v4, unsigned int *num_v6,
+ int *error, unsigned int limit, int *bad_addr);
int sctp_is_there_an_abort_here(struct mbuf *, int, uint32_t *);
@@ -276,42 +280,42 @@ sctp_free_bufspace(struct sctp_tcb *, struct sctp_association *,
#define sctp_free_bufspace(stcb, asoc, tp1, chk_cnt) \
do { \
if (tp1->data != NULL) { \
- atomic_subtract_int(&((asoc)->chunks_on_out_queue), chk_cnt); \
+ atomic_subtract_int(&((asoc)->chunks_on_out_queue), chk_cnt); \
if ((asoc)->total_output_queue_size >= tp1->book_size) { \
atomic_subtract_int(&((asoc)->total_output_queue_size), tp1->book_size); \
} else { \
(asoc)->total_output_queue_size = 0; \
} \
- if (stcb->sctp_socket && ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || \
- (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL))) { \
+ if (stcb->sctp_socket && ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || \
+ (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL))) { \
if (stcb->sctp_socket->so_snd.sb_cc >= tp1->book_size) { \
atomic_subtract_int(&((stcb)->sctp_socket->so_snd.sb_cc), tp1->book_size); \
} else { \
stcb->sctp_socket->so_snd.sb_cc = 0; \
} \
} \
- } \
+ } \
} while (0)
#endif
#define sctp_free_spbufspace(stcb, asoc, sp) \
do { \
- if (sp->data != NULL) { \
+ if (sp->data != NULL) { \
if ((asoc)->total_output_queue_size >= sp->length) { \
atomic_subtract_int(&(asoc)->total_output_queue_size, sp->length); \
} else { \
(asoc)->total_output_queue_size = 0; \
} \
- if (stcb->sctp_socket && ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || \
- (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL))) { \
+ if (stcb->sctp_socket && ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || \
+ (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL))) { \
if (stcb->sctp_socket->so_snd.sb_cc >= sp->length) { \
atomic_subtract_int(&stcb->sctp_socket->so_snd.sb_cc,sp->length); \
} else { \
stcb->sctp_socket->so_snd.sb_cc = 0; \
} \
} \
- } \
+ } \
} while (0)
#define sctp_snd_sb_alloc(stcb, sz) \
@@ -347,10 +351,16 @@ void sctp_log_strm_del_alt(struct sctp_tcb *stcb, uint32_t, uint16_t, uint16_t,
void sctp_log_nagle_event(struct sctp_tcb *stcb, int action);
+#ifdef SCTP_MBUF_LOGGING
void
sctp_log_mb(struct mbuf *m, int from);
void
+ sctp_log_mbc(struct mbuf *m, int from);
+
+#endif
+
+void
sctp_sblog(struct sockbuf *sb,
struct sctp_tcb *stcb, int from, int incr);
@@ -365,9 +375,8 @@ void sctp_log_closing(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int16_t loc
void sctp_log_lock(struct sctp_inpcb *inp, struct sctp_tcb *stcb, uint8_t from);
void sctp_log_maxburst(struct sctp_tcb *stcb, struct sctp_nets *, int, int, uint8_t);
-void sctp_log_block(uint8_t, struct sctp_association *, int);
+void sctp_log_block(uint8_t, struct sctp_association *, size_t);
void sctp_log_rwnd(uint8_t, uint32_t, uint32_t, uint32_t);
-void sctp_log_mbcnt(uint8_t, uint32_t, uint32_t, uint32_t, uint32_t);
void sctp_log_rwnd_set(uint8_t, uint32_t, uint32_t, uint32_t, uint32_t);
int sctp_fill_stat_log(void *, size_t *);
void sctp_log_fr(uint32_t, uint32_t, uint32_t, int);
diff --git a/freebsd/sys/netinet/tcp.h b/freebsd/sys/netinet/tcp.h
index fb2f8108..47038104 100644
--- a/freebsd/sys/netinet/tcp.h
+++ b/freebsd/sys/netinet/tcp.h
@@ -97,6 +97,10 @@ struct tcphdr {
#define TCPOLEN_TSTAMP_APPA (TCPOLEN_TIMESTAMP+2) /* appendix A */
#define TCPOPT_SIGNATURE 19 /* Keyed MD5: RFC 2385 */
#define TCPOLEN_SIGNATURE 18
+#define TCPOPT_FAST_OPEN 34
+#define TCPOLEN_FAST_OPEN_EMPTY 2
+#define TCPOLEN_FAST_OPEN_MIN 6
+#define TCPOLEN_FAST_OPEN_MAX 18
/* Miscellaneous constants */
#define MAX_SACK_BLKS 6 /* Max # SACK blocks stored at receiver side */
@@ -161,11 +165,15 @@ struct tcphdr {
#define TCP_MD5SIG 16 /* use MD5 digests (RFC2385) */
#define TCP_INFO 32 /* retrieve tcp_info structure */
#define TCP_CONGESTION 64 /* get/set congestion control algorithm */
+#define TCP_CCALGOOPT 65 /* get/set cc algorithm specific options */
#define TCP_KEEPINIT 128 /* N, time to establish connection */
#define TCP_KEEPIDLE 256 /* L,N,X start keeplives after this period */
#define TCP_KEEPINTVL 512 /* L,N interval between keepalives */
#define TCP_KEEPCNT 1024 /* L,N number of keepalives before close */
-
+#define TCP_FASTOPEN 1025 /* enable TFO / was created via TFO */
+#define TCP_PCAP_OUT 2048 /* number of output packets to keep */
+#define TCP_PCAP_IN 4096 /* number of input packets to keep */
+#define TCP_FUNCTION_BLK 8192 /* Set the tcp function pointers to the specified stack */
/* Start of reserved space for third-party user-settable options. */
#define TCP_VENDOR SO_VENDOR
@@ -243,5 +251,11 @@ struct tcp_info {
u_int32_t __tcpi_pad[26]; /* Padding. */
};
#endif
+#define TCP_FUNCTION_NAME_LEN_MAX 32
+
+struct tcp_function_set {
+ char function_set_name[TCP_FUNCTION_NAME_LEN_MAX];
+ uint32_t pcbcnt;
+};
#endif /* !_NETINET_TCP_H_ */
diff --git a/freebsd/sys/netinet/tcp_debug.c b/freebsd/sys/netinet/tcp_debug.c
index 2ef9ce43..c5f74182 100644
--- a/freebsd/sys/netinet/tcp_debug.c
+++ b/freebsd/sys/netinet/tcp_debug.c
@@ -177,11 +177,10 @@ tcp_trace(short act, short ostate, struct tcpcb *tp, void *ipgen,
#ifdef INET6
isipv6 ? ntohs(((struct ip6_hdr *)ipgen)->ip6_plen) :
#endif
- ((struct ip *)ipgen)->ip_len;
+ ntohs(((struct ip *)ipgen)->ip_len);
if (act == TA_OUTPUT) {
seq = ntohl(seq);
ack = ntohl(ack);
- len = ntohs((u_short)len);
}
if (act == TA_OUTPUT)
len -= sizeof (struct tcphdr);
diff --git a/freebsd/sys/netinet/tcp_hostcache.c b/freebsd/sys/netinet/tcp_hostcache.c
index 260d161d..4e78b8b2 100644
--- a/freebsd/sys/netinet/tcp_hostcache.c
+++ b/freebsd/sys/netinet/tcp_hostcache.c
@@ -34,8 +34,8 @@
* table to a dedicated structure indexed by the remote IP address. It keeps
* information on the measured TCP parameters of past TCP sessions to allow
* better initial start values to be used with later connections to/from the
- * same source. Depending on the network parameters (delay, bandwidth, max
- * MTU, congestion window) between local and remote sites, this can lead to
+ * same source. Depending on the network parameters (delay, max MTU,
+ * congestion window) between local and remote sites, this can lead to
* significant speed-ups for new TCP connections after the first one.
*
* Due to the tcp_hostcache, all TCP-specific metrics information in the
@@ -81,6 +81,7 @@ __FBSDID("$FreeBSD$");
#include <sys/sysctl.h>
#include <net/if.h>
+#include <net/if_var.h>
#include <net/route.h>
#include <net/vnet.h>
@@ -118,37 +119,38 @@ static VNET_DEFINE(struct callout, tcp_hc_callout);
static struct hc_metrics *tcp_hc_lookup(struct in_conninfo *);
static struct hc_metrics *tcp_hc_insert(struct in_conninfo *);
static int sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS);
+static int sysctl_tcp_hc_purgenow(SYSCTL_HANDLER_ARGS);
static void tcp_hc_purge_internal(int);
static void tcp_hc_purge(void *);
static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hostcache, CTLFLAG_RW, 0,
"TCP Host cache");
-SYSCTL_VNET_UINT(_net_inet_tcp_hostcache, OID_AUTO, cachelimit, CTLFLAG_RDTUN,
+SYSCTL_UINT(_net_inet_tcp_hostcache, OID_AUTO, cachelimit, CTLFLAG_VNET | CTLFLAG_RDTUN,
&VNET_NAME(tcp_hostcache.cache_limit), 0,
"Overall entry limit for hostcache");
-SYSCTL_VNET_UINT(_net_inet_tcp_hostcache, OID_AUTO, hashsize, CTLFLAG_RDTUN,
+SYSCTL_UINT(_net_inet_tcp_hostcache, OID_AUTO, hashsize, CTLFLAG_VNET | CTLFLAG_RDTUN,
&VNET_NAME(tcp_hostcache.hashsize), 0,
"Size of TCP hostcache hashtable");
-SYSCTL_VNET_UINT(_net_inet_tcp_hostcache, OID_AUTO, bucketlimit,
- CTLFLAG_RDTUN, &VNET_NAME(tcp_hostcache.bucket_limit), 0,
+SYSCTL_UINT(_net_inet_tcp_hostcache, OID_AUTO, bucketlimit,
+ CTLFLAG_VNET | CTLFLAG_RDTUN, &VNET_NAME(tcp_hostcache.bucket_limit), 0,
"Per-bucket hash limit for hostcache");
-SYSCTL_VNET_UINT(_net_inet_tcp_hostcache, OID_AUTO, count, CTLFLAG_RD,
+SYSCTL_UINT(_net_inet_tcp_hostcache, OID_AUTO, count, CTLFLAG_VNET | CTLFLAG_RD,
&VNET_NAME(tcp_hostcache.cache_count), 0,
"Current number of entries in hostcache");
-SYSCTL_VNET_INT(_net_inet_tcp_hostcache, OID_AUTO, expire, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, expire, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_hostcache.expire), 0,
"Expire time of TCP hostcache entries");
-SYSCTL_VNET_INT(_net_inet_tcp_hostcache, OID_AUTO, prune, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, prune, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_hostcache.prune), 0,
"Time between purge runs");
-SYSCTL_VNET_INT(_net_inet_tcp_hostcache, OID_AUTO, purge, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, purge, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_hostcache.purgeall), 0,
"Expire all entires on next purge run");
@@ -156,6 +158,9 @@ SYSCTL_PROC(_net_inet_tcp_hostcache, OID_AUTO, list,
CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_SKIP, 0, 0,
sysctl_tcp_hc_list, "A", "List of all hostcache entries");
+SYSCTL_PROC(_net_inet_tcp_hostcache, OID_AUTO, purgenow,
+ CTLTYPE_INT | CTLFLAG_RW, NULL, 0,
+ sysctl_tcp_hc_purgenow, "I", "Immediately purge all entries");
static MALLOC_DEFINE(M_HOSTCACHE, "hostcache", "TCP hostcache");
@@ -235,7 +240,7 @@ tcp_hc_init(void)
/*
* Set up periodic cache cleanup.
*/
- callout_init(&V_tcp_hc_callout, CALLOUT_MPSAFE);
+ callout_init(&V_tcp_hc_callout, 1);
callout_reset(&V_tcp_hc_callout, V_tcp_hostcache.prune * hz,
tcp_hc_purge, curvnet);
}
@@ -297,6 +302,7 @@ tcp_hc_lookup(struct in_conninfo *inc)
*/
TAILQ_FOREACH(hc_entry, &hc_head->hch_bucket, rmx_q) {
if (inc->inc_flags & INC_ISIPV6) {
+ /* XXX: check ip6_zoneid */
if (memcmp(&inc->inc6_faddr, &hc_entry->ip6,
sizeof(inc->inc6_faddr)) == 0)
return hc_entry;
@@ -388,9 +394,10 @@ tcp_hc_insert(struct in_conninfo *inc)
* Initialize basic information of hostcache entry.
*/
bzero(hc_entry, sizeof(*hc_entry));
- if (inc->inc_flags & INC_ISIPV6)
- bcopy(&inc->inc6_faddr, &hc_entry->ip6, sizeof(hc_entry->ip6));
- else
+ if (inc->inc_flags & INC_ISIPV6) {
+ hc_entry->ip6 = inc->inc6_faddr;
+ hc_entry->ip6_zoneid = inc->inc6_zoneid;
+ } else
hc_entry->ip4 = inc->inc_faddr;
hc_entry->rmx_head = hc_head;
hc_entry->rmx_expire = V_tcp_hostcache.expire;
@@ -435,7 +442,6 @@ tcp_hc_get(struct in_conninfo *inc, struct hc_metrics_lite *hc_metrics_lite)
hc_metrics_lite->rmx_ssthresh = hc_entry->rmx_ssthresh;
hc_metrics_lite->rmx_rtt = hc_entry->rmx_rtt;
hc_metrics_lite->rmx_rttvar = hc_entry->rmx_rttvar;
- hc_metrics_lite->rmx_bandwidth = hc_entry->rmx_bandwidth;
hc_metrics_lite->rmx_cwnd = hc_entry->rmx_cwnd;
hc_metrics_lite->rmx_sendpipe = hc_entry->rmx_sendpipe;
hc_metrics_lite->rmx_recvpipe = hc_entry->rmx_recvpipe;
@@ -550,14 +556,6 @@ tcp_hc_update(struct in_conninfo *inc, struct hc_metrics_lite *hcml)
(hc_entry->rmx_ssthresh + hcml->rmx_ssthresh) / 2;
TCPSTAT_INC(tcps_cachedssthresh);
}
- if (hcml->rmx_bandwidth != 0) {
- if (hc_entry->rmx_bandwidth == 0)
- hc_entry->rmx_bandwidth = hcml->rmx_bandwidth;
- else
- hc_entry->rmx_bandwidth =
- (hc_entry->rmx_bandwidth + hcml->rmx_bandwidth) / 2;
- /* TCPSTAT_INC(tcps_cachedbandwidth); */
- }
if (hcml->rmx_cwnd != 0) {
if (hc_entry->rmx_cwnd == 0)
hc_entry->rmx_cwnd = hcml->rmx_cwnd;
@@ -595,7 +593,7 @@ tcp_hc_update(struct in_conninfo *inc, struct hc_metrics_lite *hcml)
static int
sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS)
{
- int linesize = 128;
+ const int linesize = 128;
struct sbuf sb;
int i, error;
struct hc_metrics *hc_entry;
@@ -604,10 +602,10 @@ sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS)
#endif
sbuf_new(&sb, NULL, linesize * (V_tcp_hostcache.cache_count + 1),
- SBUF_FIXEDLEN);
+ SBUF_INCLUDENUL);
sbuf_printf(&sb,
- "\nIP address MTU SSTRESH RTT RTTVAR BANDWIDTH "
+ "\nIP address MTU SSTRESH RTT RTTVAR "
" CWND SENDPIPE RECVPIPE HITS UPD EXP\n");
#define msec(u) (((u) + 500) / 1000)
@@ -616,8 +614,8 @@ sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS)
TAILQ_FOREACH(hc_entry, &V_tcp_hostcache.hashbase[i].hch_bucket,
rmx_q) {
sbuf_printf(&sb,
- "%-15s %5lu %8lu %6lums %6lums %9lu %8lu %8lu %8lu "
- "%4lu %4lu %4i\n",
+ "%-15s %5lu %8lu %6lums %6lums %8lu %8lu %8lu %4lu "
+ "%4lu %4i\n",
hc_entry->ip4.s_addr ? inet_ntoa(hc_entry->ip4) :
#ifdef INET6
ip6_sprintf(ip6buf, &hc_entry->ip6),
@@ -630,7 +628,6 @@ sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS)
(RTM_RTTUNIT / (hz * TCP_RTT_SCALE))),
msec(hc_entry->rmx_rttvar *
(RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE))),
- hc_entry->rmx_bandwidth * 8,
hc_entry->rmx_cwnd,
hc_entry->rmx_sendpipe,
hc_entry->rmx_recvpipe,
@@ -641,8 +638,9 @@ sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS)
THC_UNLOCK(&V_tcp_hostcache.hashbase[i].hch_mtx);
}
#undef msec
- sbuf_finish(&sb);
- error = SYSCTL_OUT(req, sbuf_data(&sb), sbuf_len(&sb));
+ error = sbuf_finish(&sb);
+ if (error == 0)
+ error = SYSCTL_OUT(req, sbuf_data(&sb), sbuf_len(&sb));
sbuf_delete(&sb);
return(error);
}
@@ -694,3 +692,24 @@ tcp_hc_purge(void *arg)
tcp_hc_purge, arg);
CURVNET_RESTORE();
}
+
+/*
+ * Expire and purge all entries in hostcache immediately.
+ */
+static int
+sysctl_tcp_hc_purgenow(SYSCTL_HANDLER_ARGS)
+{
+ int error, val;
+
+ val = 0;
+ error = sysctl_handle_int(oidp, &val, 0, req);
+ if (error || !req->newptr)
+ return (error);
+
+ tcp_hc_purge_internal(1);
+
+ callout_reset(&V_tcp_hc_callout, V_tcp_hostcache.prune * hz,
+ tcp_hc_purge, curvnet);
+
+ return (0);
+}
diff --git a/freebsd/sys/netinet/tcp_hostcache.h b/freebsd/sys/netinet/tcp_hostcache.h
index 8569edcc..44875ff6 100644
--- a/freebsd/sys/netinet/tcp_hostcache.h
+++ b/freebsd/sys/netinet/tcp_hostcache.h
@@ -51,12 +51,12 @@ struct hc_metrics {
struct hc_head *rmx_head; /* head of bucket tail queue */
struct in_addr ip4; /* IP address */
struct in6_addr ip6; /* IP6 address */
+ uint32_t ip6_zoneid; /* IPv6 scope zone id */
/* endpoint specific values for tcp */
u_long rmx_mtu; /* MTU for this path */
u_long rmx_ssthresh; /* outbound gateway buffer limit */
u_long rmx_rtt; /* estimated round trip time */
u_long rmx_rttvar; /* estimated rtt variance */
- u_long rmx_bandwidth; /* estimated bandwidth */
u_long rmx_cwnd; /* congestion window */
u_long rmx_sendpipe; /* outbound delay-bandwidth product */
u_long rmx_recvpipe; /* inbound delay-bandwidth product */
diff --git a/freebsd/sys/netinet/tcp_input.c b/freebsd/sys/netinet/tcp_input.c
index f9512eb3..eaa3eb3d 100644
--- a/freebsd/sys/netinet/tcp_input.c
+++ b/freebsd/sys/netinet/tcp_input.c
@@ -52,7 +52,6 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
-#include <rtems/bsd/local/opt_ipfw.h> /* for ipfw_fwd */
#include <rtems/bsd/local/opt_inet.h>
#include <rtems/bsd/local/opt_inet6.h>
#include <rtems/bsd/local/opt_ipsec.h>
@@ -65,6 +64,7 @@ __FBSDID("$FreeBSD$");
#include <sys/mbuf.h>
#include <sys/proc.h> /* for proc0 declaration */
#include <sys/protosw.h>
+#include <sys/sdt.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
@@ -77,16 +77,16 @@ __FBSDID("$FreeBSD$");
#include <vm/uma.h>
#include <net/if.h>
+#include <net/if_var.h>
#include <net/route.h>
#include <net/vnet.h>
#define TCPSTATES /* for logging */
-#include <netinet/cc.h>
#include <netinet/in.h>
+#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
-#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h> /* required for icmp_var.h */
#include <netinet/icmp_var.h> /* for ICMP_BANDLIM */
@@ -95,14 +95,23 @@ __FBSDID("$FreeBSD$");
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet6/in6_pcb.h>
+#include <netinet6/in6_var.h>
#include <netinet6/ip6_var.h>
#include <netinet6/nd6.h>
+#ifdef TCP_RFC7413
+#include <netinet/tcp_fastopen.h>
+#endif
+#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet6/tcp6_var.h>
#include <netinet/tcpip.h>
+#include <netinet/cc/cc.h>
+#ifdef TCPPCAP
+#include <netinet/tcp_pcap.h>
+#endif
#include <netinet/tcp_syncache.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
@@ -122,11 +131,6 @@ __FBSDID("$FreeBSD$");
const int tcprexmtthresh = 3;
-VNET_DEFINE(struct tcpstat, tcpstat);
-SYSCTL_VNET_STRUCT(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RW,
- &VNET_NAME(tcpstat), tcpstat,
- "TCP statistics (struct tcpstat, netinet/tcp_var.h)");
-
int tcp_log_in_vain = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW,
&tcp_log_in_vain, 0,
@@ -134,88 +138,96 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW,
VNET_DEFINE(int, blackhole) = 0;
#define V_blackhole VNET(blackhole)
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(blackhole), 0,
"Do not send RST on segments to closed ports");
VNET_DEFINE(int, tcp_delack_enabled) = 1;
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_delack_enabled), 0,
"Delay ACK to try and piggyback it onto a data packet");
VNET_DEFINE(int, drop_synfin) = 0;
#define V_drop_synfin VNET(drop_synfin)
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(drop_synfin), 0,
"Drop TCP packets with SYN+FIN set");
+VNET_DEFINE(int, tcp_do_rfc6675_pipe) = 0;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc6675_pipe, CTLFLAG_VNET | CTLFLAG_RW,
+ &VNET_NAME(tcp_do_rfc6675_pipe), 0,
+ "Use calculated pipe/in-flight bytes per RFC 6675");
+
VNET_DEFINE(int, tcp_do_rfc3042) = 1;
#define V_tcp_do_rfc3042 VNET(tcp_do_rfc3042)
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_do_rfc3042), 0,
"Enable RFC 3042 (Limited Transmit)");
VNET_DEFINE(int, tcp_do_rfc3390) = 1;
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_do_rfc3390), 0,
"Enable RFC 3390 (Increasing TCP's Initial Congestion Window)");
-SYSCTL_NODE(_net_inet_tcp, OID_AUTO, experimental, CTLFLAG_RW, 0,
- "Experimental TCP extensions");
-
-VNET_DEFINE(int, tcp_do_initcwnd10) = 1;
-SYSCTL_VNET_INT(_net_inet_tcp_experimental, OID_AUTO, initcwnd10, CTLFLAG_RW,
- &VNET_NAME(tcp_do_initcwnd10), 0,
- "Enable RFC 6928 (Increasing initial CWND to 10)");
+VNET_DEFINE(int, tcp_initcwnd_segments) = 10;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, initcwnd_segments,
+ CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_initcwnd_segments), 0,
+ "Slow-start flight size (initial congestion window) in number of segments");
VNET_DEFINE(int, tcp_do_rfc3465) = 1;
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_do_rfc3465), 0,
"Enable RFC 3465 (Appropriate Byte Counting)");
VNET_DEFINE(int, tcp_abc_l_var) = 2;
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, abc_l_var, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, abc_l_var, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_abc_l_var), 2,
"Cap the max cwnd increment during slow-start to this number of segments");
static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, ecn, CTLFLAG_RW, 0, "TCP ECN");
-VNET_DEFINE(int, tcp_do_ecn) = 0;
-SYSCTL_VNET_INT(_net_inet_tcp_ecn, OID_AUTO, enable, CTLFLAG_RW,
+VNET_DEFINE(int, tcp_do_ecn) = 2;
+SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, enable, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_do_ecn), 0,
"TCP ECN support");
VNET_DEFINE(int, tcp_ecn_maxretries) = 1;
-SYSCTL_VNET_INT(_net_inet_tcp_ecn, OID_AUTO, maxretries, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, maxretries, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_ecn_maxretries), 0,
"Max retries before giving up on ECN");
+VNET_DEFINE(int, tcp_insecure_syn) = 0;
+#define V_tcp_insecure_syn VNET(tcp_insecure_syn)
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_syn, CTLFLAG_VNET | CTLFLAG_RW,
+ &VNET_NAME(tcp_insecure_syn), 0,
+ "Follow RFC793 instead of RFC5961 criteria for accepting SYN packets");
+
VNET_DEFINE(int, tcp_insecure_rst) = 0;
#define V_tcp_insecure_rst VNET(tcp_insecure_rst)
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_insecure_rst), 0,
- "Follow the old (insecure) criteria for accepting RST packets");
+ "Follow RFC793 instead of RFC5961 criteria for accepting RST packets");
VNET_DEFINE(int, tcp_recvspace) = 1024*64;
#define V_tcp_recvspace VNET(tcp_recvspace)
-SYSCTL_VNET_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_recvspace), 0, "Initial receive socket buffer size");
VNET_DEFINE(int, tcp_do_autorcvbuf) = 1;
#define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf)
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_do_autorcvbuf), 0,
"Enable automatic receive buffer sizing");
VNET_DEFINE(int, tcp_autorcvbuf_inc) = 16*1024;
#define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc)
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_autorcvbuf_inc), 0,
"Incrementor step size of automatic receive buffer");
VNET_DEFINE(int, tcp_autorcvbuf_max) = 2*1024*1024;
#define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max)
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_autorcvbuf_max), 0,
"Max size of automatic receive buffer");
@@ -223,47 +235,55 @@ VNET_DEFINE(struct inpcbhead, tcb);
#define tcb6 tcb /* for KAME src sync over BSD*'s */
VNET_DEFINE(struct inpcbinfo, tcbinfo);
-static void tcp_dooptions(struct tcpopt *, u_char *, int, int);
-static void tcp_do_segment(struct mbuf *, struct tcphdr *,
- struct socket *, struct tcpcb *, int, int, uint8_t,
- int);
-static void tcp_dropwithreset(struct mbuf *, struct tcphdr *,
- struct tcpcb *, int, int);
-static void tcp_pulloutofband(struct socket *,
- struct tcphdr *, struct mbuf *, int);
-static void tcp_xmit_timer(struct tcpcb *, int);
-static void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *);
-static void inline tcp_fields_to_host(struct tcphdr *);
-#ifdef TCP_SIGNATURE
-static void inline tcp_fields_to_net(struct tcphdr *);
-static int inline tcp_signature_verify_input(struct mbuf *, int, int,
- int, struct tcpopt *, struct tcphdr *, u_int);
-#endif
-static void inline cc_ack_received(struct tcpcb *tp, struct tcphdr *th,
- uint16_t type);
-static void inline cc_conn_init(struct tcpcb *tp);
-static void inline cc_post_recovery(struct tcpcb *tp, struct tcphdr *th);
-static void inline hhook_run_tcp_est_in(struct tcpcb *tp,
- struct tcphdr *th, struct tcpopt *to);
+/*
+ * TCP statistics are stored in an array of counter(9)s, whose size matches
+ * size of struct tcpstat. TCP running connection count is a regular array.
+ */
+VNET_PCPUSTAT_DEFINE(struct tcpstat, tcpstat);
+SYSCTL_VNET_PCPUSTAT(_net_inet_tcp, TCPCTL_STATS, stats, struct tcpstat,
+ tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)");
+VNET_DEFINE(counter_u64_t, tcps_states[TCP_NSTATES]);
+SYSCTL_COUNTER_U64_ARRAY(_net_inet_tcp, TCPCTL_STATES, states, CTLFLAG_RD |
+ CTLFLAG_VNET, &VNET_NAME(tcps_states)[0], TCP_NSTATES,
+ "TCP connection counts by TCP state");
+
+static void
+tcp_vnet_init(const void *unused)
+{
+
+ COUNTER_ARRAY_ALLOC(V_tcps_states, TCP_NSTATES, M_WAITOK);
+ VNET_PCPUSTAT_ALLOC(tcpstat, M_WAITOK);
+}
+VNET_SYSINIT(tcp_vnet_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
+ tcp_vnet_init, NULL);
+
+#ifdef VIMAGE
+static void
+tcp_vnet_uninit(const void *unused)
+{
+
+ COUNTER_ARRAY_FREE(V_tcps_states, TCP_NSTATES);
+ VNET_PCPUSTAT_FREE(tcpstat);
+}
+VNET_SYSUNINIT(tcp_vnet_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
+ tcp_vnet_uninit, NULL);
+#endif /* VIMAGE */
/*
* Kernel module interface for updating tcpstat. The argument is an index
- * into tcpstat treated as an array of u_long. While this encodes the
- * general layout of tcpstat into the caller, it doesn't encode its location,
- * so that future changes to add, for example, per-CPU stats support won't
- * cause binary compatibility problems for kernel modules.
+ * into tcpstat treated as an array.
*/
void
kmod_tcpstat_inc(int statnum)
{
- (*((u_long *)&V_tcpstat + statnum))++;
+ counter_u64_add(VNET(tcpstat)[statnum], 1);
}
/*
* Wrapper for the TCP established input helper hook.
*/
-static void inline
+void
hhook_run_tcp_est_in(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to)
{
struct tcp_hhook_data hhook_data;
@@ -281,7 +301,7 @@ hhook_run_tcp_est_in(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to)
/*
* CC wrapper hook functions
*/
-static void inline
+void
cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t type)
{
INP_WLOCK_ASSERT(tp->t_inpcb);
@@ -295,7 +315,7 @@ cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t type)
if (type == CC_ACK) {
if (tp->snd_cwnd > tp->snd_ssthresh) {
tp->t_bytes_acked += min(tp->ccv->bytes_this_ack,
- V_tcp_abc_l_var * tp->t_maxseg);
+ V_tcp_abc_l_var * tcp_maxseg(tp));
if (tp->t_bytes_acked >= tp->snd_cwnd) {
tp->t_bytes_acked -= tp->snd_cwnd;
tp->ccv->flags |= CCF_ABC_SENTAWND;
@@ -313,16 +333,18 @@ cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t type)
}
}
-static void inline
+void
cc_conn_init(struct tcpcb *tp)
{
struct hc_metrics_lite metrics;
struct inpcb *inp = tp->t_inpcb;
+ u_int maxseg;
int rtt;
INP_WLOCK_ASSERT(tp->t_inpcb);
tcp_hc_get(&inp->inp_inc, &metrics);
+ maxseg = tcp_maxseg(tp);
if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) {
tp->t_srtt = rtt;
@@ -344,10 +366,10 @@ cc_conn_init(struct tcpcb *tp)
/*
* There's some sort of gateway or interface
* buffer limit on the path. Use this to set
- * the slow start threshhold, but set the
+ * the slow start threshold, but set the
* threshold to no less than 2*mss.
*/
- tp->snd_ssthresh = max(2 * tp->t_maxseg, metrics.rmx_ssthresh);
+ tp->snd_ssthresh = max(2 * maxseg, metrics.rmx_ssthresh);
TCPSTAT_INC(tcps_usedssthresh);
}
@@ -357,27 +379,27 @@ cc_conn_init(struct tcpcb *tp)
* RFC5681 Section 3.1 specifies the default conservative values.
* RFC3390 specifies slightly more aggressive values.
* RFC6928 increases it to ten segments.
+ * Support for user specified value for initial flight size.
*
* If a SYN or SYN/ACK was lost and retransmitted, we have to
* reduce the initial CWND to one segment as congestion is likely
* requiring us to be cautious.
*/
if (tp->snd_cwnd == 1)
- tp->snd_cwnd = tp->t_maxseg; /* SYN(-ACK) lost */
- else if (V_tcp_do_initcwnd10)
- tp->snd_cwnd = min(10 * tp->t_maxseg,
- max(2 * tp->t_maxseg, 14600));
+ tp->snd_cwnd = maxseg; /* SYN(-ACK) lost */
+ else if (V_tcp_initcwnd_segments)
+ tp->snd_cwnd = min(V_tcp_initcwnd_segments * maxseg,
+ max(2 * maxseg, V_tcp_initcwnd_segments * 1460));
else if (V_tcp_do_rfc3390)
- tp->snd_cwnd = min(4 * tp->t_maxseg,
- max(2 * tp->t_maxseg, 4380));
+ tp->snd_cwnd = min(4 * maxseg, max(2 * maxseg, 4380));
else {
/* Per RFC5681 Section 3.1 */
- if (tp->t_maxseg > 2190)
- tp->snd_cwnd = 2 * tp->t_maxseg;
- else if (tp->t_maxseg > 1095)
- tp->snd_cwnd = 3 * tp->t_maxseg;
+ if (maxseg > 2190)
+ tp->snd_cwnd = 2 * maxseg;
+ else if (maxseg > 1095)
+ tp->snd_cwnd = 3 * maxseg;
else
- tp->snd_cwnd = 4 * tp->t_maxseg;
+ tp->snd_cwnd = 4 * maxseg;
}
if (CC_ALGO(tp)->conn_init != NULL)
@@ -387,6 +409,8 @@ cc_conn_init(struct tcpcb *tp)
void inline
cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type)
{
+ u_int maxseg;
+
INP_WLOCK_ASSERT(tp->t_inpcb);
switch(type) {
@@ -406,12 +430,13 @@ cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type)
}
break;
case CC_RTO:
+ maxseg = tcp_maxseg(tp);
tp->t_dupacks = 0;
tp->t_bytes_acked = 0;
EXIT_RECOVERY(tp->t_flags);
tp->snd_ssthresh = max(2, min(tp->snd_wnd, tp->snd_cwnd) / 2 /
- tp->t_maxseg) * tp->t_maxseg;
- tp->snd_cwnd = tp->t_maxseg;
+ maxseg) * maxseg;
+ tp->snd_cwnd = maxseg;
break;
case CC_RTO_ERR:
TCPSTAT_INC(tcps_sndrexmitbad);
@@ -436,7 +461,7 @@ cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type)
}
}
-static void inline
+void inline
cc_post_recovery(struct tcpcb *tp, struct tcphdr *th)
{
INP_WLOCK_ASSERT(tp->t_inpcb);
@@ -451,27 +476,7 @@ cc_post_recovery(struct tcpcb *tp, struct tcphdr *th)
tp->t_bytes_acked = 0;
}
-static inline void
-tcp_fields_to_host(struct tcphdr *th)
-{
-
- th->th_seq = ntohl(th->th_seq);
- th->th_ack = ntohl(th->th_ack);
- th->th_win = ntohs(th->th_win);
- th->th_urp = ntohs(th->th_urp);
-}
-
#ifdef TCP_SIGNATURE
-static inline void
-tcp_fields_to_net(struct tcphdr *th)
-{
-
- th->th_seq = htonl(th->th_seq);
- th->th_ack = htonl(th->th_ack);
- th->th_win = htons(th->th_win);
- th->th_urp = htons(th->th_urp);
-}
-
static inline int
tcp_signature_verify_input(struct mbuf *m, int off0, int tlen, int optlen,
struct tcpopt *to, struct tcphdr *th, u_int tcpbflag)
@@ -485,34 +490,56 @@ tcp_signature_verify_input(struct mbuf *m, int off0, int tlen, int optlen,
}
#endif
-/* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */
-#ifdef INET6
-#define ND6_HINT(tp) \
-do { \
- if ((tp) && (tp)->t_inpcb && \
- ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0) \
- nd6_nud_hint(NULL, NULL, 0); \
-} while (0)
-#else
-#define ND6_HINT(tp)
-#endif
-
/*
* Indicate whether this ack should be delayed. We can delay the ack if
- * - there is no delayed ack timer in progress and
- * - our last ack wasn't a 0-sized window. We never want to delay
- * the ack that opens up a 0-sized window and
- * - delayed acks are enabled or
- * - this is a half-synchronized T/TCP connection.
- * - the segment size is not larger than the MSS and LRO wasn't used
- * for this segment.
+ * the following conditions are met:
+ * - There is no delayed ack timer in progress.
+ * - Our last ack wasn't a 0-sized window. We never want to delay
+ * the ack that opens up a 0-sized window.
+ * - LRO wasn't used for this segment. We make sure by checking that the
+ * segment size is not larger than the MSS.
*/
#define DELAY_ACK(tp, tlen) \
((!tcp_timer_active(tp, TT_DELACK) && \
(tp->t_flags & TF_RXWIN0SENT) == 0) && \
- (tlen <= tp->t_maxopd) && \
+ (tlen <= tp->t_maxseg) && \
(V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN)))
+static void inline
+cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, uint8_t iptos)
+{
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ if (CC_ALGO(tp)->ecnpkt_handler != NULL) {
+ switch (iptos & IPTOS_ECN_MASK) {
+ case IPTOS_ECN_CE:
+ tp->ccv->flags |= CCF_IPHDR_CE;
+ break;
+ case IPTOS_ECN_ECT0:
+ tp->ccv->flags &= ~CCF_IPHDR_CE;
+ break;
+ case IPTOS_ECN_ECT1:
+ tp->ccv->flags &= ~CCF_IPHDR_CE;
+ break;
+ }
+
+ if (th->th_flags & TH_CWR)
+ tp->ccv->flags |= CCF_TCPHDR_CWR;
+ else
+ tp->ccv->flags &= ~CCF_TCPHDR_CWR;
+
+ if (tp->t_flags & TF_DELACK)
+ tp->ccv->flags |= CCF_DELACK;
+ else
+ tp->ccv->flags &= ~CCF_DELACK;
+
+ CC_ALGO(tp)->ecnpkt_handler(tp->ccv);
+
+ if (tp->ccv->flags & CCF_ACKNOW)
+ tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
+ }
+}
+
/*
* TCP input handling is split into multiple parts:
* tcp6_input is a thin wrapper around tcp_input for the extended
@@ -528,6 +555,7 @@ tcp6_input(struct mbuf **mp, int *offp, int proto)
{
struct mbuf *m = *mp;
struct in6_ifaddr *ia6;
+ struct ip6_hdr *ip6;
IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE);
@@ -535,7 +563,8 @@ tcp6_input(struct mbuf **mp, int *offp, int proto)
* draft-itojun-ipv6-tcp-to-anycast
* better place to put this in?
*/
- ia6 = ip6_getdstifaddr(m);
+ ip6 = mtod(m, struct ip6_hdr *);
+ ia6 = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */);
if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) {
struct ip6_hdr *ip6;
@@ -543,28 +572,26 @@ tcp6_input(struct mbuf **mp, int *offp, int proto)
ip6 = mtod(m, struct ip6_hdr *);
icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR,
(caddr_t)&ip6->ip6_dst - (caddr_t)ip6);
- return IPPROTO_DONE;
+ return (IPPROTO_DONE);
}
if (ia6)
ifa_free(&ia6->ia_ifa);
- tcp_input(m, *offp);
- return IPPROTO_DONE;
+ return (tcp_input(mp, offp, proto));
}
#endif /* INET6 */
-void
-tcp_input(struct mbuf *m, int off0)
+int
+tcp_input(struct mbuf **mp, int *offp, int proto)
{
+ struct mbuf *m = *mp;
struct tcphdr *th = NULL;
struct ip *ip = NULL;
-#ifdef INET
- struct ipovly *ipov;
-#endif
struct inpcb *inp = NULL;
struct tcpcb *tp = NULL;
struct socket *so = NULL;
u_char *optp = NULL;
+ int off0;
int optlen = 0;
#ifdef INET
int len;
@@ -587,9 +614,6 @@ tcp_input(struct mbuf *m, int off0)
struct tcpopt to; /* options in this segment */
char *s = NULL; /* address and port logging */
int ti_locked;
-#define TI_UNLOCKED 1
-#define TI_WLOCKED 2
-
#ifdef TCPDEBUG
/*
* The size of tcp_saveipgen must be the size of the max ip header,
@@ -604,6 +628,9 @@ tcp_input(struct mbuf *m, int off0)
isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
#endif
+ off0 = *offp;
+ m = *mp;
+ *mp = NULL;
to.to_flags = 0;
TCPSTAT_INC(tcps_rcvtotal);
@@ -615,7 +642,7 @@ tcp_input(struct mbuf *m, int off0)
m = m_pullup(m, sizeof(*ip6) + sizeof(*th));
if (m == NULL) {
TCPSTAT_INC(tcps_rcvshort);
- return;
+ return (IPPROTO_DONE);
}
}
@@ -660,45 +687,43 @@ tcp_input(struct mbuf *m, int off0)
* Note: IP leaves IP header in first mbuf.
*/
if (off0 > sizeof (struct ip)) {
- ip_stripoptions(m, (struct mbuf *)0);
+ ip_stripoptions(m);
off0 = sizeof(struct ip);
}
if (m->m_len < sizeof (struct tcpiphdr)) {
if ((m = m_pullup(m, sizeof (struct tcpiphdr)))
== NULL) {
TCPSTAT_INC(tcps_rcvshort);
- return;
+ return (IPPROTO_DONE);
}
}
ip = mtod(m, struct ip *);
- ipov = (struct ipovly *)ip;
th = (struct tcphdr *)((caddr_t)ip + off0);
- tlen = ip->ip_len;
+ tlen = ntohs(ip->ip_len) - off0;
if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
th->th_sum = m->m_pkthdr.csum_data;
else
th->th_sum = in_pseudo(ip->ip_src.s_addr,
- ip->ip_dst.s_addr,
- htonl(m->m_pkthdr.csum_data +
- ip->ip_len +
- IPPROTO_TCP));
+ ip->ip_dst.s_addr,
+ htonl(m->m_pkthdr.csum_data + tlen +
+ IPPROTO_TCP));
th->th_sum ^= 0xffff;
-#ifdef TCPDEBUG
- ipov->ih_len = (u_short)tlen;
- ipov->ih_len = htons(ipov->ih_len);
-#endif
} else {
+ struct ipovly *ipov = (struct ipovly *)ip;
+
/*
* Checksum extended TCP header and data.
*/
- len = sizeof (struct ip) + tlen;
+ len = off0 + tlen;
bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
- ipov->ih_len = (u_short)tlen;
- ipov->ih_len = htons(ipov->ih_len);
+ ipov->ih_len = htons(tlen);
th->th_sum = in_cksum(m, len);
+ /* Reset length for SDT probes. */
+ ip->ip_len = htons(tlen + off0);
}
+
if (th->th_sum) {
TCPSTAT_INC(tcps_rcvbadsum);
goto drop;
@@ -732,7 +757,7 @@ tcp_input(struct mbuf *m, int off0)
if (off > sizeof (struct tcphdr)) {
#ifdef INET6
if (isipv6) {
- IP6_EXTHDR_CHECK(m, off0, off, );
+ IP6_EXTHDR_CHECK(m, off0, off, IPPROTO_DONE);
ip6 = mtod(m, struct ip6_hdr *);
th = (struct tcphdr *)((caddr_t)ip6 + off0);
}
@@ -746,10 +771,9 @@ tcp_input(struct mbuf *m, int off0)
if ((m = m_pullup(m, sizeof (struct ip) + off))
== NULL) {
TCPSTAT_INC(tcps_rcvshort);
- return;
+ return (IPPROTO_DONE);
}
ip = mtod(m, struct ip *);
- ipov = (struct ipovly *)ip;
th = (struct tcphdr *)((caddr_t)ip + off0);
}
}
@@ -771,26 +795,17 @@ tcp_input(struct mbuf *m, int off0)
/*
* Locate pcb for segment; if we're likely to add or remove a
- * connection then first acquire pcbinfo lock. There are two cases
+ * connection then first acquire pcbinfo lock. There are three cases
* where we might discover later we need a write lock despite the
- * flags: ACKs moving a connection out of the syncache, and ACKs for
- * a connection in TIMEWAIT.
+ * flags: ACKs moving a connection out of the syncache, ACKs for a
+ * connection in TIMEWAIT and SYNs not targeting a listening socket.
*/
- if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0) {
- INP_INFO_WLOCK(&V_tcbinfo);
- ti_locked = TI_WLOCKED;
+ if ((thflags & (TH_FIN | TH_RST)) != 0) {
+ INP_INFO_RLOCK(&V_tcbinfo);
+ ti_locked = TI_RLOCKED;
} else
ti_locked = TI_UNLOCKED;
-findpcb:
-#ifdef INVARIANTS
- if (ti_locked == TI_WLOCKED) {
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
- } else {
- INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
- }
-#endif
-
/*
* Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain.
*/
@@ -807,6 +822,14 @@ findpcb:
)
fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
+findpcb:
+#ifdef INVARIANTS
+ if (ti_locked == TI_RLOCKED) {
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+ } else {
+ INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+ }
+#endif
#ifdef INET6
if (isipv6 && fwd_tag != NULL) {
struct sockaddr_in6 *next_hop6;
@@ -831,10 +854,6 @@ findpcb:
th->th_dport, INPLOOKUP_WILDCARD |
INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif);
}
- /* Remove the tag from the packet. We don't need it anymore. */
- m_tag_delete(m, fwd_tag);
- m->m_flags &= ~M_IP6_NEXTHOP;
- fwd_tag = NULL;
} else if (isipv6) {
inp = in6_pcblookup_mbuf(&V_tcbinfo, &ip6->ip6_src,
th->th_sport, &ip6->ip6_dst, th->th_dport,
@@ -869,10 +888,6 @@ findpcb:
th->th_dport, INPLOOKUP_WILDCARD |
INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif);
}
- /* Remove the tag from the packet. We don't need it anymore. */
- m_tag_delete(m, fwd_tag);
- m->m_flags &= ~M_IP_NEXTHOP;
- fwd_tag = NULL;
} else
inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src,
th->th_sport, ip->ip_dst, th->th_dport,
@@ -908,23 +923,20 @@ findpcb:
goto dropwithreset;
}
INP_WLOCK_ASSERT(inp);
- if (!(inp->inp_flags & INP_HW_FLOWID)
- && (m->m_flags & M_FLOWID)
- && ((inp->inp_socket == NULL)
- || !(inp->inp_socket->so_options & SO_ACCEPTCONN))) {
- inp->inp_flags |= INP_HW_FLOWID;
- inp->inp_flags &= ~INP_SW_FLOWID;
+ if ((inp->inp_flowtype == M_HASHTYPE_NONE) &&
+ (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) &&
+ ((inp->inp_socket == NULL) ||
+ (inp->inp_socket->so_options & SO_ACCEPTCONN) == 0)) {
inp->inp_flowid = m->m_pkthdr.flowid;
+ inp->inp_flowtype = M_HASHTYPE_GET(m);
}
#ifdef IPSEC
#ifdef INET6
if (isipv6 && ipsec6_in_reject(m, inp)) {
- IPSEC6STAT_INC(in_polvio);
goto dropunlock;
} else
#endif /* INET6 */
if (ipsec4_in_reject(m, inp) != 0) {
- IPSECSTAT_INC(in_polvio);
goto dropunlock;
}
#endif /* IPSEC */
@@ -934,9 +946,10 @@ findpcb:
*/
if (inp->inp_ip_minttl != 0) {
#ifdef INET6
- if (isipv6 && inp->inp_ip_minttl > ip6->ip6_hlim)
- goto dropunlock;
- else
+ if (isipv6) {
+ if (inp->inp_ip_minttl > ip6->ip6_hlim)
+ goto dropunlock;
+ } else
#endif
if (inp->inp_ip_minttl > ip->ip_ttl)
goto dropunlock;
@@ -945,7 +958,7 @@ findpcb:
/*
* A previous connection in TIMEWAIT state is supposed to catch stray
* or duplicate segments arriving late. If this segment was a
- * legitimate new connection attempt the old INPCB gets removed and
+ * legitimate new connection attempt, the old INPCB gets removed and
* we can try again to find a listening socket.
*
* At this point, due to earlier optimism, we may hold only an inpcb
@@ -961,20 +974,20 @@ findpcb:
relocked:
if (inp->inp_flags & INP_TIMEWAIT) {
if (ti_locked == TI_UNLOCKED) {
- if (INP_INFO_TRY_WLOCK(&V_tcbinfo) == 0) {
+ if (INP_INFO_TRY_RLOCK(&V_tcbinfo) == 0) {
in_pcbref(inp);
INP_WUNLOCK(inp);
- INP_INFO_WLOCK(&V_tcbinfo);
- ti_locked = TI_WLOCKED;
+ INP_INFO_RLOCK(&V_tcbinfo);
+ ti_locked = TI_RLOCKED;
INP_WLOCK(inp);
if (in_pcbrele_wlocked(inp)) {
inp = NULL;
goto findpcb;
}
} else
- ti_locked = TI_WLOCKED;
+ ti_locked = TI_RLOCKED;
}
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
if (thflags & TH_SYN)
tcp_dooptions(&to, optp, optlen, TO_SYN);
@@ -983,8 +996,8 @@ relocked:
*/
if (tcp_twcheck(inp, &to, th, m, tlen))
goto findpcb;
- INP_INFO_WUNLOCK(&V_tcbinfo);
- return;
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ return (IPPROTO_DONE);
}
/*
* The TCPCB may no longer exist if the connection is winding
@@ -1013,16 +1026,18 @@ relocked:
* now be in TIMEWAIT.
*/
#ifdef INVARIANTS
- if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0)
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ if ((thflags & (TH_FIN | TH_RST)) != 0)
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
#endif
- if (tp->t_state != TCPS_ESTABLISHED) {
+ if (!((tp->t_state == TCPS_ESTABLISHED && (thflags & TH_SYN) == 0) ||
+ (tp->t_state == TCPS_LISTEN && (thflags & TH_SYN) &&
+ !(tp->t_flags & TF_FASTOPEN)))) {
if (ti_locked == TI_UNLOCKED) {
- if (INP_INFO_TRY_WLOCK(&V_tcbinfo) == 0) {
+ if (INP_INFO_TRY_RLOCK(&V_tcbinfo) == 0) {
in_pcbref(inp);
INP_WUNLOCK(inp);
- INP_INFO_WLOCK(&V_tcbinfo);
- ti_locked = TI_WLOCKED;
+ INP_INFO_RLOCK(&V_tcbinfo);
+ ti_locked = TI_RLOCKED;
INP_WLOCK(inp);
if (in_pcbrele_wlocked(inp)) {
inp = NULL;
@@ -1030,9 +1045,9 @@ relocked:
}
goto relocked;
} else
- ti_locked = TI_WLOCKED;
+ ti_locked = TI_RLOCKED;
}
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
}
#ifdef MAC
@@ -1057,17 +1072,13 @@ relocked:
/*
* When the socket is accepting connections (the INPCB is in LISTEN
* state) we look into the SYN cache if this is a new connection
- * attempt or the completion of a previous one. Because listen
- * sockets are never in TCPS_ESTABLISHED, the V_tcbinfo lock will be
- * held in this case.
+ * attempt or the completion of a previous one.
*/
if (so->so_options & SO_ACCEPTCONN) {
struct in_conninfo inc;
KASSERT(tp->t_state == TCPS_LISTEN, ("%s: so accepting but "
"tp not listening", __func__));
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
-
bzero(&inc, sizeof(inc));
#ifdef INET6
if (isipv6) {
@@ -1090,6 +1101,8 @@ relocked:
* socket appended to the listen queue in SYN_RECEIVED state.
*/
if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) {
+
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
/*
* Parse the TCP options here because
* syncookies need access to the reflected
@@ -1110,6 +1123,9 @@ relocked:
rstreason = BANDLIM_RST_OPENPORT;
goto dropwithreset;
}
+#ifdef TCP_RFC7413
+new_tfo_socket:
+#endif
if (so == NULL) {
/*
* We completed the 3-way handshake
@@ -1141,7 +1157,11 @@ relocked:
*/
INP_WUNLOCK(inp); /* listen socket */
inp = sotoinpcb(so);
- INP_WLOCK(inp); /* new connection */
+ /*
+ * New connection inpcb is already locked by
+ * syncache_expand().
+ */
+ INP_WLOCK_ASSERT(inp);
tp = intotcpcb(inp);
KASSERT(tp->t_state == TCPS_SYN_RECEIVED,
("%s: ", __func__));
@@ -1170,10 +1190,10 @@ relocked:
* contains. tcp_do_segment() consumes
* the mbuf chain and unlocks the inpcb.
*/
- tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen,
+ tp->t_fb->tfb_tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen,
iptos, ti_locked);
INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
- return;
+ return (IPPROTO_DONE);
}
/*
* Segment flag validation for new connection attempts:
@@ -1277,7 +1297,7 @@ relocked:
if (isipv6 && !V_ip6_use_deprecated) {
struct in6_ifaddr *ia6;
- ia6 = ip6_getdstifaddr(m);
+ ia6 = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */);
if (ia6 != NULL &&
(ia6->ia6_flags & IN6_IFF_DEPRECATED)) {
ifa_free(&ia6->ia_ifa);
@@ -1366,14 +1386,24 @@ relocked:
tcp_trace(TA_INPUT, ostate, tp,
(void *)tcp_saveipgen, &tcp_savetcp, 0);
#endif
+ TCP_PROBE3(debug__input, tp, th, mtod(m, const char *));
tcp_dooptions(&to, optp, optlen, TO_SYN);
- syncache_add(&inc, &to, th, inp, &so, m);
+#ifdef TCP_RFC7413
+ if (syncache_add(&inc, &to, th, inp, &so, m, NULL, NULL))
+ goto new_tfo_socket;
+#else
+ syncache_add(&inc, &to, th, inp, &so, m, NULL, NULL);
+#endif
/*
* Entry added to syncache and mbuf consumed.
- * Everything already unlocked by syncache_add().
+ * Only the listen socket is unlocked by syncache_add().
*/
+ if (ti_locked == TI_RLOCKED) {
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ ti_locked = TI_UNLOCKED;
+ }
INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
- return;
+ return (IPPROTO_DONE);
} else if (tp->t_state == TCPS_LISTEN) {
/*
* When a listen socket is torn down the SO_ACCEPTCONN
@@ -1404,18 +1434,22 @@ relocked:
}
#endif
+ TCP_PROBE5(receive, NULL, tp, mtod(m, const char *), tp, th);
+
/*
* Segment belongs to a connection in SYN_SENT, ESTABLISHED or later
* state. tcp_do_segment() always consumes the mbuf chain, unlocks
* the inpcb, and unlocks pcbinfo.
*/
- tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, ti_locked);
+ tp->t_fb->tfb_tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, ti_locked);
INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
- return;
+ return (IPPROTO_DONE);
dropwithreset:
- if (ti_locked == TI_WLOCKED) {
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ TCP_PROBE5(receive, NULL, tp, mtod(m, const char *), tp, th);
+
+ if (ti_locked == TI_RLOCKED) {
+ INP_INFO_RUNLOCK(&V_tcbinfo);
ti_locked = TI_UNLOCKED;
}
#ifdef INVARIANTS
@@ -1435,8 +1469,11 @@ dropwithreset:
goto drop;
dropunlock:
- if (ti_locked == TI_WLOCKED) {
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ if (m != NULL)
+ TCP_PROBE5(receive, NULL, tp, mtod(m, const char *), tp, th);
+
+ if (ti_locked == TI_RLOCKED) {
+ INP_INFO_RUNLOCK(&V_tcbinfo);
ti_locked = TI_UNLOCKED;
}
#ifdef INVARIANTS
@@ -1456,18 +1493,23 @@ drop:
free(s, M_TCPLOG);
if (m != NULL)
m_freem(m);
+ return (IPPROTO_DONE);
}
-static void
+void
tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos,
int ti_locked)
{
- int thflags, acked, ourfinisacked, needoutput = 0;
+ int thflags, acked, ourfinisacked, needoutput = 0, sack_changed;
int rstreason, todrop, win;
u_long tiwin;
+ char *s;
+ struct in_conninfo *inc;
+ struct mbuf *mfree;
struct tcpopt to;
-
+ int tfo_syn;
+
#ifdef TCPDEBUG
/*
* The size of tcp_saveipgen must be the size of the max ip header,
@@ -1478,30 +1520,25 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
short ostate = 0;
#endif
thflags = th->th_flags;
+ inc = &tp->t_inpcb->inp_inc;
tp->sackhint.last_sack_ack = 0;
+ sack_changed = 0;
/*
* If this is either a state-changing packet or current state isn't
* established, we require a write lock on tcbinfo. Otherwise, we
- * allow either a read lock or a write lock, as we may have acquired
- * a write lock due to a race.
- *
- * Require a global write lock for SYN/FIN/RST segments or
- * non-established connections; otherwise accept either a read or
- * write lock, as we may have conservatively acquired a write lock in
- * certain cases in tcp_input() (is this still true?). Currently we
- * will never enter with no lock, so we try to drop it quickly in the
- * common pure ack/pure data cases.
+ * allow the tcbinfo to be either locked or unlocked, as the
+ * caller may have unnecessarily acquired a write lock due to a race.
*/
if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
tp->t_state != TCPS_ESTABLISHED) {
- KASSERT(ti_locked == TI_WLOCKED, ("%s ti_locked %d for "
+ KASSERT(ti_locked == TI_RLOCKED, ("%s ti_locked %d for "
"SYN/FIN/RST/!EST", __func__, ti_locked));
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
} else {
#ifdef INVARIANTS
- if (ti_locked == TI_WLOCKED)
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ if (ti_locked == TI_RLOCKED)
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
else {
KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST "
"ti_locked: %d", __func__, ti_locked));
@@ -1515,6 +1552,11 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
__func__));
+#ifdef TCPPCAP
+ /* Save segment, if requested. */
+ tcp_pcap_add(th, m, &(tp->t_inpkts));
+#endif
+
/*
* Segment received on connection.
* Reset idle time and keep-alive timer.
@@ -1526,7 +1568,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));
/*
- * Unscale the window into a 32-bit value.
+ * Scale up the window into a 32-bit value.
* For the SYN_SENT state the scale is zero.
*/
tiwin = th->th_win << tp->snd_scale;
@@ -1549,6 +1591,10 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
TCPSTAT_INC(tcps_ecn_ect1);
break;
}
+
+ /* Process a packet differently from RFC3168. */
+ cc_ecnpkt_handler(tp, th, iptos);
+
/* Congestion experienced. */
if (thflags & TH_ECE) {
cc_cong_signal(tp, th, CC_ECN);
@@ -1573,6 +1619,24 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks()))
to.to_tsecr = 0;
}
+ /*
+ * If timestamps were negotiated during SYN/ACK they should
+ * appear on every segment during this session and vice versa.
+ */
+ if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) {
+ if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
+ log(LOG_DEBUG, "%s; %s: Timestamp missing, "
+ "no action\n", s, __func__);
+ free(s, M_TCPLOG);
+ }
+ }
+ if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) {
+ if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
+ log(LOG_DEBUG, "%s; %s: Timestamp not expected, "
+ "no action\n", s, __func__);
+ free(s, M_TCPLOG);
+ }
+ }
/*
* Process options only when we get SYN/ACK back. The SYN case
@@ -1652,8 +1716,8 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
/*
* This is a pure ack for outstanding data.
*/
- if (ti_locked == TI_WLOCKED)
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ if (ti_locked == TI_RLOCKED)
+ INP_INFO_RUNLOCK(&V_tcbinfo);
ti_locked = TI_UNLOCKED;
TCPSTAT_INC(tcps_predack);
@@ -1720,7 +1784,6 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
tp->snd_wl2 = th->th_ack;
tp->t_dupacks = 0;
m_freem(m);
- ND6_HINT(tp); /* Some progress has been made. */
/*
* If all outstanding data are acked, stop
@@ -1737,14 +1800,16 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
(void *)tcp_saveipgen,
&tcp_savetcp, 0);
#endif
+ TCP_PROBE3(debug__input, tp, th,
+ mtod(m, const char *));
if (tp->snd_una == tp->snd_max)
tcp_timer_activate(tp, TT_REXMT, 0);
else if (!tcp_timer_active(tp, TT_PERSIST))
tcp_timer_activate(tp, TT_REXMT,
tp->t_rxtcur);
sowwakeup(so);
- if (so->so_snd.sb_cc)
- (void) tcp_output(tp);
+ if (sbavail(&so->so_snd))
+ (void) tp->t_fb->tfb_tcp_output(tp);
goto check_delack;
}
} else if (th->th_ack == tp->snd_una &&
@@ -1756,8 +1821,8 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
* nothing on the reassembly queue and we have enough
* buffer space to take it.
*/
- if (ti_locked == TI_WLOCKED)
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ if (ti_locked == TI_RLOCKED)
+ INP_INFO_RUNLOCK(&V_tcbinfo);
ti_locked = TI_UNLOCKED;
/* Clean receiver SACK report if present */
@@ -1777,12 +1842,13 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
tp->rcv_up = tp->rcv_nxt;
TCPSTAT_INC(tcps_rcvpack);
TCPSTAT_ADD(tcps_rcvbyte, tlen);
- ND6_HINT(tp); /* Some progress has been made */
#ifdef TCPDEBUG
if (so->so_options & SO_DEBUG)
tcp_trace(TA_INPUT, ostate, tp,
(void *)tcp_saveipgen, &tcp_savetcp, 0);
#endif
+ TCP_PROBE3(debug__input, tp, th, mtod(m, const char *));
+
/*
* Automatic sizing of receive socket buffer. Often the send
* buffer size is not optimally adjusted to the actual network
@@ -1802,11 +1868,13 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
* reassembly queue.
*
* The criteria to step up the receive buffer one notch are:
- * 1. the number of bytes received during the time it takes
+ * 1. Application has not set receive buffer size with
+ * SO_RCVBUF. Setting SO_RCVBUF clears SB_AUTOSIZE.
+ * 2. the number of bytes received during the time it takes
* one timestamp to be reflected back to us (the RTT);
- * 2. received bytes per RTT is within seven eighth of the
+ * 3. received bytes per RTT is within seven eighths of the
* current socket buffer size;
- * 3. receive buffer size has not hit maximal automatic size;
+ * 4. receive buffer size has not hit maximal automatic size;
*
* This algorithm does one step per RTT at most and only if
* we receive a bulk stream w/o packet losses or reorderings.
@@ -1817,6 +1885,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
* the buffer to better manage the socket buffer resources.
*/
if (V_tcp_do_autorcvbuf &&
+ (to.to_flags & TOF_TS) &&
to.to_tsecr &&
(so->so_rcv.sb_flags & SB_AUTOSIZE)) {
if (TSTMP_GT(to.to_tsecr, tp->rfbuf_ts) &&
@@ -1851,7 +1920,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
newsize, so, NULL))
so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
m_adj(m, drop_hdrlen); /* delayed header drop */
- sbappendstream_locked(&so->so_rcv, m);
+ sbappendstream_locked(&so->so_rcv, m, 0);
}
/* NB: sorwakeup_locked() does an implicit unlock. */
sorwakeup_locked(so);
@@ -1859,7 +1928,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
tp->t_flags |= TF_DELACK;
} else {
tp->t_flags |= TF_ACKNOW;
- tcp_output(tp);
+ tp->t_fb->tfb_tcp_output(tp);
}
goto check_delack;
}
@@ -1893,6 +1962,28 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
rstreason = BANDLIM_RST_OPENPORT;
goto dropwithreset;
}
+#ifdef TCP_RFC7413
+ if (tp->t_flags & TF_FASTOPEN) {
+ /*
+ * When a TFO connection is in SYN_RECEIVED, the
+ * only valid packets are the initial SYN, a
+ * retransmit/copy of the initial SYN (possibly with
+ * a subset of the original data), a valid ACK, a
+ * FIN, or a RST.
+ */
+ if ((thflags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)) {
+ rstreason = BANDLIM_RST_OPENPORT;
+ goto dropwithreset;
+ } else if (thflags & TH_SYN) {
+ /* non-initial SYN is ignored */
+ if ((tcp_timer_active(tp, TT_DELACK) ||
+ tcp_timer_active(tp, TT_REXMT)))
+ goto drop;
+ } else if (!(thflags & (TH_ACK|TH_FIN|TH_RST))) {
+ goto drop;
+ }
+ }
+#endif
break;
/*
@@ -1916,8 +2007,11 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
rstreason = BANDLIM_UNLIMITED;
goto dropwithreset;
}
- if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST))
+ if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) {
+ TCP_PROBE5(connect__refused, NULL, tp,
+ mtod(m, const char *), tp, th);
tp = tcp_drop(tp, ECONNREFUSED);
+ }
if (thflags & TH_RST)
goto drop;
if (!(thflags & TH_SYN))
@@ -1962,11 +2056,13 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
*/
tp->t_starttime = ticks;
if (tp->t_flags & TF_NEEDFIN) {
- tp->t_state = TCPS_FIN_WAIT_1;
+ tcp_state_change(tp, TCPS_FIN_WAIT_1);
tp->t_flags &= ~TF_NEEDFIN;
thflags &= ~TH_SYN;
} else {
- tp->t_state = TCPS_ESTABLISHED;
+ tcp_state_change(tp, TCPS_ESTABLISHED);
+ TCP_PROBE5(connect__established, NULL, tp,
+ mtod(m, const char *), tp, th);
cc_conn_init(tp);
tcp_timer_activate(tp, TT_KEEP,
TP_KEEPIDLE(tp));
@@ -1974,22 +2070,20 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
} else {
/*
* Received initial SYN in SYN-SENT[*] state =>
- * simultaneous open. If segment contains CC option
- * and there is a cached CC, apply TAO test.
+ * simultaneous open.
* If it succeeds, connection is half-synchronized.
* Otherwise, do 3-way handshake:
* SYN-SENT -> SYN-RECEIVED
* SYN-SENT* -> SYN-RECEIVED*
- * If there was no CC option, clear cached CC value.
*/
tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
tcp_timer_activate(tp, TT_REXMT, 0);
- tp->t_state = TCPS_SYN_RECEIVED;
+ tcp_state_change(tp, TCPS_SYN_RECEIVED);
}
- KASSERT(ti_locked == TI_WLOCKED, ("%s: trimthenstep6: "
+ KASSERT(ti_locked == TI_RLOCKED, ("%s: trimthenstep6: "
"ti_locked %d", __func__, ti_locked));
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(tp->t_inpcb);
/*
@@ -2045,98 +2139,84 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
* Then check that at least some bytes of segment are within
* receive window. If segment begins before rcv_nxt,
* drop leading data (and SYN); if nothing left, just ack.
- *
- *
- * If the RST bit is set, check the sequence number to see
- * if this is a valid reset segment.
- * RFC 793 page 37:
- * In all states except SYN-SENT, all reset (RST) segments
- * are validated by checking their SEQ-fields. A reset is
- * valid if its sequence number is in the window.
- * Note: this does not take into account delayed ACKs, so
- * we should test against last_ack_sent instead of rcv_nxt.
- * The sequence number in the reset segment is normally an
- * echo of our outgoing acknowlegement numbers, but some hosts
- * send a reset with the sequence number at the rightmost edge
- * of our receive window, and we have to handle this case.
- * Note 2: Paul Watson's paper "Slipping in the Window" has shown
- * that brute force RST attacks are possible. To combat this,
- * we use a much stricter check while in the ESTABLISHED state,
- * only accepting RSTs where the sequence number is equal to
- * last_ack_sent. In all other states (the states in which a
- * RST is more likely), the more permissive check is used.
- * If we have multiple segments in flight, the initial reset
- * segment sequence numbers will be to the left of last_ack_sent,
- * but they will eventually catch up.
- * In any case, it never made sense to trim reset segments to
- * fit the receive window since RFC 1122 says:
- * 4.2.2.12 RST Segment: RFC-793 Section 3.4
- *
- * A TCP SHOULD allow a received RST segment to include data.
- *
- * DISCUSSION
- * It has been suggested that a RST segment could contain
- * ASCII text that encoded and explained the cause of the
- * RST. No standard has yet been established for such
- * data.
- *
- * If the reset segment passes the sequence number test examine
- * the state:
- * SYN_RECEIVED STATE:
- * If passive open, return to LISTEN state.
- * If active open, inform user that connection was refused.
- * ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES:
- * Inform user that connection was reset, and close tcb.
- * CLOSING, LAST_ACK STATES:
- * Close the tcb.
- * TIME_WAIT STATE:
- * Drop the segment - see Stevens, vol. 2, p. 964 and
- * RFC 1337.
*/
if (thflags & TH_RST) {
- if (SEQ_GEQ(th->th_seq, tp->last_ack_sent - 1) &&
- SEQ_LEQ(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
- switch (tp->t_state) {
-
- case TCPS_SYN_RECEIVED:
- so->so_error = ECONNREFUSED;
- goto close;
-
- case TCPS_ESTABLISHED:
- if (V_tcp_insecure_rst == 0 &&
- !(SEQ_GEQ(th->th_seq, tp->rcv_nxt - 1) &&
- SEQ_LEQ(th->th_seq, tp->rcv_nxt + 1)) &&
- !(SEQ_GEQ(th->th_seq, tp->last_ack_sent - 1) &&
- SEQ_LEQ(th->th_seq, tp->last_ack_sent + 1))) {
- TCPSTAT_INC(tcps_badrst);
- goto drop;
- }
- /* FALLTHROUGH */
- case TCPS_FIN_WAIT_1:
- case TCPS_FIN_WAIT_2:
- case TCPS_CLOSE_WAIT:
- so->so_error = ECONNRESET;
- close:
- KASSERT(ti_locked == TI_WLOCKED,
- ("tcp_do_segment: TH_RST 1 ti_locked %d",
- ti_locked));
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
-
- tp->t_state = TCPS_CLOSED;
+ /*
+ * RFC5961 Section 3.2
+ *
+ * - RST drops connection only if SEG.SEQ == RCV.NXT.
+ * - If RST is in window, we send challenge ACK.
+ *
+ * Note: to take into account delayed ACKs, we should
+ * test against last_ack_sent instead of rcv_nxt.
+ * Note 2: we handle special case of closed window, not
+ * covered by the RFC.
+ */
+ if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
+ SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
+ (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) {
+
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+ KASSERT(ti_locked == TI_RLOCKED,
+ ("%s: TH_RST ti_locked %d, th %p tp %p",
+ __func__, ti_locked, th, tp));
+ KASSERT(tp->t_state != TCPS_SYN_SENT,
+ ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p",
+ __func__, th, tp));
+
+ if (V_tcp_insecure_rst ||
+ tp->last_ack_sent == th->th_seq) {
TCPSTAT_INC(tcps_drops);
- tp = tcp_close(tp);
- break;
+ /* Drop the connection. */
+ switch (tp->t_state) {
+ case TCPS_SYN_RECEIVED:
+ so->so_error = ECONNREFUSED;
+ goto close;
+ case TCPS_ESTABLISHED:
+ case TCPS_FIN_WAIT_1:
+ case TCPS_FIN_WAIT_2:
+ case TCPS_CLOSE_WAIT:
+ so->so_error = ECONNRESET;
+ close:
+ tcp_state_change(tp, TCPS_CLOSED);
+ /* FALLTHROUGH */
+ default:
+ tp = tcp_close(tp);
+ }
+ } else {
+ TCPSTAT_INC(tcps_badrst);
+ /* Send challenge ACK. */
+ tcp_respond(tp, mtod(m, void *), th, m,
+ tp->rcv_nxt, tp->snd_nxt, TH_ACK);
+ tp->last_ack_sent = tp->rcv_nxt;
+ m = NULL;
+ }
+ }
+ goto drop;
+ }
- case TCPS_CLOSING:
- case TCPS_LAST_ACK:
- KASSERT(ti_locked == TI_WLOCKED,
- ("tcp_do_segment: TH_RST 2 ti_locked %d",
- ti_locked));
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ /*
+ * RFC5961 Section 4.2
+ * Send challenge ACK for any SYN in synchronized state.
+ */
+ if ((thflags & TH_SYN) && tp->t_state != TCPS_SYN_SENT &&
+ tp->t_state != TCPS_SYN_RECEIVED) {
+ KASSERT(ti_locked == TI_RLOCKED,
+ ("tcp_do_segment: TH_SYN ti_locked %d", ti_locked));
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
- tp = tcp_close(tp);
- break;
- }
+ TCPSTAT_INC(tcps_badsyn);
+ if (V_tcp_insecure_syn &&
+ SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
+ SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
+ tp = tcp_drop(tp, ECONNRESET);
+ rstreason = BANDLIM_UNLIMITED;
+ } else {
+ /* Send challenge ACK. */
+ tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt,
+ tp->snd_nxt, TH_ACK);
+ tp->last_ack_sent = tp->rcv_nxt;
+ m = NULL;
}
goto drop;
}
@@ -2236,15 +2316,14 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
*/
if ((so->so_state & SS_NOFDREF) &&
tp->t_state > TCPS_CLOSE_WAIT && tlen) {
- char *s;
-
- KASSERT(ti_locked == TI_WLOCKED, ("%s: SS_NOFDEREF && "
+ KASSERT(ti_locked == TI_RLOCKED, ("%s: SS_NOFDEREF && "
"CLOSE_WAIT && tlen ti_locked %d", __func__, ti_locked));
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
- if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL, NULL))) {
- log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data after socket "
- "was closed, sending RST and removing tcpcb\n",
+ if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
+ log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data "
+ "after socket was closed, "
+ "sending RST and removing tcpcb\n",
s, __func__, tcpstates[tp->t_state], tlen);
free(s, M_TCPLOG);
}
@@ -2309,29 +2388,22 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
}
/*
- * If a SYN is in the window, then this is an
- * error and we send an RST and drop the connection.
- */
- if (thflags & TH_SYN) {
- KASSERT(ti_locked == TI_WLOCKED,
- ("tcp_do_segment: TH_SYN ti_locked %d", ti_locked));
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
-
- tp = tcp_drop(tp, ECONNRESET);
- rstreason = BANDLIM_UNLIMITED;
- goto drop;
- }
-
- /*
* If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN
* flag is on (half-synchronized state), then queue data for
* later processing; else drop segment and return.
*/
if ((thflags & TH_ACK) == 0) {
if (tp->t_state == TCPS_SYN_RECEIVED ||
- (tp->t_flags & TF_NEEDSYN))
+ (tp->t_flags & TF_NEEDSYN)) {
+#ifdef TCP_RFC7413
+ if (tp->t_state == TCPS_SYN_RECEIVED &&
+ tp->t_flags & TF_FASTOPEN) {
+ tp->snd_wnd = tiwin;
+ cc_conn_init(tp);
+ }
+#endif
goto step6;
- else if (tp->t_flags & TF_ACKNOW)
+ } else if (tp->t_flags & TF_ACKNOW)
goto dropafterack;
else
goto drop;
@@ -2364,11 +2436,33 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
*/
tp->t_starttime = ticks;
if (tp->t_flags & TF_NEEDFIN) {
- tp->t_state = TCPS_FIN_WAIT_1;
+ tcp_state_change(tp, TCPS_FIN_WAIT_1);
tp->t_flags &= ~TF_NEEDFIN;
} else {
- tp->t_state = TCPS_ESTABLISHED;
- cc_conn_init(tp);
+ tcp_state_change(tp, TCPS_ESTABLISHED);
+ TCP_PROBE5(accept__established, NULL, tp,
+ mtod(m, const char *), tp, th);
+#ifdef TCP_RFC7413
+ if (tp->t_tfo_pending) {
+ tcp_fastopen_decrement_counter(tp->t_tfo_pending);
+ tp->t_tfo_pending = NULL;
+
+ /*
+ * Account for the ACK of our SYN prior to
+ * regular ACK processing below.
+ */
+ tp->snd_una++;
+ }
+ /*
+ * TFO connections call cc_conn_init() during SYN
+ * processing. Calling it again here for such
+ * connections is not harmless as it would undo the
+ * snd_cwnd reduction that occurs when a TFO SYN|ACK
+ * is retransmitted.
+ */
+ if (!(tp->t_flags & TF_FASTOPEN))
+#endif
+ cc_conn_init(tp);
tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));
}
/*
@@ -2402,21 +2496,45 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
if ((tp->t_flags & TF_SACK_PERMIT) &&
((to.to_flags & TOF_SACK) ||
!TAILQ_EMPTY(&tp->snd_holes)))
- tcp_sack_doack(tp, &to, th->th_ack);
+ sack_changed = tcp_sack_doack(tp, &to, th->th_ack);
+ else
+ /*
+ * Reset the value so that previous (valid) value
+ * from the last ack with SACK doesn't get used.
+ */
+ tp->sackhint.sacked_bytes = 0;
/* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
hhook_run_tcp_est_in(tp, th, &to);
if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
- if (tlen == 0 && tiwin == tp->snd_wnd) {
+ u_int maxseg;
+
+ maxseg = tcp_maxseg(tp);
+ if (tlen == 0 &&
+ (tiwin == tp->snd_wnd ||
+ (tp->t_flags & TF_SACK_PERMIT))) {
+ /*
+ * If this is the first time we've seen a
+ * FIN from the remote, this is not a
+ * duplicate and it needs to be processed
+ * normally. This happens during a
+ * simultaneous close.
+ */
+ if ((thflags & TH_FIN) &&
+ (TCPS_HAVERCVDFIN(tp->t_state) == 0)) {
+ tp->t_dupacks = 0;
+ break;
+ }
TCPSTAT_INC(tcps_rcvdupack);
/*
* If we have outstanding data (other than
* a window probe), this is a completely
* duplicate ack (ie, window info didn't
- * change), the ack is the biggest we've
+ * change and FIN isn't set),
+ * the ack is the biggest we've
* seen and we've seen exactly our rexmt
- * threshhold of them, assume a packet
+ * threshold of them, assume a packet
* has been dropped and retransmit it.
* Kludge snd_nxt & the congestion
* window so we send only this one
@@ -2437,8 +2555,20 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
* When using TCP ECN, notify the peer that
* we reduced the cwnd.
*/
- if (!tcp_timer_active(tp, TT_REXMT) ||
- th->th_ack != tp->snd_una)
+ /*
+ * Following 2 kinds of acks should not affect
+ * dupack counting:
+ * 1) Old acks
+ * 2) Acks with SACK but without any new SACK
+ * information in them. These could result from
+ * any anomaly in the network like a switch
+ * duplicating packets or a possible DoS attack.
+ */
+ if (th->th_ack != tp->snd_una ||
+ ((tp->t_flags & TF_SACK_PERMIT) &&
+ !sack_changed))
+ break;
+ else if (!tcp_timer_active(tp, TT_REXMT))
tp->t_dupacks = 0;
else if (++tp->t_dupacks > tcprexmtthresh ||
IN_FASTRECOVERY(tp->t_flags)) {
@@ -2453,26 +2583,20 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
* we have less than 1/2 the original window's
* worth of data in flight.
*/
- awnd = (tp->snd_nxt - tp->snd_fack) +
- tp->sackhint.sack_bytes_rexmit;
+ if (V_tcp_do_rfc6675_pipe)
+ awnd = tcp_compute_pipe(tp);
+ else
+ awnd = (tp->snd_nxt - tp->snd_fack) +
+ tp->sackhint.sack_bytes_rexmit;
+
if (awnd < tp->snd_ssthresh) {
- tp->snd_cwnd += tp->t_maxseg;
+ tp->snd_cwnd += maxseg;
if (tp->snd_cwnd > tp->snd_ssthresh)
tp->snd_cwnd = tp->snd_ssthresh;
}
} else
- tp->snd_cwnd += tp->t_maxseg;
- if ((thflags & TH_FIN) &&
- (TCPS_HAVERCVDFIN(tp->t_state) == 0)) {
- /*
- * If its a fin we need to process
- * it to avoid a race where both
- * sides enter FIN-WAIT and send FIN|ACK
- * at the same time.
- */
- break;
- }
- (void) tcp_output(tp);
+ tp->snd_cwnd += maxseg;
+ (void) tp->t_fb->tfb_tcp_output(tp);
goto drop;
} else if (tp->t_dupacks == tcprexmtthresh) {
tcp_seq onxt = tp->snd_nxt;
@@ -2505,33 +2629,33 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
TCPSTAT_INC(
tcps_sack_recovery_episode);
tp->sack_newdata = tp->snd_nxt;
- tp->snd_cwnd = tp->t_maxseg;
- (void) tcp_output(tp);
+ tp->snd_cwnd = maxseg;
+ (void) tp->t_fb->tfb_tcp_output(tp);
goto drop;
}
tp->snd_nxt = th->th_ack;
- tp->snd_cwnd = tp->t_maxseg;
- if ((thflags & TH_FIN) &&
- (TCPS_HAVERCVDFIN(tp->t_state) == 0)) {
- /*
- * If its a fin we need to process
- * it to avoid a race where both
- * sides enter FIN-WAIT and send FIN|ACK
- * at the same time.
- */
- break;
- }
- (void) tcp_output(tp);
+ tp->snd_cwnd = maxseg;
+ (void) tp->t_fb->tfb_tcp_output(tp);
KASSERT(tp->snd_limited <= 2,
("%s: tp->snd_limited too big",
__func__));
tp->snd_cwnd = tp->snd_ssthresh +
- tp->t_maxseg *
+ maxseg *
(tp->t_dupacks - tp->snd_limited);
if (SEQ_GT(onxt, tp->snd_nxt))
tp->snd_nxt = onxt;
goto drop;
} else if (V_tcp_do_rfc3042) {
+ /*
+ * Process first and second duplicate
+ * ACKs. Each indicates a segment
+ * leaving the network, creating room
+ * for more. Make sure we can send a
+ * packet on reception of each duplicate
+ * ACK by increasing snd_cwnd by one
+ * segment. Restore the original
+ * snd_cwnd after packet transmission.
+ */
cc_ack_received(tp, th, CC_DUPACK);
u_long oldcwnd = tp->snd_cwnd;
tcp_seq oldsndmax = tp->snd_max;
@@ -2547,33 +2671,23 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
tp->snd_cwnd =
(tp->snd_nxt - tp->snd_una) +
(tp->t_dupacks - tp->snd_limited) *
- tp->t_maxseg;
- if ((thflags & TH_FIN) &&
- (TCPS_HAVERCVDFIN(tp->t_state) == 0)) {
- /*
- * If its a fin we need to process
- * it to avoid a race where both
- * sides enter FIN-WAIT and send FIN|ACK
- * at the same time.
- */
- break;
- }
+ maxseg;
/*
* Only call tcp_output when there
* is new data available to be sent.
* Otherwise we would send pure ACKs.
*/
SOCKBUF_LOCK(&so->so_snd);
- avail = so->so_snd.sb_cc -
+ avail = sbavail(&so->so_snd) -
(tp->snd_nxt - tp->snd_una);
SOCKBUF_UNLOCK(&so->so_snd);
if (avail > 0)
- (void) tcp_output(tp);
+ (void) tp->t_fb->tfb_tcp_output(tp);
sent = tp->snd_max - oldsndmax;
- if (sent > tp->t_maxseg) {
+ if (sent > maxseg) {
KASSERT((tp->t_dupacks == 2 &&
tp->snd_limited == 0) ||
- (sent == tp->t_maxseg + 1 &&
+ (sent == maxseg + 1 &&
tp->t_flags & TF_SENTFIN),
("%s: sent too much",
__func__));
@@ -2583,9 +2697,20 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
tp->snd_cwnd = oldcwnd;
goto drop;
}
- } else
- tp->t_dupacks = 0;
+ }
break;
+ } else {
+ /*
+ * This ack is advancing the left edge, reset the
+ * counter.
+ */
+ tp->t_dupacks = 0;
+ /*
+ * If this ack also has new SACK info, increment the
+ * counter as per rfc6675.
+ */
+ if ((tp->t_flags & TF_SACK_PERMIT) && sack_changed)
+ tp->t_dupacks++;
}
KASSERT(SEQ_GT(th->th_ack, tp->snd_una),
@@ -2604,7 +2729,6 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
} else
cc_post_recovery(tp, th);
}
- tp->t_dupacks = 0;
/*
* If we reach this point, ACK is not a duplicate,
* i.e., it ACKs something we sent.
@@ -2631,6 +2755,9 @@ process_ACK:
INP_WLOCK_ASSERT(tp->t_inpcb);
acked = BYTES_THIS_ACK(tp, th);
+ KASSERT(acked >= 0, ("%s: acked unexepectedly negative "
+ "(tp->snd_una=%u, th->th_ack=%u, tp=%p, m=%p)", __func__,
+ tp->snd_una, th->th_ack, tp, m));
TCPSTAT_INC(tcps_rcvackpack);
TCPSTAT_ADD(tcps_rcvackbyte, acked);
@@ -2699,17 +2826,25 @@ process_ACK:
cc_ack_received(tp, th, CC_ACK);
SOCKBUF_LOCK(&so->so_snd);
- if (acked > so->so_snd.sb_cc) {
- tp->snd_wnd -= so->so_snd.sb_cc;
- sbdrop_locked(&so->so_snd, (int)so->so_snd.sb_cc);
+ if (acked > sbavail(&so->so_snd)) {
+ if (tp->snd_wnd >= sbavail(&so->so_snd))
+ tp->snd_wnd -= sbavail(&so->so_snd);
+ else
+ tp->snd_wnd = 0;
+ mfree = sbcut_locked(&so->so_snd,
+ (int)sbavail(&so->so_snd));
ourfinisacked = 1;
} else {
- sbdrop_locked(&so->so_snd, acked);
- tp->snd_wnd -= acked;
+ mfree = sbcut_locked(&so->so_snd, acked);
+ if (tp->snd_wnd >= (u_long) acked)
+ tp->snd_wnd -= acked;
+ else
+ tp->snd_wnd = 0;
ourfinisacked = 0;
}
/* NB: sowwakeup_locked() does an implicit unlock. */
sowwakeup_locked(so);
+ m_freem(mfree);
/* Detect una wraparound. */
if (!IN_RECOVERY(tp->t_flags) &&
SEQ_GT(tp->snd_una, tp->snd_recover) &&
@@ -2755,7 +2890,7 @@ process_ACK:
tcp_finwait2_timeout :
TP_MAXIDLE(tp)));
}
- tp->t_state = TCPS_FIN_WAIT_2;
+ tcp_state_change(tp, TCPS_FIN_WAIT_2);
}
break;
@@ -2767,9 +2902,9 @@ process_ACK:
*/
case TCPS_CLOSING:
if (ourfinisacked) {
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
tcp_twstart(tp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
m_freem(m);
return;
}
@@ -2783,7 +2918,7 @@ process_ACK:
*/
case TCPS_LAST_ACK:
if (ourfinisacked) {
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
tp = tcp_close(tp);
goto drop;
}
@@ -2826,7 +2961,7 @@ step6:
* actually wanting to send this much urgent data.
*/
SOCKBUF_LOCK(&so->so_rcv);
- if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
+ if (th->th_urp + sbavail(&so->so_rcv) > sb_max) {
th->th_urp = 0; /* XXX */
thflags &= ~TH_URG; /* XXX */
SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */
@@ -2848,7 +2983,7 @@ step6:
*/
if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
tp->rcv_up = th->th_seq + th->th_urp;
- so->so_oobmark = so->so_rcv.sb_cc +
+ so->so_oobmark = sbavail(&so->so_rcv) +
(tp->rcv_up - tp->rcv_nxt) - 1;
if (so->so_oobmark == 0)
so->so_rcv.sb_state |= SBS_RCVATMARK;
@@ -2887,7 +3022,9 @@ dodata: /* XXX */
* case PRU_RCVD). If a FIN has already been received on this
* connection then we just ignore the text.
*/
- if ((tlen || (thflags & TH_FIN)) &&
+ tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) &&
+ (tp->t_flags & TF_FASTOPEN));
+ if ((tlen || (thflags & TH_FIN) || tfo_syn) &&
TCPS_HAVERCVDFIN(tp->t_state) == 0) {
tcp_seq save_start = th->th_seq;
m_adj(m, drop_hdrlen); /* delayed header drop */
@@ -2905,8 +3042,9 @@ dodata: /* XXX */
*/
if (th->th_seq == tp->rcv_nxt &&
LIST_EMPTY(&tp->t_segq) &&
- TCPS_HAVEESTABLISHED(tp->t_state)) {
- if (DELAY_ACK(tp, tlen))
+ (TCPS_HAVEESTABLISHED(tp->t_state) ||
+ tfo_syn)) {
+ if (DELAY_ACK(tp, tlen) || tfo_syn)
tp->t_flags |= TF_DELACK;
else
tp->t_flags |= TF_ACKNOW;
@@ -2914,12 +3052,11 @@ dodata: /* XXX */
thflags = th->th_flags & TH_FIN;
TCPSTAT_INC(tcps_rcvpack);
TCPSTAT_ADD(tcps_rcvbyte, tlen);
- ND6_HINT(tp);
SOCKBUF_LOCK(&so->so_rcv);
if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
m_freem(m);
else
- sbappendstream_locked(&so->so_rcv, m);
+ sbappendstream_locked(&so->so_rcv, m, 0);
/* NB: sorwakeup_locked() does an implicit unlock. */
sorwakeup_locked(so);
} else {
@@ -2981,7 +3118,7 @@ dodata: /* XXX */
tp->t_starttime = ticks;
/* FALLTHROUGH */
case TCPS_ESTABLISHED:
- tp->t_state = TCPS_CLOSE_WAIT;
+ tcp_state_change(tp, TCPS_CLOSE_WAIT);
break;
/*
@@ -2989,7 +3126,7 @@ dodata: /* XXX */
* enter the CLOSING state.
*/
case TCPS_FIN_WAIT_1:
- tp->t_state = TCPS_CLOSING;
+ tcp_state_change(tp, TCPS_CLOSING);
break;
/*
@@ -2998,18 +3135,18 @@ dodata: /* XXX */
* standard timers.
*/
case TCPS_FIN_WAIT_2:
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
- KASSERT(ti_locked == TI_WLOCKED, ("%s: dodata "
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+ KASSERT(ti_locked == TI_RLOCKED, ("%s: dodata "
"TCP_FIN_WAIT_2 ti_locked: %d", __func__,
ti_locked));
tcp_twstart(tp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
return;
}
}
- if (ti_locked == TI_WLOCKED)
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ if (ti_locked == TI_RLOCKED)
+ INP_INFO_RUNLOCK(&V_tcbinfo);
ti_locked = TI_UNLOCKED;
#ifdef TCPDEBUG
@@ -3017,12 +3154,13 @@ dodata: /* XXX */
tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen,
&tcp_savetcp, 0);
#endif
+ TCP_PROBE3(debug__input, tp, th, mtod(m, const char *));
/*
* Return any desired output.
*/
if (needoutput || (tp->t_flags & TF_ACKNOW))
- (void) tcp_output(tp);
+ (void) tp->t_fb->tfb_tcp_output(tp);
check_delack:
KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
@@ -3064,19 +3202,20 @@ dropafterack:
tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
&tcp_savetcp, 0);
#endif
- if (ti_locked == TI_WLOCKED)
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ TCP_PROBE3(debug__input, tp, th, mtod(m, const char *));
+ if (ti_locked == TI_RLOCKED)
+ INP_INFO_RUNLOCK(&V_tcbinfo);
ti_locked = TI_UNLOCKED;
tp->t_flags |= TF_ACKNOW;
- (void) tcp_output(tp);
+ (void) tp->t_fb->tfb_tcp_output(tp);
INP_WUNLOCK(tp->t_inpcb);
m_freem(m);
return;
dropwithreset:
- if (ti_locked == TI_WLOCKED)
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ if (ti_locked == TI_RLOCKED)
+ INP_INFO_RUNLOCK(&V_tcbinfo);
ti_locked = TI_UNLOCKED;
if (tp != NULL) {
@@ -3087,8 +3226,8 @@ dropwithreset:
return;
drop:
- if (ti_locked == TI_WLOCKED) {
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ if (ti_locked == TI_RLOCKED) {
+ INP_INFO_RUNLOCK(&V_tcbinfo);
ti_locked = TI_UNLOCKED;
}
#ifdef INVARIANTS
@@ -3104,6 +3243,7 @@ drop:
tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
&tcp_savetcp, 0);
#endif
+ TCP_PROBE3(debug__input, tp, th, mtod(m, const char *));
if (tp != NULL)
INP_WUNLOCK(tp->t_inpcb);
m_freem(m);
@@ -3114,7 +3254,7 @@ drop:
* The mbuf must still include the original packet header.
* tp may be NULL.
*/
-static void
+void
tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp,
int tlen, int rstreason)
{
@@ -3177,7 +3317,7 @@ drop:
/*
* Parse TCP options and place in tcpopt.
*/
-static void
+void
tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags)
{
int opt, optlen;
@@ -3259,6 +3399,21 @@ tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags)
to->to_sacks = cp + 2;
TCPSTAT_INC(tcps_sack_rcv_blocks);
break;
+#ifdef TCP_RFC7413
+ case TCPOPT_FAST_OPEN:
+ /*
+ * A Fast Open option is acceptable only when it is
+ * empty (TCPOLEN_FAST_OPEN_EMPTY) or when its length
+ * lies within [TCPOLEN_FAST_OPEN_MIN,
+ * TCPOLEN_FAST_OPEN_MAX]. RFC 7413 section 4.1.1
+ * requires options with any other length to be
+ * ignored. The two range tests must be OR-ed: a
+ * length cannot be below the minimum and above the
+ * maximum at the same time, so AND-ing them would
+ * never reject anything.
+ */
+ if ((optlen != TCPOLEN_FAST_OPEN_EMPTY) &&
+ ((optlen < TCPOLEN_FAST_OPEN_MIN) ||
+ (optlen > TCPOLEN_FAST_OPEN_MAX)))
+ continue;
+ /* The option is only honored on SYN segments. */
+ if (!(flags & TO_SYN))
+ continue;
+ /* Ignore the option while Fast Open is disabled. */
+ if (!V_tcp_fastopen_enabled)
+ continue;
+ to->to_flags |= TOF_FASTOPEN;
+ /* Cookie length is the option length minus kind and length bytes. */
+ to->to_tfo_len = optlen - 2;
+ /* An empty option carries no cookie: leave the pointer NULL. */
+ to->to_tfo_cookie = to->to_tfo_len ? cp + 2 : NULL;
+ break;
+#endif
default:
continue;
}
@@ -3271,7 +3426,7 @@ tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags)
* It is still reflected in the segment length for
* sequencing purposes.
*/
-static void
+void
tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m,
int off)
{
@@ -3304,7 +3459,7 @@ tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m,
* Collect new round-trip time estimate
* and update averages and current timeout.
*/
-static void
+void
tcp_xmit_timer(struct tcpcb *tp, int rtt)
{
int delta;
@@ -3394,11 +3549,9 @@ tcp_xmit_timer(struct tcpcb *tp, int rtt)
* While looking at the routing entry, we also initialize other path-dependent
* parameters from pre-set or cached values in the routing entry.
*
- * Also take into account the space needed for options that we
- * send regularly. Make maxseg shorter by that amount to assure
- * that we can send maxseg amount of data even when the options
- * are present. Store the upper limit of the length of options plus
- * data in maxopd.
+ * NOTE that resulting t_maxseg doesn't include space for TCP options or
+ * IP options, e.g. IPSEC data, since length of this data may vary, and
+ * thus it is calculated for every segment separately in tcp_output().
*
* NOTE that this routine is only called when we process an incoming
* segment, or an ICMP need fragmentation datagram. Outgoing SYN/ACK MSS
@@ -3412,7 +3565,6 @@ tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer,
u_long maxmtu = 0;
struct inpcb *inp = tp->t_inpcb;
struct hc_metrics_lite metrics;
- int origoffer;
#ifdef INET6
int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
size_t min_protoh = isipv6 ?
@@ -3428,13 +3580,12 @@ tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer,
KASSERT(offer == -1, ("%s: conflict", __func__));
offer = mtuoffer - min_protoh;
}
- origoffer = offer;
/* Initialize. */
#ifdef INET6
if (isipv6) {
maxmtu = tcp_maxmtu6(&inp->inp_inc, cap);
- tp->t_maxopd = tp->t_maxseg = V_tcp_v6mssdflt;
+ tp->t_maxseg = V_tcp_v6mssdflt;
}
#endif
#if defined(INET) && defined(INET6)
@@ -3443,7 +3594,7 @@ tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer,
#ifdef INET
{
maxmtu = tcp_maxmtu(&inp->inp_inc, cap);
- tp->t_maxopd = tp->t_maxseg = V_tcp_mssdflt;
+ tp->t_maxseg = V_tcp_mssdflt;
}
#endif
@@ -3467,9 +3618,9 @@ tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer,
/*
* Offer == 0 means that there was no MSS on the SYN
* segment, in this case we use tcp_mssdflt as
- * already assigned to t_maxopd above.
+ * already assigned to t_maxseg above.
*/
- offer = tp->t_maxopd;
+ offer = tp->t_maxseg;
break;
case -1:
@@ -3494,8 +3645,8 @@ tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer,
bcopy(&metrics, metricptr, sizeof(struct hc_metrics_lite));
/*
- * If there's a discovered mtu int tcp hostcache, use it
- * else, use the link mtu.
+ * If there's a discovered mtu in tcp hostcache, use it.
+ * Else, use the link mtu.
*/
if (metrics.rmx_mtu)
mss = min(metrics.rmx_mtu, maxmtu) - min_protoh;
@@ -3541,31 +3692,15 @@ tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer,
mss = min(mss, offer);
/*
- * Sanity check: make sure that maxopd will be large
+ * Sanity check: make sure that maxseg will be large
* enough to allow some data on segments even if
* all the option space is used (40 bytes). Otherwise
* funny things may happen in tcp_output.
+ *
+ * XXXGL: shouldn't we reserve space for IP/IPv6 options?
*/
mss = max(mss, 64);
- /*
- * maxopd stores the maximum length of data AND options
- * in a segment; maxseg is the amount of data in a normal
- * segment. We need to store this value (maxopd) apart
- * from maxseg, because now every segment carries options
- * and thus we normally have somewhat less data in segments.
- */
- tp->t_maxopd = mss;
-
- /*
- * origoffer==-1 indicates that no segments were received yet.
- * In this case we just guess.
- */
- if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
- (origoffer == -1 ||
- (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP))
- mss -= TCPOLEN_TSTAMP_APPA;
-
tp->t_maxseg = mss;
}
@@ -3684,11 +3819,12 @@ tcp_mssopt(struct in_conninfo *inc)
* By setting snd_nxt to ti_ack, this forces retransmission timer to
* be started again.
*/
-static void
+void
tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th)
{
tcp_seq onxt = tp->snd_nxt;
- u_long ocwnd = tp->snd_cwnd;
+ u_long ocwnd = tp->snd_cwnd;
+ u_int maxseg = tcp_maxseg(tp);
INP_WLOCK_ASSERT(tp->t_inpcb);
@@ -3699,9 +3835,9 @@ tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th)
* Set snd_cwnd to one segment beyond acknowledged offset.
* (tp->snd_una has not yet been updated when this function is called.)
*/
- tp->snd_cwnd = tp->t_maxseg + BYTES_THIS_ACK(tp, th);
+ tp->snd_cwnd = maxseg + BYTES_THIS_ACK(tp, th);
tp->t_flags |= TF_ACKNOW;
- (void) tcp_output(tp);
+ (void) tp->t_fb->tfb_tcp_output(tp);
tp->snd_cwnd = ocwnd;
if (SEQ_GT(onxt, tp->snd_nxt))
tp->snd_nxt = onxt;
@@ -3713,5 +3849,13 @@ tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th)
tp->snd_cwnd -= BYTES_THIS_ACK(tp, th);
else
tp->snd_cwnd = 0;
- tp->snd_cwnd += tp->t_maxseg;
+ tp->snd_cwnd += maxseg;
+}
+
+/*
+ * Compute an estimate of the amount of data currently in flight for
+ * connection tp: everything sent but not yet cumulatively acknowledged
+ * (snd_max - snd_una), plus the bytes counted in
+ * sackhint.sack_bytes_rexmit, minus the bytes counted in
+ * sackhint.sacked_bytes.
+ * NOTE(review): presumably this is the RFC 6675 "pipe" value, since the
+ * name matches the tcp_do_rfc6675_pipe knob - confirm.
+ */
+int
+tcp_compute_pipe(struct tcpcb *tp)
+{
+ return (tp->snd_max - tp->snd_una +
+ tp->sackhint.sack_bytes_rexmit -
+ tp->sackhint.sacked_bytes);
}
diff --git a/freebsd/sys/netinet/tcp_lro.c b/freebsd/sys/netinet/tcp_lro.c
index 52d92aa0..3550ab84 100644
--- a/freebsd/sys/netinet/tcp_lro.c
+++ b/freebsd/sys/netinet/tcp_lro.c
@@ -4,6 +4,7 @@
* Copyright (c) 2007, Myricom Inc.
* Copyright (c) 2008, Intel Corporation.
* Copyright (c) 2012 The FreeBSD Foundation
+ * Copyright (c) 2016 Mellanox Technologies.
* All rights reserved.
*
* Portions of this software were developed by Bjoern Zeeb
@@ -39,9 +40,11 @@ __FBSDID("$FreeBSD$");
#include <rtems/bsd/sys/param.h>
#include <sys/systm.h>
-#include <sys/mbuf.h>
#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
#include <sys/socket.h>
+#include <sys/sysctl.h>
#include <net/if.h>
#include <net/if_var.h>
@@ -55,59 +58,139 @@ __FBSDID("$FreeBSD$");
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
+#include <netinet/tcp_var.h>
#include <netinet6/ip6_var.h>
#include <machine/in_cksum.h>
-#ifndef LRO_ENTRIES
-#define LRO_ENTRIES 8 /* # of LRO entries per RX queue. */
-#endif
+static MALLOC_DEFINE(M_LRO, "LRO", "LRO control structures");
#define TCP_LRO_UPDATE_CSUM 1
#ifndef TCP_LRO_UPDATE_CSUM
#define TCP_LRO_INVALID_CSUM 0x0000
#endif
+static void tcp_lro_rx_done(struct lro_ctrl *lc);
+static int tcp_lro_rx2(struct lro_ctrl *lc, struct mbuf *m,
+ uint32_t csum, int use_hash);
+
+SYSCTL_NODE(_net_inet_tcp, OID_AUTO, lro, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+ "TCP LRO");
+
+static unsigned tcp_lro_entries = TCP_LRO_ENTRIES;
+SYSCTL_UINT(_net_inet_tcp_lro, OID_AUTO, entries,
+ CTLFLAG_RDTUN | CTLFLAG_MPSAFE, &tcp_lro_entries, 0,
+ "default number of LRO entries");
+
+static __inline void
+tcp_lro_active_insert(struct lro_ctrl *lc, struct lro_head *bucket,
+ struct lro_entry *le)
+{
+
+ LIST_INSERT_HEAD(&lc->lro_active, le, next);
+ LIST_INSERT_HEAD(bucket, le, hash_next);
+}
+
+static __inline void
+tcp_lro_active_remove(struct lro_entry *le)
+{
+
+ LIST_REMOVE(le, next); /* active list */
+ LIST_REMOVE(le, hash_next); /* hash bucket */
+}
+
int
tcp_lro_init(struct lro_ctrl *lc)
{
+ return (tcp_lro_init_args(lc, NULL, tcp_lro_entries, 0));
+}
+
+int
+tcp_lro_init_args(struct lro_ctrl *lc, struct ifnet *ifp,
+ unsigned lro_entries, unsigned lro_mbufs)
+{
struct lro_entry *le;
- int error, i;
+ size_t size;
+ unsigned i, elements;
lc->lro_bad_csum = 0;
lc->lro_queued = 0;
lc->lro_flushed = 0;
lc->lro_cnt = 0;
- SLIST_INIT(&lc->lro_free);
- SLIST_INIT(&lc->lro_active);
-
- error = 0;
- for (i = 0; i < LRO_ENTRIES; i++) {
- le = (struct lro_entry *)malloc(sizeof(*le), M_DEVBUF,
- M_NOWAIT | M_ZERO);
- if (le == NULL) {
- if (i == 0)
- error = ENOMEM;
- break;
- }
- lc->lro_cnt = i + 1;
- SLIST_INSERT_HEAD(&lc->lro_free, le, next);
- }
-
- return (error);
+ lc->lro_mbuf_count = 0;
+ lc->lro_mbuf_max = lro_mbufs;
+ lc->lro_cnt = lro_entries;
+ lc->lro_ackcnt_lim = TCP_LRO_ACKCNT_MAX;
+ lc->lro_length_lim = TCP_LRO_LENGTH_MAX;
+ lc->ifp = ifp;
+ LIST_INIT(&lc->lro_free);
+ LIST_INIT(&lc->lro_active);
+
+ /* create hash table to accelerate entry lookup */
+ if (lro_entries > lro_mbufs)
+ elements = lro_entries;
+ else
+ elements = lro_mbufs;
+ lc->lro_hash = phashinit_flags(elements, M_LRO, &lc->lro_hashsz,
+ HASH_NOWAIT);
+ if (lc->lro_hash == NULL) {
+ memset(lc, 0, sizeof(*lc));
+ return (ENOMEM);
+ }
+
+ /* compute size to allocate */
+ size = (lro_mbufs * sizeof(struct lro_mbuf_sort)) +
+ (lro_entries * sizeof(*le));
+ lc->lro_mbuf_data = (struct lro_mbuf_sort *)
+ malloc(size, M_LRO, M_NOWAIT | M_ZERO);
+
+ /* check for out of memory */
+ if (lc->lro_mbuf_data == NULL) {
+ memset(lc, 0, sizeof(*lc));
+ return (ENOMEM);
+ }
+ /* compute offset for LRO entries */
+ le = (struct lro_entry *)
+ (lc->lro_mbuf_data + lro_mbufs);
+
+ /* setup linked list */
+ for (i = 0; i != lro_entries; i++)
+ LIST_INSERT_HEAD(&lc->lro_free, le + i, next);
+
+ return (0);
}
void
tcp_lro_free(struct lro_ctrl *lc)
{
struct lro_entry *le;
+ unsigned x;
- while (!SLIST_EMPTY(&lc->lro_free)) {
- le = SLIST_FIRST(&lc->lro_free);
- SLIST_REMOVE_HEAD(&lc->lro_free, next);
- free(le, M_DEVBUF);
+ /* reset LRO free list */
+ LIST_INIT(&lc->lro_free);
+
+ /* free active mbufs, if any */
+ while ((le = LIST_FIRST(&lc->lro_active)) != NULL) {
+ tcp_lro_active_remove(le);
+ m_freem(le->m_head);
}
+
+ /* free hash table */
+ if (lc->lro_hash != NULL) {
+ free(lc->lro_hash, M_LRO);
+ lc->lro_hash = NULL;
+ }
+ lc->lro_hashsz = 0;
+
+ /* free mbuf array, if any */
+ for (x = 0; x != lc->lro_mbuf_count; x++)
+ m_freem(lc->lro_mbuf_data[x].mb);
+ lc->lro_mbuf_count = 0;
+
+ /* free allocated memory, if any */
+ free(lc->lro_mbuf_data, M_LRO);
+ lc->lro_mbuf_data = NULL;
}
#ifdef TCP_LRO_UPDATE_CSUM
@@ -195,6 +278,36 @@ tcp_lro_rx_csum_fixup(struct lro_entry *le, void *l3hdr, struct tcphdr *th,
}
#endif
+static void
+tcp_lro_rx_done(struct lro_ctrl *lc)
+{
+ struct lro_entry *le;
+
+ while ((le = LIST_FIRST(&lc->lro_active)) != NULL) {
+ tcp_lro_active_remove(le);
+ tcp_lro_flush(lc, le);
+ }
+}
+
+void
+tcp_lro_flush_inactive(struct lro_ctrl *lc, const struct timeval *timeout)
+{
+ struct lro_entry *le, *le_tmp;
+ struct timeval tv;
+
+ if (LIST_EMPTY(&lc->lro_active))
+ return;
+
+ getmicrotime(&tv);
+ timevalsub(&tv, timeout);
+ LIST_FOREACH_SAFE(le, &lc->lro_active, next, le_tmp) {
+ if (timevalcmp(&tv, &le->mtime, >=)) {
+ tcp_lro_active_remove(le);
+ tcp_lro_flush(lc, le);
+ }
+ }
+}
+
void
tcp_lro_flush(struct lro_ctrl *lc, struct lro_entry *le)
{
@@ -285,7 +398,143 @@ tcp_lro_flush(struct lro_ctrl *lc, struct lro_entry *le)
lc->lro_queued += le->append_cnt + 1;
lc->lro_flushed++;
bzero(le, sizeof(*le));
- SLIST_INSERT_HEAD(&lc->lro_free, le, next);
+ LIST_INSERT_HEAD(&lc->lro_free, le, next);
+}
+
+#ifdef HAVE_INLINE_FLSLL
+#define tcp_lro_msb_64(x) (1ULL << (flsll(x) - 1))
+#else
+static inline uint64_t
+tcp_lro_msb_64(uint64_t x)
+{
+ x |= (x >> 1);
+ x |= (x >> 2);
+ x |= (x >> 4);
+ x |= (x >> 8);
+ x |= (x >> 16);
+ x |= (x >> 32);
+ return (x & ~(x >> 1));
+}
+#endif
+
+/*
+ * The tcp_lro_sort() routine is comparable to qsort(), except it has
+ * a worst case complexity limit of O(MIN(N,64)*N), where N is the
+ * number of elements to sort and 64 is the number of sequence bits
+ * available. The algorithm is bit-slicing the 64-bit sequence number,
+ * sorting one bit at a time from the most significant bit until the
+ * least significant one, skipping the constant bits. This is
+ * typically called a radix sort.
+ */
+static void
+tcp_lro_sort(struct lro_mbuf_sort *parray, uint32_t size)
+{
+ struct lro_mbuf_sort temp;
+ uint64_t ones;
+ uint64_t zeros;
+ uint32_t x;
+ uint32_t y;
+
+repeat:
+ /* for small arrays insertion sort is faster */
+ if (size <= 12) {
+ for (x = 1; x < size; x++) {
+ temp = parray[x];
+ for (y = x; y > 0 && temp.seq < parray[y - 1].seq; y--)
+ parray[y] = parray[y - 1];
+ parray[y] = temp;
+ }
+ return;
+ }
+
+ /* compute sequence bits which are constant */
+ ones = 0;
+ zeros = 0;
+ for (x = 0; x != size; x++) {
+ ones |= parray[x].seq;
+ zeros |= ~parray[x].seq;
+ }
+
+ /* compute bits which are not constant into "ones" */
+ ones &= zeros;
+ if (ones == 0)
+ return;
+
+ /* pick the most significant bit which is not constant */
+ ones = tcp_lro_msb_64(ones);
+
+ /*
+ * Move entries having cleared sequence bits to the beginning
+ * of the array:
+ */
+ for (x = y = 0; y != size; y++) {
+ /* skip set bits */
+ if (parray[y].seq & ones)
+ continue;
+ /* swap entries */
+ temp = parray[x];
+ parray[x] = parray[y];
+ parray[y] = temp;
+ x++;
+ }
+
+ KASSERT(x != 0 && x != size, ("Memory is corrupted\n"));
+
+ /* sort zeros */
+ tcp_lro_sort(parray, x);
+
+ /* sort ones */
+ parray += x;
+ size -= x;
+ goto repeat;
+}
+
+void
+tcp_lro_flush_all(struct lro_ctrl *lc)
+{
+ uint64_t seq;
+ uint64_t nseq;
+ unsigned x;
+
+ /* check if no mbufs to flush */
+ if (lc->lro_mbuf_count == 0)
+ goto done;
+
+ /* sort all mbufs according to stream */
+ tcp_lro_sort(lc->lro_mbuf_data, lc->lro_mbuf_count);
+
+ /* input data into LRO engine, stream by stream */
+ seq = 0;
+ for (x = 0; x != lc->lro_mbuf_count; x++) {
+ struct mbuf *mb;
+
+ /* get mbuf */
+ mb = lc->lro_mbuf_data[x].mb;
+
+ /* get sequence number, masking away the packet index */
+ nseq = lc->lro_mbuf_data[x].seq & (-1ULL << 24);
+
+ /* check for new stream */
+ if (seq != nseq) {
+ seq = nseq;
+
+ /* flush active streams */
+ tcp_lro_rx_done(lc);
+ }
+
+ /* add packet to LRO engine */
+ if (tcp_lro_rx2(lc, mb, 0, 0) != 0) {
+ /* input packet to network layer */
+ (*lc->ifp->if_input)(lc->ifp, mb);
+ lc->lro_queued++;
+ lc->lro_flushed++;
+ }
+ }
+done:
+ /* flush active streams */
+ tcp_lro_rx_done(lc);
+
+ lc->lro_mbuf_count = 0;
}
#ifdef INET6
@@ -348,8 +597,8 @@ tcp_lro_rx_ipv4(struct lro_ctrl *lc, struct mbuf *m, struct ip *ip4,
}
#endif
-int
-tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
+static int
+tcp_lro_rx2(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum, int use_hash)
{
struct lro_entry *le;
struct ether_header *eh;
@@ -365,6 +614,8 @@ tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
tcp_seq seq;
int error, ip_len, l;
uint16_t eh_type, tcp_data_len;
+ struct lro_head *bucket;
+ int force_flush = 0;
/* We expect a contiguous header [eh, ip, tcp]. */
@@ -431,10 +682,17 @@ tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
* Check TCP header constraints.
*/
/* Ensure no bits set besides ACK or PSH. */
- if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0)
- return (TCP_LRO_CANNOT);
+ if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0) {
+ if (th->th_flags & TH_SYN)
+ return (TCP_LRO_CANNOT);
+ /*
+ * Make sure that previously seen segments/ACKs are delivered
+ * before this segment, e.g. FIN.
+ */
+ force_flush = 1;
+ }
- /* XXX-BZ We lose a AKC|PUSH flag concatinating multiple segments. */
+ /* XXX-BZ We lose a ACK|PUSH flag concatenating multiple segments. */
/* XXX-BZ Ideally we'd flush on PUSH? */
/*
@@ -448,8 +706,13 @@ tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
ts_ptr = (uint32_t *)(th + 1);
if (l != 0 && (__predict_false(l != TCPOLEN_TSTAMP_APPA) ||
(*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|
- TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP))))
- return (TCP_LRO_CANNOT);
+ TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))) {
+ /*
+ * Make sure that previously seen segments/ACKs are delivered
+ * before this segment.
+ */
+ force_flush = 1;
+ }
/* If the driver did not pass in the checksum, set it now. */
if (csum == 0x0000)
@@ -457,8 +720,41 @@ tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
seq = ntohl(th->th_seq);
+ if (!use_hash) {
+ bucket = &lc->lro_hash[0];
+ } else if (M_HASHTYPE_ISHASH(m)) {
+ bucket = &lc->lro_hash[m->m_pkthdr.flowid % lc->lro_hashsz];
+ } else {
+ uint32_t hash;
+
+ switch (eh_type) {
+#ifdef INET
+ case ETHERTYPE_IP:
+ hash = ip4->ip_src.s_addr + ip4->ip_dst.s_addr;
+ break;
+#endif
+#ifdef INET6
+ case ETHERTYPE_IPV6:
+ hash = ip6->ip6_src.s6_addr32[0] +
+ ip6->ip6_dst.s6_addr32[0];
+ hash += ip6->ip6_src.s6_addr32[1] +
+ ip6->ip6_dst.s6_addr32[1];
+ hash += ip6->ip6_src.s6_addr32[2] +
+ ip6->ip6_dst.s6_addr32[2];
+ hash += ip6->ip6_src.s6_addr32[3] +
+ ip6->ip6_dst.s6_addr32[3];
+ break;
+#endif
+ default:
+ hash = 0;
+ break;
+ }
+ hash += th->th_sport + th->th_dport;
+ bucket = &lc->lro_hash[hash % lc->lro_hashsz];
+ }
+
/* Try to find a matching previous segment. */
- SLIST_FOREACH(le, &lc->lro_active, next) {
+ LIST_FOREACH(le, bucket, hash_next) {
if (le->eh_type != eh_type)
continue;
if (le->source_port != th->th_sport ||
@@ -483,9 +779,16 @@ tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
#endif
}
+ if (force_flush) {
+ /* Timestamps mismatch; this is a FIN, etc */
+ tcp_lro_active_remove(le);
+ tcp_lro_flush(lc, le);
+ return (TCP_LRO_CANNOT);
+ }
+
/* Flush now if appending will result in overflow. */
- if (le->p_len > (65535 - tcp_data_len)) {
- SLIST_REMOVE(&lc->lro_active, le, lro_entry, next);
+ if (le->p_len > (lc->lro_length_lim - tcp_data_len)) {
+ tcp_lro_active_remove(le);
tcp_lro_flush(lc, le);
break;
}
@@ -494,7 +797,7 @@ tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
if (__predict_false(seq != le->next_seq ||
(tcp_data_len == 0 && le->ack_seq == th->th_ack))) {
/* Out of order packet or duplicate ACK. */
- SLIST_REMOVE(&lc->lro_active, le, lro_entry, next);
+ tcp_lro_active_remove(le);
tcp_lro_flush(lc, le);
return (TCP_LRO_CANNOT);
}
@@ -522,6 +825,14 @@ tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
if (tcp_data_len == 0) {
m_freem(m);
+ /*
+ * Flush this LRO entry, if this ACK should not
+ * be further delayed.
+ */
+ if (le->append_cnt >= lc->lro_ackcnt_lim) {
+ tcp_lro_active_remove(le);
+ tcp_lro_flush(lc, le);
+ }
return (0);
}
@@ -533,7 +844,7 @@ tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
* append new segment to existing mbuf chain.
*/
m_adj(m, m->m_pkthdr.len - tcp_data_len);
- m->m_flags &= ~M_PKTHDR;
+ m_demote_pkthdr(m);
le->m_tail->m_next = m;
le->m_tail = m_last(m);
@@ -542,22 +853,32 @@ tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
* If a possible next full length packet would cause an
* overflow, pro-actively flush now.
*/
- if (le->p_len > (65535 - lc->ifp->if_mtu)) {
- SLIST_REMOVE(&lc->lro_active, le, lro_entry, next);
+ if (le->p_len > (lc->lro_length_lim - lc->ifp->if_mtu)) {
+ tcp_lro_active_remove(le);
tcp_lro_flush(lc, le);
- }
+ } else
+ getmicrotime(&le->mtime);
return (0);
}
- /* Try to find an empty slot. */
- if (SLIST_EMPTY(&lc->lro_free))
+ if (force_flush) {
+ /*
+ * Nothing to flush, but this segment can not be further
+ * aggregated/delayed.
+ */
return (TCP_LRO_CANNOT);
+ }
+
+ /* Try to find an empty slot. */
+ if (LIST_EMPTY(&lc->lro_free))
+ return (TCP_LRO_NO_ENTRIES);
/* Start a new segment chain. */
- le = SLIST_FIRST(&lc->lro_free);
- SLIST_REMOVE_HEAD(&lc->lro_free, next);
- SLIST_INSERT_HEAD(&lc->lro_active, le, next);
+ le = LIST_FIRST(&lc->lro_free);
+ LIST_REMOVE(le, next);
+ tcp_lro_active_insert(lc, bucket, le);
+ getmicrotime(&le->mtime);
/* Start filling in details. */
switch (eh_type) {
@@ -614,4 +935,47 @@ tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
return (0);
}
+int
+tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
+{
+
+ return tcp_lro_rx2(lc, m, csum, 1);
+}
+
+void
+tcp_lro_queue_mbuf(struct lro_ctrl *lc, struct mbuf *mb)
+{
+ /* sanity checks */
+ if (__predict_false(lc->ifp == NULL || lc->lro_mbuf_data == NULL ||
+ lc->lro_mbuf_max == 0)) {
+ /* packet drop */
+ m_freem(mb);
+ return;
+ }
+
+ /* check if packet is not LRO capable */
+ if (__predict_false(mb->m_pkthdr.csum_flags == 0 ||
+ (lc->ifp->if_capenable & IFCAP_LRO) == 0)) {
+ lc->lro_flushed++;
+ lc->lro_queued++;
+
+ /* input packet to network layer */
+ (*lc->ifp->if_input) (lc->ifp, mb);
+ return;
+ }
+
+ /* check if array is full */
+ if (__predict_false(lc->lro_mbuf_count == lc->lro_mbuf_max))
+ tcp_lro_flush_all(lc);
+
+ /* create sequence number */
+ lc->lro_mbuf_data[lc->lro_mbuf_count].seq =
+ (((uint64_t)M_HASHTYPE_GET(mb)) << 56) |
+ (((uint64_t)mb->m_pkthdr.flowid) << 24) |
+ ((uint64_t)lc->lro_mbuf_count);
+
+ /* enter mbuf */
+ lc->lro_mbuf_data[lc->lro_mbuf_count++].mb = mb;
+}
+
/* end */
diff --git a/freebsd/sys/netinet/tcp_lro.h b/freebsd/sys/netinet/tcp_lro.h
index b3a50179..e019cd1e 100644
--- a/freebsd/sys/netinet/tcp_lro.h
+++ b/freebsd/sys/netinet/tcp_lro.h
@@ -1,6 +1,7 @@
/*-
* Copyright (c) 2006, Myricom Inc.
* Copyright (c) 2008, Intel Corporation.
+ * Copyright (c) 2016 Mellanox Technologies.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -30,9 +31,16 @@
#ifndef _TCP_LRO_H_
#define _TCP_LRO_H_
-struct lro_entry
-{
- SLIST_ENTRY(lro_entry) next;
+#include <sys/time.h>
+
+#ifndef TCP_LRO_ENTRIES
+/* Define default number of LRO entries per RX queue */
+#define TCP_LRO_ENTRIES 8
+#endif
+
+struct lro_entry {
+ LIST_ENTRY(lro_entry) next;
+ LIST_ENTRY(lro_entry) hash_next;
struct mbuf *m_head;
struct mbuf *m_tail;
union {
@@ -59,8 +67,9 @@ struct lro_entry
uint32_t tsecr;
uint16_t window;
uint16_t timestamp; /* flag, not a TCP hdr field. */
+ struct timeval mtime;
};
-SLIST_HEAD(lro_head, lro_entry);
+LIST_HEAD(lro_head, lro_entry);
#define le_ip4 leip.ip4
#define le_ip6 leip.ip6
@@ -69,23 +78,43 @@ SLIST_HEAD(lro_head, lro_entry);
#define source_ip6 lesource.s_ip6
#define dest_ip6 ledest.d_ip6
+struct lro_mbuf_sort {
+ uint64_t seq;
+ struct mbuf *mb;
+};
+
/* NB: This is part of driver structs. */
struct lro_ctrl {
struct ifnet *ifp;
- int lro_queued;
- int lro_flushed;
- int lro_bad_csum;
- int lro_cnt;
+ struct lro_mbuf_sort *lro_mbuf_data;
+ uint64_t lro_queued;
+ uint64_t lro_flushed;
+ uint64_t lro_bad_csum;
+ unsigned lro_cnt;
+ unsigned lro_mbuf_count;
+ unsigned lro_mbuf_max;
+ unsigned short lro_ackcnt_lim; /* max # of aggregated ACKs */
+ unsigned lro_length_lim; /* max len of aggregated data */
+ u_long lro_hashsz;
+ struct lro_head *lro_hash;
struct lro_head lro_active;
struct lro_head lro_free;
};
+#define TCP_LRO_LENGTH_MAX 65535
+#define TCP_LRO_ACKCNT_MAX 65535 /* unlimited */
+
int tcp_lro_init(struct lro_ctrl *);
+int tcp_lro_init_args(struct lro_ctrl *, struct ifnet *, unsigned, unsigned);
void tcp_lro_free(struct lro_ctrl *);
+void tcp_lro_flush_inactive(struct lro_ctrl *, const struct timeval *);
void tcp_lro_flush(struct lro_ctrl *, struct lro_entry *);
+void tcp_lro_flush_all(struct lro_ctrl *);
int tcp_lro_rx(struct lro_ctrl *, struct mbuf *, uint32_t);
+void tcp_lro_queue_mbuf(struct lro_ctrl *, struct mbuf *);
+#define TCP_LRO_NO_ENTRIES -2
#define TCP_LRO_CANNOT -1
#define TCP_LRO_NOT_SUPPORTED 1
diff --git a/freebsd/sys/netinet/tcp_offload.c b/freebsd/sys/netinet/tcp_offload.c
index 1a90f408..78275fb8 100644
--- a/freebsd/sys/netinet/tcp_offload.c
+++ b/freebsd/sys/netinet/tcp_offload.c
@@ -39,14 +39,15 @@ __FBSDID("$FreeBSD$");
#include <sys/socketvar.h>
#include <sys/sockopt.h>
#include <net/if.h>
+#include <net/if_var.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/tcp.h>
-#include <netinet/tcp_var.h>
#include <netinet/tcp_offload.h>
#define TCPOUTFLAGS
#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_var.h>
#include <netinet/toecore.h>
int registered_toedevs;
diff --git a/freebsd/sys/netinet/tcp_output.c b/freebsd/sys/netinet/tcp_output.c
index 550af64f..af11d805 100644
--- a/freebsd/sys/netinet/tcp_output.c
+++ b/freebsd/sys/netinet/tcp_output.c
@@ -48,6 +48,7 @@ __FBSDID("$FreeBSD$");
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/protosw.h>
+#include <sys/sdt.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
@@ -56,8 +57,8 @@ __FBSDID("$FreeBSD$");
#include <net/route.h>
#include <net/vnet.h>
-#include <netinet/cc.h>
#include <netinet/in.h>
+#include <netinet/in_kdtrace.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
@@ -68,12 +69,20 @@ __FBSDID("$FreeBSD$");
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#endif
+#ifdef TCP_RFC7413
+#include <netinet/tcp_fastopen.h>
+#endif
+#include <netinet/tcp.h>
#define TCPOUTFLAGS
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
+#include <netinet/cc/cc.h>
+#ifdef TCPPCAP
+#include <netinet/tcp_pcap.h>
+#endif
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif
@@ -90,46 +99,56 @@ __FBSDID("$FreeBSD$");
#include <security/mac/mac_framework.h>
VNET_DEFINE(int, path_mtu_discovery) = 1;
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(path_mtu_discovery), 1,
"Enable Path MTU Discovery");
VNET_DEFINE(int, tcp_do_tso) = 1;
#define V_tcp_do_tso VNET(tcp_do_tso)
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_do_tso), 0,
"Enable TCP Segmentation Offload");
VNET_DEFINE(int, tcp_sendspace) = 1024*32;
#define V_tcp_sendspace VNET(tcp_sendspace)
-SYSCTL_VNET_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_sendspace), 0, "Initial send socket buffer size");
VNET_DEFINE(int, tcp_do_autosndbuf) = 1;
#define V_tcp_do_autosndbuf VNET(tcp_do_autosndbuf)
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_do_autosndbuf), 0,
"Enable automatic send buffer sizing");
VNET_DEFINE(int, tcp_autosndbuf_inc) = 8*1024;
#define V_tcp_autosndbuf_inc VNET(tcp_autosndbuf_inc)
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_autosndbuf_inc), 0,
"Incrementor step size of automatic send buffer");
VNET_DEFINE(int, tcp_autosndbuf_max) = 2*1024*1024;
#define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max)
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_autosndbuf_max), 0,
"Max size of automatic send buffer");
+/*
+ * Make sure that either retransmit or persist timer is set for SYN, FIN and
+ * non-ACK.
+ */
+#define TCP_XMIT_TIMER_ASSERT(tp, len, th_flags) \
+ KASSERT(((len) == 0 && ((th_flags) & (TH_SYN | TH_FIN)) == 0) ||\
+ tcp_timer_active((tp), TT_REXMT) || \
+ tcp_timer_active((tp), TT_PERSIST), \
+ ("neither rexmt nor persist timer is set"))
+
static void inline hhook_run_tcp_est_out(struct tcpcb *tp,
struct tcphdr *th, struct tcpopt *to,
long len, int tso);
static void inline cc_after_idle(struct tcpcb *tp);
/*
- * Wrapper for the TCP established ouput helper hook.
+ * Wrapper for the TCP established output helper hook.
*/
static void inline
hhook_run_tcp_est_out(struct tcpcb *tp, struct tcphdr *th,
@@ -201,6 +220,17 @@ tcp_output(struct tcpcb *tp)
return (tcp_offload_output(tp));
#endif
+#ifdef TCP_RFC7413
+ /*
+ * For TFO connections in SYN_RECEIVED, only allow the initial
+ * SYN|ACK and those sent by the retransmit timer.
+ */
+ if ((tp->t_flags & TF_FASTOPEN) &&
+ (tp->t_state == TCPS_SYN_RECEIVED) &&
+ SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN|ACK sent */
+ (tp->snd_nxt != tp->snd_una)) /* not a retransmit */
+ return (0);
+#endif
/*
* Determine length of data that should be transmitted,
* and flags that will be used.
@@ -322,7 +352,7 @@ after_sack_rexmit:
* to send then the probe will be the FIN
* itself.
*/
- if (off < so->so_snd.sb_cc)
+ if (off < sbused(&so->so_snd))
flags &= ~TH_FIN;
sendwin = 1;
} else {
@@ -348,7 +378,8 @@ after_sack_rexmit:
*/
if (sack_rxmit == 0) {
if (sack_bytes_rxmt == 0)
- len = ((long)ulmin(so->so_snd.sb_cc, sendwin) - off);
+ len = ((long)ulmin(sbavail(&so->so_snd), sendwin) -
+ off);
else {
long cwin;
@@ -357,8 +388,8 @@ after_sack_rexmit:
* sending new data, having retransmitted all the
* data possible in the scoreboard.
*/
- len = ((long)ulmin(so->so_snd.sb_cc, tp->snd_wnd)
- - off);
+ len = ((long)ulmin(sbavail(&so->so_snd), tp->snd_wnd) -
+ off);
/*
* Don't remove this (len > 0) check !
* We explicitly check for len > 0 here (although it
@@ -386,6 +417,15 @@ after_sack_rexmit:
if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
if (tp->t_state != TCPS_SYN_RECEIVED)
flags &= ~TH_SYN;
+#ifdef TCP_RFC7413
+ /*
+ * When sending additional segments following a TFO SYN|ACK,
+ * do not include the SYN bit.
+ */
+ if ((tp->t_flags & TF_FASTOPEN) &&
+ (tp->t_state == TCPS_SYN_RECEIVED))
+ flags &= ~TH_SYN;
+#endif
off--, len++;
}
@@ -399,7 +439,18 @@ after_sack_rexmit:
flags &= ~TH_FIN;
}
- if (len < 0) {
+#ifdef TCP_RFC7413
+ /*
+ * When retransmitting SYN|ACK on a passively-created TFO socket,
+ * don't include data, as the presence of data may have caused the
+ * original SYN|ACK to have been dropped by a middlebox.
+ */
+ if ((tp->t_flags & TF_FASTOPEN) &&
+ (((tp->t_state == TCPS_SYN_RECEIVED) && (tp->t_rxtshift > 0)) ||
+ (flags & TH_RST)))
+ len = 0;
+#endif
+ if (len <= 0) {
/*
* If FIN has been sent but not acked,
* but we haven't been called to retransmit,
@@ -409,9 +460,16 @@ after_sack_rexmit:
* to (closed) window, and set the persist timer
* if it isn't already going. If the window didn't
* close completely, just wait for an ACK.
+ *
+ * We also do a general check here to ensure that
+ * we will set the persist timer when we have data
+ * to send, but a 0-byte window. This makes sure
+ * the persist timer is set even if the packet
+ * hits one of the "goto send" lines below.
*/
len = 0;
- if (sendwin == 0) {
+ if ((sendwin == 0) && (TCPS_HAVEESTABLISHED(tp->t_state)) &&
+ (off < (int) sbavail(&so->so_snd))) {
tcp_timer_activate(tp, TT_REXMT, 0);
tp->t_rxtshift = 0;
tp->snd_nxt = tp->snd_una;
@@ -449,20 +507,23 @@ after_sack_rexmit:
* and does at most one step per received ACK. This fast
* scaling has the drawback of growing the send buffer beyond
* what is strictly necessary to make full use of a given
- * delay*bandwith product. However testing has shown this not
+ * delay*bandwidth product. However testing has shown this not
* to be much of an problem. At worst we are trading wasting
- * of available bandwith (the non-use of it) for wasting some
+ * of available bandwidth (the non-use of it) for wasting some
* socket buffer memory.
*
* TODO: Shrink send buffer during idle periods together
* with congestion window. Requires another timer. Has to
* wait for upcoming tcp timer rewrite.
+ *
+ * XXXGL: should sbused() or sbavail() be used here?
*/
if (V_tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) {
if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat &&
- so->so_snd.sb_cc >= (so->so_snd.sb_hiwat / 8 * 7) &&
- so->so_snd.sb_cc < V_tcp_autosndbuf_max &&
- sendwin >= (so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una))) {
+ sbused(&so->so_snd) >= (so->so_snd.sb_hiwat / 8 * 7) &&
+ sbused(&so->so_snd) < V_tcp_autosndbuf_max &&
+ sendwin >= (sbused(&so->so_snd) -
+ (tp->snd_nxt - tp->snd_una))) {
if (!sbreserve_locked(&so->so_snd,
min(so->so_snd.sb_hiwat + V_tcp_autosndbuf_inc,
V_tcp_autosndbuf_max), so, curthread))
@@ -499,10 +560,11 @@ after_sack_rexmit:
tso = 1;
if (sack_rxmit) {
- if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc))
+ if (SEQ_LT(p->rxmit + len, tp->snd_una + sbused(&so->so_snd)))
flags &= ~TH_FIN;
} else {
- if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc))
+ if (SEQ_LT(tp->snd_nxt + len, tp->snd_una +
+ sbused(&so->so_snd)))
flags &= ~TH_FIN;
}
@@ -532,7 +594,7 @@ after_sack_rexmit:
*/
if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */
(idle || (tp->t_flags & TF_NODELAY)) &&
- len + off >= so->so_snd.sb_cc &&
+ len + off >= sbavail(&so->so_snd) &&
(tp->t_flags & TF_NOPUSH) == 0) {
goto send;
}
@@ -660,7 +722,7 @@ dontupdate:
* if window is nonzero, transmit what we can,
* otherwise force out a byte.
*/
- if (so->so_snd.sb_cc && !tcp_timer_active(tp, TT_REXMT) &&
+ if (sbavail(&so->so_snd) && !tcp_timer_active(tp, TT_REXMT) &&
!tcp_timer_active(tp, TT_PERSIST)) {
tp->t_rxtshift = 0;
tcp_setpersist(tp);
@@ -675,6 +737,12 @@ just_return:
send:
SOCKBUF_LOCK_ASSERT(&so->so_snd);
+ if (len > 0) {
+ if (len >= tp->t_maxseg)
+ tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT;
+ else
+ tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT;
+ }
/*
* Before ESTABLISHED, force sending of initial options
* unless TCP set not to do any options.
@@ -697,13 +765,29 @@ send:
* segments. Options for SYN-ACK segments are handled in TCP
* syncache.
*/
+ to.to_flags = 0;
if ((tp->t_flags & TF_NOOPT) == 0) {
- to.to_flags = 0;
/* Maximum segment size. */
if (flags & TH_SYN) {
tp->snd_nxt = tp->iss;
to.to_mss = tcp_mssopt(&tp->t_inpcb->inp_inc);
to.to_flags |= TOF_MSS;
+#ifdef TCP_RFC7413
+ /*
+ * Only include the TFO option on the first
+ * transmission of the SYN|ACK on a
+ * passively-created TFO socket, as the presence of
+ * the TFO option may have caused the original
+ * SYN|ACK to have been dropped by a middlebox.
+ */
+ if ((tp->t_flags & TF_FASTOPEN) &&
+ (tp->t_state == TCPS_SYN_RECEIVED) &&
+ (tp->t_rxtshift == 0)) {
+ to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN;
+ to.to_tfo_cookie = (u_char *)&tp->t_tfo_cookie;
+ to.to_flags |= TOF_FASTOPEN;
+ }
+#endif
}
/* Window scaling. */
if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) {
@@ -759,11 +843,11 @@ send:
/*
* Adjust data length if insertion of options will
- * bump the packet length beyond the t_maxopd length.
+ * bump the packet length beyond the t_maxseg length.
* Clear the FIN bit because we cut off the tail of
* the segment.
*/
- if (len + optlen + ipoptlen > tp->t_maxopd) {
+ if (len + optlen + ipoptlen > tp->t_maxseg) {
flags &= ~TH_FIN;
if (tso) {
@@ -793,7 +877,8 @@ send:
*/
if (if_hw_tsomax != 0) {
/* compute maximum TSO length */
- max_len = (if_hw_tsomax - hdrlen);
+ max_len = (if_hw_tsomax - hdrlen -
+ max_linkhdr);
if (max_len <= 0) {
len = 0;
} else if (len > max_len) {
@@ -808,6 +893,15 @@ send:
*/
if (if_hw_tsomaxsegcount != 0 &&
if_hw_tsomaxsegsize != 0) {
+ /*
+ * Subtract one segment for the LINK
+ * and TCP/IP headers mbuf that will
+ * be prepended to this mbuf chain
+ * after the code in this section
+ * limits the number of mbufs in the
+ * chain to if_hw_tsomaxsegcount.
+ */
+ if_hw_tsomaxsegcount -= 1;
max_len = 0;
mb = sbsndmbuf(&so->so_snd, off, &moff);
@@ -856,8 +950,8 @@ send:
* fractional unless the send sockbuf can be
* emptied:
*/
- max_len = (tp->t_maxopd - optlen);
- if ((off + len) < so->so_snd.sb_cc) {
+ max_len = (tp->t_maxseg - optlen);
+ if ((off + len) < sbavail(&so->so_snd)) {
moff = len % max_len;
if (moff != 0) {
len -= moff;
@@ -886,7 +980,7 @@ send:
sendalot = 1;
} else {
- len = tp->t_maxopd - optlen - ipoptlen;
+ len = tp->t_maxseg - optlen - ipoptlen;
sendalot = 1;
}
} else
@@ -929,23 +1023,20 @@ send:
TCPSTAT_INC(tcps_sndpack);
TCPSTAT_ADD(tcps_sndbyte, len);
}
- MGETHDR(m, M_DONTWAIT, MT_DATA);
+#ifdef INET6
+ if (MHLEN < hdrlen + max_linkhdr)
+ m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
+ else
+#endif
+ m = m_gethdr(M_NOWAIT, MT_DATA);
+
if (m == NULL) {
SOCKBUF_UNLOCK(&so->so_snd);
error = ENOBUFS;
+ sack_rxmit = 0;
goto out;
}
-#ifdef INET6
- if (MHLEN < hdrlen + max_linkhdr) {
- MCLGET(m, M_DONTWAIT);
- if ((m->m_flags & M_EXT) == 0) {
- SOCKBUF_UNLOCK(&so->so_snd);
- m_freem(m);
- error = ENOBUFS;
- goto out;
- }
- }
-#endif
+
m->m_data += max_linkhdr;
m->m_len = hdrlen;
@@ -965,6 +1056,7 @@ send:
SOCKBUF_UNLOCK(&so->so_snd);
(void) m_free(m);
error = ENOBUFS;
+ sack_rxmit = 0;
goto out;
}
}
@@ -975,7 +1067,7 @@ send:
* give data to the user when a buffer fills or
* a PUSH comes in.)
*/
- if (off + len == so->so_snd.sb_cc)
+ if ((off + len == sbused(&so->so_snd)) && !(flags & TH_SYN))
flags |= TH_PUSH;
SOCKBUF_UNLOCK(&so->so_snd);
} else {
@@ -989,15 +1081,16 @@ send:
else
TCPSTAT_INC(tcps_sndwinup);
- MGETHDR(m, M_DONTWAIT, MT_DATA);
+ m = m_gethdr(M_NOWAIT, MT_DATA);
if (m == NULL) {
error = ENOBUFS;
+ sack_rxmit = 0;
goto out;
}
#ifdef INET6
if (isipv6 && (MHLEN < hdrlen + max_linkhdr) &&
MHLEN >= hdrlen) {
- MH_ALIGN(m, hdrlen);
+ M_ALIGN(m, hdrlen);
} else
#endif
m->m_data += max_linkhdr;
@@ -1036,7 +1129,7 @@ send:
* resend those bits a number of times as per
* RFC 3168.
*/
- if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn) {
+ if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn == 1) {
if (tp->t_rxtshift >= 1) {
if (tp->t_rxtshift <= V_tcp_ecn_maxretries)
flags |= TH_ECE|TH_CWR;
@@ -1153,7 +1246,7 @@ send:
tp->snd_up = tp->snd_una; /* drag it along */
#ifdef TCP_SIGNATURE
- if (tp->t_flags & TF_SIGNATURE) {
+ if (to.to_flags & TOF_SIGNATURE) {
int sigoff = to.to_signature - opt;
tcp_signature_compute(m, 0, len, optlen,
(u_char *)(th + 1) + sigoff, IPSEC_DIR_OUTBOUND);
@@ -1195,13 +1288,12 @@ send:
/*
* Enable TSO and specify the size of the segments.
* The TCP pseudo header checksum is always provided.
- * XXX: Fixme: This is currently not the case for IPv6.
*/
if (tso) {
- KASSERT(len > tp->t_maxopd - optlen,
+ KASSERT(len > tp->t_maxseg - optlen,
("%s: len <= tso_segsz", __func__));
m->m_pkthdr.csum_flags |= CSUM_TSO;
- m->m_pkthdr.tso_segsz = tp->t_maxopd - optlen;
+ m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen;
}
#ifdef IPSEC
@@ -1214,75 +1306,6 @@ send:
__func__, len, hdrlen, ipoptlen, m_length(m, NULL)));
#endif
- /*
- * In transmit state, time the transmission and arrange for
- * the retransmit. In persist state, just set snd_max.
- */
- if ((tp->t_flags & TF_FORCEDATA) == 0 ||
- !tcp_timer_active(tp, TT_PERSIST)) {
- tcp_seq startseq = tp->snd_nxt;
-
- /*
- * Advance snd_nxt over sequence space of this segment.
- */
- if (flags & (TH_SYN|TH_FIN)) {
- if (flags & TH_SYN)
- tp->snd_nxt++;
- if (flags & TH_FIN) {
- tp->snd_nxt++;
- tp->t_flags |= TF_SENTFIN;
- }
- }
- if (sack_rxmit)
- goto timer;
- tp->snd_nxt += len;
- if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
- tp->snd_max = tp->snd_nxt;
- /*
- * Time this transmission if not a retransmission and
- * not currently timing anything.
- */
- if (tp->t_rtttime == 0) {
- tp->t_rtttime = ticks;
- tp->t_rtseq = startseq;
- TCPSTAT_INC(tcps_segstimed);
- }
- }
-
- /*
- * Set retransmit timer if not currently set,
- * and not doing a pure ack or a keep-alive probe.
- * Initial value for retransmit timer is smoothed
- * round-trip time + 2 * round-trip time variance.
- * Initialize shift counter which is used for backoff
- * of retransmit time.
- */
-timer:
- if (!tcp_timer_active(tp, TT_REXMT) &&
- ((sack_rxmit && tp->snd_nxt != tp->snd_max) ||
- (tp->snd_nxt != tp->snd_una))) {
- if (tcp_timer_active(tp, TT_PERSIST)) {
- tcp_timer_activate(tp, TT_PERSIST, 0);
- tp->t_rxtshift = 0;
- }
- tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
- }
- } else {
- /*
- * Persist case, update snd_max but since we are in
- * persist mode (no window) we do not update snd_nxt.
- */
- int xlen = len;
- if (flags & TH_SYN)
- ++xlen;
- if (flags & TH_FIN) {
- ++xlen;
- tp->t_flags |= TF_SENTFIN;
- }
- if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max))
- tp->snd_max = tp->snd_nxt + len;
- }
-
/* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */
hhook_run_tcp_est_out(tp, th, &to, len, tso);
@@ -1306,6 +1329,7 @@ timer:
ipov->ih_len = save;
}
#endif /* TCPDEBUG */
+ TCP_PROBE3(debug__output, tp, th, mtod(m, const char *));
/*
* Fill in IP length and desired time to live and
@@ -1314,7 +1338,7 @@ timer:
* the template, but need a way to checksum without them.
*/
/*
- * m->m_pkthdr.len should have been set before cksum calcuration,
+ * m->m_pkthdr.len should have been set before checksum calculation,
* because in6_cksum() need it.
*/
#ifdef INET6
@@ -1330,13 +1354,35 @@ timer:
*/
ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb, NULL);
+ /*
+ * Set the packet size here for the benefit of DTrace probes.
+ * ip6_output() will set it properly; it's supposed to include
+ * the option header lengths as well.
+ */
+ ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6));
+
+ if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss)
+ tp->t_flags2 |= TF2_PLPMTU_PMTUD;
+ else
+ tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
+
+ if (tp->t_state == TCPS_SYN_SENT)
+ TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th);
+
+ TCP_PROBE5(send, NULL, tp, ip6, tp, th);
+
+#ifdef TCPPCAP
+ /* Save packet, if requested. */
+ tcp_pcap_add(th, m, &(tp->t_outpkts));
+#endif
+
/* TODO: IPv6 IP6TOS_ECT bit on */
error = ip6_output(m, tp->t_inpcb->in6p_outputopts, &ro,
((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0),
NULL, NULL, tp->t_inpcb);
if (error == EMSGSIZE && ro.ro_rt != NULL)
- mtu = ro.ro_rt->rt_rmx.rmx_mtu;
+ mtu = ro.ro_rt->rt_mtu;
RO_RTFREE(&ro);
}
#endif /* INET6 */
@@ -1345,10 +1391,7 @@ timer:
#endif
#ifdef INET
{
- struct route ro;
-
- bzero(&ro, sizeof(ro));
- ip->ip_len = m->m_pkthdr.len;
+ ip->ip_len = htons(m->m_pkthdr.len);
#ifdef INET6
if (tp->t_inpcb->inp_vflag & INP_IPV6PROTO)
ip->ip_ttl = in6_selecthlim(tp->t_inpcb, NULL);
@@ -1361,18 +1404,126 @@ timer:
*
* NB: Don't set DF on small MTU/MSS to have a safe fallback.
*/
- if (V_path_mtu_discovery && tp->t_maxopd > V_tcp_minmss)
- ip->ip_off |= IP_DF;
+ if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
+ ip->ip_off |= htons(IP_DF);
+ tp->t_flags2 |= TF2_PLPMTU_PMTUD;
+ } else {
+ tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
+ }
+
+ if (tp->t_state == TCPS_SYN_SENT)
+ TCP_PROBE5(connect__request, NULL, tp, ip, tp, th);
- error = ip_output(m, tp->t_inpcb->inp_options, &ro,
+ TCP_PROBE5(send, NULL, tp, ip, tp, th);
+
+#ifdef TCPPCAP
+ /* Save packet, if requested. */
+ tcp_pcap_add(th, m, &(tp->t_outpkts));
+#endif
+
+ error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route,
((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0,
tp->t_inpcb);
- if (error == EMSGSIZE && ro.ro_rt != NULL)
- mtu = ro.ro_rt->rt_rmx.rmx_mtu;
- RO_RTFREE(&ro);
+ if (error == EMSGSIZE && tp->t_inpcb->inp_route.ro_rt != NULL)
+ mtu = tp->t_inpcb->inp_route.ro_rt->rt_mtu;
}
#endif /* INET */
+
+out:
+ /*
+ * In transmit state, time the transmission and arrange for
+ * the retransmit. In persist state, just set snd_max.
+ */
+ if ((tp->t_flags & TF_FORCEDATA) == 0 ||
+ !tcp_timer_active(tp, TT_PERSIST)) {
+ tcp_seq startseq = tp->snd_nxt;
+
+ /*
+ * Advance snd_nxt over sequence space of this segment.
+ */
+ if (flags & (TH_SYN|TH_FIN)) {
+ if (flags & TH_SYN)
+ tp->snd_nxt++;
+ if (flags & TH_FIN) {
+ tp->snd_nxt++;
+ tp->t_flags |= TF_SENTFIN;
+ }
+ }
+ if (sack_rxmit)
+ goto timer;
+ tp->snd_nxt += len;
+ if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
+ tp->snd_max = tp->snd_nxt;
+ /*
+ * Time this transmission if not a retransmission and
+ * not currently timing anything.
+ */
+ if (tp->t_rtttime == 0) {
+ tp->t_rtttime = ticks;
+ tp->t_rtseq = startseq;
+ TCPSTAT_INC(tcps_segstimed);
+ }
+ }
+
+ /*
+ * Set retransmit timer if not currently set,
+ * and not doing a pure ack or a keep-alive probe.
+ * Initial value for retransmit timer is smoothed
+ * round-trip time + 2 * round-trip time variance.
+ * Initialize shift counter which is used for backoff
+ * of retransmit time.
+ */
+timer:
+ if (!tcp_timer_active(tp, TT_REXMT) &&
+ ((sack_rxmit && tp->snd_nxt != tp->snd_max) ||
+ (tp->snd_nxt != tp->snd_una))) {
+ if (tcp_timer_active(tp, TT_PERSIST)) {
+ tcp_timer_activate(tp, TT_PERSIST, 0);
+ tp->t_rxtshift = 0;
+ }
+ tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
+ } else if (len == 0 && sbavail(&so->so_snd) &&
+ !tcp_timer_active(tp, TT_REXMT) &&
+ !tcp_timer_active(tp, TT_PERSIST)) {
+ /*
+ * Avoid a situation where we do not set persist timer
+ * after a zero window condition. For example:
+ * 1) A -> B: packet with enough data to fill the window
+ * 2) B -> A: ACK for #1 + new data (0 window
+ * advertisement)
+ * 3) A -> B: ACK for #2, 0 len packet
+ *
+ * In this case, A will not activate the persist timer,
+ * because it chose to send a packet. Unless tcp_output
+ * is called for some other reason (delayed ack timer,
+ * another input packet from B, socket syscall), A will
+ * not send zero window probes.
+ *
+ * So, if you send a 0-length packet, but there is data
+ * in the socket buffer, and neither the rexmt or
+ * persist timer is already set, then activate the
+ * persist timer.
+ */
+ tp->t_rxtshift = 0;
+ tcp_setpersist(tp);
+ }
+ } else {
+ /*
+ * Persist case, update snd_max but since we are in
+ * persist mode (no window) we do not update snd_nxt.
+ */
+ int xlen = len;
+ if (flags & TH_SYN)
+ ++xlen;
+ if (flags & TH_FIN) {
+ ++xlen;
+ tp->t_flags |= TF_SENTFIN;
+ }
+ if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max))
+ tp->snd_max = tp->snd_nxt + len;
+ }
+
if (error) {
/*
@@ -1400,16 +1551,13 @@ timer:
} else
tp->snd_nxt -= len;
}
-out:
SOCKBUF_UNLOCK_ASSERT(&so->so_snd); /* Check gotos. */
switch (error) {
case EPERM:
tp->t_softerror = error;
return (error);
case ENOBUFS:
- if (!tcp_timer_active(tp, TT_REXMT) &&
- !tcp_timer_active(tp, TT_PERSIST))
- tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
+ TCP_XMIT_TIMER_ASSERT(tp, len, flags);
tp->snd_cwnd = tp->t_maxseg;
return (0);
case EMSGSIZE:
@@ -1481,10 +1629,10 @@ tcp_setpersist(struct tcpcb *tp)
if (tcp_timer_active(tp, TT_REXMT))
panic("tcp_setpersist: retransmit pending");
/*
- * Start/restart persistance timer.
+ * Start/restart persistence timer.
*/
TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift],
- TCPTV_PERSMIN, TCPTV_PERSMAX);
+ tcp_persmin, tcp_persmax);
tcp_timer_activate(tp, TT_PERSIST, tt);
if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
tp->t_rxtshift++;
@@ -1510,7 +1658,7 @@ tcp_setpersist(struct tcpcb *tp)
int
tcp_addoptions(struct tcpopt *to, u_char *optp)
{
- u_int mask, optlen = 0;
+ u_int32_t mask, optlen = 0;
for (mask = 1; mask < TOF_MAXOPT; mask <<= 1) {
if ((to->to_flags & mask) != mask)
@@ -1572,6 +1720,7 @@ tcp_addoptions(struct tcpopt *to, u_char *optp)
bcopy((u_char *)&to->to_tsecr, optp, sizeof(to->to_tsecr));
optp += sizeof(to->to_tsecr);
break;
+#ifdef TCP_SIGNATURE
case TOF_SIGNATURE:
{
int siglen = TCPOLEN_SIGNATURE - 2;
@@ -1590,6 +1739,7 @@ tcp_addoptions(struct tcpopt *to, u_char *optp)
*optp++ = 0;
break;
}
+#endif
case TOF_SACK:
{
int sackblks = 0;
@@ -1620,6 +1770,25 @@ tcp_addoptions(struct tcpopt *to, u_char *optp)
TCPSTAT_INC(tcps_sack_send_blocks);
break;
}
+#ifdef TCP_RFC7413
+ case TOF_FASTOPEN:
+ {
+ int total_len;
+
+ /* XXX is there any point to aligning this option? */
+ total_len = TCPOLEN_FAST_OPEN_EMPTY + to->to_tfo_len;
+ if (TCP_MAXOLEN - optlen < total_len)
+ continue;
+ *optp++ = TCPOPT_FAST_OPEN;
+ *optp++ = total_len;
+ if (to->to_tfo_len > 0) {
+ bcopy(to->to_tfo_cookie, optp, to->to_tfo_len);
+ optp += to->to_tfo_len;
+ }
+ optlen += total_len;
+ break;
+ }
+#endif
default:
panic("%s: unknown TCP option type", __func__);
break;
diff --git a/freebsd/sys/netinet/tcp_reass.c b/freebsd/sys/netinet/tcp_reass.c
index 2570a5f3..49184a5f 100644
--- a/freebsd/sys/netinet/tcp_reass.c
+++ b/freebsd/sys/netinet/tcp_reass.c
@@ -40,6 +40,7 @@ __FBSDID("$FreeBSD$");
#include <rtems/bsd/sys/param.h>
#include <sys/kernel.h>
+#include <sys/eventhandler.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
@@ -51,6 +52,7 @@ __FBSDID("$FreeBSD$");
#include <vm/uma.h>
#include <net/if.h>
+#include <net/if_var.h>
#include <net/route.h>
#include <net/vnet.h>
@@ -76,67 +78,46 @@ __FBSDID("$FreeBSD$");
#include <netinet/tcp_debug.h>
#endif /* TCPDEBUG */
-static int tcp_reass_sysctl_qsize(SYSCTL_HANDLER_ARGS);
-
static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0,
"TCP Segment Reassembly Queue");
-static VNET_DEFINE(int, tcp_reass_maxseg) = 0;
-#define V_tcp_reass_maxseg VNET(tcp_reass_maxseg)
-SYSCTL_VNET_INT(_net_inet_tcp_reass, OID_AUTO, maxsegments, CTLFLAG_RDTUN,
- &VNET_NAME(tcp_reass_maxseg), 0,
+static int tcp_reass_maxseg = 0;
+SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxsegments, CTLFLAG_RDTUN,
+ &tcp_reass_maxseg, 0,
"Global maximum number of TCP Segments in Reassembly Queue");
-SYSCTL_VNET_PROC(_net_inet_tcp_reass, OID_AUTO, cursegments,
- (CTLTYPE_INT | CTLFLAG_RD), NULL, 0, &tcp_reass_sysctl_qsize, "I",
+static uma_zone_t tcp_reass_zone;
+SYSCTL_UMA_CUR(_net_inet_tcp_reass, OID_AUTO, cursegments, 0,
+ &tcp_reass_zone,
"Global number of TCP Segments currently in Reassembly Queue");
-static VNET_DEFINE(int, tcp_reass_overflows) = 0;
-#define V_tcp_reass_overflows VNET(tcp_reass_overflows)
-SYSCTL_VNET_INT(_net_inet_tcp_reass, OID_AUTO, overflows,
- CTLFLAG_RD,
- &VNET_NAME(tcp_reass_overflows), 0,
- "Global number of TCP Segment Reassembly Queue Overflows");
-
-static VNET_DEFINE(uma_zone_t, tcp_reass_zone);
-#define V_tcp_reass_zone VNET(tcp_reass_zone)
-
/* Initialize TCP reassembly queue */
static void
tcp_reass_zone_change(void *tag)
{
/* Set the zone limit and read back the effective value. */
- V_tcp_reass_maxseg = nmbclusters / 16;
- V_tcp_reass_maxseg = uma_zone_set_max(V_tcp_reass_zone,
- V_tcp_reass_maxseg);
+ tcp_reass_maxseg = nmbclusters / 16;
+ tcp_reass_maxseg = uma_zone_set_max(tcp_reass_zone,
+ tcp_reass_maxseg);
}
void
-tcp_reass_init(void)
+tcp_reass_global_init(void)
{
- V_tcp_reass_maxseg = nmbclusters / 16;
+ tcp_reass_maxseg = nmbclusters / 16;
TUNABLE_INT_FETCH("net.inet.tcp.reass.maxsegments",
- &V_tcp_reass_maxseg);
- V_tcp_reass_zone = uma_zcreate("tcpreass", sizeof (struct tseg_qent),
+ &tcp_reass_maxseg);
+ tcp_reass_zone = uma_zcreate("tcpreass", sizeof (struct tseg_qent),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
/* Set the zone limit and read back the effective value. */
- V_tcp_reass_maxseg = uma_zone_set_max(V_tcp_reass_zone,
- V_tcp_reass_maxseg);
+ tcp_reass_maxseg = uma_zone_set_max(tcp_reass_zone,
+ tcp_reass_maxseg);
EVENTHANDLER_REGISTER(nmbclusters_change,
tcp_reass_zone_change, NULL, EVENTHANDLER_PRI_ANY);
}
-#ifdef VIMAGE
-void
-tcp_reass_destroy(void)
-{
-
- uma_zdestroy(V_tcp_reass_zone);
-}
-#endif
-
void
tcp_reass_flush(struct tcpcb *tp)
{
@@ -147,7 +128,7 @@ tcp_reass_flush(struct tcpcb *tp)
while ((qe = LIST_FIRST(&tp->t_segq)) != NULL) {
LIST_REMOVE(qe, tqe_q);
m_freem(qe->tqe_m);
- uma_zfree(V_tcp_reass_zone, qe);
+ uma_zfree(tcp_reass_zone, qe);
tp->t_segqlen--;
}
@@ -156,15 +137,6 @@ tcp_reass_flush(struct tcpcb *tp)
tp, tp->t_segqlen));
}
-static int
-tcp_reass_sysctl_qsize(SYSCTL_HANDLER_ARGS)
-{
- int qsize;
-
- qsize = uma_zone_get_cur(V_tcp_reass_zone);
- return (sysctl_handle_int(oidp, &qsize, 0, req));
-}
-
int
tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
{
@@ -209,15 +181,14 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
*/
if ((th->th_seq != tp->rcv_nxt || !TCPS_HAVEESTABLISHED(tp->t_state)) &&
tp->t_segqlen >= (so->so_rcv.sb_hiwat / tp->t_maxseg) + 1) {
- V_tcp_reass_overflows++;
- TCPSTAT_INC(tcps_rcvmemdrop);
- m_freem(m);
+ TCPSTAT_INC(tcps_rcvreassfull);
*tlenp = 0;
if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL, NULL))) {
log(LOG_DEBUG, "%s; %s: queue limit reached, "
"segment dropped\n", s, __func__);
free(s, M_TCPLOG);
}
+ m_freem(m);
return (0);
}
@@ -228,7 +199,7 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
* Use a temporary structure on the stack for the missing segment
* when the zone is exhausted. Otherwise we may get stuck.
*/
- te = uma_zalloc(V_tcp_reass_zone, M_NOWAIT);
+ te = uma_zalloc(tcp_reass_zone, M_NOWAIT);
if (te == NULL) {
if (th->th_seq != tp->rcv_nxt || !TCPS_HAVEESTABLISHED(tp->t_state)) {
TCPSTAT_INC(tcps_rcvmemdrop);
@@ -279,7 +250,7 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
TCPSTAT_ADD(tcps_rcvdupbyte, *tlenp);
m_freem(m);
if (te != &tqs)
- uma_zfree(V_tcp_reass_zone, te);
+ uma_zfree(tcp_reass_zone, te);
tp->t_segqlen--;
/*
* Try to present any queued data
@@ -316,7 +287,7 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
nq = LIST_NEXT(q, tqe_q);
LIST_REMOVE(q, tqe_q);
m_freem(q->tqe_m);
- uma_zfree(V_tcp_reass_zone, q);
+ uma_zfree(tcp_reass_zone, q);
tp->t_segqlen--;
q = nq;
}
@@ -353,13 +324,12 @@ present:
if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
m_freem(q->tqe_m);
else
- sbappendstream_locked(&so->so_rcv, q->tqe_m);
+ sbappendstream_locked(&so->so_rcv, q->tqe_m, 0);
if (q != &tqs)
- uma_zfree(V_tcp_reass_zone, q);
+ uma_zfree(tcp_reass_zone, q);
tp->t_segqlen--;
q = nq;
} while (q && q->tqe_th->th_seq == tp->rcv_nxt);
- ND6_HINT(tp);
sorwakeup_locked(so);
return (flags);
}
diff --git a/freebsd/sys/netinet/tcp_sack.c b/freebsd/sys/netinet/tcp_sack.c
index 9cc1d86a..c7e32cba 100644
--- a/freebsd/sys/netinet/tcp_sack.c
+++ b/freebsd/sys/netinet/tcp_sack.c
@@ -97,6 +97,7 @@ __FBSDID("$FreeBSD$");
#include <vm/uma.h>
#include <net/if.h>
+#include <net/if_var.h>
#include <net/route.h>
#include <net/vnet.h>
@@ -130,24 +131,24 @@ VNET_DECLARE(struct uma_zone *, sack_hole_zone);
SYSCTL_NODE(_net_inet_tcp, OID_AUTO, sack, CTLFLAG_RW, 0, "TCP SACK");
VNET_DEFINE(int, tcp_do_sack) = 1;
#define V_tcp_do_sack VNET(tcp_do_sack)
-SYSCTL_VNET_INT(_net_inet_tcp_sack, OID_AUTO, enable, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, enable, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_do_sack), 0, "Enable/Disable TCP SACK support");
VNET_DEFINE(int, tcp_sack_maxholes) = 128;
#define V_tcp_sack_maxholes VNET(tcp_sack_maxholes)
-SYSCTL_VNET_INT(_net_inet_tcp_sack, OID_AUTO, maxholes, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, maxholes, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_sack_maxholes), 0,
"Maximum number of TCP SACK holes allowed per connection");
VNET_DEFINE(int, tcp_sack_globalmaxholes) = 65536;
#define V_tcp_sack_globalmaxholes VNET(tcp_sack_globalmaxholes)
-SYSCTL_VNET_INT(_net_inet_tcp_sack, OID_AUTO, globalmaxholes, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, globalmaxholes, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_sack_globalmaxholes), 0,
"Global maximum number of TCP SACK holes");
VNET_DEFINE(int, tcp_sack_globalholes) = 0;
#define V_tcp_sack_globalholes VNET(tcp_sack_globalholes)
-SYSCTL_VNET_INT(_net_inet_tcp_sack, OID_AUTO, globalholes, CTLFLAG_RD,
+SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, globalholes, CTLFLAG_VNET | CTLFLAG_RD,
&VNET_NAME(tcp_sack_globalholes), 0,
"Global number of TCP SACK holes currently allocated");
@@ -346,17 +347,22 @@ tcp_sackhole_remove(struct tcpcb *tp, struct sackhole *hole)
* Process cumulative ACK and the TCP SACK option to update the scoreboard.
* tp->snd_holes is an ordered list of holes (oldest to newest, in terms of
* the sequence space).
+ * Returns 1 if incoming ACK has previously unknown SACK information,
+ * 0 otherwise. Note: We treat (snd_una, th_ack) as a sack block so any changes
+ * to that (i.e. left edge moving) would also be considered a change in SACK
+ * information which is slightly different than rfc6675.
*/
-void
+int
tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack)
{
struct sackhole *cur, *temp;
struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1], *sblkp;
- int i, j, num_sack_blks;
+ int i, j, num_sack_blks, sack_changed;
INP_WLOCK_ASSERT(tp->t_inpcb);
num_sack_blks = 0;
+ sack_changed = 0;
/*
* If SND.UNA will be advanced by SEG.ACK, and if SACK holes exist,
* treat [SND.UNA, SEG.ACK) as if it is a SACK block.
@@ -370,6 +376,7 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack)
* received new blocks from the other side.
*/
if (to->to_flags & TOF_SACK) {
+ tp->sackhint.sacked_bytes = 0; /* reset */
for (i = 0; i < to->to_nsacks; i++) {
bcopy((to->to_sacks + i * TCPOLEN_SACK),
&sack, sizeof(sack));
@@ -380,8 +387,11 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack)
SEQ_GT(sack.start, th_ack) &&
SEQ_LT(sack.start, tp->snd_max) &&
SEQ_GT(sack.end, tp->snd_una) &&
- SEQ_LEQ(sack.end, tp->snd_max))
+ SEQ_LEQ(sack.end, tp->snd_max)) {
sack_blocks[num_sack_blks++] = sack;
+ tp->sackhint.sacked_bytes +=
+ (sack.end-sack.start);
+ }
}
}
/*
@@ -389,12 +399,12 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack)
* received.
*/
if (num_sack_blks == 0)
- return;
+ return (sack_changed);
/*
* Sort the SACK blocks so we can update the scoreboard with just one
- * pass. The overhead of sorting upto 4+1 elements is less than
- * making upto 4+1 passes over the scoreboard.
+ * pass. The overhead of sorting up to 4+1 elements is less than
+ * making up to 4+1 passes over the scoreboard.
*/
for (i = 0; i < num_sack_blks; i++) {
for (j = i + 1; j < num_sack_blks; j++) {
@@ -440,6 +450,7 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack)
tp->snd_fack = sblkp->end;
/* Go to the previous sack block. */
sblkp--;
+ sack_changed = 1;
} else {
/*
* We failed to add a new hole based on the current
@@ -456,9 +467,11 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack)
SEQ_LT(tp->snd_fack, sblkp->end))
tp->snd_fack = sblkp->end;
}
- } else if (SEQ_LT(tp->snd_fack, sblkp->end))
+ } else if (SEQ_LT(tp->snd_fack, sblkp->end)) {
/* fack is advanced. */
tp->snd_fack = sblkp->end;
+ sack_changed = 1;
+ }
/* We must have at least one SACK hole in scoreboard. */
KASSERT(!TAILQ_EMPTY(&tp->snd_holes),
("SACK scoreboard must not be empty"));
@@ -487,6 +500,7 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack)
tp->sackhint.sack_bytes_rexmit -= (cur->rxmit - cur->start);
KASSERT(tp->sackhint.sack_bytes_rexmit >= 0,
("sackhint bytes rtx >= 0"));
+ sack_changed = 1;
if (SEQ_LEQ(sblkp->start, cur->start)) {
/* Data acks at least the beginning of hole. */
if (SEQ_GEQ(sblkp->end, cur->end)) {
@@ -542,6 +556,7 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack)
else
sblkp--;
}
+ return (sack_changed);
}
/*
@@ -586,7 +601,7 @@ tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th)
if (tp->snd_cwnd > tp->snd_ssthresh)
tp->snd_cwnd = tp->snd_ssthresh;
tp->t_flags |= TF_ACKNOW;
- (void) tcp_output(tp);
+ (void) tp->t_fb->tfb_tcp_output(tp);
}
#if 0
diff --git a/freebsd/sys/netinet/tcp_subr.c b/freebsd/sys/netinet/tcp_subr.c
index b175c0c0..cff9bd7b 100644
--- a/freebsd/sys/netinet/tcp_subr.c
+++ b/freebsd/sys/netinet/tcp_subr.c
@@ -47,18 +47,21 @@ __FBSDID("$FreeBSD$");
#include <rtems/bsd/sys/param.h>
#include <sys/systm.h>
#include <sys/callout.h>
+#include <sys/eventhandler.h>
#include <sys/hhook.h>
#include <sys/kernel.h>
#include <sys/khelp.h>
#include <sys/sysctl.h>
#include <sys/jail.h>
#include <sys/malloc.h>
+#include <sys/refcount.h>
#include <sys/mbuf.h>
#ifdef INET6
#include <sys/domain.h>
#endif
#include <sys/priv.h>
#include <sys/proc.h>
+#include <sys/sdt.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/protosw.h>
@@ -68,10 +71,12 @@ __FBSDID("$FreeBSD$");
#include <net/route.h>
#include <net/if.h>
+#include <net/if_var.h>
#include <net/vnet.h>
-#include <netinet/cc.h>
#include <netinet/in.h>
+#include <netinet/in_fib.h>
+#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
@@ -79,22 +84,32 @@ __FBSDID("$FreeBSD$");
#include <netinet/ip_icmp.h>
#include <netinet/ip_var.h>
#ifdef INET6
+#include <netinet/icmp6.h>
#include <netinet/ip6.h>
+#include <netinet6/in6_fib.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet6/scope6_var.h>
#include <netinet6/nd6.h>
#endif
+#ifdef TCP_RFC7413
+#include <netinet/tcp_fastopen.h>
+#endif
+#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_syncache.h>
+#include <netinet/cc/cc.h>
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif
#include <netinet/tcpip.h>
+#ifdef TCPPCAP
+#include <netinet/tcp_pcap.h>
+#endif
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif
@@ -125,6 +140,8 @@ VNET_DEFINE(int, tcp_mssdflt) = TCP_MSS;
VNET_DEFINE(int, tcp_v6mssdflt) = TCP6_MSS;
#endif
+struct rwlock tcp_function_lock;
+
static int
sysctl_net_inet_tcp_mss_check(SYSCTL_HANDLER_ARGS)
{
@@ -141,8 +158,8 @@ sysctl_net_inet_tcp_mss_check(SYSCTL_HANDLER_ARGS)
return (error);
}
-SYSCTL_VNET_PROC(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt,
- CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(tcp_mssdflt), 0,
+SYSCTL_PROC(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt,
+ CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(tcp_mssdflt), 0,
&sysctl_net_inet_tcp_mss_check, "I",
"Default TCP Maximum Segment Size");
@@ -163,8 +180,8 @@ sysctl_net_inet_tcp_mss_v6_check(SYSCTL_HANDLER_ARGS)
return (error);
}
-SYSCTL_VNET_PROC(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt,
- CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(tcp_v6mssdflt), 0,
+SYSCTL_PROC(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt,
+ CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(tcp_v6mssdflt), 0,
&sysctl_net_inet_tcp_mss_v6_check, "I",
"Default TCP Maximum Segment Size for IPv6");
#endif /* INET6 */
@@ -178,12 +195,12 @@ SYSCTL_VNET_PROC(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt,
* checking. This setting prevents us from sending too small packets.
*/
VNET_DEFINE(int, tcp_minmss) = TCP_MINMSS;
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_minmss), 0,
"Minimum TCP Maximum Segment Size");
VNET_DEFINE(int, tcp_do_rfc1323) = 1;
-SYSCTL_VNET_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_do_rfc1323), 0,
"Enable rfc1323 (high performance TCP) extensions");
@@ -191,30 +208,30 @@ static int tcp_log_debug = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_debug, CTLFLAG_RW,
&tcp_log_debug, 0, "Log errors caused by incoming TCP segments");
-static int tcp_tcbhashsize = 0;
-SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN,
+static int tcp_tcbhashsize;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
&tcp_tcbhashsize, 0, "Size of TCP control-block hashtable");
static int do_tcpdrain = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0,
"Enable tcp_drain routine for extra help when low on mbufs");
-SYSCTL_VNET_UINT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD,
+SYSCTL_UINT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_VNET | CTLFLAG_RD,
&VNET_NAME(tcbinfo.ipi_count), 0, "Number of active PCBs");
static VNET_DEFINE(int, icmp_may_rst) = 1;
#define V_icmp_may_rst VNET(icmp_may_rst)
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(icmp_may_rst), 0,
"Certain ICMP unreachable messages may abort connections in SYN_SENT");
static VNET_DEFINE(int, tcp_isn_reseed_interval) = 0;
#define V_tcp_isn_reseed_interval VNET(tcp_isn_reseed_interval)
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_isn_reseed_interval), 0,
"Seconds between reseeding of ISN secret");
-static int tcp_soreceive_stream = 0;
+static int tcp_soreceive_stream;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, soreceive_stream, CTLFLAG_RDTUN,
&tcp_soreceive_stream, 0, "Using soreceive_stream for TCP sockets");
@@ -231,9 +248,193 @@ VNET_DEFINE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST+1]);
static struct inpcb *tcp_notify(struct inpcb *, int);
static struct inpcb *tcp_mtudisc_notify(struct inpcb *, int);
+static void tcp_mtudisc(struct inpcb *, int);
static char * tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th,
void *ip4hdr, const void *ip6hdr);
+
+/*
+ * The built-in TCP function block, named "default".
+ *
+ * The initialiser is positional: a name string, then tcp_output,
+ * tcp_do_segment and tcp_default_ctloutput, then six NULL entries and
+ * two zeroes.
+ * NOTE(review): presumably these line up with tfb_tcp_block_name,
+ * tfb_tcp_output, tfb_tcp_do_segment, tfb_tcp_ctloutput, the optional
+ * hooks, and the refcount/flags fields -- confirm against the definition
+ * of struct tcp_function_block, which is not visible here.
+ */
+static struct tcp_function_block tcp_def_funcblk = {
+ "default",
+ tcp_output,
+ tcp_do_segment,
+ tcp_default_ctloutput,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ 0,
+ 0
+};
+
+/* Set to 1 by init_tcp_functions() once the list and lock below are set up. */
+int t_functions_inited = 0;
+/* List of registered function blocks; walked by the find_*_locked() helpers. */
+struct tcp_funchead t_functions;
+/*
+ * The function block currently selected as the default.  Starts out as the
+ * built-in block and is replaced by the functions_default sysctl handler.
+ */
+static struct tcp_function_block *tcp_func_set_ptr = &tcp_def_funcblk;
+
+/*
+ * One-time setup of the TCP function-block registry: initialise the
+ * t_functions list head and the tcp_function_lock rwlock, then record
+ * that this has been done.  Later calls return without touching either.
+ */
+static void
+init_tcp_functions(void)
+{
+
+	if (t_functions_inited != 0)
+		return;
+	TAILQ_INIT(&t_functions);
+	rw_init_flags(&tcp_function_lock, "tcp_func_lock", 0);
+	t_functions_inited = 1;
+}
+
+/*
+ * Look up a registered function block by name.
+ *
+ * Walks t_functions and returns the first block whose
+ * tfb_tcp_block_name equals fs->function_set_name, or NULL when no
+ * entry matches.  The callers in this file hold tcp_function_lock
+ * around the call.
+ */
+static struct tcp_function_block *
+find_tcp_functions_locked(struct tcp_function_set *fs)
+{
+	struct tcp_function *entry;
+
+	TAILQ_FOREACH(entry, &t_functions, tf_next) {
+		if (strcmp(entry->tf_fb->tfb_tcp_block_name,
+		    fs->function_set_name) == 0)
+			return (entry->tf_fb);
+	}
+	return (NULL);
+}
+
+/*
+ * Check whether a function block is on the t_functions list.
+ *
+ * Returns blk when a list entry refers to it and NULL otherwise.  On a
+ * hit, and only when s is non-NULL, *s is set to the matching list
+ * entry; on a miss *s is left untouched.  The callers in this file
+ * hold tcp_function_lock around the call.
+ */
+static struct tcp_function_block *
+find_tcp_fb_locked(struct tcp_function_block *blk, struct tcp_function **s)
+{
+	struct tcp_function *entry;
+
+	TAILQ_FOREACH(entry, &t_functions, tf_next) {
+		if (entry->tf_fb != blk)
+			continue;
+		if (s != NULL)
+			*s = entry;
+		return (blk);
+	}
+	return (NULL);
+}
+
+/*
+ * Find a registered function block by name and take a reference on it.
+ *
+ * The lookup and the tfb_refcnt increment both happen under the read
+ * side of tcp_function_lock.  Returns the block, or NULL (with no
+ * reference taken) when the name is not registered.
+ */
+struct tcp_function_block *
+find_and_ref_tcp_functions(struct tcp_function_set *fs)
+{
+	struct tcp_function_block *found;
+
+	rw_rlock(&tcp_function_lock);
+	found = find_tcp_functions_locked(fs);
+	if (found != NULL)
+		refcount_acquire(&found->tfb_refcnt);
+	rw_runlock(&tcp_function_lock);
+	return (found);
+}
+
+/*
+ * Take a reference on a function block, provided it is still on the
+ * t_functions list.
+ *
+ * The membership check and the tfb_refcnt increment both happen under
+ * the read side of tcp_function_lock.  Returns blk on success, or NULL
+ * (with no reference taken) when blk is not on the list.
+ */
+struct tcp_function_block *
+find_and_ref_tcp_fb(struct tcp_function_block *blk)
+{
+	struct tcp_function_block *found;
+
+	rw_rlock(&tcp_function_lock);
+	found = find_tcp_fb_locked(blk, NULL);
+	if (found != NULL)
+		refcount_acquire(&found->tfb_refcnt);
+	rw_runlock(&tcp_function_lock);
+	return (found);
+}
+
+
+/*
+ * Sysctl handler for net.inet.tcp.functions_default.
+ *
+ * Read:  reports the name of the function block that tcp_func_set_ptr
+ *        points at (an empty string if that block is not on the
+ *        t_functions list).
+ * Write: looks the new name up among the registered blocks and, if it
+ *        is found and not flagged TCP_FUNC_BEING_REMOVED, makes it the
+ *        default.
+ *
+ * Returns 0 on success, the error from sysctl_handle_string(), or
+ * ENOENT when the requested name is unknown or is being removed.
+ */
+static int
+sysctl_net_inet_default_tcp_functions(SYSCTL_HANDLER_ARGS)
+{
+	struct tcp_function_set fs;
+	struct tcp_function_block *blk;
+	int error;
+
+	memset(&fs, 0, sizeof(fs));
+	rw_rlock(&tcp_function_lock);
+	blk = find_tcp_fb_locked(tcp_func_set_ptr, NULL);
+	if (blk != NULL) {
+		/*
+		 * Bounded copy: a block name without a terminating NUL
+		 * must not be able to run past the on-stack buffer.
+		 */
+		(void)strlcpy(fs.function_set_name, blk->tfb_tcp_block_name,
+		    sizeof(fs.function_set_name));
+		fs.pcbcnt = blk->tfb_refcnt;
+	}
+	rw_runlock(&tcp_function_lock);
+	error = sysctl_handle_string(oidp, fs.function_set_name,
+	    sizeof(fs.function_set_name), req);
+
+	/* Nothing more to do on error or for a pure read. */
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+
+	/* A new name was written: switch the default if it is usable. */
+	rw_wlock(&tcp_function_lock);
+	blk = find_tcp_functions_locked(&fs);
+	if (blk == NULL || (blk->tfb_flags & TCP_FUNC_BEING_REMOVED) != 0)
+		error = ENOENT;
+	else
+		tcp_func_set_ptr = blk;
+	rw_wunlock(&tcp_function_lock);
+	return (error);
+}
+
+SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_default,
+	    CTLTYPE_STRING | CTLFLAG_RW,
+	    NULL, 0, sysctl_net_inet_default_tcp_functions, "A",
+	    "Set/get the default TCP functions");
+
+/*
+ * Sysctl handler for net.inet.tcp.functions_available.
+ *
+ * Produces a text table with a heading line followed by one line per
+ * registered function block: its name, a '*' next to the block that is
+ * the current default, and its tfb_refcnt.
+ *
+ * The list is counted first to size a temporary buffer, then walked a
+ * second time to format it.  The lock is dropped in between, so the
+ * second pass checks every line against the space left and gives up
+ * with EOVERFLOW if the text no longer fits.
+ */
+static int
+sysctl_net_inet_list_available(SYSCTL_HANDLER_ARGS)
+{
+	struct tcp_function *tf;
+	char *buffer, *pos;
+	size_t remaining, used;
+	int error, linesz, nstacks;
+
+	/* Pass 1: count the entries so the buffer can be sized. */
+	nstacks = 0;
+	rw_rlock(&tcp_function_lock);
+	TAILQ_FOREACH(tf, &t_functions, tf_next)
+		nstacks++;
+	rw_runlock(&tcp_function_lock);
+
+	remaining = (nstacks+2) * (TCP_FUNCTION_NAME_LEN_MAX + 12) + 1;
+	buffer = malloc(remaining, M_TEMP, M_WAITOK);
+	pos = buffer;
+
+	/* Heading line. */
+	linesz = snprintf(pos, remaining, "\n%-32s%c %s\n", "Stack", 'D', "PCB count");
+	pos += linesz;
+	remaining -= linesz;
+	used = linesz;
+
+	/* Pass 2: one line per registered function block. */
+	error = 0;
+	rw_rlock(&tcp_function_lock);
+	TAILQ_FOREACH(tf, &t_functions, tf_next) {
+		linesz = snprintf(pos, remaining, "%-32s%c %u\n",
+		    tf->tf_fb->tfb_tcp_block_name,
+		    (tf->tf_fb == tcp_func_set_ptr) ? '*' : ' ',
+		    tf->tf_fb->tfb_refcnt);
+		if (linesz >= remaining) {
+			/* Line was truncated: the buffer is too small. */
+			error = EOVERFLOW;
+			break;
+		}
+		pos += linesz;
+		remaining -= linesz;
+		used += linesz;
+	}
+	rw_runlock(&tcp_function_lock);
+
+	if (error == 0)
+		error = sysctl_handle_string(oidp, buffer, used + 1, req);
+	free(buffer, M_TEMP);
+	return (error);
+}
+
+SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_available,
+	    CTLTYPE_STRING|CTLFLAG_RD,
+	    NULL, 0, sysctl_net_inet_list_available, "A",
+	    "list available TCP Function sets");
+
/*
* Target size of TCP PCB hash tables. Must be a power of two.
*
@@ -241,7 +442,7 @@ static char * tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th,
* variable net.inet.tcp.tcbhashsize
*/
#ifndef TCBHASHSIZE
-#define TCBHASHSIZE 512
+#define TCBHASHSIZE 0
#endif
/*
@@ -261,6 +462,8 @@ static VNET_DEFINE(uma_zone_t, tcpcb_zone);
#define V_tcpcb_zone VNET(tcpcb_zone)
MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers");
+MALLOC_DEFINE(M_TCPFUNCTIONS, "tcpfunc", "TCP function set memory");
+
static struct mtx isn_mtx;
#define ISN_LOCK_INIT() mtx_init(&isn_mtx, "isn_mtx", NULL, MTX_DEF)
@@ -288,48 +491,196 @@ tcp_inpcb_init(void *mem, int size, int flags)
return (0);
}
+/*
+ * Round a requested size up to a power of 2.
+ * Used to size the tcp_inpcb hash buckets.
+ *
+ * Returns the smallest power of 2 strictly greater than size, so a size
+ * that is already a power of 2 is doubled.  If that value cannot be
+ * represented as a positive int, the next smaller power of 2 is
+ * returned instead.  A size of zero or less yields 1.
+ */
+static int
+maketcp_hashsize(int size)
+{
+ int bits;
+
+ /*
+ * fls(0) is 0, which gives 1 << 0.  A negative size would make
+ * fls() return the full width of int, and shifting by that much
+ * is undefined, so handle both cases up front.
+ */
+ if (size <= 0)
+ return (1);
+ /*
+ * fls() is the 1-based index of the highest set bit, so
+ * 1 << bits is the next power of 2 above size.
+ */
+ bits = fls(size);
+ /*
+ * Shifting a 1 into the sign bit is undefined behaviour.  When
+ * size already uses the top value bit, go one power of 2 smaller
+ * instead of relying on the shift wrapping to a negative value.
+ */
+ if (bits >= (int)(sizeof(int) * 8) - 1)
+ bits--;
+ return (1 << bits);
+}
+
+/*
+ * Register a TCP function block on the t_functions list.
+ *
+ * blk must provide tfb_tcp_output, tfb_tcp_do_segment, tfb_tcp_ctloutput
+ * and a non-empty name that fits in the name field of a
+ * struct tcp_function_set.  The four timer methods must be supplied
+ * either all together or not at all.  wait is handed to malloc() as its
+ * flags argument.
+ *
+ * Returns 0 on success, EINVAL for an incomplete block or an over-long
+ * name, ENOMEM if the list entry cannot be allocated, and EALREADY if a
+ * block with the same name is already registered.
+ */
+int
+register_tcp_functions(struct tcp_function_block *blk, int wait)
+{
+ struct tcp_function_block *lblk;
+ struct tcp_function *n;
+ struct tcp_function_set fs;
+
+ if (t_functions_inited == 0) {
+ init_tcp_functions();
+ }
+ if ((blk->tfb_tcp_output == NULL) ||
+ (blk->tfb_tcp_do_segment == NULL) ||
+ (blk->tfb_tcp_ctloutput == NULL) ||
+ (strlen(blk->tfb_tcp_block_name) == 0)) {
+ /*
+ * These functions are required and you
+ * need a name.
+ */
+ return (EINVAL);
+ }
+ if (blk->tfb_tcp_timer_stop_all ||
+ blk->tfb_tcp_timer_activate ||
+ blk->tfb_tcp_timer_active ||
+ blk->tfb_tcp_timer_stop) {
+ /*
+ * If you define one timer function you
+ * must have them all.
+ */
+ if ((blk->tfb_tcp_timer_stop_all == NULL) ||
+ (blk->tfb_tcp_timer_activate == NULL) ||
+ (blk->tfb_tcp_timer_active == NULL) ||
+ (blk->tfb_tcp_timer_stop == NULL)) {
+ return (EINVAL);
+ }
+ }
+ /*
+ * Build the lookup key with a bounded copy so an over-long name
+ * cannot overrun the on-stack key.  A name that does not fit is
+ * rejected rather than truncated, since a truncated key could
+ * falsely match a different registered block.  This is done before
+ * the allocation so there is nothing to free on failure.
+ */
+ if (strlcpy(fs.function_set_name, blk->tfb_tcp_block_name,
+ sizeof(fs.function_set_name)) >= sizeof(fs.function_set_name)) {
+ return (EINVAL);
+ }
+ n = malloc(sizeof(struct tcp_function), M_TCPFUNCTIONS, wait);
+ if (n == NULL) {
+ return (ENOMEM);
+ }
+ n->tf_fb = blk;
+ rw_wlock(&tcp_function_lock);
+ lblk = find_tcp_functions_locked(&fs);
+ if (lblk) {
+ /* Duplicate name space not allowed */
+ rw_wunlock(&tcp_function_lock);
+ free(n, M_TCPFUNCTIONS);
+ return (EALREADY);
+ }
+ /* A freshly registered block starts with no users and no flags. */
+ refcount_init(&blk->tfb_refcnt, 0);
+ blk->tfb_flags = 0;
+ TAILQ_INSERT_TAIL(&t_functions, n, tf_next);
+ rw_wunlock(&tcp_function_lock);
+ return(0);
+}
+
+/*
+ * Remove a TCP function block from the t_functions list.
+ *
+ * Returns EPERM for the block named "default", EBUSY if blk is the
+ * current default (tcp_func_set_ptr) or still has a non-zero
+ * tfb_refcnt (in which case it is also flagged
+ * TCP_FUNC_BEING_REMOVED), ENOENT if blk is not on the list, and 0
+ * once its list entry has been unlinked and freed.
+ */
+int
+deregister_tcp_functions(struct tcp_function_block *blk)
+{
+ struct tcp_function *entry;
+ int rv;
+
+ /* You can't un-register the default */
+ if (strcmp(blk->tfb_tcp_block_name, "default") == 0)
+ return (EPERM);
+
+ rv = ENOENT;
+ rw_wlock(&tcp_function_lock);
+ if (blk == tcp_func_set_ptr) {
+ /* You can't free the current default */
+ rv = EBUSY;
+ } else if (blk->tfb_refcnt) {
+ /* Still tcb attached, mark it. */
+ blk->tfb_flags |= TCP_FUNC_BEING_REMOVED;
+ rv = EBUSY;
+ } else if (find_tcp_fb_locked(blk, &entry) != NULL) {
+ /* Found: unlink the list entry and release it. */
+ TAILQ_REMOVE(&t_functions, entry, tf_next);
+ entry->tf_fb = NULL;
+ free(entry, M_TCPFUNCTIONS);
+ rv = 0;
+ }
+ rw_wunlock(&tcp_function_lock);
+ return (rv);
+}
+
void
tcp_init(void)
{
+ const char *tcbhash_tuneable;
int hashsize;
+ tcbhash_tuneable = "net.inet.tcp.tcbhashsize";
+
if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN,
&V_tcp_hhh[HHOOK_TCP_EST_IN], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0)
printf("%s: WARNING: unable to register helper hook\n", __func__);
if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT,
&V_tcp_hhh[HHOOK_TCP_EST_OUT], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0)
printf("%s: WARNING: unable to register helper hook\n", __func__);
-
hashsize = TCBHASHSIZE;
- TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", &hashsize);
+ TUNABLE_INT_FETCH(tcbhash_tuneable, &hashsize);
+ if (hashsize == 0) {
+ /*
+ * Auto tune the hash size based on maxsockets.
+ * A perfect hash would have a 1:1 mapping
+ * (hashsize = maxsockets) however it's been
+ * suggested that O(2) average is better.
+ */
+ hashsize = maketcp_hashsize(maxsockets / 4);
+ /*
+ * Our historical default is 512,
+ * do not autotune lower than this.
+ */
+ if (hashsize < 512)
+ hashsize = 512;
+ if (bootverbose && IS_DEFAULT_VNET(curvnet))
+ printf("%s: %s auto tuned to %d\n", __func__,
+ tcbhash_tuneable, hashsize);
+ }
+ /*
+ * We require a hashsize to be a power of two.
+ * Previously if it was not a power of two we would just reset it
+ * back to 512, which could be a nasty surprise if you did not notice
+ * the error message.
+ * Instead what we do is round it up to the next power of two above
+ * the specified hash value (maketcp_hashsize() only falls back to a
+ * smaller power of two when rounding up would overflow).
+ */
if (!powerof2(hashsize)) {
- printf("WARNING: TCB hash size not a power of 2\n");
- hashsize = 512; /* safe default */
+ int oldhashsize = hashsize;
+
+ hashsize = maketcp_hashsize(hashsize);
+ /* prevent absurdly low value */
+ if (hashsize < 16)
+ hashsize = 16;
+ printf("%s: WARNING: TCB hash size not a power of 2, "
+ "clipped from %d to %d.\n", __func__, oldhashsize,
+ hashsize);
}
in_pcbinfo_init(&V_tcbinfo, "tcp", &V_tcb, hashsize, hashsize,
- "tcp_inpcb", tcp_inpcb_init, NULL, UMA_ZONE_NOFREE,
- IPI_HASHFIELDS_4TUPLE);
+ "tcp_inpcb", tcp_inpcb_init, NULL, 0, IPI_HASHFIELDS_4TUPLE);
/*
* These have to be type stable for the benefit of the timers.
*/
V_tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem),
- NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
uma_zone_set_max(V_tcpcb_zone, maxsockets);
+ uma_zone_set_warning(V_tcpcb_zone, "kern.ipc.maxsockets limit reached");
tcp_tw_init();
syncache_init();
tcp_hc_init();
- tcp_reass_init();
TUNABLE_INT_FETCH("net.inet.tcp.sack.enable", &V_tcp_do_sack);
V_sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole),
- NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
/* Skip initialization of globals for non-default instances. */
if (!IS_DEFAULT_VNET(curvnet))
return;
+ tcp_reass_global_init();
+
/* XXX virtualize those bellow? */
tcp_delacktime = TCPTV_DELACK;
tcp_keepinit = TCPTV_KEEP_INIT;
@@ -340,11 +691,15 @@ tcp_init(void)
tcp_rexmit_min = TCPTV_MIN;
if (tcp_rexmit_min < 1)
tcp_rexmit_min = 1;
+ tcp_persmin = TCPTV_PERSMIN;
+ tcp_persmax = TCPTV_PERSMAX;
tcp_rexmit_slop = TCPTV_CPU_VAR;
tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT;
tcp_tcbhashsize = hashsize;
+ /* Setup the tcp function block list */
+ init_tcp_functions();
+ register_tcp_functions(&tcp_def_funcblk, M_WAITOK);
- TUNABLE_INT_FETCH("net.inet.tcp.soreceive_stream", &tcp_soreceive_stream);
if (tcp_soreceive_stream) {
#ifdef INET
tcp_usrreqs.pru_soreceive = soreceive_stream;
@@ -370,21 +725,64 @@ tcp_init(void)
SHUTDOWN_PRI_DEFAULT);
EVENTHANDLER_REGISTER(maxsockets_change, tcp_zone_change, NULL,
EVENTHANDLER_PRI_ANY);
+#ifdef TCPPCAP
+ tcp_pcap_init();
+#endif
+
+#ifdef TCP_RFC7413
+ tcp_fastopen_init();
+#endif
}
#ifdef VIMAGE
-void
-tcp_destroy(void)
+static void
+tcp_destroy(void *unused __unused)
{
+ int error, n;
- tcp_reass_destroy();
+ /*
+ * All our processes are gone, all our sockets should be cleaned
+ * up, which means, we should be past the tcp_discardcb() calls.
+ * Sleep to let all tcpcb timers really disappear and cleanup.
+ */
+ for (;;) {
+ INP_LIST_RLOCK(&V_tcbinfo);
+ n = V_tcbinfo.ipi_count;
+ INP_LIST_RUNLOCK(&V_tcbinfo);
+ if (n == 0)
+ break;
+ pause("tcpdes", hz / 10);
+ }
tcp_hc_destroy();
syncache_destroy();
tcp_tw_destroy();
in_pcbinfo_destroy(&V_tcbinfo);
+ /* tcp_discardcb() clears the sack_holes up. */
uma_zdestroy(V_sack_hole_zone);
uma_zdestroy(V_tcpcb_zone);
+
+#ifdef TCP_RFC7413
+ /*
+ * Cannot free the zone until all tcpcbs are released as we attach
+ * the allocations to them.
+ */
+ tcp_fastopen_destroy();
+#endif
+
+ error = hhook_head_deregister(V_tcp_hhh[HHOOK_TCP_EST_IN]);
+ if (error != 0) {
+ printf("%s: WARNING: unable to deregister helper hook "
+ "type=%d, id=%d: error %d returned\n", __func__,
+ HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN, error);
+ }
+ error = hhook_head_deregister(V_tcp_hhh[HHOOK_TCP_EST_OUT]);
+ if (error != 0) {
+ printf("%s: WARNING: unable to deregister helper hook "
+ "type=%d, id=%d: error %d returned\n", __func__,
+ HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT, error);
+ }
}
+VNET_SYSUNINIT(tcp, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, tcp_destroy, NULL);
#endif
void
@@ -473,31 +871,33 @@ tcpip_maketemplate(struct inpcb *inp)
/*
* Send a single message to the TCP at address specified by
* the given TCP/IP header. If m == NULL, then we make a copy
- * of the tcpiphdr at ti and send directly to the addressed host.
+ * of the tcpiphdr at th and send directly to the addressed host.
* This is used to force keep alive messages out using the TCP
* template for a connection. If flags are given then we send
- * a message back to the TCP which originated the * segment ti,
+ * a message back to the TCP which originated the segment th,
* and discard the mbuf containing it and any other attached mbufs.
*
* In any case the ack and sequence number of the transmitted
* segment are as specified by the parameters.
*
- * NOTE: If m != NULL, then ti must point to *inside* the mbuf.
+ * NOTE: If m != NULL, then th must point to *inside* the mbuf.
*/
void
tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
tcp_seq ack, tcp_seq seq, int flags)
{
- int tlen;
- int win = 0;
+ struct tcpopt to;
+ struct inpcb *inp;
struct ip *ip;
+ struct mbuf *optm;
struct tcphdr *nth;
+ u_char *optp;
#ifdef INET6
struct ip6_hdr *ip6;
int isipv6;
#endif /* INET6 */
- int ipflags = 0;
- struct inpcb *inp;
+ int optlen, tlen, win;
+ bool incl_opts;
KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL"));
@@ -514,18 +914,21 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
} else
inp = NULL;
+ incl_opts = false;
+ win = 0;
if (tp != NULL) {
if (!(flags & TH_RST)) {
win = sbspace(&inp->inp_socket->so_rcv);
if (win > (long)TCP_MAXWIN << tp->rcv_scale)
win = (long)TCP_MAXWIN << tp->rcv_scale;
}
+ if ((tp->t_flags & TF_NOOPT) == 0)
+ incl_opts = true;
}
if (m == NULL) {
- m = m_gethdr(M_DONTWAIT, MT_DATA);
+ m = m_gethdr(M_NOWAIT, MT_DATA);
if (m == NULL)
return;
- tlen = 0;
m->m_data += max_linkhdr;
#ifdef INET6
if (isipv6) {
@@ -535,35 +938,71 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
nth = (struct tcphdr *)(ip6 + 1);
} else
#endif /* INET6 */
- {
- bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
- ip = mtod(m, struct ip *);
- nth = (struct tcphdr *)(ip + 1);
- }
+ {
+ bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
+ ip = mtod(m, struct ip *);
+ nth = (struct tcphdr *)(ip + 1);
+ }
bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr));
flags = TH_ACK;
+ } else if (!M_WRITABLE(m)) {
+ struct mbuf *n;
+
+ /* Can't reuse 'm', allocate a new mbuf. */
+ n = m_gethdr(M_NOWAIT, MT_DATA);
+ if (n == NULL) {
+ m_freem(m);
+ return;
+ }
+
+ if (!m_dup_pkthdr(n, m, M_NOWAIT)) {
+ m_freem(m);
+ m_freem(n);
+ return;
+ }
+
+ n->m_data += max_linkhdr;
+ /* m_len is set later */
+#define xchg(a,b,type) { type t; t=a; a=b; b=t; }
+#ifdef INET6
+ if (isipv6) {
+ bcopy((caddr_t)ip6, mtod(n, caddr_t),
+ sizeof(struct ip6_hdr));
+ ip6 = mtod(n, struct ip6_hdr *);
+ xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
+ nth = (struct tcphdr *)(ip6 + 1);
+ } else
+#endif /* INET6 */
+ {
+ bcopy((caddr_t)ip, mtod(n, caddr_t), sizeof(struct ip));
+ ip = mtod(n, struct ip *);
+ xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, uint32_t);
+ nth = (struct tcphdr *)(ip + 1);
+ }
+ bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr));
+ xchg(nth->th_dport, nth->th_sport, uint16_t);
+ th = nth;
+ m_freem(m);
+ m = n;
} else {
/*
* reuse the mbuf.
- * XXX MRT We inherrit the FIB, which is lucky.
+ * XXX MRT We inherit the FIB, which is lucky.
*/
m_freem(m->m_next);
m->m_next = NULL;
m->m_data = (caddr_t)ipgen;
- m_addr_changed(m);
/* m_len is set later */
- tlen = 0;
-#define xchg(a,b,type) { type t; t=a; a=b; b=t; }
#ifdef INET6
if (isipv6) {
xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
nth = (struct tcphdr *)(ip6 + 1);
} else
#endif /* INET6 */
- {
- xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, uint32_t);
- nth = (struct tcphdr *)(ip + 1);
- }
+ {
+ xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, uint32_t);
+ nth = (struct tcphdr *)(ip + 1);
+ }
if (th != nth) {
/*
* this is usually a case when an extension header
@@ -576,13 +1015,65 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
xchg(nth->th_dport, nth->th_sport, uint16_t);
#undef xchg
}
+ tlen = 0;
+#ifdef INET6
+ if (isipv6)
+ tlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
+#endif
+#if defined(INET) && defined(INET6)
+ else
+#endif
+#ifdef INET
+ tlen = sizeof (struct tcpiphdr);
+#endif
+#ifdef INVARIANTS
+ m->m_len = 0;
+ KASSERT(M_TRAILINGSPACE(m) >= tlen,
+ ("Not enough trailing space for message (m=%p, need=%d, have=%ld)",
+ m, tlen, (long)M_TRAILINGSPACE(m)));
+#endif
+ m->m_len = tlen;
+ to.to_flags = 0;
+ if (incl_opts) {
+ /* Make sure we have room. */
+ if (M_TRAILINGSPACE(m) < TCP_MAXOLEN) {
+ m->m_next = m_get(M_NOWAIT, MT_DATA);
+ if (m->m_next) {
+ optp = mtod(m->m_next, u_char *);
+ optm = m->m_next;
+ } else
+ incl_opts = false;
+ } else {
+ optp = (u_char *) (nth + 1);
+ optm = m;
+ }
+ }
+ if (incl_opts) {
+ /* Timestamps. */
+ if (tp->t_flags & TF_RCVD_TSTMP) {
+ to.to_tsval = tcp_ts_getticks() + tp->ts_offset;
+ to.to_tsecr = tp->ts_recent;
+ to.to_flags |= TOF_TS;
+ }
+#ifdef TCP_SIGNATURE
+ /* TCP-MD5 (RFC2385). */
+ if (tp->t_flags & TF_SIGNATURE)
+ to.to_flags |= TOF_SIGNATURE;
+#endif
+
+ /* Add the options. */
+ tlen += optlen = tcp_addoptions(&to, optp);
+
+ /* Update m_len in the correct mbuf. */
+ optm->m_len += optlen;
+ } else
+ optlen = 0;
#ifdef INET6
if (isipv6) {
ip6->ip6_flow = 0;
ip6->ip6_vfc = IPV6_VERSION;
ip6->ip6_nxt = IPPROTO_TCP;
- ip6->ip6_plen = 0; /* Set in ip6_output(). */
- tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
+ ip6->ip6_plen = htons(tlen - sizeof(*ip6));
}
#endif
#if defined(INET) && defined(INET6)
@@ -590,14 +1081,12 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
#endif
#ifdef INET
{
- tlen += sizeof (struct tcpiphdr);
- ip->ip_len = tlen;
+ ip->ip_len = htons(tlen);
ip->ip_ttl = V_ip_defttl;
if (V_path_mtu_discovery)
- ip->ip_off |= IP_DF;
+ ip->ip_off |= htons(IP_DF);
}
#endif
- m->m_len = tlen;
m->m_pkthdr.len = tlen;
m->m_pkthdr.rcvif = NULL;
#ifdef MAC
@@ -619,7 +1108,7 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
nth->th_seq = htonl(seq);
nth->th_ack = htonl(ack);
nth->th_x2 = 0;
- nth->th_off = sizeof (struct tcphdr) >> 2;
+ nth->th_off = (sizeof (struct tcphdr) + optlen) >> 2;
nth->th_flags = flags;
if (tp != NULL)
nth->th_win = htons((u_short) (win >> tp->rcv_scale));
@@ -627,6 +1116,13 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
nth->th_win = htons((u_short)win);
nth->th_urp = 0;
+#ifdef TCP_SIGNATURE
+ if (to.to_flags & TOF_SIGNATURE) {
+ tcp_signature_compute(m, 0, 0, optlen, to.to_signature,
+ IPSEC_DIR_OUTBOUND);
+ }
+#endif
+
m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
#ifdef INET6
if (isipv6) {
@@ -651,15 +1147,21 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
if (tp == NULL || (inp->inp_socket->so_options & SO_DEBUG))
tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0);
#endif
+ TCP_PROBE3(debug__output, tp, th, mtod(m, const char *));
+ if (flags & TH_RST)
+ TCP_PROBE5(accept__refused, NULL, NULL, mtod(m, const char *),
+ tp, nth);
+
+ TCP_PROBE5(send, NULL, tp, mtod(m, const char *), tp, nth);
#ifdef INET6
if (isipv6)
- (void) ip6_output(m, NULL, NULL, ipflags, NULL, NULL, inp);
+ (void) ip6_output(m, NULL, NULL, 0, NULL, NULL, inp);
#endif /* INET6 */
#if defined(INET) && defined(INET6)
else
#endif
#ifdef INET
- (void) ip_output(m, NULL, NULL, ipflags, NULL, inp);
+ (void) ip_output(m, NULL, NULL, 0, NULL, inp);
#endif
}
@@ -687,7 +1189,10 @@ tcp_newtcpcb(struct inpcb *inp)
tp->ccv = &tm->ccv;
tp->ccv->type = IPPROTO_TCP;
tp->ccv->ccvc.tcp = tp;
-
+ rw_rlock(&tcp_function_lock);
+ tp->t_fb = tcp_func_set_ptr;
+ refcount_acquire(&tp->t_fb->tfb_refcnt);
+ rw_runlock(&tcp_function_lock);
/*
* Use the current system default CC algorithm.
*/
@@ -698,12 +1203,18 @@ tcp_newtcpcb(struct inpcb *inp)
if (CC_ALGO(tp)->cb_init != NULL)
if (CC_ALGO(tp)->cb_init(tp->ccv) > 0) {
+ if (tp->t_fb->tfb_tcp_fb_fini)
+ (*tp->t_fb->tfb_tcp_fb_fini)(tp, 1);
+ refcount_release(&tp->t_fb->tfb_refcnt);
uma_zfree(V_tcpcb_zone, tm);
return (NULL);
}
tp->osd = &tm->osd;
if (khelp_init_osd(HELPER_CLASS_TCP, tp->osd)) {
+ if (tp->t_fb->tfb_tcp_fb_fini)
+ (*tp->t_fb->tfb_tcp_fb_fini)(tp, 1);
+ refcount_release(&tp->t_fb->tfb_refcnt);
uma_zfree(V_tcpcb_zone, tm);
return (NULL);
}
@@ -713,25 +1224,31 @@ tcp_newtcpcb(struct inpcb *inp)
#endif
tp->t_timers = &tm->tt;
/* LIST_INIT(&tp->t_segq); */ /* XXX covered by M_ZERO */
- tp->t_maxseg = tp->t_maxopd =
+ tp->t_maxseg =
#ifdef INET6
isipv6 ? V_tcp_v6mssdflt :
#endif /* INET6 */
V_tcp_mssdflt;
/* Set up our timeouts. */
- callout_init(&tp->t_timers->tt_rexmt, CALLOUT_MPSAFE);
- callout_init(&tp->t_timers->tt_persist, CALLOUT_MPSAFE);
- callout_init(&tp->t_timers->tt_keep, CALLOUT_MPSAFE);
- callout_init(&tp->t_timers->tt_2msl, CALLOUT_MPSAFE);
- callout_init(&tp->t_timers->tt_delack, CALLOUT_MPSAFE);
+ callout_init(&tp->t_timers->tt_rexmt, 1);
+ callout_init(&tp->t_timers->tt_persist, 1);
+ callout_init(&tp->t_timers->tt_keep, 1);
+ callout_init(&tp->t_timers->tt_2msl, 1);
+ callout_init(&tp->t_timers->tt_delack, 1);
if (V_tcp_do_rfc1323)
tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
if (V_tcp_do_sack)
tp->t_flags |= TF_SACK_PERMIT;
TAILQ_INIT(&tp->snd_holes);
- tp->t_inpcb = inp; /* XXX */
+ /*
+ * The tcpcb will hold a reference on its inpcb until tcp_discardcb()
+ * is called.
+ */
+ in_pcbref(inp); /* Reference for tcpcb */
+ tp->t_inpcb = inp;
+
/*
* Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
* rtt estimate. Set rttvar so that srtt + 4 * rttvar gives
@@ -751,6 +1268,15 @@ tcp_newtcpcb(struct inpcb *inp)
*/
inp->inp_ip_ttl = V_ip_defttl;
inp->inp_ppcb = tp;
+#ifdef TCPPCAP
+ /*
+ * Init the TCP PCAP queues.
+ */
+ tcp_pcap_tcpcb_init(tp);
+#endif
+ if (tp->t_fb->tfb_tcp_fb_init) {
+ (*tp->t_fb->tfb_tcp_fb_init)(tp);
+ }
return (tp); /* XXX */
}
@@ -779,7 +1305,7 @@ tcp_ccalgounload(struct cc_algo *unload_algo)
VNET_LIST_RLOCK();
VNET_FOREACH(vnet_iter) {
CURVNET_SET(vnet_iter);
- INP_INFO_RLOCK(&V_tcbinfo);
+ INP_INFO_WLOCK(&V_tcbinfo);
/*
* New connections already part way through being initialised
* with the CC algo we're removing will not race with this code
@@ -809,7 +1335,7 @@ tcp_ccalgounload(struct cc_algo *unload_algo)
}
INP_WUNLOCK(inp);
}
- INP_INFO_RUNLOCK(&V_tcbinfo);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
}
VNET_LIST_RUNLOCK();
@@ -827,12 +1353,12 @@ tcp_drop(struct tcpcb *tp, int errno)
{
struct socket *so = tp->t_inpcb->inp_socket;
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_LOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(tp->t_inpcb);
if (TCPS_HAVERCVDSYN(tp->t_state)) {
- tp->t_state = TCPS_CLOSED;
- (void) tcp_output(tp);
+ tcp_state_change(tp, TCPS_CLOSED);
+ (void) tp->t_fb->tfb_tcp_output(tp);
TCPSTAT_INC(tcps_drops);
} else
TCPSTAT_INC(tcps_conndrops);
@@ -850,6 +1376,7 @@ tcp_discardcb(struct tcpcb *tp)
#ifdef INET6
int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */
+ int released;
INP_WLOCK_ASSERT(inp);
@@ -857,22 +1384,27 @@ tcp_discardcb(struct tcpcb *tp)
* Make sure that all of our timers are stopped before we delete the
* PCB.
*
- * XXXRW: Really, we would like to use callout_drain() here in order
- * to avoid races experienced in tcp_timer.c where a timer is already
- * executing at this point. However, we can't, both because we're
- * running in a context where we can't sleep, and also because we
- * hold locks required by the timers. What we instead need to do is
- * test to see if callout_drain() is required, and if so, defer some
- * portion of the remainder of tcp_discardcb() to an asynchronous
- * context that can callout_drain() and then continue. Some care
- * will be required to ensure that no further processing takes place
- * on the tcpcb, even though it hasn't been freed (a flag?).
+ * If stopping a timer fails, we schedule a discard function in same
+ * callout, and the last discard function called will take care of
+ * deleting the tcpcb.
*/
- callout_stop(&tp->t_timers->tt_rexmt);
- callout_stop(&tp->t_timers->tt_persist);
- callout_stop(&tp->t_timers->tt_keep);
- callout_stop(&tp->t_timers->tt_2msl);
- callout_stop(&tp->t_timers->tt_delack);
+ tp->t_timers->tt_draincnt = 0;
+ tcp_timer_stop(tp, TT_REXMT);
+ tcp_timer_stop(tp, TT_PERSIST);
+ tcp_timer_stop(tp, TT_KEEP);
+ tcp_timer_stop(tp, TT_2MSL);
+ tcp_timer_stop(tp, TT_DELACK);
+ if (tp->t_fb->tfb_tcp_timer_stop_all) {
+ /*
+ * Call the stop-all function of the methods,
+ * this function should call the tcp_timer_stop()
+ * method with each of the function specific timeouts.
+ * That stop will be called via the tfb_tcp_timer_stop()
+ * which should use the async drain function of the
+ * callout system (see tcp_var.h).
+ */
+ tp->t_fb->tfb_tcp_timer_stop_all(tp);
+ }
/*
* If we got enough samples through the srtt filter,
@@ -893,7 +1425,7 @@ tcp_discardcb(struct tcpcb *tp)
* Update the ssthresh always when the conditions below
* are satisfied. This gives us better new start value
* for the congestion avoidance for new connections.
- * ssthresh is only set if packet loss occured on a session.
+ * ssthresh is only set if packet loss occurred on a session.
*
* XXXRW: 'so' may be NULL here, and/or socket buffer may be
* being torn down. Ideally this code would not use 'so'.
@@ -909,14 +1441,14 @@ tcp_discardcb(struct tcpcb *tp)
ssthresh = 2;
ssthresh *= (u_long)(tp->t_maxseg +
#ifdef INET6
- (isipv6 ? sizeof (struct ip6_hdr) +
- sizeof (struct tcphdr) :
+ (isipv6 ? sizeof (struct ip6_hdr) +
+ sizeof (struct tcphdr) :
#endif
- sizeof (struct tcpiphdr)
+ sizeof (struct tcpiphdr)
#ifdef INET6
- )
+ )
#endif
- );
+ );
} else
ssthresh = 0;
metrics.rmx_ssthresh = ssthresh;
@@ -941,6 +1473,12 @@ tcp_discardcb(struct tcpcb *tp)
tcp_free_sackholes(tp);
+#ifdef TCPPCAP
+ /* Free the TCP PCAP queues. */
+ tcp_pcap_drain(&(tp->t_inpkts));
+ tcp_pcap_drain(&(tp->t_outpkts));
+#endif
+
/* Allow the CC algorithm to clean up after itself. */
if (CC_ALGO(tp)->cb_destroy != NULL)
CC_ALGO(tp)->cb_destroy(tp->ccv);
@@ -949,8 +1487,51 @@ tcp_discardcb(struct tcpcb *tp)
CC_ALGO(tp) = NULL;
inp->inp_ppcb = NULL;
- tp->t_inpcb = NULL;
- uma_zfree(V_tcpcb_zone, tp);
+ if (tp->t_timers->tt_draincnt == 0) {
+ /* We own the last reference on tcpcb, let's free it. */
+ if (tp->t_fb->tfb_tcp_fb_fini)
+ (*tp->t_fb->tfb_tcp_fb_fini)(tp, 1);
+ refcount_release(&tp->t_fb->tfb_refcnt);
+ tp->t_inpcb = NULL;
+ uma_zfree(V_tcpcb_zone, tp);
+ released = in_pcbrele_wlocked(inp);
+ KASSERT(!released, ("%s: inp %p should not have been released "
+ "here", __func__, inp));
+ }
+}
+
+/*
+ * Deferred-teardown handler for a tcpcb.  Each call drops one count from
+ * tt_draincnt; the call that brings it to zero runs the function block's
+ * tfb_tcp_fb_fini hook (if set), releases the function block reference,
+ * frees the tcpcb and drops a reference on the inpcb.
+ *
+ * NOTE(review): presumably this is the "discard function" that
+ * tcp_discardcb()'s comment says is scheduled in the callout when
+ * stopping a timer fails -- confirm against tcp_timer_stop().
+ *
+ * ptp is the tcpcb, passed as an untyped pointer.
+ */
+void
+tcp_timer_discard(void *ptp)
+{
+ struct inpcb *inp;
+ struct tcpcb *tp;
+
+ tp = (struct tcpcb *)ptp;
+ CURVNET_SET(tp->t_vnet);
+ INP_INFO_RLOCK(&V_tcbinfo);
+ inp = tp->t_inpcb;
+ KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL",
+ __func__, tp));
+ INP_WLOCK(inp);
+ KASSERT((tp->t_timers->tt_flags & TT_STOPPED) != 0,
+ ("%s: tcpcb has to be stopped here", __func__));
+ /* One fewer outstanding discard for this tcpcb. */
+ tp->t_timers->tt_draincnt--;
+ if (tp->t_timers->tt_draincnt == 0) {
+ /* We own the last reference on this tcpcb, let's free it. */
+ if (tp->t_fb->tfb_tcp_fb_fini)
+ (*tp->t_fb->tfb_tcp_fb_fini)(tp, 1);
+ refcount_release(&tp->t_fb->tfb_refcnt);
+ tp->t_inpcb = NULL;
+ uma_zfree(V_tcpcb_zone, tp);
+ /*
+ * On a nonzero return the INP_WUNLOCK() below is skipped --
+ * presumably the inpcb (and its lock) is gone at that point;
+ * confirm against in_pcbrele_wlocked().
+ */
+ if (in_pcbrele_wlocked(inp)) {
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ CURVNET_RESTORE();
+ return;
+ }
+ }
+ INP_WUNLOCK(inp);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ CURVNET_RESTORE();
+}
/*
@@ -963,15 +1544,27 @@ tcp_close(struct tcpcb *tp)
struct inpcb *inp = tp->t_inpcb;
struct socket *so;
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_LOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
#ifdef TCP_OFFLOAD
if (tp->t_state == TCPS_LISTEN)
tcp_offload_listen_stop(tp);
#endif
+#ifdef TCP_RFC7413
+ /*
+ * This releases the TFO pending counter resource for TFO listen
+ * sockets as well as passively-created TFO sockets that transition
+ * from SYN_RECEIVED to CLOSED.
+ */
+ if (tp->t_tfo_pending) {
+ tcp_fastopen_decrement_counter(tp->t_tfo_pending);
+ tp->t_tfo_pending = NULL;
+ }
+#endif
in_pcbdrop(inp);
TCPSTAT_INC(tcps_closed);
+ TCPSTATES_DEC(tp->t_state);
KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL"));
so = inp->inp_socket;
soisdisconnected(so);
@@ -1009,9 +1602,9 @@ tcp_drain(void)
* XXX: The "Net/3" implementation doesn't imply that the TCP
* reassembly queue should be flushed, but in a situation
* where we're really low on mbufs, this is potentially
- * usefull.
+ * useful.
*/
- INP_INFO_RLOCK(&V_tcbinfo);
+ INP_INFO_WLOCK(&V_tcbinfo);
LIST_FOREACH(inpb, V_tcbinfo.ipi_listhead, inp_list) {
if (inpb->inp_flags & INP_TIMEWAIT)
continue;
@@ -1019,10 +1612,17 @@ tcp_drain(void)
if ((tcpb = intotcpcb(inpb)) != NULL) {
tcp_reass_flush(tcpb);
tcp_clean_sackreport(tcpb);
+#ifdef TCPPCAP
+ if (tcp_pcap_aggressive_free) {
+ /* Free the TCP PCAP queues. */
+ tcp_pcap_drain(&(tcpb->t_inpkts));
+ tcp_pcap_drain(&(tcpb->t_outpkts));
+ }
+#endif
}
INP_WUNLOCK(inpb);
}
- INP_INFO_RUNLOCK(&V_tcbinfo);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
}
VNET_LIST_RUNLOCK_NOSLEEP();
@@ -1041,7 +1641,7 @@ tcp_notify(struct inpcb *inp, int error)
{
struct tcpcb *tp;
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_LOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
if ((inp->inp_flags & INP_TIMEWAIT) ||
@@ -1061,6 +1661,10 @@ tcp_notify(struct inpcb *inp, int error)
if (tp->t_state == TCPS_ESTABLISHED &&
(error == EHOSTUNREACH || error == ENETUNREACH ||
error == EHOSTDOWN)) {
+ if (inp->inp_route.ro_rt) {
+ RTFREE(inp->inp_route.ro_rt);
+ inp->inp_route.ro_rt = (struct rtentry *)NULL;
+ }
return (inp);
} else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 &&
tp->t_softerror) {
@@ -1093,7 +1697,8 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS)
* resource-intensive to repeat twice on every request.
*/
if (req->oldptr == NULL) {
- n = V_tcbinfo.ipi_count + syncache_pcbcount();
+ n = V_tcbinfo.ipi_count +
+ counter_u64_fetch(V_tcps_states[TCPS_SYN_RECEIVED]);
n += imax(n / 8, 10);
req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb);
return (0);
@@ -1105,12 +1710,12 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS)
/*
* OK, now we're committed to doing something.
*/
- INP_INFO_RLOCK(&V_tcbinfo);
+ INP_LIST_RLOCK(&V_tcbinfo);
gencnt = V_tcbinfo.ipi_gencnt;
n = V_tcbinfo.ipi_count;
- INP_INFO_RUNLOCK(&V_tcbinfo);
+ INP_LIST_RUNLOCK(&V_tcbinfo);
- m = syncache_pcbcount();
+ m = counter_u64_fetch(V_tcps_states[TCPS_SYN_RECEIVED]);
error = sysctl_wire_old_buffer(req, 2 * (sizeof xig)
+ (n + m) * sizeof(struct xtcpcb));
@@ -1130,10 +1735,8 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS)
return (error);
inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
- if (inp_list == NULL)
- return (ENOMEM);
- INP_INFO_RLOCK(&V_tcbinfo);
+ INP_INFO_WLOCK(&V_tcbinfo);
for (inp = LIST_FIRST(V_tcbinfo.ipi_listhead), i = 0;
inp != NULL && i < n; inp = LIST_NEXT(inp, inp_list)) {
INP_WLOCK(inp);
@@ -1158,7 +1761,7 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS)
}
INP_WUNLOCK(inp);
}
- INP_INFO_RUNLOCK(&V_tcbinfo);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
n = i;
error = 0;
@@ -1196,14 +1799,14 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS)
} else
INP_RUNLOCK(inp);
}
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
for (i = 0; i < n; i++) {
inp = inp_list[i];
INP_RLOCK(inp);
if (!in_pcbrele_rlocked(inp))
INP_RUNLOCK(inp);
}
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
if (!error) {
/*
@@ -1213,11 +1816,11 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS)
* while we were processing this request, and it
* might be necessary to retry.
*/
- INP_INFO_RLOCK(&V_tcbinfo);
+ INP_LIST_RLOCK(&V_tcbinfo);
xig.xig_gen = V_tcbinfo.ipi_gencnt;
xig.xig_sogen = so_gencnt;
xig.xig_count = V_tcbinfo.ipi_count + pcb_count;
- INP_INFO_RUNLOCK(&V_tcbinfo);
+ INP_LIST_RUNLOCK(&V_tcbinfo);
error = SYSCTL_OUT(req, &xig, sizeof xig);
}
free(inp_list, M_TEMP);
@@ -1354,16 +1957,7 @@ tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
else if (V_icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && ip)
notify = tcp_drop_syn_sent;
- /*
- * Redirects don't need to be handled up here.
- */
- else if (PRC_IS_REDIRECT(cmd))
- return;
- /*
- * Source quench is depreciated.
- */
- else if (cmd == PRC_QUENCH)
- return;
+
/*
* Hostdead is ugly because it goes linearly through all PCBs.
* XXX: We never get this from ICMP, otherwise it makes an
@@ -1373,75 +1967,79 @@ tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
ip = NULL;
else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)
return;
- if (ip != NULL) {
- icp = (struct icmp *)((caddr_t)ip
- - offsetof(struct icmp, icmp_ip));
- th = (struct tcphdr *)((caddr_t)ip
- + (ip->ip_hl << 2));
- INP_INFO_WLOCK(&V_tcbinfo);
- inp = in_pcblookup(&V_tcbinfo, faddr, th->th_dport,
- ip->ip_src, th->th_sport, INPLOOKUP_WLOCKPCB, NULL);
- if (inp != NULL) {
- if (!(inp->inp_flags & INP_TIMEWAIT) &&
- !(inp->inp_flags & INP_DROPPED) &&
- !(inp->inp_socket == NULL)) {
- icmp_tcp_seq = htonl(th->th_seq);
- tp = intotcpcb(inp);
- if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) &&
- SEQ_LT(icmp_tcp_seq, tp->snd_max)) {
- if (cmd == PRC_MSGSIZE) {
- /*
- * MTU discovery:
- * If we got a needfrag set the MTU
- * in the route to the suggested new
- * value (if given) and then notify.
- */
- bzero(&inc, sizeof(inc));
- inc.inc_faddr = faddr;
- inc.inc_fibnum =
- inp->inp_inc.inc_fibnum;
-
- mtu = ntohs(icp->icmp_nextmtu);
- /*
- * If no alternative MTU was
- * proposed, try the next smaller
- * one. ip->ip_len has already
- * been swapped in icmp_input().
- */
- if (!mtu)
- mtu = ip_next_mtu(ip->ip_len,
- 1);
- if (mtu < V_tcp_minmss
- + sizeof(struct tcpiphdr))
- mtu = V_tcp_minmss
- + sizeof(struct tcpiphdr);
- /*
- * Only cache the MTU if it
- * is smaller than the interface
- * or route MTU. tcp_mtudisc()
- * will do right thing by itself.
- */
- if (mtu <= tcp_maxmtu(&inc, NULL))
+
+ if (ip == NULL) {
+ in_pcbnotifyall(&V_tcbinfo, faddr, inetctlerrmap[cmd], notify);
+ return;
+ }
+
+ icp = (struct icmp *)((caddr_t)ip - offsetof(struct icmp, icmp_ip));
+ th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
+ INP_INFO_RLOCK(&V_tcbinfo);
+ inp = in_pcblookup(&V_tcbinfo, faddr, th->th_dport, ip->ip_src,
+ th->th_sport, INPLOOKUP_WLOCKPCB, NULL);
+ if (inp != NULL && PRC_IS_REDIRECT(cmd)) {
+ /* signal EHOSTDOWN, as it flushes the cached route */
+ inp = (*notify)(inp, EHOSTDOWN);
+ if (inp != NULL)
+ INP_WUNLOCK(inp);
+ } else if (inp != NULL) {
+ if (!(inp->inp_flags & INP_TIMEWAIT) &&
+ !(inp->inp_flags & INP_DROPPED) &&
+ !(inp->inp_socket == NULL)) {
+ icmp_tcp_seq = ntohl(th->th_seq);
+ tp = intotcpcb(inp);
+ if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) &&
+ SEQ_LT(icmp_tcp_seq, tp->snd_max)) {
+ if (cmd == PRC_MSGSIZE) {
+ /*
+ * MTU discovery:
+ * If we got a needfrag set the MTU
+ * in the route to the suggested new
+ * value (if given) and then notify.
+ */
+ mtu = ntohs(icp->icmp_nextmtu);
+ /*
+ * If no alternative MTU was
+ * proposed, try the next smaller
+ * one.
+ */
+ if (!mtu)
+ mtu = ip_next_mtu(
+ ntohs(ip->ip_len), 1);
+ if (mtu < V_tcp_minmss +
+ sizeof(struct tcpiphdr))
+ mtu = V_tcp_minmss +
+ sizeof(struct tcpiphdr);
+ /*
+ * Only process the offered MTU if it
+ * is smaller than the current one.
+ */
+ if (mtu < tp->t_maxseg +
+ sizeof(struct tcpiphdr)) {
+ bzero(&inc, sizeof(inc));
+ inc.inc_faddr = faddr;
+ inc.inc_fibnum =
+ inp->inp_inc.inc_fibnum;
tcp_hc_updatemtu(&inc, mtu);
- tcp_mtudisc(inp, mtu);
- } else
- inp = (*notify)(inp,
- inetctlerrmap[cmd]);
- }
+ tcp_mtudisc(inp, mtu);
+ }
+ } else
+ inp = (*notify)(inp,
+ inetctlerrmap[cmd]);
}
- if (inp != NULL)
- INP_WUNLOCK(inp);
- } else {
- bzero(&inc, sizeof(inc));
- inc.inc_fport = th->th_dport;
- inc.inc_lport = th->th_sport;
- inc.inc_faddr = faddr;
- inc.inc_laddr = ip->ip_src;
- syncache_unreach(&inc, th);
}
- INP_INFO_WUNLOCK(&V_tcbinfo);
- } else
- in_pcbnotifyall(&V_tcbinfo, faddr, inetctlerrmap[cmd], notify);
+ if (inp != NULL)
+ INP_WUNLOCK(inp);
+ } else {
+ bzero(&inc, sizeof(inc));
+ inc.inc_fport = th->th_dport;
+ inc.inc_lport = th->th_sport;
+ inc.inc_faddr = faddr;
+ inc.inc_laddr = ip->ip_src;
+ syncache_unreach(&inc, th);
+ }
+ INP_INFO_RUNLOCK(&V_tcbinfo);
}
#endif /* INET */
@@ -1449,75 +2047,146 @@ tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
void
tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d)
{
- struct tcphdr th;
+ struct in6_addr *dst;
+ struct tcphdr *th;
struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify;
struct ip6_hdr *ip6;
struct mbuf *m;
+ struct inpcb *inp;
+ struct tcpcb *tp;
+ struct icmp6_hdr *icmp6;
struct ip6ctlparam *ip6cp = NULL;
const struct sockaddr_in6 *sa6_src = NULL;
- int off;
- struct tcp_portonly {
- u_int16_t th_sport;
- u_int16_t th_dport;
- } *thp;
+ struct in_conninfo inc;
+ tcp_seq icmp_tcp_seq;
+ unsigned int mtu;
+ unsigned int off;
+
if (sa->sa_family != AF_INET6 ||
sa->sa_len != sizeof(struct sockaddr_in6))
return;
- if (cmd == PRC_MSGSIZE)
- notify = tcp_mtudisc_notify;
- else if (!PRC_IS_REDIRECT(cmd) &&
- ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0))
- return;
- /* Source quench is depreciated. */
- else if (cmd == PRC_QUENCH)
- return;
-
/* if the parameter is from icmp6, decode it. */
if (d != NULL) {
ip6cp = (struct ip6ctlparam *)d;
+ icmp6 = ip6cp->ip6c_icmp6;
m = ip6cp->ip6c_m;
ip6 = ip6cp->ip6c_ip6;
off = ip6cp->ip6c_off;
sa6_src = ip6cp->ip6c_src;
+ dst = ip6cp->ip6c_finaldst;
} else {
m = NULL;
ip6 = NULL;
off = 0; /* fool gcc */
sa6_src = &sa6_any;
+ dst = NULL;
}
- if (ip6 != NULL) {
- struct in_conninfo inc;
- /*
- * XXX: We assume that when IPV6 is non NULL,
- * M and OFF are valid.
- */
+ if (cmd == PRC_MSGSIZE)
+ notify = tcp_mtudisc_notify;
+ else if (V_icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
+ cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) &&
+ ip6 != NULL)
+ notify = tcp_drop_syn_sent;
- /* check if we can safely examine src and dst ports */
- if (m->m_pkthdr.len < off + sizeof(*thp))
- return;
+ /*
+ * Hostdead is ugly because it goes linearly through all PCBs.
+ * XXX: We never get this from ICMP, otherwise it makes an
+ * excellent DoS attack on machines with many connections.
+ */
+ else if (cmd == PRC_HOSTDEAD)
+ ip6 = NULL;
+ else if ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0)
+ return;
- bzero(&th, sizeof(th));
- m_copydata(m, off, sizeof(*thp), (caddr_t)&th);
+ if (ip6 == NULL) {
+ in6_pcbnotify(&V_tcbinfo, sa, 0,
+ (const struct sockaddr *)sa6_src,
+ 0, cmd, NULL, notify);
+ return;
+ }
- in6_pcbnotify(&V_tcbinfo, sa, th.th_dport,
- (struct sockaddr *)ip6cp->ip6c_src,
- th.th_sport, cmd, NULL, notify);
+ /* Check if we can safely get the ports from the tcp hdr */
+ if (m == NULL ||
+ (m->m_pkthdr.len <
+ (int32_t) (off + offsetof(struct tcphdr, th_seq)))) {
+ return;
+ }
+ th = (struct tcphdr *) mtodo(ip6cp->ip6c_m, ip6cp->ip6c_off);
+ INP_INFO_RLOCK(&V_tcbinfo);
+ inp = in6_pcblookup(&V_tcbinfo, &ip6->ip6_dst, th->th_dport,
+ &ip6->ip6_src, th->th_sport, INPLOOKUP_WLOCKPCB, NULL);
+ if (inp != NULL && PRC_IS_REDIRECT(cmd)) {
+ /* signal EHOSTDOWN, as it flushes the cached route */
+ inp = (*notify)(inp, EHOSTDOWN);
+ if (inp != NULL)
+ INP_WUNLOCK(inp);
+ } else if (inp != NULL) {
+ if (!(inp->inp_flags & INP_TIMEWAIT) &&
+ !(inp->inp_flags & INP_DROPPED) &&
+ !(inp->inp_socket == NULL)) {
+ icmp_tcp_seq = ntohl(th->th_seq);
+ tp = intotcpcb(inp);
+ if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) &&
+ SEQ_LT(icmp_tcp_seq, tp->snd_max)) {
+ if (cmd == PRC_MSGSIZE) {
+ /*
+ * MTU discovery:
+ * If we got a needfrag set the MTU
+ * in the route to the suggested new
+ * value (if given) and then notify.
+ */
+ mtu = ntohl(icmp6->icmp6_mtu);
+ /*
+ * If no alternative MTU was
+ * proposed, or the proposed
+ * MTU was too small, set to
+ * the min.
+ */
+ if (mtu < IPV6_MMTU)
+ mtu = IPV6_MMTU - 8;
+
+
+ bzero(&inc, sizeof(inc));
+ inc.inc_fibnum = M_GETFIB(m);
+ inc.inc_flags |= INC_ISIPV6;
+ inc.inc6_faddr = *dst;
+ if (in6_setscope(&inc.inc6_faddr,
+ m->m_pkthdr.rcvif, NULL))
+ goto unlock_inp;
+
+ /*
+ * Only process the offered MTU if it
+ * is smaller than the current one.
+ */
+ if (mtu < tp->t_maxseg +
+ (sizeof (*th) + sizeof (*ip6))) {
+ tcp_hc_updatemtu(&inc, mtu);
+ tcp_mtudisc(inp, mtu);
+ ICMP6STAT_INC(icp6s_pmtuchg);
+ }
+ } else
+ inp = (*notify)(inp,
+ inet6ctlerrmap[cmd]);
+ }
+ }
+unlock_inp:
+ if (inp != NULL)
+ INP_WUNLOCK(inp);
+ } else {
bzero(&inc, sizeof(inc));
- inc.inc_fport = th.th_dport;
- inc.inc_lport = th.th_sport;
- inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr;
- inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr;
+ inc.inc_fibnum = M_GETFIB(m);
inc.inc_flags |= INC_ISIPV6;
- INP_INFO_WLOCK(&V_tcbinfo);
- syncache_unreach(&inc, &th);
- INP_INFO_WUNLOCK(&V_tcbinfo);
- } else
- in6_pcbnotify(&V_tcbinfo, sa, 0, (const struct sockaddr *)sa6_src,
- 0, cmd, NULL, notify);
+ inc.inc_fport = th->th_dport;
+ inc.inc_lport = th->th_sport;
+ inc.inc6_faddr = *dst;
+ inc.inc6_laddr = ip6->ip6_src;
+ syncache_unreach(&inc, th);
+ }
+ INP_INFO_RUNLOCK(&V_tcbinfo);
}
#endif /* INET6 */
@@ -1647,7 +2316,7 @@ tcp_drop_syn_sent(struct inpcb *inp, int errno)
{
struct tcpcb *tp;
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
if ((inp->inp_flags & INP_TIMEWAIT) ||
@@ -1675,10 +2344,11 @@ static struct inpcb *
tcp_mtudisc_notify(struct inpcb *inp, int error)
{
- return (tcp_mtudisc(inp, -1));
+ tcp_mtudisc(inp, -1);
+ return (inp);
}
-struct inpcb *
+static void
tcp_mtudisc(struct inpcb *inp, int mtuoffer)
{
struct tcpcb *tp;
@@ -1687,7 +2357,7 @@ tcp_mtudisc(struct inpcb *inp, int mtuoffer)
INP_WLOCK_ASSERT(inp);
if ((inp->inp_flags & INP_TIMEWAIT) ||
(inp->inp_flags & INP_DROPPED))
- return (inp);
+ return;
tp = intotcpcb(inp);
KASSERT(tp != NULL, ("tcp_mtudisc: tp == NULL"));
@@ -1708,8 +2378,7 @@ tcp_mtudisc(struct inpcb *inp, int mtuoffer)
tp->snd_recover = tp->snd_max;
if (tp->t_flags & TF_SACK_PERMIT)
EXIT_FASTRECOVERY(tp->t_flags);
- tcp_output(tp);
- return (inp);
+ tp->t_fb->tfb_tcp_output(tp);
}
#ifdef INET
@@ -1722,27 +2391,20 @@ tcp_mtudisc(struct inpcb *inp, int mtuoffer)
u_long
tcp_maxmtu(struct in_conninfo *inc, struct tcp_ifcap *cap)
{
- struct route sro;
- struct sockaddr_in *dst;
+ struct nhop4_extended nh4;
struct ifnet *ifp;
u_long maxmtu = 0;
KASSERT(inc != NULL, ("tcp_maxmtu with NULL in_conninfo pointer"));
- bzero(&sro, sizeof(sro));
if (inc->inc_faddr.s_addr != INADDR_ANY) {
- dst = (struct sockaddr_in *)&sro.ro_dst;
- dst->sin_family = AF_INET;
- dst->sin_len = sizeof(*dst);
- dst->sin_addr = inc->inc_faddr;
- in_rtalloc_ign(&sro, 0, inc->inc_fibnum);
- }
- if (sro.ro_rt != NULL) {
- ifp = sro.ro_rt->rt_ifp;
- if (sro.ro_rt->rt_rmx.rmx_mtu == 0)
- maxmtu = ifp->if_mtu;
- else
- maxmtu = min(sro.ro_rt->rt_rmx.rmx_mtu, ifp->if_mtu);
+
+ if (fib4_lookup_nh_ext(inc->inc_fibnum, inc->inc_faddr,
+ NHR_REF, 0, &nh4) != 0)
+ return (0);
+
+ ifp = nh4.nh_ifp;
+ maxmtu = nh4.nh_mtu;
/* Report additional interface capabilities. */
if (cap != NULL) {
@@ -1754,7 +2416,7 @@ tcp_maxmtu(struct in_conninfo *inc, struct tcp_ifcap *cap)
cap->tsomaxsegsize = ifp->if_hw_tsomaxsegsize;
}
}
- RTFREE(sro.ro_rt);
+ fib4_free_nh_ext(inc->inc_fibnum, &nh4);
}
return (maxmtu);
}
@@ -1764,26 +2426,22 @@ tcp_maxmtu(struct in_conninfo *inc, struct tcp_ifcap *cap)
u_long
tcp_maxmtu6(struct in_conninfo *inc, struct tcp_ifcap *cap)
{
- struct route_in6 sro6;
+ struct nhop6_extended nh6;
+ struct in6_addr dst6;
+ uint32_t scopeid;
struct ifnet *ifp;
u_long maxmtu = 0;
KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer"));
- bzero(&sro6, sizeof(sro6));
if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) {
- sro6.ro_dst.sin6_family = AF_INET6;
- sro6.ro_dst.sin6_len = sizeof(struct sockaddr_in6);
- sro6.ro_dst.sin6_addr = inc->inc6_faddr;
- in6_rtalloc_ign(&sro6, 0, inc->inc_fibnum);
- }
- if (sro6.ro_rt != NULL) {
- ifp = sro6.ro_rt->rt_ifp;
- if (sro6.ro_rt->rt_rmx.rmx_mtu == 0)
- maxmtu = IN6_LINKMTU(sro6.ro_rt->rt_ifp);
- else
- maxmtu = min(sro6.ro_rt->rt_rmx.rmx_mtu,
- IN6_LINKMTU(sro6.ro_rt->rt_ifp));
+ in6_splitscope(&inc->inc6_faddr, &dst6, &scopeid);
+ if (fib6_lookup_nh_ext(inc->inc_fibnum, &dst6, scopeid, 0,
+ 0, &nh6) != 0)
+ return (0);
+
+ ifp = nh6.nh_ifp;
+ maxmtu = nh6.nh_mtu;
/* Report additional interface capabilities. */
if (cap != NULL) {
@@ -1795,13 +2453,66 @@ tcp_maxmtu6(struct in_conninfo *inc, struct tcp_ifcap *cap)
cap->tsomaxsegsize = ifp->if_hw_tsomaxsegsize;
}
}
- RTFREE(sro6.ro_rt);
+ fib6_free_nh_ext(inc->inc_fibnum, &nh6);
}
return (maxmtu);
}
#endif /* INET6 */
+/*
+ * Calculate effective SMSS per RFC5681 definition for a given TCP
+ * connection at its current state, taking into account SACK, etc.
+ */
+u_int
+tcp_maxseg(const struct tcpcb *tp)
+{
+ u_int optlen;
+
+ if (tp->t_flags & TF_NOOPT)
+ return (tp->t_maxseg);
+
+ /*
+ * Here we have simplified code from tcp_addoptions(),
+ * without a proper loop, and with most of the padding hardcoded.
+ * We might make mistakes with padding here in some edge cases,
+ * but this is harmless, since result of tcp_maxseg() is used
+ * only in cwnd and ssthresh estimations.
+ */
+#define PAD(len) ((((len) / 4) + !!((len) % 4)) * 4)
+ if (TCPS_HAVEESTABLISHED(tp->t_state)) {
+ if (tp->t_flags & TF_RCVD_TSTMP)
+ optlen = TCPOLEN_TSTAMP_APPA;
+ else
+ optlen = 0;
+#ifdef TCP_SIGNATURE
+ if (tp->t_flags & TF_SIGNATURE)
+ optlen += PAD(TCPOLEN_SIGNATURE);
+#endif
+ if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks > 0) {
+ optlen += TCPOLEN_SACKHDR;
+ optlen += tp->rcv_numsacks * TCPOLEN_SACK;
+ optlen = PAD(optlen);
+ }
+ } else {
+ if (tp->t_flags & TF_REQ_TSTMP)
+ optlen = TCPOLEN_TSTAMP_APPA;
+ else
+ optlen = PAD(TCPOLEN_MAXSEG);
+ if (tp->t_flags & TF_REQ_SCALE)
+ optlen += PAD(TCPOLEN_WINDOW);
+#ifdef TCP_SIGNATURE
+ if (tp->t_flags & TF_SIGNATURE)
+ optlen += PAD(TCPOLEN_SIGNATURE);
+#endif
+ if (tp->t_flags & TF_SACK_PERMIT)
+ optlen += PAD(TCPOLEN_SACK_PERMITTED);
+ }
+#undef PAD
+ optlen = min(optlen, TCP_MAXOLEN);
+ return (tp->t_maxseg - optlen);
+}
+
#ifdef IPSEC
/* compute ESP/AH header size for TCP, including outer IP header. */
size_t
@@ -1816,9 +2527,10 @@ ipsec_hdrsiz_tcp(struct tcpcb *tp)
#endif
struct tcphdr *th;
- if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL))
+ if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL) ||
+ (!key_havesp(IPSEC_DIR_OUTBOUND)))
return (0);
- MGETHDR(m, M_DONTWAIT, MT_DATA);
+ m = m_gethdr(M_NOWAIT, MT_DATA);
if (!m)
return (0);
@@ -1859,55 +2571,20 @@ tcp_signature_apply(void *fstate, void *data, u_int len)
}
/*
- * Compute TCP-MD5 hash of a TCP segment. (RFC2385)
- *
- * Parameters:
- * m pointer to head of mbuf chain
- * _unused
- * len length of TCP segment data, excluding options
- * optlen length of TCP segment options
- * buf pointer to storage for computed MD5 digest
- * direction direction of flow (IPSEC_DIR_INBOUND or OUTBOUND)
- *
- * We do this over ip, tcphdr, segment data, and the key in the SADB.
- * When called from tcp_input(), we can be sure that th_sum has been
- * zeroed out and verified already.
- *
- * Return 0 if successful, otherwise return -1.
- *
* XXX The key is retrieved from the system's PF_KEY SADB, by keying a
* search with the destination IP address, and a 'magic SPI' to be
* determined by the application. This is hardcoded elsewhere to 1179
- * right now. Another branch of this code exists which uses the SPD to
- * specify per-application flows but it is unstable.
- */
-int
-tcp_signature_compute(struct mbuf *m, int _unused, int len, int optlen,
- u_char *buf, u_int direction)
+*/
+struct secasvar *
+tcp_get_sav(struct mbuf *m, u_int direction)
{
union sockaddr_union dst;
-#ifdef INET
- struct ippseudo ippseudo;
-#endif
- MD5_CTX ctx;
- int doff;
- struct ip *ip;
-#ifdef INET
- struct ipovly *ipovly;
-#endif
struct secasvar *sav;
- struct tcphdr *th;
+ struct ip *ip;
#ifdef INET6
struct ip6_hdr *ip6;
- struct in6_addr in6;
char ip6buf[INET6_ADDRSTRLEN];
- uint32_t plen;
- uint16_t nhdr;
#endif
- u_short savecsum;
-
- KASSERT(m != NULL, ("NULL mbuf chain"));
- KASSERT(buf != NULL, ("NULL signature pointer"));
/* Extract the destination from the IP header in the mbuf. */
bzero(&dst, sizeof(union sockaddr_union));
@@ -1934,7 +2611,7 @@ tcp_signature_compute(struct mbuf *m, int _unused, int len, int optlen,
break;
#endif
default:
- return (EINVAL);
+ return (NULL);
/* NOTREACHED */
break;
}
@@ -1949,9 +2626,61 @@ tcp_signature_compute(struct mbuf *m, int _unused, int len, int optlen,
ip6_sprintf(ip6buf, &dst.sin6.sin6_addr) :
#endif
"(unsupported)"));
- return (EINVAL);
}
+ return (sav);
+}
+
+/*
+ * Compute TCP-MD5 hash of a TCP segment. (RFC2385)
+ *
+ * Parameters:
+ * m pointer to head of mbuf chain
+ * len length of TCP segment data, excluding options
+ * optlen length of TCP segment options
+ * buf pointer to storage for computed MD5 digest
+ * sav pointer to security association
+ *
+ * We do this over ip, tcphdr, segment data, and the key in the SADB.
+ * When called from tcp_input(), we can be sure that th_sum has been
+ * zeroed out and verified already.
+ *
+ * Releases reference to SADB key before return.
+ *
+ * Return 0 if successful, otherwise return -1.
+ *
+ */
+int
+tcp_signature_do_compute(struct mbuf *m, int len, int optlen,
+ u_char *buf, struct secasvar *sav)
+{
+#ifdef INET
+ struct ippseudo ippseudo;
+#endif
+ MD5_CTX ctx;
+ int doff;
+ struct ip *ip;
+#ifdef INET
+ struct ipovly *ipovly;
+#endif
+ struct tcphdr *th;
+#ifdef INET6
+ struct ip6_hdr *ip6;
+ struct in6_addr in6;
+ uint32_t plen;
+ uint16_t nhdr;
+#endif
+ u_short savecsum;
+
+ KASSERT(m != NULL, ("NULL mbuf chain"));
+ KASSERT(buf != NULL, ("NULL signature pointer"));
+
+ /* Locate the IP header at the start of the mbuf. */
+ ip = mtod(m, struct ip *);
+#ifdef INET6
+ ip6 = NULL; /* Make the compiler happy. */
+#endif
+
MD5Init(&ctx);
/*
* Step 1: Update MD5 hash with IP(v6) pseudo-header.
@@ -2008,7 +2737,8 @@ tcp_signature_compute(struct mbuf *m, int _unused, int len, int optlen,
break;
#endif
default:
- return (EINVAL);
+ KEY_FREESAV(&sav);
+ return (-1);
/* NOTREACHED */
break;
}
@@ -2042,6 +2772,23 @@ tcp_signature_compute(struct mbuf *m, int _unused, int len, int optlen,
}
/*
+ * Compute TCP-MD5 hash of a TCP segment. (RFC2385)
+ *
+ * Return 0 if successful, otherwise return -1.
+ */
+int
+tcp_signature_compute(struct mbuf *m, int _unused, int len, int optlen,
+ u_char *buf, u_int direction)
+{
+ struct secasvar *sav;
+
+ if ((sav = tcp_get_sav(m, direction)) == NULL)
+ return (-1);
+
+ return (tcp_signature_do_compute(m, len, optlen, buf, sav));
+}
+
+/*
* Verify the TCP-MD5 hash of a TCP segment. (RFC2385)
*
* Parameters:
@@ -2170,7 +2917,7 @@ sysctl_drop(SYSCTL_HANDLER_ARGS)
default:
return (EINVAL);
}
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
switch (addrs[0].ss_family) {
#ifdef INET6
case AF_INET6:
@@ -2209,12 +2956,12 @@ sysctl_drop(SYSCTL_HANDLER_ARGS)
INP_WUNLOCK(inp);
} else
error = ESRCH;
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
return (error);
}
-SYSCTL_VNET_PROC(_net_inet_tcp, TCPCTL_DROP, drop,
- CTLTYPE_STRUCT|CTLFLAG_WR|CTLFLAG_SKIP, NULL,
+SYSCTL_PROC(_net_inet_tcp, TCPCTL_DROP, drop,
+ CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP, NULL,
0, sysctl_drop, "", "Drop TCP connection");
/*
@@ -2332,3 +3079,21 @@ tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr,
panic("%s: string too long", __func__);
return (s);
}
+
+/*
+ * A subroutine which makes it easy to track TCP state changes with DTrace.
+ * This function shouldn't be called for t_state initializations that don't
+ * correspond to actual TCP state transitions.
+ */
+void
+tcp_state_change(struct tcpcb *tp, int newstate)
+{
+#if defined(KDTRACE_HOOKS)
+ int pstate = tp->t_state;
+#endif
+
+ TCPSTATES_DEC(tp->t_state);
+ TCPSTATES_INC(newstate);
+ tp->t_state = newstate;
+ TCP_PROBE6(state__change, NULL, tp, NULL, tp, NULL, pstate);
+}
diff --git a/freebsd/sys/netinet/tcp_syncache.c b/freebsd/sys/netinet/tcp_syncache.c
index 10bd00ae..d7da3a01 100644
--- a/freebsd/sys/netinet/tcp_syncache.c
+++ b/freebsd/sys/netinet/tcp_syncache.c
@@ -2,13 +2,13 @@
/*-
* Copyright (c) 2001 McAfee, Inc.
- * Copyright (c) 2006 Andre Oppermann, Internet Business Solutions AG
+ * Copyright (c) 2006,2013 Andre Oppermann, Internet Business Solutions AG
* All rights reserved.
*
* This software was developed for the FreeBSD Project by Jonathan Lemon
* and McAfee Research, the Security Research Division of McAfee, Inc. under
* DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
- * DARPA CHATS research program.
+ * DARPA CHATS research program. [2001 McAfee, Inc.]
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -42,6 +42,8 @@ __FBSDID("$FreeBSD$");
#include <rtems/bsd/sys/param.h>
#include <sys/systm.h>
+#include <sys/hash.h>
+#include <sys/refcount.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/limits.h>
@@ -49,7 +51,6 @@ __FBSDID("$FreeBSD$");
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
-#include <sys/md5.h>
#include <sys/proc.h> /* for proc0 declaration */
#include <sys/random.h>
#include <sys/socket.h>
@@ -57,9 +58,13 @@ __FBSDID("$FreeBSD$");
#include <sys/syslog.h>
#include <sys/ucred.h>
+#include <sys/md5.h>
+#include <crypto/siphash/siphash.h>
+
#include <vm/uma.h>
#include <net/if.h>
+#include <net/if_var.h>
#include <net/route.h>
#include <net/vnet.h>
@@ -78,6 +83,9 @@ __FBSDID("$FreeBSD$");
#include <netinet6/in6_pcb.h>
#endif
#include <netinet/tcp.h>
+#ifdef TCP_RFC7413
+#include <netinet/tcp_fastopen.h>
+#endif
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
@@ -104,13 +112,13 @@ __FBSDID("$FreeBSD$");
static VNET_DEFINE(int, tcp_syncookies) = 1;
#define V_tcp_syncookies VNET(tcp_syncookies)
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, syncookies, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, syncookies, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_syncookies), 0,
"Use TCP SYN cookies if the syncache overflows");
static VNET_DEFINE(int, tcp_syncookiesonly) = 0;
#define V_tcp_syncookiesonly VNET(tcp_syncookiesonly)
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, syncookies_only, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, syncookies_only, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_syncookiesonly), 0,
"Use only TCP SYN cookies");
@@ -121,20 +129,27 @@ SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, syncookies_only, CTLFLAG_RW,
static void syncache_drop(struct syncache *, struct syncache_head *);
static void syncache_free(struct syncache *);
static void syncache_insert(struct syncache *, struct syncache_head *);
-struct syncache *syncache_lookup(struct in_conninfo *, struct syncache_head **);
-static int syncache_respond(struct syncache *);
+static int syncache_respond(struct syncache *, struct syncache_head *, int,
+ const struct mbuf *);
static struct socket *syncache_socket(struct syncache *, struct socket *,
struct mbuf *m);
-static int syncache_sysctl_count(SYSCTL_HANDLER_ARGS);
static void syncache_timeout(struct syncache *sc, struct syncache_head *sch,
int docallout);
static void syncache_timer(void *);
-static void syncookie_generate(struct syncache_head *, struct syncache *,
- u_int32_t *);
+
+static uint32_t syncookie_mac(struct in_conninfo *, tcp_seq, uint8_t,
+ uint8_t *, uintptr_t);
+static tcp_seq syncookie_generate(struct syncache_head *, struct syncache *);
static struct syncache
*syncookie_lookup(struct in_conninfo *, struct syncache_head *,
- struct syncache *, struct tcpopt *, struct tcphdr *,
+ struct syncache *, struct tcphdr *, struct tcpopt *,
struct socket *);
+static void syncookie_reseed(void *);
+#ifdef INVARIANTS
+static int syncookie_cmp(struct in_conninfo *inc, struct syncache_head *sch,
+ struct syncache *sc, struct tcphdr *th, struct tcpopt *to,
+ struct socket *lso);
+#endif
/*
* Transmit the SYN,ACK fewer times than TCP_MAXRXTSHIFT specifies.
@@ -153,54 +168,32 @@ static VNET_DEFINE(struct tcp_syncache, tcp_syncache);
static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, syncache, CTLFLAG_RW, 0,
"TCP SYN cache");
-SYSCTL_VNET_UINT(_net_inet_tcp_syncache, OID_AUTO, bucketlimit, CTLFLAG_RDTUN,
+SYSCTL_UINT(_net_inet_tcp_syncache, OID_AUTO, bucketlimit, CTLFLAG_VNET | CTLFLAG_RDTUN,
&VNET_NAME(tcp_syncache.bucket_limit), 0,
"Per-bucket hash limit for syncache");
-SYSCTL_VNET_UINT(_net_inet_tcp_syncache, OID_AUTO, cachelimit, CTLFLAG_RDTUN,
+SYSCTL_UINT(_net_inet_tcp_syncache, OID_AUTO, cachelimit, CTLFLAG_VNET | CTLFLAG_RDTUN,
&VNET_NAME(tcp_syncache.cache_limit), 0,
"Overall entry limit for syncache");
-SYSCTL_VNET_PROC(_net_inet_tcp_syncache, OID_AUTO, count, (CTLTYPE_UINT|CTLFLAG_RD),
- NULL, 0, &syncache_sysctl_count, "IU",
- "Current number of entries in syncache");
+SYSCTL_UMA_CUR(_net_inet_tcp_syncache, OID_AUTO, count, CTLFLAG_VNET,
+ &VNET_NAME(tcp_syncache.zone), "Current number of entries in syncache");
-SYSCTL_VNET_UINT(_net_inet_tcp_syncache, OID_AUTO, hashsize, CTLFLAG_RDTUN,
+SYSCTL_UINT(_net_inet_tcp_syncache, OID_AUTO, hashsize, CTLFLAG_VNET | CTLFLAG_RDTUN,
&VNET_NAME(tcp_syncache.hashsize), 0,
"Size of TCP syncache hashtable");
-SYSCTL_VNET_UINT(_net_inet_tcp_syncache, OID_AUTO, rexmtlimit, CTLFLAG_RW,
+SYSCTL_UINT(_net_inet_tcp_syncache, OID_AUTO, rexmtlimit, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_syncache.rexmt_limit), 0,
"Limit on SYN/ACK retransmissions");
VNET_DEFINE(int, tcp_sc_rst_sock_fail) = 1;
-SYSCTL_VNET_INT(_net_inet_tcp_syncache, OID_AUTO, rst_on_sock_fail,
- CTLFLAG_RW, &VNET_NAME(tcp_sc_rst_sock_fail), 0,
+SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, rst_on_sock_fail,
+ CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_sc_rst_sock_fail), 0,
"Send reset on socket allocation failure");
static MALLOC_DEFINE(M_SYNCACHE, "syncache", "TCP syncache");
-#define SYNCACHE_HASH(inc, mask) \
- ((V_tcp_syncache.hash_secret ^ \
- (inc)->inc_faddr.s_addr ^ \
- ((inc)->inc_faddr.s_addr >> 16) ^ \
- (inc)->inc_fport ^ (inc)->inc_lport) & mask)
-
-#define SYNCACHE_HASH6(inc, mask) \
- ((V_tcp_syncache.hash_secret ^ \
- (inc)->inc6_faddr.s6_addr32[0] ^ \
- (inc)->inc6_faddr.s6_addr32[3] ^ \
- (inc)->inc_fport ^ (inc)->inc_lport) & mask)
-
-#define ENDPTS_EQ(a, b) ( \
- (a)->ie_fport == (b)->ie_fport && \
- (a)->ie_lport == (b)->ie_lport && \
- (a)->ie_faddr.s_addr == (b)->ie_faddr.s_addr && \
- (a)->ie_laddr.s_addr == (b)->ie_laddr.s_addr \
-)
-
-#define ENDPTS6_EQ(a, b) (memcmp(a, b, sizeof(*a)) == 0)
-
#define SCH_LOCK(sch) mtx_lock(&(sch)->sch_mtx)
#define SCH_UNLOCK(sch) mtx_unlock(&(sch)->sch_mtx)
#define SCH_LOCK_ASSERT(sch) mtx_assert(&(sch)->sch_mtx, MA_OWNED)
@@ -254,17 +247,19 @@ syncache_init(void)
V_tcp_syncache.hashbase = malloc(V_tcp_syncache.hashsize *
sizeof(struct syncache_head), M_SYNCACHE, M_WAITOK | M_ZERO);
- /* Initialize the hash buckets. */
- for (i = 0; i < V_tcp_syncache.hashsize; i++) {
#ifdef VIMAGE
- V_tcp_syncache.hashbase[i].sch_vnet = curvnet;
+ V_tcp_syncache.vnet = curvnet;
#endif
+
+ /* Initialize the hash buckets. */
+ for (i = 0; i < V_tcp_syncache.hashsize; i++) {
TAILQ_INIT(&V_tcp_syncache.hashbase[i].sch_bucket);
mtx_init(&V_tcp_syncache.hashbase[i].sch_mtx, "tcp_sc_head",
NULL, MTX_DEF);
callout_init_mtx(&V_tcp_syncache.hashbase[i].sch_timer,
&V_tcp_syncache.hashbase[i].sch_mtx, 0);
V_tcp_syncache.hashbase[i].sch_length = 0;
+ V_tcp_syncache.hashbase[i].sch_sc = &V_tcp_syncache;
}
/* Create the syncache entry zone. */
@@ -272,6 +267,13 @@ syncache_init(void)
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
V_tcp_syncache.cache_limit = uma_zone_set_max(V_tcp_syncache.zone,
V_tcp_syncache.cache_limit);
+
+ /* Start the SYN cookie reseeder callout. */
+ callout_init(&V_tcp_syncache.secret.reseed, 1);
+ arc4rand(V_tcp_syncache.secret.key[0], SYNCOOKIE_SECRET_SIZE, 0);
+ arc4rand(V_tcp_syncache.secret.key[1], SYNCOOKIE_SECRET_SIZE, 0);
+ callout_reset(&V_tcp_syncache.secret.reseed, SYNCOOKIE_LIFETIME * hz,
+ syncookie_reseed, &V_tcp_syncache);
}
#ifdef VIMAGE
@@ -282,6 +284,12 @@ syncache_destroy(void)
struct syncache *sc, *nsc;
int i;
+ /*
+ * Stop the re-seed timer before freeing resources. No need to
+ * possibly schedule it another time.
+ */
+ callout_drain(&V_tcp_syncache.secret.reseed);
+
/* Cleanup hash buckets: stop timers, free entries, destroy locks. */
for (i = 0; i < V_tcp_syncache.hashsize; i++) {
@@ -308,15 +316,6 @@ syncache_destroy(void)
}
#endif
-static int
-syncache_sysctl_count(SYSCTL_HANDLER_ARGS)
-{
- int count;
-
- count = uma_zone_get_cur(V_tcp_syncache.zone);
- return (sysctl_handle_int(oidp, &count, 0, req));
-}
-
/*
* Inserts a syncache entry into the specified bucket row.
* Locks and unlocks the syncache_head autonomously.
@@ -359,6 +358,7 @@ syncache_insert(struct syncache *sc, struct syncache_head *sch)
SCH_UNLOCK(sch);
+ TCPSTATES_INC(TCPS_SYN_RECEIVED);
TCPSTAT_INC(tcps_sc_added);
}
@@ -372,6 +372,7 @@ syncache_drop(struct syncache *sc, struct syncache_head *sch)
SCH_LOCK_ASSERT(sch);
+ TCPSTATES_DEC(TCPS_SYN_RECEIVED);
TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash);
sch->sch_length--;
@@ -393,7 +394,7 @@ static void
syncache_timeout(struct syncache *sc, struct syncache_head *sch, int docallout)
{
sc->sc_rxttime = ticks +
- TCPTV_RTOBASE * (tcp_backoff[sc->sc_rxmits]);
+ TCPTV_RTOBASE * (tcp_syn_backoff[sc->sc_rxmits]);
sc->sc_rxmits++;
if (TSTMP_LT(sc->sc_rxttime, sch->sch_nextc)) {
sch->sch_nextc = sc->sc_rxttime;
@@ -416,7 +417,7 @@ syncache_timer(void *xsch)
int tick = ticks;
char *s;
- CURVNET_SET(sch->sch_vnet);
+ CURVNET_SET(sch->sch_sc->vnet);
/* NB: syncache_head has already been locked by the callout. */
SCH_LOCK_ASSERT(sch);
@@ -459,7 +460,7 @@ syncache_timer(void *xsch)
free(s, M_TCPLOG);
}
- (void) syncache_respond(sc);
+ syncache_respond(sc, sch, 1, NULL);
TCPSTAT_INC(tcps_sc_retransmitted);
syncache_timeout(sc, sch, 0);
}
@@ -473,46 +474,34 @@ syncache_timer(void *xsch)
* Find an entry in the syncache.
* Returns always with locked syncache_head plus a matching entry or NULL.
*/
-struct syncache *
+static struct syncache *
syncache_lookup(struct in_conninfo *inc, struct syncache_head **schp)
{
struct syncache *sc;
struct syncache_head *sch;
+ uint32_t hash;
-#ifdef INET6
- if (inc->inc_flags & INC_ISIPV6) {
- sch = &V_tcp_syncache.hashbase[
- SYNCACHE_HASH6(inc, V_tcp_syncache.hashmask)];
- *schp = sch;
-
- SCH_LOCK(sch);
+ /*
+ * The hash is built on foreign port + local port + foreign address.
+ * We rely on the fact that struct in_conninfo starts with 16 bits
+ * of foreign port, then 16 bits of local port then followed by 128
+ * bits of foreign address. In case of IPv4 address, the first 3
+ * 32-bit words of the address always are zeroes.
+ */
+ hash = jenkins_hash32((uint32_t *)&inc->inc_ie, 5,
+ V_tcp_syncache.hash_secret) & V_tcp_syncache.hashmask;
- /* Circle through bucket row to find matching entry. */
- TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) {
- if (ENDPTS6_EQ(&inc->inc_ie, &sc->sc_inc.inc_ie))
- return (sc);
- }
- } else
-#endif
- {
- sch = &V_tcp_syncache.hashbase[
- SYNCACHE_HASH(inc, V_tcp_syncache.hashmask)];
- *schp = sch;
+ sch = &V_tcp_syncache.hashbase[hash];
+ *schp = sch;
+ SCH_LOCK(sch);
- SCH_LOCK(sch);
+ /* Circle through bucket row to find matching entry. */
+ TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash)
+ if (bcmp(&inc->inc_ie, &sc->sc_inc.inc_ie,
+ sizeof(struct in_endpoints)) == 0)
+ break;
- /* Circle through bucket row to find matching entry. */
- TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) {
-#ifdef INET6
- if (sc->sc_inc.inc_flags & INC_ISIPV6)
- continue;
-#endif
- if (ENDPTS_EQ(&inc->inc_ie, &sc->sc_inc.inc_ie))
- return (sc);
- }
- }
- SCH_LOCK_ASSERT(*schp);
- return (NULL); /* always returns with locked sch */
+ return (sc); /* Always returns with locked sch. */
}
/*
@@ -644,17 +633,20 @@ done:
/*
* Build a new TCP socket structure from a syncache entry.
+ *
+ * On success return the newly created socket with its underlying inp locked.
*/
static struct socket *
syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m)
{
+ struct tcp_function_block *blk;
struct inpcb *inp = NULL;
struct socket *so;
struct tcpcb *tp;
int error;
char *s;
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
/*
* Ok, create the full blown connection, and set things up
@@ -662,7 +654,7 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m)
* connection when the SYN arrived. If we can't create
* the connection, abort it.
*/
- so = sonewconn(lso, SS_ISCONNECTED);
+ so = sonewconn(lso, 0);
if (so == NULL) {
/*
* Drop the connection; we will either send a RST or
@@ -685,6 +677,15 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m)
inp = sotoinpcb(so);
inp->inp_inc.inc_fibnum = so->so_fibnum;
INP_WLOCK(inp);
+ /*
+ * Exclusive pcbinfo lock is not required in syncache socket case even
+ * if two inpcb locks can be acquired simultaneously:
+ * - the inpcb in LISTEN state,
+ * - the newly created inp.
+ *
+ * In this case, an inp cannot be at the same time in LISTEN state and
+ * just created by an accept() call.
+ */
INP_HASH_WLOCK(&V_tcbinfo);
/* Insert new socket into PCB hash list. */
@@ -702,6 +703,15 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m)
#endif
/*
+ * If there's an mbuf and it has a flowid, then let's initialise the
+ * inp with that particular flowid.
+ */
+ if (m != NULL && M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
+ inp->inp_flowid = m->m_pkthdr.flowid;
+ inp->inp_flowtype = M_HASHTYPE_GET(m);
+ }
+
+ /*
* Install in the reservation hash table for now, but don't yet
* install a connection group since the full 4-tuple isn't yet
* configured.
@@ -824,11 +834,31 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m)
#endif /* INET */
INP_HASH_WUNLOCK(&V_tcbinfo);
tp = intotcpcb(inp);
- tp->t_state = TCPS_SYN_RECEIVED;
+ tcp_state_change(tp, TCPS_SYN_RECEIVED);
tp->iss = sc->sc_iss;
tp->irs = sc->sc_irs;
tcp_rcvseqinit(tp);
tcp_sendseqinit(tp);
+ blk = sototcpcb(lso)->t_fb;
+ if (blk != tp->t_fb) {
+ /*
+ * Our parent's t_fb was not the default,
+ * we need to release our ref on tp->t_fb and
+ * pickup one on the new entry.
+ */
+ struct tcp_function_block *rblk;
+
+ rblk = find_and_ref_tcp_fb(blk);
+ KASSERT(rblk != NULL,
+ ("cannot find blk %p out of syncache?", blk));
+ if (tp->t_fb->tfb_tcp_fb_fini)
+ (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
+ refcount_release(&tp->t_fb->tfb_refcnt);
+ tp->t_fb = rblk;
+ if (tp->t_fb->tfb_tcp_fb_init) {
+ (*tp->t_fb->tfb_tcp_fb_init)(tp);
+ }
+ }
tp->snd_wl1 = sc->sc_irs;
tp->snd_max = tp->iss + 1;
tp->snd_nxt = tp->iss + 1;
@@ -898,7 +928,7 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m)
tp->t_keepcnt = sototcpcb(lso)->t_keepcnt;
tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
- INP_WUNLOCK(inp);
+ soisconnected(so);
TCPSTAT_INC(tcps_accepts);
return (so);
@@ -917,6 +947,9 @@ abort2:
* in the syncache, and if its there, we pull it out of
* the cache and turn it into a full-blown connection in
* the SYN-RECEIVED state.
+ *
+ * On syncache_socket() success the newly created socket
+ * has its underlying inp locked.
*/
int
syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
@@ -931,12 +964,22 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
* Global TCP locks are held because we manipulate the PCB lists
* and create a new socket.
*/
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
KASSERT((th->th_flags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK,
("%s: can handle only ACK", __func__));
sc = syncache_lookup(inc, &sch); /* returns locked sch */
SCH_LOCK_ASSERT(sch);
+
+#ifdef INVARIANTS
+ /*
+ * Test code for syncookies comparing the syncache stored
+ * values with the reconstructed values from the cookie.
+ */
+ if (sc != NULL)
+ syncookie_cmp(inc, sch, sc, th, to, *lsop);
+#endif
+
if (sc == NULL) {
/*
* There is no syncache entry, so see if this ACK is
@@ -956,7 +999,7 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
goto failed;
}
bzero(&scs, sizeof(scs));
- sc = syncookie_lookup(inc, sch, &scs, to, th, *lsop);
+ sc = syncookie_lookup(inc, sch, &scs, th, to, *lsop);
SCH_UNLOCK(sch);
if (sc == NULL) {
if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
@@ -966,7 +1009,16 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
goto failed;
}
} else {
- /* Pull out the entry to unlock the bucket row. */
+ /*
+ * Pull out the entry to unlock the bucket row.
+ *
+ * NOTE: We must decrease TCPS_SYN_RECEIVED count here, not
+ * tcp_state_change(). The tcpcb is not existent at this
+ * moment. A new one will be allocated via syncache_socket->
+ * sonewconn->tcp_usr_attach in TCPS_CLOSED state, then
+ * syncache_socket() will change it to TCPS_SYN_RECEIVED.
+ */
+ TCPSTATES_DEC(TCPS_SYN_RECEIVED);
TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash);
sch->sch_length--;
#ifdef TCP_OFFLOAD
@@ -1002,12 +1054,32 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
goto failed;
}
+ /*
+ * If timestamps were not negotiated during SYN/ACK they
+ * must not appear on any segment during this session.
+ */
if (!(sc->sc_flags & SCF_TIMESTAMP) && (to->to_flags & TOF_TS)) {
if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
log(LOG_DEBUG, "%s; %s: Timestamp not expected, "
"segment rejected\n", s, __func__);
goto failed;
}
+
+ /*
+ * If timestamps were negotiated during SYN/ACK they should
+ * appear on every segment during this session.
+ * XXXAO: This is only informal as there have been unverified
+ * reports of non-compliant stacks.
+ */
+ if ((sc->sc_flags & SCF_TIMESTAMP) && !(to->to_flags & TOF_TS)) {
+ if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
+ log(LOG_DEBUG, "%s; %s: Timestamp missing, "
+ "no action\n", s, __func__);
+ free(s, M_TCPLOG);
+ s = NULL;
+ }
+ }
+
/*
* If timestamps were negotiated the reflected timestamp
* must be equal to what we actually sent in the SYN|ACK.
@@ -1040,6 +1112,39 @@ failed:
return (0);
}
+#ifdef TCP_RFC7413
+static void
+syncache_tfo_expand(struct syncache *sc, struct socket **lsop, struct mbuf *m,
+ uint64_t response_cookie)
+{
+ struct inpcb *inp;
+ struct tcpcb *tp;
+ unsigned int *pending_counter;
+
+ /*
+ * Global TCP locks are held because we manipulate the PCB lists
+ * and create a new socket.
+ */
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+
+ pending_counter = intotcpcb(sotoinpcb(*lsop))->t_tfo_pending;
+ *lsop = syncache_socket(sc, *lsop, m);
+ if (*lsop == NULL) {
+ TCPSTAT_INC(tcps_sc_aborted);
+ atomic_subtract_int(pending_counter, 1);
+ } else {
+ inp = sotoinpcb(*lsop);
+ tp = intotcpcb(inp);
+ tp->t_flags |= TF_FASTOPEN;
+ tp->t_tfo_cookie = response_cookie;
+ tp->snd_max = tp->iss;
+ tp->snd_nxt = tp->iss;
+ tp->t_tfo_pending = pending_counter;
+ TCPSTAT_INC(tcps_sc_completed);
+ }
+}
+#endif /* TCP_RFC7413 */
+
/*
* Given a LISTEN socket and an inbound SYN request, add
* this to the syn cache, and send back a segment:
@@ -1052,9 +1157,16 @@ failed:
* DoS attack, an attacker could send data which would eventually
* consume all available buffer space if it were ACKed. By not ACKing
* the data, we avoid this DoS scenario.
+ *
+ * The exception to the above is when a SYN with a valid TCP Fast Open (TFO)
+ * cookie is processed, V_tcp_fastopen_enabled set to true, and the
+ * TCP_FASTOPEN socket option is set. In this case, a new socket is created
+ * and returned via lsop, the mbuf is not freed so that tcp_input() can
+ * queue its data to the socket, and 1 is returned to indicate the
+ * TFO-socket-creation path was taken.
*/
-static void
-_syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
+int
+syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
struct inpcb *inp, struct socket **lsop, struct mbuf *m, void *tod,
void *todctx)
{
@@ -1063,10 +1175,10 @@ _syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
struct syncache *sc = NULL;
struct syncache_head *sch;
struct mbuf *ipopts = NULL;
- u_int32_t flowtmp;
u_int ltflags;
int win, sb_hiwat, ip_ttl, ip_tos;
char *s;
+ int rv = 0;
#ifdef INET6
int autoflowlabel = 0;
#endif
@@ -1075,8 +1187,12 @@ _syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
#endif
struct syncache scs;
struct ucred *cred;
+#ifdef TCP_RFC7413
+ uint64_t tfo_response_cookie;
+ int tfo_cookie_valid = 0;
+ int tfo_response_cookie_valid = 0;
+#endif
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp); /* listen socket */
KASSERT((th->th_flags & (TH_RST|TH_ACK|TH_SYN)) == TH_SYN,
("%s: unexpected tcp flags", __func__));
@@ -1100,6 +1216,29 @@ _syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
sb_hiwat = so->so_rcv.sb_hiwat;
ltflags = (tp->t_flags & (TF_NOOPT | TF_SIGNATURE));
+#ifdef TCP_RFC7413
+ if (V_tcp_fastopen_enabled && (tp->t_flags & TF_FASTOPEN) &&
+ (tp->t_tfo_pending != NULL) && (to->to_flags & TOF_FASTOPEN)) {
+ /*
+ * Limit the number of pending TFO connections to
+ * approximately half of the queue limit. This prevents TFO
+ * SYN floods from starving the service by filling the
+ * listen queue with bogus TFO connections.
+ */
+ if (atomic_fetchadd_int(tp->t_tfo_pending, 1) <=
+ (so->so_qlimit / 2)) {
+ int result;
+
+ result = tcp_fastopen_check_cookie(inc,
+ to->to_tfo_cookie, to->to_tfo_len,
+ &tfo_response_cookie);
+ tfo_cookie_valid = (result > 0);
+ tfo_response_cookie_valid = (result >= 0);
+ } else
+ atomic_subtract_int(tp->t_tfo_pending, 1);
+ }
+#endif
+
/* By the time we drop the lock these should no longer be used. */
so = NULL;
tp = NULL;
@@ -1107,13 +1246,14 @@ _syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
#ifdef MAC
if (mac_syncache_init(&maclabel) != 0) {
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
goto done;
} else
mac_syncache_create(maclabel, inp);
#endif
- INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+#ifdef TCP_RFC7413
+ if (!tfo_cookie_valid)
+#endif
+ INP_WUNLOCK(inp);
/*
* Remember the IP options, if any.
@@ -1142,6 +1282,10 @@ _syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
sc = syncache_lookup(inc, &sch); /* returns locked entry */
SCH_LOCK_ASSERT(sch);
if (sc != NULL) {
+#ifdef TCP_RFC7413
+ if (tfo_cookie_valid)
+ INP_WUNLOCK(inp);
+#endif
TCPSTAT_INC(tcps_sc_dupsyn);
if (ipopts) {
/*
@@ -1174,7 +1318,7 @@ _syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
s, __func__);
free(s, M_TCPLOG);
}
- if (syncache_respond(sc) == 0) {
+ if (syncache_respond(sc, sch, 1, m) == 0) {
sc->sc_rxmits = 0;
syncache_timeout(sc, sch, 1);
TCPSTAT_INC(tcps_sndacks);
@@ -1184,6 +1328,14 @@ _syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
goto done;
}
+#ifdef TCP_RFC7413
+ if (tfo_cookie_valid) {
+ bzero(&scs, sizeof(scs));
+ sc = &scs;
+ goto skip_alloc;
+ }
+#endif
+
sc = uma_zalloc(V_tcp_syncache.zone, M_NOWAIT | M_ZERO);
if (sc == NULL) {
/*
@@ -1207,7 +1359,13 @@ _syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
}
}
}
-
+
+#ifdef TCP_RFC7413
+skip_alloc:
+ if (!tfo_cookie_valid && tfo_response_cookie_valid)
+ sc->sc_tfo_cookie = &tfo_response_cookie;
+#endif
+
/*
* Fill in the syncache values.
*/
@@ -1271,7 +1429,7 @@ _syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
* With the default maxsockbuf of 256K, a scale factor
* of 3 will be chosen by this algorithm. Those who
* choose a larger maxsockbuf should watch out
- * for the compatiblity problems mentioned above.
+ * for the compatibility problems mentioned above.
*
* RFC1323: The Window field in a SYN (i.e., a <SYN>
* or <SYN,ACK>) segment itself is never scaled.
@@ -1286,11 +1444,9 @@ _syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
}
#ifdef TCP_SIGNATURE
/*
- * If listening socket requested TCP digests, and received SYN
+ * If listening socket requested TCP digests, OR received SYN
* contains the option, flag this in the syncache so that
* syncache_respond() will do the right thing with the SYN+ACK.
- * XXX: Currently we always record the option by default and will
- * attempt to use it in syncache_respond().
*/
if (to->to_flags & TOF_SIGNATURE || ltflags & TF_SIGNATURE)
sc->sc_flags |= SCF_SIGNATURE;
@@ -1304,25 +1460,32 @@ _syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
if ((th->th_flags & (TH_ECE|TH_CWR)) && V_tcp_do_ecn)
sc->sc_flags |= SCF_ECN;
- if (V_tcp_syncookies) {
- syncookie_generate(sch, sc, &flowtmp);
+ if (V_tcp_syncookies)
+ sc->sc_iss = syncookie_generate(sch, sc);
#ifdef INET6
- if (autoflowlabel)
- sc->sc_flowlabel = flowtmp;
-#endif
- } else {
-#ifdef INET6
- if (autoflowlabel)
- sc->sc_flowlabel =
- (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK);
-#endif
+ if (autoflowlabel) {
+ if (V_tcp_syncookies)
+ sc->sc_flowlabel = sc->sc_iss;
+ else
+ sc->sc_flowlabel = ip6_randomflowlabel();
+ sc->sc_flowlabel = htonl(sc->sc_flowlabel) & IPV6_FLOWLABEL_MASK;
}
+#endif
SCH_UNLOCK(sch);
+#ifdef TCP_RFC7413
+ if (tfo_cookie_valid) {
+ syncache_tfo_expand(sc, lsop, m, tfo_response_cookie);
+ /* INP_WUNLOCK(inp) will be performed by the caller */
+ rv = 1;
+ goto tfo_done;
+ }
+#endif
+
/*
* Do a standard 3-way handshake.
*/
- if (syncache_respond(sc) == 0) {
+ if (syncache_respond(sc, sch, 0, m) == 0) {
if (V_tcp_syncookies && V_tcp_syncookiesonly && sc != &scs)
syncache_free(sc);
else if (sc != &scs)
@@ -1336,21 +1499,29 @@ _syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
}
done:
+ if (m) {
+ *lsop = NULL;
+ m_freem(m);
+ }
+#ifdef TCP_RFC7413
+tfo_done:
+#endif
if (cred != NULL)
crfree(cred);
#ifdef MAC
if (sc == &scs)
mac_syncache_destroy(&maclabel);
#endif
- if (m) {
-
- *lsop = NULL;
- m_freem(m);
- }
+ return (rv);
}
+/*
+ * Send SYN|ACK to the peer. Either in response to the peer's SYN,
+ * i.e. m0 != NULL, or upon 3WHS ACK timeout, i.e. m0 == NULL.
+ */
static int
-syncache_respond(struct syncache *sc)
+syncache_respond(struct syncache *sc, struct syncache_head *sch, int locked,
+ const struct mbuf *m0)
{
struct ip *ip = NULL;
struct mbuf *m;
@@ -1361,6 +1532,9 @@ syncache_respond(struct syncache *sc)
#ifdef INET6
struct ip6_hdr *ip6 = NULL;
#endif
+#ifdef TCP_SIGNATURE
+ struct secasvar *sav;
+#endif
hlen =
#ifdef INET6
@@ -1379,7 +1553,7 @@ syncache_respond(struct syncache *sc)
("syncache: mbuf too small"));
/* Create the IP+TCP header from scratch. */
- m = m_gethdr(M_DONTWAIT, MT_DATA);
+ m = m_gethdr(M_NOWAIT, MT_DATA);
if (m == NULL)
return (ENOBUFS);
#ifdef MAC
@@ -1413,7 +1587,7 @@ syncache_respond(struct syncache *sc)
ip = mtod(m, struct ip *);
ip->ip_v = IPVERSION;
ip->ip_hl = sizeof(struct ip) >> 2;
- ip->ip_len = tlen;
+ ip->ip_len = htons(tlen);
ip->ip_id = 0;
ip->ip_off = 0;
ip->ip_sum = 0;
@@ -1431,7 +1605,7 @@ syncache_respond(struct syncache *sc)
* 2) the SCF_UNREACH flag has been set
*/
if (V_path_mtu_discovery && ((sc->sc_flags & SCF_UNREACH) == 0))
- ip->ip_off |= IP_DF;
+ ip->ip_off |= htons(IP_DF);
th = (struct tcphdr *)(ip + 1);
}
@@ -1471,8 +1645,39 @@ syncache_respond(struct syncache *sc)
if (sc->sc_flags & SCF_SACK)
to.to_flags |= TOF_SACKPERM;
#ifdef TCP_SIGNATURE
- if (sc->sc_flags & SCF_SIGNATURE)
- to.to_flags |= TOF_SIGNATURE;
+ sav = NULL;
+ if (sc->sc_flags & SCF_SIGNATURE) {
+ sav = tcp_get_sav(m, IPSEC_DIR_OUTBOUND);
+ if (sav != NULL)
+ to.to_flags |= TOF_SIGNATURE;
+ else {
+
+ /*
+ * We've got SCF_SIGNATURE flag
+ * inherited from listening socket,
+ * but no SADB key for given source
+ * address. Assume signature is not
+ * required and remove signature flag
+ * instead of silently dropping
+ * connection.
+ */
+ if (locked == 0)
+ SCH_LOCK(sch);
+ sc->sc_flags &= ~SCF_SIGNATURE;
+ if (locked == 0)
+ SCH_UNLOCK(sch);
+ }
+ }
+#endif
+
+#ifdef TCP_RFC7413
+ if (sc->sc_tfo_cookie) {
+ to.to_flags |= TOF_FASTOPEN;
+ to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN;
+ to.to_tfo_cookie = sc->sc_tfo_cookie;
+ /* don't send cookie again when retransmitting response */
+ sc->sc_tfo_cookie = NULL;
+ }
#endif
optlen = tcp_addoptions(&to, (u_char *)(th + 1));
@@ -1483,20 +1688,29 @@ syncache_respond(struct syncache *sc)
#ifdef TCP_SIGNATURE
if (sc->sc_flags & SCF_SIGNATURE)
- tcp_signature_compute(m, 0, 0, optlen,
- to.to_signature, IPSEC_DIR_OUTBOUND);
+ tcp_signature_do_compute(m, 0, optlen,
+ to.to_signature, sav);
#endif
#ifdef INET6
if (sc->sc_inc.inc_flags & INC_ISIPV6)
ip6->ip6_plen = htons(ntohs(ip6->ip6_plen) + optlen);
else
#endif
- ip->ip_len += optlen;
+ ip->ip_len = htons(ntohs(ip->ip_len) + optlen);
} else
optlen = 0;
M_SETFIB(m, sc->sc_inc.inc_fibnum);
m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
+ /*
+ * If we have peer's SYN and it has a flowid, then let's assign it to
+ * our SYN|ACK. ip6_output() and ip_output() will not assign flowid
+ * to SYN|ACK due to lack of inp here.
+ */
+ if (m0 != NULL && M_HASHTYPE_GET(m0) != M_HASHTYPE_NONE) {
+ m->m_pkthdr.flowid = m0->m_pkthdr.flowid;
+ M_HASHTYPE_SET(m, M_HASHTYPE_GET(m0));
+ }
#ifdef INET6
if (sc->sc_inc.inc_flags & INC_ISIPV6) {
m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
@@ -1538,292 +1752,379 @@ syncache_respond(struct syncache *sc)
return (error);
}
-void
-syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
- struct inpcb *inp, struct socket **lsop, struct mbuf *m)
-{
- _syncache_add(inc, to, th, inp, lsop, m, NULL, NULL);
-}
-
-void
-tcp_offload_syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
- struct inpcb *inp, struct socket **lsop, void *tod, void *todctx)
-{
-
- _syncache_add(inc, to, th, inp, lsop, NULL, tod, todctx);
-}
/*
- * The purpose of SYN cookies is to avoid keeping track of all SYN's we
- * receive and to be able to handle SYN floods from bogus source addresses
- * (where we will never receive any reply). SYN floods try to exhaust all
- * our memory and available slots in the SYN cache table to cause a denial
- * of service to legitimate users of the local host.
+ * The purpose of syncookies is to handle spoofed SYN flooding DoS attacks
+ * that exceed the capacity of the syncache by avoiding the storage of any
+ * of the SYNs we receive. Syncookies defend against blind SYN flooding
+ * attacks where the attacker does not have access to our responses.
+ *
+ * Syncookies encode and include all necessary information about the
+ * connection setup within the SYN|ACK that we send back. That way we
+ * can avoid keeping any local state until the ACK to our SYN|ACK returns
+ * (if ever). Normally the syncache and syncookies are running in parallel
+ * with the latter taking over when the former is exhausted. When a matching
+ * syncache entry is found the syncookie is ignored.
*
- * The idea of SYN cookies is to encode and include all necessary information
- * about the connection setup state within the SYN-ACK we send back and thus
- * to get along without keeping any local state until the ACK to the SYN-ACK
- * arrives (if ever). Everything we need to know should be available from
- * the information we encoded in the SYN-ACK.
+ * The only reliable information persisting the 3WHS is our initial sequence
+ * number ISS of 32 bits. Syncookies embed a cryptographically sufficiently
+ * strong hash (MAC) value and a few bits of TCP SYN options in the ISS
+ * of our SYN|ACK. The MAC can be recomputed when the ACK to our SYN|ACK
+ * returns and signifies a legitimate connection if it matches the ACK.
*
- * More information about the theory behind SYN cookies and its first
- * discussion and specification can be found at:
- * http://cr.yp.to/syncookies.html (overview)
- * http://cr.yp.to/syncookies/archive (gory details)
+ * The available space of 32 bits to store the hash and to encode the SYN
+ * option information is very tight and we should have at least 24 bits for
+ * the MAC to keep the number of guesses by blind spoofing reasonably high.
*
- * This implementation extends the orginal idea and first implementation
- * of FreeBSD by using not only the initial sequence number field to store
- * information but also the timestamp field if present. This way we can
- * keep track of the entire state we need to know to recreate the session in
- * its original form. Almost all TCP speakers implement RFC1323 timestamps
- * these days. For those that do not we still have to live with the known
- * shortcomings of the ISN only SYN cookies.
+ * SYN option information we have to encode to fully restore a connection:
+ * MSS: is important to choose an optimal segment size to avoid IP level
+ * fragmentation along the path. The common MSS values can be encoded
+ * in a 3-bit table. Uncommon values are captured by the next lower value
+ * in the table leading to a slight increase in packetization overhead.
+ * WSCALE: is necessary to allow large windows to be used for high delay-
+ * bandwidth product links. Not scaling the window when it was initially
+ * negotiated is bad for performance as lack of scaling further decreases
+ * the apparent available send window. We only need to encode the WSCALE
+ * we received from the remote end. Our end can be recalculated at any
+ * time. The common WSCALE values can be encoded in a 3-bit table.
+ * Uncommon values are captured by the next lower value in the table
+ * making us under-estimate the available window size halving our
+ * theoretically possible maximum throughput for that connection.
+ * SACK: Greatly assists in packet loss recovery and requires 1 bit.
+ * TIMESTAMP and SIGNATURE are not encoded because they are permanent options
+ * that are included in all segments on a connection. We enable them when
+ * the ACK has them.
*
- * Cookie layers:
+ * Security of syncookies and attack vectors:
*
- * Initial sequence number we send:
- * 31|................................|0
- * DDDDDDDDDDDDDDDDDDDDDDDDDMMMRRRP
- * D = MD5 Digest (first dword)
- * M = MSS index
- * R = Rotation of secret
- * P = Odd or Even secret
+ * The MAC is computed over (faddr||laddr||fport||lport||irs||flags||secmod)
+ * together with the global secret to make it unique per connection attempt.
+ * Thus any change of any of those parameters results in a different MAC output
+ * in an unpredictable way unless a collision is encountered. 24 bits of the
+ * MAC are embedded into the ISS.
*
- * The MD5 Digest is computed with over following parameters:
- * a) randomly rotated secret
- * b) struct in_conninfo containing the remote/local ip/port (IPv4&IPv6)
- * c) the received initial sequence number from remote host
- * d) the rotation offset and odd/even bit
+ * To prevent replay attacks two rotating global secrets are updated with a
+ * new random value every 15 seconds. The life-time of a syncookie is thus
+ * 15-30 seconds.
*
- * Timestamp we send:
- * 31|................................|0
- * DDDDDDDDDDDDDDDDDDDDDDSSSSRRRRA5
- * D = MD5 Digest (third dword) (only as filler)
- * S = Requested send window scale
- * R = Requested receive window scale
- * A = SACK allowed
- * 5 = TCP-MD5 enabled (not implemented yet)
- * XORed with MD5 Digest (forth dword)
+ * Vector 1: Attacking the secret. This requires finding a weakness in the
+ * MAC itself or the way it is used here. The attacker can do a chosen plain
+ * text attack by varying and testing all the parameters under his control.
+ * The strength depends on the size and randomness of the secret, and the
+ * cryptographic security of the MAC function. Due to the constant updating
+ * of the secret the attacker has at most 29.999 seconds to find the secret
+ * and launch spoofed connections. After that he has to start all over again.
*
- * The timestamp isn't cryptographically secure and doesn't need to be.
- * The double use of the MD5 digest dwords ties it to a specific remote/
- * local host/port, remote initial sequence number and our local time
- * limited secret. A received timestamp is reverted (XORed) and then
- * the contained MD5 dword is compared to the computed one to ensure the
- * timestamp belongs to the SYN-ACK we sent. The other parameters may
- * have been tampered with but this isn't different from supplying bogus
- * values in the SYN in the first place.
+ * Vector 2: Collision attack on the MAC of a single ACK. With a 24 bit MAC
+ * size an average of 4,823 attempts are required for a 50% chance of success
+ * to spoof a single syncookie (birthday collision paradox). However the
+ * attacker is blind and doesn't know if one of his attempts succeeded unless
+ * he has a side channel to infer success from. A single connection setup
+ * success average of 90% requires 8,790 packets, 99.99% requires 17,578 packets.
+ * This many attempts are required for each one blind spoofed connection. For
+ * every additional spoofed connection he has to launch another N attempts.
+ * Thus for a sustained rate of 100 spoofed connections per second approximately
+ * 1,800,000 packets per second would have to be sent.
*
- * Some problems with SYN cookies remain however:
- * Consider the problem of a recreated (and retransmitted) cookie. If the
- * original SYN was accepted, the connection is established. The second
- * SYN is inflight, and if it arrives with an ISN that falls within the
- * receive window, the connection is killed.
+ * NB: The MAC function should be fast so that it doesn't become a CPU
+ * exhaustion attack vector itself.
*
- * Notes:
- * A heuristic to determine when to accept syn cookies is not necessary.
- * An ACK flood would cause the syncookie verification to be attempted,
- * but a SYN flood causes syncookies to be generated. Both are of equal
- * cost, so there's no point in trying to optimize the ACK flood case.
- * Also, if you don't process certain ACKs for some reason, then all someone
- * would have to do is launch a SYN and ACK flood at the same time, which
- * would stop cookie verification and defeat the entire purpose of syncookies.
+ * References:
+ * RFC4987 TCP SYN Flooding Attacks and Common Mitigations
+ * SYN cookies were first proposed by cryptographer Dan J. Bernstein in 1996
+ * http://cr.yp.to/syncookies.html (overview)
+ * http://cr.yp.to/syncookies/archive (details)
+ *
+ *
+ * Schematic construction of a syncookie enabled Initial Sequence Number:
+ * 0 1 2 3
+ * 12345678901234567890123456789012
+ * |xxxxxxxxxxxxxxxxxxxxxxxxWWWMMMSP|
+ *
+ * x 24 MAC (truncated)
+ * W 3 Send Window Scale index
+ * M 3 MSS index
+ * S 1 SACK permitted
+ * P 1 Odd/even secret
*/
-static int tcp_sc_msstab[] = { 0, 256, 468, 536, 996, 1452, 1460, 8960 };
-static void
-syncookie_generate(struct syncache_head *sch, struct syncache *sc,
- u_int32_t *flowlabel)
+/*
+ * Distribution and probability of certain MSS values. Those in between are
+ * rounded down to the next lower one.
+ * [An Analysis of TCP Maximum Segment Sizes, S. Alcock and R. Nelson, 2011]
+ * .2% .3% 5% 7% 7% 20% 15% 45%
+ */
+static int tcp_sc_msstab[] = { 216, 536, 1200, 1360, 1400, 1440, 1452, 1460 };
+
+/*
+ * Distribution and probability of certain WSCALE values. We have to map the
+ * (send) window scale (shift) option with a range of 0-14 from 4 bits into 3
+ * bits based on prevalence of certain values. Values we don't have an exact
+ * match for are rounded down to the next lower one letting us under-estimate
+ * the true available window. At the moment this would happen only for the
+ * very uncommon values 3, 5 and those above 8 (more than 16MB socket buffer
+ * and window size). The absence of the WSCALE option (no scaling in either
+ * direction) is encoded with index zero.
+ * [WSCALE values histograms, Allman, 2012]
+ * X 10 10 35 5 6 14 10% by host
+ * X 11 4 5 5 18 49 3% by connections
+ */
+static int tcp_sc_wstab[] = { 0, 0, 1, 2, 4, 6, 7, 8 };
+
+/*
+ * Compute the MAC for the SYN cookie. SIPHASH-2-4 is chosen for its speed
+ * and good cryptographic properties.
+ */
+static uint32_t
+syncookie_mac(struct in_conninfo *inc, tcp_seq irs, uint8_t flags,
+ uint8_t *secbits, uintptr_t secmod)
{
- MD5_CTX ctx;
- u_int32_t md5_buffer[MD5_DIGEST_LENGTH / sizeof(u_int32_t)];
- u_int32_t data;
- u_int32_t *secbits;
- u_int off, pmss, mss;
- int i;
+ SIPHASH_CTX ctx;
+ uint32_t siphash[2];
+
+ SipHash24_Init(&ctx);
+ SipHash_SetKey(&ctx, secbits);
+ switch (inc->inc_flags & INC_ISIPV6) {
+#ifdef INET
+ case 0:
+ SipHash_Update(&ctx, &inc->inc_faddr, sizeof(inc->inc_faddr));
+ SipHash_Update(&ctx, &inc->inc_laddr, sizeof(inc->inc_laddr));
+ break;
+#endif
+#ifdef INET6
+ case INC_ISIPV6:
+ SipHash_Update(&ctx, &inc->inc6_faddr, sizeof(inc->inc6_faddr));
+ SipHash_Update(&ctx, &inc->inc6_laddr, sizeof(inc->inc6_laddr));
+ break;
+#endif
+ }
+ SipHash_Update(&ctx, &inc->inc_fport, sizeof(inc->inc_fport));
+ SipHash_Update(&ctx, &inc->inc_lport, sizeof(inc->inc_lport));
+ SipHash_Update(&ctx, &irs, sizeof(irs));
+ SipHash_Update(&ctx, &flags, sizeof(flags));
+ SipHash_Update(&ctx, &secmod, sizeof(secmod));
+ SipHash_Final((u_int8_t *)&siphash, &ctx);
+
+ return (siphash[0] ^ siphash[1]);
+}
+
+static tcp_seq
+syncookie_generate(struct syncache_head *sch, struct syncache *sc)
+{
+ u_int i, mss, secbit, wscale;
+ uint32_t iss, hash;
+ uint8_t *secbits;
+ union syncookie cookie;
SCH_LOCK_ASSERT(sch);
- /* Which of the two secrets to use. */
- secbits = sch->sch_oddeven ?
- sch->sch_secbits_odd : sch->sch_secbits_even;
-
- /* Reseed secret if too old. */
- if (sch->sch_reseed < time_uptime) {
- sch->sch_oddeven = sch->sch_oddeven ? 0 : 1; /* toggle */
- secbits = sch->sch_oddeven ?
- sch->sch_secbits_odd : sch->sch_secbits_even;
- for (i = 0; i < SYNCOOKIE_SECRET_SIZE; i++)
- secbits[i] = arc4random();
- sch->sch_reseed = time_uptime + SYNCOOKIE_LIFETIME;
- }
+ cookie.cookie = 0;
- /* Secret rotation offset. */
- off = sc->sc_iss & 0x7; /* iss was randomized before */
+ /* Map our computed MSS into the 3-bit index. */
+ mss = min(tcp_mssopt(&sc->sc_inc), max(sc->sc_peer_mss, V_tcp_minmss));
+ for (i = nitems(tcp_sc_msstab) - 1; tcp_sc_msstab[i] > mss && i > 0;
+ i--)
+ ;
+ cookie.flags.mss_idx = i;
- /* Maximum segment size calculation. */
- pmss =
- max( min(sc->sc_peer_mss, tcp_mssopt(&sc->sc_inc)), V_tcp_minmss);
- for (mss = sizeof(tcp_sc_msstab) / sizeof(int) - 1; mss > 0; mss--)
- if (tcp_sc_msstab[mss] <= pmss)
- break;
+ /*
+ * Map the send window scale into the 3-bit index but only if
+ * the wscale option was received.
+ */
+ if (sc->sc_flags & SCF_WINSCALE) {
+ wscale = sc->sc_requested_s_scale;
+ for (i = nitems(tcp_sc_wstab) - 1;
+ tcp_sc_wstab[i] > wscale && i > 0;
+ i--)
+ ;
+ cookie.flags.wscale_idx = i;
+ }
- /* Fold parameters and MD5 digest into the ISN we will send. */
- data = sch->sch_oddeven;/* odd or even secret, 1 bit */
- data |= off << 1; /* secret offset, derived from iss, 3 bits */
- data |= mss << 4; /* mss, 3 bits */
+ /* Can we do SACK? */
+ if (sc->sc_flags & SCF_SACK)
+ cookie.flags.sack_ok = 1;
- MD5Init(&ctx);
- MD5Update(&ctx, ((u_int8_t *)secbits) + off,
- SYNCOOKIE_SECRET_SIZE * sizeof(*secbits) - off);
- MD5Update(&ctx, secbits, off);
- MD5Update(&ctx, &sc->sc_inc, sizeof(sc->sc_inc));
- MD5Update(&ctx, &sc->sc_irs, sizeof(sc->sc_irs));
- MD5Update(&ctx, &data, sizeof(data));
- MD5Final((u_int8_t *)&md5_buffer, &ctx);
+ /* Which of the two secrets to use. */
+ secbit = sch->sch_sc->secret.oddeven & 0x1;
+ cookie.flags.odd_even = secbit;
- data |= (md5_buffer[0] << 7);
- sc->sc_iss = data;
+ secbits = sch->sch_sc->secret.key[secbit];
+ hash = syncookie_mac(&sc->sc_inc, sc->sc_irs, cookie.cookie, secbits,
+ (uintptr_t)sch);
-#ifdef INET6
- *flowlabel = md5_buffer[1] & IPV6_FLOWLABEL_MASK;
-#endif
+ /*
+ * Put the flags into the hash and XOR them to get better ISS number
+ * variance. This doesn't enhance the cryptographic strength and is
+ * done to prevent the 8 cookie bits from showing up directly on the
+ * wire.
+ */
+ iss = hash & ~0xff;
+ iss |= cookie.cookie ^ (hash >> 24);
- /* Additional parameters are stored in the timestamp if present. */
+ /* Randomize the timestamp. */
if (sc->sc_flags & SCF_TIMESTAMP) {
- data = ((sc->sc_flags & SCF_SIGNATURE) ? 1 : 0); /* TCP-MD5, 1 bit */
- data |= ((sc->sc_flags & SCF_SACK) ? 1 : 0) << 1; /* SACK, 1 bit */
- data |= sc->sc_requested_s_scale << 2; /* SWIN scale, 4 bits */
- data |= sc->sc_requested_r_scale << 6; /* RWIN scale, 4 bits */
- data |= md5_buffer[2] << 10; /* more digest bits */
- data ^= md5_buffer[3];
- sc->sc_ts = data;
- sc->sc_tsoff = data - tcp_ts_getticks(); /* after XOR */
+ sc->sc_ts = arc4random();
+ sc->sc_tsoff = sc->sc_ts - tcp_ts_getticks();
}
TCPSTAT_INC(tcps_sc_sendcookie);
+ return (iss);
}
static struct syncache *
syncookie_lookup(struct in_conninfo *inc, struct syncache_head *sch,
- struct syncache *sc, struct tcpopt *to, struct tcphdr *th,
- struct socket *so)
+ struct syncache *sc, struct tcphdr *th, struct tcpopt *to,
+ struct socket *lso)
{
- MD5_CTX ctx;
- u_int32_t md5_buffer[MD5_DIGEST_LENGTH / sizeof(u_int32_t)];
- u_int32_t data = 0;
- u_int32_t *secbits;
+ uint32_t hash;
+ uint8_t *secbits;
tcp_seq ack, seq;
- int off, mss, wnd, flags;
+ int wnd, wscale = 0;
+ union syncookie cookie;
SCH_LOCK_ASSERT(sch);
/*
- * Pull information out of SYN-ACK/ACK and
- * revert sequence number advances.
+ * Pull information out of SYN-ACK/ACK and revert sequence number
+ * advances.
*/
ack = th->th_ack - 1;
seq = th->th_seq - 1;
- off = (ack >> 1) & 0x7;
- mss = (ack >> 4) & 0x7;
- flags = ack & 0x7f;
-
- /* Which of the two secrets to use. */
- secbits = (flags & 0x1) ? sch->sch_secbits_odd : sch->sch_secbits_even;
/*
- * The secret wasn't updated for the lifetime of a syncookie,
- * so this SYN-ACK/ACK is either too old (replay) or totally bogus.
+ * Unpack the flags containing enough information to restore the
+ * connection.
*/
- if (sch->sch_reseed + SYNCOOKIE_LIFETIME < time_uptime) {
- return (NULL);
- }
+ cookie.cookie = (ack & 0xff) ^ (ack >> 24);
- /* Recompute the digest so we can compare it. */
- MD5Init(&ctx);
- MD5Update(&ctx, ((u_int8_t *)secbits) + off,
- SYNCOOKIE_SECRET_SIZE * sizeof(*secbits) - off);
- MD5Update(&ctx, secbits, off);
- MD5Update(&ctx, inc, sizeof(*inc));
- MD5Update(&ctx, &seq, sizeof(seq));
- MD5Update(&ctx, &flags, sizeof(flags));
- MD5Final((u_int8_t *)&md5_buffer, &ctx);
-
- /* Does the digest part of or ACK'ed ISS match? */
- if ((ack & (~0x7f)) != (md5_buffer[0] << 7))
- return (NULL);
+ /* Which of the two secrets to use. */
+ secbits = sch->sch_sc->secret.key[cookie.flags.odd_even];
- /* Does the digest part of our reflected timestamp match? */
- if (to->to_flags & TOF_TS) {
- data = md5_buffer[3] ^ to->to_tsecr;
- if ((data & (~0x3ff)) != (md5_buffer[2] << 10))
- return (NULL);
- }
+ hash = syncookie_mac(inc, seq, cookie.cookie, secbits, (uintptr_t)sch);
+
+ /* The recomputed hash matches the ACK if this was a genuine cookie. */
+ if ((ack & ~0xff) != (hash & ~0xff))
+ return (NULL);
/* Fill in the syncache values. */
+ sc->sc_flags = 0;
bcopy(inc, &sc->sc_inc, sizeof(struct in_conninfo));
sc->sc_ipopts = NULL;
sc->sc_irs = seq;
sc->sc_iss = ack;
+ switch (inc->inc_flags & INC_ISIPV6) {
+#ifdef INET
+ case 0:
+ sc->sc_ip_ttl = sotoinpcb(lso)->inp_ip_ttl;
+ sc->sc_ip_tos = sotoinpcb(lso)->inp_ip_tos;
+ break;
+#endif
#ifdef INET6
- if (inc->inc_flags & INC_ISIPV6) {
- if (sotoinpcb(so)->inp_flags & IN6P_AUTOFLOWLABEL)
- sc->sc_flowlabel = md5_buffer[1] & IPV6_FLOWLABEL_MASK;
- } else
+ case INC_ISIPV6:
+ if (sotoinpcb(lso)->inp_flags & IN6P_AUTOFLOWLABEL)
+ sc->sc_flowlabel = sc->sc_iss & IPV6_FLOWLABEL_MASK;
+ break;
#endif
- {
- sc->sc_ip_ttl = sotoinpcb(so)->inp_ip_ttl;
- sc->sc_ip_tos = sotoinpcb(so)->inp_ip_tos;
}
- /* Additional parameters that were encoded in the timestamp. */
- if (data) {
+ sc->sc_peer_mss = tcp_sc_msstab[cookie.flags.mss_idx];
+
+ /* We can simply recompute receive window scale we sent earlier. */
+ while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < sb_max)
+ wscale++;
+
+ /* Only use wscale if it was enabled in the original SYN. */
+ if (cookie.flags.wscale_idx > 0) {
+ sc->sc_requested_r_scale = wscale;
+ sc->sc_requested_s_scale = tcp_sc_wstab[cookie.flags.wscale_idx];
+ sc->sc_flags |= SCF_WINSCALE;
+ }
+
+ wnd = sbspace(&lso->so_rcv);
+ wnd = imax(wnd, 0);
+ wnd = imin(wnd, TCP_MAXWIN);
+ sc->sc_wnd = wnd;
+
+ if (cookie.flags.sack_ok)
+ sc->sc_flags |= SCF_SACK;
+
+ if (to->to_flags & TOF_TS) {
sc->sc_flags |= SCF_TIMESTAMP;
sc->sc_tsreflect = to->to_tsval;
sc->sc_ts = to->to_tsecr;
sc->sc_tsoff = to->to_tsecr - tcp_ts_getticks();
- sc->sc_flags |= (data & 0x1) ? SCF_SIGNATURE : 0;
- sc->sc_flags |= ((data >> 1) & 0x1) ? SCF_SACK : 0;
- sc->sc_requested_s_scale = min((data >> 2) & 0xf,
- TCP_MAX_WINSHIFT);
- sc->sc_requested_r_scale = min((data >> 6) & 0xf,
- TCP_MAX_WINSHIFT);
- if (sc->sc_requested_s_scale || sc->sc_requested_r_scale)
- sc->sc_flags |= SCF_WINSCALE;
- } else
- sc->sc_flags |= SCF_NOOPT;
+ }
- wnd = sbspace(&so->so_rcv);
- wnd = imax(wnd, 0);
- wnd = imin(wnd, TCP_MAXWIN);
- sc->sc_wnd = wnd;
+ if (to->to_flags & TOF_SIGNATURE)
+ sc->sc_flags |= SCF_SIGNATURE;
sc->sc_rxmits = 0;
- sc->sc_peer_mss = tcp_sc_msstab[mss];
TCPSTAT_INC(tcps_sc_recvcookie);
return (sc);
}
-/*
- * Returns the current number of syncache entries. This number
- * will probably change before you get around to calling
- * syncache_pcblist.
- */
-
-int
-syncache_pcbcount(void)
+#ifdef INVARIANTS
+static int /* always 0: differences are only logged, never acted on */
+syncookie_cmp(struct in_conninfo *inc, struct syncache_head *sch,
+ struct syncache *sc, struct tcphdr *th, struct tcpopt *to,
+ struct socket *lso)
{
- struct syncache_head *sch;
- int count, i;
+ struct syncache scs, *scx; /* scs: zeroed scratch entry filled from the cookie */
+ char *s; /* log prefix from tcp_log_addrs(), freed before return */
- for (count = 0, i = 0; i < V_tcp_syncache.hashsize; i++) {
- /* No need to lock for a read. */
- sch = &V_tcp_syncache.hashbase[i];
- count += sch->sch_length;
+ bzero(&scs, sizeof(scs));
+ scx = syncookie_lookup(inc, sch, &scs, th, to, lso); /* NULL when the cookie does not validate */
+
+ if ((s = tcp_log_addrs(inc, th, NULL, NULL)) == NULL)
+ return (0); /* no log prefix available, so nothing is reported */
+
+ if (scx != NULL) { /* compare the caller's entry sc with the cookie-derived one */
+ if (sc->sc_peer_mss != scx->sc_peer_mss)
+ log(LOG_DEBUG, "%s; %s: mss different %i vs %i\n",
+ s, __func__, sc->sc_peer_mss, scx->sc_peer_mss);
+
+ if (sc->sc_requested_r_scale != scx->sc_requested_r_scale)
+ log(LOG_DEBUG, "%s; %s: rwscale different %i vs %i\n",
+ s, __func__, sc->sc_requested_r_scale,
+ scx->sc_requested_r_scale);
+
+ if (sc->sc_requested_s_scale != scx->sc_requested_s_scale)
+ log(LOG_DEBUG, "%s; %s: swscale different %i vs %i\n",
+ s, __func__, sc->sc_requested_s_scale,
+ scx->sc_requested_s_scale);
+
+ if ((sc->sc_flags & SCF_SACK) != (scx->sc_flags & SCF_SACK))
+ log(LOG_DEBUG, "%s; %s: SACK different\n", s, __func__);
}
- return count;
+
+ if (s != NULL) /* always true here: the NULL case returned early above */
+ free(s, M_TCPLOG);
+ return (0);
+}
+#endif /* INVARIANTS */
+
+static void
+syncookie_reseed(void *arg) /* arg: the struct tcp_syncache whose secret is refreshed */
+{
+ struct tcp_syncache *sc = arg;
+ uint8_t *secbits;
+ int secbit;
+
+ /*
+ * Reseeding the secret doesn't have to be protected by a lock.
+ * It only must be ensured that the new random values are visible
+ * to all CPUs in a SMP environment. The atomic with release
+ * semantics ensures that.
+ */
+ secbit = (sc->secret.oddeven & 0x1) ? 0 : 1; /* the key slot NOT selected by the low bit of oddeven */
+ secbits = sc->secret.key[secbit];
+ arc4rand(secbits, SYNCOOKIE_SECRET_SIZE, 0); /* overwrite that slot with SYNCOOKIE_SECRET_SIZE new bytes */
+ atomic_add_rel_int(&sc->secret.oddeven, 1); /* flips the low bit; presumably makes the refreshed slot current - confirm against the cookie generator */
+
+ /* Reschedule ourself. */
+ callout_schedule(&sc->secret.reseed, SYNCOOKIE_LIFETIME * hz); /* run again after SYNCOOKIE_LIFETIME seconds */
}
/*
diff --git a/freebsd/sys/netinet/tcp_syncache.h b/freebsd/sys/netinet/tcp_syncache.h
index c55bfbcd..6b12c13a 100644
--- a/freebsd/sys/netinet/tcp_syncache.h
+++ b/freebsd/sys/netinet/tcp_syncache.h
@@ -41,13 +41,11 @@ void syncache_destroy(void);
void syncache_unreach(struct in_conninfo *, struct tcphdr *);
int syncache_expand(struct in_conninfo *, struct tcpopt *,
struct tcphdr *, struct socket **, struct mbuf *);
-void syncache_add(struct in_conninfo *, struct tcpopt *,
- struct tcphdr *, struct inpcb *, struct socket **, struct mbuf *);
-void tcp_offload_syncache_add(struct in_conninfo *, struct tcpopt *,
- struct tcphdr *, struct inpcb *, struct socket **, void *, void *);
+int syncache_add(struct in_conninfo *, struct tcpopt *,
+ struct tcphdr *, struct inpcb *, struct socket **, struct mbuf *,
+ void *, void *);
void syncache_chkrst(struct in_conninfo *, struct tcphdr *);
void syncache_badack(struct in_conninfo *);
-int syncache_pcbcount(void);
int syncache_pcblist(struct sysctl_req *req, int max_pcbs, int *pcbs_exported);
struct syncache {
@@ -75,7 +73,10 @@ struct syncache {
#endif
struct label *sc_label; /* MAC label reference */
struct ucred *sc_cred; /* cred cache for jail checks */
-
+#ifdef TCP_RFC7413
+ void *sc_tfo_cookie; /* for TCP Fast Open response */
+#endif
+ void *sc_pspare; /* TCP_SIGNATURE */
u_int32_t sc_spare[2]; /* UTO */
};
@@ -91,20 +92,23 @@ struct syncache {
#define SCF_SACK 0x80 /* send SACK option */
#define SCF_ECN 0x100 /* send ECN setup packet */
-#define SYNCOOKIE_SECRET_SIZE 8 /* dwords */
-#define SYNCOOKIE_LIFETIME 16 /* seconds */
-
struct syncache_head {
- struct vnet *sch_vnet;
struct mtx sch_mtx;
TAILQ_HEAD(sch_head, syncache) sch_bucket;
struct callout sch_timer;
int sch_nextc;
u_int sch_length;
- u_int sch_oddeven;
- u_int32_t sch_secbits_odd[SYNCOOKIE_SECRET_SIZE];
- u_int32_t sch_secbits_even[SYNCOOKIE_SECRET_SIZE];
- u_int sch_reseed; /* time_uptime, seconds */
+ struct tcp_syncache *sch_sc;
+};
+
+#define SYNCOOKIE_SECRET_SIZE 16
+#define SYNCOOKIE_LIFETIME 15 /* seconds */
+
+struct syncookie_secret {
+ volatile u_int oddeven;
+ uint8_t key[2][SYNCOOKIE_SECRET_SIZE];
+ struct callout reseed;
+ u_int lifetime;
};
struct tcp_syncache {
@@ -115,7 +119,20 @@ struct tcp_syncache {
u_int bucket_limit;
u_int cache_limit;
u_int rexmt_limit;
- u_int hash_secret;
+ uint32_t hash_secret;
+ struct vnet *vnet;
+ struct syncookie_secret secret;
+};
+
+/* Internal use for the syncookie functions. */
+union syncookie {
+ uint8_t cookie;
+ struct {
+ uint8_t odd_even:1,
+ sack_ok:1,
+ wscale_idx:3,
+ mss_idx:3;
+ } flags;
};
#endif /* _KERNEL */
diff --git a/freebsd/sys/netinet/tcp_timer.c b/freebsd/sys/netinet/tcp_timer.c
index db952e42..edfc3829 100644
--- a/freebsd/sys/netinet/tcp_timer.c
+++ b/freebsd/sys/netinet/tcp_timer.c
@@ -37,6 +37,7 @@ __FBSDID("$FreeBSD$");
#include <rtems/bsd/local/opt_inet.h>
#include <rtems/bsd/local/opt_inet6.h>
#include <rtems/bsd/local/opt_tcpdebug.h>
+#include <rtems/bsd/local/opt_rss.h>
#include <rtems/bsd/sys/param.h>
#include <sys/kernel.h>
@@ -52,24 +53,40 @@ __FBSDID("$FreeBSD$");
#include <net/if.h>
#include <net/route.h>
+#include <net/rss_config.h>
#include <net/vnet.h>
+#include <net/netisr.h>
-#include <netinet/cc.h>
#include <netinet/in.h>
+#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
+#include <netinet/in_rss.h>
#include <netinet/in_systm.h>
#ifdef INET6
#include <netinet6/in6_pcb.h>
#endif
#include <netinet/ip_var.h>
+#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
+#include <netinet/cc/cc.h>
+#ifdef INET6
+#include <netinet6/tcp6_var.h>
+#endif
#include <netinet/tcpip.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif
+int tcp_persmin;
+SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmin, CTLTYPE_INT|CTLFLAG_RW,
+ &tcp_persmin, 0, sysctl_msec_to_ticks, "I", "minimum persistence interval");
+
+int tcp_persmax;
+SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmax, CTLTYPE_INT|CTLFLAG_RW,
+ &tcp_persmax, 0, sysctl_msec_to_ticks, "I", "maximum persistence interval");
+
int tcp_keepinit;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW,
&tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "time to establish connection");
@@ -121,17 +138,110 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0,
/* max idle probes */
int tcp_maxpersistidle;
-static int tcp_rexmit_drop_options = 1;
+static int tcp_rexmit_drop_options = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW,
&tcp_rexmit_drop_options, 0,
"Drop TCP options from 3rd and later retransmitted SYN");
+static VNET_DEFINE(int, tcp_pmtud_blackhole_detect);
+#define V_tcp_pmtud_blackhole_detect VNET(tcp_pmtud_blackhole_detect)
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection,
+ CTLFLAG_RW|CTLFLAG_VNET,
+ &VNET_NAME(tcp_pmtud_blackhole_detect), 0,
+ "Path MTU Discovery Black Hole Detection Enabled");
+
+static VNET_DEFINE(int, tcp_pmtud_blackhole_activated);
+#define V_tcp_pmtud_blackhole_activated \
+ VNET(tcp_pmtud_blackhole_activated)
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated,
+ CTLFLAG_RD|CTLFLAG_VNET,
+ &VNET_NAME(tcp_pmtud_blackhole_activated), 0,
+ "Path MTU Discovery Black Hole Detection, Activation Count");
+
+static VNET_DEFINE(int, tcp_pmtud_blackhole_activated_min_mss);
+#define V_tcp_pmtud_blackhole_activated_min_mss \
+ VNET(tcp_pmtud_blackhole_activated_min_mss)
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated_min_mss,
+ CTLFLAG_RD|CTLFLAG_VNET,
+ &VNET_NAME(tcp_pmtud_blackhole_activated_min_mss), 0,
+ "Path MTU Discovery Black Hole Detection, Activation Count at min MSS");
+
+static VNET_DEFINE(int, tcp_pmtud_blackhole_failed);
+#define V_tcp_pmtud_blackhole_failed VNET(tcp_pmtud_blackhole_failed)
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_failed,
+ CTLFLAG_RD|CTLFLAG_VNET,
+ &VNET_NAME(tcp_pmtud_blackhole_failed), 0,
+ "Path MTU Discovery Black Hole Detection, Failure Count");
+
+#ifdef INET
+static VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200;
+#define V_tcp_pmtud_blackhole_mss VNET(tcp_pmtud_blackhole_mss)
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss,
+ CTLFLAG_RW|CTLFLAG_VNET,
+ &VNET_NAME(tcp_pmtud_blackhole_mss), 0,
+ "Path MTU Discovery Black Hole Detection lowered MSS");
+#endif
+
+#ifdef INET6
+static VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220;
+#define V_tcp_v6pmtud_blackhole_mss VNET(tcp_v6pmtud_blackhole_mss)
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss,
+ CTLFLAG_RW|CTLFLAG_VNET,
+ &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0,
+ "Path MTU Discovery IPv6 Black Hole Detection lowered MSS");
+#endif
+
+#ifdef RSS
+static int per_cpu_timers = 1;
+#else
static int per_cpu_timers = 0;
+#endif
SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW,
&per_cpu_timers , 0, "run tcp timers on all cpus");
+#if 0
#define INP_CPU(inp) (per_cpu_timers ? (!CPU_ABSENT(((inp)->inp_flowid % (mp_maxid+1))) ? \
((inp)->inp_flowid % (mp_maxid+1)) : curcpu) : 0)
+#endif
+
+/*
+ * Map the given inp to a CPU id.
+ *
+ * With per_cpu_timers off this is always 0. Otherwise RSS is queried if
+ * compiled in, else inp_flowid is reduced modulo (mp_maxid + 1); curcpu is the fallback.
+ */
+static inline int
+inp_to_cpuid(struct inpcb *inp)
+{
+ u_int cpuid;
+
+#ifdef RSS
+ if (per_cpu_timers) {
+ cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
+ if (cpuid == NETISR_CPUID_NONE) /* RSS gave no CPU for this flow */
+ return (curcpu); /* XXX */
+ else
+ return (cpuid);
+ }
+#else
+ /* Legacy, pre-RSS behaviour */
+ if (per_cpu_timers) {
+ /*
+ * We don't have a flowid -> cpuid mapping, so cheat and
+ * just map unknown cpuids to curcpu. Not the best, but
+ * apparently better than defaulting to swi 0.
+ */
+ cpuid = inp->inp_flowid % (mp_maxid + 1);
+ if (! CPU_ABSENT(cpuid))
+ return (cpuid);
+ return (curcpu);
+ }
+#endif
+ /* Default for RSS and non-RSS - cpuid 0 */
+ else { /* pairs with whichever "if (per_cpu_timers)" the preprocessor kept */
+ return (0);
+ }
+}
/*
* Tcp protocol timeout routine called every 500 ms.
@@ -146,9 +256,7 @@ tcp_slowtimo(void)
VNET_LIST_RLOCK_NOSLEEP();
VNET_FOREACH(vnet_iter) {
CURVNET_SET(vnet_iter);
- INP_INFO_WLOCK(&V_tcbinfo);
(void) tcp_tw_2msl_scan(0);
- INP_INFO_WUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
}
VNET_LIST_RUNLOCK_NOSLEEP();
@@ -162,10 +270,6 @@ int tcp_backoff[TCP_MAXRXTSHIFT + 1] =
static int tcp_totbackoff = 2559; /* sum of tcp_backoff[] */
-static int tcp_timer_race;
-SYSCTL_INT(_net_inet_tcp, OID_AUTO, timer_race, CTLFLAG_RD, &tcp_timer_race,
- 0, "Count of t_inpcb races on tcp_discardcb");
-
/*
* TCP timer processing.
*/
@@ -178,18 +282,7 @@ tcp_timer_delack(void *xtp)
CURVNET_SET(tp->t_vnet);
inp = tp->t_inpcb;
- /*
- * XXXRW: While this assert is in fact correct, bugs in the tcpcb
- * tear-down mean we need it as a work-around for races between
- * timers and tcp_discardcb().
- *
- * KASSERT(inp != NULL, ("tcp_timer_delack: inp == NULL"));
- */
- if (inp == NULL) {
- tcp_timer_race++;
- CURVNET_RESTORE();
- return;
- }
+ KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
INP_WLOCK(inp);
if (callout_pending(&tp->t_timers->tt_delack) ||
!callout_active(&tp->t_timers->tt_delack)) {
@@ -203,14 +296,65 @@ tcp_timer_delack(void *xtp)
CURVNET_RESTORE();
return;
}
-
tp->t_flags |= TF_ACKNOW;
TCPSTAT_INC(tcps_delack);
- (void) tcp_output(tp);
+ (void) tp->t_fb->tfb_tcp_output(tp);
INP_WUNLOCK(inp);
CURVNET_RESTORE();
}
+/*
+ * When a timer wants to remove a TCB it must
+ * hold the INP_INFO_RLOCK(). The timer function
+ * should only have grabbed the INP_WLOCK() when
+ * it entered. To safely switch to holding both the
+ * INP_INFO_RLOCK() and the INP_WLOCK() we must first
+ * grab a reference on the inp, which will hold the inp
+ * so that it can't be removed. We then unlock the INP_WLOCK(),
+ * and grab the INP_INFO_RLOCK() lock. Once we have the INP_INFO_RLOCK()
+ * we proceed again to get the INP_WLOCK() (this preserves proper
+ * lock order). After acquiring the INP_WLOCK we must check if someone
+ * else deleted the pcb i.e. the inp_flags check.
+ * If so we return 1 otherwise we return 0.
+ *
+ * No matter what the tcp_inpinfo_lock_add() function
+ * returns the caller must afterwards call tcp_inpinfo_lock_del()
+ * to drop the locks and reference properly.
+ */
+
+int
+tcp_inpinfo_lock_add(struct inpcb *inp)
+{
+ in_pcbref(inp); /* reference keeps inp alive while its lock is dropped */
+ INP_WUNLOCK(inp); /* must release before taking the info lock (lock order) */
+ INP_INFO_RLOCK(&V_tcbinfo);
+ INP_WLOCK(inp);
+ if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { /* pcb changed state while unlocked */
+ return(1); /* caller must not close/drop; locks and reference are still held */
+ }
+ return(0); /* safe to proceed; locks and reference are still held */
+
+}
+
+void
+tcp_inpinfo_lock_del(struct inpcb *inp, struct tcpcb *tp) /* tp: result of tcp_close()/tcp_drop(), or the live tcpcb */
+{
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ if (inp && (tp == NULL)) {
+ /*
+ * If tcp_close/drop() gets called and tp
+ * returns NULL, then the function dropped
+ * the inp lock, we hold a reference keeping
+ * this around, so we must re-acquire the
+ * INP_WLOCK() in order to proceed with
+ * our dropping the inp reference.
+ */
+ INP_WLOCK(inp);
+ }
+ if (inp && in_pcbrele_wlocked(inp) == 0) /* presumably nonzero means the inp was freed and needs no unlock - confirm */
+ INP_WUNLOCK(inp);
+}
+
void
tcp_timer_2msl(void *xtp)
{
@@ -222,62 +366,66 @@ tcp_timer_2msl(void *xtp)
ostate = tp->t_state;
#endif
- /*
- * XXXRW: Does this actually happen?
- */
- INP_INFO_WLOCK(&V_tcbinfo);
inp = tp->t_inpcb;
- /*
- * XXXRW: While this assert is in fact correct, bugs in the tcpcb
- * tear-down mean we need it as a work-around for races between
- * timers and tcp_discardcb().
- *
- * KASSERT(inp != NULL, ("tcp_timer_2msl: inp == NULL"));
- */
- if (inp == NULL) {
- tcp_timer_race++;
- INP_INFO_WUNLOCK(&V_tcbinfo);
- CURVNET_RESTORE();
- return;
- }
+ KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
INP_WLOCK(inp);
tcp_free_sackholes(tp);
if (callout_pending(&tp->t_timers->tt_2msl) ||
!callout_active(&tp->t_timers->tt_2msl)) {
INP_WUNLOCK(tp->t_inpcb);
- INP_INFO_WUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
return;
}
callout_deactivate(&tp->t_timers->tt_2msl);
if ((inp->inp_flags & INP_DROPPED) != 0) {
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
return;
}
+ KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
+ ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
/*
* 2 MSL timeout in shutdown went off. If we're closed but
* still waiting for peer to close and connection has been idle
- * too long, or if 2MSL time is up from TIME_WAIT, delete connection
- * control block. Otherwise, check again in a bit.
+ * too long delete connection control block. Otherwise, check
+ * again in a bit.
+ *
+ * If in TIME_WAIT state just ignore as this timeout is handled in
+ * tcp_tw_2msl_scan().
*
* If fastrecycle of FIN_WAIT_2, in FIN_WAIT_2 and receiver has closed,
* there's no point in hanging onto FIN_WAIT_2 socket. Just close it.
* Ignore fact that there were recent incoming segments.
*/
+ if ((inp->inp_flags & INP_TIMEWAIT) != 0) {
+ INP_WUNLOCK(inp);
+ CURVNET_RESTORE();
+ return;
+ }
if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 &&
tp->t_inpcb && tp->t_inpcb->inp_socket &&
(tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) {
TCPSTAT_INC(tcps_finwait2_drops);
+ if (tcp_inpinfo_lock_add(inp)) {
+ tcp_inpinfo_lock_del(inp, tp);
+ goto out;
+ }
tp = tcp_close(tp);
+ tcp_inpinfo_lock_del(inp, tp);
+ goto out;
} else {
- if (tp->t_state != TCPS_TIME_WAIT &&
- ticks - tp->t_rcvtime <= TP_MAXIDLE(tp))
- callout_reset_on(&tp->t_timers->tt_2msl,
- TP_KEEPINTVL(tp), tcp_timer_2msl, tp, INP_CPU(inp));
- else
- tp = tcp_close(tp);
+ if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) {
+ callout_reset(&tp->t_timers->tt_2msl,
+ TP_KEEPINTVL(tp), tcp_timer_2msl, tp);
+ } else {
+ if (tcp_inpinfo_lock_add(inp)) {
+ tcp_inpinfo_lock_del(inp, tp);
+ goto out;
+ }
+ tp = tcp_close(tp);
+ tcp_inpinfo_lock_del(inp, tp);
+ goto out;
+ }
}
#ifdef TCPDEBUG
@@ -285,9 +433,11 @@ tcp_timer_2msl(void *xtp)
tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
PRU_SLOWTIMO);
#endif
+ TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
+
if (tp != NULL)
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+out:
CURVNET_RESTORE();
}
@@ -303,36 +453,23 @@ tcp_timer_keep(void *xtp)
ostate = tp->t_state;
#endif
- INP_INFO_WLOCK(&V_tcbinfo);
inp = tp->t_inpcb;
- /*
- * XXXRW: While this assert is in fact correct, bugs in the tcpcb
- * tear-down mean we need it as a work-around for races between
- * timers and tcp_discardcb().
- *
- * KASSERT(inp != NULL, ("tcp_timer_keep: inp == NULL"));
- */
- if (inp == NULL) {
- tcp_timer_race++;
- INP_INFO_WUNLOCK(&V_tcbinfo);
- CURVNET_RESTORE();
- return;
- }
+ KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
INP_WLOCK(inp);
if (callout_pending(&tp->t_timers->tt_keep) ||
!callout_active(&tp->t_timers->tt_keep)) {
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
return;
}
callout_deactivate(&tp->t_timers->tt_keep);
if ((inp->inp_flags & INP_DROPPED) != 0) {
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
return;
}
+ KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
+ ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
/*
* Keep-alive timer went off; send something
* or drop connection if idle for too long.
@@ -364,24 +501,29 @@ tcp_timer_keep(void *xtp)
tp->rcv_nxt, tp->snd_una - 1, 0);
free(t_template, M_TEMP);
}
- callout_reset_on(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp),
- tcp_timer_keep, tp, INP_CPU(inp));
+ callout_reset(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp),
+ tcp_timer_keep, tp);
} else
- callout_reset_on(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp),
- tcp_timer_keep, tp, INP_CPU(inp));
+ callout_reset(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp),
+ tcp_timer_keep, tp);
#ifdef TCPDEBUG
if (inp->inp_socket->so_options & SO_DEBUG)
tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
PRU_SLOWTIMO);
#endif
+ TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
return;
dropit:
TCPSTAT_INC(tcps_keepdrops);
+
+ if (tcp_inpinfo_lock_add(inp)) {
+ tcp_inpinfo_lock_del(inp, tp);
+ goto out;
+ }
tp = tcp_drop(tp, ETIMEDOUT);
#ifdef TCPDEBUG
@@ -389,9 +531,9 @@ dropit:
tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
PRU_SLOWTIMO);
#endif
- if (tp != NULL)
- INP_WUNLOCK(tp->t_inpcb);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
+ tcp_inpinfo_lock_del(inp, tp);
+out:
CURVNET_RESTORE();
}
@@ -406,38 +548,25 @@ tcp_timer_persist(void *xtp)
ostate = tp->t_state;
#endif
- INP_INFO_WLOCK(&V_tcbinfo);
inp = tp->t_inpcb;
- /*
- * XXXRW: While this assert is in fact correct, bugs in the tcpcb
- * tear-down mean we need it as a work-around for races between
- * timers and tcp_discardcb().
- *
- * KASSERT(inp != NULL, ("tcp_timer_persist: inp == NULL"));
- */
- if (inp == NULL) {
- tcp_timer_race++;
- INP_INFO_WUNLOCK(&V_tcbinfo);
- CURVNET_RESTORE();
- return;
- }
+ KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
INP_WLOCK(inp);
if (callout_pending(&tp->t_timers->tt_persist) ||
!callout_active(&tp->t_timers->tt_persist)) {
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
return;
}
callout_deactivate(&tp->t_timers->tt_persist);
if ((inp->inp_flags & INP_DROPPED) != 0) {
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
return;
}
+ KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
+ ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
/*
- * Persistance timer into zero window.
+ * Persistence timer into zero window.
* Force a byte to be output, if possible.
*/
TCPSTAT_INC(tcps_persisttimeo);
@@ -452,7 +581,12 @@ tcp_timer_persist(void *xtp)
(ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
TCPSTAT_INC(tcps_persistdrop);
+ if (tcp_inpinfo_lock_add(inp)) {
+ tcp_inpinfo_lock_del(inp, tp);
+ goto out;
+ }
tp = tcp_drop(tp, ETIMEDOUT);
+ tcp_inpinfo_lock_del(inp, tp);
goto out;
}
/*
@@ -462,22 +596,26 @@ tcp_timer_persist(void *xtp)
if (tp->t_state > TCPS_CLOSE_WAIT &&
(ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
TCPSTAT_INC(tcps_persistdrop);
+ if (tcp_inpinfo_lock_add(inp)) {
+ tcp_inpinfo_lock_del(inp, tp);
+ goto out;
+ }
tp = tcp_drop(tp, ETIMEDOUT);
+ tcp_inpinfo_lock_del(inp, tp);
goto out;
}
tcp_setpersist(tp);
tp->t_flags |= TF_FORCEDATA;
- (void) tcp_output(tp);
+ (void) tp->t_fb->tfb_tcp_output(tp);
tp->t_flags &= ~TF_FORCEDATA;
-out:
#ifdef TCPDEBUG
if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO);
#endif
- if (tp != NULL)
- INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
+ INP_WUNLOCK(inp);
+out:
CURVNET_RESTORE();
}
@@ -487,44 +625,34 @@ tcp_timer_rexmt(void * xtp)
struct tcpcb *tp = xtp;
CURVNET_SET(tp->t_vnet);
int rexmt;
- int headlocked;
struct inpcb *inp;
#ifdef TCPDEBUG
int ostate;
ostate = tp->t_state;
#endif
- INP_INFO_RLOCK(&V_tcbinfo);
inp = tp->t_inpcb;
- /*
- * XXXRW: While this assert is in fact correct, bugs in the tcpcb
- * tear-down mean we need it as a work-around for races between
- * timers and tcp_discardcb().
- *
- * KASSERT(inp != NULL, ("tcp_timer_rexmt: inp == NULL"));
- */
- if (inp == NULL) {
- tcp_timer_race++;
- INP_INFO_RUNLOCK(&V_tcbinfo);
- CURVNET_RESTORE();
- return;
- }
+ KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
INP_WLOCK(inp);
if (callout_pending(&tp->t_timers->tt_rexmt) ||
!callout_active(&tp->t_timers->tt_rexmt)) {
INP_WUNLOCK(inp);
- INP_INFO_RUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
return;
}
callout_deactivate(&tp->t_timers->tt_rexmt);
if ((inp->inp_flags & INP_DROPPED) != 0) {
INP_WUNLOCK(inp);
- INP_INFO_RUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
return;
}
+ KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
+ ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
tcp_free_sackholes(tp);
+ if (tp->t_fb->tfb_tcp_rexmit_tmr) {
+ /* The stack has a timer action too. */
+ (*tp->t_fb->tfb_tcp_rexmit_tmr)(tp);
+ }
/*
* Retransmission timer went off. Message has not
* been acked within retransmit interval. Back off
@@ -533,30 +661,15 @@ tcp_timer_rexmt(void * xtp)
if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
tp->t_rxtshift = TCP_MAXRXTSHIFT;
TCPSTAT_INC(tcps_timeoutdrop);
- in_pcbref(inp);
- INP_INFO_RUNLOCK(&V_tcbinfo);
- INP_WUNLOCK(inp);
- INP_INFO_WLOCK(&V_tcbinfo);
- INP_WLOCK(inp);
- if (in_pcbrele_wlocked(inp)) {
- INP_INFO_WUNLOCK(&V_tcbinfo);
- CURVNET_RESTORE();
- return;
- }
- if (inp->inp_flags & INP_DROPPED) {
- INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
- CURVNET_RESTORE();
- return;
+ if (tcp_inpinfo_lock_add(inp)) {
+ tcp_inpinfo_lock_del(inp, tp);
+ goto out;
}
-
tp = tcp_drop(tp, tp->t_softerror ?
tp->t_softerror : ETIMEDOUT);
- headlocked = 1;
+ tcp_inpinfo_lock_del(inp, tp);
goto out;
}
- INP_INFO_RUNLOCK(&V_tcbinfo);
- headlocked = 0;
if (tp->t_state == TCPS_SYN_SENT) {
/*
* If the SYN was retransmitted, indicate CWND to be
@@ -589,12 +702,120 @@ tcp_timer_rexmt(void * xtp)
} else
tp->t_flags &= ~TF_PREVVALID;
TCPSTAT_INC(tcps_rexmttimeo);
- if (tp->t_state == TCPS_SYN_SENT)
+ if ((tp->t_state == TCPS_SYN_SENT) ||
+ (tp->t_state == TCPS_SYN_RECEIVED))
rexmt = TCPTV_RTOBASE * tcp_syn_backoff[tp->t_rxtshift];
else
rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
TCPT_RANGESET(tp->t_rxtcur, rexmt,
tp->t_rttmin, TCPTV_REXMTMAX);
+
+ /*
+ * We enter the PLPMTUD path if the connection is ESTABLISHED or in
+ * FIN_WAIT_1. The latter is included because when the amount of data
+ * we send is very small, we could send it in a couple of packets and
+ * proceed straight to FIN. In that case we would never catch the
+ * ESTABLISHED state.
+ */
+ if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED))
+ || (tp->t_state == TCPS_FIN_WAIT_1))) {
+#ifdef INET6
+ int isipv6;
+#endif
+
+ /*
+ * Idea here is that at each stage of mtu probe (usually, 1448
+ * -> 1188 -> 524) should be given 2 chances to recover before
+ * further clamping down. 'tp->t_rxtshift % 2 == 0' should
+ * take care of that.
+ */
+ if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) ==
+ (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) &&
+ (tp->t_rxtshift >= 2 && tp->t_rxtshift % 2 == 0)) {
+ /*
+ * Enter Path MTU Black-hole Detection mechanism:
+ * - Disable Path MTU Discovery (IP "DF" bit).
+ * - Reduce MTU to lower value than what we
+ * negotiated with peer.
+ */
+ /* Record that we may have found a black hole. */
+ tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;
+
+ /* Keep track of previous MSS. */
+ tp->t_pmtud_saved_maxseg = tp->t_maxseg;
+
+ /*
+ * Reduce the MSS to blackhole value or to the default
+ * in an attempt to retransmit.
+ */
+#ifdef INET6
+ isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0;
+ if (isipv6 &&
+ tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) {
+ /* Use the sysctl tuneable blackhole MSS. */
+ tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss;
+ V_tcp_pmtud_blackhole_activated++;
+ } else if (isipv6) {
+ /* Use the default MSS. */
+ tp->t_maxseg = V_tcp_v6mssdflt;
+ /*
+ * Disable Path MTU Discovery when we switch to
+ * minmss.
+ */
+ tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
+ V_tcp_pmtud_blackhole_activated_min_mss++;
+ }
+#endif
+#if defined(INET6) && defined(INET)
+ else
+#endif
+#ifdef INET
+ if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) {
+ /* Use the sysctl tuneable blackhole MSS. */
+ tp->t_maxseg = V_tcp_pmtud_blackhole_mss;
+ V_tcp_pmtud_blackhole_activated++;
+ } else {
+ /* Use the default MSS. */
+ tp->t_maxseg = V_tcp_mssdflt;
+ /*
+ * Disable Path MTU Discovery when we switch to
+ * minmss.
+ */
+ tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
+ V_tcp_pmtud_blackhole_activated_min_mss++;
+ }
+#endif
+ /*
+ * Reset the slow-start flight size
+ * as it may depend on the new MSS.
+ */
+ if (CC_ALGO(tp)->conn_init != NULL)
+ CC_ALGO(tp)->conn_init(tp->ccv);
+ } else {
+ /*
+ * If further retransmissions are still unsuccessful
+ * with a lowered MTU, maybe this isn't a blackhole and
+ * we restore the previous MSS and blackhole detection
+ * flags.
+ * The limit '6' is determined by giving each probe
+ * stage (1448, 1188, 524) 2 chances to recover.
+ */
+ if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
+ (tp->t_rxtshift > 6)) {
+ tp->t_flags2 |= TF2_PLPMTU_PMTUD;
+ tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
+ tp->t_maxseg = tp->t_pmtud_saved_maxseg;
+ V_tcp_pmtud_blackhole_failed++;
+ /*
+ * Reset the slow-start flight size as it
+ * may depend on the new MSS.
+ */
+ if (CC_ALGO(tp)->conn_init != NULL)
+ CC_ALGO(tp)->conn_init(tp->ccv);
+ }
+ }
+ }
+
/*
* Disable RFC1323 and SACK if we haven't got any response to
* our third SYN to work-around some broken terminal servers
@@ -615,7 +836,9 @@ tcp_timer_rexmt(void * xtp)
#ifdef INET6
if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
in6_losing(tp->t_inpcb);
+ else
#endif
+ in_losing(tp->t_inpcb);
tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
tp->t_srtt = 0;
}
@@ -632,34 +855,35 @@ tcp_timer_rexmt(void * xtp)
cc_cong_signal(tp, NULL, CC_RTO);
- (void) tcp_output(tp);
+ (void) tp->t_fb->tfb_tcp_output(tp);
-out:
#ifdef TCPDEBUG
if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
PRU_SLOWTIMO);
#endif
- if (tp != NULL)
- INP_WUNLOCK(inp);
- if (headlocked)
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
+ INP_WUNLOCK(inp);
+out:
CURVNET_RESTORE();
}
void
-tcp_timer_activate(struct tcpcb *tp, int timer_type, u_int delta)
+tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta)
{
struct callout *t_callout;
- void *f_callout;
+ timeout_t *f_callout;
struct inpcb *inp = tp->t_inpcb;
- int cpu = INP_CPU(inp);
+ int cpu = inp_to_cpuid(inp);
#ifdef TCP_OFFLOAD
if (tp->t_flags & TF_TOE)
return;
#endif
+ if (tp->t_timers->tt_flags & TT_STOPPED)
+ return;
+
switch (timer_type) {
case TT_DELACK:
t_callout = &tp->t_timers->tt_delack;
@@ -682,7 +906,11 @@ tcp_timer_activate(struct tcpcb *tp, int timer_type, u_int delta)
f_callout = tcp_timer_2msl;
break;
default:
- panic("bad timer_type");
+ if (tp->t_fb->tfb_tcp_timer_activate) {
+ tp->t_fb->tfb_tcp_timer_activate(tp, timer_type, delta);
+ return;
+ }
+ panic("tp %p bad timer_type %#x", tp, timer_type);
}
if (delta == 0) {
callout_stop(t_callout);
@@ -692,7 +920,7 @@ tcp_timer_activate(struct tcpcb *tp, int timer_type, u_int delta)
}
int
-tcp_timer_active(struct tcpcb *tp, int timer_type)
+tcp_timer_active(struct tcpcb *tp, uint32_t timer_type)
{
struct callout *t_callout;
@@ -713,28 +941,79 @@ tcp_timer_active(struct tcpcb *tp, int timer_type)
t_callout = &tp->t_timers->tt_2msl;
break;
default:
- panic("bad timer_type");
+ if (tp->t_fb->tfb_tcp_timer_active) {
+ return(tp->t_fb->tfb_tcp_timer_active(tp, timer_type));
+ }
+ panic("tp %p bad timer_type %#x", tp, timer_type);
}
return callout_active(t_callout);
}
+void
+tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type) /* timer_type: one of the TT_* selectors below */
+{
+ struct callout *t_callout;
+
+ tp->t_timers->tt_flags |= TT_STOPPED; /* set for every timer_type, before dispatch; tcp_timer_activate() returns early once this is set */
+ switch (timer_type) {
+ case TT_DELACK:
+ t_callout = &tp->t_timers->tt_delack;
+ break;
+ case TT_REXMT:
+ t_callout = &tp->t_timers->tt_rexmt;
+ break;
+ case TT_PERSIST:
+ t_callout = &tp->t_timers->tt_persist;
+ break;
+ case TT_KEEP:
+ t_callout = &tp->t_timers->tt_keep;
+ break;
+ case TT_2MSL:
+ t_callout = &tp->t_timers->tt_2msl;
+ break;
+ default: /* not a base-stack timer: hand off to the function block if it has a hook */
+ if (tp->t_fb->tfb_tcp_timer_stop) {
+ /*
+ * XXXrrs we need to look at this with the
+ * stop case below (flags).
+ */
+ tp->t_fb->tfb_tcp_timer_stop(tp, timer_type);
+ return;
+ }
+ panic("tp %p bad timer_type %#x", tp, timer_type);
+ }
+
+ if (callout_async_drain(t_callout, tcp_timer_discard) == 0) {
+ /*
+ * Can't stop the callout, defer tcpcb actual deletion
+ * to the last one. We do this using the async drain
+ * function and incrementing the count in tt_draincnt.
+ */
+ tp->t_timers->tt_draincnt++;
+ }
+}
+
#define ticks_to_msecs(t) (1000*(t) / hz)
void
-tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer, struct xtcp_timer *xtimer)
+tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer,
+ struct xtcp_timer *xtimer)
{
- bzero(xtimer, sizeof(struct xtcp_timer));
+ sbintime_t now;
+
+ bzero(xtimer, sizeof(*xtimer));
if (timer == NULL)
return;
+ now = getsbinuptime();
if (callout_active(&timer->tt_delack))
- xtimer->tt_delack = ticks_to_msecs(timer->tt_delack.c_time - ticks);
+ xtimer->tt_delack = (timer->tt_delack.c_time - now) / SBT_1MS;
if (callout_active(&timer->tt_rexmt))
- xtimer->tt_rexmt = ticks_to_msecs(timer->tt_rexmt.c_time - ticks);
+ xtimer->tt_rexmt = (timer->tt_rexmt.c_time - now) / SBT_1MS;
if (callout_active(&timer->tt_persist))
- xtimer->tt_persist = ticks_to_msecs(timer->tt_persist.c_time - ticks);
+ xtimer->tt_persist = (timer->tt_persist.c_time - now) / SBT_1MS;
if (callout_active(&timer->tt_keep))
- xtimer->tt_keep = ticks_to_msecs(timer->tt_keep.c_time - ticks);
+ xtimer->tt_keep = (timer->tt_keep.c_time - now) / SBT_1MS;
if (callout_active(&timer->tt_2msl))
- xtimer->tt_2msl = ticks_to_msecs(timer->tt_2msl.c_time - ticks);
+ xtimer->tt_2msl = (timer->tt_2msl.c_time - now) / SBT_1MS;
xtimer->t_rcvtime = ticks_to_msecs(ticks - tp->t_rcvtime);
}
diff --git a/freebsd/sys/netinet/tcp_timer.h b/freebsd/sys/netinet/tcp_timer.h
index 0da58fd8..bb78062d 100644
--- a/freebsd/sys/netinet/tcp_timer.h
+++ b/freebsd/sys/netinet/tcp_timer.h
@@ -76,9 +76,8 @@
#define TCPTV_SRTTBASE 0 /* base roundtrip time;
if 0, no idea yet */
#define TCPTV_RTOBASE ( 3*hz) /* assumed RTO if no info */
-#define TCPTV_SRTTDFLT ( 3*hz) /* assumed RTT if no info */
-#define TCPTV_PERSMIN ( 5*hz) /* retransmit persistence */
+#define TCPTV_PERSMIN ( 5*hz) /* minimum persist interval */
#define TCPTV_PERSMAX ( 60*hz) /* maximum persist interval */
#define TCPTV_KEEP_INIT ( 75*hz) /* initial connect keepalive */
@@ -122,7 +121,7 @@
#ifdef TCPTIMERS
static const char *tcptimers[] =
- { "REXMT", "PERSIST", "KEEP", "2MSL" };
+ { "REXMT", "PERSIST", "KEEP", "2MSL", "DELACK" };
#endif
/*
@@ -146,12 +145,27 @@ struct tcp_timer {
struct callout tt_keep; /* keepalive */
struct callout tt_2msl; /* 2*msl TIME_WAIT timer */
struct callout tt_delack; /* delayed ACK timer */
+ uint32_t tt_flags; /* Timers flags */
+ uint32_t tt_draincnt; /* Count being drained */
};
-#define TT_DELACK 0x01
-#define TT_REXMT 0x02
-#define TT_PERSIST 0x04
-#define TT_KEEP 0x08
-#define TT_2MSL 0x10
+
+/*
+ * Flags for the tt_flags field.
+ */
+#define TT_DELACK 0x0001
+#define TT_REXMT 0x0002
+#define TT_PERSIST 0x0004
+#define TT_KEEP 0x0008
+#define TT_2MSL 0x0010
+#define TT_MASK (TT_DELACK|TT_REXMT|TT_PERSIST|TT_KEEP|TT_2MSL)
+
+#define TT_DELACK_RST 0x0100
+#define TT_REXMT_RST 0x0200
+#define TT_PERSIST_RST 0x0400
+#define TT_KEEP_RST 0x0800
+#define TT_2MSL_RST 0x1000
+
+#define TT_STOPPED 0x00010000
#define TP_KEEPINIT(tp) ((tp)->t_keepinit ? (tp)->t_keepinit : tcp_keepinit)
#define TP_KEEPIDLE(tp) ((tp)->t_keepidle ? (tp)->t_keepidle : tcp_keepidle)
@@ -159,6 +173,8 @@ struct tcp_timer {
#define TP_KEEPCNT(tp) ((tp)->t_keepcnt ? (tp)->t_keepcnt : tcp_keepcnt)
#define TP_MAXIDLE(tp) (TP_KEEPCNT(tp) * TP_KEEPINTVL(tp))
+extern int tcp_persmin; /* minimum persist interval */
+extern int tcp_persmax; /* maximum persist interval */
extern int tcp_keepinit; /* time to establish connection */
extern int tcp_keepidle; /* time before keepalive probes begin */
extern int tcp_keepintvl; /* time between keepalive probes */
@@ -170,14 +186,19 @@ extern int tcp_rexmit_slop;
extern int tcp_msl;
extern int tcp_ttl; /* time to live for TCP segs */
extern int tcp_backoff[];
+extern int tcp_syn_backoff[];
extern int tcp_finwait2_timeout;
extern int tcp_fast_finwait2_recycle;
+int tcp_inpinfo_lock_add(struct inpcb *inp);
+void tcp_inpinfo_lock_del(struct inpcb *inp, struct tcpcb *tp);
+
void tcp_timer_init(void);
void tcp_timer_2msl(void *xtp);
+void tcp_timer_discard(void *);
struct tcptw *
- tcp_tw_2msl_scan(int _reuse); /* XXX temporary */
+ tcp_tw_2msl_scan(int reuse); /* XXX temporary? */
void tcp_timer_keep(void *xtp);
void tcp_timer_persist(void *xtp);
void tcp_timer_rexmt(void *xtp);
diff --git a/freebsd/sys/netinet/tcp_timewait.c b/freebsd/sys/netinet/tcp_timewait.c
index 9034fab4..330e842e 100644
--- a/freebsd/sys/netinet/tcp_timewait.c
+++ b/freebsd/sys/netinet/tcp_timewait.c
@@ -56,6 +56,7 @@ __FBSDID("$FreeBSD$");
#include <net/route.h>
#include <net/if.h>
+#include <net/if_var.h>
#include <net/vnet.h>
#include <netinet/in.h>
@@ -93,20 +94,41 @@ __FBSDID("$FreeBSD$");
#include <security/mac/mac_framework.h>
static VNET_DEFINE(uma_zone_t, tcptw_zone);
-#define V_tcptw_zone VNET(tcptw_zone)
+#define V_tcptw_zone VNET(tcptw_zone)
static int maxtcptw;
/*
* The timed wait queue contains references to each of the TCP sessions
* currently in the TIME_WAIT state. The queue pointers, including the
* queue pointers in each tcptw structure, are protected using the global
- * tcbinfo lock, which must be held over queue iteration and modification.
+ * timewait lock, which must be held over queue iteration and modification.
+ *
+ * Rules on tcptw usage:
+ * - an inpcb is always freed _after_ its tcptw
+ * - a tcptw relies on its inpcb reference counting for memory stability
+ * - a tcptw is dereferenceable only while its inpcb is locked
*/
static VNET_DEFINE(TAILQ_HEAD(, tcptw), twq_2msl);
-#define V_twq_2msl VNET(twq_2msl)
+#define V_twq_2msl VNET(twq_2msl)
+
+/* Global timewait lock */
+static VNET_DEFINE(struct rwlock, tw_lock);
+#define V_tw_lock VNET(tw_lock)
+
+#define TW_LOCK_INIT(tw, d) rw_init_flags(&(tw), (d), 0)
+#define TW_LOCK_DESTROY(tw) rw_destroy(&(tw))
+#define TW_RLOCK(tw) rw_rlock(&(tw))
+#define TW_WLOCK(tw) rw_wlock(&(tw))
+#define TW_RUNLOCK(tw) rw_runlock(&(tw))
+#define TW_WUNLOCK(tw) rw_wunlock(&(tw))
+#define TW_LOCK_ASSERT(tw) rw_assert(&(tw), RA_LOCKED)
+#define TW_RLOCK_ASSERT(tw) rw_assert(&(tw), RA_RLOCKED)
+#define TW_WLOCK_ASSERT(tw) rw_assert(&(tw), RA_WLOCKED)
+#define TW_UNLOCK_ASSERT(tw) rw_assert(&(tw), RA_UNLOCKED)
static void tcp_tw_2msl_reset(struct tcptw *, int);
-static void tcp_tw_2msl_stop(struct tcptw *);
+static void tcp_tw_2msl_stop(struct tcptw *, int);
+static int tcp_twrespond(struct tcptw *, int);
static int
tcptw_auto_size(void)
@@ -149,7 +171,7 @@ SYSCTL_PROC(_net_inet_tcp, OID_AUTO, maxtcptw, CTLTYPE_INT|CTLFLAG_RW,
VNET_DEFINE(int, nolocaltimewait) = 0;
#define V_nolocaltimewait VNET(nolocaltimewait)
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, nolocaltimewait, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, nolocaltimewait, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(nolocaltimewait), 0,
"Do not create compressed TCP TIME_WAIT entries for local connections");
@@ -166,13 +188,14 @@ tcp_tw_init(void)
{
V_tcptw_zone = uma_zcreate("tcptw", sizeof(struct tcptw),
- NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
TUNABLE_INT_FETCH("net.inet.tcp.maxtcptw", &maxtcptw);
if (maxtcptw == 0)
uma_zone_set_max(V_tcptw_zone, tcptw_auto_size());
else
uma_zone_set_max(V_tcptw_zone, maxtcptw);
TAILQ_INIT(&V_twq_2msl);
+ TW_LOCK_INIT(V_tw_lock, "tcptw");
}
#ifdef VIMAGE
@@ -181,11 +204,12 @@ tcp_tw_destroy(void)
{
struct tcptw *tw;
- INP_INFO_WLOCK(&V_tcbinfo);
- while((tw = TAILQ_FIRST(&V_twq_2msl)) != NULL)
+ INP_INFO_RLOCK(&V_tcbinfo);
+ while ((tw = TAILQ_FIRST(&V_twq_2msl)) != NULL)
tcp_twclose(tw, 0);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ TW_LOCK_DESTROY(V_tw_lock);
uma_zdestroy(V_tcptw_zone);
}
#endif
@@ -206,7 +230,7 @@ tcp_twstart(struct tcpcb *tp)
int isipv6 = inp->inp_inc.inc_flags & INC_ISIPV6;
#endif
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* tcp_tw_2msl_reset(). */
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
if (V_nolocaltimewait) {
@@ -229,8 +253,23 @@ tcp_twstart(struct tcpcb *tp)
}
}
+
+ /*
+ * For use only by DTrace. We do not reference the state
+ * after this point so modifying it in place is not a problem.
+ */
+ tcp_state_change(tp, TCPS_TIME_WAIT);
+
tw = uma_zalloc(V_tcptw_zone, M_NOWAIT);
if (tw == NULL) {
+ /*
+ * Reached limit on total number of TIMEWAIT connections
+ * allowed. Remove a connection from TIMEWAIT queue in LRU
+ * fashion to make room for this connection.
+ *
+ * XXX: Check if it is possible to always have enough room
+ * in advance based on guarantees provided by uma_zalloc().
+ */
tw = tcp_tw_2msl_scan(1);
if (tw == NULL) {
tp = tcp_close(tp);
@@ -239,7 +278,12 @@ tcp_twstart(struct tcpcb *tp)
return;
}
}
+ /*
+ * The tcptw will hold a reference on its inpcb until tcp_twclose
+ * is called
+ */
tw->tw_inpcb = inp;
+ in_pcbref(inp); /* Reference from tw */
/*
* Recover last window size sent.
@@ -313,53 +357,19 @@ tcp_twstart(struct tcpcb *tp)
INP_WUNLOCK(inp);
}
-#if 0
-/*
- * The appromixate rate of ISN increase of Microsoft TCP stacks;
- * the actual rate is slightly higher due to the addition of
- * random positive increments.
- *
- * Most other new OSes use semi-randomized ISN values, so we
- * do not need to worry about them.
- */
-#define MS_ISN_BYTES_PER_SECOND 250000
-
-/*
- * Determine if the ISN we will generate has advanced beyond the last
- * sequence number used by the previous connection. If so, indicate
- * that it is safe to recycle this tw socket by returning 1.
- */
-int
-tcp_twrecycleable(struct tcptw *tw)
-{
- tcp_seq new_iss = tw->iss;
- tcp_seq new_irs = tw->irs;
-
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
- new_iss += (ticks - tw->t_starttime) * (ISN_BYTES_PER_SECOND / hz);
- new_irs += (ticks - tw->t_starttime) * (MS_ISN_BYTES_PER_SECOND / hz);
-
- if (SEQ_GT(new_iss, tw->snd_nxt) && SEQ_GT(new_irs, tw->rcv_nxt))
- return (1);
- else
- return (0);
-}
-#endif
-
/*
* Returns 1 if the TIME_WAIT state was killed and we should start over,
* looking for a pcb in the listen state. Returns 0 otherwise.
*/
int
-tcp_twcheck(struct inpcb *inp, struct tcpopt *to, struct tcphdr *th,
+tcp_twcheck(struct inpcb *inp, struct tcpopt *to __unused, struct tcphdr *th,
struct mbuf *m, int tlen)
{
struct tcptw *tw;
int thflags;
tcp_seq seq;
- /* tcbinfo lock required for tcp_twclose(), tcp_tw_2msl_reset(). */
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
/*
@@ -460,11 +470,10 @@ tcp_twclose(struct tcptw *tw, int reuse)
inp = tw->tw_inpcb;
KASSERT((inp->inp_flags & INP_TIMEWAIT), ("tcp_twclose: !timewait"));
KASSERT(intotw(inp) == tw, ("tcp_twclose: inp_ppcb != tw"));
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* tcp_tw_2msl_stop(). */
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo); /* in_pcbfree() */
INP_WLOCK_ASSERT(inp);
- tw->tw_inpcb = NULL;
- tcp_tw_2msl_stop(tw);
+ tcp_tw_2msl_stop(tw, reuse);
inp->inp_ppcb = NULL;
in_pcbdrop(inp);
@@ -493,17 +502,17 @@ tcp_twclose(struct tcptw *tw, int reuse)
*/
INP_WUNLOCK(inp);
}
- } else
+ } else {
+ /*
+ * The socket has been already cleaned-up for us, only free the
+ * inpcb.
+ */
in_pcbfree(inp);
+ }
TCPSTAT_INC(tcps_closed);
- crfree(tw->tw_cred);
- tw->tw_cred = NULL;
- if (reuse)
- return;
- uma_zfree(V_tcptw_zone, tw);
}
-int
+static int
tcp_twrespond(struct tcptw *tw, int flags)
{
struct inpcb *inp = tw->tw_inpcb;
@@ -525,7 +534,7 @@ tcp_twrespond(struct tcptw *tw, int flags)
INP_WLOCK_ASSERT(inp);
- m = m_gethdr(M_DONTWAIT, MT_DATA);
+ m = m_gethdr(M_NOWAIT, MT_DATA);
if (m == NULL)
return (ENOBUFS);
m->m_data += max_linkhdr;
@@ -596,9 +605,9 @@ tcp_twrespond(struct tcptw *tw, int flags)
m->m_pkthdr.csum_flags = CSUM_TCP;
th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
htons(sizeof(struct tcphdr) + optlen + IPPROTO_TCP));
- ip->ip_len = m->m_pkthdr.len;
+ ip->ip_len = htons(m->m_pkthdr.len);
if (V_path_mtu_discovery)
- ip->ip_off |= IP_DF;
+ ip->ip_off |= htons(IP_DF);
error = ip_output(m, inp->inp_options, NULL,
((tw->tw_so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0),
NULL, inp);
@@ -616,36 +625,114 @@ static void
tcp_tw_2msl_reset(struct tcptw *tw, int rearm)
{
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(tw->tw_inpcb);
+
+ TW_WLOCK(V_tw_lock);
if (rearm)
TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl);
tw->tw_time = ticks + 2 * tcp_msl;
TAILQ_INSERT_TAIL(&V_twq_2msl, tw, tw_2msl);
+ TW_WUNLOCK(V_tw_lock);
}
static void
-tcp_tw_2msl_stop(struct tcptw *tw)
+tcp_tw_2msl_stop(struct tcptw *tw, int reuse)
{
+ struct ucred *cred;
+ struct inpcb *inp;
+ int released;
+
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+
+ TW_WLOCK(V_tw_lock);
+ inp = tw->tw_inpcb;
+ tw->tw_inpcb = NULL;
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl);
+ cred = tw->tw_cred;
+ tw->tw_cred = NULL;
+ TW_WUNLOCK(V_tw_lock);
+
+ if (cred != NULL)
+ crfree(cred);
+
+ released = in_pcbrele_wlocked(inp);
+ KASSERT(!released, ("%s: inp should not be released here", __func__));
+
+ if (!reuse)
+ uma_zfree(V_tcptw_zone, tw);
+ TCPSTATES_DEC(TCPS_TIME_WAIT);
}
struct tcptw *
tcp_tw_2msl_scan(int reuse)
{
struct tcptw *tw;
+ struct inpcb *inp;
+
+#ifdef INVARIANTS
+ if (reuse) {
+ /*
+ * Exclusive pcbinfo lock is not required in reuse case even if
+ * two inpcb locks can be acquired simultaneously:
+ * - the inpcb transitioning to TIME_WAIT state in
+ * tcp_twstart(),
+ * - the inpcb closed by tcp_twclose().
+ *
+ * This is because only inpcbs in FIN_WAIT2 or CLOSING states can
+ * transition to TIME_WAIT state. Thus an inpcb cannot be in the
+ * TIME_WAIT list and transitioning to TIME_WAIT state at the same
+ * time.
+ */
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+ }
+#endif
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
for (;;) {
+ TW_RLOCK(V_tw_lock);
tw = TAILQ_FIRST(&V_twq_2msl);
- if (tw == NULL || (!reuse && (tw->tw_time - ticks) > 0))
+ if (tw == NULL || (!reuse && (tw->tw_time - ticks) > 0)) {
+ TW_RUNLOCK(V_tw_lock);
break;
- INP_WLOCK(tw->tw_inpcb);
- tcp_twclose(tw, reuse);
- if (reuse)
- return (tw);
+ }
+ KASSERT(tw->tw_inpcb != NULL, ("%s: tw->tw_inpcb == NULL",
+ __func__));
+
+ inp = tw->tw_inpcb;
+ in_pcbref(inp);
+ TW_RUNLOCK(V_tw_lock);
+
+ if (INP_INFO_TRY_RLOCK(&V_tcbinfo)) {
+
+ INP_WLOCK(inp);
+ tw = intotw(inp);
+ if (in_pcbrele_wlocked(inp)) {
+ KASSERT(tw == NULL, ("%s: held last inp "
+ "reference but tw not NULL", __func__));
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ continue;
+ }
+
+ if (tw == NULL) {
+ /* tcp_twclose() has already been called */
+ INP_WUNLOCK(inp);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ continue;
+ }
+
+ tcp_twclose(tw, reuse);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ if (reuse)
+ return tw;
+ } else {
+ /* INP_INFO lock is busy, continue later. */
+ INP_WLOCK(inp);
+ if (!in_pcbrele_wlocked(inp))
+ INP_WUNLOCK(inp);
+ break;
+ }
}
- return (NULL);
+
+ return NULL;
}
diff --git a/freebsd/sys/netinet/tcp_usrreq.c b/freebsd/sys/netinet/tcp_usrreq.c
index 61711a6e..d5fa680f 100644
--- a/freebsd/sys/netinet/tcp_usrreq.c
+++ b/freebsd/sys/netinet/tcp_usrreq.c
@@ -49,6 +49,7 @@ __FBSDID("$FreeBSD$");
#include <sys/systm.h>
#include <sys/limits.h>
#include <sys/malloc.h>
+#include <sys/refcount.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/mbuf.h>
@@ -66,11 +67,12 @@ __FBSDID("$FreeBSD$");
#endif
#include <net/if.h>
+#include <net/if_var.h>
#include <net/route.h>
#include <net/vnet.h>
-#include <netinet/cc.h>
#include <netinet/in.h>
+#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
@@ -81,11 +83,19 @@ __FBSDID("$FreeBSD$");
#include <netinet6/ip6_var.h>
#include <netinet6/scope6_var.h>
#endif
+#ifdef TCP_RFC7413
+#include <netinet/tcp_fastopen.h>
+#endif
+#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
+#include <netinet/cc/cc.h>
+#ifdef TCPPCAP
+#include <netinet/tcp_pcap.h>
+#endif
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif
@@ -147,6 +157,7 @@ tcp_usr_attach(struct socket *so, int proto, struct thread *td)
tp = intotcpcb(inp);
out:
TCPDEBUG2(PRU_ATTACH);
+ TCP_PROBE2(debug__user, tp, PRU_ATTACH);
return error;
}
@@ -164,7 +175,7 @@ tcp_detach(struct socket *so, struct inpcb *inp)
{
struct tcpcb *tp;
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_LOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
KASSERT(so->so_pcb == inp, ("tcp_detach: so_pcb != inp"));
@@ -184,6 +195,21 @@ tcp_detach(struct socket *so, struct inpcb *inp)
* present until timewait ends.
*
* XXXRW: Would it be cleaner to free the tcptw here?
+ *
+ * Astute question indeed, from the tcptw perspective there are
+ * three cases to consider:
+ *
+ * #1 tcp_detach is called at tcptw creation time by
+ * tcp_twstart, then do not discard the newly created tcptw
+ * and leave inpcb present until timewait ends
+ * #2 tcp_detach is called at timewait end (or reuse) by
+ * tcp_twclose, then the tcptw has already been discarded
+ * (or reused) and inpcb is freed here
+ * #3 tcp_detach is called after timewait ends (or reuse)
+ * (e.g. by soclose), then tcptw has already been discarded
+ * (or reused) and inpcb is freed here
+ *
+ * In all three cases the tcptw should not be freed here.
*/
if (inp->inp_flags & INP_DROPPED) {
KASSERT(tp == NULL, ("tcp_detach: INP_TIMEWAIT && "
@@ -227,15 +253,20 @@ static void
tcp_usr_detach(struct socket *so)
{
struct inpcb *inp;
+ int rlock = 0;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp_usr_detach: inp == NULL"));
- INP_INFO_WLOCK(&V_tcbinfo);
+ if (!INP_INFO_WLOCKED(&V_tcbinfo)) {
+ INP_INFO_RLOCK(&V_tcbinfo);
+ rlock = 1;
+ }
INP_WLOCK(inp);
KASSERT(inp->inp_socket != NULL,
("tcp_usr_detach: inp_socket == NULL"));
tcp_detach(so, inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ if (rlock)
+ INP_INFO_RUNLOCK(&V_tcbinfo);
}
#ifdef INET
@@ -276,6 +307,7 @@ tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
INP_HASH_WUNLOCK(&V_tcbinfo);
out:
TCPDEBUG2(PRU_BIND);
+ TCP_PROBE2(debug__user, tp, PRU_BIND);
INP_WUNLOCK(inp);
return (error);
@@ -336,6 +368,7 @@ tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
INP_HASH_WUNLOCK(&V_tcbinfo);
out:
TCPDEBUG2(PRU_BIND);
+ TCP_PROBE2(debug__user, tp, PRU_BIND);
INP_WUNLOCK(inp);
return (error);
}
@@ -369,7 +402,7 @@ tcp_usr_listen(struct socket *so, int backlog, struct thread *td)
error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
INP_HASH_WUNLOCK(&V_tcbinfo);
if (error == 0) {
- tp->t_state = TCPS_LISTEN;
+ tcp_state_change(tp, TCPS_LISTEN);
solisten_proto(so, backlog);
#ifdef TCP_OFFLOAD
if ((so->so_options & SO_NO_OFFLOAD) == 0)
@@ -378,8 +411,13 @@ tcp_usr_listen(struct socket *so, int backlog, struct thread *td)
}
SOCK_UNLOCK(so);
+#ifdef TCP_RFC7413
+ if (tp->t_flags & TF_FASTOPEN)
+ tp->t_tfo_pending = tcp_fastopen_alloc_counter();
+#endif
out:
TCPDEBUG2(PRU_LISTEN);
+ TCP_PROBE2(debug__user, tp, PRU_LISTEN);
INP_WUNLOCK(inp);
return (error);
}
@@ -414,7 +452,7 @@ tcp6_usr_listen(struct socket *so, int backlog, struct thread *td)
}
INP_HASH_WUNLOCK(&V_tcbinfo);
if (error == 0) {
- tp->t_state = TCPS_LISTEN;
+ tcp_state_change(tp, TCPS_LISTEN);
solisten_proto(so, backlog);
#ifdef TCP_OFFLOAD
if ((so->so_options & SO_NO_OFFLOAD) == 0)
@@ -423,8 +461,13 @@ tcp6_usr_listen(struct socket *so, int backlog, struct thread *td)
}
SOCK_UNLOCK(so);
+#ifdef TCP_RFC7413
+ if (tp->t_flags & TF_FASTOPEN)
+ tp->t_tfo_pending = tcp_fastopen_alloc_counter();
+#endif
out:
TCPDEBUG2(PRU_LISTEN);
+ TCP_PROBE2(debug__user, tp, PRU_LISTEN);
INP_WUNLOCK(inp);
return (error);
}
@@ -462,8 +505,12 @@ tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp_usr_connect: inp == NULL"));
INP_WLOCK(inp);
- if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
- error = EINVAL;
+ if (inp->inp_flags & INP_TIMEWAIT) {
+ error = EADDRINUSE;
+ goto out;
+ }
+ if (inp->inp_flags & INP_DROPPED) {
+ error = ECONNREFUSED;
goto out;
}
tp = intotcpcb(inp);
@@ -477,9 +524,10 @@ tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
goto out;
#endif
tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
- error = tcp_output(tp);
+ error = tp->t_fb->tfb_tcp_output(tp);
out:
TCPDEBUG2(PRU_CONNECT);
+ TCP_PROBE2(debug__user, tp, PRU_CONNECT);
INP_WUNLOCK(inp);
return (error);
}
@@ -509,8 +557,12 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp6_usr_connect: inp == NULL"));
INP_WLOCK(inp);
- if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
- error = EINVAL;
+ if (inp->inp_flags & INP_TIMEWAIT) {
+ error = EADDRINUSE;
+ goto out;
+ }
+ if (inp->inp_flags & INP_DROPPED) {
+ error = ECONNREFUSED;
goto out;
}
tp = intotcpcb(inp);
@@ -543,7 +595,7 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
(error = tcp_offload_connect(so, nam)) == 0)
goto out;
#endif
- error = tcp_output(tp);
+ error = tp->t_fb->tfb_tcp_output(tp);
goto out;
}
#endif
@@ -561,10 +613,11 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
goto out;
#endif
tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
- error = tcp_output(tp);
+ error = tp->t_fb->tfb_tcp_output(tp);
out:
TCPDEBUG2(PRU_CONNECT);
+ TCP_PROBE2(debug__user, tp, PRU_CONNECT);
INP_WUNLOCK(inp);
return (error);
}
@@ -589,11 +642,13 @@ tcp_usr_disconnect(struct socket *so)
int error = 0;
TCPDEBUG0;
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp_usr_disconnect: inp == NULL"));
INP_WLOCK(inp);
- if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+ if (inp->inp_flags & INP_TIMEWAIT)
+ goto out;
+ if (inp->inp_flags & INP_DROPPED) {
error = ECONNRESET;
goto out;
}
@@ -602,8 +657,9 @@ tcp_usr_disconnect(struct socket *so)
tcp_disconnect(tp);
out:
TCPDEBUG2(PRU_DISCONNECT);
+ TCP_PROBE2(debug__user, tp, PRU_DISCONNECT);
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
return (error);
}
@@ -611,13 +667,6 @@ out:
/*
* Accept a connection. Essentially all the work is done at higher levels;
* just return the address of the peer, storing through addr.
- *
- * The rationale for acquiring the tcbinfo lock here is somewhat complicated,
- * and is described in detail in the commit log entry for r175612. Acquiring
- * it delays an accept(2) racing with sonewconn(), which inserts the socket
- * before the inpcb address/port fields are initialized. A better fix would
- * prevent the socket from being placed in the listen queue until all fields
- * are fully initialized.
*/
static int
tcp_usr_accept(struct socket *so, struct sockaddr **nam)
@@ -634,7 +683,6 @@ tcp_usr_accept(struct socket *so, struct sockaddr **nam)
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp_usr_accept: inp == NULL"));
- INP_INFO_RLOCK(&V_tcbinfo);
INP_WLOCK(inp);
if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
error = ECONNABORTED;
@@ -653,8 +701,8 @@ tcp_usr_accept(struct socket *so, struct sockaddr **nam)
out:
TCPDEBUG2(PRU_ACCEPT);
+ TCP_PROBE2(debug__user, tp, PRU_ACCEPT);
INP_WUNLOCK(inp);
- INP_INFO_RUNLOCK(&V_tcbinfo);
if (error == 0)
*nam = in_sockaddr(port, &addr);
return error;
@@ -704,6 +752,7 @@ tcp6_usr_accept(struct socket *so, struct sockaddr **nam)
out:
TCPDEBUG2(PRU_ACCEPT);
+ TCP_PROBE2(debug__user, tp, PRU_ACCEPT);
INP_WUNLOCK(inp);
INP_INFO_RUNLOCK(&V_tcbinfo);
if (error == 0) {
@@ -727,7 +776,7 @@ tcp_usr_shutdown(struct socket *so)
struct tcpcb *tp = NULL;
TCPDEBUG0;
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("inp == NULL"));
INP_WLOCK(inp);
@@ -740,12 +789,13 @@ tcp_usr_shutdown(struct socket *so)
socantsendmore(so);
tcp_usrclosed(tp);
if (!(inp->inp_flags & INP_DROPPED))
- error = tcp_output(tp);
+ error = tp->t_fb->tfb_tcp_output(tp);
out:
TCPDEBUG2(PRU_SHUTDOWN);
+ TCP_PROBE2(debug__user, tp, PRU_SHUTDOWN);
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
return (error);
}
@@ -770,15 +820,28 @@ tcp_usr_rcvd(struct socket *so, int flags)
}
tp = intotcpcb(inp);
TCPDEBUG1();
+#ifdef TCP_RFC7413
+ /*
+ * For passively-created TFO connections, don't attempt a window
+ * update while still in SYN_RECEIVED as this may trigger an early
+ * SYN|ACK. It is preferable to have the SYN|ACK be sent along with
+ * application response data, or failing that, when the DELACK timer
+ * expires.
+ */
+ if ((tp->t_flags & TF_FASTOPEN) &&
+ (tp->t_state == TCPS_SYN_RECEIVED))
+ goto out;
+#endif
#ifdef TCP_OFFLOAD
if (tp->t_flags & TF_TOE)
tcp_offload_rcvd(tp);
else
#endif
- tcp_output(tp);
+ tp->t_fb->tfb_tcp_output(tp);
out:
TCPDEBUG2(PRU_RCVD);
+ TCP_PROBE2(debug__user, tp, PRU_RCVD);
INP_WUNLOCK(inp);
return (error);
}
@@ -807,14 +870,18 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
* this call.
*/
if (flags & PRUS_EOF)
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp_usr_send: inp == NULL"));
INP_WLOCK(inp);
if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
if (control)
m_freem(control);
- if (m)
+ /*
+ * In case of PRUS_NOTREADY, tcp_usr_ready() is responsible
+ * for freeing memory.
+ */
+ if (m && (flags & PRUS_NOTREADY) == 0)
m_freem(m);
error = ECONNRESET;
goto out;
@@ -836,13 +903,12 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
m_freem(control); /* empty control, just free it */
}
if (!(flags & PRUS_OOB)) {
- sbappendstream(&so->so_snd, m);
+ sbappendstream(&so->so_snd, m, flags);
if (nam && tp->t_state < TCPS_SYN_SENT) {
/*
* Do implied connect if not yet connected,
* initialize window to default value, and
- * initialize maxseg/maxopd using peer's cached
- * MSS.
+ * initialize maxseg using peer's cached MSS.
*/
#ifdef INET6
if (isipv6)
@@ -864,14 +930,15 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
* Close the send side of the connection after
* the data is sent.
*/
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
socantsendmore(so);
tcp_usrclosed(tp);
}
- if (!(inp->inp_flags & INP_DROPPED)) {
+ if (!(inp->inp_flags & INP_DROPPED) &&
+ !(flags & PRUS_NOTREADY)) {
if (flags & PRUS_MORETOCOME)
tp->t_flags |= TF_MORETOCOME;
- error = tcp_output(tp);
+ error = tp->t_fb->tfb_tcp_output(tp);
if (flags & PRUS_MORETOCOME)
tp->t_flags &= ~TF_MORETOCOME;
}
@@ -894,14 +961,13 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
* of data past the urgent section.
* Otherwise, snd_up should be one lower.
*/
- sbappendstream_locked(&so->so_snd, m);
+ sbappendstream_locked(&so->so_snd, m, flags);
SOCKBUF_UNLOCK(&so->so_snd);
if (nam && tp->t_state < TCPS_SYN_SENT) {
/*
* Do implied connect if not yet connected,
* initialize window to default value, and
- * initialize maxseg/maxopd using peer's cached
- * MSS.
+ * initialize maxseg using peer's cached MSS.
*/
#ifdef INET6
if (isipv6)
@@ -918,17 +984,48 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
tp->snd_wnd = TTCP_CLIENT_SND_WND;
tcp_mss(tp, -1);
}
- tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
- tp->t_flags |= TF_FORCEDATA;
- error = tcp_output(tp);
- tp->t_flags &= ~TF_FORCEDATA;
+ tp->snd_up = tp->snd_una + sbavail(&so->so_snd);
+ if (!(flags & PRUS_NOTREADY)) {
+ tp->t_flags |= TF_FORCEDATA;
+ error = tp->t_fb->tfb_tcp_output(tp);
+ tp->t_flags &= ~TF_FORCEDATA;
+ }
}
out:
TCPDEBUG2((flags & PRUS_OOB) ? PRU_SENDOOB :
((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND));
+ TCP_PROBE2(debug__user, tp, (flags & PRUS_OOB) ? PRU_SENDOOB :
+ ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND));
INP_WUNLOCK(inp);
if (flags & PRUS_EOF)
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ return (error);
+}
+
+static int
+tcp_usr_ready(struct socket *so, struct mbuf *m, int count)
+{
+ struct inpcb *inp;
+ struct tcpcb *tp;
+ int error;
+
+ inp = sotoinpcb(so);
+ INP_WLOCK(inp);
+ if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+ INP_WUNLOCK(inp);
+ for (int i = 0; i < count; i++)
+ m = m_free(m);
+ return (ECONNRESET);
+ }
+ tp = intotcpcb(inp);
+
+ SOCKBUF_LOCK(&so->so_snd);
+ error = sbready(&so->so_snd, m, count);
+ SOCKBUF_UNLOCK(&so->so_snd);
+ if (error == 0)
+ error = tp->t_fb->tfb_tcp_output(tp);
+ INP_WUNLOCK(inp);
+
return (error);
}
@@ -945,7 +1042,7 @@ tcp_usr_abort(struct socket *so)
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp_usr_abort: inp == NULL"));
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
INP_WLOCK(inp);
KASSERT(inp->inp_socket != NULL,
("tcp_usr_abort: inp_socket == NULL"));
@@ -959,6 +1056,7 @@ tcp_usr_abort(struct socket *so)
TCPDEBUG1();
tcp_drop(tp, ECONNABORTED);
TCPDEBUG2(PRU_ABORT);
+ TCP_PROBE2(debug__user, tp, PRU_ABORT);
}
if (!(inp->inp_flags & INP_DROPPED)) {
SOCK_LOCK(so);
@@ -967,7 +1065,7 @@ tcp_usr_abort(struct socket *so)
inp->inp_flags |= INP_SOCKREF;
}
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
}
/*
@@ -983,7 +1081,7 @@ tcp_usr_close(struct socket *so)
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp_usr_close: inp == NULL"));
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
INP_WLOCK(inp);
KASSERT(inp->inp_socket != NULL,
("tcp_usr_close: inp_socket == NULL"));
@@ -998,6 +1096,7 @@ tcp_usr_close(struct socket *so)
TCPDEBUG1();
tcp_disconnect(tp);
TCPDEBUG2(PRU_CLOSE);
+ TCP_PROBE2(debug__user, tp, PRU_CLOSE);
}
if (!(inp->inp_flags & INP_DROPPED)) {
SOCK_LOCK(so);
@@ -1006,7 +1105,7 @@ tcp_usr_close(struct socket *so)
inp->inp_flags |= INP_SOCKREF;
}
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
}
/*
@@ -1047,6 +1146,7 @@ tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags)
out:
TCPDEBUG2(PRU_RCVOOB);
+ TCP_PROBE2(debug__user, tp, PRU_RCVOOB);
INP_WUNLOCK(inp);
return (error);
}
@@ -1066,6 +1166,7 @@ struct pr_usrreqs tcp_usrreqs = {
.pru_rcvd = tcp_usr_rcvd,
.pru_rcvoob = tcp_usr_rcvoob,
.pru_send = tcp_usr_send,
+ .pru_ready = tcp_usr_ready,
.pru_shutdown = tcp_usr_shutdown,
.pru_sockaddr = in_getsockaddr,
.pru_sosetlabel = in_pcbsosetlabel,
@@ -1088,6 +1189,7 @@ struct pr_usrreqs tcp6_usrreqs = {
.pru_rcvd = tcp_usr_rcvd,
.pru_rcvoob = tcp_usr_rcvoob,
.pru_send = tcp_usr_send,
+ .pru_ready = tcp_usr_ready,
.pru_shutdown = tcp_usr_shutdown,
.pru_sockaddr = in6_mapped_sockaddr,
.pru_sosetlabel = in_pcbsosetlabel,
@@ -1154,7 +1256,7 @@ tcp_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td)
soisconnecting(so);
TCPSTAT_INC(tcps_connattempt);
- tp->t_state = TCPS_SYN_SENT;
+ tcp_state_change(tp, TCPS_SYN_SENT);
tp->iss = tcp_new_isn(tp);
tcp_sendseqinit(tp);
@@ -1170,10 +1272,7 @@ out:
static int
tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td)
{
- struct inpcb *inp = tp->t_inpcb, *oinp;
- struct socket *so = inp->inp_socket;
- struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam;
- struct in6_addr addr6;
+ struct inpcb *inp = tp->t_inpcb;
int error;
INP_WLOCK_ASSERT(inp);
@@ -1184,39 +1283,9 @@ tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td)
if (error)
goto out;
}
-
- /*
- * Cannot simply call in_pcbconnect, because there might be an
- * earlier incarnation of this same connection still in
- * TIME_WAIT state, creating an ADDRINUSE error.
- * in6_pcbladdr() also handles scope zone IDs.
- *
- * XXXRW: We wouldn't need to expose in6_pcblookup_hash_locked()
- * outside of in6_pcb.c if there were an in6_pcbconnect_setup().
- */
- error = in6_pcbladdr(inp, nam, &addr6);
- if (error)
- goto out;
- oinp = in6_pcblookup_hash_locked(inp->inp_pcbinfo,
- &sin6->sin6_addr, sin6->sin6_port,
- IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)
- ? &addr6
- : &inp->in6p_laddr,
- inp->inp_lport, 0, NULL);
- if (oinp) {
- error = EADDRINUSE;
+ error = in6_pcbconnect(inp, nam, td->td_ucred);
+ if (error != 0)
goto out;
- }
- if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
- inp->in6p_laddr = addr6;
- inp->in6p_faddr = sin6->sin6_addr;
- inp->inp_fport = sin6->sin6_port;
- /* update flowinfo - draft-itojun-ipv6-flowlabel-api-00 */
- inp->inp_flow &= ~IPV6_FLOWLABEL_MASK;
- if (inp->inp_flags & IN6P_AUTOFLOWLABEL)
- inp->inp_flow |=
- (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK);
- in_pcbrehash(inp);
INP_HASH_WUNLOCK(&V_tcbinfo);
/* Compute window scaling to request. */
@@ -1224,9 +1293,9 @@ tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td)
(TCP_MAXWIN << tp->request_r_scale) < sb_max)
tp->request_r_scale++;
- soisconnecting(so);
+ soisconnecting(inp->inp_socket);
TCPSTAT_INC(tcps_connattempt);
- tp->t_state = TCPS_SYN_SENT;
+ tcp_state_change(tp, TCPS_SYN_SENT);
tp->iss = tcp_new_isn(tp);
tcp_sendseqinit(tp);
@@ -1294,25 +1363,25 @@ tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti)
* has to revalidate that the connection is still valid for the socket
* option.
*/
-#define INP_WLOCK_RECHECK(inp) do { \
+#define INP_WLOCK_RECHECK_CLEANUP(inp, cleanup) do { \
INP_WLOCK(inp); \
if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { \
INP_WUNLOCK(inp); \
+ cleanup; \
return (ECONNRESET); \
} \
tp = intotcpcb(inp); \
} while(0)
+#define INP_WLOCK_RECHECK(inp) INP_WLOCK_RECHECK_CLEANUP((inp), /* noop */)
int
tcp_ctloutput(struct socket *so, struct sockopt *sopt)
{
- int error, opt, optval;
- u_int ui;
+ int error;
struct inpcb *inp;
struct tcpcb *tp;
- struct tcp_info ti;
- char buf[TCP_CA_NAME_MAX];
- struct cc_algo *algo;
+ struct tcp_function_block *blk;
+ struct tcp_function_set fsn;
error = 0;
inp = sotoinpcb(so);
@@ -1340,6 +1409,128 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt)
INP_WUNLOCK(inp);
return (ECONNRESET);
}
+ tp = intotcpcb(inp);
+ /*
+ * Protect the TCP option TCP_FUNCTION_BLK so
+ * that a sub-function can *never* overwrite this.
+ */
+ if ((sopt->sopt_dir == SOPT_SET) &&
+ (sopt->sopt_name == TCP_FUNCTION_BLK)) {
+ INP_WUNLOCK(inp);
+ error = sooptcopyin(sopt, &fsn, sizeof fsn,
+ sizeof fsn);
+ if (error)
+ return (error);
+ INP_WLOCK_RECHECK(inp);
+ blk = find_and_ref_tcp_functions(&fsn);
+ if (blk == NULL) {
+ INP_WUNLOCK(inp);
+ return (ENOENT);
+ }
+ if (tp->t_fb == blk) {
+ /* You already have this */
+ refcount_release(&blk->tfb_refcnt);
+ INP_WUNLOCK(inp);
+ return (0);
+ }
+ if (tp->t_state != TCPS_CLOSED) {
+ int error=EINVAL;
+ /*
+ * The user has advanced the state
+ * past the initial point, we may not
+ * be able to switch.
+ */
+ if (blk->tfb_tcp_handoff_ok != NULL) {
+ /*
+ * Does the stack provide a
+ * query mechanism, if so it may
+ * still be possible?
+ */
+ error = (*blk->tfb_tcp_handoff_ok)(tp);
+ }
+ if (error) {
+ refcount_release(&blk->tfb_refcnt);
+ INP_WUNLOCK(inp);
+ return(error);
+ }
+ }
+ if (blk->tfb_flags & TCP_FUNC_BEING_REMOVED) {
+ refcount_release(&blk->tfb_refcnt);
+ INP_WUNLOCK(inp);
+ return (ENOENT);
+ }
+ /*
+ * Release the old refcnt, the
+ * lookup acquired a ref on the
+ * new one already.
+ */
+ if (tp->t_fb->tfb_tcp_fb_fini) {
+ /*
+ * Tell the stack to cleanup with 0 i.e.
+ * the tcb is not going away.
+ */
+ (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
+ }
+ refcount_release(&tp->t_fb->tfb_refcnt);
+ tp->t_fb = blk;
+ if (tp->t_fb->tfb_tcp_fb_init) {
+ (*tp->t_fb->tfb_tcp_fb_init)(tp);
+ }
+#ifdef TCP_OFFLOAD
+ if (tp->t_flags & TF_TOE) {
+ tcp_offload_ctloutput(tp, sopt->sopt_dir,
+ sopt->sopt_name);
+ }
+#endif
+ INP_WUNLOCK(inp);
+ return (error);
+ } else if ((sopt->sopt_dir == SOPT_GET) &&
+ (sopt->sopt_name == TCP_FUNCTION_BLK)) {
+ strcpy(fsn.function_set_name, tp->t_fb->tfb_tcp_block_name);
+ fsn.pcbcnt = tp->t_fb->tfb_refcnt;
+ INP_WUNLOCK(inp);
+ error = sooptcopyout(sopt, &fsn, sizeof fsn);
+ return (error);
+ }
+ /* Pass in the INP locked, callee must unlock it */
+ return (tp->t_fb->tfb_tcp_ctloutput(so, sopt, inp, tp));
+}
+
+int
+tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp)
+{
+ int error, opt, optval;
+ u_int ui;
+ struct tcp_info ti;
+ struct cc_algo *algo;
+ char *pbuf, buf[TCP_CA_NAME_MAX];
+ size_t len;
+
+ /*
+ * For TCP_CCALGOOPT forward the control to CC module, for both
+ * SOPT_SET and SOPT_GET.
+ */
+ switch (sopt->sopt_name) {
+ case TCP_CCALGOOPT:
+ INP_WUNLOCK(inp);
+ pbuf = malloc(sopt->sopt_valsize, M_TEMP, M_WAITOK | M_ZERO);
+ error = sooptcopyin(sopt, pbuf, sopt->sopt_valsize,
+ sopt->sopt_valsize);
+ if (error) {
+ free(pbuf, M_TEMP);
+ return (error);
+ }
+ INP_WLOCK_RECHECK_CLEANUP(inp, free(pbuf, M_TEMP));
+ if (CC_ALGO(tp)->ctl_output != NULL)
+ error = CC_ALGO(tp)->ctl_output(tp->ccv, sopt, pbuf);
+ else
+ error = ENOENT;
+ INP_WUNLOCK(inp);
+ if (error == 0 && sopt->sopt_dir == SOPT_GET)
+ error = sooptcopyout(sopt, pbuf, sopt->sopt_valsize);
+ free(pbuf, M_TEMP);
+ return (error);
+ }
switch (sopt->sopt_dir) {
case SOPT_SET:
@@ -1408,7 +1599,7 @@ unlock_and_done:
else if (tp->t_flags & TF_NOPUSH) {
tp->t_flags &= ~TF_NOPUSH;
if (TCPS_HAVEESTABLISHED(tp->t_state))
- error = tcp_output(tp);
+ error = tp->t_fb->tfb_tcp_output(tp);
}
goto unlock_and_done;
@@ -1434,50 +1625,45 @@ unlock_and_done:
case TCP_CONGESTION:
INP_WUNLOCK(inp);
- bzero(buf, sizeof(buf));
- error = sooptcopyin(sopt, &buf, sizeof(buf), 1);
+ error = sooptcopyin(sopt, buf, TCP_CA_NAME_MAX - 1, 1);
if (error)
break;
+ buf[sopt->sopt_valsize] = '\0';
INP_WLOCK_RECHECK(inp);
+ CC_LIST_RLOCK();
+ STAILQ_FOREACH(algo, &cc_list, entries)
+ if (strncmp(buf, algo->name,
+ TCP_CA_NAME_MAX) == 0)
+ break;
+ CC_LIST_RUNLOCK();
+ if (algo == NULL) {
+ INP_WUNLOCK(inp);
+ error = EINVAL;
+ break;
+ }
/*
- * Return EINVAL if we can't find the requested cc algo.
+ * We hold a write lock over the tcb so it's safe to
+ * do these things without ordering concerns.
*/
- error = EINVAL;
- CC_LIST_RLOCK();
- STAILQ_FOREACH(algo, &cc_list, entries) {
- if (strncmp(buf, algo->name, TCP_CA_NAME_MAX)
- == 0) {
- /* We've found the requested algo. */
- error = 0;
- /*
- * We hold a write lock over the tcb
- * so it's safe to do these things
- * without ordering concerns.
- */
- if (CC_ALGO(tp)->cb_destroy != NULL)
- CC_ALGO(tp)->cb_destroy(tp->ccv);
- CC_ALGO(tp) = algo;
- /*
- * If something goes pear shaped
- * initialising the new algo,
- * fall back to newreno (which
- * does not require initialisation).
- */
- if (algo->cb_init != NULL)
- if (algo->cb_init(tp->ccv) > 0) {
- CC_ALGO(tp) = &newreno_cc_algo;
- /*
- * The only reason init
- * should fail is
- * because of malloc.
- */
- error = ENOMEM;
- }
- break; /* Break the STAILQ_FOREACH. */
- }
+ if (CC_ALGO(tp)->cb_destroy != NULL)
+ CC_ALGO(tp)->cb_destroy(tp->ccv);
+ CC_ALGO(tp) = algo;
+ /*
+ * If something goes pear shaped initialising the new
+ * algo, fall back to newreno (which does not
+ * require initialisation).
+ */
+ if (algo->cb_init != NULL &&
+ algo->cb_init(tp->ccv) != 0) {
+ CC_ALGO(tp) = &newreno_cc_algo;
+ /*
+ * The only reason init should fail is
+ * because of malloc.
+ */
+ error = ENOMEM;
}
- CC_LIST_RUNLOCK();
- goto unlock_and_done;
+ INP_WUNLOCK(inp);
+ break;
case TCP_KEEPIDLE:
case TCP_KEEPINTVL:
@@ -1535,8 +1721,49 @@ unlock_and_done:
(TP_MAXIDLE(tp) > 0))
tcp_timer_activate(tp, TT_2MSL,
TP_MAXIDLE(tp));
+ goto unlock_and_done;
+
+#ifdef TCPPCAP
+ case TCP_PCAP_OUT:
+ case TCP_PCAP_IN:
INP_WUNLOCK(inp);
- break;
+ error = sooptcopyin(sopt, &optval, sizeof optval,
+ sizeof optval);
+ if (error)
+ return (error);
+
+ INP_WLOCK_RECHECK(inp);
+ if (optval >= 0) /* queue follows the requested option */
+ tcp_pcap_set_sock_max(
+ (sopt->sopt_name == TCP_PCAP_OUT) ?
+ &(tp->t_outpkts) : &(tp->t_inpkts), optval);
+ else
+ error = EINVAL;
+ goto unlock_and_done;
+#endif
+
+#ifdef TCP_RFC7413
+ case TCP_FASTOPEN:
+ INP_WUNLOCK(inp);
+ if (!V_tcp_fastopen_enabled)
+ return (EPERM);
+
+ error = sooptcopyin(sopt, &optval, sizeof optval,
+ sizeof optval);
+ if (error)
+ return (error);
+
+ INP_WLOCK_RECHECK(inp);
+ if (optval) {
+ tp->t_flags |= TF_FASTOPEN;
+ if ((tp->t_state == TCPS_LISTEN) &&
+ (tp->t_tfo_pending == NULL))
+ tp->t_tfo_pending =
+ tcp_fastopen_alloc_counter();
+ } else
+ tp->t_flags &= ~TF_FASTOPEN;
+ goto unlock_and_done;
+#endif
default:
INP_WUNLOCK(inp);
@@ -1582,11 +1809,48 @@ unlock_and_done:
error = sooptcopyout(sopt, &ti, sizeof ti);
break;
case TCP_CONGESTION:
- bzero(buf, sizeof(buf));
- strlcpy(buf, CC_ALGO(tp)->name, TCP_CA_NAME_MAX);
+ len = strlcpy(buf, CC_ALGO(tp)->name, TCP_CA_NAME_MAX);
+ INP_WUNLOCK(inp);
+ error = sooptcopyout(sopt, buf, len + 1);
+ break;
+ case TCP_KEEPIDLE:
+ case TCP_KEEPINTVL:
+ case TCP_KEEPINIT:
+ case TCP_KEEPCNT:
+ switch (sopt->sopt_name) {
+ case TCP_KEEPIDLE:
+ ui = tp->t_keepidle / hz;
+ break;
+ case TCP_KEEPINTVL:
+ ui = tp->t_keepintvl / hz;
+ break;
+ case TCP_KEEPINIT:
+ ui = tp->t_keepinit / hz;
+ break;
+ case TCP_KEEPCNT:
+ ui = tp->t_keepcnt;
+ break;
+ }
+ INP_WUNLOCK(inp);
+ error = sooptcopyout(sopt, &ui, sizeof(ui));
+ break;
+#ifdef TCPPCAP
+ case TCP_PCAP_OUT:
+ case TCP_PCAP_IN:
+ optval = tcp_pcap_get_sock_max((sopt->sopt_name == TCP_PCAP_OUT) ?
+ &(tp->t_outpkts) : &(tp->t_inpkts)); /* queue per requested option */
+ INP_WUNLOCK(inp);
+ error = sooptcopyout(sopt, &optval, sizeof optval);
+ break;
+#endif
+
+#ifdef TCP_RFC7413
+ case TCP_FASTOPEN:
+ optval = tp->t_flags & TF_FASTOPEN;
INP_WUNLOCK(inp);
- error = sooptcopyout(sopt, buf, TCP_CA_NAME_MAX);
+ error = sooptcopyout(sopt, &optval, sizeof optval);
break;
+#endif
default:
INP_WUNLOCK(inp);
error = ENOPROTOOPT;
@@ -1597,6 +1861,7 @@ unlock_and_done:
return (error);
}
#undef INP_WLOCK_RECHECK
+#undef INP_WLOCK_RECHECK_CLEANUP
/*
* Attach TCP protocol to socket, allocating
@@ -1617,10 +1882,10 @@ tcp_attach(struct socket *so)
}
so->so_rcv.sb_flags |= SB_AUTOSIZE;
so->so_snd.sb_flags |= SB_AUTOSIZE;
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
error = in_pcballoc(so, &V_tcbinfo);
if (error) {
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
return (error);
}
inp = sotoinpcb(so);
@@ -1636,12 +1901,13 @@ tcp_attach(struct socket *so)
if (tp == NULL) {
in_pcbdetach(inp);
in_pcbfree(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
return (ENOBUFS);
}
tp->t_state = TCPS_CLOSED;
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ TCPSTATES_INC(TCPS_CLOSED);
return (0);
}
@@ -1659,7 +1925,7 @@ tcp_disconnect(struct tcpcb *tp)
struct inpcb *inp = tp->t_inpcb;
struct socket *so = inp->inp_socket;
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
/*
@@ -1679,7 +1945,7 @@ tcp_disconnect(struct tcpcb *tp)
sbflush(&so->so_rcv);
tcp_usrclosed(tp);
if (!(inp->inp_flags & INP_DROPPED))
- tcp_output(tp);
+ tp->t_fb->tfb_tcp_output(tp);
}
}
@@ -1697,7 +1963,7 @@ static void
tcp_usrclosed(struct tcpcb *tp)
{
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(tp->t_inpcb);
switch (tp->t_state) {
@@ -1705,9 +1971,9 @@ tcp_usrclosed(struct tcpcb *tp)
#ifdef TCP_OFFLOAD
tcp_offload_listen_stop(tp);
#endif
+ tcp_state_change(tp, TCPS_CLOSED);
/* FALLTHROUGH */
case TCPS_CLOSED:
- tp->t_state = TCPS_CLOSED;
tp = tcp_close(tp);
/*
* tcp_close() should never return NULL here as the socket is
@@ -1723,11 +1989,11 @@ tcp_usrclosed(struct tcpcb *tp)
break;
case TCPS_ESTABLISHED:
- tp->t_state = TCPS_FIN_WAIT_1;
+ tcp_state_change(tp, TCPS_FIN_WAIT_1);
break;
case TCPS_CLOSE_WAIT:
- tp->t_state = TCPS_LAST_ACK;
+ tcp_state_change(tp, TCPS_LAST_ACK);
break;
}
if (tp->t_state >= TCPS_FIN_WAIT_2) {
@@ -1910,6 +2176,10 @@ db_print_tflags(u_int t_flags)
db_printf("%sTF_ECN_PERMIT", comma ? ", " : "");
comma = 1;
}
+ if (t_flags & TF_FASTOPEN) {
+ db_printf("%sTF_FASTOPEN", comma ? ", " : "");
+ comma = 1;
+ }
}
static void
@@ -1984,8 +2254,8 @@ db_print_tcpcb(struct tcpcb *tp, const char *name, int indent)
"0x%08x\n", tp->snd_ssthresh, tp->snd_recover);
db_print_indent(indent);
- db_printf("t_maxopd: %u t_rcvtime: %u t_startime: %u\n",
- tp->t_maxopd, tp->t_rcvtime, tp->t_starttime);
+ db_printf("t_rcvtime: %u t_startime: %u\n",
+ tp->t_rcvtime, tp->t_starttime);
db_print_indent(indent);
db_printf("t_rttime: %u t_rtsq: 0x%08x\n",
diff --git a/freebsd/sys/netinet/tcp_var.h b/freebsd/sys/netinet/tcp_var.h
index dbd9ed11..5dcd35b8 100644
--- a/freebsd/sys/netinet/tcp_var.h
+++ b/freebsd/sys/netinet/tcp_var.h
@@ -34,9 +34,11 @@
#define _NETINET_TCP_VAR_H_
#include <netinet/tcp.h>
+#include <netinet/tcp_fsm.h>
#ifdef _KERNEL
#include <net/vnet.h>
+#include <sys/mbuf.h>
/*
* Kernel variables for tcp.
@@ -73,7 +75,12 @@ struct sackhint {
tcp_seq last_sack_ack; /* Most recent/largest sacked ack */
int ispare; /* explicit pad for 64bit alignment */
- uint64_t _pad[2]; /* 1 sacked_bytes, 1 TBD */
+ int sacked_bytes; /*
+ * Total sacked bytes reported by the
+ * receiver via sack option
+ */
+ uint32_t _pad1[1]; /* TBD */
+ uint64_t _pad[1]; /* TBD */
};
struct tcptemp {
@@ -83,17 +90,75 @@ struct tcptemp {
#define tcp6cb tcpcb /* for KAME src sync over BSD*'s */
-/* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */
-#ifdef INET6
-#define ND6_HINT(tp) \
-do { \
- if ((tp) && (tp)->t_inpcb && \
- ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0) \
- nd6_nud_hint(NULL, NULL, 0); \
-} while (0)
-#else
-#define ND6_HINT(tp)
-#endif
+/*
+ * TODO: We have yet to brave plowing
+ * into tcp_input() and the pru_usrreq() block.
+ * Right now these go to the old standards which
+ * are somewhat ok, but in the long term may
+ * need to be changed. If we do tackle tcp_input()
+ * then we need to get rid of the tcp_do_segment()
+ * function below.
+ */
+/* Flags for tcp functions */
+#define TCP_FUNC_BEING_REMOVED 0x01 /* Can no longer be referenced */
+struct tcpcb;
+struct inpcb;
+struct sockopt;
+struct socket;
+
+/*
+ * If defining the optional tcp_timers, in the
+ * tfb_tcp_timer_stop call you must use the
+ * callout_async_drain() function with the
+ * tcp_timer_discard callback. You should check
+ * the return of callout_async_drain() and if 0
+ * increment tt_draincnt. Since the timer sub-system
+ * does not know your callbacks you must provide a
+ * stop_all function that loops through and calls
+ * tcp_timer_stop() with each of your defined timers.
+ * Adding a tfb_tcp_handoff_ok function allows the socket
+ * option to change stacks to query you even if the
+ * connection is in a later stage. You return 0 to
+ * say you can take over and run your stack, you return
+ * non-zero (an error number) to say no you can't.
+ * If the function is undefined you can only change
+ * in the early states (before connect or listen).
+ * tfb_tcp_fb_fini is changed to add a flag to tell
+ * the old stack if the tcb is being destroyed or
+ * not. A one in the flag means the TCB is being
+ * destroyed, a zero indicates it's transitioning to
+ * another stack (via socket option).
+ */
+struct tcp_function_block {
+ char tfb_tcp_block_name[TCP_FUNCTION_NAME_LEN_MAX]; /* stack name; reported by the TCP_FUNCTION_BLK get option */
+ int (*tfb_tcp_output)(struct tcpcb *); /* output routine, called in place of tcp_output() */
+ void (*tfb_tcp_do_segment)(struct mbuf *, struct tcphdr *,
+ struct socket *, struct tcpcb *,
+ int, int, uint8_t,
+ int);
+ int (*tfb_tcp_ctloutput)(struct socket *so, struct sockopt *sopt,
+ struct inpcb *inp, struct tcpcb *tp); /* entered with inp write-locked; must unlock it */
+ /* Optional memory allocation/free routine */
+ void (*tfb_tcp_fb_init)(struct tcpcb *); /* called, if set, after t_fb is switched to this block */
+ void (*tfb_tcp_fb_fini)(struct tcpcb *, int); /* int: 1 = tcb being destroyed, 0 = switching stacks */
+ /* Optional timers, must define all if you define one */
+ int (*tfb_tcp_timer_stop_all)(struct tcpcb *);
+ void (*tfb_tcp_timer_activate)(struct tcpcb *,
+ uint32_t, u_int);
+ int (*tfb_tcp_timer_active)(struct tcpcb *, uint32_t);
+ void (*tfb_tcp_timer_stop)(struct tcpcb *, uint32_t);
+ void (*tfb_tcp_rexmit_tmr)(struct tcpcb *);
+ int (*tfb_tcp_handoff_ok)(struct tcpcb *); /* 0 = may take over a connection past TCPS_CLOSED, else an errno */
+ volatile uint32_t tfb_refcnt; /* references on this block; dropped via refcount_release() */
+ uint32_t tfb_flags; /* TCP_FUNC_* flags, e.g. TCP_FUNC_BEING_REMOVED */
+};
+
+struct tcp_function {
+ TAILQ_ENTRY(tcp_function) tf_next;
+ struct tcp_function_block *tf_fb;
+};
+
+TAILQ_HEAD(tcp_funchead, tcp_function);
/*
* Tcp control block, one per tcp; fields:
@@ -113,7 +178,7 @@ struct tcpcb {
struct vnet *t_vnet; /* back pointer to parent vnet */
- tcp_seq snd_una; /* send unacknowledged */
+ tcp_seq snd_una; /* sent but unacknowledged */
tcp_seq snd_max; /* highest sequence number sent;
* used to recognize retransmits
*/
@@ -140,8 +205,6 @@ struct tcpcb {
u_long snd_spare2; /* unused */
tcp_seq snd_recover; /* for use in NewReno Fast Recovery */
- u_int t_maxopd; /* mss plus options */
-
u_int t_rcvtime; /* inactivity time */
u_int t_starttime; /* time connection was established */
u_int t_rtttime; /* RTT measurement start time */
@@ -152,6 +215,7 @@ struct tcpcb {
int t_rxtcur; /* current retransmit value (ticks) */
u_int t_maxseg; /* maximum segment size */
+ u_int t_pmtud_saved_maxseg; /* pre-blackhole MSS */
int t_srtt; /* smoothed round-trip time */
int t_rttvar; /* variance in round-trip time */
@@ -208,13 +272,35 @@ struct tcpcb {
u_int t_keepintvl; /* interval between keepalives */
u_int t_keepcnt; /* number of keepalives before close */
- u_int t_tsomax; /* tso burst length limit */
-
- uint32_t t_ispare[7]; /* 5 UTO, 2 TBD */
- void *t_pspare2[4]; /* 4 TBD */
- uint64_t _pad[5]; /* 5 TBD (1-2 CC/RTT?) */
- uint32_t t_tsomaxsegcount; /* TSO maximum segment count */
- uint32_t t_tsomaxsegsize; /* TSO maximum segment size in bytes */
+ u_int t_tsomax; /* TSO total burst length limit in bytes */
+ u_int t_tsomaxsegcount; /* TSO maximum segment count */
+ u_int t_tsomaxsegsize; /* TSO maximum segment size in bytes */
+ u_int t_flags2; /* More tcpcb flags storage */
+#if defined(_KERNEL) && defined(TCP_RFC7413)
+ uint32_t t_ispare[6]; /* 5 UTO, 1 TBD */
+ uint64_t t_tfo_cookie; /* TCP Fast Open cookie */
+#else
+ uint32_t t_ispare[8]; /* 5 UTO, 3 TBD */
+#endif
+ struct tcp_function_block *t_fb;/* TCP function call block */
+ void *t_fb_ptr; /* Pointer to t_fb specific data */
+#if defined(_KERNEL) && defined(TCP_RFC7413)
+ unsigned int *t_tfo_pending; /* TCP Fast Open pending counter */
+ void *t_pspare2[1]; /* 1 TCP_SIGNATURE */
+#else
+ void *t_pspare2[2]; /* 1 TCP_SIGNATURE, 1 TBD */
+#endif
+#if defined(_KERNEL) && defined(TCPPCAP)
+ struct mbufq t_inpkts; /* List of saved input packets. */
+ struct mbufq t_outpkts; /* List of saved output packets. */
+#ifdef _LP64
+ uint64_t _pad[0]; /* all used! */
+#else
+ uint64_t _pad[2]; /* 2 are available */
+#endif /* _LP64 */
+#else
+ uint64_t _pad[6];
+#endif /* defined(_KERNEL) && defined(TCPPCAP) */
};
/*
@@ -249,6 +335,7 @@ struct tcpcb {
#define TF_ECN_SND_ECE 0x10000000 /* ECN ECE in queue */
#define TF_CONGRECOVERY 0x20000000 /* congestion recovery mode */
#define TF_WASCRECOVERY 0x40000000 /* was in congestion recovery */
+#define TF_FASTOPEN 0x80000000 /* TCP Fast Open indication */
#define IN_FASTRECOVERY(t_flags) (t_flags & TF_FASTRECOVERY)
#define ENTER_FASTRECOVERY(t_flags) t_flags |= TF_FASTRECOVERY
@@ -286,6 +373,13 @@ struct tcpcb {
#endif /* TCP_SIGNATURE */
/*
+ * Flags for PLPMTU handling, t_flags2
+ */
+#define TF2_PLPMTU_BLACKHOLE 0x00000001 /* Possible PLPMTUD Black Hole. */
+#define TF2_PLPMTU_PMTUD 0x00000002 /* Allowed to attempt PLPMTUD. */
+#define TF2_PLPMTU_MAXSEGSNT 0x00000004 /* Last seg sent was full seg. */
+
+/*
* Structure to hold TCP options that are only used during segment
* processing (in tcp_input), but not held in the tcpcb.
* It's basically used to reduce the number of parameters
@@ -294,21 +388,24 @@ struct tcpcb {
* options in tcp_addoptions.
*/
struct tcpopt {
- u_int64_t to_flags; /* which options are present */
+ u_int32_t to_flags; /* which options are present */
#define TOF_MSS 0x0001 /* maximum segment size */
#define TOF_SCALE 0x0002 /* window scaling */
#define TOF_SACKPERM 0x0004 /* SACK permitted */
#define TOF_TS 0x0010 /* timestamp */
#define TOF_SIGNATURE 0x0040 /* TCP-MD5 signature option (RFC2385) */
#define TOF_SACK 0x0080 /* Peer sent SACK option */
-#define TOF_MAXOPT 0x0100
+#define TOF_FASTOPEN 0x0100 /* TCP Fast Open (TFO) cookie */
+#define TOF_MAXOPT 0x0200
u_int32_t to_tsval; /* new timestamp */
u_int32_t to_tsecr; /* reflected timestamp */
u_char *to_sacks; /* pointer to the first SACK blocks */
u_char *to_signature; /* pointer to the TCP-MD5 signature */
+ u_char *to_tfo_cookie; /* pointer to the TFO cookie */
u_int16_t to_mss; /* maximum segment size */
u_int8_t to_wscale; /* window scaling */
u_int8_t to_nsacks; /* number of SACK blocks */
+ u_int8_t to_tfo_len; /* TFO cookie length */
u_int32_t to_spare; /* UTO */
};
@@ -322,7 +419,6 @@ struct hc_metrics_lite { /* must stay in sync with hc_metrics */
u_long rmx_ssthresh; /* outbound gateway buffer limit */
u_long rmx_rtt; /* estimated round trip time */
u_long rmx_rttvar; /* estimated rtt variance */
- u_long rmx_bandwidth; /* estimated bandwidth */
u_long rmx_cwnd; /* congestion window */
u_long rmx_sendpipe; /* outbound delay-bandwidth product */
u_long rmx_recvpipe; /* inbound delay-bandwidth product */
@@ -357,6 +453,8 @@ struct tcptw {
u_int t_starttime;
int tw_time;
TAILQ_ENTRY(tcptw) tw_2msl;
+ void *tw_pspare; /* TCP_SIGNATURE */
+ u_int *tw_spare; /* TCP_SIGNATURE */
};
#define intotcpcb(ip) ((struct tcpcb *)(ip)->inp_ppcb)
@@ -404,125 +502,133 @@ struct tcptw {
* but that's inconvenient at the moment.
*/
struct tcpstat {
- u_long tcps_connattempt; /* connections initiated */
- u_long tcps_accepts; /* connections accepted */
- u_long tcps_connects; /* connections established */
- u_long tcps_drops; /* connections dropped */
- u_long tcps_conndrops; /* embryonic connections dropped */
- u_long tcps_minmssdrops; /* average minmss too low drops */
- u_long tcps_closed; /* conn. closed (includes drops) */
- u_long tcps_segstimed; /* segs where we tried to get rtt */
- u_long tcps_rttupdated; /* times we succeeded */
- u_long tcps_delack; /* delayed acks sent */
- u_long tcps_timeoutdrop; /* conn. dropped in rxmt timeout */
- u_long tcps_rexmttimeo; /* retransmit timeouts */
- u_long tcps_persisttimeo; /* persist timeouts */
- u_long tcps_keeptimeo; /* keepalive timeouts */
- u_long tcps_keepprobe; /* keepalive probes sent */
- u_long tcps_keepdrops; /* connections dropped in keepalive */
-
- u_long tcps_sndtotal; /* total packets sent */
- u_long tcps_sndpack; /* data packets sent */
- u_long tcps_sndbyte; /* data bytes sent */
- u_long tcps_sndrexmitpack; /* data packets retransmitted */
- u_long tcps_sndrexmitbyte; /* data bytes retransmitted */
- u_long tcps_sndrexmitbad; /* unnecessary packet retransmissions */
- u_long tcps_sndacks; /* ack-only packets sent */
- u_long tcps_sndprobe; /* window probes sent */
- u_long tcps_sndurg; /* packets sent with URG only */
- u_long tcps_sndwinup; /* window update-only packets sent */
- u_long tcps_sndctrl; /* control (SYN|FIN|RST) packets sent */
-
- u_long tcps_rcvtotal; /* total packets received */
- u_long tcps_rcvpack; /* packets received in sequence */
- u_long tcps_rcvbyte; /* bytes received in sequence */
- u_long tcps_rcvbadsum; /* packets received with ccksum errs */
- u_long tcps_rcvbadoff; /* packets received with bad offset */
- u_long tcps_rcvmemdrop; /* packets dropped for lack of memory */
- u_long tcps_rcvshort; /* packets received too short */
- u_long tcps_rcvduppack; /* duplicate-only packets received */
- u_long tcps_rcvdupbyte; /* duplicate-only bytes received */
- u_long tcps_rcvpartduppack; /* packets with some duplicate data */
- u_long tcps_rcvpartdupbyte; /* dup. bytes in part-dup. packets */
- u_long tcps_rcvoopack; /* out-of-order packets received */
- u_long tcps_rcvoobyte; /* out-of-order bytes received */
- u_long tcps_rcvpackafterwin; /* packets with data after window */
- u_long tcps_rcvbyteafterwin; /* bytes rcvd after window */
- u_long tcps_rcvafterclose; /* packets rcvd after "close" */
- u_long tcps_rcvwinprobe; /* rcvd window probe packets */
- u_long tcps_rcvdupack; /* rcvd duplicate acks */
- u_long tcps_rcvacktoomuch; /* rcvd acks for unsent data */
- u_long tcps_rcvackpack; /* rcvd ack packets */
- u_long tcps_rcvackbyte; /* bytes acked by rcvd acks */
- u_long tcps_rcvwinupd; /* rcvd window update packets */
- u_long tcps_pawsdrop; /* segments dropped due to PAWS */
- u_long tcps_predack; /* times hdr predict ok for acks */
- u_long tcps_preddat; /* times hdr predict ok for data pkts */
- u_long tcps_pcbcachemiss;
- u_long tcps_cachedrtt; /* times cached RTT in route updated */
- u_long tcps_cachedrttvar; /* times cached rttvar updated */
- u_long tcps_cachedssthresh; /* times cached ssthresh updated */
- u_long tcps_usedrtt; /* times RTT initialized from route */
- u_long tcps_usedrttvar; /* times RTTVAR initialized from rt */
- u_long tcps_usedssthresh; /* times ssthresh initialized from rt*/
- u_long tcps_persistdrop; /* timeout in persist state */
- u_long tcps_badsyn; /* bogus SYN, e.g. premature ACK */
- u_long tcps_mturesent; /* resends due to MTU discovery */
- u_long tcps_listendrop; /* listen queue overflows */
- u_long tcps_badrst; /* ignored RSTs in the window */
-
- u_long tcps_sc_added; /* entry added to syncache */
- u_long tcps_sc_retransmitted; /* syncache entry was retransmitted */
- u_long tcps_sc_dupsyn; /* duplicate SYN packet */
- u_long tcps_sc_dropped; /* could not reply to packet */
- u_long tcps_sc_completed; /* successful extraction of entry */
- u_long tcps_sc_bucketoverflow; /* syncache per-bucket limit hit */
- u_long tcps_sc_cacheoverflow; /* syncache cache limit hit */
- u_long tcps_sc_reset; /* RST removed entry from syncache */
- u_long tcps_sc_stale; /* timed out or listen socket gone */
- u_long tcps_sc_aborted; /* syncache entry aborted */
- u_long tcps_sc_badack; /* removed due to bad ACK */
- u_long tcps_sc_unreach; /* ICMP unreachable received */
- u_long tcps_sc_zonefail; /* zalloc() failed */
- u_long tcps_sc_sendcookie; /* SYN cookie sent */
- u_long tcps_sc_recvcookie; /* SYN cookie received */
-
- u_long tcps_hc_added; /* entry added to hostcache */
- u_long tcps_hc_bucketoverflow; /* hostcache per bucket limit hit */
-
- u_long tcps_finwait2_drops; /* Drop FIN_WAIT_2 connection after time limit */
+ uint64_t tcps_connattempt; /* connections initiated */
+ uint64_t tcps_accepts; /* connections accepted */
+ uint64_t tcps_connects; /* connections established */
+ uint64_t tcps_drops; /* connections dropped */
+ uint64_t tcps_conndrops; /* embryonic connections dropped */
+ uint64_t tcps_minmssdrops; /* average minmss too low drops */
+ uint64_t tcps_closed; /* conn. closed (includes drops) */
+ uint64_t tcps_segstimed; /* segs where we tried to get rtt */
+ uint64_t tcps_rttupdated; /* times we succeeded */
+ uint64_t tcps_delack; /* delayed acks sent */
+ uint64_t tcps_timeoutdrop; /* conn. dropped in rxmt timeout */
+ uint64_t tcps_rexmttimeo; /* retransmit timeouts */
+ uint64_t tcps_persisttimeo; /* persist timeouts */
+ uint64_t tcps_keeptimeo; /* keepalive timeouts */
+ uint64_t tcps_keepprobe; /* keepalive probes sent */
+ uint64_t tcps_keepdrops; /* connections dropped in keepalive */
+
+ uint64_t tcps_sndtotal; /* total packets sent */
+ uint64_t tcps_sndpack; /* data packets sent */
+ uint64_t tcps_sndbyte; /* data bytes sent */
+ uint64_t tcps_sndrexmitpack; /* data packets retransmitted */
+ uint64_t tcps_sndrexmitbyte; /* data bytes retransmitted */
+ uint64_t tcps_sndrexmitbad; /* unnecessary packet retransmissions */
+ uint64_t tcps_sndacks; /* ack-only packets sent */
+ uint64_t tcps_sndprobe; /* window probes sent */
+ uint64_t tcps_sndurg; /* packets sent with URG only */
+ uint64_t tcps_sndwinup; /* window update-only packets sent */
+ uint64_t tcps_sndctrl; /* control (SYN|FIN|RST) packets sent */
+
+ uint64_t tcps_rcvtotal; /* total packets received */
+ uint64_t tcps_rcvpack; /* packets received in sequence */
+ uint64_t tcps_rcvbyte; /* bytes received in sequence */
+ uint64_t tcps_rcvbadsum; /* packets received with cksum errs */
+ uint64_t tcps_rcvbadoff; /* packets received with bad offset */
+ uint64_t tcps_rcvreassfull; /* packets dropped for no reass space */
+ uint64_t tcps_rcvshort; /* packets received too short */
+ uint64_t tcps_rcvduppack; /* duplicate-only packets received */
+ uint64_t tcps_rcvdupbyte; /* duplicate-only bytes received */
+ uint64_t tcps_rcvpartduppack; /* packets with some duplicate data */
+ uint64_t tcps_rcvpartdupbyte; /* dup. bytes in part-dup. packets */
+ uint64_t tcps_rcvoopack; /* out-of-order packets received */
+ uint64_t tcps_rcvoobyte; /* out-of-order bytes received */
+ uint64_t tcps_rcvpackafterwin; /* packets with data after window */
+ uint64_t tcps_rcvbyteafterwin; /* bytes rcvd after window */
+ uint64_t tcps_rcvafterclose; /* packets rcvd after "close" */
+ uint64_t tcps_rcvwinprobe; /* rcvd window probe packets */
+ uint64_t tcps_rcvdupack; /* rcvd duplicate acks */
+ uint64_t tcps_rcvacktoomuch; /* rcvd acks for unsent data */
+ uint64_t tcps_rcvackpack; /* rcvd ack packets */
+ uint64_t tcps_rcvackbyte; /* bytes acked by rcvd acks */
+ uint64_t tcps_rcvwinupd; /* rcvd window update packets */
+ uint64_t tcps_pawsdrop; /* segments dropped due to PAWS */
+ uint64_t tcps_predack; /* times hdr predict ok for acks */
+ uint64_t tcps_preddat; /* times hdr predict ok for data pkts */
+ uint64_t tcps_pcbcachemiss;
+ uint64_t tcps_cachedrtt; /* times cached RTT in route updated */
+ uint64_t tcps_cachedrttvar; /* times cached rttvar updated */
+ uint64_t tcps_cachedssthresh; /* times cached ssthresh updated */
+ uint64_t tcps_usedrtt; /* times RTT initialized from route */
+ uint64_t tcps_usedrttvar; /* times RTTVAR initialized from rt */
+ uint64_t tcps_usedssthresh; /* times ssthresh initialized from rt*/
+ uint64_t tcps_persistdrop; /* timeout in persist state */
+ uint64_t tcps_badsyn; /* bogus SYN, e.g. premature ACK */
+ uint64_t tcps_mturesent; /* resends due to MTU discovery */
+ uint64_t tcps_listendrop; /* listen queue overflows */
+ uint64_t tcps_badrst; /* ignored RSTs in the window */
+
+ uint64_t tcps_sc_added; /* entry added to syncache */
+ uint64_t tcps_sc_retransmitted; /* syncache entry was retransmitted */
+ uint64_t tcps_sc_dupsyn; /* duplicate SYN packet */
+ uint64_t tcps_sc_dropped; /* could not reply to packet */
+ uint64_t tcps_sc_completed; /* successful extraction of entry */
+ uint64_t tcps_sc_bucketoverflow;/* syncache per-bucket limit hit */
+ uint64_t tcps_sc_cacheoverflow; /* syncache cache limit hit */
+ uint64_t tcps_sc_reset; /* RST removed entry from syncache */
+ uint64_t tcps_sc_stale; /* timed out or listen socket gone */
+ uint64_t tcps_sc_aborted; /* syncache entry aborted */
+ uint64_t tcps_sc_badack; /* removed due to bad ACK */
+ uint64_t tcps_sc_unreach; /* ICMP unreachable received */
+ uint64_t tcps_sc_zonefail; /* zalloc() failed */
+ uint64_t tcps_sc_sendcookie; /* SYN cookie sent */
+ uint64_t tcps_sc_recvcookie; /* SYN cookie received */
+
+ uint64_t tcps_hc_added; /* entry added to hostcache */
+ uint64_t tcps_hc_bucketoverflow;/* hostcache per bucket limit hit */
+
+ uint64_t tcps_finwait2_drops; /* Drop FIN_WAIT_2 connection after time limit */
/* SACK related stats */
- u_long tcps_sack_recovery_episode; /* SACK recovery episodes */
- u_long tcps_sack_rexmits; /* SACK rexmit segments */
- u_long tcps_sack_rexmit_bytes; /* SACK rexmit bytes */
- u_long tcps_sack_rcv_blocks; /* SACK blocks (options) received */
- u_long tcps_sack_send_blocks; /* SACK blocks (options) sent */
- u_long tcps_sack_sboverflow; /* times scoreboard overflowed */
+ uint64_t tcps_sack_recovery_episode; /* SACK recovery episodes */
+ uint64_t tcps_sack_rexmits; /* SACK rexmit segments */
+ uint64_t tcps_sack_rexmit_bytes; /* SACK rexmit bytes */
+ uint64_t tcps_sack_rcv_blocks; /* SACK blocks (options) received */
+ uint64_t tcps_sack_send_blocks; /* SACK blocks (options) sent */
+ uint64_t tcps_sack_sboverflow; /* times scoreboard overflowed */
/* ECN related stats */
- u_long tcps_ecn_ce; /* ECN Congestion Experienced */
- u_long tcps_ecn_ect0; /* ECN Capable Transport */
- u_long tcps_ecn_ect1; /* ECN Capable Transport */
- u_long tcps_ecn_shs; /* ECN successful handshakes */
- u_long tcps_ecn_rcwnd; /* # times ECN reduced the cwnd */
+ uint64_t tcps_ecn_ce; /* ECN Congestion Experienced */
+ uint64_t tcps_ecn_ect0; /* ECN Capable Transport */
+ uint64_t tcps_ecn_ect1; /* ECN Capable Transport */
+ uint64_t tcps_ecn_shs; /* ECN successful handshakes */
+ uint64_t tcps_ecn_rcwnd; /* # times ECN reduced the cwnd */
/* TCP_SIGNATURE related stats */
- u_long tcps_sig_rcvgoodsig; /* Total matching signature received */
- u_long tcps_sig_rcvbadsig; /* Total bad signature received */
- u_long tcps_sig_err_buildsig; /* Mismatching signature received */
- u_long tcps_sig_err_sigopt; /* No signature expected by socket */
- u_long tcps_sig_err_nosigopt; /* No signature provided by segment */
+ uint64_t tcps_sig_rcvgoodsig; /* Total matching signature received */
+ uint64_t tcps_sig_rcvbadsig; /* Total bad signature received */
+ uint64_t tcps_sig_err_buildsig; /* Mismatching signature received */
+ uint64_t tcps_sig_err_sigopt; /* No signature expected by socket */
+ uint64_t tcps_sig_err_nosigopt; /* No signature provided by segment */
- u_long _pad[12]; /* 6 UTO, 6 TBD */
+ uint64_t _pad[12]; /* 6 UTO, 6 TBD */
};
+#define tcps_rcvmemdrop tcps_rcvreassfull /* compat */
+
#ifdef _KERNEL
+#define TI_UNLOCKED 1
+#define TI_RLOCKED 2
+#include <sys/counter.h>
+
+VNET_PCPUSTAT_DECLARE(struct tcpstat, tcpstat); /* tcp statistics */
/*
* In-kernel consumers can use these accessor macros directly to update
* stats.
*/
-#define TCPSTAT_ADD(name, val) V_tcpstat.name += (val)
+#define TCPSTAT_ADD(name, val) \
+ VNET_PCPUSTAT_ADD(struct tcpstat, tcpstat, name, (val))
#define TCPSTAT_INC(name) TCPSTAT_ADD(name, 1)
/*
@@ -530,7 +636,15 @@ struct tcpstat {
*/
void kmod_tcpstat_inc(int statnum);
#define KMOD_TCPSTAT_INC(name) \
- kmod_tcpstat_inc(offsetof(struct tcpstat, name) / sizeof(u_long))
+ kmod_tcpstat_inc(offsetof(struct tcpstat, name) / sizeof(uint64_t))
+
+/*
+ * Running TCP connection count by state.
+ */
+VNET_DECLARE(counter_u64_t, tcps_states[TCP_NSTATES]);
+#define V_tcps_states VNET(tcps_states)
+#define TCPSTATES_INC(state) counter_u64_add(V_tcps_states[state], 1)
+#define TCPSTATES_DEC(state) counter_u64_add(V_tcps_states[state], -1)
/*
* TCP specific helper hook point identifiers.
@@ -574,11 +688,11 @@ struct xtcpcb {
#endif
/*
- * Names for TCP sysctl objects
+ * Identifiers for TCP sysctl nodes
*/
#define TCPCTL_DO_RFC1323 1 /* use RFC-1323 extensions */
#define TCPCTL_MSSDFLT 3 /* MSS default */
-#define TCPCTL_STATS 4 /* statistics (read-only) */
+#define TCPCTL_STATS 4 /* statistics */
#define TCPCTL_RTTDFLT 5 /* default RTT estimate */
#define TCPCTL_KEEPIDLE 6 /* keepalive idle timer */
#define TCPCTL_KEEPINTVL 7 /* interval to send keepalives */
@@ -590,26 +704,7 @@ struct xtcpcb {
#define TCPCTL_V6MSSDFLT 13 /* MSS default for IPv6 */
#define TCPCTL_SACK 14 /* Selective Acknowledgement,rfc 2018 */
#define TCPCTL_DROP 15 /* drop tcp connection */
-#define TCPCTL_MAXID 16
-#define TCPCTL_FINWAIT2_TIMEOUT 17
-
-#define TCPCTL_NAMES { \
- { 0, 0 }, \
- { "rfc1323", CTLTYPE_INT }, \
- { "mssdflt", CTLTYPE_INT }, \
- { "stats", CTLTYPE_STRUCT }, \
- { "rttdflt", CTLTYPE_INT }, \
- { "keepidle", CTLTYPE_INT }, \
- { "keepintvl", CTLTYPE_INT }, \
- { "sendspace", CTLTYPE_INT }, \
- { "recvspace", CTLTYPE_INT }, \
- { "keepinit", CTLTYPE_INT }, \
- { "pcblist", CTLTYPE_STRUCT }, \
- { "delacktime", CTLTYPE_INT }, \
- { "v6mssdflt", CTLTYPE_INT }, \
- { "maxid", CTLTYPE_INT }, \
-}
-
+#define TCPCTL_STATES 16 /* connection counts by TCP state */
#ifdef _KERNEL
#ifdef SYSCTL_DECL
@@ -620,13 +715,12 @@ MALLOC_DECLARE(M_TCPLOG);
VNET_DECLARE(struct inpcbhead, tcb); /* queue of active tcpcb's */
VNET_DECLARE(struct inpcbinfo, tcbinfo);
-VNET_DECLARE(struct tcpstat, tcpstat); /* tcp statistics */
extern int tcp_log_in_vain;
VNET_DECLARE(int, tcp_mssdflt); /* XXX */
VNET_DECLARE(int, tcp_minmss);
VNET_DECLARE(int, tcp_delack_enabled);
VNET_DECLARE(int, tcp_do_rfc3390);
-VNET_DECLARE(int, tcp_do_initcwnd10);
+VNET_DECLARE(int, tcp_initcwnd_segments);
VNET_DECLARE(int, tcp_sendspace);
VNET_DECLARE(int, tcp_recvspace);
VNET_DECLARE(int, path_mtu_discovery);
@@ -634,12 +728,11 @@ VNET_DECLARE(int, tcp_do_rfc3465);
VNET_DECLARE(int, tcp_abc_l_var);
#define V_tcb VNET(tcb)
#define V_tcbinfo VNET(tcbinfo)
-#define V_tcpstat VNET(tcpstat)
#define V_tcp_mssdflt VNET(tcp_mssdflt)
#define V_tcp_minmss VNET(tcp_minmss)
#define V_tcp_delack_enabled VNET(tcp_delack_enabled)
#define V_tcp_do_rfc3390 VNET(tcp_do_rfc3390)
-#define V_tcp_do_initcwnd10 VNET(tcp_do_initcwnd10)
+#define V_tcp_initcwnd_segments VNET(tcp_initcwnd_segments)
#define V_tcp_sendspace VNET(tcp_sendspace)
#define V_tcp_recvspace VNET(tcp_recvspace)
#define V_path_mtu_discovery VNET(path_mtu_discovery)
@@ -659,50 +752,69 @@ VNET_DECLARE(int, tcp_ecn_maxretries);
VNET_DECLARE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST + 1]);
#define V_tcp_hhh VNET(tcp_hhh)
+VNET_DECLARE(int, tcp_do_rfc6675_pipe);
+#define V_tcp_do_rfc6675_pipe VNET(tcp_do_rfc6675_pipe)
+
int tcp_addoptions(struct tcpopt *, u_char *);
int tcp_ccalgounload(struct cc_algo *unload_algo);
struct tcpcb *
tcp_close(struct tcpcb *);
void tcp_discardcb(struct tcpcb *);
void tcp_twstart(struct tcpcb *);
-#if 0
-int tcp_twrecycleable(struct tcptw *tw);
-#endif
-void tcp_twclose(struct tcptw *_tw, int _reuse);
+void tcp_twclose(struct tcptw *, int);
void tcp_ctlinput(int, struct sockaddr *, void *);
int tcp_ctloutput(struct socket *, struct sockopt *);
struct tcpcb *
tcp_drop(struct tcpcb *, int);
void tcp_drain(void);
void tcp_init(void);
-#ifdef VIMAGE
-void tcp_destroy(void);
-#endif
void tcp_fini(void *);
char *tcp_log_addrs(struct in_conninfo *, struct tcphdr *, void *,
const void *);
char *tcp_log_vain(struct in_conninfo *, struct tcphdr *, void *,
const void *);
int tcp_reass(struct tcpcb *, struct tcphdr *, int *, struct mbuf *);
-void tcp_reass_init(void);
+void tcp_reass_global_init(void);
void tcp_reass_flush(struct tcpcb *);
-#ifdef VIMAGE
-void tcp_reass_destroy(void);
-#endif
-void tcp_input(struct mbuf *, int);
+void tcp_dooptions(struct tcpopt *, u_char *, int, int);
+void tcp_dropwithreset(struct mbuf *, struct tcphdr *,
+ struct tcpcb *, int, int);
+void tcp_pulloutofband(struct socket *,
+ struct tcphdr *, struct mbuf *, int);
+void tcp_xmit_timer(struct tcpcb *, int);
+void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *);
+void cc_ack_received(struct tcpcb *tp, struct tcphdr *th,
+ uint16_t type);
+void cc_conn_init(struct tcpcb *tp);
+void cc_post_recovery(struct tcpcb *tp, struct tcphdr *th);
+void cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type);
+void hhook_run_tcp_est_in(struct tcpcb *tp,
+ struct tcphdr *th, struct tcpopt *to);
+
+int tcp_input(struct mbuf **, int *, int);
+void tcp_do_segment(struct mbuf *, struct tcphdr *,
+ struct socket *, struct tcpcb *, int, int, uint8_t,
+ int);
+
+int register_tcp_functions(struct tcp_function_block *blk, int wait);
+int deregister_tcp_functions(struct tcp_function_block *blk);
+struct tcp_function_block *find_and_ref_tcp_functions(struct tcp_function_set *fs);
+struct tcp_function_block *find_and_ref_tcp_fb(struct tcp_function_block *blk);
+int tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp);
+
u_long tcp_maxmtu(struct in_conninfo *, struct tcp_ifcap *);
u_long tcp_maxmtu6(struct in_conninfo *, struct tcp_ifcap *);
+u_int tcp_maxseg(const struct tcpcb *);
void tcp_mss_update(struct tcpcb *, int, int, struct hc_metrics_lite *,
struct tcp_ifcap *);
void tcp_mss(struct tcpcb *, int);
int tcp_mssopt(struct in_conninfo *);
struct inpcb *
tcp_drop_syn_sent(struct inpcb *, int);
-struct inpcb *
- tcp_mtudisc(struct inpcb *, int);
struct tcpcb *
tcp_newtcpcb(struct inpcb *);
int tcp_output(struct tcpcb *);
+void tcp_state_change(struct tcpcb *, int);
void tcp_respond(struct tcpcb *, void *,
struct tcphdr *, struct mbuf *, tcp_seq, tcp_seq, int);
void tcp_tw_init(void);
@@ -712,19 +824,25 @@ void tcp_tw_destroy(void);
void tcp_tw_zone_change(void);
int tcp_twcheck(struct inpcb *, struct tcpopt *, struct tcphdr *,
struct mbuf *, int);
-int tcp_twrespond(struct tcptw *, int);
void tcp_setpersist(struct tcpcb *);
#ifdef TCP_SIGNATURE
+struct secasvar;
+struct secasvar *tcp_get_sav(struct mbuf *, u_int);
+int tcp_signature_do_compute(struct mbuf *, int, int, u_char *,
+ struct secasvar *);
int tcp_signature_compute(struct mbuf *, int, int, int, u_char *, u_int);
int tcp_signature_verify(struct mbuf *, int, int, int, struct tcpopt *,
struct tcphdr *, u_int);
+int tcp_signature_check(struct mbuf *m, int off0, int tlen, int optlen,
+ struct tcpopt *to, struct tcphdr *th, u_int tcpbflag);
#endif
void tcp_slowtimo(void);
struct tcptemp *
tcpip_maketemplate(struct inpcb *);
void tcpip_fillheaders(struct inpcb *, void *, void *);
-void tcp_timer_activate(struct tcpcb *, int, u_int);
-int tcp_timer_active(struct tcpcb *, int);
+void tcp_timer_activate(struct tcpcb *, uint32_t, u_int);
+int tcp_timer_active(struct tcpcb *, uint32_t);
+void tcp_timer_stop(struct tcpcb *, uint32_t);
void tcp_trace(short, short, struct tcpcb *, void *, struct tcphdr *, int);
/*
* All tcp_hc_* functions are IPv4 and IPv6 (via in_conninfo)
@@ -741,7 +859,7 @@ void tcp_hc_update(struct in_conninfo *, struct hc_metrics_lite *);
extern struct pr_usrreqs tcp_usrreqs;
tcp_seq tcp_new_isn(struct tcpcb *);
-void tcp_sack_doack(struct tcpcb *, struct tcpopt *, tcp_seq);
+int tcp_sack_doack(struct tcpcb *, struct tcpopt *, tcp_seq);
void tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, tcp_seq rcv_lastend);
void tcp_clean_sackreport(struct tcpcb *tp);
void tcp_sack_adjust(struct tcpcb *tp);
@@ -750,9 +868,29 @@ void tcp_sack_partialack(struct tcpcb *, struct tcphdr *);
void tcp_free_sackholes(struct tcpcb *tp);
int tcp_newreno(struct tcpcb *, struct tcphdr *);
u_long tcp_seq_subtract(u_long, u_long );
+int tcp_compute_pipe(struct tcpcb *);
-void cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type);
+static inline void
+tcp_fields_to_host(struct tcphdr *th)
+{
+ th->th_seq = ntohl(th->th_seq);
+ th->th_ack = ntohl(th->th_ack);
+ th->th_win = ntohs(th->th_win);
+ th->th_urp = ntohs(th->th_urp);
+}
+
+#ifdef TCP_SIGNATURE
+static inline void
+tcp_fields_to_net(struct tcphdr *th)
+{
+
+ th->th_seq = htonl(th->th_seq);
+ th->th_ack = htonl(th->th_ack);
+ th->th_win = htons(th->th_win);
+ th->th_urp = htons(th->th_urp);
+}
+#endif
#endif /* _KERNEL */
#endif /* _NETINET_TCP_VAR_H_ */
diff --git a/freebsd/sys/netinet/udp_usrreq.c b/freebsd/sys/netinet/udp_usrreq.c
index bf95e954..7eb11648 100644
--- a/freebsd/sys/netinet/udp_usrreq.c
+++ b/freebsd/sys/netinet/udp_usrreq.c
@@ -5,6 +5,7 @@
* The Regents of the University of California.
* Copyright (c) 2008 Robert N. M. Watson
* Copyright (c) 2010-2011 Juniper Networks, Inc.
+ * Copyright (c) 2014 Kevin Lo
* All rights reserved.
*
* Portions of this software were developed by Robert N. M. Watson under
@@ -44,10 +45,10 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
-#include <rtems/bsd/local/opt_ipfw.h>
#include <rtems/bsd/local/opt_inet.h>
#include <rtems/bsd/local/opt_inet6.h>
#include <rtems/bsd/local/opt_ipsec.h>
+#include <rtems/bsd/local/opt_rss.h>
#include <rtems/bsd/sys/param.h>
#include <sys/domain.h>
@@ -60,6 +61,7 @@ __FBSDID("$FreeBSD$");
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/protosw.h>
+#include <sys/sdt.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
@@ -71,9 +73,12 @@ __FBSDID("$FreeBSD$");
#include <vm/uma.h>
#include <net/if.h>
+#include <net/if_var.h>
#include <net/route.h>
+#include <net/rss_config.h>
#include <netinet/in.h>
+#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
@@ -90,6 +95,8 @@ __FBSDID("$FreeBSD$");
#endif
#include <netinet/udp.h>
#include <netinet/udp_var.h>
+#include <netinet/udplite.h>
+#include <netinet/in_rss.h>
#ifdef IPSEC
#include <netipsec/ipsec.h>
@@ -101,8 +108,9 @@ __FBSDID("$FreeBSD$");
#include <security/mac/mac_framework.h>
/*
- * UDP protocol implementation.
+ * UDP and UDP-Lite protocols implementation.
* Per RFC 768, August, 1980.
+ * Per RFC 3828, July, 2004.
*/
/*
@@ -112,7 +120,7 @@ __FBSDID("$FreeBSD$");
* cause problems (especially for NFS data blocks).
*/
VNET_DEFINE(int, udp_cksum) = 1;
-SYSCTL_VNET_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(udp_cksum), 0, "compute udp checksum");
int udp_log_in_vain = 0;
@@ -120,12 +128,17 @@ SYSCTL_INT(_net_inet_udp, OID_AUTO, log_in_vain, CTLFLAG_RW,
&udp_log_in_vain, 0, "Log all incoming UDP packets");
VNET_DEFINE(int, udp_blackhole) = 0;
-SYSCTL_VNET_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(udp_blackhole), 0,
"Do not send port unreachables for refused connects");
+static VNET_DEFINE(int, udp_require_l2_bcast) = 0;
+#define V_udp_require_l2_bcast VNET(udp_require_l2_bcast)
+SYSCTL_INT(_net_inet_udp, OID_AUTO, require_l2_bcast, CTLFLAG_VNET | CTLFLAG_RW,
+ &VNET_NAME(udp_require_l2_bcast), 0,
+ "Only treat packets sent to an L2 broadcast address as broadcast packets");
+
u_long udp_sendspace = 9216; /* really max datagram size */
- /* 40 1K datagrams */
SYSCTL_ULONG(_net_inet_udp, UDPCTL_MAXDGRAM, maxdgram, CTLFLAG_RW,
&udp_sendspace, 0, "Maximum outgoing UDP datagram size");
@@ -135,13 +148,15 @@ u_long udp_recvspace = 40 * (1024 +
#else
sizeof(struct sockaddr_in)
#endif
- );
+ ); /* 40 1K datagrams */
SYSCTL_ULONG(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
&udp_recvspace, 0, "Maximum space for incoming UDP datagrams");
VNET_DEFINE(struct inpcbhead, udb); /* from udp_var.h */
VNET_DEFINE(struct inpcbinfo, udbinfo);
+VNET_DEFINE(struct inpcbhead, ulitecb);
+VNET_DEFINE(struct inpcbinfo, ulitecbinfo);
static VNET_DEFINE(uma_zone_t, udpcb_zone);
#define V_udpcb_zone VNET(udpcb_zone)
@@ -149,11 +164,14 @@ static VNET_DEFINE(uma_zone_t, udpcb_zone);
#define UDBHASHSIZE 128
#endif
-VNET_DEFINE(struct udpstat, udpstat); /* from udp_var.h */
-SYSCTL_VNET_STRUCT(_net_inet_udp, UDPCTL_STATS, stats, CTLFLAG_RW,
- &VNET_NAME(udpstat), udpstat,
- "UDP statistics (struct udpstat, netinet/udp_var.h)");
+VNET_PCPUSTAT_DEFINE(struct udpstat, udpstat); /* from udp_var.h */
+VNET_PCPUSTAT_SYSINIT(udpstat);
+SYSCTL_VNET_PCPUSTAT(_net_inet_udp, UDPCTL_STATS, stats, struct udpstat,
+ udpstat, "UDP statistics (struct udpstat, netinet/udp_var.h)");
+#ifdef VIMAGE
+VNET_PCPUSTAT_SYSUNINIT(udpstat);
+#endif /* VIMAGE */
#ifdef INET
static void udp_detach(struct socket *so);
static int udp_output(struct inpcb *, struct mbuf *, struct sockaddr *,
@@ -187,20 +205,47 @@ udp_inpcb_init(void *mem, int size, int flags)
return (0);
}
+static int
+udplite_inpcb_init(void *mem, int size, int flags)
+{
+ struct inpcb *inp;
+
+ inp = mem;
+ INP_LOCK_INIT(inp, "inp", "udpliteinp");
+ return (0);
+}
+
void
udp_init(void)
{
+ /*
+ * For now default to 2-tuple UDP hashing - until the fragment
+ * reassembly code can also update the flowid.
+ *
+ * Once we can calculate the flowid that way and re-establish
+ * a 4-tuple, flip this to 4-tuple.
+ */
in_pcbinfo_init(&V_udbinfo, "udp", &V_udb, UDBHASHSIZE, UDBHASHSIZE,
- "udp_inpcb", udp_inpcb_init, NULL, UMA_ZONE_NOFREE,
+ "udp_inpcb", udp_inpcb_init, NULL, 0,
IPI_HASHFIELDS_2TUPLE);
V_udpcb_zone = uma_zcreate("udpcb", sizeof(struct udpcb),
- NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
uma_zone_set_max(V_udpcb_zone, maxsockets);
+ uma_zone_set_warning(V_udpcb_zone, "kern.ipc.maxsockets limit reached");
EVENTHANDLER_REGISTER(maxsockets_change, udp_zone_change, NULL,
EVENTHANDLER_PRI_ANY);
}
+void
+udplite_init(void)
+{
+
+ in_pcbinfo_init(&V_ulitecbinfo, "udplite", &V_ulitecb, UDBHASHSIZE,
+ UDBHASHSIZE, "udplite_inpcb", udplite_inpcb_init, NULL,
+ 0, IPI_HASHFIELDS_2TUPLE);
+}
+
/*
* Kernel module interface for updating udpstat. The argument is an index
* into udpstat treated as an array of u_long. While this encodes the
@@ -212,7 +257,7 @@ void
kmod_udpstat_inc(int statnum)
{
- (*((u_long *)&V_udpstat + statnum))++;
+ counter_u64_add(VNET(udpstat)[statnum], 1);
}
int
@@ -235,13 +280,23 @@ udp_discardcb(struct udpcb *up)
}
#ifdef VIMAGE
-void
-udp_destroy(void)
+static void
+udp_destroy(void *unused __unused)
{
in_pcbinfo_destroy(&V_udbinfo);
uma_zdestroy(V_udpcb_zone);
}
+VNET_SYSUNINIT(udp, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, udp_destroy, NULL);
+
+static void
+udplite_destroy(void *unused __unused)
+{
+
+ in_pcbinfo_destroy(&V_ulitecbinfo);
+}
+VNET_SYSUNINIT(udplite, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, udplite_destroy,
+ NULL);
#endif
#ifdef INET
@@ -251,14 +306,23 @@ udp_destroy(void)
* contains the source address. If the socket ends up being an IPv6 socket,
* udp_append() will convert to a sockaddr_in6 before passing the address
* into the socket code.
+ *
+ * In the normal case udp_append() will return 0, indicating that you
+ * must unlock the inp. However if a tunneling protocol is in place we increment
+ * the inpcb refcnt and unlock the inp, on return from the tunneling protocol we
+ * then decrement the reference count. If the inp_rele returns 1, indicating the
+ * inp is gone, we return that to the caller to tell them *not* to unlock
+ * the inp. In the case of multi-cast this will cause the distribution
+ * to stop (though most tunneling protocols known currently do *not* use
+ * multicast).
*/
-static void
+static int
udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off,
struct sockaddr_in *udp_in)
{
struct sockaddr *append_sa;
struct socket *so;
- struct mbuf *opts = 0;
+ struct mbuf *opts = NULL;
#ifdef INET6
struct sockaddr_in6 udp_in6;
#endif
@@ -271,21 +335,21 @@ udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off,
*/
up = intoudpcb(inp);
if (up->u_tun_func != NULL) {
- (*up->u_tun_func)(n, off, inp);
- return;
+ in_pcbref(inp);
+ INP_RUNLOCK(inp);
+ (*up->u_tun_func)(n, off, inp, (struct sockaddr *)udp_in,
+ up->u_tun_ctx);
+ INP_RLOCK(inp);
+ return (in_pcbrele_rlocked(inp));
}
- if (n == NULL)
- return;
-
off += sizeof(struct udphdr);
#ifdef IPSEC
/* Check AH/ESP integrity. */
if (ipsec4_in_reject(n, inp)) {
m_freem(n);
- IPSECSTAT_INC(in_polvio);
- return;
+ return (0);
}
#ifdef IPSEC_NAT_T
up = intoudpcb(inp);
@@ -293,14 +357,14 @@ udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off,
if (up->u_flags & UF_ESPINUDP_ALL) { /* IPSec UDP encaps. */
n = udp4_espdecap(inp, n, off);
if (n == NULL) /* Consumed. */
- return;
+ return (0);
}
#endif /* IPSEC_NAT_T */
#endif /* IPSEC */
#ifdef MAC
if (mac_inpcb_check_deliver(inp, n) != 0) {
m_freem(n);
- return;
+ return (0);
}
#endif /* MAC */
if (inp->inp_flags & INP_CONTROLOPTS ||
@@ -334,22 +398,28 @@ udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off,
UDPSTAT_INC(udps_fullsock);
} else
sorwakeup_locked(so);
+ return (0);
}
-void
-udp_input(struct mbuf *m, int off)
+int
+udp_input(struct mbuf **mp, int *offp, int proto)
{
- int iphlen = off;
struct ip *ip;
struct udphdr *uh;
struct ifnet *ifp;
struct inpcb *inp;
- int len;
+ uint16_t len, ip_len;
+ struct inpcbinfo *pcbinfo;
struct ip save_ip;
struct sockaddr_in udp_in;
+ struct mbuf *m;
struct m_tag *fwd_tag;
+ int cscov_partial, iphlen;
+ m = *mp;
+ iphlen = *offp;
ifp = m->m_pkthdr.rcvif;
+ *mp = NULL;
UDPSTAT_INC(udps_ipackets);
/*
@@ -358,7 +428,7 @@ udp_input(struct mbuf *m, int off)
* check the checksum with options still present.
*/
if (iphlen > sizeof (struct ip)) {
- ip_stripoptions(m, (struct mbuf *)0);
+ ip_stripoptions(m);
iphlen = sizeof(struct ip);
}
@@ -367,13 +437,14 @@ udp_input(struct mbuf *m, int off)
*/
ip = mtod(m, struct ip *);
if (m->m_len < iphlen + sizeof(struct udphdr)) {
- if ((m = m_pullup(m, iphlen + sizeof(struct udphdr))) == 0) {
+ if ((m = m_pullup(m, iphlen + sizeof(struct udphdr))) == NULL) {
UDPSTAT_INC(udps_hdrops);
- return;
+ return (IPPROTO_DONE);
}
ip = mtod(m, struct ip *);
}
uh = (struct udphdr *)((caddr_t)ip + iphlen);
+ cscov_partial = (proto == IPPROTO_UDPLITE) ? 1 : 0;
/*
* Destination port of 0 is illegal, based on RFC768.
@@ -396,13 +467,20 @@ udp_input(struct mbuf *m, int off)
* reflect UDP length, drop.
*/
len = ntohs((u_short)uh->uh_ulen);
- if (ip->ip_len != len) {
- if (len > ip->ip_len || len < sizeof(struct udphdr)) {
+ ip_len = ntohs(ip->ip_len) - iphlen;
+ if (proto == IPPROTO_UDPLITE && (len == 0 || len == ip_len)) {
+ /* Zero means checksum over the complete packet. */
+ if (len == 0)
+ len = ip_len;
+ cscov_partial = 0;
+ }
+ if (ip_len != len) {
+ if (len > ip_len || len < sizeof(struct udphdr)) {
UDPSTAT_INC(udps_badlen);
goto badunlocked;
}
- m_adj(m, len - ip->ip_len);
- /* ip->ip_len = len; */
+ if (proto == IPPROTO_UDP)
+ m_adj(m, len - ip_len);
}
/*
@@ -420,39 +498,53 @@ udp_input(struct mbuf *m, int off)
if (uh->uh_sum) {
u_short uh_sum;
- if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
+ if ((m->m_pkthdr.csum_flags & CSUM_DATA_VALID) &&
+ !cscov_partial) {
if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
uh_sum = m->m_pkthdr.csum_data;
else
uh_sum = in_pseudo(ip->ip_src.s_addr,
ip->ip_dst.s_addr, htonl((u_short)len +
- m->m_pkthdr.csum_data + IPPROTO_UDP));
+ m->m_pkthdr.csum_data + proto));
uh_sum ^= 0xffff;
} else {
char b[9];
bcopy(((struct ipovly *)ip)->ih_x1, b, 9);
bzero(((struct ipovly *)ip)->ih_x1, 9);
- ((struct ipovly *)ip)->ih_len = uh->uh_ulen;
+ ((struct ipovly *)ip)->ih_len = (proto == IPPROTO_UDP) ?
+ uh->uh_ulen : htons(ip_len);
uh_sum = in_cksum(m, len + sizeof (struct ip));
bcopy(b, ((struct ipovly *)ip)->ih_x1, 9);
}
if (uh_sum) {
UDPSTAT_INC(udps_badsum);
m_freem(m);
- return;
+ return (IPPROTO_DONE);
}
- } else
- UDPSTAT_INC(udps_nosum);
+ } else {
+ if (proto == IPPROTO_UDP) {
+ UDPSTAT_INC(udps_nosum);
+ } else {
+ /* UDPLite requires a checksum */
+ /* XXX: What is the right UDPLite MIB counter here? */
+ m_freem(m);
+ return (IPPROTO_DONE);
+ }
+ }
+ pcbinfo = udp_get_inpcbinfo(proto);
if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
- in_broadcast(ip->ip_dst, ifp)) {
+ ((!V_udp_require_l2_bcast || m->m_flags & M_BCAST) &&
+ in_broadcast(ip->ip_dst, ifp))) {
struct inpcb *last;
+ struct inpcbhead *pcblist;
struct ip_moptions *imo;
- INP_INFO_RLOCK(&V_udbinfo);
+ INP_INFO_RLOCK(pcbinfo);
+ pcblist = udp_get_pcblist(proto);
last = NULL;
- LIST_FOREACH(inp, &V_udb, inp_list) {
+ LIST_FOREACH(inp, pcblist, inp_list) {
if (inp->inp_lport != uh->uh_dport)
continue;
#ifdef INET6
@@ -511,8 +603,14 @@ udp_input(struct mbuf *m, int off)
if (last != NULL) {
struct mbuf *n;
- n = m_copy(m, 0, M_COPYALL);
- udp_append(last, ip, n, iphlen, &udp_in);
+ if ((n = m_copy(m, 0, M_COPYALL)) != NULL) {
+ UDP_PROBE(receive, NULL, last, ip,
+ last, uh);
+ if (udp_append(last, ip, n, iphlen,
+ &udp_in)) {
+ goto inp_lost;
+ }
+ }
INP_RUNLOCK(last);
}
last = inp;
@@ -538,13 +636,15 @@ udp_input(struct mbuf *m, int off)
UDPSTAT_INC(udps_noportbcast);
if (inp)
INP_RUNLOCK(inp);
- INP_INFO_RUNLOCK(&V_udbinfo);
+ INP_INFO_RUNLOCK(pcbinfo);
goto badunlocked;
}
- udp_append(last, ip, m, iphlen, &udp_in);
- INP_RUNLOCK(last);
- INP_INFO_RUNLOCK(&V_udbinfo);
- return;
+ UDP_PROBE(receive, NULL, last, ip, last, uh);
+ if (udp_append(last, ip, m, iphlen, &udp_in) == 0)
+ INP_RUNLOCK(last);
+ inp_lost:
+ INP_INFO_RUNLOCK(pcbinfo);
+ return (IPPROTO_DONE);
}
/*
@@ -564,7 +664,7 @@ udp_input(struct mbuf *m, int off)
* Transparently forwarded. Pretend to be the destination.
* Already got one like this?
*/
- inp = in_pcblookup_mbuf(&V_udbinfo, ip->ip_src, uh->uh_sport,
+ inp = in_pcblookup_mbuf(pcbinfo, ip->ip_src, uh->uh_sport,
ip->ip_dst, uh->uh_dport, INPLOOKUP_RLOCKPCB, ifp, m);
if (!inp) {
/*
@@ -572,7 +672,7 @@ udp_input(struct mbuf *m, int off)
* Because we've rewritten the destination address,
* any hardware-generated hash is ignored.
*/
- inp = in_pcblookup(&V_udbinfo, ip->ip_src,
+ inp = in_pcblookup(pcbinfo, ip->ip_src,
uh->uh_sport, next_hop->sin_addr,
next_hop->sin_port ? htons(next_hop->sin_port) :
uh->uh_dport, INPLOOKUP_WILDCARD |
@@ -582,7 +682,7 @@ udp_input(struct mbuf *m, int off)
m_tag_delete(m, fwd_tag);
m->m_flags &= ~M_IP_NEXTHOP;
} else
- inp = in_pcblookup_mbuf(&V_udbinfo, ip->ip_src, uh->uh_sport,
+ inp = in_pcblookup_mbuf(pcbinfo, ip->ip_src, uh->uh_sport,
ip->ip_dst, uh->uh_dport, INPLOOKUP_WILDCARD |
INPLOOKUP_RLOCKPCB, ifp, m);
if (inp == NULL) {
@@ -605,9 +705,8 @@ udp_input(struct mbuf *m, int off)
if (badport_bandlim(BANDLIM_ICMP_UNREACH) < 0)
goto badunlocked;
*ip = save_ip;
- ip->ip_len += iphlen;
icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0);
- return;
+ return (IPPROTO_DONE);
}
/*
@@ -617,14 +716,27 @@ udp_input(struct mbuf *m, int off)
if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl) {
INP_RUNLOCK(inp);
m_freem(m);
- return;
+ return (IPPROTO_DONE);
+ }
+ if (cscov_partial) {
+ struct udpcb *up;
+
+ up = intoudpcb(inp);
+ if (up->u_rxcslen == 0 || up->u_rxcslen > len) {
+ INP_RUNLOCK(inp);
+ m_freem(m);
+ return (IPPROTO_DONE);
+ }
}
- udp_append(inp, ip, m, iphlen, &udp_in);
- INP_RUNLOCK(inp);
- return;
+
+ UDP_PROBE(receive, NULL, inp, ip, inp, uh);
+ if (udp_append(inp, ip, m, iphlen, &udp_in) == 0)
+ INP_RUNLOCK(inp);
+ return (IPPROTO_DONE);
badunlocked:
m_freem(m);
+ return (IPPROTO_DONE);
}
#endif /* INET */
@@ -643,6 +755,11 @@ udp_notify(struct inpcb *inp, int errno)
* or a write lock, but a read lock is sufficient.
*/
INP_LOCK_ASSERT(inp);
+ if ((errno == EHOSTUNREACH || errno == ENETUNREACH ||
+ errno == EHOSTDOWN) && inp->inp_route.ro_rt) {
+ RTFREE(inp->inp_route.ro_rt);
+ inp->inp_route.ro_rt = (struct rtentry *)NULL;
+ }
inp->inp_socket->so_error = errno;
sorwakeup(inp->inp_socket);
@@ -651,8 +768,9 @@ udp_notify(struct inpcb *inp, int errno)
}
#ifdef INET
-void
-udp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
+static void
+udp_common_ctlinput(int cmd, struct sockaddr *sa, void *vip,
+ struct inpcbinfo *pcbinfo)
{
struct ip *ip = vip;
struct udphdr *uh;
@@ -663,11 +781,11 @@ udp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
return;
- /*
- * Redirects don't need to be handled up here.
- */
- if (PRC_IS_REDIRECT(cmd))
+ if (PRC_IS_REDIRECT(cmd)) {
+ /* signal EHOSTDOWN, as it flushes the cached route */
+ in_pcbnotifyall(&V_udbinfo, faddr, EHOSTDOWN, udp_notify);
return;
+ }
/*
* Hostdead is ugly because it goes linearly through all PCBs.
@@ -681,7 +799,7 @@ udp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
return;
if (ip != NULL) {
uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2));
- inp = in_pcblookup(&V_udbinfo, faddr, uh->uh_dport,
+ inp = in_pcblookup(pcbinfo, faddr, uh->uh_dport,
ip->ip_src, uh->uh_sport, INPLOOKUP_RLOCKPCB, NULL);
if (inp != NULL) {
INP_RLOCK_ASSERT(inp);
@@ -689,11 +807,39 @@ udp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
udp_notify(inp, inetctlerrmap[cmd]);
}
INP_RUNLOCK(inp);
+ } else {
+ inp = in_pcblookup(pcbinfo, faddr, uh->uh_dport,
+ ip->ip_src, uh->uh_sport,
+ INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL);
+ if (inp != NULL) {
+ struct udpcb *up;
+
+ up = intoudpcb(inp);
+ if (up->u_icmp_func != NULL) {
+ INP_RUNLOCK(inp);
+ (*up->u_icmp_func)(cmd, sa, vip, up->u_tun_ctx);
+ } else {
+ INP_RUNLOCK(inp);
+ }
+ }
}
} else
- in_pcbnotifyall(&V_udbinfo, faddr, inetctlerrmap[cmd],
+ in_pcbnotifyall(pcbinfo, faddr, inetctlerrmap[cmd],
udp_notify);
}
+void
+udp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
+{
+
+ return (udp_common_ctlinput(cmd, sa, vip, &V_udbinfo));
+}
+
+void
+udplite_ctlinput(int cmd, struct sockaddr *sa, void *vip)
+{
+
+ return (udp_common_ctlinput(cmd, sa, vip, &V_ulitecbinfo));
+}
#endif /* INET */
static int
@@ -740,7 +886,7 @@ udp_pcblist(SYSCTL_HANDLER_ARGS)
return (error);
inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
- if (inp_list == 0)
+ if (inp_list == NULL)
return (ENOMEM);
INP_INFO_RLOCK(&V_udbinfo);
@@ -849,16 +995,16 @@ SYSCTL_PROC(_net_inet_udp, OID_AUTO, getcred,
int
udp_ctloutput(struct socket *so, struct sockopt *sopt)
{
- int error = 0, optval;
struct inpcb *inp;
-#ifdef IPSEC_NAT_T
struct udpcb *up;
-#endif
+ int isudplite, error, optval;
+ error = 0;
+ isudplite = (so->so_proto->pr_protocol == IPPROTO_UDPLITE) ? 1 : 0;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
INP_WLOCK(inp);
- if (sopt->sopt_level != IPPROTO_UDP) {
+ if (sopt->sopt_level != so->so_proto->pr_protocol) {
#ifdef INET6
if (INP_CHECK_SOCKAF(so, AF_INET6)) {
INP_WUNLOCK(inp);
@@ -916,6 +1062,34 @@ udp_ctloutput(struct socket *so, struct sockopt *sopt)
}
INP_WUNLOCK(inp);
break;
+ case UDPLITE_SEND_CSCOV:
+ case UDPLITE_RECV_CSCOV:
+ if (!isudplite) {
+ INP_WUNLOCK(inp);
+ error = ENOPROTOOPT;
+ break;
+ }
+ INP_WUNLOCK(inp);
+ error = sooptcopyin(sopt, &optval, sizeof(optval),
+ sizeof(optval));
+ if (error != 0)
+ break;
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
+ INP_WLOCK(inp);
+ up = intoudpcb(inp);
+ KASSERT(up != NULL, ("%s: up == NULL", __func__));
+ if ((optval != 0 && optval < 8) || (optval > 65535)) {
+ INP_WUNLOCK(inp);
+ error = EINVAL;
+ break;
+ }
+ if (sopt->sopt_name == UDPLITE_SEND_CSCOV)
+ up->u_txcslen = optval;
+ else
+ up->u_rxcslen = optval;
+ INP_WUNLOCK(inp);
+ break;
default:
INP_WUNLOCK(inp);
error = ENOPROTOOPT;
@@ -933,6 +1107,22 @@ udp_ctloutput(struct socket *so, struct sockopt *sopt)
error = sooptcopyout(sopt, &optval, sizeof optval);
break;
#endif
+ case UDPLITE_SEND_CSCOV:
+ case UDPLITE_RECV_CSCOV:
+ if (!isudplite) {
+ INP_WUNLOCK(inp);
+ error = ENOPROTOOPT;
+ break;
+ }
+ up = intoudpcb(inp);
+ KASSERT(up != NULL, ("%s: up == NULL", __func__));
+ if (sopt->sopt_name == UDPLITE_SEND_CSCOV)
+ optval = up->u_txcslen;
+ else
+ optval = up->u_rxcslen;
+ INP_WUNLOCK(inp);
+ error = sooptcopyout(sopt, &optval, sizeof(optval));
+ break;
default:
INP_WUNLOCK(inp);
error = ENOPROTOOPT;
@@ -955,12 +1145,18 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
int len = m->m_pkthdr.len;
struct in_addr faddr, laddr;
struct cmsghdr *cm;
+ struct inpcbinfo *pcbinfo;
struct sockaddr_in *sin, src;
+ int cscov_partial = 0;
int error = 0;
int ipflags;
u_short fport, lport;
- int unlock_udbinfo;
+ int unlock_udbinfo, unlock_inp;
u_char tos;
+ uint8_t pr;
+ uint16_t cscov = 0;
+ uint32_t flowid = 0;
+ uint8_t flowtype = M_HASHTYPE_NONE;
/*
* udp_output() may need to temporarily bind or connect the current
@@ -976,7 +1172,15 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
}
src.sin_family = 0;
- INP_RLOCK(inp);
+ sin = (struct sockaddr_in *)addr;
+ if (sin == NULL ||
+ (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0)) {
+ INP_WLOCK(inp);
+ unlock_inp = UH_WLOCKED;
+ } else {
+ INP_RLOCK(inp);
+ unlock_inp = UH_RLOCKED;
+ }
tos = inp->inp_ip_tos;
if (control != NULL) {
/*
@@ -984,7 +1188,10 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
* stored in a single mbuf.
*/
if (control->m_next) {
- INP_RUNLOCK(inp);
+ if (unlock_inp == UH_WLOCKED)
+ INP_WUNLOCK(inp);
+ else
+ INP_RUNLOCK(inp);
m_freem(control);
m_freem(m);
return (EINVAL);
@@ -1024,6 +1231,31 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
tos = *(u_char *)CMSG_DATA(cm);
break;
+ case IP_FLOWID:
+ if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) {
+ error = EINVAL;
+ break;
+ }
+ flowid = *(uint32_t *) CMSG_DATA(cm);
+ break;
+
+ case IP_FLOWTYPE:
+ if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) {
+ error = EINVAL;
+ break;
+ }
+ flowtype = *(uint32_t *) CMSG_DATA(cm);
+ break;
+
+#ifdef RSS
+ case IP_RSSBUCKETID:
+ if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) {
+ error = EINVAL;
+ break;
+ }
+ /* This is just a placeholder for now */
+ break;
+#endif /* RSS */
default:
error = ENOPROTOOPT;
break;
@@ -1034,7 +1266,10 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
m_freem(control);
}
if (error) {
- INP_RUNLOCK(inp);
+ if (unlock_inp == UH_WLOCKED)
+ INP_WUNLOCK(inp);
+ else
+ INP_RUNLOCK(inp);
m_freem(m);
return (error);
}
@@ -1055,12 +1290,12 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
*
* XXXRW: Check that hash locking update here is correct.
*/
+ pr = inp->inp_socket->so_proto->pr_protocol;
+ pcbinfo = udp_get_inpcbinfo(pr);
sin = (struct sockaddr_in *)addr;
if (sin != NULL &&
(inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0)) {
- INP_RUNLOCK(inp);
- INP_WLOCK(inp);
- INP_HASH_WLOCK(&V_udbinfo);
+ INP_HASH_WLOCK(pcbinfo);
unlock_udbinfo = UH_WLOCKED;
} else if ((sin != NULL && (
(sin->sin_addr.s_addr == INADDR_ANY) ||
@@ -1068,7 +1303,7 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
(inp->inp_laddr.s_addr == INADDR_ANY) ||
(inp->inp_lport == 0))) ||
(src.sin_family == AF_INET)) {
- INP_HASH_RLOCK(&V_udbinfo);
+ INP_HASH_RLOCK(pcbinfo);
unlock_udbinfo = UH_RLOCKED;
} else
unlock_udbinfo = UH_UNLOCKED;
@@ -1081,7 +1316,7 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
laddr = inp->inp_laddr;
lport = inp->inp_lport;
if (src.sin_family == AF_INET) {
- INP_HASH_LOCK_ASSERT(&V_udbinfo);
+ INP_HASH_LOCK_ASSERT(pcbinfo);
if ((lport == 0) ||
(laddr.s_addr == INADDR_ANY &&
src.sin_addr.s_addr == INADDR_ANY)) {
@@ -1132,7 +1367,7 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
inp->inp_lport == 0 ||
sin->sin_addr.s_addr == INADDR_ANY ||
sin->sin_addr.s_addr == INADDR_BROADCAST) {
- INP_HASH_LOCK_ASSERT(&V_udbinfo);
+ INP_HASH_LOCK_ASSERT(pcbinfo);
error = in_pcbconnect_setup(inp, addr, &laddr.s_addr,
&lport, &faddr.s_addr, &fport, NULL,
td->td_ucred);
@@ -1147,7 +1382,7 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
if (inp->inp_laddr.s_addr == INADDR_ANY &&
inp->inp_lport == 0) {
INP_WLOCK_ASSERT(inp);
- INP_HASH_WLOCK_ASSERT(&V_udbinfo);
+ INP_HASH_WLOCK_ASSERT(pcbinfo);
/*
* Remember addr if jailed, to prevent
* rebinding.
@@ -1181,7 +1416,7 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
* link-layer headers. Immediately slide the data pointer back forward
* since we won't use that space at this layer.
*/
- M_PREPEND(m, sizeof(struct udpiphdr) + max_linkhdr, M_DONTWAIT);
+ M_PREPEND(m, sizeof(struct udpiphdr) + max_linkhdr, M_NOWAIT);
if (m == NULL) {
error = ENOBUFS;
goto release;
@@ -1196,12 +1431,30 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
*/
ui = mtod(m, struct udpiphdr *);
bzero(ui->ui_x1, sizeof(ui->ui_x1)); /* XXX still needed? */
- ui->ui_pr = IPPROTO_UDP;
+ ui->ui_pr = pr;
ui->ui_src = laddr;
ui->ui_dst = faddr;
ui->ui_sport = lport;
ui->ui_dport = fport;
ui->ui_ulen = htons((u_short)len + sizeof(struct udphdr));
+ if (pr == IPPROTO_UDPLITE) {
+ struct udpcb *up;
+ uint16_t plen;
+
+ up = intoudpcb(inp);
+ cscov = up->u_txcslen;
+ plen = (u_short)len + sizeof(struct udphdr);
+ if (cscov >= plen)
+ cscov = 0;
+ ui->ui_len = htons(plen);
+ ui->ui_ulen = htons(cscov);
+ /*
+ * For UDP-Lite, checksum coverage length of zero means
+ * the entire UDPLite packet is covered by the checksum.
+ */
+ cscov_partial = (cscov == 0) ? 0 : 1;
+ } else
+ ui->ui_v = IPVERSION << 4;
/*
* Set the Don't Fragment bit in the IP header.
@@ -1210,7 +1463,7 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
struct ip *ip;
ip = (struct ip *)&ui->ui_i;
- ip->ip_off |= IP_DF;
+ ip->ip_off |= htons(IP_DF);
}
ipflags = 0;
@@ -1228,27 +1481,90 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
/*
* Set up checksum and output datagram.
*/
- if (V_udp_cksum) {
+ ui->ui_sum = 0;
+ if (pr == IPPROTO_UDPLITE) {
+ if (inp->inp_flags & INP_ONESBCAST)
+ faddr.s_addr = INADDR_BROADCAST;
+ if (cscov_partial) {
+ if ((ui->ui_sum = in_cksum(m, sizeof(struct ip) + cscov)) == 0)
+ ui->ui_sum = 0xffff;
+ } else {
+ if ((ui->ui_sum = in_cksum(m, sizeof(struct udpiphdr) + len)) == 0)
+ ui->ui_sum = 0xffff;
+ }
+ } else if (V_udp_cksum) {
if (inp->inp_flags & INP_ONESBCAST)
faddr.s_addr = INADDR_BROADCAST;
ui->ui_sum = in_pseudo(ui->ui_src.s_addr, faddr.s_addr,
- htons((u_short)len + sizeof(struct udphdr) + IPPROTO_UDP));
+ htons((u_short)len + sizeof(struct udphdr) + pr));
m->m_pkthdr.csum_flags = CSUM_UDP;
m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
- } else
- ui->ui_sum = 0;
- ((struct ip *)ui)->ip_len = sizeof (struct udpiphdr) + len;
+ }
+ ((struct ip *)ui)->ip_len = htons(sizeof(struct udpiphdr) + len);
((struct ip *)ui)->ip_ttl = inp->inp_ip_ttl; /* XXX */
((struct ip *)ui)->ip_tos = tos; /* XXX */
UDPSTAT_INC(udps_opackets);
+ /*
+ * Setup flowid / RSS information for outbound socket.
+ *
+ * Once the UDP code decides to set a flowid some other way,
+ * this allows the flowid to be overridden by userland.
+ */
+ if (flowtype != M_HASHTYPE_NONE) {
+ m->m_pkthdr.flowid = flowid;
+ M_HASHTYPE_SET(m, flowtype);
+#ifdef RSS
+ } else {
+ uint32_t hash_val, hash_type;
+ /*
+ * Calculate an appropriate RSS hash for UDP and
+ * UDP Lite.
+ *
+ * The called function will take care of figuring out
+ * whether a 2-tuple or 4-tuple hash is required based
+ * on the currently configured scheme.
+ *
+ * Later on, connected socket values should be
+ * cached in the inpcb and reused, rather than constantly
+ * re-calculating it.
+ *
+ * UDP Lite is a different protocol number and will
+ * likely end up being hashed as a 2-tuple until
+ * RSS / NICs grow UDP Lite protocol awareness.
+ */
+ if (rss_proto_software_hash_v4(faddr, laddr, fport, lport,
+ pr, &hash_val, &hash_type) == 0) {
+ m->m_pkthdr.flowid = hash_val;
+ M_HASHTYPE_SET(m, hash_type);
+ }
+#endif
+ }
+
+#ifdef RSS
+ /*
+ * Don't override with the inp cached flowid value.
+ *
+ * Depending upon the kind of send being done, the inp
+ * flowid/flowtype values may actually not be appropriate
+ * for this particular socket send.
+ *
+ * We should either leave the flowid at zero (which is what is
+ * currently done) or set it to some software generated
+ * hash value based on the packet contents.
+ */
+ ipflags |= IP_NODEFAULTFLOWID;
+#endif /* RSS */
+
if (unlock_udbinfo == UH_WLOCKED)
- INP_HASH_WUNLOCK(&V_udbinfo);
+ INP_HASH_WUNLOCK(pcbinfo);
else if (unlock_udbinfo == UH_RLOCKED)
- INP_HASH_RUNLOCK(&V_udbinfo);
- error = ip_output(m, inp->inp_options, NULL, ipflags,
+ INP_HASH_RUNLOCK(pcbinfo);
+ UDP_PROBE(send, NULL, inp, &ui->ui_i, inp, &ui->ui_u);
+ error = ip_output(m, inp->inp_options,
+ (unlock_inp == UH_WLOCKED ? &inp->inp_route : NULL), ipflags,
inp->inp_moptions, inp);
- if (unlock_udbinfo == UH_WLOCKED)
+ if (unlock_inp == UH_WLOCKED)
INP_WUNLOCK(inp);
else
INP_RUNLOCK(inp);
@@ -1256,10 +1572,10 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
release:
if (unlock_udbinfo == UH_WLOCKED) {
- INP_HASH_WUNLOCK(&V_udbinfo);
+ INP_HASH_WUNLOCK(pcbinfo);
INP_WUNLOCK(inp);
} else if (unlock_udbinfo == UH_RLOCKED) {
- INP_HASH_RUNLOCK(&V_udbinfo);
+ INP_HASH_RUNLOCK(pcbinfo);
INP_RUNLOCK(inp);
} else
INP_RUNLOCK(inp);
@@ -1297,7 +1613,7 @@ udp4_espdecap(struct inpcb *inp, struct mbuf *m, int off)
if (minlen > m->m_pkthdr.len)
minlen = m->m_pkthdr.len;
if ((m = m_pullup(m, minlen)) == NULL) {
- IPSECSTAT_INC(in_inval);
+ IPSECSTAT_INC(ips_in_inval);
return (NULL); /* Bypass caller processing. */
}
data = mtod(m, caddr_t); /* Points to ip header. */
@@ -1337,7 +1653,7 @@ udp4_espdecap(struct inpcb *inp, struct mbuf *m, int off)
uint32_t spi;
if (payload <= sizeof(struct esp)) {
- IPSECSTAT_INC(in_inval);
+ IPSECSTAT_INC(ips_in_inval);
m_freem(m);
return (NULL); /* Discard. */
}
@@ -1358,7 +1674,7 @@ udp4_espdecap(struct inpcb *inp, struct mbuf *m, int off)
tag = m_tag_get(PACKET_TAG_IPSEC_NAT_T_PORTS,
2 * sizeof(uint16_t), M_NOWAIT);
if (tag == NULL) {
- IPSECSTAT_INC(in_nomem);
+ IPSECSTAT_INC(ips_in_nomem);
m_freem(m);
return (NULL); /* Discard. */
}
@@ -1387,7 +1703,7 @@ udp4_espdecap(struct inpcb *inp, struct mbuf *m, int off)
m_adj(m, skip);
ip = mtod(m, struct ip *);
- ip->ip_len -= skip;
+ ip->ip_len = htons(ntohs(ip->ip_len) - skip);
ip->ip_p = IPPROTO_ESP;
/*
@@ -1397,7 +1713,8 @@ udp4_espdecap(struct inpcb *inp, struct mbuf *m, int off)
if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID)
m->m_pkthdr.csum_flags &= ~(CSUM_DATA_VALID|CSUM_PSEUDO_HDR);
- (void) ipsec4_common_input(m, iphlen, ip->ip_p);
+ (void) ipsec_common_input(m, iphlen, offsetof(struct ip, ip_p),
+ AF_INET, ip->ip_p);
return (NULL); /* NB: consumed, bypass processing. */
}
#endif /* defined(IPSEC) && defined(IPSEC_NAT_T) */
@@ -1406,15 +1723,17 @@ static void
udp_abort(struct socket *so)
{
struct inpcb *inp;
+ struct inpcbinfo *pcbinfo;
+ pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("udp_abort: inp == NULL"));
INP_WLOCK(inp);
if (inp->inp_faddr.s_addr != INADDR_ANY) {
- INP_HASH_WLOCK(&V_udbinfo);
+ INP_HASH_WLOCK(pcbinfo);
in_pcbdisconnect(inp);
inp->inp_laddr.s_addr = INADDR_ANY;
- INP_HASH_WUNLOCK(&V_udbinfo);
+ INP_HASH_WUNLOCK(pcbinfo);
soisdisconnected(so);
}
INP_WUNLOCK(inp);
@@ -1424,17 +1743,19 @@ static int
udp_attach(struct socket *so, int proto, struct thread *td)
{
struct inpcb *inp;
+ struct inpcbinfo *pcbinfo;
int error;
+ pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
inp = sotoinpcb(so);
KASSERT(inp == NULL, ("udp_attach: inp != NULL"));
error = soreserve(so, udp_sendspace, udp_recvspace);
if (error)
return (error);
- INP_INFO_WLOCK(&V_udbinfo);
- error = in_pcballoc(so, &V_udbinfo);
+ INP_INFO_WLOCK(pcbinfo);
+ error = in_pcballoc(so, pcbinfo);
if (error) {
- INP_INFO_WUNLOCK(&V_udbinfo);
+ INP_INFO_WUNLOCK(pcbinfo);
return (error);
}
@@ -1446,18 +1767,18 @@ udp_attach(struct socket *so, int proto, struct thread *td)
if (error) {
in_pcbdetach(inp);
in_pcbfree(inp);
- INP_INFO_WUNLOCK(&V_udbinfo);
+ INP_INFO_WUNLOCK(pcbinfo);
return (error);
}
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_udbinfo);
+ INP_INFO_WUNLOCK(pcbinfo);
return (0);
}
#endif /* INET */
int
-udp_set_kernel_tunneling(struct socket *so, udp_tun_func_t f)
+udp_set_kernel_tunneling(struct socket *so, udp_tun_func_t f, udp_tun_icmp_t i, void *ctx)
{
struct inpcb *inp;
struct udpcb *up;
@@ -1468,11 +1789,14 @@ udp_set_kernel_tunneling(struct socket *so, udp_tun_func_t f)
KASSERT(inp != NULL, ("udp_set_kernel_tunneling: inp == NULL"));
INP_WLOCK(inp);
up = intoudpcb(inp);
- if (up->u_tun_func != NULL) {
+ if ((up->u_tun_func != NULL) ||
+ (up->u_icmp_func != NULL)) {
INP_WUNLOCK(inp);
return (EBUSY);
}
up->u_tun_func = f;
+ up->u_icmp_func = i;
+ up->u_tun_ctx = ctx;
INP_WUNLOCK(inp);
return (0);
}
@@ -1482,14 +1806,16 @@ static int
udp_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
struct inpcb *inp;
+ struct inpcbinfo *pcbinfo;
int error;
+ pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("udp_bind: inp == NULL"));
INP_WLOCK(inp);
- INP_HASH_WLOCK(&V_udbinfo);
+ INP_HASH_WLOCK(pcbinfo);
error = in_pcbbind(inp, nam, td->td_ucred);
- INP_HASH_WUNLOCK(&V_udbinfo);
+ INP_HASH_WUNLOCK(pcbinfo);
INP_WUNLOCK(inp);
return (error);
}
@@ -1498,15 +1824,17 @@ static void
udp_close(struct socket *so)
{
struct inpcb *inp;
+ struct inpcbinfo *pcbinfo;
+ pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("udp_close: inp == NULL"));
INP_WLOCK(inp);
if (inp->inp_faddr.s_addr != INADDR_ANY) {
- INP_HASH_WLOCK(&V_udbinfo);
+ INP_HASH_WLOCK(pcbinfo);
in_pcbdisconnect(inp);
inp->inp_laddr.s_addr = INADDR_ANY;
- INP_HASH_WUNLOCK(&V_udbinfo);
+ INP_HASH_WUNLOCK(pcbinfo);
soisdisconnected(so);
}
INP_WUNLOCK(inp);
@@ -1516,9 +1844,11 @@ static int
udp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
struct inpcb *inp;
- int error;
+ struct inpcbinfo *pcbinfo;
struct sockaddr_in *sin;
+ int error;
+ pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("udp_connect: inp == NULL"));
INP_WLOCK(inp);
@@ -1532,9 +1862,9 @@ udp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
INP_WUNLOCK(inp);
return (error);
}
- INP_HASH_WLOCK(&V_udbinfo);
+ INP_HASH_WLOCK(pcbinfo);
error = in_pcbconnect(inp, nam, td->td_ucred);
- INP_HASH_WUNLOCK(&V_udbinfo);
+ INP_HASH_WUNLOCK(pcbinfo);
if (error == 0)
soisconnected(so);
INP_WUNLOCK(inp);
@@ -1545,20 +1875,22 @@ static void
udp_detach(struct socket *so)
{
struct inpcb *inp;
+ struct inpcbinfo *pcbinfo;
struct udpcb *up;
+ pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("udp_detach: inp == NULL"));
KASSERT(inp->inp_faddr.s_addr == INADDR_ANY,
("udp_detach: not disconnected"));
- INP_INFO_WLOCK(&V_udbinfo);
+ INP_INFO_WLOCK(pcbinfo);
INP_WLOCK(inp);
up = intoudpcb(inp);
KASSERT(up != NULL, ("%s: up == NULL", __func__));
inp->inp_ppcb = NULL;
in_pcbdetach(inp);
in_pcbfree(inp);
- INP_INFO_WUNLOCK(&V_udbinfo);
+ INP_INFO_WUNLOCK(pcbinfo);
udp_discardcb(up);
}
@@ -1566,7 +1898,9 @@ static int
udp_disconnect(struct socket *so)
{
struct inpcb *inp;
+ struct inpcbinfo *pcbinfo;
+ pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("udp_disconnect: inp == NULL"));
INP_WLOCK(inp);
@@ -1574,10 +1908,10 @@ udp_disconnect(struct socket *so)
INP_WUNLOCK(inp);
return (ENOTCONN);
}
- INP_HASH_WLOCK(&V_udbinfo);
+ INP_HASH_WLOCK(pcbinfo);
in_pcbdisconnect(inp);
inp->inp_laddr.s_addr = INADDR_ANY;
- INP_HASH_WUNLOCK(&V_udbinfo);
+ INP_HASH_WUNLOCK(pcbinfo);
SOCK_LOCK(so);
so->so_state &= ~SS_ISCONNECTED; /* XXX */
SOCK_UNLOCK(so);
diff --git a/freebsd/sys/netinet/udp_var.h b/freebsd/sys/netinet/udp_var.h
index 6b9b5362..172d969d 100644
--- a/freebsd/sys/netinet/udp_var.h
+++ b/freebsd/sys/netinet/udp_var.h
@@ -42,6 +42,7 @@ struct udpiphdr {
struct udphdr ui_u; /* udp header */
};
#define ui_x1 ui_i.ih_x1
+#define ui_v ui_i.ih_x1[0]
#define ui_pr ui_i.ih_pr
#define ui_len ui_i.ih_len
#define ui_src ui_i.ih_src
@@ -51,14 +52,23 @@ struct udpiphdr {
#define ui_ulen ui_u.uh_ulen
#define ui_sum ui_u.uh_sum
-typedef void(*udp_tun_func_t)(struct mbuf *, int off, struct inpcb *);
+struct inpcb;
+struct mbuf;
+typedef void(*udp_tun_func_t)(struct mbuf *, int, struct inpcb *,
+ const struct sockaddr *, void *);
+typedef void(*udp_tun_icmp_t)(int, struct sockaddr *, void *, void *);
+
/*
* UDP control block; one per udp.
*/
struct udpcb {
udp_tun_func_t u_tun_func; /* UDP kernel tunneling callback. */
+ udp_tun_icmp_t u_icmp_func; /* UDP kernel tunneling icmp callback */
u_int u_flags; /* Generic UDP flags. */
+ uint16_t u_rxcslen; /* Coverage for incoming datagrams. */
+ uint16_t u_txcslen; /* Coverage for outgoing datagrams. */
+ void *u_tun_ctx; /* Tunneling callback context. */
};
#define intoudpcb(ip) ((struct udpcb *)(ip)->inp_ppcb)
@@ -72,96 +82,107 @@ struct udpcb {
struct udpstat {
/* input statistics: */
- u_long udps_ipackets; /* total input packets */
- u_long udps_hdrops; /* packet shorter than header */
- u_long udps_badsum; /* checksum error */
- u_long udps_nosum; /* no checksum */
- u_long udps_badlen; /* data length larger than packet */
- u_long udps_noport; /* no socket on port */
- u_long udps_noportbcast; /* of above, arrived as broadcast */
- u_long udps_fullsock; /* not delivered, input socket full */
- u_long udpps_pcbcachemiss; /* input packets missing pcb cache */
- u_long udpps_pcbhashmiss; /* input packets not for hashed pcb */
+ uint64_t udps_ipackets; /* total input packets */
+ uint64_t udps_hdrops; /* packet shorter than header */
+ uint64_t udps_badsum; /* checksum error */
+ uint64_t udps_nosum; /* no checksum */
+ uint64_t udps_badlen; /* data length larger than packet */
+ uint64_t udps_noport; /* no socket on port */
+ uint64_t udps_noportbcast; /* of above, arrived as broadcast */
+ uint64_t udps_fullsock; /* not delivered, input socket full */
+ uint64_t udpps_pcbcachemiss; /* input packets missing pcb cache */
+ uint64_t udpps_pcbhashmiss; /* input packets not for hashed pcb */
/* output statistics: */
- u_long udps_opackets; /* total output packets */
- u_long udps_fastout; /* output packets on fast path */
+ uint64_t udps_opackets; /* total output packets */
+ uint64_t udps_fastout; /* output packets on fast path */
/* of no socket on port, arrived as multicast */
- u_long udps_noportmcast;
- u_long udps_filtermcast; /* blocked by multicast filter */
+ uint64_t udps_noportmcast;
+ uint64_t udps_filtermcast; /* blocked by multicast filter */
};
#ifdef _KERNEL
+#include <sys/counter.h>
#ifdef __rtems__
#include <errno.h>
#undef errno
#endif /* __rtems__ */
+
+VNET_PCPUSTAT_DECLARE(struct udpstat, udpstat);
/*
* In-kernel consumers can use these accessor macros directly to update
* stats.
*/
-#define UDPSTAT_ADD(name, val) V_udpstat.name += (val)
+#define UDPSTAT_ADD(name, val) \
+ VNET_PCPUSTAT_ADD(struct udpstat, udpstat, name, (val))
#define UDPSTAT_INC(name) UDPSTAT_ADD(name, 1)
/*
* Kernel module consumers must use this accessor macro.
*/
void kmod_udpstat_inc(int statnum);
-#define KMOD_UDPSTAT_INC(name) \
- kmod_udpstat_inc(offsetof(struct udpstat, name) / sizeof(u_long))
+#define KMOD_UDPSTAT_INC(name) \
+ kmod_udpstat_inc(offsetof(struct udpstat, name) / sizeof(uint64_t))
#endif
/*
- * Names for UDP sysctl objects.
+ * Identifiers for UDP sysctl nodes.
*/
#define UDPCTL_CHECKSUM 1 /* checksum UDP packets */
#define UDPCTL_STATS 2 /* statistics (read-only) */
#define UDPCTL_MAXDGRAM 3 /* max datagram size */
#define UDPCTL_RECVSPACE 4 /* default receive buffer space */
#define UDPCTL_PCBLIST 5 /* list of PCBs for UDP sockets */
-#define UDPCTL_MAXID 6
-
-#define UDPCTL_NAMES { \
- { 0, 0 }, \
- { "checksum", CTLTYPE_INT }, \
- { "stats", CTLTYPE_STRUCT }, \
- { "maxdgram", CTLTYPE_INT }, \
- { "recvspace", CTLTYPE_INT }, \
- { "pcblist", CTLTYPE_STRUCT }, \
-}
#ifdef _KERNEL
+#include <netinet/in_pcb.h>
SYSCTL_DECL(_net_inet_udp);
extern struct pr_usrreqs udp_usrreqs;
VNET_DECLARE(struct inpcbhead, udb);
VNET_DECLARE(struct inpcbinfo, udbinfo);
+VNET_DECLARE(struct inpcbhead, ulitecb);
+VNET_DECLARE(struct inpcbinfo, ulitecbinfo);
#define V_udb VNET(udb)
#define V_udbinfo VNET(udbinfo)
+#define V_ulitecb VNET(ulitecb)
+#define V_ulitecbinfo VNET(ulitecbinfo)
extern u_long udp_sendspace;
extern u_long udp_recvspace;
VNET_DECLARE(int, udp_cksum);
-VNET_DECLARE(struct udpstat, udpstat);
VNET_DECLARE(int, udp_blackhole);
#define V_udp_cksum VNET(udp_cksum)
-#define V_udpstat VNET(udpstat)
#define V_udp_blackhole VNET(udp_blackhole)
extern int udp_log_in_vain;
-int udp_newudpcb(struct inpcb *);
-void udp_discardcb(struct udpcb *);
+static __inline struct inpcbinfo *
+udp_get_inpcbinfo(int protocol)
+{
+ return (protocol == IPPROTO_UDP) ? &V_udbinfo : &V_ulitecbinfo;
+}
-void udp_ctlinput(int, struct sockaddr *, void *);
-int udp_ctloutput(struct socket *, struct sockopt *);
-void udp_init(void);
-#ifdef VIMAGE
-void udp_destroy(void);
-#endif
-void udp_input(struct mbuf *, int);
+static __inline struct inpcbhead *
+udp_get_pcblist(int protocol)
+{
+ return (protocol == IPPROTO_UDP) ? &V_udb : &V_ulitecb;
+}
+
+int udp_newudpcb(struct inpcb *);
+void udp_discardcb(struct udpcb *);
+
+void udp_ctlinput(int, struct sockaddr *, void *);
+void udplite_ctlinput(int, struct sockaddr *, void *);
+int udp_ctloutput(struct socket *, struct sockopt *);
+void udp_init(void);
+void udplite_init(void);
+int udp_input(struct mbuf **, int *, int);
+void udplite_input(struct mbuf *, int);
struct inpcb *udp_notify(struct inpcb *inp, int errno);
-int udp_shutdown(struct socket *so);
+int udp_shutdown(struct socket *so);
-int udp_set_kernel_tunneling(struct socket *so, udp_tun_func_t f);
-#endif
+int udp_set_kernel_tunneling(struct socket *so, udp_tun_func_t f,
+ udp_tun_icmp_t i, void *ctx);
-#endif
+#endif /* _KERNEL */
+
+#endif /* _NETINET_UDP_VAR_H_ */
diff --git a/freebsd/sys/netinet/udplite.h b/freebsd/sys/netinet/udplite.h
new file mode 100644
index 00000000..0e23cd70
--- /dev/null
+++ b/freebsd/sys/netinet/udplite.h
@@ -0,0 +1,38 @@
+/*-
+ * Copyright (c) 2014, Kevin Lo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_UDPLITE_H_
+#define _NETINET_UDPLITE_H_
+
+/*
+ * User-settable options (used with setsockopt).
+ */
+#define UDPLITE_SEND_CSCOV 2 /* Sender checksum coverage. */
+#define UDPLITE_RECV_CSCOV 4 /* Receiver checksum coverage. */
+
+#endif /* !_NETINET_UDPLITE_H_ */