summaryrefslogtreecommitdiffstats
path: root/freebsd/sys/netinet/in_pcb.h
diff options
context:
space:
mode:
Diffstat (limited to 'freebsd/sys/netinet/in_pcb.h')
-rw-r--r--freebsd/sys/netinet/in_pcb.h167
1 files changed, 101 insertions, 66 deletions
diff --git a/freebsd/sys/netinet/in_pcb.h b/freebsd/sys/netinet/in_pcb.h
index d00dd456..86c9705c 100644
--- a/freebsd/sys/netinet/in_pcb.h
+++ b/freebsd/sys/netinet/in_pcb.h
@@ -51,8 +51,11 @@
#include <sys/lock.h>
#include <sys/rwlock.h>
#include <net/vnet.h>
+#include <net/if.h>
+#include <net/if_var.h>
#include <vm/uma.h>
#endif
+#include <sys/ck.h>
#define in6pcb inpcb /* for KAME src sync over BSD*'s */
#define in6p_sp inp_sp /* for KAME src sync over BSD*'s */
@@ -65,8 +68,9 @@
* numbers, and pointers up (to a socket structure) and down (to a
* protocol-specific control block) are stored here.
*/
-LIST_HEAD(inpcbhead, inpcb);
-LIST_HEAD(inpcbporthead, inpcbport);
+CK_LIST_HEAD(inpcbhead, inpcb);
+CK_LIST_HEAD(inpcbporthead, inpcbport);
+CK_LIST_HEAD(inpcblbgrouphead, inpcblbgroup);
typedef uint64_t inp_gen_t;
/*
@@ -79,6 +83,11 @@ struct in_addr_4in6 {
struct in_addr ia46_addr4;
};
+union in_dependaddr {
+ struct in_addr_4in6 id46_addr;
+ struct in6_addr id6_addr;
+};
+
/*
* NOTE: ipv6 addrs should be 64-bit aligned, per RFC 2553. in_conninfo has
* some extra padding to accomplish this.
@@ -89,22 +98,14 @@ struct in_endpoints {
u_int16_t ie_fport; /* foreign port */
u_int16_t ie_lport; /* local port */
/* protocol dependent part, local and foreign addr */
- union {
- /* foreign host table entry */
- struct in_addr_4in6 ie46_foreign;
- struct in6_addr ie6_foreign;
- } ie_dependfaddr;
- union {
- /* local host table entry */
- struct in_addr_4in6 ie46_local;
- struct in6_addr ie6_local;
- } ie_dependladdr;
+ union in_dependaddr ie_dependfaddr; /* foreign host table entry */
+ union in_dependaddr ie_dependladdr; /* local host table entry */
+#define ie_faddr ie_dependfaddr.id46_addr.ia46_addr4
+#define ie_laddr ie_dependladdr.id46_addr.ia46_addr4
+#define ie6_faddr ie_dependfaddr.id6_addr
+#define ie6_laddr ie_dependladdr.id6_addr
u_int32_t ie6_zoneid; /* scope zone id */
};
-#define ie_faddr ie_dependfaddr.ie46_foreign.ia46_addr4
-#define ie_laddr ie_dependladdr.ie46_local.ia46_addr4
-#define ie6_faddr ie_dependfaddr.ie6_foreign
-#define ie6_laddr ie_dependladdr.ie6_local
/*
* XXX The defines for inc_* are hacks and should be changed to direct
@@ -122,8 +123,8 @@ struct in_conninfo {
* Flags for inc_flags.
*/
#define INC_ISIPV6 0x01
+#define INC_IPV6MINMTU 0x02
-#define inc_isipv6 inc_flags /* temp compatibility */
#define inc_fport inc_ie.ie_fport
#define inc_lport inc_ie.ie_lport
#define inc_faddr inc_ie.ie_faddr
@@ -159,6 +160,7 @@ struct in_conninfo {
* Key:
* (b) - Protected by the hpts lock.
* (c) - Constant after initialization
+ * (e) - Protected by the net_epoch_prempt epoch
* (g) - Protected by the pcbgroup lock
* (i) - Protected by the inpcb lock
* (p) - Protected by the pcbinfo lock for the inpcb
@@ -233,8 +235,8 @@ struct inpcbpolicy;
struct m_snd_tag;
struct inpcb {
/* Cache line #1 (amd64) */
- LIST_ENTRY(inpcb) inp_hash; /* (h/i) hash list */
- LIST_ENTRY(inpcb) inp_pcbgrouphash; /* (g/i) hash list */
+ CK_LIST_ENTRY(inpcb) inp_hash; /* [w](h/i) [r](e/i) hash list */
+ CK_LIST_ENTRY(inpcb) inp_pcbgrouphash; /* (g/i) hash list */
struct rwlock inp_lock;
/* Cache line #2 (amd64) */
#define inp_start_zero inp_hpts
@@ -278,7 +280,7 @@ struct inpcb {
TAILQ_ENTRY(inpcb) inp_input; /* pacing in queue next lock(b) */
struct inpcbinfo *inp_pcbinfo; /* (c) PCB list info */
struct inpcbgroup *inp_pcbgroup; /* (g/i) PCB group list */
- LIST_ENTRY(inpcb) inp_pcbgroup_wild; /* (g/i/h) group wildcard entry */
+ CK_LIST_ENTRY(inpcb) inp_pcbgroup_wild; /* (g/i/h) group wildcard entry */
struct ucred *inp_cred; /* (c) cache of socket cred */
u_int32_t inp_flow; /* (i) IPv6 flow information */
u_char inp_vflag; /* (i) IP version flag (v4/v6) */
@@ -316,18 +318,19 @@ struct inpcb {
int in6p_cksum;
short in6p_hops;
};
- LIST_ENTRY(inpcb) inp_portlist; /* (i/h) */
+ CK_LIST_ENTRY(inpcb) inp_portlist; /* (i/h) */
struct inpcbport *inp_phd; /* (i/h) head of this list */
inp_gen_t inp_gencnt; /* (c) generation count */
- struct llentry *inp_lle; /* cached L2 information */
+ void *spare_ptr; /* Spare pointer. */
rt_gen_t inp_rt_cookie; /* generation for route entry */
union { /* cached L3 information */
struct route inp_route;
struct route_in6 inp_route6;
};
- LIST_ENTRY(inpcb) inp_list; /* (p/l) list for all PCBs for proto */
- /* (p[w]) for list iteration */
- /* (p[r]/l) for addition/removal */
+ CK_LIST_ENTRY(inpcb) inp_list; /* (p/l) list for all PCBs for proto */
+ /* (e[r]) for list iteration */
+ /* (p[w]/l) for addition/removal */
+ struct epoch_context inp_epoch_ctx;
};
#endif /* _KERNEL */
@@ -364,14 +367,11 @@ struct inpcb {
*/
#ifdef _SYS_SOCKETVAR_H_
struct xinpcb {
- size_t xi_len; /* length of this structure */
+ ksize_t xi_len; /* length of this structure */
struct xsocket xi_socket; /* (s,p) */
struct in_conninfo inp_inc; /* (s,p) */
uint64_t inp_gencnt; /* (s,p) */
- union {
- void *inp_ppcb; /* (s) netstat(1) */
- int64_t ph_ppcb;
- };
+ kvaddr_t inp_ppcb; /* (s) netstat(1) */
int64_t inp_spare64[4];
uint32_t inp_flow; /* (s) */
uint32_t inp_flowid; /* (s) */
@@ -392,10 +392,12 @@ struct xinpcb {
} __aligned(8);
struct xinpgen {
- size_t xig_len; /* length of this structure */
+ ksize_t xig_len; /* length of this structure */
u_int xig_count; /* number of PCBs at this time */
+ uint32_t _xig_spare32;
inp_gen_t xig_gen; /* generation count at this time */
so_gen_t xig_sogen; /* socket generation count this time */
+ uint64_t _xig_spare64[4];
} __aligned(8);
#ifdef _KERNEL
void in_pcbtoxinpcb(const struct inpcb *, struct xinpcb *);
@@ -403,7 +405,8 @@ void in_pcbtoxinpcb(const struct inpcb *, struct xinpcb *);
#endif /* _SYS_SOCKETVAR_H_ */
struct inpcbport {
- LIST_ENTRY(inpcbport) phd_hash;
+ struct epoch_context phd_epoch_ctx;
+ CK_LIST_ENTRY(inpcbport) phd_hash;
struct inpcbhead phd_pcblist;
u_short phd_port;
};
@@ -436,22 +439,23 @@ struct in_pcblist {
* Locking key:
*
* (c) Constant or nearly constant after initialisation
+ * (e) - Protected by the net_epoch_prempt epoch
* (g) Locked by ipi_lock
* (l) Locked by ipi_list_lock
- * (h) Read using either ipi_hash_lock or inpcb lock; write requires both
+ * (h) Read using either net_epoch_preempt or inpcb lock; write requires both ipi_hash_lock and inpcb lock
* (p) Protected by one or more pcbgroup locks
* (x) Synchronisation properties poorly defined
*/
struct inpcbinfo {
/*
- * Global lock protecting full inpcb list traversal
+ * Global lock protecting inpcb list modification
*/
- struct rwlock ipi_lock;
+ struct mtx ipi_lock;
/*
* Global list of inpcbs on the protocol.
*/
- struct inpcbhead *ipi_listhead; /* (g/l) */
+ struct inpcbhead *ipi_listhead; /* [r](e) [w](g/l) */
u_int ipi_count; /* (l) */
/*
@@ -482,9 +486,9 @@ struct inpcbinfo {
u_int ipi_hashfields; /* (c) */
/*
- * Global lock protecting non-pcbgroup hash lookup tables.
+ * Global lock protecting modification non-pcbgroup hash lookup tables.
*/
- struct rwlock ipi_hash_lock;
+ struct mtx ipi_hash_lock;
/*
* Global hash of inpcbs, hashed by local and foreign addresses and
@@ -508,6 +512,13 @@ struct inpcbinfo {
u_long ipi_wildmask; /* (p) */
/*
+ * Load balance groups used for the SO_REUSEPORT_LB option,
+ * hashed by local port.
+ */
+ struct inpcblbgrouphead *ipi_lbgrouphashbase; /* (h) */
+ u_long ipi_lbgrouphashmask; /* (h) */
+
+ /*
* Pointer to network stack instance
*/
struct vnet *ipi_vnet; /* (c) */
@@ -549,6 +560,27 @@ struct inpcbgroup {
struct mtx ipg_lock;
} __aligned(CACHE_LINE_SIZE);
+/*
+ * Load balance groups used for the SO_REUSEPORT_LB socket option. Each group
+ * (or unique address:port combination) can be re-used at most
+ * INPCBLBGROUP_SIZMAX (256) times. The inpcbs are stored in il_inp which
+ * is dynamically resized as processes bind/unbind to that specific group.
+ */
+struct inpcblbgroup {
+ CK_LIST_ENTRY(inpcblbgroup) il_list;
+ struct epoch_context il_epoch_ctx;
+ uint16_t il_lport; /* (c) */
+ u_char il_vflag; /* (c) */
+ u_char il_pad;
+ uint32_t il_pad2;
+ union in_dependaddr il_dependladdr; /* (c) */
+#define il_laddr il_dependladdr.id46_addr.ia46_addr4
+#define il6_laddr il_dependladdr.id6_addr
+ uint32_t il_inpsiz; /* max count in il_inp[] (h) */
+ uint32_t il_inpcnt; /* cur count in il_inp[] (h) */
+ struct inpcb *il_inp[]; /* (h) */
+};
+
#define INP_LOCK_INIT(inp, d, t) \
rw_init_flags(&(inp)->inp_lock, (t), RW_RECURSE | RW_DUPOK)
#define INP_LOCK_DESTROY(inp) rw_destroy(&(inp)->inp_lock)
@@ -593,25 +625,24 @@ struct tcpcb *
inp_inpcbtotcpcb(struct inpcb *inp);
void inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
uint32_t *faddr, uint16_t *fp);
-short inp_so_options(const struct inpcb *inp);
+int inp_so_options(const struct inpcb *inp);
#endif /* _KERNEL */
#define INP_INFO_LOCK_INIT(ipi, d) \
- rw_init_flags(&(ipi)->ipi_lock, (d), RW_RECURSE)
-#define INP_INFO_LOCK_DESTROY(ipi) rw_destroy(&(ipi)->ipi_lock)
-#define INP_INFO_RLOCK(ipi) rw_rlock(&(ipi)->ipi_lock)
-#define INP_INFO_WLOCK(ipi) rw_wlock(&(ipi)->ipi_lock)
-#define INP_INFO_TRY_RLOCK(ipi) rw_try_rlock(&(ipi)->ipi_lock)
-#define INP_INFO_TRY_WLOCK(ipi) rw_try_wlock(&(ipi)->ipi_lock)
-#define INP_INFO_TRY_UPGRADE(ipi) rw_try_upgrade(&(ipi)->ipi_lock)
-#define INP_INFO_WLOCKED(ipi) rw_wowned(&(ipi)->ipi_lock)
-#define INP_INFO_RUNLOCK(ipi) rw_runlock(&(ipi)->ipi_lock)
-#define INP_INFO_WUNLOCK(ipi) rw_wunlock(&(ipi)->ipi_lock)
-#define INP_INFO_LOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_LOCKED)
-#define INP_INFO_RLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_RLOCKED)
-#define INP_INFO_WLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_WLOCKED)
-#define INP_INFO_UNLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_UNLOCKED)
+ mtx_init(&(ipi)->ipi_lock, (d), NULL, MTX_DEF| MTX_RECURSE)
+#define INP_INFO_LOCK_DESTROY(ipi) mtx_destroy(&(ipi)->ipi_lock)
+#define INP_INFO_RLOCK_ET(ipi, et) NET_EPOCH_ENTER_ET((et))
+#define INP_INFO_WLOCK(ipi) mtx_lock(&(ipi)->ipi_lock)
+#define INP_INFO_TRY_WLOCK(ipi) mtx_trylock(&(ipi)->ipi_lock)
+#define INP_INFO_WLOCKED(ipi) mtx_owned(&(ipi)->ipi_lock)
+#define INP_INFO_RUNLOCK_ET(ipi, et) NET_EPOCH_EXIT_ET((et))
+#define INP_INFO_RUNLOCK_TP(ipi, tp) NET_EPOCH_EXIT_ET(*(tp)->t_inpcb->inp_et)
+#define INP_INFO_WUNLOCK(ipi) mtx_unlock(&(ipi)->ipi_lock)
+#define INP_INFO_LOCK_ASSERT(ipi) MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(ipi)->ipi_lock))
+#define INP_INFO_RLOCK_ASSERT(ipi) MPASS(in_epoch(net_epoch_preempt))
+#define INP_INFO_WLOCK_ASSERT(ipi) mtx_assert(&(ipi)->ipi_lock, MA_OWNED)
+#define INP_INFO_UNLOCK_ASSERT(ipi) MPASS(!in_epoch(net_epoch_preempt) && !mtx_owned(&(ipi)->ipi_lock))
#define INP_LIST_LOCK_INIT(ipi, d) \
rw_init_flags(&(ipi)->ipi_list_lock, (d), 0)
@@ -632,17 +663,16 @@ short inp_so_options(const struct inpcb *inp);
#define INP_LIST_UNLOCK_ASSERT(ipi) \
rw_assert(&(ipi)->ipi_list_lock, RA_UNLOCKED)
-#define INP_HASH_LOCK_INIT(ipi, d) \
- rw_init_flags(&(ipi)->ipi_hash_lock, (d), 0)
-#define INP_HASH_LOCK_DESTROY(ipi) rw_destroy(&(ipi)->ipi_hash_lock)
-#define INP_HASH_RLOCK(ipi) rw_rlock(&(ipi)->ipi_hash_lock)
-#define INP_HASH_WLOCK(ipi) rw_wlock(&(ipi)->ipi_hash_lock)
-#define INP_HASH_RUNLOCK(ipi) rw_runlock(&(ipi)->ipi_hash_lock)
-#define INP_HASH_WUNLOCK(ipi) rw_wunlock(&(ipi)->ipi_hash_lock)
-#define INP_HASH_LOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_hash_lock, \
- RA_LOCKED)
-#define INP_HASH_WLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_hash_lock, \
- RA_WLOCKED)
+#define INP_HASH_LOCK_INIT(ipi, d) mtx_init(&(ipi)->ipi_hash_lock, (d), NULL, MTX_DEF)
+#define INP_HASH_LOCK_DESTROY(ipi) mtx_destroy(&(ipi)->ipi_hash_lock)
+#define INP_HASH_RLOCK(ipi) struct epoch_tracker inp_hash_et; epoch_enter_preempt(net_epoch_preempt, &inp_hash_et)
+#define INP_HASH_RLOCK_ET(ipi, et) epoch_enter_preempt(net_epoch_preempt, &(et))
+#define INP_HASH_WLOCK(ipi) mtx_lock(&(ipi)->ipi_hash_lock)
+#define INP_HASH_RUNLOCK(ipi) NET_EPOCH_EXIT_ET(inp_hash_et)
+#define INP_HASH_RUNLOCK_ET(ipi, et) NET_EPOCH_EXIT_ET((et))
+#define INP_HASH_WUNLOCK(ipi) mtx_unlock(&(ipi)->ipi_hash_lock)
+#define INP_HASH_LOCK_ASSERT(ipi) MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(ipi)->ipi_hash_lock))
+#define INP_HASH_WLOCK_ASSERT(ipi) mtx_assert(&(ipi)->ipi_hash_lock, MA_OWNED);
#define INP_GROUP_LOCK_INIT(ipg, d) mtx_init(&(ipg)->ipg_lock, (d), NULL, \
MTX_DEF | MTX_DUPOK)
@@ -656,6 +686,10 @@ short inp_so_options(const struct inpcb *inp);
(((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport))) & (mask))
#define INP_PCBPORTHASH(lport, mask) \
(ntohs((lport)) & (mask))
+#define INP_PCBLBGROUP_PORTHASH(lport, mask) \
+ (ntohs((lport)) & (mask))
+#define INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) \
+ ((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport)))
#define INP6_PCBHASHKEY(faddr) ((faddr)->s6_addr32[3])
/*
@@ -711,8 +745,8 @@ short inp_so_options(const struct inpcb *inp);
/*
* Flags for inp_flags2.
*/
-#define INP_LLE_VALID 0x00000001 /* cached lle is valid */
-#define INP_RT_VALID 0x00000002 /* cached rtentry is valid */
+#define INP_2UNUSED1 0x00000001
+#define INP_2UNUSED2 0x00000002
#define INP_PCBGROUPWILD 0x00000004 /* in pcbgroup wildcard list */
#define INP_REUSEPORT 0x00000008 /* SO_REUSEPORT option is set */
#define INP_FREED 0x00000010 /* inp itself is not valid */
@@ -724,6 +758,7 @@ short inp_so_options(const struct inpcb *inp);
#define INP_RATE_LIMIT_CHANGED 0x00000400 /* rate limit needs attention */
#define INP_ORIGDSTADDR 0x00000800 /* receive IP dst address/port */
#define INP_CANNOT_DO_ECN 0x00001000 /* The stack does not do ECN */
+#define INP_REUSEPORT_LB 0x00002000 /* SO_REUSEPORT_LB option is set */
/*
* Flags passed to in_pcblookup*() functions.