From bceabc95c1c85d793200446fa85f1ddc6313ea29 Mon Sep 17 00:00:00 2001
From: Sebastian Huber
Date: Wed, 9 Oct 2013 22:42:09 +0200
Subject: Move files to match FreeBSD layout

---
 freebsd/sys/net/bpf.c             | 2398 ++++++++++++++++
 freebsd/sys/net/bpf.h             |  974 +++++++
 freebsd/sys/net/bpf_buffer.c      |  212 ++
 freebsd/sys/net/bpf_buffer.h      |   50 +
 freebsd/sys/net/bpf_filter.c      |  582 ++++
 freebsd/sys/net/bpf_jitter.c      |  143 +
 freebsd/sys/net/bpf_jitter.h      |   84 +
 freebsd/sys/net/bpf_zerocopy.h    |   56 +
 freebsd/sys/net/bpfdesc.h         |  149 +
 freebsd/sys/net/bridgestp.c       | 2250 +++++++++++++++
 freebsd/sys/net/bridgestp.h       |  396 +++
 freebsd/sys/net/ethernet.h        |    2 +
 freebsd/sys/net/fddi.h            |  105 +
 freebsd/sys/net/firewire.h        |  142 +
 freebsd/sys/net/flowtable.h       |   82 +
 freebsd/sys/net/ieee8023ad_lacp.c | 1947 +++++++++++++
 freebsd/sys/net/ieee8023ad_lacp.h |  333 +++
 freebsd/sys/net/if.c              | 3431 +++++++++++++++++++++++
 freebsd/sys/net/if.h              |    2 +
 freebsd/sys/net/if_arc.h          |  143 +
 freebsd/sys/net/if_arcsubr.c      |  886 ++++++
 freebsd/sys/net/if_arp.h          |    2 +
 freebsd/sys/net/if_atm.h          |  337 +++
 freebsd/sys/net/if_atmsubr.c      |  504 ++++
 freebsd/sys/net/if_bridge.c       | 3458 +++++++++++++++++++++++
 freebsd/sys/net/if_bridgevar.h    |  328 +++
 freebsd/sys/net/if_clone.c        |  617 +++++
 freebsd/sys/net/if_clone.h        |  116 +
 freebsd/sys/net/if_dead.c         |  116 +
 freebsd/sys/net/if_disc.c         |  247 ++
 freebsd/sys/net/if_dl.h           |    2 +
 freebsd/sys/net/if_edsc.c         |  356 +++
 freebsd/sys/net/if_ef.c           |  610 ++++
 freebsd/sys/net/if_enc.c          |  375 +++
 freebsd/sys/net/if_enc.h          |   35 +
 freebsd/sys/net/if_epair.c        |  955 +++++++
 freebsd/sys/net/if_ethersubr.c    | 1364 +++++++++
 freebsd/sys/net/if_faith.c        |  353 +++
 freebsd/sys/net/if_fddisubr.c     |  800 ++++++
 freebsd/sys/net/if_fwsubr.c       |  853 ++++++
 freebsd/sys/net/if_gif.c          | 1025 +++++++
 freebsd/sys/net/if_gif.h          |  130 +
 freebsd/sys/net/if_gre.c          |  909 ++++++
 freebsd/sys/net/if_gre.h          |  194 ++
 freebsd/sys/net/if_iso88025subr.c |  831 ++++++
 freebsd/sys/net/if_lagg.c         | 1808 ++++++++++++
 freebsd/sys/net/if_lagg.h         |  247 ++
 freebsd/sys/net/if_llatbl.c       |  528 ++++
 freebsd/sys/net/if_llatbl.h       |  208 ++
 freebsd/sys/net/if_llc.h          |  161 ++
 freebsd/sys/net/if_loop.c         |  451 +++
 freebsd/sys/net/if_media.c        |  566 ++++
 freebsd/sys/net/if_media.h        |  692 +++++
 freebsd/sys/net/if_mib.c          |  171 ++
 freebsd/sys/net/if_mib.h          |  171 ++
 freebsd/sys/net/if_sppp.h         |  234 ++
 freebsd/sys/net/if_spppfr.c       |  636 +++++
 freebsd/sys/net/if_spppsubr.c     | 5492 +++++++++++++++++++++++++++++++++++++
 freebsd/sys/net/if_stf.c          |  850 ++++++
 freebsd/sys/net/if_stf.h          |   38 +
 freebsd/sys/net/if_tap.c          | 1086 ++++++++
 freebsd/sys/net/if_tap.h          |   74 +
 freebsd/sys/net/if_tapvar.h       |   69 +
 freebsd/sys/net/if_tun.c          | 1059 +++++++
 freebsd/sys/net/if_tun.h          |   48 +
 freebsd/sys/net/if_types.h        |    2 +
 freebsd/sys/net/if_var.h          |  904 ++++++
 freebsd/sys/net/if_vlan.c         | 1538 +++++++++++
 freebsd/sys/net/if_vlan_var.h     |  137 +
 freebsd/sys/net/iso88025.h        |  172 ++
 freebsd/sys/net/netisr.c          | 1172 ++++++++
 freebsd/sys/net/netisr.h          |  156 ++
 freebsd/sys/net/pfil.c            |  331 +++
 freebsd/sys/net/pfil.h            |  117 +
 freebsd/sys/net/pfkeyv2.h         |  432 +++
 freebsd/sys/net/ppp_defs.h        |  158 ++
 freebsd/sys/net/radix.c           | 1205 ++++++++
 freebsd/sys/net/radix.h           |  176 ++
 freebsd/sys/net/radix_mpath.c     |  365 +++
 freebsd/sys/net/radix_mpath.h     |   63 +
 freebsd/sys/net/raw_cb.c          |  119 +
 freebsd/sys/net/raw_cb.h          |   84 +
 freebsd/sys/net/raw_usrreq.c      |  266 ++
 freebsd/sys/net/route.c           | 1601 +++++++++++
 freebsd/sys/net/route.h           |    2 +
 freebsd/sys/net/rtsock.c          | 1702 ++++++++++++
 freebsd/sys/net/slcompress.c      |  609 ++++
 freebsd/sys/net/slcompress.h      |  158 ++
 freebsd/sys/net/vnet.h            |  437 +++
 freebsd/sys/net/zlib.c            | 5409 ++++++++++++++++++++++++++++++++++++
 freebsd/sys/net/zlib.h            | 1018 +++++++
 91 files changed, 61806 insertions(+)
 create mode 100644 freebsd/sys/net/bpf.c
 create mode 100644 freebsd/sys/net/bpf.h
 create mode 100644 freebsd/sys/net/bpf_buffer.c
 create mode 100644 freebsd/sys/net/bpf_buffer.h
 create mode 100644 freebsd/sys/net/bpf_filter.c
 create mode 100644 freebsd/sys/net/bpf_jitter.c
 create mode 100644 freebsd/sys/net/bpf_jitter.h
 create mode 100644 freebsd/sys/net/bpf_zerocopy.h
 create mode 100644 freebsd/sys/net/bpfdesc.h
 create mode 100644 freebsd/sys/net/bridgestp.c
 create mode 100644 freebsd/sys/net/bridgestp.h
 create mode 100644 freebsd/sys/net/ethernet.h
 create mode 100644 freebsd/sys/net/fddi.h
 create mode 100644 freebsd/sys/net/firewire.h
 create mode 100644 freebsd/sys/net/flowtable.h
 create mode 100644 freebsd/sys/net/ieee8023ad_lacp.c
 create mode 100644 freebsd/sys/net/ieee8023ad_lacp.h
 create mode 100644 freebsd/sys/net/if.c
 create mode 100644 freebsd/sys/net/if.h
 create mode 100644 freebsd/sys/net/if_arc.h
 create mode 100644 freebsd/sys/net/if_arcsubr.c
 create mode 100644 freebsd/sys/net/if_arp.h
 create mode 100644 freebsd/sys/net/if_atm.h
 create mode 100644 freebsd/sys/net/if_atmsubr.c
 create mode 100644 freebsd/sys/net/if_bridge.c
 create mode 100644 freebsd/sys/net/if_bridgevar.h
 create mode 100644 freebsd/sys/net/if_clone.c
 create mode 100644 freebsd/sys/net/if_clone.h
 create mode 100644 freebsd/sys/net/if_dead.c
 create mode 100644 freebsd/sys/net/if_disc.c
 create mode 100644 freebsd/sys/net/if_dl.h
 create mode 100644 freebsd/sys/net/if_edsc.c
 create mode 100644 freebsd/sys/net/if_ef.c
 create mode 100644 freebsd/sys/net/if_enc.c
 create mode 100644 freebsd/sys/net/if_enc.h
 create mode 100644 freebsd/sys/net/if_epair.c
 create mode 100644 freebsd/sys/net/if_ethersubr.c
 create mode 100644 freebsd/sys/net/if_faith.c
 create mode 100644 freebsd/sys/net/if_fddisubr.c
 create mode 100644 freebsd/sys/net/if_fwsubr.c
 create mode 100644 freebsd/sys/net/if_gif.c
 create mode 100644 freebsd/sys/net/if_gif.h
 create mode 100644 freebsd/sys/net/if_gre.c
 create mode 100644 freebsd/sys/net/if_gre.h
 create mode 100644 freebsd/sys/net/if_iso88025subr.c
 create mode 100644 freebsd/sys/net/if_lagg.c
 create mode 100644 freebsd/sys/net/if_lagg.h
 create mode 100644 freebsd/sys/net/if_llatbl.c
 create mode 100644 freebsd/sys/net/if_llatbl.h
 create mode 100644 freebsd/sys/net/if_llc.h
 create mode 100644 freebsd/sys/net/if_loop.c
 create mode 100644 freebsd/sys/net/if_media.c
 create mode 100644 freebsd/sys/net/if_media.h
 create mode 100644 freebsd/sys/net/if_mib.c
 create mode 100644 freebsd/sys/net/if_mib.h
 create mode 100644 freebsd/sys/net/if_sppp.h
 create mode 100644 freebsd/sys/net/if_spppfr.c
 create mode 100644 freebsd/sys/net/if_spppsubr.c
 create mode 100644 freebsd/sys/net/if_stf.c
 create mode 100644 freebsd/sys/net/if_stf.h
 create mode 100644 freebsd/sys/net/if_tap.c
 create mode 100644 freebsd/sys/net/if_tap.h
 create mode 100644 freebsd/sys/net/if_tapvar.h
 create mode 100644 freebsd/sys/net/if_tun.c
 create mode 100644 freebsd/sys/net/if_tun.h
 create mode 100644 freebsd/sys/net/if_types.h
 create mode 100644 freebsd/sys/net/if_var.h
 create mode 100644 freebsd/sys/net/if_vlan.c
 create mode 100644 freebsd/sys/net/if_vlan_var.h
 create mode 100644 freebsd/sys/net/iso88025.h
 create mode 100644 freebsd/sys/net/netisr.c
 create mode 100644 freebsd/sys/net/netisr.h
 create mode 100644 freebsd/sys/net/pfil.c
 create mode 100644 freebsd/sys/net/pfil.h
 create mode 100644 freebsd/sys/net/pfkeyv2.h
 create mode 100644 freebsd/sys/net/ppp_defs.h
 create mode 100644 freebsd/sys/net/radix.c
 create mode 100644 freebsd/sys/net/radix.h
 create mode 100644 freebsd/sys/net/radix_mpath.c
 create mode 100644 freebsd/sys/net/radix_mpath.h
 create mode 100644 freebsd/sys/net/raw_cb.c
 create mode 100644 freebsd/sys/net/raw_cb.h
 create mode 100644 freebsd/sys/net/raw_usrreq.c
 create mode 100644 freebsd/sys/net/route.c
 create mode 100644 freebsd/sys/net/route.h
 create mode 100644 freebsd/sys/net/rtsock.c
 create mode 100644 freebsd/sys/net/slcompress.c
 create mode 100644 freebsd/sys/net/slcompress.h
 create mode 100644 freebsd/sys/net/vnet.h
 create mode 100644 freebsd/sys/net/zlib.c
 create mode 100644 freebsd/sys/net/zlib.h

(limited to 'freebsd/sys/net')

diff --git a/freebsd/sys/net/bpf.c b/freebsd/sys/net/bpf.c
new file mode 100644
index 00000000..684c7343
--- /dev/null
+++ b/freebsd/sys/net/bpf.c
@@ -0,0 +1,2398 @@
+#include
+
+/*-
+ * Copyright (c) 1990, 1991, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from the Stanford/CMU enet packet filter,
+ * (net/enet.c) distributed as part of 4.3BSD, and code contributed
+ * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence
+ * Berkeley Laboratory.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)bpf.c	8.4 (Berkeley) 1/9/95
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+#ifdef BPF_JITTER
+#include
+#endif
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+
+MALLOC_DEFINE(M_BPF, "BPF", "BPF data");
+
+#if defined(DEV_BPF) || defined(NETGRAPH_BPF)
+
+#define PRINET	26			/* interruptible */
+
+#ifdef COMPAT_FREEBSD32
+#include
+#include
+#define BPF_ALIGNMENT32 sizeof(int32_t)
+#define BPF_WORDALIGN32(x) (((x)+(BPF_ALIGNMENT32-1))&~(BPF_ALIGNMENT32-1))
+
+/*
+ * 32-bit version of structure prepended to each packet.  We use this header
+ * instead of the standard one for 32-bit streams.  We mark a stream as
+ * 32-bit the first time we see a 32-bit compat ioctl request.
+ */
+struct bpf_hdr32 {
+	struct timeval32 bh_tstamp;	/* time stamp */
+	uint32_t	bh_caplen;	/* length of captured portion */
+	uint32_t	bh_datalen;	/* original length of packet */
+	uint16_t	bh_hdrlen;	/* length of bpf header (this struct
+					   plus alignment padding) */
+};
+
+struct bpf_program32 {
+	u_int bf_len;
+	uint32_t bf_insns;
+};
+
+struct bpf_dltlist32 {
+	u_int	bfl_len;
+	u_int	bfl_list;
+};
+
+#define	BIOCSETF32	_IOW('B', 103, struct bpf_program32)
+#define	BIOCSRTIMEOUT32	_IOW('B',109, struct timeval32)
+#define	BIOCGRTIMEOUT32	_IOR('B',110, struct timeval32)
+#define	BIOCGDLTLIST32	_IOWR('B',121, struct bpf_dltlist32)
+#define	BIOCSETWF32	_IOW('B',123, struct bpf_program32)
+#define	BIOCSETFNR32	_IOW('B',130, struct bpf_program32)
+#endif
+
+/*
+ * bpf_iflist is a list of BPF interface structures, each corresponding to a
+ * specific DLT.  The same network interface might have several BPF interface
+ * structures registered by different layers in the stack (e.g., 802.11
+ * frames, ethernet frames, etc).
+ */ +static LIST_HEAD(, bpf_if) bpf_iflist; +static struct mtx bpf_mtx; /* bpf global lock */ +static int bpf_bpfd_cnt; + +static void bpf_attachd(struct bpf_d *, struct bpf_if *); +static void bpf_detachd(struct bpf_d *); +static void bpf_freed(struct bpf_d *); +static int bpf_movein(struct uio *, int, struct ifnet *, struct mbuf **, + struct sockaddr *, int *, struct bpf_insn *); +static int bpf_setif(struct bpf_d *, struct ifreq *); +static void bpf_timed_out(void *); +static __inline void + bpf_wakeup(struct bpf_d *); +static void catchpacket(struct bpf_d *, u_char *, u_int, u_int, + void (*)(struct bpf_d *, caddr_t, u_int, void *, u_int), + struct timeval *); +static void reset_d(struct bpf_d *); +static int bpf_setf(struct bpf_d *, struct bpf_program *, u_long cmd); +static int bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *); +static int bpf_setdlt(struct bpf_d *, u_int); +static void filt_bpfdetach(struct knote *); +static int filt_bpfread(struct knote *, long); +static void bpf_drvinit(void *); +static int bpf_stats_sysctl(SYSCTL_HANDLER_ARGS); + +SYSCTL_NODE(_net, OID_AUTO, bpf, CTLFLAG_RW, 0, "bpf sysctl"); +int bpf_maxinsns = BPF_MAXINSNS; +SYSCTL_INT(_net_bpf, OID_AUTO, maxinsns, CTLFLAG_RW, + &bpf_maxinsns, 0, "Maximum bpf program instructions"); +static int bpf_zerocopy_enable = 0; +SYSCTL_INT(_net_bpf, OID_AUTO, zerocopy_enable, CTLFLAG_RW, + &bpf_zerocopy_enable, 0, "Enable new zero-copy BPF buffer sessions"); +SYSCTL_NODE(_net_bpf, OID_AUTO, stats, CTLFLAG_MPSAFE | CTLFLAG_RW, + bpf_stats_sysctl, "bpf statistics portal"); + +static d_open_t bpfopen; +static d_read_t bpfread; +static d_write_t bpfwrite; +static d_ioctl_t bpfioctl; +static d_poll_t bpfpoll; +static d_kqfilter_t bpfkqfilter; + +static struct cdevsw bpf_cdevsw = { + .d_version = D_VERSION, + .d_open = bpfopen, + .d_read = bpfread, + .d_write = bpfwrite, + .d_ioctl = bpfioctl, + .d_poll = bpfpoll, + .d_name = "bpf", + .d_kqfilter = bpfkqfilter, +}; + +static struct filterops bpfread_filtops = + { 1, NULL, filt_bpfdetach, filt_bpfread }; + +/* + * Wrapper functions for various buffering methods. If the set of buffer + * modes expands, we will probably want to introduce a switch data structure + * similar to protosw, et. + */ +static void +bpf_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset, void *src, + u_int len) +{ + + BPFD_LOCK_ASSERT(d); + + switch (d->bd_bufmode) { + case BPF_BUFMODE_BUFFER: + return (bpf_buffer_append_bytes(d, buf, offset, src, len)); + + case BPF_BUFMODE_ZBUF: + d->bd_zcopy++; + return (bpf_zerocopy_append_bytes(d, buf, offset, src, len)); + + default: + panic("bpf_buf_append_bytes"); + } +} + +static void +bpf_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset, void *src, + u_int len) +{ + + BPFD_LOCK_ASSERT(d); + + switch (d->bd_bufmode) { + case BPF_BUFMODE_BUFFER: + return (bpf_buffer_append_mbuf(d, buf, offset, src, len)); + + case BPF_BUFMODE_ZBUF: + d->bd_zcopy++; + return (bpf_zerocopy_append_mbuf(d, buf, offset, src, len)); + + default: + panic("bpf_buf_append_mbuf"); + } +} + +/* + * This function gets called when the free buffer is re-assigned. + */ +static void +bpf_buf_reclaimed(struct bpf_d *d) +{ + + BPFD_LOCK_ASSERT(d); + + switch (d->bd_bufmode) { + case BPF_BUFMODE_BUFFER: + return; + + case BPF_BUFMODE_ZBUF: + bpf_zerocopy_buf_reclaimed(d); + return; + + default: + panic("bpf_buf_reclaimed"); + } +} + +/* + * If the buffer mechanism has a way to decide that a held buffer can be made + * free, then it is exposed via the bpf_canfreebuf() interface. 
(1) is + * returned if the buffer can be discarded, (0) is returned if it cannot. + */ +static int +bpf_canfreebuf(struct bpf_d *d) +{ + + BPFD_LOCK_ASSERT(d); + + switch (d->bd_bufmode) { + case BPF_BUFMODE_ZBUF: + return (bpf_zerocopy_canfreebuf(d)); + } + return (0); +} + +/* + * Allow the buffer model to indicate that the current store buffer is + * immutable, regardless of the appearance of space. Return (1) if the + * buffer is writable, and (0) if not. + */ +static int +bpf_canwritebuf(struct bpf_d *d) +{ + + BPFD_LOCK_ASSERT(d); + + switch (d->bd_bufmode) { + case BPF_BUFMODE_ZBUF: + return (bpf_zerocopy_canwritebuf(d)); + } + return (1); +} + +/* + * Notify buffer model that an attempt to write to the store buffer has + * resulted in a dropped packet, in which case the buffer may be considered + * full. + */ +static void +bpf_buffull(struct bpf_d *d) +{ + + BPFD_LOCK_ASSERT(d); + + switch (d->bd_bufmode) { + case BPF_BUFMODE_ZBUF: + bpf_zerocopy_buffull(d); + break; + } +} + +/* + * Notify the buffer model that a buffer has moved into the hold position. + */ +void +bpf_bufheld(struct bpf_d *d) +{ + + BPFD_LOCK_ASSERT(d); + + switch (d->bd_bufmode) { + case BPF_BUFMODE_ZBUF: + bpf_zerocopy_bufheld(d); + break; + } +} + +static void +bpf_free(struct bpf_d *d) +{ + + switch (d->bd_bufmode) { + case BPF_BUFMODE_BUFFER: + return (bpf_buffer_free(d)); + + case BPF_BUFMODE_ZBUF: + return (bpf_zerocopy_free(d)); + + default: + panic("bpf_buf_free"); + } +} + +static int +bpf_uiomove(struct bpf_d *d, caddr_t buf, u_int len, struct uio *uio) +{ + + if (d->bd_bufmode != BPF_BUFMODE_BUFFER) + return (EOPNOTSUPP); + return (bpf_buffer_uiomove(d, buf, len, uio)); +} + +static int +bpf_ioctl_sblen(struct bpf_d *d, u_int *i) +{ + + if (d->bd_bufmode != BPF_BUFMODE_BUFFER) + return (EOPNOTSUPP); + return (bpf_buffer_ioctl_sblen(d, i)); +} + +static int +bpf_ioctl_getzmax(struct thread *td, struct bpf_d *d, size_t *i) +{ + + if (d->bd_bufmode != BPF_BUFMODE_ZBUF) + return (EOPNOTSUPP); + return (bpf_zerocopy_ioctl_getzmax(td, d, i)); +} + +static int +bpf_ioctl_rotzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz) +{ + + if (d->bd_bufmode != BPF_BUFMODE_ZBUF) + return (EOPNOTSUPP); + return (bpf_zerocopy_ioctl_rotzbuf(td, d, bz)); +} + +static int +bpf_ioctl_setzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz) +{ + + if (d->bd_bufmode != BPF_BUFMODE_ZBUF) + return (EOPNOTSUPP); + return (bpf_zerocopy_ioctl_setzbuf(td, d, bz)); +} + +/* + * General BPF functions. + */ +static int +bpf_movein(struct uio *uio, int linktype, struct ifnet *ifp, struct mbuf **mp, + struct sockaddr *sockp, int *hdrlen, struct bpf_insn *wfilter) +{ + const struct ieee80211_bpf_params *p; + struct ether_header *eh; + struct mbuf *m; + int error; + int len; + int hlen; + int slen; + + /* + * Build a sockaddr based on the data link layer type. + * We do this at this level because the ethernet header + * is copied directly into the data field of the sockaddr. + * In the case of SLIP, there is no header and the packet + * is forwarded as is. + * Also, we are careful to leave room at the front of the mbuf + * for the link level header. + */ + switch (linktype) { + + case DLT_SLIP: + sockp->sa_family = AF_INET; + hlen = 0; + break; + + case DLT_EN10MB: + sockp->sa_family = AF_UNSPEC; + /* XXX Would MAXLINKHDR be better? 
*/ + hlen = ETHER_HDR_LEN; + break; + + case DLT_FDDI: + sockp->sa_family = AF_IMPLINK; + hlen = 0; + break; + + case DLT_RAW: + sockp->sa_family = AF_UNSPEC; + hlen = 0; + break; + + case DLT_NULL: + /* + * null interface types require a 4 byte pseudo header which + * corresponds to the address family of the packet. + */ + sockp->sa_family = AF_UNSPEC; + hlen = 4; + break; + + case DLT_ATM_RFC1483: + /* + * en atm driver requires 4-byte atm pseudo header. + * though it isn't standard, vpi:vci needs to be + * specified anyway. + */ + sockp->sa_family = AF_UNSPEC; + hlen = 12; /* XXX 4(ATM_PH) + 3(LLC) + 5(SNAP) */ + break; + + case DLT_PPP: + sockp->sa_family = AF_UNSPEC; + hlen = 4; /* This should match PPP_HDRLEN */ + break; + + case DLT_IEEE802_11: /* IEEE 802.11 wireless */ + sockp->sa_family = AF_IEEE80211; + hlen = 0; + break; + + case DLT_IEEE802_11_RADIO: /* IEEE 802.11 wireless w/ phy params */ + sockp->sa_family = AF_IEEE80211; + sockp->sa_len = 12; /* XXX != 0 */ + hlen = sizeof(struct ieee80211_bpf_params); + break; + + default: + return (EIO); + } + + len = uio->uio_resid; + + if (len - hlen > ifp->if_mtu) + return (EMSGSIZE); + + if ((unsigned)len > MJUM16BYTES) + return (EIO); + + if (len <= MHLEN) + MGETHDR(m, M_WAIT, MT_DATA); + else if (len <= MCLBYTES) + m = m_getcl(M_WAIT, MT_DATA, M_PKTHDR); + else + m = m_getjcl(M_WAIT, MT_DATA, M_PKTHDR, +#if (MJUMPAGESIZE > MCLBYTES) + len <= MJUMPAGESIZE ? MJUMPAGESIZE : +#endif + (len <= MJUM9BYTES ? MJUM9BYTES : MJUM16BYTES)); + m->m_pkthdr.len = m->m_len = len; + m->m_pkthdr.rcvif = NULL; + *mp = m; + + if (m->m_len < hlen) { + error = EPERM; + goto bad; + } + + error = uiomove(mtod(m, u_char *), len, uio); + if (error) + goto bad; + + slen = bpf_filter(wfilter, mtod(m, u_char *), len, len); + if (slen == 0) { + error = EPERM; + goto bad; + } + + /* Check for multicast destination */ + switch (linktype) { + case DLT_EN10MB: + eh = mtod(m, struct ether_header *); + if (ETHER_IS_MULTICAST(eh->ether_dhost)) { + if (bcmp(ifp->if_broadcastaddr, eh->ether_dhost, + ETHER_ADDR_LEN) == 0) + m->m_flags |= M_BCAST; + else + m->m_flags |= M_MCAST; + } + break; + } + + /* + * Make room for link header, and copy it to sockaddr + */ + if (hlen != 0) { + if (sockp->sa_family == AF_IEEE80211) { + /* + * Collect true length from the parameter header + * NB: sockp is known to be zero'd so if we do a + * short copy unspecified parameters will be + * zero. + * NB: packet may not be aligned after stripping + * bpf params + * XXX check ibp_vers + */ + p = mtod(m, const struct ieee80211_bpf_params *); + hlen = p->ibp_len; + if (hlen > sizeof(sockp->sa_data)) { + error = EINVAL; + goto bad; + } + } + bcopy(m->m_data, sockp->sa_data, hlen); + } + *hdrlen = hlen; + + return (0); +bad: + m_freem(m); + return (error); +} + +/* + * Attach file to the bpf interface, i.e. make d listen on bp. + */ +static void +bpf_attachd(struct bpf_d *d, struct bpf_if *bp) +{ + /* + * Point d at bp, and add d to the interface's list of listeners. + * Finally, point the driver's bpf cookie at the interface so + * it will divert packets to bpf. + */ + BPFIF_LOCK(bp); + d->bd_bif = bp; + LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next); + + bpf_bpfd_cnt++; + BPFIF_UNLOCK(bp); + + EVENTHANDLER_INVOKE(bpf_track, bp->bif_ifp, bp->bif_dlt, 1); +} + +/* + * Detach a file from its interface. 
+ */
+static void
+bpf_detachd(struct bpf_d *d)
+{
+	int error;
+	struct bpf_if *bp;
+	struct ifnet *ifp;
+
+	bp = d->bd_bif;
+	BPFIF_LOCK(bp);
+	BPFD_LOCK(d);
+	ifp = d->bd_bif->bif_ifp;
+
+	/*
+	 * Remove d from the interface's descriptor list.
+	 */
+	LIST_REMOVE(d, bd_next);
+
+	bpf_bpfd_cnt--;
+	d->bd_bif = NULL;
+	BPFD_UNLOCK(d);
+	BPFIF_UNLOCK(bp);
+
+	EVENTHANDLER_INVOKE(bpf_track, ifp, bp->bif_dlt, 0);
+
+	/*
+	 * Check if this descriptor had requested promiscuous mode.
+	 * If so, turn it off.
+	 */
+	if (d->bd_promisc) {
+		d->bd_promisc = 0;
+		CURVNET_SET(ifp->if_vnet);
+		error = ifpromisc(ifp, 0);
+		CURVNET_RESTORE();
+		if (error != 0 && error != ENXIO) {
+			/*
+			 * ENXIO can happen if a pccard is unplugged.
+			 * Something is really wrong if we were able to put
+			 * the driver into promiscuous mode, but can't
+			 * take it out.
+			 */
+			if_printf(bp->bif_ifp,
+			    "bpf_detach: ifpromisc failed (%d)\n", error);
+		}
+	}
+}
+
+/*
+ * Close the descriptor by detaching it from its interface,
+ * deallocating its buffers, and marking it free.
+ */
+static void
+bpf_dtor(void *data)
+{
+	struct bpf_d *d = data;
+
+	BPFD_LOCK(d);
+	if (d->bd_state == BPF_WAITING)
+		callout_stop(&d->bd_callout);
+	d->bd_state = BPF_IDLE;
+	BPFD_UNLOCK(d);
+	funsetown(&d->bd_sigio);
+	mtx_lock(&bpf_mtx);
+	if (d->bd_bif)
+		bpf_detachd(d);
+	mtx_unlock(&bpf_mtx);
+	selwakeuppri(&d->bd_sel, PRINET);
+#ifdef MAC
+	mac_bpfdesc_destroy(d);
+#endif /* MAC */
+	knlist_destroy(&d->bd_sel.si_note);
+	callout_drain(&d->bd_callout);
+	bpf_freed(d);
+	free(d, M_BPF);
+}
+
+/*
+ * Open ethernet device.  Returns ENXIO for illegal minor device number,
+ * EBUSY if file is open by another process.
+ */
+/* ARGSUSED */
+static	int
+bpfopen(struct cdev *dev, int flags, int fmt, struct thread *td)
+{
+	struct bpf_d *d;
+	int error;
+
+	d = malloc(sizeof(*d), M_BPF, M_WAITOK | M_ZERO);
+	error = devfs_set_cdevpriv(d, bpf_dtor);
+	if (error != 0) {
+		free(d, M_BPF);
+		return (error);
+	}
+
+	/*
+	 * For historical reasons, perform a one-time initialization call to
+	 * the buffer routines, even though we're not yet committed to a
+	 * particular buffer method.
+	 */
+	bpf_buffer_init(d);
+	d->bd_bufmode = BPF_BUFMODE_BUFFER;
+	d->bd_sig = SIGIO;
+	d->bd_direction = BPF_D_INOUT;
+	d->bd_pid = td->td_proc->p_pid;
+#ifdef MAC
+	mac_bpfdesc_init(d);
+	mac_bpfdesc_create(td->td_ucred, d);
+#endif
+	mtx_init(&d->bd_mtx, devtoname(dev), "bpf cdev lock", MTX_DEF);
+	callout_init_mtx(&d->bd_callout, &d->bd_mtx, 0);
+	knlist_init_mtx(&d->bd_sel.si_note, &d->bd_mtx);
+
+	return (0);
+}
+
+/*
+ * bpfread - read next chunk of packets from buffers
+ */
+static	int
+bpfread(struct cdev *dev, struct uio *uio, int ioflag)
+{
+	struct bpf_d *d;
+	int error;
+	int non_block;
+	int timed_out;
+
+	error = devfs_get_cdevpriv((void **)&d);
+	if (error != 0)
+		return (error);
+
+	/*
+	 * Restrict application to use a buffer the same size as
+	 * kernel buffers.
+	 */
+	if (uio->uio_resid != d->bd_bufsize)
+		return (EINVAL);
+
+	non_block = ((ioflag & O_NONBLOCK) != 0);
+
+	BPFD_LOCK(d);
+	d->bd_pid = curthread->td_proc->p_pid;
+	if (d->bd_bufmode != BPF_BUFMODE_BUFFER) {
+		BPFD_UNLOCK(d);
+		return (EOPNOTSUPP);
+	}
+	if (d->bd_state == BPF_WAITING)
+		callout_stop(&d->bd_callout);
+	timed_out = (d->bd_state == BPF_TIMED_OUT);
+	d->bd_state = BPF_IDLE;
+	/*
+	 * If the hold buffer is empty, then do a timed sleep, which
+	 * ends when the timeout expires or when enough packets
+	 * have arrived to fill the store buffer.
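
The EINVAL check above means read(2) on a BPF descriptor must always be issued with exactly the kernel buffer size. A minimal userland sketch of opening and binding a descriptor under that contract; the helper name bpf_open_iface() and the error handling are illustrative, not part of this patch:

#include <sys/types.h>
#include <sys/ioctl.h>
#include <net/if.h>
#include <net/bpf.h>
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/* Open /dev/bpf, bind it to an interface, and size the read buffer. */
static int
bpf_open_iface(const char *ifname, char **bufp, u_int *blenp)
{
	struct ifreq ifr;
	int fd;

	fd = open("/dev/bpf", O_RDWR);
	if (fd < 0)
		return (-1);
	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name));
	/* BIOCSETIF attaches the descriptor; BIOCGBLEN reports bd_bufsize. */
	if (ioctl(fd, BIOCSETIF, &ifr) < 0 || ioctl(fd, BIOCGBLEN, blenp) < 0) {
		close(fd);
		return (-1);
	}
	/* bpfread() insists uio_resid == bd_bufsize, so allocate exactly that. */
	*bufp = malloc(*blenp);
	if (*bufp == NULL) {
		close(fd);
		return (-1);
	}
	return (fd);
}

A caller would use it as, for example, fd = bpf_open_iface("em0", &buf, &blen), where "em0" is a placeholder interface name.
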
+ */ + while (d->bd_hbuf == NULL) { + if (d->bd_slen != 0) { + /* + * A packet(s) either arrived since the previous + * read or arrived while we were asleep. + */ + if (d->bd_immediate || non_block || timed_out) { + /* + * Rotate the buffers and return what's here + * if we are in immediate mode, non-blocking + * flag is set, or this descriptor timed out. + */ + ROTATE_BUFFERS(d); + break; + } + } + + /* + * No data is available, check to see if the bpf device + * is still pointed at a real interface. If not, return + * ENXIO so that the userland process knows to rebind + * it before using it again. + */ + if (d->bd_bif == NULL) { + BPFD_UNLOCK(d); + return (ENXIO); + } + + if (non_block) { + BPFD_UNLOCK(d); + return (EWOULDBLOCK); + } + error = msleep(d, &d->bd_mtx, PRINET|PCATCH, + "bpf", d->bd_rtout); + if (error == EINTR || error == ERESTART) { + BPFD_UNLOCK(d); + return (error); + } + if (error == EWOULDBLOCK) { + /* + * On a timeout, return what's in the buffer, + * which may be nothing. If there is something + * in the store buffer, we can rotate the buffers. + */ + if (d->bd_hbuf) + /* + * We filled up the buffer in between + * getting the timeout and arriving + * here, so we don't need to rotate. + */ + break; + + if (d->bd_slen == 0) { + BPFD_UNLOCK(d); + return (0); + } + ROTATE_BUFFERS(d); + break; + } + } + /* + * At this point, we know we have something in the hold slot. + */ + BPFD_UNLOCK(d); + + /* + * Move data from hold buffer into user space. + * We know the entire buffer is transferred since + * we checked above that the read buffer is bpf_bufsize bytes. + * + * XXXRW: More synchronization needed here: what if a second thread + * issues a read on the same fd at the same time? Don't want this + * getting invalidated. + */ + error = bpf_uiomove(d, d->bd_hbuf, d->bd_hlen, uio); + + BPFD_LOCK(d); + d->bd_fbuf = d->bd_hbuf; + d->bd_hbuf = NULL; + d->bd_hlen = 0; + bpf_buf_reclaimed(d); + BPFD_UNLOCK(d); + + return (error); +} + +/* + * If there are processes sleeping on this descriptor, wake them up. 
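
The pgsigio() call in bpf_wakeup() below only fires once a consumer has armed async notification. A hedged sketch of that userland side, assuming an already-bound descriptor fd; the choice of SIGUSR1 is arbitrary and illustrative:

#include <sys/ioctl.h>
#include <fcntl.h>
#include <signal.h>
#include <unistd.h>
#include <net/bpf.h>

/* Ask for a signal on packet arrival instead of blocking in read(2). */
static int
bpf_arm_sigio(int fd)
{
	int on = 1;
	u_int sig = SIGUSR1;	/* bpf_wakeup() delivers bd_sig; default is SIGIO */

	if (ioctl(fd, FIOASYNC, &on) < 0)	/* sets bd_async */
		return (-1);
	if (fcntl(fd, F_SETOWN, getpid()) < 0)	/* recipient for pgsigio() */
		return (-1);
	return (ioctl(fd, BIOCSRSIG, &sig));	/* must be < NSIG, cf. below */
}
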
+ */ +static __inline void +bpf_wakeup(struct bpf_d *d) +{ + + BPFD_LOCK_ASSERT(d); + if (d->bd_state == BPF_WAITING) { + callout_stop(&d->bd_callout); + d->bd_state = BPF_IDLE; + } + wakeup(d); + if (d->bd_async && d->bd_sig && d->bd_sigio) + pgsigio(&d->bd_sigio, d->bd_sig, 0); + + selwakeuppri(&d->bd_sel, PRINET); + KNOTE_LOCKED(&d->bd_sel.si_note, 0); +} + +static void +bpf_timed_out(void *arg) +{ + struct bpf_d *d = (struct bpf_d *)arg; + + BPFD_LOCK_ASSERT(d); + + if (callout_pending(&d->bd_callout) || !callout_active(&d->bd_callout)) + return; + if (d->bd_state == BPF_WAITING) { + d->bd_state = BPF_TIMED_OUT; + if (d->bd_slen != 0) + bpf_wakeup(d); + } +} + +static int +bpf_ready(struct bpf_d *d) +{ + + BPFD_LOCK_ASSERT(d); + + if (!bpf_canfreebuf(d) && d->bd_hlen != 0) + return (1); + if ((d->bd_immediate || d->bd_state == BPF_TIMED_OUT) && + d->bd_slen != 0) + return (1); + return (0); +} + +static int +bpfwrite(struct cdev *dev, struct uio *uio, int ioflag) +{ + struct bpf_d *d; + struct ifnet *ifp; + struct mbuf *m, *mc; + struct sockaddr dst; + int error, hlen; + + error = devfs_get_cdevpriv((void **)&d); + if (error != 0) + return (error); + + d->bd_pid = curthread->td_proc->p_pid; + d->bd_wcount++; + if (d->bd_bif == NULL) { + d->bd_wdcount++; + return (ENXIO); + } + + ifp = d->bd_bif->bif_ifp; + + if ((ifp->if_flags & IFF_UP) == 0) { + d->bd_wdcount++; + return (ENETDOWN); + } + + if (uio->uio_resid == 0) { + d->bd_wdcount++; + return (0); + } + + bzero(&dst, sizeof(dst)); + m = NULL; + hlen = 0; + error = bpf_movein(uio, (int)d->bd_bif->bif_dlt, ifp, + &m, &dst, &hlen, d->bd_wfilter); + if (error) { + d->bd_wdcount++; + return (error); + } + d->bd_wfcount++; + if (d->bd_hdrcmplt) + dst.sa_family = pseudo_AF_HDRCMPLT; + + if (d->bd_feedback) { + mc = m_dup(m, M_DONTWAIT); + if (mc != NULL) + mc->m_pkthdr.rcvif = ifp; + /* Set M_PROMISC for outgoing packets to be discarded. */ + if (d->bd_direction == BPF_D_INOUT) + m->m_flags |= M_PROMISC; + } else + mc = NULL; + + m->m_pkthdr.len -= hlen; + m->m_len -= hlen; + m->m_data += hlen; /* XXX */ + + CURVNET_SET(ifp->if_vnet); +#ifdef MAC + BPFD_LOCK(d); + mac_bpfdesc_create_mbuf(d, m); + if (mc != NULL) + mac_bpfdesc_create_mbuf(d, mc); + BPFD_UNLOCK(d); +#endif + + error = (*ifp->if_output)(ifp, m, &dst, NULL); + if (error) + d->bd_wdcount++; + + if (mc != NULL) { + if (error == 0) + (*ifp->if_input)(ifp, mc); + else + m_freem(mc); + } + CURVNET_RESTORE(); + + return (error); +} + +/* + * Reset a descriptor by flushing its packet buffer and clearing the receive + * and drop counts. This is doable for kernel-only buffers, but with + * zero-copy buffers, we can't write to (or rotate) buffers that are + * currently owned by userspace. It would be nice if we could encapsulate + * this logic in the buffer code rather than here. + */ +static void +reset_d(struct bpf_d *d) +{ + + mtx_assert(&d->bd_mtx, MA_OWNED); + + if ((d->bd_hbuf != NULL) && + (d->bd_bufmode != BPF_BUFMODE_ZBUF || bpf_canfreebuf(d))) { + /* Free the hold buffer. */ + d->bd_fbuf = d->bd_hbuf; + d->bd_hbuf = NULL; + d->bd_hlen = 0; + bpf_buf_reclaimed(d); + } + if (bpf_canwritebuf(d)) + d->bd_slen = 0; + d->bd_rcount = 0; + d->bd_dcount = 0; + d->bd_fcount = 0; + d->bd_wcount = 0; + d->bd_wfcount = 0; + d->bd_wdcount = 0; + d->bd_zcopy = 0; +} + +/* + * FIONREAD Check for read packet available. + * SIOCGIFADDR Get interface address - convenient hook to driver. + * BIOCGBLEN Get buffer len [for read()]. + * BIOCSETF Set read filter. 
+ * BIOCSETFNR Set read filter without resetting descriptor. + * BIOCSETWF Set write filter. + * BIOCFLUSH Flush read packet buffer. + * BIOCPROMISC Put interface into promiscuous mode. + * BIOCGDLT Get link layer type. + * BIOCGETIF Get interface name. + * BIOCSETIF Set interface. + * BIOCSRTIMEOUT Set read timeout. + * BIOCGRTIMEOUT Get read timeout. + * BIOCGSTATS Get packet stats. + * BIOCIMMEDIATE Set immediate mode. + * BIOCVERSION Get filter language version. + * BIOCGHDRCMPLT Get "header already complete" flag + * BIOCSHDRCMPLT Set "header already complete" flag + * BIOCGDIRECTION Get packet direction flag + * BIOCSDIRECTION Set packet direction flag + * BIOCLOCK Set "locked" flag + * BIOCFEEDBACK Set packet feedback mode. + * BIOCSETZBUF Set current zero-copy buffer locations. + * BIOCGETZMAX Get maximum zero-copy buffer size. + * BIOCROTZBUF Force rotation of zero-copy buffer + * BIOCSETBUFMODE Set buffer mode. + * BIOCGETBUFMODE Get current buffer mode. + */ +/* ARGSUSED */ +static int +bpfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, + struct thread *td) +{ + struct bpf_d *d; + int error; + + error = devfs_get_cdevpriv((void **)&d); + if (error != 0) + return (error); + + /* + * Refresh PID associated with this descriptor. + */ + BPFD_LOCK(d); + d->bd_pid = td->td_proc->p_pid; + if (d->bd_state == BPF_WAITING) + callout_stop(&d->bd_callout); + d->bd_state = BPF_IDLE; + BPFD_UNLOCK(d); + + if (d->bd_locked == 1) { + switch (cmd) { + case BIOCGBLEN: + case BIOCFLUSH: + case BIOCGDLT: + case BIOCGDLTLIST: +#ifdef COMPAT_FREEBSD32 + case BIOCGDLTLIST32: +#endif + case BIOCGETIF: + case BIOCGRTIMEOUT: +#ifdef COMPAT_FREEBSD32 + case BIOCGRTIMEOUT32: +#endif + case BIOCGSTATS: + case BIOCVERSION: + case BIOCGRSIG: + case BIOCGHDRCMPLT: + case BIOCFEEDBACK: + case FIONREAD: + case BIOCLOCK: + case BIOCSRTIMEOUT: +#ifdef COMPAT_FREEBSD32 + case BIOCSRTIMEOUT32: +#endif + case BIOCIMMEDIATE: + case TIOCGPGRP: + case BIOCROTZBUF: + break; + default: + return (EPERM); + } + } +#ifdef COMPAT_FREEBSD32 + /* + * If we see a 32-bit compat ioctl, mark the stream as 32-bit so + * that it will get 32-bit packet headers. + */ + switch (cmd) { + case BIOCSETF32: + case BIOCSETFNR32: + case BIOCSETWF32: + case BIOCGDLTLIST32: + case BIOCGRTIMEOUT32: + case BIOCSRTIMEOUT32: + d->bd_compat32 = 1; + } +#endif + + CURVNET_SET(TD_TO_VNET(td)); + switch (cmd) { + + default: + error = EINVAL; + break; + + /* + * Check for read packet available. + */ + case FIONREAD: + { + int n; + + BPFD_LOCK(d); + n = d->bd_slen; + if (d->bd_hbuf) + n += d->bd_hlen; + BPFD_UNLOCK(d); + + *(int *)addr = n; + break; + } + + case SIOCGIFADDR: + { + struct ifnet *ifp; + + if (d->bd_bif == NULL) + error = EINVAL; + else { + ifp = d->bd_bif->bif_ifp; + error = (*ifp->if_ioctl)(ifp, cmd, addr); + } + break; + } + + /* + * Get buffer len [for read()]. + */ + case BIOCGBLEN: + *(u_int *)addr = d->bd_bufsize; + break; + + /* + * Set buffer length. + */ + case BIOCSBLEN: + error = bpf_ioctl_sblen(d, (u_int *)addr); + break; + + /* + * Set link layer read filter. + */ + case BIOCSETF: + case BIOCSETFNR: + case BIOCSETWF: +#ifdef COMPAT_FREEBSD32 + case BIOCSETF32: + case BIOCSETFNR32: + case BIOCSETWF32: +#endif + error = bpf_setf(d, (struct bpf_program *)addr, cmd); + break; + + /* + * Flush read packet buffer. + */ + case BIOCFLUSH: + BPFD_LOCK(d); + reset_d(d); + BPFD_UNLOCK(d); + break; + + /* + * Put interface into promiscuous mode. 
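
Several of the cases handled below are commonly issued back to back right after BIOCSETIF. A sketch of a typical capture setup; the helper name and the one-second timeout are illustrative values, not requirements of this code:

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/time.h>
#include <net/bpf.h>

/* Typical follow-up configuration for a freshly bound descriptor. */
static int
bpf_configure(int fd)
{
	u_int one = 1;
	struct timeval to = { .tv_sec = 1, .tv_usec = 0 };

	if (ioctl(fd, BIOCIMMEDIATE, &one) < 0)	/* wake readers per packet */
		return (-1);
	if (ioctl(fd, BIOCPROMISC, NULL) < 0)	/* ifpromisc(ifp, 1) below */
		return (-1);
	return (ioctl(fd, BIOCSRTIMEOUT, &to));	/* bound blocking reads */
}
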
+ */ + case BIOCPROMISC: + if (d->bd_bif == NULL) { + /* + * No interface attached yet. + */ + error = EINVAL; + break; + } + if (d->bd_promisc == 0) { + error = ifpromisc(d->bd_bif->bif_ifp, 1); + if (error == 0) + d->bd_promisc = 1; + } + break; + + /* + * Get current data link type. + */ + case BIOCGDLT: + if (d->bd_bif == NULL) + error = EINVAL; + else + *(u_int *)addr = d->bd_bif->bif_dlt; + break; + + /* + * Get a list of supported data link types. + */ +#ifdef COMPAT_FREEBSD32 + case BIOCGDLTLIST32: + { + struct bpf_dltlist32 *list32; + struct bpf_dltlist dltlist; + + list32 = (struct bpf_dltlist32 *)addr; + dltlist.bfl_len = list32->bfl_len; + dltlist.bfl_list = PTRIN(list32->bfl_list); + if (d->bd_bif == NULL) + error = EINVAL; + else { + error = bpf_getdltlist(d, &dltlist); + if (error == 0) + list32->bfl_len = dltlist.bfl_len; + } + break; + } +#endif + + case BIOCGDLTLIST: + if (d->bd_bif == NULL) + error = EINVAL; + else + error = bpf_getdltlist(d, (struct bpf_dltlist *)addr); + break; + + /* + * Set data link type. + */ + case BIOCSDLT: + if (d->bd_bif == NULL) + error = EINVAL; + else + error = bpf_setdlt(d, *(u_int *)addr); + break; + + /* + * Get interface name. + */ + case BIOCGETIF: + if (d->bd_bif == NULL) + error = EINVAL; + else { + struct ifnet *const ifp = d->bd_bif->bif_ifp; + struct ifreq *const ifr = (struct ifreq *)addr; + + strlcpy(ifr->ifr_name, ifp->if_xname, + sizeof(ifr->ifr_name)); + } + break; + + /* + * Set interface. + */ + case BIOCSETIF: + error = bpf_setif(d, (struct ifreq *)addr); + break; + + /* + * Set read timeout. + */ + case BIOCSRTIMEOUT: +#ifdef COMPAT_FREEBSD32 + case BIOCSRTIMEOUT32: +#endif + { + struct timeval *tv = (struct timeval *)addr; +#ifdef COMPAT_FREEBSD32 + struct timeval32 *tv32; + struct timeval tv64; + + if (cmd == BIOCSRTIMEOUT32) { + tv32 = (struct timeval32 *)addr; + tv = &tv64; + tv->tv_sec = tv32->tv_sec; + tv->tv_usec = tv32->tv_usec; + } else +#endif + tv = (struct timeval *)addr; + + /* + * Subtract 1 tick from tvtohz() since this isn't + * a one-shot timer. + */ + if ((error = itimerfix(tv)) == 0) + d->bd_rtout = tvtohz(tv) - 1; + break; + } + + /* + * Get read timeout. + */ + case BIOCGRTIMEOUT: +#ifdef COMPAT_FREEBSD32 + case BIOCGRTIMEOUT32: +#endif + { + struct timeval *tv; +#ifdef COMPAT_FREEBSD32 + struct timeval32 *tv32; + struct timeval tv64; + + if (cmd == BIOCGRTIMEOUT32) + tv = &tv64; + else +#endif + tv = (struct timeval *)addr; + + tv->tv_sec = d->bd_rtout / hz; + tv->tv_usec = (d->bd_rtout % hz) * tick; +#ifdef COMPAT_FREEBSD32 + if (cmd == BIOCGRTIMEOUT32) { + tv32 = (struct timeval32 *)addr; + tv32->tv_sec = tv->tv_sec; + tv32->tv_usec = tv->tv_usec; + } +#endif + + break; + } + + /* + * Get packet stats. + */ + case BIOCGSTATS: + { + struct bpf_stat *bs = (struct bpf_stat *)addr; + + /* XXXCSJP overflow */ + bs->bs_recv = d->bd_rcount; + bs->bs_drop = d->bd_dcount; + break; + } + + /* + * Set immediate mode. + */ + case BIOCIMMEDIATE: + d->bd_immediate = *(u_int *)addr; + break; + + case BIOCVERSION: + { + struct bpf_version *bv = (struct bpf_version *)addr; + + bv->bv_major = BPF_MAJOR_VERSION; + bv->bv_minor = BPF_MINOR_VERSION; + break; + } + + /* + * Get "header already complete" flag + */ + case BIOCGHDRCMPLT: + *(u_int *)addr = d->bd_hdrcmplt; + break; + + /* + * Set "header already complete" flag + */ + case BIOCSHDRCMPLT: + d->bd_hdrcmplt = *(u_int *)addr ? 
1 : 0; + break; + + /* + * Get packet direction flag + */ + case BIOCGDIRECTION: + *(u_int *)addr = d->bd_direction; + break; + + /* + * Set packet direction flag + */ + case BIOCSDIRECTION: + { + u_int direction; + + direction = *(u_int *)addr; + switch (direction) { + case BPF_D_IN: + case BPF_D_INOUT: + case BPF_D_OUT: + d->bd_direction = direction; + break; + default: + error = EINVAL; + } + } + break; + + case BIOCFEEDBACK: + d->bd_feedback = *(u_int *)addr; + break; + + case BIOCLOCK: + d->bd_locked = 1; + break; + + case FIONBIO: /* Non-blocking I/O */ + break; + + case FIOASYNC: /* Send signal on receive packets */ + d->bd_async = *(int *)addr; + break; + + case FIOSETOWN: + error = fsetown(*(int *)addr, &d->bd_sigio); + break; + + case FIOGETOWN: + *(int *)addr = fgetown(&d->bd_sigio); + break; + + /* This is deprecated, FIOSETOWN should be used instead. */ + case TIOCSPGRP: + error = fsetown(-(*(int *)addr), &d->bd_sigio); + break; + + /* This is deprecated, FIOGETOWN should be used instead. */ + case TIOCGPGRP: + *(int *)addr = -fgetown(&d->bd_sigio); + break; + + case BIOCSRSIG: /* Set receive signal */ + { + u_int sig; + + sig = *(u_int *)addr; + + if (sig >= NSIG) + error = EINVAL; + else + d->bd_sig = sig; + break; + } + case BIOCGRSIG: + *(u_int *)addr = d->bd_sig; + break; + + case BIOCGETBUFMODE: + *(u_int *)addr = d->bd_bufmode; + break; + + case BIOCSETBUFMODE: + /* + * Allow the buffering mode to be changed as long as we + * haven't yet committed to a particular mode. Our + * definition of commitment, for now, is whether or not a + * buffer has been allocated or an interface attached, since + * that's the point where things get tricky. + */ + switch (*(u_int *)addr) { + case BPF_BUFMODE_BUFFER: + break; + + case BPF_BUFMODE_ZBUF: + if (bpf_zerocopy_enable) + break; + /* FALLSTHROUGH */ + + default: + CURVNET_RESTORE(); + return (EINVAL); + } + + BPFD_LOCK(d); + if (d->bd_sbuf != NULL || d->bd_hbuf != NULL || + d->bd_fbuf != NULL || d->bd_bif != NULL) { + BPFD_UNLOCK(d); + CURVNET_RESTORE(); + return (EBUSY); + } + d->bd_bufmode = *(u_int *)addr; + BPFD_UNLOCK(d); + break; + + case BIOCGETZMAX: + error = bpf_ioctl_getzmax(td, d, (size_t *)addr); + break; + + case BIOCSETZBUF: + error = bpf_ioctl_setzbuf(td, d, (struct bpf_zbuf *)addr); + break; + + case BIOCROTZBUF: + error = bpf_ioctl_rotzbuf(td, d, (struct bpf_zbuf *)addr); + break; + } + CURVNET_RESTORE(); + return (error); +} + +/* + * Set d's packet filter program to fp. If this file already has a filter, + * free it and replace it. Returns EINVAL for bogus requests. 
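
The userland half of the bpf_setf() handshake below builds a classic BPF instruction array and hands it over with BIOCSETF. A minimal sketch that accepts only IPv4 ethertype frames, assuming DLT_EN10MB framing; the helper name is illustrative:

#include <sys/types.h>
#include <sys/ioctl.h>
#include <net/bpf.h>
#include <net/ethernet.h>

/* Install a read filter that accepts IPv4 frames and drops the rest. */
static int
bpf_set_ipv4_filter(int fd)
{
	static struct bpf_insn insns[] = {
		/* Load the 16-bit ethertype at frame offset 12. */
		BPF_STMT(BPF_LD + BPF_H + BPF_ABS, 12),
		/* ETHERTYPE_IP: fall through to accept; else jump to drop. */
		BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, ETHERTYPE_IP, 0, 1),
		/* Accept: snapshot up to the whole packet. */
		BPF_STMT(BPF_RET + BPF_K, (u_int)-1),
		/* Drop. */
		BPF_STMT(BPF_RET + BPF_K, 0),
	};
	struct bpf_program prog = {
		.bf_len = sizeof(insns) / sizeof(insns[0]),
		.bf_insns = insns,
	};

	return (ioctl(fd, BIOCSETF, &prog));
}

The kernel side copies the instructions in, runs bpf_validate(), and only then swaps the program, so a rejected filter leaves the old one in place.
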
+ */ +static int +bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_long cmd) +{ + struct bpf_insn *fcode, *old; + u_int wfilter, flen, size; +#ifdef BPF_JITTER + bpf_jit_filter *ofunc; +#endif +#ifdef COMPAT_FREEBSD32 + struct bpf_program32 *fp32; + struct bpf_program fp_swab; + + if (cmd == BIOCSETWF32 || cmd == BIOCSETF32 || cmd == BIOCSETFNR32) { + fp32 = (struct bpf_program32 *)fp; + fp_swab.bf_len = fp32->bf_len; + fp_swab.bf_insns = (struct bpf_insn *)(uintptr_t)fp32->bf_insns; + fp = &fp_swab; + if (cmd == BIOCSETWF32) + cmd = BIOCSETWF; + } +#endif + if (cmd == BIOCSETWF) { + old = d->bd_wfilter; + wfilter = 1; +#ifdef BPF_JITTER + ofunc = NULL; +#endif + } else { + wfilter = 0; + old = d->bd_rfilter; +#ifdef BPF_JITTER + ofunc = d->bd_bfilter; +#endif + } + if (fp->bf_insns == NULL) { + if (fp->bf_len != 0) + return (EINVAL); + BPFD_LOCK(d); + if (wfilter) + d->bd_wfilter = NULL; + else { + d->bd_rfilter = NULL; +#ifdef BPF_JITTER + d->bd_bfilter = NULL; +#endif + if (cmd == BIOCSETF) + reset_d(d); + } + BPFD_UNLOCK(d); + if (old != NULL) + free((caddr_t)old, M_BPF); +#ifdef BPF_JITTER + if (ofunc != NULL) + bpf_destroy_jit_filter(ofunc); +#endif + return (0); + } + flen = fp->bf_len; + if (flen > bpf_maxinsns) + return (EINVAL); + + size = flen * sizeof(*fp->bf_insns); + fcode = (struct bpf_insn *)malloc(size, M_BPF, M_WAITOK); + if (copyin((caddr_t)fp->bf_insns, (caddr_t)fcode, size) == 0 && + bpf_validate(fcode, (int)flen)) { + BPFD_LOCK(d); + if (wfilter) + d->bd_wfilter = fcode; + else { + d->bd_rfilter = fcode; +#ifdef BPF_JITTER + d->bd_bfilter = bpf_jitter(fcode, flen); +#endif + if (cmd == BIOCSETF) + reset_d(d); + } + BPFD_UNLOCK(d); + if (old != NULL) + free((caddr_t)old, M_BPF); +#ifdef BPF_JITTER + if (ofunc != NULL) + bpf_destroy_jit_filter(ofunc); +#endif + + return (0); + } + free((caddr_t)fcode, M_BPF); + return (EINVAL); +} + +/* + * Detach a file from its current interface (if attached at all) and attach + * to the interface indicated by the name stored in ifr. + * Return an errno or 0. + */ +static int +bpf_setif(struct bpf_d *d, struct ifreq *ifr) +{ + struct bpf_if *bp; + struct ifnet *theywant; + + theywant = ifunit(ifr->ifr_name); + if (theywant == NULL || theywant->if_bpf == NULL) + return (ENXIO); + + bp = theywant->if_bpf; + + /* + * Behavior here depends on the buffering model. If we're using + * kernel memory buffers, then we can allocate them here. If we're + * using zero-copy, then the user process must have registered + * buffers by the time we get here. If not, return an error. + * + * XXXRW: There are locking issues here with multi-threaded use: what + * if two threads try to set the interface at once? + */ + switch (d->bd_bufmode) { + case BPF_BUFMODE_BUFFER: + if (d->bd_sbuf == NULL) + bpf_buffer_alloc(d); + KASSERT(d->bd_sbuf != NULL, ("bpf_setif: bd_sbuf NULL")); + break; + + case BPF_BUFMODE_ZBUF: + if (d->bd_sbuf == NULL) + return (EINVAL); + break; + + default: + panic("bpf_setif: bufmode %d", d->bd_bufmode); + } + if (bp != d->bd_bif) { + if (d->bd_bif) + /* + * Detach if attached to something else. + */ + bpf_detachd(d); + + bpf_attachd(d, bp); + } + BPFD_LOCK(d); + reset_d(d); + BPFD_UNLOCK(d); + return (0); +} + +/* + * Support for select() and poll() system calls + * + * Return true iff the specific operation will not block indefinitely. + * Otherwise, return false but make a note that a selwakeup() must be done. 
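
From userland these semantics surface through poll(2)/select(2). A sketch of a consuming loop, reusing fd/buf/blen from the earlier open sketch; handle_buffer() is a hypothetical consumer, declared here only for the example:

#include <sys/types.h>
#include <poll.h>
#include <unistd.h>

extern void handle_buffer(char *, size_t);	/* hypothetical consumer */

/* Block until bpfpoll() reports readiness, then drain one whole buffer. */
static void
bpf_read_loop(int fd, char *buf, u_int blen)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN };

	for (;;) {
		if (poll(&pfd, 1, -1) < 0)
			break;
		if ((pfd.revents & (POLLIN | POLLRDNORM)) == 0)
			continue;
		ssize_t n = read(fd, buf, blen);	/* whole buffer or error */
		if (n > 0)
			handle_buffer(buf, (size_t)n);
	}
}
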
+ */
+static int
+bpfpoll(struct cdev *dev, int events, struct thread *td)
+{
+	struct bpf_d *d;
+	int revents;
+
+	if (devfs_get_cdevpriv((void **)&d) != 0 || d->bd_bif == NULL)
+		return (events &
+		    (POLLHUP|POLLIN|POLLRDNORM|POLLOUT|POLLWRNORM));
+
+	/*
+	 * Refresh PID associated with this descriptor.
+	 */
+	revents = events & (POLLOUT | POLLWRNORM);
+	BPFD_LOCK(d);
+	d->bd_pid = td->td_proc->p_pid;
+	if (events & (POLLIN | POLLRDNORM)) {
+		if (bpf_ready(d))
+			revents |= events & (POLLIN | POLLRDNORM);
+		else {
+			selrecord(td, &d->bd_sel);
+			/* Start the read timeout if necessary. */
+			if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) {
+				callout_reset(&d->bd_callout, d->bd_rtout,
+				    bpf_timed_out, d);
+				d->bd_state = BPF_WAITING;
+			}
+		}
+	}
+	BPFD_UNLOCK(d);
+	return (revents);
+}
+
+/*
+ * Support for kevent() system call.  Register EVFILT_READ filters and
+ * reject all others.
+ */
+int
+bpfkqfilter(struct cdev *dev, struct knote *kn)
+{
+	struct bpf_d *d;
+
+	if (devfs_get_cdevpriv((void **)&d) != 0 ||
+	    kn->kn_filter != EVFILT_READ)
+		return (1);
+
+	/*
+	 * Refresh PID associated with this descriptor.
+	 */
+	BPFD_LOCK(d);
+	d->bd_pid = curthread->td_proc->p_pid;
+	kn->kn_fop = &bpfread_filtops;
+	kn->kn_hook = d;
+	knlist_add(&d->bd_sel.si_note, kn, 1);
+	BPFD_UNLOCK(d);
+
+	return (0);
+}
+
+static void
+filt_bpfdetach(struct knote *kn)
+{
+	struct bpf_d *d = (struct bpf_d *)kn->kn_hook;
+
+	knlist_remove(&d->bd_sel.si_note, kn, 0);
+}
+
+static int
+filt_bpfread(struct knote *kn, long hint)
+{
+	struct bpf_d *d = (struct bpf_d *)kn->kn_hook;
+	int ready;
+
+	BPFD_LOCK_ASSERT(d);
+	ready = bpf_ready(d);
+	if (ready) {
+		kn->kn_data = d->bd_slen;
+		if (d->bd_hbuf)
+			kn->kn_data += d->bd_hlen;
+	} else if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) {
+		callout_reset(&d->bd_callout, d->bd_rtout,
+		    bpf_timed_out, d);
+		d->bd_state = BPF_WAITING;
+	}
+
+	return (ready);
+}
+
+/*
+ * Incoming linkage from device drivers.  Process the packet pkt, of length
+ * pktlen, which is stored in a contiguous buffer.  The packet is parsed
+ * by each process' filter, and if accepted, stashed into the corresponding
+ * buffer.
+ */
+void
+bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen)
+{
+	struct bpf_d *d;
+#ifdef BPF_JITTER
+	bpf_jit_filter *bf;
+#endif
+	u_int slen;
+	int gottime;
+	struct timeval tv;
+
+	gottime = 0;
+	BPFIF_LOCK(bp);
+	LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
+		BPFD_LOCK(d);
+		++d->bd_rcount;
+		/*
+		 * NB: We don't call BPF_CHECK_DIRECTION() here since there is no
+		 * way for the caller to indicate to us whether this packet
+		 * is inbound or outbound.  In the bpf_mtap() routines, we use
+		 * the interface pointers on the mbuf to figure it out.
+		 */
+#ifdef BPF_JITTER
+		bf = bpf_jitter_enable != 0 ? d->bd_bfilter : NULL;
+		if (bf != NULL)
+			slen = (*(bf->func))(pkt, pktlen, pktlen);
+		else
+#endif
+		slen = bpf_filter(d->bd_rfilter, pkt, pktlen, pktlen);
+		if (slen != 0) {
+			d->bd_fcount++;
+			if (!gottime) {
+				microtime(&tv);
+				gottime = 1;
+			}
+#ifdef MAC
+			if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0)
+#endif
+				catchpacket(d, pkt, pktlen, slen,
+				    bpf_append_bytes, &tv);
+		}
+		BPFD_UNLOCK(d);
+	}
+	BPFIF_UNLOCK(bp);
+}
+
+#define	BPF_CHECK_DIRECTION(d, r, i)				\
+	    (((d)->bd_direction == BPF_D_IN && (r) != (i)) ||	\
+	    ((d)->bd_direction == BPF_D_OUT && (r) == (i)))
+
+/*
+ * Incoming linkage from device drivers, when packet is in an mbuf chain.
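
On the driver side, the tap is two calls: bpfattach() at attach time (defined further below; for real Ethernet interfaces ether_ifattach() normally issues it on the driver's behalf) and bpf_mtap()/bpf_tap() in the receive path. A hedged sketch of the pattern, with ifp and m standing for the driver's ifnet and received mbuf:

/* Attach time: advertise the framing and fixed link-header length. */
bpfattach(ifp, DLT_EN10MB, ETHER_HDR_LEN);

/* RX path: tap each mbuf before it goes up the stack; the
 * bpf_peers_present() check from net/bpf.h skips the call when
 * nobody is listening. */
if (bpf_peers_present(ifp->if_bpf))
	bpf_mtap(ifp->if_bpf, m);

/* Detach time: wake any sleepers with ENXIO. */
bpfdetach(ifp);
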
+ */ +void +bpf_mtap(struct bpf_if *bp, struct mbuf *m) +{ + struct bpf_d *d; +#ifdef BPF_JITTER + bpf_jit_filter *bf; +#endif + u_int pktlen, slen; + int gottime; + struct timeval tv; + + /* Skip outgoing duplicate packets. */ + if ((m->m_flags & M_PROMISC) != 0 && m->m_pkthdr.rcvif == NULL) { + m->m_flags &= ~M_PROMISC; + return; + } + + gottime = 0; + + pktlen = m_length(m, NULL); + + BPFIF_LOCK(bp); + LIST_FOREACH(d, &bp->bif_dlist, bd_next) { + if (BPF_CHECK_DIRECTION(d, m->m_pkthdr.rcvif, bp->bif_ifp)) + continue; + BPFD_LOCK(d); + ++d->bd_rcount; +#ifdef BPF_JITTER + bf = bpf_jitter_enable != 0 ? d->bd_bfilter : NULL; + /* XXX We cannot handle multiple mbufs. */ + if (bf != NULL && m->m_next == NULL) + slen = (*(bf->func))(mtod(m, u_char *), pktlen, pktlen); + else +#endif + slen = bpf_filter(d->bd_rfilter, (u_char *)m, pktlen, 0); + if (slen != 0) { + d->bd_fcount++; + if (!gottime) { + microtime(&tv); + gottime = 1; + } +#ifdef MAC + if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0) +#endif + catchpacket(d, (u_char *)m, pktlen, slen, + bpf_append_mbuf, &tv); + } + BPFD_UNLOCK(d); + } + BPFIF_UNLOCK(bp); +} + +/* + * Incoming linkage from device drivers, when packet is in + * an mbuf chain and to be prepended by a contiguous header. + */ +void +bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, struct mbuf *m) +{ + struct mbuf mb; + struct bpf_d *d; + u_int pktlen, slen; + int gottime; + struct timeval tv; + + /* Skip outgoing duplicate packets. */ + if ((m->m_flags & M_PROMISC) != 0 && m->m_pkthdr.rcvif == NULL) { + m->m_flags &= ~M_PROMISC; + return; + } + + gottime = 0; + + pktlen = m_length(m, NULL); + /* + * Craft on-stack mbuf suitable for passing to bpf_filter. + * Note that we cut corners here; we only setup what's + * absolutely needed--this mbuf should never go anywhere else. + */ + mb.m_next = m; + mb.m_data = data; + mb.m_len = dlen; + pktlen += dlen; + + BPFIF_LOCK(bp); + LIST_FOREACH(d, &bp->bif_dlist, bd_next) { + if (BPF_CHECK_DIRECTION(d, m->m_pkthdr.rcvif, bp->bif_ifp)) + continue; + BPFD_LOCK(d); + ++d->bd_rcount; + slen = bpf_filter(d->bd_rfilter, (u_char *)&mb, pktlen, 0); + if (slen != 0) { + d->bd_fcount++; + if (!gottime) { + microtime(&tv); + gottime = 1; + } +#ifdef MAC + if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0) +#endif + catchpacket(d, (u_char *)&mb, pktlen, slen, + bpf_append_mbuf, &tv); + } + BPFD_UNLOCK(d); + } + BPFIF_UNLOCK(bp); +} + +#undef BPF_CHECK_DIRECTION + +/* + * Move the packet data from interface memory (pkt) into the + * store buffer. "cpfn" is the routine called to do the actual data + * transfer. bcopy is passed in to copy contiguous chunks, while + * bpf_append_mbuf is passed in to copy mbuf chains. In the latter case, + * pkt is really an mbuf. + */ +static void +catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen, + void (*cpfn)(struct bpf_d *, caddr_t, u_int, void *, u_int), + struct timeval *tv) +{ + struct bpf_hdr hdr; +#ifdef COMPAT_FREEBSD32 + struct bpf_hdr32 hdr32; +#endif + int totlen, curlen; + int hdrlen = d->bd_bif->bif_hdrlen; + int do_wakeup = 0; + + BPFD_LOCK_ASSERT(d); + + /* + * Detect whether user space has released a buffer back to us, and if + * so, move it from being a hold buffer to a free buffer. This may + * not be the best place to do it (for example, we might only want to + * run this check if we need the space), but for now it's a reliable + * spot to do it. 
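
catchpacket() below packs records back to back: a bpf_hdr, header padding, bh_caplen capture bytes, then padding up to the next BPF_WORDALIGN() boundary. The hypothetical handle_buffer() from the earlier poll sketch would walk them like this; process_packet() is likewise a placeholder:

#include <sys/types.h>
#include <net/bpf.h>

extern void process_packet(u_char *, u_int);	/* hypothetical consumer */

/* Walk the records laid out by catchpacket() in one read(2) buffer. */
void
handle_buffer(char *buf, size_t n)
{
	char *p = buf;

	while (p + sizeof(struct bpf_hdr) <= buf + n) {
		struct bpf_hdr *bh = (struct bpf_hdr *)(void *)p;
		u_char *pkt = (u_char *)p + bh->bh_hdrlen;

		process_packet(pkt, bh->bh_caplen);
		/* Advance exactly as the kernel padded: header + capture,
		 * rounded to the next longword. */
		p += BPF_WORDALIGN(bh->bh_hdrlen + bh->bh_caplen);
	}
}
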
+ */ + if (d->bd_fbuf == NULL && bpf_canfreebuf(d)) { + d->bd_fbuf = d->bd_hbuf; + d->bd_hbuf = NULL; + d->bd_hlen = 0; + bpf_buf_reclaimed(d); + } + + /* + * Figure out how many bytes to move. If the packet is + * greater or equal to the snapshot length, transfer that + * much. Otherwise, transfer the whole packet (unless + * we hit the buffer size limit). + */ + totlen = hdrlen + min(snaplen, pktlen); + if (totlen > d->bd_bufsize) + totlen = d->bd_bufsize; + + /* + * Round up the end of the previous packet to the next longword. + * + * Drop the packet if there's no room and no hope of room + * If the packet would overflow the storage buffer or the storage + * buffer is considered immutable by the buffer model, try to rotate + * the buffer and wakeup pending processes. + */ +#ifdef COMPAT_FREEBSD32 + if (d->bd_compat32) + curlen = BPF_WORDALIGN32(d->bd_slen); + else +#endif + curlen = BPF_WORDALIGN(d->bd_slen); + if (curlen + totlen > d->bd_bufsize || !bpf_canwritebuf(d)) { + if (d->bd_fbuf == NULL) { + /* + * There's no room in the store buffer, and no + * prospect of room, so drop the packet. Notify the + * buffer model. + */ + bpf_buffull(d); + ++d->bd_dcount; + return; + } + ROTATE_BUFFERS(d); + do_wakeup = 1; + curlen = 0; + } else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT) + /* + * Immediate mode is set, or the read timeout has already + * expired during a select call. A packet arrived, so the + * reader should be woken up. + */ + do_wakeup = 1; +#ifdef COMPAT_FREEBSD32 + /* + * If this is a 32-bit stream, then stick a 32-bit header at the + * front and copy the data into the buffer. + */ + if (d->bd_compat32) { + bzero(&hdr32, sizeof(hdr32)); + hdr32.bh_tstamp.tv_sec = tv->tv_sec; + hdr32.bh_tstamp.tv_usec = tv->tv_usec; + hdr32.bh_datalen = pktlen; + hdr32.bh_hdrlen = hdrlen; + hdr.bh_caplen = hdr32.bh_caplen = totlen - hdrlen; + bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr32, sizeof(hdr32)); + goto copy; + } +#endif + + /* + * Append the bpf header. Note we append the actual header size, but + * move forward the length of the header plus padding. + */ + bzero(&hdr, sizeof(hdr)); + hdr.bh_tstamp = *tv; + hdr.bh_datalen = pktlen; + hdr.bh_hdrlen = hdrlen; + hdr.bh_caplen = totlen - hdrlen; + bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr, sizeof(hdr)); + + /* + * Copy the packet data into the store buffer and update its length. + */ +#ifdef COMPAT_FREEBSD32 + copy: +#endif + (*cpfn)(d, d->bd_sbuf, curlen + hdrlen, pkt, hdr.bh_caplen); + d->bd_slen = curlen + totlen; + + if (do_wakeup) + bpf_wakeup(d); +} + +/* + * Free buffers currently in use by a descriptor. + * Called on close. + */ +static void +bpf_freed(struct bpf_d *d) +{ + + /* + * We don't need to lock out interrupts since this descriptor has + * been detached from its interface and it yet hasn't been marked + * free. + */ + bpf_free(d); + if (d->bd_rfilter != NULL) { + free((caddr_t)d->bd_rfilter, M_BPF); +#ifdef BPF_JITTER + if (d->bd_bfilter != NULL) + bpf_destroy_jit_filter(d->bd_bfilter); +#endif + } + if (d->bd_wfilter != NULL) + free((caddr_t)d->bd_wfilter, M_BPF); + mtx_destroy(&d->bd_mtx); +} + +/* + * Attach an interface to bpf. dlt is the link layer type; hdrlen is the + * fixed size of the link header (variable length headers not yet supported). + */ +void +bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen) +{ + + bpfattach2(ifp, dlt, hdrlen, &ifp->if_bpf); +} + +/* + * Attach an interface to bpf. 
ifp is a pointer to the structure
+ * defining the interface to be attached, dlt is the link layer type,
+ * and hdrlen is the fixed size of the link header (variable length
+ * headers are not yet supported).
+ */
+void
+bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp)
+{
+	struct bpf_if *bp;
+
+	bp = malloc(sizeof(*bp), M_BPF, M_NOWAIT | M_ZERO);
+	if (bp == NULL)
+		panic("bpfattach");
+
+	LIST_INIT(&bp->bif_dlist);
+	bp->bif_ifp = ifp;
+	bp->bif_dlt = dlt;
+	mtx_init(&bp->bif_mtx, "bpf interface lock", NULL, MTX_DEF);
+	KASSERT(*driverp == NULL, ("bpfattach2: driverp already initialized"));
+	*driverp = bp;
+
+	mtx_lock(&bpf_mtx);
+	LIST_INSERT_HEAD(&bpf_iflist, bp, bif_next);
+	mtx_unlock(&bpf_mtx);
+
+	/*
+	 * Compute the length of the bpf header.  This is not necessarily
+	 * equal to SIZEOF_BPF_HDR because we want to insert spacing such
+	 * that the network layer header begins on a longword boundary (for
+	 * performance reasons and to alleviate alignment restrictions).
+	 */
+	bp->bif_hdrlen = BPF_WORDALIGN(hdrlen + SIZEOF_BPF_HDR) - hdrlen;
+
+	if (bootverbose)
+		if_printf(ifp, "bpf attached\n");
+}
+
+/*
+ * Detach bpf from an interface.  This involves detaching each descriptor
+ * associated with the interface, and leaving bd_bif NULL.  Notify each
+ * descriptor as it's detached so that any sleepers wake up and get
+ * ENXIO.
+ */
+void
+bpfdetach(struct ifnet *ifp)
+{
+	struct bpf_if *bp;
+	struct bpf_d *d;
+
+	/* Locate BPF interface information */
+	mtx_lock(&bpf_mtx);
+	LIST_FOREACH(bp, &bpf_iflist, bif_next) {
+		if (ifp == bp->bif_ifp)
+			break;
+	}
+
+	/* Interface wasn't attached */
+	if ((bp == NULL) || (bp->bif_ifp == NULL)) {
+		mtx_unlock(&bpf_mtx);
+		printf("bpfdetach: %s was not attached\n", ifp->if_xname);
+		return;
+	}
+
+	LIST_REMOVE(bp, bif_next);
+	mtx_unlock(&bpf_mtx);
+
+	while ((d = LIST_FIRST(&bp->bif_dlist)) != NULL) {
+		bpf_detachd(d);
+		BPFD_LOCK(d);
+		bpf_wakeup(d);
+		BPFD_UNLOCK(d);
+	}
+
+	mtx_destroy(&bp->bif_mtx);
+	free(bp, M_BPF);
+}
+
+/*
+ * Get a list of available data link types of the interface.
+ */
+static int
+bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *bfl)
+{
+	int n, error;
+	struct ifnet *ifp;
+	struct bpf_if *bp;
+
+	ifp = d->bd_bif->bif_ifp;
+	n = 0;
+	error = 0;
+	mtx_lock(&bpf_mtx);
+	LIST_FOREACH(bp, &bpf_iflist, bif_next) {
+		if (bp->bif_ifp != ifp)
+			continue;
+		if (bfl->bfl_list != NULL) {
+			if (n >= bfl->bfl_len) {
+				mtx_unlock(&bpf_mtx);
+				return (ENOMEM);
+			}
+			error = copyout(&bp->bif_dlt,
+			    bfl->bfl_list + n, sizeof(u_int));
+		}
+		n++;
+	}
+	mtx_unlock(&bpf_mtx);
+	bfl->bfl_len = n;
+	return (error);
+}
+
+/*
+ * Set the data link type of a BPF instance.
+ */
+static int
+bpf_setdlt(struct bpf_d *d, u_int dlt)
+{
+	int error, opromisc;
+	struct ifnet *ifp;
+	struct bpf_if *bp;
+
+	if (d->bd_bif->bif_dlt == dlt)
+		return (0);
+	ifp = d->bd_bif->bif_ifp;
+	mtx_lock(&bpf_mtx);
+	LIST_FOREACH(bp, &bpf_iflist, bif_next) {
+		if (bp->bif_ifp == ifp && bp->bif_dlt == dlt)
+			break;
+	}
+	mtx_unlock(&bpf_mtx);
+	if (bp != NULL) {
+		opromisc = d->bd_promisc;
+		bpf_detachd(d);
+		bpf_attachd(d, bp);
+		BPFD_LOCK(d);
+		reset_d(d);
+		BPFD_UNLOCK(d);
+		if (opromisc) {
+			error = ifpromisc(bp->bif_ifp, 1);
+			if (error)
+				if_printf(bp->bif_ifp,
+				    "bpf_setdlt: ifpromisc failed (%d)\n",
+				    error);
+			else
+				d->bd_promisc = 1;
+		}
+	}
+	return (bp == NULL ?
EINVAL : 0); +} + +static void +bpf_drvinit(void *unused) +{ + struct cdev *dev; + + mtx_init(&bpf_mtx, "bpf global lock", NULL, MTX_DEF); + LIST_INIT(&bpf_iflist); + + dev = make_dev(&bpf_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "bpf"); + /* For compatibility */ + make_dev_alias(dev, "bpf0"); +} + +/* + * Zero out the various packet counters associated with all of the bpf + * descriptors. At some point, we will probably want to get a bit more + * granular and allow the user to specify descriptors to be zeroed. + */ +static void +bpf_zero_counters(void) +{ + struct bpf_if *bp; + struct bpf_d *bd; + + mtx_lock(&bpf_mtx); + LIST_FOREACH(bp, &bpf_iflist, bif_next) { + BPFIF_LOCK(bp); + LIST_FOREACH(bd, &bp->bif_dlist, bd_next) { + BPFD_LOCK(bd); + bd->bd_rcount = 0; + bd->bd_dcount = 0; + bd->bd_fcount = 0; + bd->bd_wcount = 0; + bd->bd_wfcount = 0; + bd->bd_zcopy = 0; + BPFD_UNLOCK(bd); + } + BPFIF_UNLOCK(bp); + } + mtx_unlock(&bpf_mtx); +} + +static void +bpfstats_fill_xbpf(struct xbpf_d *d, struct bpf_d *bd) +{ + + bzero(d, sizeof(*d)); + BPFD_LOCK_ASSERT(bd); + d->bd_structsize = sizeof(*d); + d->bd_immediate = bd->bd_immediate; + d->bd_promisc = bd->bd_promisc; + d->bd_hdrcmplt = bd->bd_hdrcmplt; + d->bd_direction = bd->bd_direction; + d->bd_feedback = bd->bd_feedback; + d->bd_async = bd->bd_async; + d->bd_rcount = bd->bd_rcount; + d->bd_dcount = bd->bd_dcount; + d->bd_fcount = bd->bd_fcount; + d->bd_sig = bd->bd_sig; + d->bd_slen = bd->bd_slen; + d->bd_hlen = bd->bd_hlen; + d->bd_bufsize = bd->bd_bufsize; + d->bd_pid = bd->bd_pid; + strlcpy(d->bd_ifname, + bd->bd_bif->bif_ifp->if_xname, IFNAMSIZ); + d->bd_locked = bd->bd_locked; + d->bd_wcount = bd->bd_wcount; + d->bd_wdcount = bd->bd_wdcount; + d->bd_wfcount = bd->bd_wfcount; + d->bd_zcopy = bd->bd_zcopy; + d->bd_bufmode = bd->bd_bufmode; +} + +static int +bpf_stats_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct xbpf_d *xbdbuf, *xbd, zerostats; + int index, error; + struct bpf_if *bp; + struct bpf_d *bd; + + /* + * XXX This is not technically correct. It is possible for non + * privileged users to open bpf devices. It would make sense + * if the users who opened the devices were able to retrieve + * the statistics for them, too. + */ + error = priv_check(req->td, PRIV_NET_BPF); + if (error) + return (error); + /* + * Check to see if the user is requesting that the counters be + * zeroed out. Explicitly check that the supplied data is zeroed, + * as we aren't allowing the user to set the counters currently. 
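+ *
+ * As an illustrative sketch (assuming this handler is reachable as
+ * net.bpf.stats, and omitting error handling), a privileged process
+ * would request the reset by writing an all-zero struct of the exact
+ * size:
+ *
+ *	struct xbpf_d zb;
+ *
+ *	memset(&zb, 0, sizeof(zb));
+ *	sysctlbyname("net.bpf.stats", NULL, NULL, &zb, sizeof(zb));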
+ */ + if (req->newptr != NULL) { + if (req->newlen != sizeof(zerostats)) + return (EINVAL); + bzero(&zerostats, sizeof(zerostats)); + xbd = req->newptr; + if (bcmp(xbd, &zerostats, sizeof(*xbd)) != 0) + return (EINVAL); + bpf_zero_counters(); + return (0); + } + if (req->oldptr == NULL) + return (SYSCTL_OUT(req, 0, bpf_bpfd_cnt * sizeof(*xbd))); + if (bpf_bpfd_cnt == 0) + return (SYSCTL_OUT(req, 0, 0)); + xbdbuf = malloc(req->oldlen, M_BPF, M_WAITOK); + mtx_lock(&bpf_mtx); + if (req->oldlen < (bpf_bpfd_cnt * sizeof(*xbd))) { + mtx_unlock(&bpf_mtx); + free(xbdbuf, M_BPF); + return (ENOMEM); + } + index = 0; + LIST_FOREACH(bp, &bpf_iflist, bif_next) { + BPFIF_LOCK(bp); + LIST_FOREACH(bd, &bp->bif_dlist, bd_next) { + xbd = &xbdbuf[index++]; + BPFD_LOCK(bd); + bpfstats_fill_xbpf(xbd, bd); + BPFD_UNLOCK(bd); + } + BPFIF_UNLOCK(bp); + } + mtx_unlock(&bpf_mtx); + error = SYSCTL_OUT(req, xbdbuf, index * sizeof(*xbd)); + free(xbdbuf, M_BPF); + return (error); +} + +SYSINIT(bpfdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE,bpf_drvinit,NULL); + +#else /* !DEV_BPF && !NETGRAPH_BPF */ +/* + * NOP stubs to allow bpf-using drivers to load and function. + * + * A 'better' implementation would allow the core bpf functionality + * to be loaded at runtime. + */ +static struct bpf_if bp_null; + +void +bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen) +{ +} + +void +bpf_mtap(struct bpf_if *bp, struct mbuf *m) +{ +} + +void +bpf_mtap2(struct bpf_if *bp, void *d, u_int l, struct mbuf *m) +{ +} + +void +bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen) +{ + + bpfattach2(ifp, dlt, hdrlen, &ifp->if_bpf); +} + +void +bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp) +{ + + *driverp = &bp_null; +} + +void +bpfdetach(struct ifnet *ifp) +{ +} + +u_int +bpf_filter(const struct bpf_insn *pc, u_char *p, u_int wirelen, u_int buflen) +{ + return -1; /* "no filter" behaviour */ +} + +int +bpf_validate(const struct bpf_insn *f, int len) +{ + return 0; /* false */ +} + +#endif /* !DEV_BPF && !NETGRAPH_BPF */ diff --git a/freebsd/sys/net/bpf.h b/freebsd/sys/net/bpf.h new file mode 100644 index 00000000..d9dd4289 --- /dev/null +++ b/freebsd/sys/net/bpf.h @@ -0,0 +1,974 @@ +/*- + * Copyright (c) 1990, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from the Stanford/CMU enet packet filter, + * (net/enet.c) distributed as part of 4.3BSD, and code contributed + * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence + * Berkeley Laboratory. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *      @(#)bpf.h	8.1 (Berkeley) 6/10/93
+ *      @(#)bpf.h	1.34 (LBL)     6/16/96
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NET_BPF_HH_
+#define _NET_BPF_HH_
+
+/* BSD style release date */
+#define BPF_RELEASE 199606
+
+typedef	int32_t	  bpf_int32;
+typedef	u_int32_t bpf_u_int32;
+
+/*
+ * Alignment macros.  BPF_WORDALIGN rounds up to the next
+ * even multiple of BPF_ALIGNMENT.
+ */
+#define BPF_ALIGNMENT sizeof(long)
+#define BPF_WORDALIGN(x) (((x)+(BPF_ALIGNMENT-1))&~(BPF_ALIGNMENT-1))
+
+#define BPF_MAXINSNS 512
+#define BPF_MAXBUFSIZE 0x80000
+#define BPF_MINBUFSIZE 32
+
+/*
+ * Structure for BIOCSETF.
+ */
+struct bpf_program {
+	u_int bf_len;
+	struct bpf_insn *bf_insns;
+};
+
+/*
+ * Struct returned by BIOCGSTATS.
+ */
+struct bpf_stat {
+	u_int bs_recv;		/* number of packets received */
+	u_int bs_drop;		/* number of packets dropped */
+};
+
+/*
+ * Struct returned by BIOCVERSION.  This represents the version number of
+ * the filter language described by the instruction encodings below.
+ * bpf understands a program iff kernel_major == filter_major &&
+ * kernel_minor >= filter_minor, that is, if the value returned by the
+ * running kernel has the same major number and a minor number equal
+ * to or less than the filter being downloaded.  Otherwise, the
+ * results are undefined, meaning an error may be returned or packets
+ * may be accepted haphazardly.
+ * It has nothing to do with the source code version.
+ */
+struct bpf_version {
+	u_short bv_major;
+	u_short bv_minor;
+};
+/* Current version number of filter architecture. */
+#define BPF_MAJOR_VERSION 1
+#define BPF_MINOR_VERSION 1
+
+/*
+ * Historically, BPF has supported a single buffering model, first using mbuf
+ * clusters in kernel, and later using malloc(9) buffers in kernel.  We now
+ * support multiple buffering modes, which may be queried and set using
+ * BIOCGETBUFMODE and BIOCSETBUFMODE.  So as to avoid handling the complexity
+ * of changing modes while sniffing packets, the mode becomes fixed once an
+ * interface has been attached to the BPF descriptor.
+ */
+#define BPF_BUFMODE_BUFFER	1	/* Kernel buffers with read(). */
+#define BPF_BUFMODE_ZBUF	2	/* Zero-copy buffers. */
+
+/*-
+ * Struct used by BIOCSETZBUF, BIOCROTZBUF: describes up to two zero-copy
+ * buffers as used by BPF.
+ */
+struct bpf_zbuf {
+	void *bz_bufa;		/* Location of 'a' zero-copy buffer. */
+	void *bz_bufb;		/* Location of 'b' zero-copy buffer. */
+	size_t bz_buflen;	/* Size of zero-copy buffers. */
+};
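+
+/*
+ * As a usage sketch (illustrative only; everything below other than the
+ * BIOCVERSION ioctl and the structures defined in this header is a
+ * hypothetical consumer), a capture program normally validates the
+ * filter-language version before installing a program:
+ *
+ *	struct bpf_version bv;
+ *
+ *	if (ioctl(fd, BIOCVERSION, &bv) < 0)
+ *		err(1, "BIOCVERSION");
+ *	if (bv.bv_major != BPF_MAJOR_VERSION ||
+ *	    bv.bv_minor < BPF_MINOR_VERSION)
+ *		errx(1, "kernel filter language out of date");
+ */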
+#define	BIOCGBLEN	_IOR('B',102, u_int)
+#define	BIOCSBLEN	_IOWR('B',102, u_int)
+#define	BIOCSETF	_IOW('B',103, struct bpf_program)
+#define	BIOCFLUSH	_IO('B',104)
+#define	BIOCPROMISC	_IO('B',105)
+#define	BIOCGDLT	_IOR('B',106, u_int)
+#define	BIOCGETIF	_IOR('B',107, struct ifreq)
+#define	BIOCSETIF	_IOW('B',108, struct ifreq)
+#define	BIOCSRTIMEOUT	_IOW('B',109, struct timeval)
+#define	BIOCGRTIMEOUT	_IOR('B',110, struct timeval)
+#define	BIOCGSTATS	_IOR('B',111, struct bpf_stat)
+#define	BIOCIMMEDIATE	_IOW('B',112, u_int)
+#define	BIOCVERSION	_IOR('B',113, struct bpf_version)
+#define	BIOCGRSIG	_IOR('B',114, u_int)
+#define	BIOCSRSIG	_IOW('B',115, u_int)
+#define	BIOCGHDRCMPLT	_IOR('B',116, u_int)
+#define	BIOCSHDRCMPLT	_IOW('B',117, u_int)
+#define	BIOCGDIRECTION	_IOR('B',118, u_int)
+#define	BIOCSDIRECTION	_IOW('B',119, u_int)
+#define	BIOCSDLT	_IOW('B',120, u_int)
+#define	BIOCGDLTLIST	_IOWR('B',121, struct bpf_dltlist)
+#define	BIOCLOCK	_IO('B', 122)
+#define	BIOCSETWF	_IOW('B',123, struct bpf_program)
+#define	BIOCFEEDBACK	_IOW('B',124, u_int)
+#define	BIOCGETBUFMODE	_IOR('B',125, u_int)
+#define	BIOCSETBUFMODE	_IOW('B',126, u_int)
+#define	BIOCGETZMAX	_IOR('B',127, size_t)
+#define	BIOCROTZBUF	_IOR('B',128, struct bpf_zbuf)
+#define	BIOCSETZBUF	_IOW('B',129, struct bpf_zbuf)
+#define	BIOCSETFNR	_IOW('B',130, struct bpf_program)
+
+/* Obsolete */
+#define	BIOCGSEESENT	BIOCGDIRECTION
+#define	BIOCSSEESENT	BIOCSDIRECTION
+
+/* Packet directions */
+enum bpf_direction {
+	BPF_D_IN,	/* See incoming packets */
+	BPF_D_INOUT,	/* See incoming and outgoing packets */
+	BPF_D_OUT	/* See outgoing packets */
+};
+
+/*
+ * Structure prepended to each packet.
+ */
+struct bpf_hdr {
+	struct timeval	bh_tstamp;	/* time stamp */
+	bpf_u_int32	bh_caplen;	/* length of captured portion */
+	bpf_u_int32	bh_datalen;	/* original length of packet */
+	u_short		bh_hdrlen;	/* length of bpf header (this struct
+					   plus alignment padding) */
+};
+/*
+ * Because the structure above is not a multiple of 4 bytes, some compilers
+ * will insist on inserting padding; hence, sizeof(struct bpf_hdr) won't work.
+ * Only the kernel needs to know about it; applications use bh_hdrlen.
+ */
+#ifdef _KERNEL
+#define	SIZEOF_BPF_HDR	(sizeof(struct bpf_hdr) <= 20 ? 18 : \
+    sizeof(struct bpf_hdr))
+#endif
+
+/*
+ * When using zero-copy BPF buffers, a shared memory header is present
+ * allowing the kernel BPF implementation and user process to synchronize
+ * without using system calls.  This structure defines that header.  When
+ * accessing these fields, appropriate atomic operations and memory barriers
+ * are required in order not to see stale or out-of-order data; see bpf(4)
+ * for reference code to access these fields from userspace.
+ *
+ * The layout of this structure is critical, and must not be changed; it must
+ * fit in a single page on all architectures.
+ */
+struct bpf_zbuf_header {
+	volatile u_int	bzh_kernel_gen;	/* Kernel generation number. */
+	volatile u_int	bzh_kernel_len;	/* Length of data in the buffer. */
+	volatile u_int	bzh_user_gen;	/* User generation number. */
+	u_int _bzh_pad[5];
+};
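+
+/*
+ * Sketch of the generation-number handshake, following the reference
+ * code in bpf(4) (the helper names here are illustrative, not part of
+ * this header): a buffer belongs to userspace while the generation
+ * numbers differ, and is handed back by matching them.
+ *
+ *	static int
+ *	zbuf_owned_by_user(struct bpf_zbuf_header *bzh)
+ *	{
+ *		return (bzh->bzh_user_gen !=
+ *		    atomic_load_acq_int(&bzh->bzh_kernel_gen));
+ *	}
+ *
+ *	static void
+ *	zbuf_acknowledge(struct bpf_zbuf_header *bzh)
+ *	{
+ *		atomic_store_rel_int(&bzh->bzh_user_gen,
+ *		    bzh->bzh_kernel_gen);
+ *	}
+ */
+
+/*
+ * Data-link level type codes.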
+ */ +#define DLT_NULL 0 /* BSD loopback encapsulation */ +#define DLT_EN10MB 1 /* Ethernet (10Mb) */ +#define DLT_EN3MB 2 /* Experimental Ethernet (3Mb) */ +#define DLT_AX25 3 /* Amateur Radio AX.25 */ +#define DLT_PRONET 4 /* Proteon ProNET Token Ring */ +#define DLT_CHAOS 5 /* Chaos */ +#define DLT_IEEE802 6 /* IEEE 802 Networks */ +#define DLT_ARCNET 7 /* ARCNET */ +#define DLT_SLIP 8 /* Serial Line IP */ +#define DLT_PPP 9 /* Point-to-point Protocol */ +#define DLT_FDDI 10 /* FDDI */ +#define DLT_ATM_RFC1483 11 /* LLC/SNAP encapsulated atm */ +#define DLT_RAW 12 /* raw IP */ + +/* + * These are values from BSD/OS's "bpf.h". + * These are not the same as the values from the traditional libpcap + * "bpf.h"; however, these values shouldn't be generated by any + * OS other than BSD/OS, so the correct values to use here are the + * BSD/OS values. + * + * Platforms that have already assigned these values to other + * DLT_ codes, however, should give these codes the values + * from that platform, so that programs that use these codes will + * continue to compile - even though they won't correctly read + * files of these types. + */ +#define DLT_SLIP_BSDOS 15 /* BSD/OS Serial Line IP */ +#define DLT_PPP_BSDOS 16 /* BSD/OS Point-to-point Protocol */ + +#define DLT_ATM_CLIP 19 /* Linux Classical-IP over ATM */ + +/* + * These values are defined by NetBSD; other platforms should refrain from + * using them for other purposes, so that NetBSD savefiles with link + * types of 50 or 51 can be read as this type on all platforms. + */ +#define DLT_PPP_SERIAL 50 /* PPP over serial with HDLC encapsulation */ +#define DLT_PPP_ETHER 51 /* PPP over Ethernet */ + +/* + * Reserved for the Symantec Enterprise Firewall. + */ +#define DLT_SYMANTEC_FIREWALL 99 + + +/* + * This value was defined by libpcap 0.5; platforms that have defined + * it with a different value should define it here with that value - + * a link type of 104 in a save file will be mapped to DLT_C_HDLC, + * whatever value that happens to be, so programs will correctly + * handle files with that link type regardless of the value of + * DLT_C_HDLC. + * + * The name DLT_C_HDLC was used by BSD/OS; we use that name for source + * compatibility with programs written for BSD/OS. + * + * libpcap 0.5 defined it as DLT_CHDLC; we define DLT_CHDLC as well, + * for source compatibility with programs written for libpcap 0.5. + */ +#define DLT_C_HDLC 104 /* Cisco HDLC */ +#define DLT_CHDLC DLT_C_HDLC + +#define DLT_IEEE802_11 105 /* IEEE 802.11 wireless */ + +/* + * Values between 106 and 107 are used in capture file headers as + * link-layer types corresponding to DLT_ types that might differ + * between platforms; don't use those values for new DLT_ new types. + */ + +/* + * Frame Relay; BSD/OS has a DLT_FR with a value of 11, but that collides + * with other values. + * DLT_FR and DLT_FRELAY packets start with the Q.922 Frame Relay header + * (DLCI, etc.). + */ +#define DLT_FRELAY 107 + +/* + * OpenBSD DLT_LOOP, for loopback devices; it's like DLT_NULL, except + * that the AF_ type in the link-layer header is in network byte order. + * + * OpenBSD defines it as 12, but that collides with DLT_RAW, so we + * define it as 108 here. If OpenBSD picks up this file, it should + * define DLT_LOOP as 12 in its version, as per the comment above - + * and should not use 108 as a DLT_ value. 
+ */ +#define DLT_LOOP 108 + +/* + * Values between 109 and 112 are used in capture file headers as + * link-layer types corresponding to DLT_ types that might differ + * between platforms; don't use those values for new DLT_ new types. + */ + +/* + * Encapsulated packets for IPsec; DLT_ENC is 13 in OpenBSD, but that's + * DLT_SLIP_BSDOS in NetBSD, so we don't use 13 for it in OSes other + * than OpenBSD. + */ +#define DLT_ENC 109 + +/* + * This is for Linux cooked sockets. + */ +#define DLT_LINUX_SLL 113 + +/* + * Apple LocalTalk hardware. + */ +#define DLT_LTALK 114 + +/* + * Acorn Econet. + */ +#define DLT_ECONET 115 + +/* + * Reserved for use with OpenBSD ipfilter. + */ +#define DLT_IPFILTER 116 + +/* + * Reserved for use in capture-file headers as a link-layer type + * corresponding to OpenBSD DLT_PFLOG; DLT_PFLOG is 17 in OpenBSD, + * but that's DLT_LANE8023 in SuSE 6.3, so we can't use 17 for it + * in capture-file headers. + */ +#define DLT_PFLOG 117 + +/* + * Registered for Cisco-internal use. + */ +#define DLT_CISCO_IOS 118 + +/* + * Reserved for 802.11 cards using the Prism II chips, with a link-layer + * header including Prism monitor mode information plus an 802.11 + * header. + */ +#define DLT_PRISM_HEADER 119 + +/* + * Reserved for Aironet 802.11 cards, with an Aironet link-layer header + * (see Doug Ambrisko's FreeBSD patches). + */ +#define DLT_AIRONET_HEADER 120 + +/* + * Reserved for use by OpenBSD's pfsync device. + */ +#define DLT_PFSYNC 121 + +/* + * Reserved for Siemens HiPath HDLC. XXX + */ +#define DLT_HHDLC 121 + +/* + * Reserved for RFC 2625 IP-over-Fibre Channel. + */ +#define DLT_IP_OVER_FC 122 + +/* + * Reserved for Full Frontal ATM on Solaris. + */ +#define DLT_SUNATM 123 + +/* + * Reserved as per request from Kent Dahlgren + * for private use. + */ +#define DLT_RIO 124 /* RapidIO */ +#define DLT_PCI_EXP 125 /* PCI Express */ +#define DLT_AURORA 126 /* Xilinx Aurora link layer */ + +/* + * BSD header for 802.11 plus a number of bits of link-layer information + * including radio information. + */ +#ifndef DLT_IEEE802_11_RADIO +#define DLT_IEEE802_11_RADIO 127 +#endif + +/* + * Reserved for TZSP encapsulation. + */ +#define DLT_TZSP 128 /* Tazmen Sniffer Protocol */ + +/* + * Reserved for Linux ARCNET. + */ +#define DLT_ARCNET_LINUX 129 + +/* + * Juniper-private data link types. + */ +#define DLT_JUNIPER_MLPPP 130 +#define DLT_JUNIPER_MLFR 131 +#define DLT_JUNIPER_ES 132 +#define DLT_JUNIPER_GGSN 133 +#define DLT_JUNIPER_MFR 134 +#define DLT_JUNIPER_ATM2 135 +#define DLT_JUNIPER_SERVICES 136 +#define DLT_JUNIPER_ATM1 137 + +/* + * Apple IP-over-IEEE 1394, as per a request from Dieter Siegmund + * . The header that's presented is an Ethernet-like + * header: + * + * #define FIREWIRE_EUI64_LEN 8 + * struct firewire_header { + * u_char firewire_dhost[FIREWIRE_EUI64_LEN]; + * u_char firewire_shost[FIREWIRE_EUI64_LEN]; + * u_short firewire_type; + * }; + * + * with "firewire_type" being an Ethernet type value, rather than, + * for example, raw GASP frames being handed up. + */ +#define DLT_APPLE_IP_OVER_IEEE1394 138 + +/* + * Various SS7 encapsulations, as per a request from Jeff Morriss + * and subsequent discussions. + */ +#define DLT_MTP2_WITH_PHDR 139 /* pseudo-header with various info, followed by MTP2 */ +#define DLT_MTP2 140 /* MTP2, without pseudo-header */ +#define DLT_MTP3 141 /* MTP3, without pseudo-header or MTP2 */ +#define DLT_SCCP 142 /* SCCP, without pseudo-header or MTP2 or MTP3 */ + +/* + * Reserved for DOCSIS. 
+ */ +#define DLT_DOCSIS 143 + +/* + * Reserved for Linux IrDA. + */ +#define DLT_LINUX_IRDA 144 + +/* + * Reserved for IBM SP switch and IBM Next Federation switch. + */ +#define DLT_IBM_SP 145 +#define DLT_IBM_SN 146 + +/* + * Reserved for private use. If you have some link-layer header type + * that you want to use within your organization, with the capture files + * using that link-layer header type not ever be sent outside your + * organization, you can use these values. + * + * No libpcap release will use these for any purpose, nor will any + * tcpdump release use them, either. + * + * Do *NOT* use these in capture files that you expect anybody not using + * your private versions of capture-file-reading tools to read; in + * particular, do *NOT* use them in products, otherwise you may find that + * people won't be able to use tcpdump, or snort, or Ethereal, or... to + * read capture files from your firewall/intrusion detection/traffic + * monitoring/etc. appliance, or whatever product uses that DLT_ value, + * and you may also find that the developers of those applications will + * not accept patches to let them read those files. + * + * Also, do not use them if somebody might send you a capture using them + * for *their* private type and tools using them for *your* private type + * would have to read them. + * + * Instead, ask "tcpdump-workers@tcpdump.org" for a new DLT_ value, + * as per the comment above, and use the type you're given. + */ +#define DLT_USER0 147 +#define DLT_USER1 148 +#define DLT_USER2 149 +#define DLT_USER3 150 +#define DLT_USER4 151 +#define DLT_USER5 152 +#define DLT_USER6 153 +#define DLT_USER7 154 +#define DLT_USER8 155 +#define DLT_USER9 156 +#define DLT_USER10 157 +#define DLT_USER11 158 +#define DLT_USER12 159 +#define DLT_USER13 160 +#define DLT_USER14 161 +#define DLT_USER15 162 + +/* + * For future use with 802.11 captures - defined by AbsoluteValue + * Systems to store a number of bits of link-layer information + * including radio information: + * + * http://www.shaftnet.org/~pizza/software/capturefrm.txt + * + * but it might be used by some non-AVS drivers now or in the + * future. + */ +#define DLT_IEEE802_11_RADIO_AVS 163 /* 802.11 plus AVS radio header */ + +/* + * Juniper-private data link type, as per request from + * Hannes Gredler . The DLT_s are used + * for passing on chassis-internal metainformation such as + * QOS profiles, etc.. + */ +#define DLT_JUNIPER_MONITOR 164 + +/* + * Reserved for BACnet MS/TP. + */ +#define DLT_BACNET_MS_TP 165 + +/* + * Another PPP variant as per request from Karsten Keil . + * + * This is used in some OSes to allow a kernel socket filter to distinguish + * between incoming and outgoing packets, on a socket intended to + * supply pppd with outgoing packets so it can do dial-on-demand and + * hangup-on-lack-of-demand; incoming packets are filtered out so they + * don't cause pppd to hold the connection up (you don't want random + * input packets such as port scans, packets from old lost connections, + * etc. to force the connection to stay up). + * + * The first byte of the PPP header (0xff03) is modified to accomodate + * the direction - 0x00 = IN, 0x01 = OUT. + */ +#define DLT_PPP_PPPD 166 + +/* + * Names for backwards compatibility with older versions of some PPP + * software; new software should use DLT_PPP_PPPD. + */ +#define DLT_PPP_WITH_DIRECTION DLT_PPP_PPPD +#define DLT_LINUX_PPP_WITHDIRECTION DLT_PPP_PPPD + +/* + * Juniper-private data link type, as per request from + * Hannes Gredler . 
The DLT_s are used + * for passing on chassis-internal metainformation such as + * QOS profiles, cookies, etc.. + */ +#define DLT_JUNIPER_PPPOE 167 +#define DLT_JUNIPER_PPPOE_ATM 168 + +#define DLT_GPRS_LLC 169 /* GPRS LLC */ +#define DLT_GPF_T 170 /* GPF-T (ITU-T G.7041/Y.1303) */ +#define DLT_GPF_F 171 /* GPF-F (ITU-T G.7041/Y.1303) */ + +/* + * Requested by Oolan Zimmer for use in Gcom's T1/E1 line + * monitoring equipment. + */ +#define DLT_GCOM_T1E1 172 +#define DLT_GCOM_SERIAL 173 + +/* + * Juniper-private data link type, as per request from + * Hannes Gredler . The DLT_ is used + * for internal communication to Physical Interface Cards (PIC) + */ +#define DLT_JUNIPER_PIC_PEER 174 + +/* + * Link types requested by Gregor Maier of Endace + * Measurement Systems. They add an ERF header (see + * http://www.endace.com/support/EndaceRecordFormat.pdf) in front of + * the link-layer header. + */ +#define DLT_ERF_ETH 175 /* Ethernet */ +#define DLT_ERF_POS 176 /* Packet-over-SONET */ + +/* + * Requested by Daniele Orlandi for raw LAPD + * for vISDN (http://www.orlandi.com/visdn/). Its link-layer header + * includes additional information before the LAPD header, so it's + * not necessarily a generic LAPD header. + */ +#define DLT_LINUX_LAPD 177 + +/* + * Juniper-private data link type, as per request from + * Hannes Gredler . + * The DLT_ are used for prepending meta-information + * like interface index, interface name + * before standard Ethernet, PPP, Frelay & C-HDLC Frames + */ +#define DLT_JUNIPER_ETHER 178 +#define DLT_JUNIPER_PPP 179 +#define DLT_JUNIPER_FRELAY 180 +#define DLT_JUNIPER_CHDLC 181 + +/* + * Multi Link Frame Relay (FRF.16) + */ +#define DLT_MFR 182 + +/* + * Juniper-private data link type, as per request from + * Hannes Gredler . + * The DLT_ is used for internal communication with a + * voice Adapter Card (PIC) + */ +#define DLT_JUNIPER_VP 183 + +/* + * Arinc 429 frames. + * DLT_ requested by Gianluca Varenni . + * Every frame contains a 32bit A429 label. + * More documentation on Arinc 429 can be found at + * http://www.condoreng.com/support/downloads/tutorials/ARINCTutorial.pdf + */ +#define DLT_A429 184 + +/* + * Arinc 653 Interpartition Communication messages. + * DLT_ requested by Gianluca Varenni . + * Please refer to the A653-1 standard for more information. + */ +#define DLT_A653_ICM 185 + +/* + * USB packets, beginning with a USB setup header; requested by + * Paolo Abeni . + */ +#define DLT_USB 186 + +/* + * Bluetooth HCI UART transport layer (part H:4); requested by + * Paolo Abeni. + */ +#define DLT_BLUETOOTH_HCI_H4 187 + +/* + * IEEE 802.16 MAC Common Part Sublayer; requested by Maria Cruz + * . + */ +#define DLT_IEEE802_16_MAC_CPS 188 + +/* + * USB packets, beginning with a Linux USB header; requested by + * Paolo Abeni . + */ +#define DLT_USB_LINUX 189 + +/* + * Controller Area Network (CAN) v. 2.0B packets. + * DLT_ requested by Gianluca Varenni . + * Used to dump CAN packets coming from a CAN Vector board. + * More documentation on the CAN v2.0B frames can be found at + * http://www.can-cia.org/downloads/?269 + */ +#define DLT_CAN20B 190 + +/* + * IEEE 802.15.4, with address fields padded, as is done by Linux + * drivers; requested by Juergen Schimmer. + */ +#define DLT_IEEE802_15_4_LINUX 191 + +/* + * Per Packet Information encapsulated packets. + * DLT_ requested by Gianluca Varenni . + */ +#define DLT_PPI 192 + +/* + * Header for 802.16 MAC Common Part Sublayer plus a radiotap radio header; + * requested by Charles Clancy. 
+ */ +#define DLT_IEEE802_16_MAC_CPS_RADIO 193 + +/* + * Juniper-private data link type, as per request from + * Hannes Gredler . + * The DLT_ is used for internal communication with a + * integrated service module (ISM). + */ +#define DLT_JUNIPER_ISM 194 + +/* + * IEEE 802.15.4, exactly as it appears in the spec (no padding, no + * nothing); requested by Mikko Saarnivala . + */ +#define DLT_IEEE802_15_4 195 + +/* + * Various link-layer types, with a pseudo-header, for SITA + * (http://www.sita.aero/); requested by Fulko Hew (fulko.hew@gmail.com). + */ +#define DLT_SITA 196 + +/* + * Various link-layer types, with a pseudo-header, for Endace DAG cards; + * encapsulates Endace ERF records. Requested by Stephen Donnelly + * . + */ +#define DLT_ERF 197 + +/* + * Special header prepended to Ethernet packets when capturing from a + * u10 Networks board. Requested by Phil Mulholland + * . + */ +#define DLT_RAIF1 198 + +/* + * IPMB packet for IPMI, beginning with the I2C slave address, followed + * by the netFn and LUN, etc.. Requested by Chanthy Toeung + * . + */ +#define DLT_IPMB 199 + +/* + * Juniper-private data link type, as per request from + * Hannes Gredler . + * The DLT_ is used for capturing data on a secure tunnel interface. + */ +#define DLT_JUNIPER_ST 200 + +/* + * Bluetooth HCI UART transport layer (part H:4), with pseudo-header + * that includes direction information; requested by Paolo Abeni. + */ +#define DLT_BLUETOOTH_HCI_H4_WITH_PHDR 201 + +/* + * AX.25 packet with a 1-byte KISS header; see + * + * http://www.ax25.net/kiss.htm + * + * as per Richard Stearn . + */ +#define DLT_AX25_KISS 202 + +/* + * LAPD packets from an ISDN channel, starting with the address field, + * with no pseudo-header. + * Requested by Varuna De Silva . + */ +#define DLT_LAPD 203 + +/* + * Variants of various link-layer headers, with a one-byte direction + * pseudo-header prepended - zero means "received by this host", + * non-zero (any non-zero value) means "sent by this host" - as per + * Will Barker . + */ +#define DLT_PPP_WITH_DIR 204 /* PPP - don't confuse with DLT_PPP_WITH_DIRECTION */ +#define DLT_C_HDLC_WITH_DIR 205 /* Cisco HDLC */ +#define DLT_FRELAY_WITH_DIR 206 /* Frame Relay */ +#define DLT_LAPB_WITH_DIR 207 /* LAPB */ + +/* + * 208 is reserved for an as-yet-unspecified proprietary link-layer + * type, as requested by Will Barker. + */ + +/* + * IPMB with a Linux-specific pseudo-header; as requested by Alexey Neyman + * . + */ +#define DLT_IPMB_LINUX 209 + +/* + * FlexRay automotive bus - http://www.flexray.com/ - as requested + * by Hannes Kaelber . + */ +#define DLT_FLEXRAY 210 + +/* + * Media Oriented Systems Transport (MOST) bus for multimedia + * transport - http://www.mostcooperation.com/ - as requested + * by Hannes Kaelber . + */ +#define DLT_MOST 211 + +/* + * Local Interconnect Network (LIN) bus for vehicle networks - + * http://www.lin-subbus.org/ - as requested by Hannes Kaelber + * . + */ +#define DLT_LIN 212 + +/* + * X2E-private data link type used for serial line capture, + * as requested by Hannes Kaelber . + */ +#define DLT_X2E_SERIAL 213 + +/* + * X2E-private data link type used for the Xoraya data logger + * family, as requested by Hannes Kaelber . 
+ */
+#define DLT_X2E_XORAYA		214
+
+/*
+ * IEEE 802.15.4, exactly as it appears in the spec (no padding, no
+ * nothing), but with the PHY-level data for non-ASK PHYs (4 octets
+ * of 0 as preamble, one octet of SFD, one octet of frame length+
+ * reserved bit, and then the MAC-layer data, starting with the
+ * frame control field).
+ *
+ * Requested by Max Filippov.
+ */
+#define DLT_IEEE802_15_4_NONASK_PHY	215
+
+/*
+ * DLT and savefile link type values are split into a class and
+ * a member of that class.  A class value of 0 indicates a regular
+ * DLT_/LINKTYPE_ value.
+ */
+#define DLT_CLASS(x)		((x) & 0x03ff0000)
+
+/*
+ * The instruction encodings.
+ */
+/* instruction classes */
+#define BPF_CLASS(code) ((code) & 0x07)
+#define		BPF_LD		0x00
+#define		BPF_LDX		0x01
+#define		BPF_ST		0x02
+#define		BPF_STX		0x03
+#define		BPF_ALU		0x04
+#define		BPF_JMP		0x05
+#define		BPF_RET		0x06
+#define		BPF_MISC	0x07
+
+/* ld/ldx fields */
+#define BPF_SIZE(code)	((code) & 0x18)
+#define		BPF_W		0x00
+#define		BPF_H		0x08
+#define		BPF_B		0x10
+#define BPF_MODE(code)	((code) & 0xe0)
+#define		BPF_IMM		0x00
+#define		BPF_ABS		0x20
+#define		BPF_IND		0x40
+#define		BPF_MEM		0x60
+#define		BPF_LEN		0x80
+#define		BPF_MSH		0xa0
+
+/* alu/jmp fields */
+#define BPF_OP(code)	((code) & 0xf0)
+#define		BPF_ADD		0x00
+#define		BPF_SUB		0x10
+#define		BPF_MUL		0x20
+#define		BPF_DIV		0x30
+#define		BPF_OR		0x40
+#define		BPF_AND		0x50
+#define		BPF_LSH		0x60
+#define		BPF_RSH		0x70
+#define		BPF_NEG		0x80
+#define		BPF_JA		0x00
+#define		BPF_JEQ		0x10
+#define		BPF_JGT		0x20
+#define		BPF_JGE		0x30
+#define		BPF_JSET	0x40
+#define BPF_SRC(code)	((code) & 0x08)
+#define		BPF_K		0x00
+#define		BPF_X		0x08
+
+/* ret - BPF_K and BPF_X also apply */
+#define BPF_RVAL(code)	((code) & 0x18)
+#define		BPF_A		0x10
+
+/* misc */
+#define BPF_MISCOP(code) ((code) & 0xf8)
+#define		BPF_TAX		0x00
+#define		BPF_TXA		0x80
+
+/*
+ * The instruction data structure.
+ */
+struct bpf_insn {
+	u_short		code;
+	u_char		jt;
+	u_char		jf;
+	bpf_u_int32	k;
+};
+
+/*
+ * Macros for insn array initializers.
+ */
+#define BPF_STMT(code, k) { (u_short)(code), 0, 0, k }
+#define BPF_JUMP(code, k, jt, jf) { (u_short)(code), jt, jf, k }
+
+/*
+ * Structure to retrieve the available DLTs for the interface.
+ */
+struct bpf_dltlist {
+	u_int	bfl_len;	/* number of entries in bfl_list */
+	u_int	*bfl_list;	/* array of DLTs */
+};
+
+#ifdef _KERNEL
+#ifdef MALLOC_DECLARE
+MALLOC_DECLARE(M_BPF);
+#endif
+#ifdef SYSCTL_DECL
+SYSCTL_DECL(_net_bpf);
+#endif
+
+/*
+ * Rotate the packet buffers in descriptor d.  Move the store buffer into the
+ * hold slot, and the free buffer into the store slot.  Zero the length of
+ * the new store buffer.  The descriptor lock should be held.
+ */
+#define ROTATE_BUFFERS(d)	do {					\
+	(d)->bd_hbuf = (d)->bd_sbuf;					\
+	(d)->bd_hlen = (d)->bd_slen;					\
+	(d)->bd_sbuf = (d)->bd_fbuf;					\
+	(d)->bd_slen = 0;						\
+	(d)->bd_fbuf = NULL;						\
+	bpf_bufheld(d);							\
+} while (0)
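+
+/*
+ * For example (an illustrative sketch, not part of the original header),
+ * a minimal program that accepts only IPv4 packets on an Ethernet link
+ * loads the 16-bit type field at offset 12 and compares it against
+ * 0x0800 (ETHERTYPE_IP); returning (u_int)-1 captures the whole packet
+ * and returning 0 drops it:
+ *
+ *	struct bpf_insn insns[] = {
+ *		BPF_STMT(BPF_LD+BPF_H+BPF_ABS, 12),
+ *		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, 0x0800, 0, 1),
+ *		BPF_STMT(BPF_RET+BPF_K, (u_int)-1),
+ *		BPF_STMT(BPF_RET+BPF_K, 0),
+ *	};
+ */
+
+/*
+ * Descriptor associated with each attached hardware interface.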
+ */ +struct bpf_if { + LIST_ENTRY(bpf_if) bif_next; /* list of all interfaces */ + LIST_HEAD(, bpf_d) bif_dlist; /* descriptor list */ + u_int bif_dlt; /* link layer type */ + u_int bif_hdrlen; /* length of header (with padding) */ + struct ifnet *bif_ifp; /* corresponding interface */ + struct mtx bif_mtx; /* mutex for interface */ +}; + +void bpf_bufheld(struct bpf_d *d); +int bpf_validate(const struct bpf_insn *, int); +void bpf_tap(struct bpf_if *, u_char *, u_int); +void bpf_mtap(struct bpf_if *, struct mbuf *); +void bpf_mtap2(struct bpf_if *, void *, u_int, struct mbuf *); +void bpfattach(struct ifnet *, u_int, u_int); +void bpfattach2(struct ifnet *, u_int, u_int, struct bpf_if **); +void bpfdetach(struct ifnet *); + +void bpfilterattach(int); +u_int bpf_filter(const struct bpf_insn *, u_char *, u_int, u_int); + +static __inline int +bpf_peers_present(struct bpf_if *bpf) +{ + + if (!LIST_EMPTY(&bpf->bif_dlist)) + return (1); + return (0); +} + +#define BPF_TAP(_ifp,_pkt,_pktlen) do { \ + if (bpf_peers_present((_ifp)->if_bpf)) \ + bpf_tap((_ifp)->if_bpf, (_pkt), (_pktlen)); \ +} while (0) +#define BPF_MTAP(_ifp,_m) do { \ + if (bpf_peers_present((_ifp)->if_bpf)) { \ + M_ASSERTVALID(_m); \ + bpf_mtap((_ifp)->if_bpf, (_m)); \ + } \ +} while (0) +#define BPF_MTAP2(_ifp,_data,_dlen,_m) do { \ + if (bpf_peers_present((_ifp)->if_bpf)) { \ + M_ASSERTVALID(_m); \ + bpf_mtap2((_ifp)->if_bpf,(_data),(_dlen),(_m)); \ + } \ +} while (0) +#endif + +/* + * Number of scratch memory words (for BPF_LD|BPF_MEM and BPF_ST). + */ +#define BPF_MEMWORDS 16 + +#endif /* _NET_BPF_HH_ */ diff --git a/freebsd/sys/net/bpf_buffer.c b/freebsd/sys/net/bpf_buffer.c new file mode 100644 index 00000000..623b4f8a --- /dev/null +++ b/freebsd/sys/net/bpf_buffer.c @@ -0,0 +1,212 @@ +#include + +/*- + * Copyright (c) 2007 Seccuris Inc. + * All rights reserved. + * + * This sofware was developed by Robert N. M. Watson under contract to + * Seccuris Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Copyright (c) 1990, 1991, 1993 + * The Regents of the University of California. All rights reserved. 
+ *
+ * This code is derived from the Stanford/CMU enet packet filter,
+ * (net/enet.c) distributed as part of 4.3BSD, and code contributed
+ * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence
+ * Berkeley Laboratory.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *      @(#)bpf.c	8.4 (Berkeley) 1/9/95
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+/*
+ * Implement the historical kernel memory buffering model for BPF: two
+ * malloc(9) kernel buffers are hung off of the descriptor.  The size is
+ * fixed prior to attaching to an ifnet, and cannot be changed after that.
+ * read(2) simply copies the data to user space using uiomove(9).
+ */
+
+static int bpf_bufsize = 4096;
+SYSCTL_INT(_net_bpf, OID_AUTO, bufsize, CTLFLAG_RW,
+    &bpf_bufsize, 0, "Default capture buffer size in bytes");
+static int bpf_maxbufsize = BPF_MAXBUFSIZE;
+SYSCTL_INT(_net_bpf, OID_AUTO, maxbufsize, CTLFLAG_RW,
+    &bpf_maxbufsize, 0, "Maximum capture buffer size in bytes");
+
+void
+bpf_buffer_alloc(struct bpf_d *d)
+{
+
+	KASSERT(d->bd_fbuf == NULL, ("bpf_buffer_alloc: bd_fbuf != NULL"));
+	KASSERT(d->bd_sbuf == NULL, ("bpf_buffer_alloc: bd_sbuf != NULL"));
+	KASSERT(d->bd_hbuf == NULL, ("bpf_buffer_alloc: bd_hbuf != NULL"));
+
+	d->bd_fbuf = (caddr_t)malloc(d->bd_bufsize, M_BPF, M_WAITOK);
+	d->bd_sbuf = (caddr_t)malloc(d->bd_bufsize, M_BPF, M_WAITOK);
+	d->bd_hbuf = NULL;
+	d->bd_slen = 0;
+	d->bd_hlen = 0;
+}
+
+/*
+ * Simple data copy to the current kernel buffer.
+ */
+void
+bpf_buffer_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset,
+    void *src, u_int len)
+{
+	u_char *src_bytes;
+
+	src_bytes = (u_char *)src;
+	bcopy(src_bytes, buf + offset, len);
+}
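+
+/*
+ * With this model, a read(2) on the descriptor returns a sequence of
+ * bpf_hdr-framed records, which a consumer walks roughly as follows
+ * (an illustrative sketch: cc is the byte count returned by read(2)
+ * and handle_packet is a hypothetical callback):
+ *
+ *	u_char *p = buf;
+ *
+ *	while (p < buf + cc) {
+ *		struct bpf_hdr *bh = (struct bpf_hdr *)p;
+ *
+ *		handle_packet(p + bh->bh_hdrlen, bh->bh_caplen);
+ *		p += BPF_WORDALIGN(bh->bh_hdrlen + bh->bh_caplen);
+ *	}
+ */
+
+/*
+ * Scatter-gather data copy from an mbuf chain to the current kernel buffer.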
+ */ +void +bpf_buffer_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset, void *src, + u_int len) +{ + const struct mbuf *m; + u_char *dst; + u_int count; + + m = (struct mbuf *)src; + dst = (u_char *)buf + offset; + while (len > 0) { + if (m == NULL) + panic("bpf_mcopy"); + count = min(m->m_len, len); + bcopy(mtod(m, void *), dst, count); + m = m->m_next; + dst += count; + len -= count; + } +} + +/* + * Free BPF kernel buffers on device close. + */ +void +bpf_buffer_free(struct bpf_d *d) +{ + + if (d->bd_sbuf != NULL) + free(d->bd_sbuf, M_BPF); + if (d->bd_hbuf != NULL) + free(d->bd_hbuf, M_BPF); + if (d->bd_fbuf != NULL) + free(d->bd_fbuf, M_BPF); + +#ifdef INVARIANTS + d->bd_sbuf = d->bd_hbuf = d->bd_fbuf = (caddr_t)~0; +#endif +} + +/* + * This is a historical initialization that occurs when the BPF descriptor is + * first opened. It does not imply selection of a buffer mode, so we don't + * allocate buffers here. + */ +void +bpf_buffer_init(struct bpf_d *d) +{ + + d->bd_bufsize = bpf_bufsize; +} + +/* + * Allocate or resize buffers. + */ +int +bpf_buffer_ioctl_sblen(struct bpf_d *d, u_int *i) +{ + u_int size; + + BPFD_LOCK(d); + if (d->bd_bif != NULL) { + BPFD_UNLOCK(d); + return (EINVAL); + } + size = *i; + if (size > bpf_maxbufsize) + *i = size = bpf_maxbufsize; + else if (size < BPF_MINBUFSIZE) + *i = size = BPF_MINBUFSIZE; + d->bd_bufsize = size; + BPFD_UNLOCK(d); + return (0); +} + +/* + * Copy buffer storage to user space in read(). + */ +int +bpf_buffer_uiomove(struct bpf_d *d, caddr_t buf, u_int len, struct uio *uio) +{ + + return (uiomove(buf, len, uio)); +} diff --git a/freebsd/sys/net/bpf_buffer.h b/freebsd/sys/net/bpf_buffer.h new file mode 100644 index 00000000..545ddb22 --- /dev/null +++ b/freebsd/sys/net/bpf_buffer.h @@ -0,0 +1,50 @@ +/*- + * Copyright (c) 2007 Seccuris Inc. + * All rights reserved. + * + * This sofware was developed by Robert N. M. Watson under contract to + * Seccuris Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _NET_BPF_BUFFER_HH_ +#define _NET_BPF_BUFFER_HH_ + +#ifndef _KERNEL +#error "no user-serviceable parts inside" +#endif + +void bpf_buffer_alloc(struct bpf_d *d); +void bpf_buffer_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset, + void *src, u_int len); +void bpf_buffer_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset, + void *src, u_int len); +void bpf_buffer_free(struct bpf_d *d); +void bpf_buffer_init(struct bpf_d *d); +int bpf_buffer_ioctl_sblen(struct bpf_d *d, u_int *i); +int bpf_buffer_uiomove(struct bpf_d *d, caddr_t buf, u_int len, + struct uio *uio); + +#endif /* !_NET_BPF_BUFFER_HH_ */ diff --git a/freebsd/sys/net/bpf_filter.c b/freebsd/sys/net/bpf_filter.c new file mode 100644 index 00000000..3452cc4a --- /dev/null +++ b/freebsd/sys/net/bpf_filter.c @@ -0,0 +1,582 @@ +#include + +/*- + * Copyright (c) 1990, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from the Stanford/CMU enet packet filter, + * (net/enet.c) distributed as part of 4.3BSD, and code contributed + * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence + * Berkeley Laboratory. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)bpf_filter.c 8.1 (Berkeley) 6/10/93 + */ + +#include +__FBSDID("$FreeBSD$"); + +#include + +#if !defined(_KERNEL) || defined(sun) +#include +#endif + +#ifndef __i386__ +#define BPF_ALIGN +#endif + +#ifndef BPF_ALIGN +#define EXTRACT_SHORT(p) ((u_int16_t)ntohs(*(u_int16_t *)p)) +#define EXTRACT_LONG(p) (ntohl(*(u_int32_t *)p)) +#else +#define EXTRACT_SHORT(p)\ + ((u_int16_t)\ + ((u_int16_t)*((u_char *)p+0)<<8|\ + (u_int16_t)*((u_char *)p+1)<<0)) +#define EXTRACT_LONG(p)\ + ((u_int32_t)*((u_char *)p+0)<<24|\ + (u_int32_t)*((u_char *)p+1)<<16|\ + (u_int32_t)*((u_char *)p+2)<<8|\ + (u_int32_t)*((u_char *)p+3)<<0) +#endif + +#ifdef _KERNEL +#include +#else +#include +#endif +#include +#ifdef _KERNEL +#define MINDEX(m, k) \ +{ \ + register int len = m->m_len; \ + \ + while (k >= len) { \ + k -= len; \ + m = m->m_next; \ + if (m == 0) \ + return (0); \ + len = m->m_len; \ + } \ +} + +static u_int16_t m_xhalf(struct mbuf *m, bpf_u_int32 k, int *err); +static u_int32_t m_xword(struct mbuf *m, bpf_u_int32 k, int *err); + +static u_int32_t +m_xword(struct mbuf *m, bpf_u_int32 k, int *err) +{ + size_t len; + u_char *cp, *np; + struct mbuf *m0; + + len = m->m_len; + while (k >= len) { + k -= len; + m = m->m_next; + if (m == 0) + goto bad; + len = m->m_len; + } + cp = mtod(m, u_char *) + k; + if (len - k >= 4) { + *err = 0; + return (EXTRACT_LONG(cp)); + } + m0 = m->m_next; + if (m0 == 0 || m0->m_len + len - k < 4) + goto bad; + *err = 0; + np = mtod(m0, u_char *); + switch (len - k) { + case 1: + return (((u_int32_t)cp[0] << 24) | + ((u_int32_t)np[0] << 16) | + ((u_int32_t)np[1] << 8) | + (u_int32_t)np[2]); + + case 2: + return (((u_int32_t)cp[0] << 24) | + ((u_int32_t)cp[1] << 16) | + ((u_int32_t)np[0] << 8) | + (u_int32_t)np[1]); + + default: + return (((u_int32_t)cp[0] << 24) | + ((u_int32_t)cp[1] << 16) | + ((u_int32_t)cp[2] << 8) | + (u_int32_t)np[0]); + } + bad: + *err = 1; + return (0); +} + +static u_int16_t +m_xhalf(struct mbuf *m, bpf_u_int32 k, int *err) +{ + size_t len; + u_char *cp; + struct mbuf *m0; + + len = m->m_len; + while (k >= len) { + k -= len; + m = m->m_next; + if (m == 0) + goto bad; + len = m->m_len; + } + cp = mtod(m, u_char *) + k; + if (len - k >= 2) { + *err = 0; + return (EXTRACT_SHORT(cp)); + } + m0 = m->m_next; + if (m0 == 0) + goto bad; + *err = 0; + return ((cp[0] << 8) | mtod(m0, u_char *)[0]); + bad: + *err = 1; + return (0); +} +#endif + +/* + * Execute the filter program starting at pc on the packet p + * wirelen is the length of the original packet + * buflen is the amount of data present + */ +u_int +bpf_filter(const struct bpf_insn *pc, u_char *p, u_int wirelen, u_int buflen) +{ + u_int32_t A = 0, X = 0; + bpf_u_int32 k; + u_int32_t mem[BPF_MEMWORDS]; + + if (pc == NULL) + /* + * No filter means accept all. 
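+		 * The filter's return value is the number of bytes of
+		 * the packet to save; (u_int)-1 therefore captures the
+		 * entire packet.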
+ */ + return ((u_int)-1); + + --pc; + while (1) { + ++pc; + switch (pc->code) { + default: +#ifdef _KERNEL + return (0); +#else + abort(); +#endif + + case BPF_RET|BPF_K: + return ((u_int)pc->k); + + case BPF_RET|BPF_A: + return ((u_int)A); + + case BPF_LD|BPF_W|BPF_ABS: + k = pc->k; + if (k > buflen || sizeof(int32_t) > buflen - k) { +#ifdef _KERNEL + int merr; + + if (buflen != 0) + return (0); + A = m_xword((struct mbuf *)p, k, &merr); + if (merr != 0) + return (0); + continue; +#else + return (0); +#endif + } +#ifdef BPF_ALIGN + if (((intptr_t)(p + k) & 3) != 0) + A = EXTRACT_LONG(&p[k]); + else +#endif + A = ntohl(*(int32_t *)(p + k)); + continue; + + case BPF_LD|BPF_H|BPF_ABS: + k = pc->k; + if (k > buflen || sizeof(int16_t) > buflen - k) { +#ifdef _KERNEL + int merr; + + if (buflen != 0) + return (0); + A = m_xhalf((struct mbuf *)p, k, &merr); + continue; +#else + return (0); +#endif + } + A = EXTRACT_SHORT(&p[k]); + continue; + + case BPF_LD|BPF_B|BPF_ABS: + k = pc->k; + if (k >= buflen) { +#ifdef _KERNEL + struct mbuf *m; + + if (buflen != 0) + return (0); + m = (struct mbuf *)p; + MINDEX(m, k); + A = mtod(m, u_char *)[k]; + continue; +#else + return (0); +#endif + } + A = p[k]; + continue; + + case BPF_LD|BPF_W|BPF_LEN: + A = wirelen; + continue; + + case BPF_LDX|BPF_W|BPF_LEN: + X = wirelen; + continue; + + case BPF_LD|BPF_W|BPF_IND: + k = X + pc->k; + if (pc->k > buflen || X > buflen - pc->k || + sizeof(int32_t) > buflen - k) { +#ifdef _KERNEL + int merr; + + if (buflen != 0) + return (0); + A = m_xword((struct mbuf *)p, k, &merr); + if (merr != 0) + return (0); + continue; +#else + return (0); +#endif + } +#ifdef BPF_ALIGN + if (((intptr_t)(p + k) & 3) != 0) + A = EXTRACT_LONG(&p[k]); + else +#endif + A = ntohl(*(int32_t *)(p + k)); + continue; + + case BPF_LD|BPF_H|BPF_IND: + k = X + pc->k; + if (X > buflen || pc->k > buflen - X || + sizeof(int16_t) > buflen - k) { +#ifdef _KERNEL + int merr; + + if (buflen != 0) + return (0); + A = m_xhalf((struct mbuf *)p, k, &merr); + if (merr != 0) + return (0); + continue; +#else + return (0); +#endif + } + A = EXTRACT_SHORT(&p[k]); + continue; + + case BPF_LD|BPF_B|BPF_IND: + k = X + pc->k; + if (pc->k >= buflen || X >= buflen - pc->k) { +#ifdef _KERNEL + struct mbuf *m; + + if (buflen != 0) + return (0); + m = (struct mbuf *)p; + MINDEX(m, k); + A = mtod(m, u_char *)[k]; + continue; +#else + return (0); +#endif + } + A = p[k]; + continue; + + case BPF_LDX|BPF_MSH|BPF_B: + k = pc->k; + if (k >= buflen) { +#ifdef _KERNEL + register struct mbuf *m; + + if (buflen != 0) + return (0); + m = (struct mbuf *)p; + MINDEX(m, k); + X = (mtod(m, u_char *)[k] & 0xf) << 2; + continue; +#else + return (0); +#endif + } + X = (p[pc->k] & 0xf) << 2; + continue; + + case BPF_LD|BPF_IMM: + A = pc->k; + continue; + + case BPF_LDX|BPF_IMM: + X = pc->k; + continue; + + case BPF_LD|BPF_MEM: + A = mem[pc->k]; + continue; + + case BPF_LDX|BPF_MEM: + X = mem[pc->k]; + continue; + + case BPF_ST: + mem[pc->k] = A; + continue; + + case BPF_STX: + mem[pc->k] = X; + continue; + + case BPF_JMP|BPF_JA: + pc += pc->k; + continue; + + case BPF_JMP|BPF_JGT|BPF_K: + pc += (A > pc->k) ? pc->jt : pc->jf; + continue; + + case BPF_JMP|BPF_JGE|BPF_K: + pc += (A >= pc->k) ? pc->jt : pc->jf; + continue; + + case BPF_JMP|BPF_JEQ|BPF_K: + pc += (A == pc->k) ? pc->jt : pc->jf; + continue; + + case BPF_JMP|BPF_JSET|BPF_K: + pc += (A & pc->k) ? pc->jt : pc->jf; + continue; + + case BPF_JMP|BPF_JGT|BPF_X: + pc += (A > X) ? 
pc->jt : pc->jf; + continue; + + case BPF_JMP|BPF_JGE|BPF_X: + pc += (A >= X) ? pc->jt : pc->jf; + continue; + + case BPF_JMP|BPF_JEQ|BPF_X: + pc += (A == X) ? pc->jt : pc->jf; + continue; + + case BPF_JMP|BPF_JSET|BPF_X: + pc += (A & X) ? pc->jt : pc->jf; + continue; + + case BPF_ALU|BPF_ADD|BPF_X: + A += X; + continue; + + case BPF_ALU|BPF_SUB|BPF_X: + A -= X; + continue; + + case BPF_ALU|BPF_MUL|BPF_X: + A *= X; + continue; + + case BPF_ALU|BPF_DIV|BPF_X: + if (X == 0) + return (0); + A /= X; + continue; + + case BPF_ALU|BPF_AND|BPF_X: + A &= X; + continue; + + case BPF_ALU|BPF_OR|BPF_X: + A |= X; + continue; + + case BPF_ALU|BPF_LSH|BPF_X: + A <<= X; + continue; + + case BPF_ALU|BPF_RSH|BPF_X: + A >>= X; + continue; + + case BPF_ALU|BPF_ADD|BPF_K: + A += pc->k; + continue; + + case BPF_ALU|BPF_SUB|BPF_K: + A -= pc->k; + continue; + + case BPF_ALU|BPF_MUL|BPF_K: + A *= pc->k; + continue; + + case BPF_ALU|BPF_DIV|BPF_K: + A /= pc->k; + continue; + + case BPF_ALU|BPF_AND|BPF_K: + A &= pc->k; + continue; + + case BPF_ALU|BPF_OR|BPF_K: + A |= pc->k; + continue; + + case BPF_ALU|BPF_LSH|BPF_K: + A <<= pc->k; + continue; + + case BPF_ALU|BPF_RSH|BPF_K: + A >>= pc->k; + continue; + + case BPF_ALU|BPF_NEG: + A = -A; + continue; + + case BPF_MISC|BPF_TAX: + X = A; + continue; + + case BPF_MISC|BPF_TXA: + A = X; + continue; + } + } +} + +#ifdef _KERNEL +static const u_short bpf_code_map[] = { + 0x10ff, /* 0x00-0x0f: 1111111100001000 */ + 0x3070, /* 0x10-0x1f: 0000111000001100 */ + 0x3131, /* 0x20-0x2f: 1000110010001100 */ + 0x3031, /* 0x30-0x3f: 1000110000001100 */ + 0x3131, /* 0x40-0x4f: 1000110010001100 */ + 0x1011, /* 0x50-0x5f: 1000100000001000 */ + 0x1013, /* 0x60-0x6f: 1100100000001000 */ + 0x1010, /* 0x70-0x7f: 0000100000001000 */ + 0x0093, /* 0x80-0x8f: 1100100100000000 */ + 0x0000, /* 0x90-0x9f: 0000000000000000 */ + 0x0000, /* 0xa0-0xaf: 0000000000000000 */ + 0x0002, /* 0xb0-0xbf: 0100000000000000 */ + 0x0000, /* 0xc0-0xcf: 0000000000000000 */ + 0x0000, /* 0xd0-0xdf: 0000000000000000 */ + 0x0000, /* 0xe0-0xef: 0000000000000000 */ + 0x0000 /* 0xf0-0xff: 0000000000000000 */ +}; + +#define BPF_VALIDATE_CODE(c) \ + ((c) <= 0xff && (bpf_code_map[(c) >> 4] & (1 << ((c) & 0xf))) != 0) + +/* + * Return true if the 'fcode' is a valid filter program. + * The constraints are that each jump be forward and to a valid + * code. The code must terminate with either an accept or reject. + * + * The kernel needs to be able to verify an application's filter code. + * Otherwise, a bogus program could easily crash the system. + */ +int +bpf_validate(const struct bpf_insn *f, int len) +{ + register int i; + register const struct bpf_insn *p; + + /* Do not accept negative length filter. */ + if (len < 0) + return (0); + + /* An empty filter means accept all. */ + if (len == 0) + return (1); + + for (i = 0; i < len; ++i) { + p = &f[i]; + /* + * Check that the code is valid. + */ + if (!BPF_VALIDATE_CODE(p->code)) + return (0); + /* + * Check that that jumps are forward, and within + * the code block. + */ + if (BPF_CLASS(p->code) == BPF_JMP) { + register u_int offset; + + if (p->code == (BPF_JMP|BPF_JA)) + offset = p->k; + else + offset = p->jt > p->jf ? p->jt : p->jf; + if (offset >= (u_int)(len - i) - 1) + return (0); + continue; + } + /* + * Check that memory operations use valid addresses. 
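+		 * A scratch-memory index is valid only if it is below
+		 * BPF_MEMWORDS, the size of the mem[] array used by
+		 * bpf_filter().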
+ */ + if (p->code == BPF_ST || p->code == BPF_STX || + p->code == (BPF_LD|BPF_MEM) || + p->code == (BPF_LDX|BPF_MEM)) { + if (p->k >= BPF_MEMWORDS) + return (0); + continue; + } + /* + * Check for constant division by 0. + */ + if (p->code == (BPF_ALU|BPF_DIV|BPF_K) && p->k == 0) + return (0); + } + return (BPF_CLASS(f[len - 1].code) == BPF_RET); +} +#endif diff --git a/freebsd/sys/net/bpf_jitter.c b/freebsd/sys/net/bpf_jitter.c new file mode 100644 index 00000000..bb373725 --- /dev/null +++ b/freebsd/sys/net/bpf_jitter.c @@ -0,0 +1,143 @@ +#include + +/*- + * Copyright (C) 2002-2003 NetGroup, Politecnico di Torino (Italy) + * Copyright (C) 2005-2008 Jung-uk Kim + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the Politecnico di Torino nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#ifdef _KERNEL +#include + +#include +#include +#include +#include +#include +#else +#include +#include +#include +#endif + +#include +#include + +bpf_filter_func bpf_jit_compile(struct bpf_insn *, u_int, int *); + +static u_int bpf_jit_accept_all(u_char *, u_int, u_int); + +#ifdef _KERNEL +MALLOC_DEFINE(M_BPFJIT, "BPF_JIT", "BPF JIT compiler"); + +SYSCTL_NODE(_net, OID_AUTO, bpf_jitter, CTLFLAG_RW, 0, "BPF JIT compiler"); +int bpf_jitter_enable = 1; +SYSCTL_INT(_net_bpf_jitter, OID_AUTO, enable, CTLFLAG_RW, + &bpf_jitter_enable, 0, "enable BPF JIT compiler"); + +bpf_jit_filter * +bpf_jitter(struct bpf_insn *fp, int nins) +{ + bpf_jit_filter *filter; + + /* Allocate the filter structure */ + filter = (struct bpf_jit_filter *)malloc(sizeof(*filter), + M_BPFJIT, M_NOWAIT | M_ZERO); + if (filter == NULL) + return (NULL); + + /* No filter means accept all */ + if (fp == NULL || nins == 0) { + filter->func = bpf_jit_accept_all; + return (filter); + } + + /* Create the binary */ + if ((filter->func = bpf_jit_compile(fp, nins, filter->mem)) == NULL) { + free(filter, M_BPFJIT); + return (NULL); + } + + return (filter); +} + +void +bpf_destroy_jit_filter(bpf_jit_filter *filter) +{ + + if (filter->func != bpf_jit_accept_all) + free(filter->func, M_BPFJIT); + free(filter, M_BPFJIT); +} +#else +bpf_jit_filter * +bpf_jitter(struct bpf_insn *fp, int nins) +{ + bpf_jit_filter *filter; + + /* Allocate the filter structure */ + filter = (struct bpf_jit_filter *)malloc(sizeof(*filter)); + if (filter == NULL) + return (NULL); + memset(filter, 0, sizeof(*filter)); + + /* No filter means accept all */ + if (fp == NULL || nins == 0) { + filter->func = bpf_jit_accept_all; + return (filter); + } + + /* Create the binary */ + if ((filter->func = bpf_jit_compile(fp, nins, filter->mem)) == NULL) { + free(filter); + return (NULL); + } + + return (filter); +} + +void +bpf_destroy_jit_filter(bpf_jit_filter *filter) +{ + + if (filter->func != bpf_jit_accept_all) + free(filter->func); + free(filter); +} +#endif + +static u_int +bpf_jit_accept_all(__unused u_char *p, __unused u_int wirelen, + __unused u_int buflen) +{ + + return ((u_int)-1); +} diff --git a/freebsd/sys/net/bpf_jitter.h b/freebsd/sys/net/bpf_jitter.h new file mode 100644 index 00000000..c0dd7e04 --- /dev/null +++ b/freebsd/sys/net/bpf_jitter.h @@ -0,0 +1,84 @@ +/*- + * Copyright (C) 2002-2003 NetGroup, Politecnico di Torino (Italy) + * Copyright (C) 2005-2008 Jung-uk Kim + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the Politecnico di Torino nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _NET_BPF_JITTER_HH_ +#define _NET_BPF_JITTER_HH_ + +#ifdef _KERNEL +MALLOC_DECLARE(M_BPFJIT); +#endif + +extern int bpf_jitter_enable; + +/* + * Prototype of a filtering function created by the jitter. + * + * The syntax and the meaning of the parameters are analogous to those of + * bpf_filter(). Notice that the filter is not among the parameters because + * it is hardwired in the function. + */ +typedef u_int (*bpf_filter_func)(u_char *, u_int, u_int); + +/* Structure describing a native filtering program created by the jitter. */ +typedef struct bpf_jit_filter { + /* The native filtering binary, in the form of a bpf_filter_func. */ + bpf_filter_func func; + + int mem[BPF_MEMWORDS]; /* Scratch memory */ +} bpf_jit_filter; + +/* + * BPF jitter: builds a machine function from a BPF program. + * + * param fp The BPF pseudo-assembly filter that will be translated + * into native code. + * param nins Number of instructions of the input filter. + * return The bpf_jit_filter structure containing the native filtering + * binary. + * + * bpf_jitter allocates the buffers for the new native filter and + * then translates the program pointed to by fp by calling bpf_jit_compile(). + */ +bpf_jit_filter *bpf_jitter(struct bpf_insn *fp, int nins); + +/* + * Deletes a filtering function that was previously created by bpf_jitter(). + * + * param filter The filter to destroy. + * + * This function frees the various buffers (code, memory, etc.) associated + * with a filtering function. + */ +void bpf_destroy_jit_filter(bpf_jit_filter *filter); + +#endif /* _NET_BPF_JITTER_HH_ */ diff --git a/freebsd/sys/net/bpf_zerocopy.h b/freebsd/sys/net/bpf_zerocopy.h new file mode 100644 index 00000000..455bd41c --- /dev/null +++ b/freebsd/sys/net/bpf_zerocopy.h @@ -0,0 +1,56 @@ +/*- + * Copyright (c) 2007 Seccuris Inc. + * All rights reserved. + * + * This software was developed by Robert N. M. Watson under contract to + * Seccuris Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.
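+ *
+ * (On the bpf_jit_filter layout above: bpf_jitter() hands filter->mem
+ * to bpf_jit_compile(), so the generated code addresses the same
+ * BPF_MEMWORDS scratch words the interpreter would use; for example
+ *
+ *	BPF_STMT(BPF_ST, 1)		M[1] in the interpreter
+ *
+ * corresponds to filter->mem[1] in the compiled filter.)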
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _NET_BPF_ZEROCOPY_HH_ +#define _NET_BPF_ZEROCOPY_HH_ + +#ifndef _KERNEL +#error "no user-serviceable parts inside" +#endif + +void bpf_zerocopy_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset, + void *src, u_int len); +void bpf_zerocopy_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset, + void *src, u_int len); +void bpf_zerocopy_buffull(struct bpf_d *); +void bpf_zerocopy_bufheld(struct bpf_d *); +void bpf_zerocopy_buf_reclaimed(struct bpf_d *); +int bpf_zerocopy_canfreebuf(struct bpf_d *); +int bpf_zerocopy_canwritebuf(struct bpf_d *); +void bpf_zerocopy_free(struct bpf_d *d); +int bpf_zerocopy_ioctl_getzmax(struct thread *td, struct bpf_d *d, + size_t *i); +int bpf_zerocopy_ioctl_rotzbuf(struct thread *td, struct bpf_d *d, + struct bpf_zbuf *bz); +int bpf_zerocopy_ioctl_setzbuf(struct thread *td, struct bpf_d *d, + struct bpf_zbuf *bz); + +#endif /* !_NET_BPF_ZEROCOPY_HH_ */ diff --git a/freebsd/sys/net/bpfdesc.h b/freebsd/sys/net/bpfdesc.h new file mode 100644 index 00000000..d28ecca2 --- /dev/null +++ b/freebsd/sys/net/bpfdesc.h @@ -0,0 +1,149 @@ +/*- + * Copyright (c) 1990, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from the Stanford/CMU enet packet filter, + * (net/enet.c) distributed as part of 4.3BSD, and code contributed + * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence + * Berkeley Laboratory. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
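+ *
+ * (The bpf_zerocopy_ioctl_* handlers declared above back the
+ * BIOCGETZMAX, BIOCSETZBUF and BIOCROTZBUF ioctls; assuming the
+ * struct bpf_zbuf layout from net/bpf.h, a userland consumer would do
+ * roughly:
+ *
+ *	size_t zmax;
+ *	struct bpf_zbuf bz = { bufa, bufb, buflen };
+ *	ioctl(fd, BIOCGETZMAX, &zmax);	largest permitted buffer size
+ *	ioctl(fd, BIOCSETZBUF, &bz);	install the two shared buffers
+ *	ioctl(fd, BIOCROTZBUF, &bz);	force rotation to claim partial data
+ *
+ * with bufa and bufb being page-aligned process memory of buflen bytes
+ * each.)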
+ * + * @(#)bpfdesc.h 8.1 (Berkeley) 6/10/93 + * + * $FreeBSD$ + */ + +#ifndef _NET_BPFDESC_HH_ +#define _NET_BPFDESC_HH_ + +#include +#include +#include +#include +#include + +/* + * Descriptor associated with each open bpf file. + */ +struct zbuf; +struct bpf_d { + LIST_ENTRY(bpf_d) bd_next; /* Linked list of descriptors */ + /* + * Buffer slots: two memory buffers store the incoming packets. + * The model has three slots. Sbuf is always occupied. + * sbuf (store) - Receive interrupt puts packets here. + * hbuf (hold) - When sbuf is full, put buffer here and + * wakeup read (replace sbuf with fbuf). + * fbuf (free) - When read is done, put buffer here. + * On receiving, if sbuf is full and fbuf is 0, packet is dropped. + */ + caddr_t bd_sbuf; /* store slot */ + caddr_t bd_hbuf; /* hold slot */ + caddr_t bd_fbuf; /* free slot */ + int bd_slen; /* current length of store buffer */ + int bd_hlen; /* current length of hold buffer */ + + int bd_bufsize; /* absolute length of buffers */ + + struct bpf_if * bd_bif; /* interface descriptor */ + u_long bd_rtout; /* Read timeout in 'ticks' */ + struct bpf_insn *bd_rfilter; /* read filter code */ + struct bpf_insn *bd_wfilter; /* write filter code */ + void *bd_bfilter; /* binary filter code */ + u_int64_t bd_rcount; /* number of packets received */ + u_int64_t bd_dcount; /* number of packets dropped */ + + u_char bd_promisc; /* true if listening promiscuously */ + u_char bd_state; /* idle, waiting, or timed out */ + u_char bd_immediate; /* true to return on packet arrival */ + int bd_hdrcmplt; /* false to fill in src lladdr automatically */ + int bd_direction; /* select packet direction */ + int bd_feedback; /* true to feed back sent packets */ + int bd_async; /* non-zero if packet reception should generate signal */ + int bd_sig; /* signal to send upon packet reception */ + struct sigio * bd_sigio; /* information for async I/O */ + struct selinfo bd_sel; /* bsd select info */ + struct mtx bd_mtx; /* mutex for this descriptor */ + struct callout bd_callout; /* for BPF timeouts with select */ + struct label *bd_label; /* MAC label for descriptor */ + u_int64_t bd_fcount; /* number of packets which matched filter */ + pid_t bd_pid; /* PID which created descriptor */ + int bd_locked; /* true if descriptor is locked */ + u_int bd_bufmode; /* Current buffer mode. */ + u_int64_t bd_wcount; /* number of packets written */ + u_int64_t bd_wfcount; /* number of packets that matched write filter */ + u_int64_t bd_wdcount; /* number of packets dropped during a write */ + u_int64_t bd_zcopy; /* number of zero copy operations */ + u_char bd_compat32; /* 32-bit stream on LP64 system */ +}; + +/* Values for bd_state */ +#define BPF_IDLE 0 /* no select in progress */ +#define BPF_WAITING 1 /* waiting for read timeout in select */ +#define BPF_TIMED_OUT 2 /* read timeout has expired in select */ + +#define BPFD_LOCK(bd) mtx_lock(&(bd)->bd_mtx) +#define BPFD_UNLOCK(bd) mtx_unlock(&(bd)->bd_mtx) +#define BPFD_LOCK_ASSERT(bd) mtx_assert(&(bd)->bd_mtx, MA_OWNED) + +/* + * External representation of the bpf descriptor + */ +struct xbpf_d { + u_int bd_structsize; /* Size of this structure. 
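+ *
+ * (Sketch of the store/hold/free rotation documented in struct bpf_d
+ * above, mirroring what the ROTATE_BUFFERS() macro in bpf.c does when
+ * the store buffer fills while the hold slot is empty:
+ *
+ *	d->bd_hbuf = d->bd_sbuf;	full store buffer goes on hold
+ *	d->bd_hlen = d->bd_slen;
+ *	d->bd_sbuf = d->bd_fbuf;	free buffer becomes the new store
+ *	d->bd_slen = 0;
+ *	d->bd_fbuf = NULL;		reader returns it when done
+ *
+ * If bd_fbuf is NULL at that point the packet is dropped, as the
+ * buffer-model comment above notes.)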
*/ + u_char bd_promisc; + u_char bd_immediate; + u_char __bd_pad[6]; + int bd_hdrcmplt; + int bd_direction; + int bd_feedback; + int bd_async; + u_int64_t bd_rcount; + u_int64_t bd_dcount; + u_int64_t bd_fcount; + int bd_sig; + int bd_slen; + int bd_hlen; + int bd_bufsize; + pid_t bd_pid; + char bd_ifname[IFNAMSIZ]; + int bd_locked; + u_int64_t bd_wcount; + u_int64_t bd_wfcount; + u_int64_t bd_wdcount; + u_int64_t bd_zcopy; + int bd_bufmode; + /* + * Allocate 4 64 bit unsigned integers for future expansion so we do + * not have to worry about breaking the ABI. + */ + u_int64_t bd_spare[4]; +}; + +#define BPFIF_LOCK(bif) mtx_lock(&(bif)->bif_mtx) +#define BPFIF_UNLOCK(bif) mtx_unlock(&(bif)->bif_mtx) + +#endif diff --git a/freebsd/sys/net/bridgestp.c b/freebsd/sys/net/bridgestp.c new file mode 100644 index 00000000..2191be26 --- /dev/null +++ b/freebsd/sys/net/bridgestp.c @@ -0,0 +1,2250 @@ +#include + +/* $NetBSD: bridgestp.c,v 1.5 2003/11/28 08:56:48 keihan Exp $ */ + +/* + * Copyright (c) 2000 Jason L. Wright (jason@thought.net) + * Copyright (c) 2006 Andrew Thompson (thompsa@FreeBSD.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * OpenBSD: bridgestp.c,v 1.5 2001/03/22 03:48:29 jason Exp + */ + +/* + * Implementation of the spanning tree protocol as defined in + * ISO/IEC 802.1D-2004, June 9, 2004. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#ifdef BRIDGESTP_DEBUG +#define DPRINTF(fmt, arg...) printf("bstp: " fmt, ##arg) +#else +#define DPRINTF(fmt, arg...) 
(void)0 +#endif + +#define PV2ADDR(pv, eaddr) do { \ + eaddr[0] = pv >> 40; \ + eaddr[1] = pv >> 32; \ + eaddr[2] = pv >> 24; \ + eaddr[3] = pv >> 16; \ + eaddr[4] = pv >> 8; \ + eaddr[5] = pv >> 0; \ +} while (0) + +#define INFO_BETTER 1 +#define INFO_SAME 0 +#define INFO_WORSE -1 + +const uint8_t bstp_etheraddr[] = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 }; + +LIST_HEAD(, bstp_state) bstp_list; +static struct mtx bstp_list_mtx; + +static void bstp_transmit(struct bstp_state *, struct bstp_port *); +static void bstp_transmit_bpdu(struct bstp_state *, struct bstp_port *); +static void bstp_transmit_tcn(struct bstp_state *, struct bstp_port *); +static void bstp_decode_bpdu(struct bstp_port *, struct bstp_cbpdu *, + struct bstp_config_unit *); +static void bstp_send_bpdu(struct bstp_state *, struct bstp_port *, + struct bstp_cbpdu *); +static int bstp_pdu_flags(struct bstp_port *); +static void bstp_received_stp(struct bstp_state *, struct bstp_port *, + struct mbuf **, struct bstp_tbpdu *); +static void bstp_received_rstp(struct bstp_state *, struct bstp_port *, + struct mbuf **, struct bstp_tbpdu *); +static void bstp_received_tcn(struct bstp_state *, struct bstp_port *, + struct bstp_tcn_unit *); +static void bstp_received_bpdu(struct bstp_state *, struct bstp_port *, + struct bstp_config_unit *); +static int bstp_pdu_rcvtype(struct bstp_port *, struct bstp_config_unit *); +static int bstp_pdu_bettersame(struct bstp_port *, int); +static int bstp_info_cmp(struct bstp_pri_vector *, + struct bstp_pri_vector *); +static int bstp_info_superior(struct bstp_pri_vector *, + struct bstp_pri_vector *); +static void bstp_assign_roles(struct bstp_state *); +static void bstp_update_roles(struct bstp_state *, struct bstp_port *); +static void bstp_update_state(struct bstp_state *, struct bstp_port *); +static void bstp_update_tc(struct bstp_port *); +static void bstp_update_info(struct bstp_port *); +static void bstp_set_other_tcprop(struct bstp_port *); +static void bstp_set_all_reroot(struct bstp_state *); +static void bstp_set_all_sync(struct bstp_state *); +static void bstp_set_port_state(struct bstp_port *, int); +static void bstp_set_port_role(struct bstp_port *, int); +static void bstp_set_port_proto(struct bstp_port *, int); +static void bstp_set_port_tc(struct bstp_port *, int); +static void bstp_set_timer_tc(struct bstp_port *); +static void bstp_set_timer_msgage(struct bstp_port *); +static int bstp_rerooted(struct bstp_state *, struct bstp_port *); +static uint32_t bstp_calc_path_cost(struct bstp_port *); +static void bstp_notify_state(void *, int); +static void bstp_notify_rtage(void *, int); +static void bstp_ifupdstatus(struct bstp_state *, struct bstp_port *); +static void bstp_enable_port(struct bstp_state *, struct bstp_port *); +static void bstp_disable_port(struct bstp_state *, struct bstp_port *); +static void bstp_tick(void *); +static void bstp_timer_start(struct bstp_timer *, uint16_t); +static void bstp_timer_stop(struct bstp_timer *); +static void bstp_timer_latch(struct bstp_timer *); +static int bstp_timer_expired(struct bstp_timer *); +static void bstp_hello_timer_expiry(struct bstp_state *, + struct bstp_port *); +static void bstp_message_age_expiry(struct bstp_state *, + struct bstp_port *); +static void bstp_migrate_delay_expiry(struct bstp_state *, + struct bstp_port *); +static void bstp_edge_delay_expiry(struct bstp_state *, + struct bstp_port *); +static int bstp_addr_cmp(const uint8_t *, const uint8_t *); +static int bstp_same_bridgeid(uint64_t, uint64_t); +static 
void bstp_reinit(struct bstp_state *); + +static void +bstp_transmit(struct bstp_state *bs, struct bstp_port *bp) +{ + if (bs->bs_running == 0) + return; + + /* + * a PDU can only be sent if we have tx quota left and the + * hello timer is running. + */ + if (bp->bp_hello_timer.active == 0) { + /* Test if it needs to be reset */ + bstp_hello_timer_expiry(bs, bp); + return; + } + if (bp->bp_txcount > bs->bs_txholdcount) + /* Ran out of karma */ + return; + + if (bp->bp_protover == BSTP_PROTO_RSTP) { + bstp_transmit_bpdu(bs, bp); + bp->bp_tc_ack = 0; + } else { /* STP */ + switch (bp->bp_role) { + case BSTP_ROLE_DESIGNATED: + bstp_transmit_bpdu(bs, bp); + bp->bp_tc_ack = 0; + break; + + case BSTP_ROLE_ROOT: + bstp_transmit_tcn(bs, bp); + break; + } + } + bstp_timer_start(&bp->bp_hello_timer, bp->bp_desg_htime); + bp->bp_flags &= ~BSTP_PORT_NEWINFO; +} + +static void +bstp_transmit_bpdu(struct bstp_state *bs, struct bstp_port *bp) +{ + struct bstp_cbpdu bpdu; + + BSTP_LOCK_ASSERT(bs); + + bpdu.cbu_rootpri = htons(bp->bp_desg_pv.pv_root_id >> 48); + PV2ADDR(bp->bp_desg_pv.pv_root_id, bpdu.cbu_rootaddr); + + bpdu.cbu_rootpathcost = htonl(bp->bp_desg_pv.pv_cost); + + bpdu.cbu_bridgepri = htons(bp->bp_desg_pv.pv_dbridge_id >> 48); + PV2ADDR(bp->bp_desg_pv.pv_dbridge_id, bpdu.cbu_bridgeaddr); + + bpdu.cbu_portid = htons(bp->bp_port_id); + bpdu.cbu_messageage = htons(bp->bp_desg_msg_age); + bpdu.cbu_maxage = htons(bp->bp_desg_max_age); + bpdu.cbu_hellotime = htons(bp->bp_desg_htime); + bpdu.cbu_forwarddelay = htons(bp->bp_desg_fdelay); + + bpdu.cbu_flags = bstp_pdu_flags(bp); + + switch (bp->bp_protover) { + case BSTP_PROTO_STP: + bpdu.cbu_bpdutype = BSTP_MSGTYPE_CFG; + break; + + case BSTP_PROTO_RSTP: + bpdu.cbu_bpdutype = BSTP_MSGTYPE_RSTP; + break; + } + + bstp_send_bpdu(bs, bp, &bpdu); +} + +static void +bstp_transmit_tcn(struct bstp_state *bs, struct bstp_port *bp) +{ + struct bstp_tbpdu bpdu; + struct ifnet *ifp = bp->bp_ifp; + struct ether_header *eh; + struct mbuf *m; + + KASSERT(bp == bs->bs_root_port, ("%s: bad root port\n", __func__)); + + if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) + return; + + MGETHDR(m, M_DONTWAIT, MT_DATA); + if (m == NULL) + return; + + m->m_pkthdr.rcvif = ifp; + m->m_pkthdr.len = sizeof(*eh) + sizeof(bpdu); + m->m_len = m->m_pkthdr.len; + + eh = mtod(m, struct ether_header *); + + memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN); + memcpy(eh->ether_dhost, bstp_etheraddr, ETHER_ADDR_LEN); + eh->ether_type = htons(sizeof(bpdu)); + + bpdu.tbu_ssap = bpdu.tbu_dsap = LLC_8021D_LSAP; + bpdu.tbu_ctl = LLC_UI; + bpdu.tbu_protoid = 0; + bpdu.tbu_protover = 0; + bpdu.tbu_bpdutype = BSTP_MSGTYPE_TCN; + + memcpy(mtod(m, caddr_t) + sizeof(*eh), &bpdu, sizeof(bpdu)); + + bp->bp_txcount++; + ifp->if_transmit(ifp, m); +} + +static void +bstp_decode_bpdu(struct bstp_port *bp, struct bstp_cbpdu *cpdu, + struct bstp_config_unit *cu) +{ + int flags; + + cu->cu_pv.pv_root_id = + (((uint64_t)ntohs(cpdu->cbu_rootpri)) << 48) | + (((uint64_t)cpdu->cbu_rootaddr[0]) << 40) | + (((uint64_t)cpdu->cbu_rootaddr[1]) << 32) | + (((uint64_t)cpdu->cbu_rootaddr[2]) << 24) | + (((uint64_t)cpdu->cbu_rootaddr[3]) << 16) | + (((uint64_t)cpdu->cbu_rootaddr[4]) << 8) | + (((uint64_t)cpdu->cbu_rootaddr[5]) << 0); + + cu->cu_pv.pv_dbridge_id = + (((uint64_t)ntohs(cpdu->cbu_bridgepri)) << 48) | + (((uint64_t)cpdu->cbu_bridgeaddr[0]) << 40) | + (((uint64_t)cpdu->cbu_bridgeaddr[1]) << 32) | + (((uint64_t)cpdu->cbu_bridgeaddr[2]) << 24) | + (((uint64_t)cpdu->cbu_bridgeaddr[3]) << 16) | + 
(((uint64_t)cpdu->cbu_bridgeaddr[4]) << 8) | + (((uint64_t)cpdu->cbu_bridgeaddr[5]) << 0); + + cu->cu_pv.pv_cost = ntohl(cpdu->cbu_rootpathcost); + cu->cu_message_age = ntohs(cpdu->cbu_messageage); + cu->cu_max_age = ntohs(cpdu->cbu_maxage); + cu->cu_hello_time = ntohs(cpdu->cbu_hellotime); + cu->cu_forward_delay = ntohs(cpdu->cbu_forwarddelay); + cu->cu_pv.pv_dport_id = ntohs(cpdu->cbu_portid); + cu->cu_pv.pv_port_id = bp->bp_port_id; + cu->cu_message_type = cpdu->cbu_bpdutype; + + /* Strip off unused flags in STP mode */ + flags = cpdu->cbu_flags; + switch (cpdu->cbu_protover) { + case BSTP_PROTO_STP: + flags &= BSTP_PDU_STPMASK; + /* A STP BPDU explicitly conveys a Designated Port */ + cu->cu_role = BSTP_ROLE_DESIGNATED; + break; + + case BSTP_PROTO_RSTP: + flags &= BSTP_PDU_RSTPMASK; + break; + } + + cu->cu_topology_change_ack = + (flags & BSTP_PDU_F_TCA) ? 1 : 0; + cu->cu_proposal = + (flags & BSTP_PDU_F_P) ? 1 : 0; + cu->cu_agree = + (flags & BSTP_PDU_F_A) ? 1 : 0; + cu->cu_learning = + (flags & BSTP_PDU_F_L) ? 1 : 0; + cu->cu_forwarding = + (flags & BSTP_PDU_F_F) ? 1 : 0; + cu->cu_topology_change = + (flags & BSTP_PDU_F_TC) ? 1 : 0; + + switch ((flags & BSTP_PDU_PRMASK) >> BSTP_PDU_PRSHIFT) { + case BSTP_PDU_F_ROOT: + cu->cu_role = BSTP_ROLE_ROOT; + break; + case BSTP_PDU_F_ALT: + cu->cu_role = BSTP_ROLE_ALTERNATE; + break; + case BSTP_PDU_F_DESG: + cu->cu_role = BSTP_ROLE_DESIGNATED; + break; + } +} + +static void +bstp_send_bpdu(struct bstp_state *bs, struct bstp_port *bp, + struct bstp_cbpdu *bpdu) +{ + struct ifnet *ifp; + struct mbuf *m; + struct ether_header *eh; + + BSTP_LOCK_ASSERT(bs); + + ifp = bp->bp_ifp; + + if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) + return; + + MGETHDR(m, M_DONTWAIT, MT_DATA); + if (m == NULL) + return; + + eh = mtod(m, struct ether_header *); + + bpdu->cbu_ssap = bpdu->cbu_dsap = LLC_8021D_LSAP; + bpdu->cbu_ctl = LLC_UI; + bpdu->cbu_protoid = htons(BSTP_PROTO_ID); + + memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN); + memcpy(eh->ether_dhost, bstp_etheraddr, ETHER_ADDR_LEN); + + switch (bpdu->cbu_bpdutype) { + case BSTP_MSGTYPE_CFG: + bpdu->cbu_protover = BSTP_PROTO_STP; + m->m_pkthdr.len = sizeof(*eh) + BSTP_BPDU_STP_LEN; + eh->ether_type = htons(BSTP_BPDU_STP_LEN); + memcpy(mtod(m, caddr_t) + sizeof(*eh), bpdu, + BSTP_BPDU_STP_LEN); + break; + + case BSTP_MSGTYPE_RSTP: + bpdu->cbu_protover = BSTP_PROTO_RSTP; + bpdu->cbu_versionlen = htons(0); + m->m_pkthdr.len = sizeof(*eh) + BSTP_BPDU_RSTP_LEN; + eh->ether_type = htons(BSTP_BPDU_RSTP_LEN); + memcpy(mtod(m, caddr_t) + sizeof(*eh), bpdu, + BSTP_BPDU_RSTP_LEN); + break; + + default: + panic("not implemented"); + } + m->m_pkthdr.rcvif = ifp; + m->m_len = m->m_pkthdr.len; + + bp->bp_txcount++; + ifp->if_transmit(ifp, m); +} + +static int +bstp_pdu_flags(struct bstp_port *bp) +{ + int flags = 0; + + if (bp->bp_proposing && bp->bp_state != BSTP_IFSTATE_FORWARDING) + flags |= BSTP_PDU_F_P; + + if (bp->bp_agree) + flags |= BSTP_PDU_F_A; + + if (bp->bp_tc_timer.active) + flags |= BSTP_PDU_F_TC; + + if (bp->bp_tc_ack) + flags |= BSTP_PDU_F_TCA; + + switch (bp->bp_state) { + case BSTP_IFSTATE_LEARNING: + flags |= BSTP_PDU_F_L; + break; + + case BSTP_IFSTATE_FORWARDING: + flags |= (BSTP_PDU_F_L | BSTP_PDU_F_F); + break; + } + + switch (bp->bp_role) { + case BSTP_ROLE_ROOT: + flags |= + (BSTP_PDU_F_ROOT << BSTP_PDU_PRSHIFT); + break; + + case BSTP_ROLE_ALTERNATE: + case BSTP_ROLE_BACKUP: /* fall through */ + flags |= + (BSTP_PDU_F_ALT << BSTP_PDU_PRSHIFT); + break; + + case BSTP_ROLE_DESIGNATED: + 
flags |= + (BSTP_PDU_F_DESG << BSTP_PDU_PRSHIFT); + break; + } + + /* Strip off unused flags in either mode */ + switch (bp->bp_protover) { + case BSTP_PROTO_STP: + flags &= BSTP_PDU_STPMASK; + break; + case BSTP_PROTO_RSTP: + flags &= BSTP_PDU_RSTPMASK; + break; + } + return (flags); +} + +struct mbuf * +bstp_input(struct bstp_port *bp, struct ifnet *ifp, struct mbuf *m) +{ + struct bstp_state *bs = bp->bp_bs; + struct ether_header *eh; + struct bstp_tbpdu tpdu; + uint16_t len; + + if (bp->bp_active == 0) { + m_freem(m); + return (NULL); + } + + BSTP_LOCK(bs); + + eh = mtod(m, struct ether_header *); + + len = ntohs(eh->ether_type); + if (len < sizeof(tpdu)) + goto out; + + m_adj(m, ETHER_HDR_LEN); + + if (m->m_pkthdr.len > len) + m_adj(m, len - m->m_pkthdr.len); + if (m->m_len < sizeof(tpdu) && + (m = m_pullup(m, sizeof(tpdu))) == NULL) + goto out; + + memcpy(&tpdu, mtod(m, caddr_t), sizeof(tpdu)); + + /* basic packet checks */ + if (tpdu.tbu_dsap != LLC_8021D_LSAP || + tpdu.tbu_ssap != LLC_8021D_LSAP || + tpdu.tbu_ctl != LLC_UI) + goto out; + if (tpdu.tbu_protoid != BSTP_PROTO_ID) + goto out; + + /* + * We can treat later versions of the PDU as the same as the maximum + * version we implement. All additional parameters/flags are ignored. + */ + if (tpdu.tbu_protover > BSTP_PROTO_MAX) + tpdu.tbu_protover = BSTP_PROTO_MAX; + + if (tpdu.tbu_protover != bp->bp_protover) { + /* + * Wait for the migration delay timer to expire before changing + * protocol version to avoid flip-flops. + */ + if (bp->bp_flags & BSTP_PORT_CANMIGRATE) + bstp_set_port_proto(bp, tpdu.tbu_protover); + else + goto out; + } + + /* Clear operedge upon receiving a PDU on the port */ + bp->bp_operedge = 0; + bstp_timer_start(&bp->bp_edge_delay_timer, + BSTP_DEFAULT_MIGRATE_DELAY); + + switch (tpdu.tbu_protover) { + case BSTP_PROTO_STP: + bstp_received_stp(bs, bp, &m, &tpdu); + break; + + case BSTP_PROTO_RSTP: + bstp_received_rstp(bs, bp, &m, &tpdu); + break; + } +out: + BSTP_UNLOCK(bs); + if (m) + m_freem(m); + return (NULL); +} + +static void +bstp_received_stp(struct bstp_state *bs, struct bstp_port *bp, + struct mbuf **mp, struct bstp_tbpdu *tpdu) +{ + struct bstp_cbpdu cpdu; + struct bstp_config_unit *cu = &bp->bp_msg_cu; + struct bstp_tcn_unit tu; + + switch (tpdu->tbu_bpdutype) { + case BSTP_MSGTYPE_TCN: + tu.tu_message_type = tpdu->tbu_bpdutype; + bstp_received_tcn(bs, bp, &tu); + break; + case BSTP_MSGTYPE_CFG: + if ((*mp)->m_len < BSTP_BPDU_STP_LEN && + (*mp = m_pullup(*mp, BSTP_BPDU_STP_LEN)) == NULL) + return; + memcpy(&cpdu, mtod(*mp, caddr_t), BSTP_BPDU_STP_LEN); + + bstp_decode_bpdu(bp, &cpdu, cu); + bstp_received_bpdu(bs, bp, cu); + break; + } +} + +static void +bstp_received_rstp(struct bstp_state *bs, struct bstp_port *bp, + struct mbuf **mp, struct bstp_tbpdu *tpdu) +{ + struct bstp_cbpdu cpdu; + struct bstp_config_unit *cu = &bp->bp_msg_cu; + + if (tpdu->tbu_bpdutype != BSTP_MSGTYPE_RSTP) + return; + + if ((*mp)->m_len < BSTP_BPDU_RSTP_LEN && + (*mp = m_pullup(*mp, BSTP_BPDU_RSTP_LEN)) == NULL) + return; + memcpy(&cpdu, mtod(*mp, caddr_t), BSTP_BPDU_RSTP_LEN); + + bstp_decode_bpdu(bp, &cpdu, cu); + bstp_received_bpdu(bs, bp, cu); +} + +static void +bstp_received_tcn(struct bstp_state *bs, struct bstp_port *bp, + struct bstp_tcn_unit *tcn) +{ + bp->bp_rcvdtcn = 1; + bstp_update_tc(bp); +} + +static void +bstp_received_bpdu(struct bstp_state *bs, struct bstp_port *bp, + struct bstp_config_unit *cu) +{ + int type; + + BSTP_LOCK_ASSERT(bs); + + /* We need to have transitioned to INFO_MINE before proceeding 
*/ + switch (bp->bp_infois) { + case BSTP_INFO_DISABLED: + case BSTP_INFO_AGED: + return; + } + + type = bstp_pdu_rcvtype(bp, cu); + + switch (type) { + case BSTP_PDU_SUPERIOR: + bs->bs_allsynced = 0; + bp->bp_agreed = 0; + bp->bp_proposing = 0; + + if (cu->cu_proposal && cu->cu_forwarding == 0) + bp->bp_proposed = 1; + if (cu->cu_topology_change) + bp->bp_rcvdtc = 1; + if (cu->cu_topology_change_ack) + bp->bp_rcvdtca = 1; + + if (bp->bp_agree && + !bstp_pdu_bettersame(bp, BSTP_INFO_RECEIVED)) + bp->bp_agree = 0; + + /* copy the received priority and timers to the port */ + bp->bp_port_pv = cu->cu_pv; + bp->bp_port_msg_age = cu->cu_message_age; + bp->bp_port_max_age = cu->cu_max_age; + bp->bp_port_fdelay = cu->cu_forward_delay; + bp->bp_port_htime = + (cu->cu_hello_time > BSTP_MIN_HELLO_TIME ? + cu->cu_hello_time : BSTP_MIN_HELLO_TIME); + + /* set expiry for the new info */ + bstp_set_timer_msgage(bp); + + bp->bp_infois = BSTP_INFO_RECEIVED; + bstp_assign_roles(bs); + break; + + case BSTP_PDU_REPEATED: + if (cu->cu_proposal && cu->cu_forwarding == 0) + bp->bp_proposed = 1; + if (cu->cu_topology_change) + bp->bp_rcvdtc = 1; + if (cu->cu_topology_change_ack) + bp->bp_rcvdtca = 1; + + /* rearm the age timer */ + bstp_set_timer_msgage(bp); + break; + + case BSTP_PDU_INFERIOR: + if (cu->cu_learning) { + bp->bp_agreed = 1; + bp->bp_proposing = 0; + } + break; + + case BSTP_PDU_INFERIORALT: + /* + * only point to point links are allowed fast + * transitions to forwarding. + */ + if (cu->cu_agree && bp->bp_ptp_link) { + bp->bp_agreed = 1; + bp->bp_proposing = 0; + } else + bp->bp_agreed = 0; + + if (cu->cu_topology_change) + bp->bp_rcvdtc = 1; + if (cu->cu_topology_change_ack) + bp->bp_rcvdtca = 1; + break; + + case BSTP_PDU_OTHER: + return; /* do nothing */ + } + /* update the state machines with the new data */ + bstp_update_state(bs, bp); +} + +static int +bstp_pdu_rcvtype(struct bstp_port *bp, struct bstp_config_unit *cu) +{ + int type; + + /* default return type */ + type = BSTP_PDU_OTHER; + + switch (cu->cu_role) { + case BSTP_ROLE_DESIGNATED: + if (bstp_info_superior(&bp->bp_port_pv, &cu->cu_pv)) + /* bpdu priority is superior */ + type = BSTP_PDU_SUPERIOR; + else if (bstp_info_cmp(&bp->bp_port_pv, &cu->cu_pv) == + INFO_SAME) { + if (bp->bp_port_msg_age != cu->cu_message_age || + bp->bp_port_max_age != cu->cu_max_age || + bp->bp_port_fdelay != cu->cu_forward_delay || + bp->bp_port_htime != cu->cu_hello_time) + /* bpdu priority is equal and timers differ */ + type = BSTP_PDU_SUPERIOR; + else + /* bpdu is equal */ + type = BSTP_PDU_REPEATED; + } else + /* bpdu priority is worse */ + type = BSTP_PDU_INFERIOR; + + break; + + case BSTP_ROLE_ROOT: + case BSTP_ROLE_ALTERNATE: + case BSTP_ROLE_BACKUP: + if (bstp_info_cmp(&bp->bp_port_pv, &cu->cu_pv) <= INFO_SAME) + /* + * not a designated port and priority is the same or + * worse + */ + type = BSTP_PDU_INFERIORALT; + break; + } + + return (type); +} + +static int +bstp_pdu_bettersame(struct bstp_port *bp, int newinfo) +{ + if (newinfo == BSTP_INFO_RECEIVED && + bp->bp_infois == BSTP_INFO_RECEIVED && + bstp_info_cmp(&bp->bp_port_pv, &bp->bp_msg_cu.cu_pv) >= INFO_SAME) + return (1); + + if (newinfo == BSTP_INFO_MINE && + bp->bp_infois == BSTP_INFO_MINE && + bstp_info_cmp(&bp->bp_port_pv, &bp->bp_desg_pv) >= INFO_SAME) + return (1); + + return (0); +} + +static int +bstp_info_cmp(struct bstp_pri_vector *pv, + struct bstp_pri_vector *cpv) +{ + if (cpv->pv_root_id < pv->pv_root_id) + return (INFO_BETTER); + if (cpv->pv_root_id > pv->pv_root_id) + 
return (INFO_WORSE); + + if (cpv->pv_cost < pv->pv_cost) + return (INFO_BETTER); + if (cpv->pv_cost > pv->pv_cost) + return (INFO_WORSE); + + if (cpv->pv_dbridge_id < pv->pv_dbridge_id) + return (INFO_BETTER); + if (cpv->pv_dbridge_id > pv->pv_dbridge_id) + return (INFO_WORSE); + + if (cpv->pv_dport_id < pv->pv_dport_id) + return (INFO_BETTER); + if (cpv->pv_dport_id > pv->pv_dport_id) + return (INFO_WORSE); + + return (INFO_SAME); +} + +/* + * This message priority vector is superior to the port priority vector and + * will replace it if, and only if, the message priority vector is better than + * the port priority vector, or the message has been transmitted from the same + * designated bridge and designated port as the port priority vector. + */ +static int +bstp_info_superior(struct bstp_pri_vector *pv, + struct bstp_pri_vector *cpv) +{ + if (bstp_info_cmp(pv, cpv) == INFO_BETTER || + (bstp_same_bridgeid(pv->pv_dbridge_id, cpv->pv_dbridge_id) && + (cpv->pv_dport_id & 0xfff) == (pv->pv_dport_id & 0xfff))) + return (1); + return (0); +} + +static void +bstp_assign_roles(struct bstp_state *bs) +{ + struct bstp_port *bp, *rbp = NULL; + struct bstp_pri_vector pv; + + /* default to our priority vector */ + bs->bs_root_pv = bs->bs_bridge_pv; + bs->bs_root_msg_age = 0; + bs->bs_root_max_age = bs->bs_bridge_max_age; + bs->bs_root_fdelay = bs->bs_bridge_fdelay; + bs->bs_root_htime = bs->bs_bridge_htime; + bs->bs_root_port = NULL; + + /* check if any received info supersedes us */ + LIST_FOREACH(bp, &bs->bs_bplist, bp_next) { + if (bp->bp_infois != BSTP_INFO_RECEIVED) + continue; + + pv = bp->bp_port_pv; + pv.pv_cost += bp->bp_path_cost; + + /* + * The root priority vector is the best of the set comprising + * the bridge priority vector plus all root path priority + * vectors whose bridge address is not equal to us.
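+ *
+ * (For illustration of the ordering bstp_info_cmp() implements above,
+ * with made-up values and all other fields zero:
+ *
+ *	a.pv_root_id = 0x8000001122334455ULL; a.pv_cost = 10;
+ *	b.pv_root_id = 0x4000001122334455ULL; b.pv_cost = 200;
+ *
+ * bstp_info_cmp(&a, &b) returns INFO_BETTER because b carries the
+ * numerically lower, i.e. higher-priority, root ID; the comparison is
+ * lexicographic, so b's larger path cost is never consulted.)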
+ */ + if (bstp_same_bridgeid(pv.pv_dbridge_id, + bs->bs_bridge_pv.pv_dbridge_id) == 0 && + bstp_info_cmp(&bs->bs_root_pv, &pv) == INFO_BETTER) { + /* the port vector replaces the root */ + bs->bs_root_pv = pv; + bs->bs_root_msg_age = bp->bp_port_msg_age + + BSTP_MESSAGE_AGE_INCR; + bs->bs_root_max_age = bp->bp_port_max_age; + bs->bs_root_fdelay = bp->bp_port_fdelay; + bs->bs_root_htime = bp->bp_port_htime; + rbp = bp; + } + } + + LIST_FOREACH(bp, &bs->bs_bplist, bp_next) { + /* calculate the port designated vector */ + bp->bp_desg_pv.pv_root_id = bs->bs_root_pv.pv_root_id; + bp->bp_desg_pv.pv_cost = bs->bs_root_pv.pv_cost; + bp->bp_desg_pv.pv_dbridge_id = bs->bs_bridge_pv.pv_dbridge_id; + bp->bp_desg_pv.pv_dport_id = bp->bp_port_id; + bp->bp_desg_pv.pv_port_id = bp->bp_port_id; + + /* calculate designated times */ + bp->bp_desg_msg_age = bs->bs_root_msg_age; + bp->bp_desg_max_age = bs->bs_root_max_age; + bp->bp_desg_fdelay = bs->bs_root_fdelay; + bp->bp_desg_htime = bs->bs_bridge_htime; + + + switch (bp->bp_infois) { + case BSTP_INFO_DISABLED: + bstp_set_port_role(bp, BSTP_ROLE_DISABLED); + break; + + case BSTP_INFO_AGED: + bstp_set_port_role(bp, BSTP_ROLE_DESIGNATED); + bstp_update_info(bp); + break; + + case BSTP_INFO_MINE: + bstp_set_port_role(bp, BSTP_ROLE_DESIGNATED); + /* update the port info if stale */ + if (bstp_info_cmp(&bp->bp_port_pv, + &bp->bp_desg_pv) != INFO_SAME || + (rbp != NULL && + (bp->bp_port_msg_age != rbp->bp_port_msg_age || + bp->bp_port_max_age != rbp->bp_port_max_age || + bp->bp_port_fdelay != rbp->bp_port_fdelay || + bp->bp_port_htime != rbp->bp_port_htime))) + bstp_update_info(bp); + break; + + case BSTP_INFO_RECEIVED: + if (bp == rbp) { + /* + * root priority is derived from this + * port, make it the root port. + */ + bstp_set_port_role(bp, BSTP_ROLE_ROOT); + bs->bs_root_port = bp; + } else if (bstp_info_cmp(&bp->bp_port_pv, + &bp->bp_desg_pv) == INFO_BETTER) { + /* + * the port priority is lower than the root + * port. + */ + bstp_set_port_role(bp, BSTP_ROLE_DESIGNATED); + bstp_update_info(bp); + } else { + if (bstp_same_bridgeid( + bp->bp_port_pv.pv_dbridge_id, + bs->bs_bridge_pv.pv_dbridge_id)) { + /* + * the designated bridge refers to + * another port on this bridge. + */ + bstp_set_port_role(bp, + BSTP_ROLE_BACKUP); + } else { + /* + * the port is an inferior path to the + * root bridge. 
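+ *
+ * (A worked example of the aging performed in bstp_assign_roles()
+ * above: each bridge that propagates the root's information adds
+ * BSTP_MESSAGE_AGE_INCR, nominally one second, to the message age,
+ * and bstp_set_timer_msgage() below treats information as already
+ * expired once age plus increment would exceed the max age, so with
+ * the default 20 second max age the root's information survives at
+ * most about 20 bridge hops.)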
+ */ + bstp_set_port_role(bp, + BSTP_ROLE_ALTERNATE); + } + } + break; + } + } +} + +static void +bstp_update_state(struct bstp_state *bs, struct bstp_port *bp) +{ + struct bstp_port *bp2; + int synced; + + BSTP_LOCK_ASSERT(bs); + + /* check if all the ports have synchronised again */ + if (!bs->bs_allsynced) { + synced = 1; + LIST_FOREACH(bp2, &bs->bs_bplist, bp_next) { + if (!(bp2->bp_synced || + bp2->bp_role == BSTP_ROLE_ROOT)) { + synced = 0; + break; + } + } + bs->bs_allsynced = synced; + } + + bstp_update_roles(bs, bp); + bstp_update_tc(bp); +} + +static void +bstp_update_roles(struct bstp_state *bs, struct bstp_port *bp) +{ + switch (bp->bp_role) { + case BSTP_ROLE_DISABLED: + /* Clear any flags if set */ + if (bp->bp_sync || !bp->bp_synced || bp->bp_reroot) { + bp->bp_sync = 0; + bp->bp_synced = 1; + bp->bp_reroot = 0; + } + break; + + case BSTP_ROLE_ALTERNATE: + case BSTP_ROLE_BACKUP: + if ((bs->bs_allsynced && !bp->bp_agree) || + (bp->bp_proposed && bp->bp_agree)) { + bp->bp_proposed = 0; + bp->bp_agree = 1; + bp->bp_flags |= BSTP_PORT_NEWINFO; + DPRINTF("%s -> ALTERNATE_AGREED\n", + bp->bp_ifp->if_xname); + } + + if (bp->bp_proposed && !bp->bp_agree) { + bstp_set_all_sync(bs); + bp->bp_proposed = 0; + DPRINTF("%s -> ALTERNATE_PROPOSED\n", + bp->bp_ifp->if_xname); + } + + /* Clear any flags if set */ + if (bp->bp_sync || !bp->bp_synced || bp->bp_reroot) { + bp->bp_sync = 0; + bp->bp_synced = 1; + bp->bp_reroot = 0; + DPRINTF("%s -> ALTERNATE_PORT\n", bp->bp_ifp->if_xname); + } + break; + + case BSTP_ROLE_ROOT: + if (bp->bp_state != BSTP_IFSTATE_FORWARDING && !bp->bp_reroot) { + bstp_set_all_reroot(bs); + DPRINTF("%s -> ROOT_REROOT\n", bp->bp_ifp->if_xname); + } + + if ((bs->bs_allsynced && !bp->bp_agree) || + (bp->bp_proposed && bp->bp_agree)) { + bp->bp_proposed = 0; + bp->bp_sync = 0; + bp->bp_agree = 1; + bp->bp_flags |= BSTP_PORT_NEWINFO; + DPRINTF("%s -> ROOT_AGREED\n", bp->bp_ifp->if_xname); + } + + if (bp->bp_proposed && !bp->bp_agree) { + bstp_set_all_sync(bs); + bp->bp_proposed = 0; + DPRINTF("%s -> ROOT_PROPOSED\n", bp->bp_ifp->if_xname); + } + + if (bp->bp_state != BSTP_IFSTATE_FORWARDING && + (bp->bp_forward_delay_timer.active == 0 || + (bstp_rerooted(bs, bp) && + bp->bp_recent_backup_timer.active == 0 && + bp->bp_protover == BSTP_PROTO_RSTP))) { + switch (bp->bp_state) { + case BSTP_IFSTATE_DISCARDING: + bstp_set_port_state(bp, BSTP_IFSTATE_LEARNING); + break; + case BSTP_IFSTATE_LEARNING: + bstp_set_port_state(bp, + BSTP_IFSTATE_FORWARDING); + break; + } + } + + if (bp->bp_state == BSTP_IFSTATE_FORWARDING && bp->bp_reroot) { + bp->bp_reroot = 0; + DPRINTF("%s -> ROOT_REROOTED\n", bp->bp_ifp->if_xname); + } + break; + + case BSTP_ROLE_DESIGNATED: + if (bp->bp_recent_root_timer.active == 0 && bp->bp_reroot) { + bp->bp_reroot = 0; + DPRINTF("%s -> DESIGNATED_RETIRED\n", + bp->bp_ifp->if_xname); + } + + if ((bp->bp_state == BSTP_IFSTATE_DISCARDING && + !bp->bp_synced) || (bp->bp_agreed && !bp->bp_synced) || + (bp->bp_operedge && !bp->bp_synced) || + (bp->bp_sync && bp->bp_synced)) { + bstp_timer_stop(&bp->bp_recent_root_timer); + bp->bp_synced = 1; + bp->bp_sync = 0; + DPRINTF("%s -> DESIGNATED_SYNCED\n", + bp->bp_ifp->if_xname); + } + + if (bp->bp_state != BSTP_IFSTATE_FORWARDING && + !bp->bp_agreed && !bp->bp_proposing && + !bp->bp_operedge) { + bp->bp_proposing = 1; + bp->bp_flags |= BSTP_PORT_NEWINFO; + bstp_timer_start(&bp->bp_edge_delay_timer, + (bp->bp_ptp_link ?
BSTP_DEFAULT_MIGRATE_DELAY : + bp->bp_desg_max_age)); + DPRINTF("%s -> DESIGNATED_PROPOSE\n", + bp->bp_ifp->if_xname); + } + + if (bp->bp_state != BSTP_IFSTATE_FORWARDING && + (bp->bp_forward_delay_timer.active == 0 || bp->bp_agreed || + bp->bp_operedge) && + (bp->bp_recent_root_timer.active == 0 || !bp->bp_reroot) && + !bp->bp_sync) { + if (bp->bp_agreed) + DPRINTF("%s -> AGREED\n", bp->bp_ifp->if_xname); + /* + * If agreed|operedge then go straight to forwarding, + * otherwise follow discard -> learn -> forward. + */ + if (bp->bp_agreed || bp->bp_operedge || + bp->bp_state == BSTP_IFSTATE_LEARNING) { + bstp_set_port_state(bp, + BSTP_IFSTATE_FORWARDING); + bp->bp_agreed = bp->bp_protover; + } else if (bp->bp_state == BSTP_IFSTATE_DISCARDING) + bstp_set_port_state(bp, BSTP_IFSTATE_LEARNING); + } + + if (((bp->bp_sync && !bp->bp_synced) || + (bp->bp_reroot && bp->bp_recent_root_timer.active) || + (bp->bp_flags & BSTP_PORT_DISPUTED)) && !bp->bp_operedge && + bp->bp_state != BSTP_IFSTATE_DISCARDING) { + bstp_set_port_state(bp, BSTP_IFSTATE_DISCARDING); + bp->bp_flags &= ~BSTP_PORT_DISPUTED; + bstp_timer_start(&bp->bp_forward_delay_timer, + bp->bp_protover == BSTP_PROTO_RSTP ? + bp->bp_desg_htime : bp->bp_desg_fdelay); + DPRINTF("%s -> DESIGNATED_DISCARD\n", + bp->bp_ifp->if_xname); + } + break; + } + + if (bp->bp_flags & BSTP_PORT_NEWINFO) + bstp_transmit(bs, bp); +} + +static void +bstp_update_tc(struct bstp_port *bp) +{ + switch (bp->bp_tcstate) { + case BSTP_TCSTATE_ACTIVE: + if ((bp->bp_role != BSTP_ROLE_DESIGNATED && + bp->bp_role != BSTP_ROLE_ROOT) || bp->bp_operedge) + bstp_set_port_tc(bp, BSTP_TCSTATE_LEARNING); + + if (bp->bp_rcvdtcn) + bstp_set_port_tc(bp, BSTP_TCSTATE_TCN); + if (bp->bp_rcvdtc) + bstp_set_port_tc(bp, BSTP_TCSTATE_TC); + + if (bp->bp_tc_prop && !bp->bp_operedge) + bstp_set_port_tc(bp, BSTP_TCSTATE_PROPAG); + + if (bp->bp_rcvdtca) + bstp_set_port_tc(bp, BSTP_TCSTATE_ACK); + break; + + case BSTP_TCSTATE_INACTIVE: + if ((bp->bp_state == BSTP_IFSTATE_LEARNING || + bp->bp_state == BSTP_IFSTATE_FORWARDING) && + bp->bp_fdbflush == 0) + bstp_set_port_tc(bp, BSTP_TCSTATE_LEARNING); + break; + + case BSTP_TCSTATE_LEARNING: + if (bp->bp_rcvdtc || bp->bp_rcvdtcn || bp->bp_rcvdtca || + bp->bp_tc_prop) + bstp_set_port_tc(bp, BSTP_TCSTATE_LEARNING); + else if (bp->bp_role != BSTP_ROLE_DESIGNATED && + bp->bp_role != BSTP_ROLE_ROOT && + bp->bp_state == BSTP_IFSTATE_DISCARDING) + bstp_set_port_tc(bp, BSTP_TCSTATE_INACTIVE); + + if ((bp->bp_role == BSTP_ROLE_DESIGNATED || + bp->bp_role == BSTP_ROLE_ROOT) && + bp->bp_state == BSTP_IFSTATE_FORWARDING && + !bp->bp_operedge) + bstp_set_port_tc(bp, BSTP_TCSTATE_DETECTED); + break; + + /* these are transient states and go straight back to ACTIVE */ + case BSTP_TCSTATE_DETECTED: + case BSTP_TCSTATE_TCN: + case BSTP_TCSTATE_TC: + case BSTP_TCSTATE_PROPAG: + case BSTP_TCSTATE_ACK: + DPRINTF("Invalid TC state for %s\n", + bp->bp_ifp->if_xname); + break; + } + +} + +static void +bstp_update_info(struct bstp_port *bp) +{ + struct bstp_state *bs = bp->bp_bs; + + bp->bp_proposing = 0; + bp->bp_proposed = 0; + + if (bp->bp_agreed && !bstp_pdu_bettersame(bp, BSTP_INFO_MINE)) + bp->bp_agreed = 0; + + if (bp->bp_synced && !bp->bp_agreed) { + bp->bp_synced = 0; + bs->bs_allsynced = 0; + } + + /* copy the designated pv to the port */ + bp->bp_port_pv = bp->bp_desg_pv; + bp->bp_port_msg_age = bp->bp_desg_msg_age; + bp->bp_port_max_age = bp->bp_desg_max_age; + bp->bp_port_fdelay = bp->bp_desg_fdelay; + bp->bp_port_htime = bp->bp_desg_htime; + bp->bp_infois 
= BSTP_INFO_MINE; + + /* Set transmit flag but do not immediately send */ + bp->bp_flags |= BSTP_PORT_NEWINFO; +} + +/* set tcprop on every port other than the caller */ +static void +bstp_set_other_tcprop(struct bstp_port *bp) +{ + struct bstp_state *bs = bp->bp_bs; + struct bstp_port *bp2; + + BSTP_LOCK_ASSERT(bs); + + LIST_FOREACH(bp2, &bs->bs_bplist, bp_next) { + if (bp2 == bp) + continue; + bp2->bp_tc_prop = 1; + } +} + +static void +bstp_set_all_reroot(struct bstp_state *bs) +{ + struct bstp_port *bp; + + BSTP_LOCK_ASSERT(bs); + + LIST_FOREACH(bp, &bs->bs_bplist, bp_next) + bp->bp_reroot = 1; +} + +static void +bstp_set_all_sync(struct bstp_state *bs) +{ + struct bstp_port *bp; + + BSTP_LOCK_ASSERT(bs); + + LIST_FOREACH(bp, &bs->bs_bplist, bp_next) { + bp->bp_sync = 1; + bp->bp_synced = 0; /* Not explicit in spec */ + } + + bs->bs_allsynced = 0; +} + +static void +bstp_set_port_state(struct bstp_port *bp, int state) +{ + if (bp->bp_state == state) + return; + + bp->bp_state = state; + + switch (bp->bp_state) { + case BSTP_IFSTATE_DISCARDING: + DPRINTF("state changed to DISCARDING on %s\n", + bp->bp_ifp->if_xname); + break; + + case BSTP_IFSTATE_LEARNING: + DPRINTF("state changed to LEARNING on %s\n", + bp->bp_ifp->if_xname); + + bstp_timer_start(&bp->bp_forward_delay_timer, + bp->bp_protover == BSTP_PROTO_RSTP ? + bp->bp_desg_htime : bp->bp_desg_fdelay); + break; + + case BSTP_IFSTATE_FORWARDING: + DPRINTF("state changed to FORWARDING on %s\n", + bp->bp_ifp->if_xname); + + bstp_timer_stop(&bp->bp_forward_delay_timer); + /* Record that we enabled forwarding */ + bp->bp_forward_transitions++; + break; + } + + /* notify the parent bridge */ + taskqueue_enqueue(taskqueue_swi, &bp->bp_statetask); +} + +static void +bstp_set_port_role(struct bstp_port *bp, int role) +{ + struct bstp_state *bs = bp->bp_bs; + + if (bp->bp_role == role) + return; + + /* perform pre-change tasks */ + switch (bp->bp_role) { + case BSTP_ROLE_DISABLED: + bstp_timer_start(&bp->bp_forward_delay_timer, + bp->bp_desg_max_age); + break; + + case BSTP_ROLE_BACKUP: + bstp_timer_start(&bp->bp_recent_backup_timer, + bp->bp_desg_htime * 2); + /* fall through */ + case BSTP_ROLE_ALTERNATE: + bstp_timer_start(&bp->bp_forward_delay_timer, + bp->bp_desg_fdelay); + bp->bp_sync = 0; + bp->bp_synced = 1; + bp->bp_reroot = 0; + break; + + case BSTP_ROLE_ROOT: + bstp_timer_start(&bp->bp_recent_root_timer, + BSTP_DEFAULT_FORWARD_DELAY); + break; + } + + bp->bp_role = role; + /* clear values not carried between roles */ + bp->bp_proposing = 0; + bs->bs_allsynced = 0; + + /* initialise the new role */ + switch (bp->bp_role) { + case BSTP_ROLE_DISABLED: + case BSTP_ROLE_ALTERNATE: + case BSTP_ROLE_BACKUP: + DPRINTF("%s role -> ALT/BACK/DISABLED\n", + bp->bp_ifp->if_xname); + bstp_set_port_state(bp, BSTP_IFSTATE_DISCARDING); + bstp_timer_stop(&bp->bp_recent_root_timer); + bstp_timer_latch(&bp->bp_forward_delay_timer); + bp->bp_sync = 0; + bp->bp_synced = 1; + bp->bp_reroot = 0; + break; + + case BSTP_ROLE_ROOT: + DPRINTF("%s role -> ROOT\n", + bp->bp_ifp->if_xname); + bstp_set_port_state(bp, BSTP_IFSTATE_DISCARDING); + bstp_timer_latch(&bp->bp_recent_root_timer); + bp->bp_proposing = 0; + break; + + case BSTP_ROLE_DESIGNATED: + DPRINTF("%s role -> DESIGNATED\n", + bp->bp_ifp->if_xname); + bstp_timer_start(&bp->bp_hello_timer, + bp->bp_desg_htime); + bp->bp_agree = 0; + break; + } + + /* let the TC state know that the role changed */ + bstp_update_tc(bp); +} + +static void +bstp_set_port_proto(struct bstp_port *bp, int proto) +{ + struct 
bstp_state *bs = bp->bp_bs; + + /* supported protocol versions */ + switch (proto) { + case BSTP_PROTO_STP: + /* we can downgrade protocols only */ + bstp_timer_stop(&bp->bp_migrate_delay_timer); + /* clear unsupported features */ + bp->bp_operedge = 0; + /* STP compat mode only uses 16 bits of the 32 */ + if (bp->bp_path_cost > 65535) + bp->bp_path_cost = 65535; + break; + + case BSTP_PROTO_RSTP: + bstp_timer_start(&bp->bp_migrate_delay_timer, + bs->bs_migration_delay); + break; + + default: + DPRINTF("Unsupported STP version %d\n", proto); + return; + } + + bp->bp_protover = proto; + bp->bp_flags &= ~BSTP_PORT_CANMIGRATE; +} + +static void +bstp_set_port_tc(struct bstp_port *bp, int state) +{ + struct bstp_state *bs = bp->bp_bs; + + bp->bp_tcstate = state; + + /* initialise the new state */ + switch (bp->bp_tcstate) { + case BSTP_TCSTATE_ACTIVE: + DPRINTF("%s -> TC_ACTIVE\n", bp->bp_ifp->if_xname); + /* nothing to do */ + break; + + case BSTP_TCSTATE_INACTIVE: + bstp_timer_stop(&bp->bp_tc_timer); + /* flush routes on the parent bridge */ + bp->bp_fdbflush = 1; + taskqueue_enqueue(taskqueue_swi, &bp->bp_rtagetask); + bp->bp_tc_ack = 0; + DPRINTF("%s -> TC_INACTIVE\n", bp->bp_ifp->if_xname); + break; + + case BSTP_TCSTATE_LEARNING: + bp->bp_rcvdtc = 0; + bp->bp_rcvdtcn = 0; + bp->bp_rcvdtca = 0; + bp->bp_tc_prop = 0; + DPRINTF("%s -> TC_LEARNING\n", bp->bp_ifp->if_xname); + break; + + case BSTP_TCSTATE_DETECTED: + bstp_set_timer_tc(bp); + bstp_set_other_tcprop(bp); + /* send out notification */ + bp->bp_flags |= BSTP_PORT_NEWINFO; + bstp_transmit(bs, bp); + getmicrotime(&bs->bs_last_tc_time); + DPRINTF("%s -> TC_DETECTED\n", bp->bp_ifp->if_xname); + bp->bp_tcstate = BSTP_TCSTATE_ACTIVE; /* UCT */ + break; + + case BSTP_TCSTATE_TCN: + bstp_set_timer_tc(bp); + DPRINTF("%s -> TC_TCN\n", bp->bp_ifp->if_xname); + /* fall through */ + case BSTP_TCSTATE_TC: + bp->bp_rcvdtc = 0; + bp->bp_rcvdtcn = 0; + if (bp->bp_role == BSTP_ROLE_DESIGNATED) + bp->bp_tc_ack = 1; + + bstp_set_other_tcprop(bp); + DPRINTF("%s -> TC_TC\n", bp->bp_ifp->if_xname); + bp->bp_tcstate = BSTP_TCSTATE_ACTIVE; /* UCT */ + break; + + case BSTP_TCSTATE_PROPAG: + /* flush routes on the parent bridge */ + bp->bp_fdbflush = 1; + taskqueue_enqueue(taskqueue_swi, &bp->bp_rtagetask); + bp->bp_tc_prop = 0; + bstp_set_timer_tc(bp); + DPRINTF("%s -> TC_PROPAG\n", bp->bp_ifp->if_xname); + bp->bp_tcstate = BSTP_TCSTATE_ACTIVE; /* UCT */ + break; + + case BSTP_TCSTATE_ACK: + bstp_timer_stop(&bp->bp_tc_timer); + bp->bp_rcvdtca = 0; + DPRINTF("%s -> TC_ACK\n", bp->bp_ifp->if_xname); + bp->bp_tcstate = BSTP_TCSTATE_ACTIVE; /* UCT */ + break; + } +} + +static void +bstp_set_timer_tc(struct bstp_port *bp) +{ + struct bstp_state *bs = bp->bp_bs; + + if (bp->bp_tc_timer.active) + return; + + switch (bp->bp_protover) { + case BSTP_PROTO_RSTP: + bstp_timer_start(&bp->bp_tc_timer, + bp->bp_desg_htime + BSTP_TICK_VAL); + bp->bp_flags |= BSTP_PORT_NEWINFO; + break; + + case BSTP_PROTO_STP: + bstp_timer_start(&bp->bp_tc_timer, + bs->bs_root_max_age + bs->bs_root_fdelay); + break; + } +} + +static void +bstp_set_timer_msgage(struct bstp_port *bp) +{ + if (bp->bp_port_msg_age + BSTP_MESSAGE_AGE_INCR <= + bp->bp_port_max_age) { + bstp_timer_start(&bp->bp_message_age_timer, + bp->bp_port_htime * 3); + } else + /* expires immediately */ + bstp_timer_start(&bp->bp_message_age_timer, 0); +} + +static int +bstp_rerooted(struct bstp_state *bs, struct bstp_port *bp) +{ + struct bstp_port *bp2; + int rr_set = 0; + + LIST_FOREACH(bp2, &bs->bs_bplist, bp_next) { + 
if (bp2 == bp) + continue; + if (bp2->bp_recent_root_timer.active) { + rr_set = 1; + break; + } + } + return (!rr_set); +} + +int +bstp_set_htime(struct bstp_state *bs, int t) +{ + /* convert seconds to ticks */ + t *= BSTP_TICK_VAL; + + /* value can only be changed in legacy stp mode */ + if (bs->bs_protover != BSTP_PROTO_STP) + return (EPERM); + + if (t < BSTP_MIN_HELLO_TIME || t > BSTP_MAX_HELLO_TIME) + return (EINVAL); + + BSTP_LOCK(bs); + bs->bs_bridge_htime = t; + bstp_reinit(bs); + BSTP_UNLOCK(bs); + return (0); +} + +int +bstp_set_fdelay(struct bstp_state *bs, int t) +{ + /* convert seconds to ticks */ + t *= BSTP_TICK_VAL; + + if (t < BSTP_MIN_FORWARD_DELAY || t > BSTP_MAX_FORWARD_DELAY) + return (EINVAL); + + BSTP_LOCK(bs); + bs->bs_bridge_fdelay = t; + bstp_reinit(bs); + BSTP_UNLOCK(bs); + return (0); +} + +int +bstp_set_maxage(struct bstp_state *bs, int t) +{ + /* convert seconds to ticks */ + t *= BSTP_TICK_VAL; + + if (t < BSTP_MIN_MAX_AGE || t > BSTP_MAX_MAX_AGE) + return (EINVAL); + + BSTP_LOCK(bs); + bs->bs_bridge_max_age = t; + bstp_reinit(bs); + BSTP_UNLOCK(bs); + return (0); +} + +int +bstp_set_holdcount(struct bstp_state *bs, int count) +{ + struct bstp_port *bp; + + if (count < BSTP_MIN_HOLD_COUNT || + count > BSTP_MAX_HOLD_COUNT) + return (EINVAL); + + BSTP_LOCK(bs); + bs->bs_txholdcount = count; + LIST_FOREACH(bp, &bs->bs_bplist, bp_next) + bp->bp_txcount = 0; + BSTP_UNLOCK(bs); + return (0); +} + +int +bstp_set_protocol(struct bstp_state *bs, int proto) +{ + struct bstp_port *bp; + + switch (proto) { + /* Supported protocol versions */ + case BSTP_PROTO_STP: + case BSTP_PROTO_RSTP: + break; + + default: + return (EINVAL); + } + + BSTP_LOCK(bs); + bs->bs_protover = proto; + bs->bs_bridge_htime = BSTP_DEFAULT_HELLO_TIME; + LIST_FOREACH(bp, &bs->bs_bplist, bp_next) { + /* reinit state */ + bp->bp_infois = BSTP_INFO_DISABLED; + bp->bp_txcount = 0; + bstp_set_port_proto(bp, bs->bs_protover); + bstp_set_port_role(bp, BSTP_ROLE_DISABLED); + bstp_set_port_tc(bp, BSTP_TCSTATE_INACTIVE); + bstp_timer_stop(&bp->bp_recent_backup_timer); + } + bstp_reinit(bs); + BSTP_UNLOCK(bs); + return (0); +} + +int +bstp_set_priority(struct bstp_state *bs, int pri) +{ + if (pri < 0 || pri > BSTP_MAX_PRIORITY) + return (EINVAL); + + /* Limit to steps of 4096 */ + pri -= pri % 4096; + + BSTP_LOCK(bs); + bs->bs_bridge_priority = pri; + bstp_reinit(bs); + BSTP_UNLOCK(bs); + return (0); +} + +int +bstp_set_port_priority(struct bstp_port *bp, int pri) +{ + struct bstp_state *bs = bp->bp_bs; + + if (pri < 0 || pri > BSTP_MAX_PORT_PRIORITY) + return (EINVAL); + + /* Limit to steps of 16 */ + pri -= pri % 16; + + BSTP_LOCK(bs); + bp->bp_priority = pri; + bstp_reinit(bs); + BSTP_UNLOCK(bs); + return (0); +} + +int +bstp_set_path_cost(struct bstp_port *bp, uint32_t path_cost) +{ + struct bstp_state *bs = bp->bp_bs; + + if (path_cost > BSTP_MAX_PATH_COST) + return (EINVAL); + + /* STP compat mode only uses 16 bits of the 32 */ + if (bp->bp_protover == BSTP_PROTO_STP && path_cost > 65535) + path_cost = 65535; + + BSTP_LOCK(bs); + + if (path_cost == 0) { /* use auto */ + bp->bp_flags &= ~BSTP_PORT_ADMCOST; + bp->bp_path_cost = bstp_calc_path_cost(bp); + } else { + bp->bp_path_cost = path_cost; + bp->bp_flags |= BSTP_PORT_ADMCOST; + } + bstp_reinit(bs); + BSTP_UNLOCK(bs); + return (0); +} + +int +bstp_set_edge(struct bstp_port *bp, int set) +{ + struct bstp_state *bs = bp->bp_bs; + + BSTP_LOCK(bs); + if ((bp->bp_operedge = set) == 0) + bp->bp_flags &= ~BSTP_PORT_ADMEDGE; + else + bp->bp_flags |=
BSTP_PORT_ADMEDGE; + BSTP_UNLOCK(bs); + return (0); +} + +int +bstp_set_autoedge(struct bstp_port *bp, int set) +{ + struct bstp_state *bs = bp->bp_bs; + + BSTP_LOCK(bs); + if (set) { + bp->bp_flags |= BSTP_PORT_AUTOEDGE; + /* we may be able to transition straight to edge */ + if (bp->bp_edge_delay_timer.active == 0) + bstp_edge_delay_expiry(bs, bp); + } else + bp->bp_flags &= ~BSTP_PORT_AUTOEDGE; + BSTP_UNLOCK(bs); + return (0); +} + +int +bstp_set_ptp(struct bstp_port *bp, int set) +{ + struct bstp_state *bs = bp->bp_bs; + + BSTP_LOCK(bs); + bp->bp_ptp_link = set; + BSTP_UNLOCK(bs); + return (0); +} + +int +bstp_set_autoptp(struct bstp_port *bp, int set) +{ + struct bstp_state *bs = bp->bp_bs; + + BSTP_LOCK(bs); + if (set) { + bp->bp_flags |= BSTP_PORT_AUTOPTP; + if (bp->bp_role != BSTP_ROLE_DISABLED) + bstp_ifupdstatus(bs, bp); + } else + bp->bp_flags &= ~BSTP_PORT_AUTOPTP; + BSTP_UNLOCK(bs); + return (0); +} + +/* + * Calculate the path cost according to the link speed. + */ +static uint32_t +bstp_calc_path_cost(struct bstp_port *bp) +{ + struct ifnet *ifp = bp->bp_ifp; + uint32_t path_cost; + + /* If the priority has been manually set then retain the value */ + if (bp->bp_flags & BSTP_PORT_ADMCOST) + return bp->bp_path_cost; + + if (ifp->if_link_state == LINK_STATE_DOWN) { + /* Recalc when the link comes up again */ + bp->bp_flags |= BSTP_PORT_PNDCOST; + return (BSTP_DEFAULT_PATH_COST); + } + + if (ifp->if_baudrate < 1000) + return (BSTP_DEFAULT_PATH_COST); + + /* formula from section 17.14, IEEE Std 802.1D-2004 */ + path_cost = 20000000000ULL / (ifp->if_baudrate / 1000); + + if (path_cost > BSTP_MAX_PATH_COST) + path_cost = BSTP_MAX_PATH_COST; + + /* STP compat mode only uses 16 bits of the 32 */ + if (bp->bp_protover == BSTP_PROTO_STP && path_cost > 65535) + path_cost = 65535; + + return (path_cost); +} + +/* + * Notify the bridge that a port state has changed, we need to do this from a + * taskqueue to avoid a LOR. + */ +static void +bstp_notify_state(void *arg, int pending) +{ + struct bstp_port *bp = (struct bstp_port *)arg; + struct bstp_state *bs = bp->bp_bs; + + if (bp->bp_active == 1 && bs->bs_state_cb != NULL) + (*bs->bs_state_cb)(bp->bp_ifp, bp->bp_state); +} + +/* + * Flush the routes on the bridge port, we need to do this from a + * taskqueue to avoid a LOR. 
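+ *
+ * (A worked example of the 802.1D-2004 clause 17.14 formula used in
+ * bstp_calc_path_cost() above: a 1 Gb/s link has if_baudrate == 10^9,
+ * so
+ *
+ *	path_cost = 20000000000ULL / (1000000000 / 1000) = 20000
+ *
+ * which matches the recommended value in Table 17-3 of the standard;
+ * faster links divide down further, and the result is clamped to
+ * BSTP_MAX_PATH_COST, or to 65535 in STP compatibility mode.)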
+ */ +static void +bstp_notify_rtage(void *arg, int pending) +{ + struct bstp_port *bp = (struct bstp_port *)arg; + struct bstp_state *bs = bp->bp_bs; + int age = 0; + + BSTP_LOCK(bs); + switch (bp->bp_protover) { + case BSTP_PROTO_STP: + /* convert to seconds */ + age = bp->bp_desg_fdelay / BSTP_TICK_VAL; + break; + + case BSTP_PROTO_RSTP: + age = 0; + break; + } + BSTP_UNLOCK(bs); + + if (bp->bp_active == 1 && bs->bs_rtage_cb != NULL) + (*bs->bs_rtage_cb)(bp->bp_ifp, age); + + /* flush is complete */ + BSTP_LOCK(bs); + bp->bp_fdbflush = 0; + BSTP_UNLOCK(bs); +} + +void +bstp_linkstate(struct ifnet *ifp, int state) +{ + struct bstp_state *bs; + struct bstp_port *bp; + + /* search for the stp port */ + mtx_lock(&bstp_list_mtx); + LIST_FOREACH(bs, &bstp_list, bs_list) { + BSTP_LOCK(bs); + LIST_FOREACH(bp, &bs->bs_bplist, bp_next) { + if (bp->bp_ifp == ifp) { + bstp_ifupdstatus(bs, bp); + bstp_update_state(bs, bp); + /* it only exists once so return */ + BSTP_UNLOCK(bs); + mtx_unlock(&bstp_list_mtx); + return; + } + } + BSTP_UNLOCK(bs); + } + mtx_unlock(&bstp_list_mtx); +} + +static void +bstp_ifupdstatus(struct bstp_state *bs, struct bstp_port *bp) +{ + struct ifnet *ifp = bp->bp_ifp; + struct ifmediareq ifmr; + int error = 0; + + BSTP_LOCK_ASSERT(bs); + + bzero((char *)&ifmr, sizeof(ifmr)); + error = (*ifp->if_ioctl)(ifp, SIOCGIFMEDIA, (caddr_t)&ifmr); + + if ((error == 0) && (ifp->if_flags & IFF_UP)) { + if (ifmr.ifm_status & IFM_ACTIVE) { + /* A full-duplex link is assumed to be point to point */ + if (bp->bp_flags & BSTP_PORT_AUTOPTP) { + bp->bp_ptp_link = + ifmr.ifm_active & IFM_FDX ? 1 : 0; + } + + /* Calc the cost if the link was down previously */ + if (bp->bp_flags & BSTP_PORT_PNDCOST) { + bp->bp_path_cost = bstp_calc_path_cost(bp); + bp->bp_flags &= ~BSTP_PORT_PNDCOST; + } + + if (bp->bp_role == BSTP_ROLE_DISABLED) + bstp_enable_port(bs, bp); + } else { + if (bp->bp_role != BSTP_ROLE_DISABLED) { + bstp_disable_port(bs, bp); + if ((bp->bp_flags & BSTP_PORT_ADMEDGE) && + bp->bp_protover == BSTP_PROTO_RSTP) + bp->bp_operedge = 1; + } + } + return; + } + + if (bp->bp_infois != BSTP_INFO_DISABLED) + bstp_disable_port(bs, bp); +} + +static void +bstp_enable_port(struct bstp_state *bs, struct bstp_port *bp) +{ + bp->bp_infois = BSTP_INFO_AGED; + bstp_assign_roles(bs); +} + +static void +bstp_disable_port(struct bstp_state *bs, struct bstp_port *bp) +{ + bp->bp_infois = BSTP_INFO_DISABLED; + bstp_assign_roles(bs); +} + +static void +bstp_tick(void *arg) +{ + struct bstp_state *bs = arg; + struct bstp_port *bp; + + BSTP_LOCK_ASSERT(bs); + + if (bs->bs_running == 0) + return; + + /* slow timer to catch missed link events */ + if (bstp_timer_expired(&bs->bs_link_timer)) { + LIST_FOREACH(bp, &bs->bs_bplist, bp_next) + bstp_ifupdstatus(bs, bp); + bstp_timer_start(&bs->bs_link_timer, BSTP_LINK_TIMER); + } + + LIST_FOREACH(bp, &bs->bs_bplist, bp_next) { + /* no events need to happen for these */ + bstp_timer_expired(&bp->bp_tc_timer); + bstp_timer_expired(&bp->bp_recent_root_timer); + bstp_timer_expired(&bp->bp_forward_delay_timer); + bstp_timer_expired(&bp->bp_recent_backup_timer); + + if (bstp_timer_expired(&bp->bp_hello_timer)) + bstp_hello_timer_expiry(bs, bp); + + if (bstp_timer_expired(&bp->bp_message_age_timer)) + bstp_message_age_expiry(bs, bp); + + if (bstp_timer_expired(&bp->bp_migrate_delay_timer)) + bstp_migrate_delay_expiry(bs, bp); + + if (bstp_timer_expired(&bp->bp_edge_delay_timer)) + bstp_edge_delay_expiry(bs, bp); + + /* update the various state machines for the port */ + 
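+		/*
+		 * (Timer mechanics, for reference: bstp_tick() is re-armed
+		 * once a second via callout_reset(..., hz, ...) below, and
+		 * each bstp_timer_expired() call above subtracts
+		 * BSTP_TICK_VAL from an active timer, reporting expiry once
+		 * it reaches zero; latched timers, see bstp_timer_latch(),
+		 * never expire until restarted or stopped.)
+		 */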
bstp_update_state(bs, bp); + + if (bp->bp_txcount > 0) + bp->bp_txcount--; + } + + callout_reset(&bs->bs_bstpcallout, hz, bstp_tick, bs); +} + +static void +bstp_timer_start(struct bstp_timer *t, uint16_t v) +{ + t->value = v; + t->active = 1; + t->latched = 0; +} + +static void +bstp_timer_stop(struct bstp_timer *t) +{ + t->value = 0; + t->active = 0; + t->latched = 0; +} + +static void +bstp_timer_latch(struct bstp_timer *t) +{ + t->latched = 1; + t->active = 1; +} + +static int +bstp_timer_expired(struct bstp_timer *t) +{ + if (t->active == 0 || t->latched) + return (0); + t->value -= BSTP_TICK_VAL; + if (t->value <= 0) { + bstp_timer_stop(t); + return (1); + } + return (0); +} + +static void +bstp_hello_timer_expiry(struct bstp_state *bs, struct bstp_port *bp) +{ + if ((bp->bp_flags & BSTP_PORT_NEWINFO) || + bp->bp_role == BSTP_ROLE_DESIGNATED || + (bp->bp_role == BSTP_ROLE_ROOT && + bp->bp_tc_timer.active == 1)) { + bstp_timer_start(&bp->bp_hello_timer, bp->bp_desg_htime); + bp->bp_flags |= BSTP_PORT_NEWINFO; + bstp_transmit(bs, bp); + } +} + +static void +bstp_message_age_expiry(struct bstp_state *bs, struct bstp_port *bp) +{ + if (bp->bp_infois == BSTP_INFO_RECEIVED) { + bp->bp_infois = BSTP_INFO_AGED; + bstp_assign_roles(bs); + DPRINTF("aged info on %s\n", bp->bp_ifp->if_xname); + } +} + +static void +bstp_migrate_delay_expiry(struct bstp_state *bs, struct bstp_port *bp) +{ + bp->bp_flags |= BSTP_PORT_CANMIGRATE; +} + +static void +bstp_edge_delay_expiry(struct bstp_state *bs, struct bstp_port *bp) +{ + if ((bp->bp_flags & BSTP_PORT_AUTOEDGE) && + bp->bp_protover == BSTP_PROTO_RSTP && bp->bp_proposing && + bp->bp_role == BSTP_ROLE_DESIGNATED) { + bp->bp_operedge = 1; + DPRINTF("%s -> edge port\n", bp->bp_ifp->if_xname); + } +} + +static int +bstp_addr_cmp(const uint8_t *a, const uint8_t *b) +{ + int i, d; + + for (i = 0, d = 0; i < ETHER_ADDR_LEN && d == 0; i++) { + d = ((int)a[i]) - ((int)b[i]); + } + + return (d); +} + +/* + * compare the bridge address component of the bridgeid + */ +static int +bstp_same_bridgeid(uint64_t id1, uint64_t id2) +{ + u_char addr1[ETHER_ADDR_LEN]; + u_char addr2[ETHER_ADDR_LEN]; + + PV2ADDR(id1, addr1); + PV2ADDR(id2, addr2); + + if (bstp_addr_cmp(addr1, addr2) == 0) + return (1); + + return (0); +} + +void +bstp_reinit(struct bstp_state *bs) +{ + struct bstp_port *bp; + struct ifnet *ifp, *mif; + u_char *e_addr; + static const u_char llzero[ETHER_ADDR_LEN]; /* 00:00:00:00:00:00 */ + + BSTP_LOCK_ASSERT(bs); + + mif = NULL; + /* + * Search through the Ethernet adapters and find the one with the + * lowest value. The adapter which we take the MAC address from does + * not need to be part of the bridge, it just needs to be a unique + * value. 
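bstp_reinit() just below turns the selected MAC address into the 64-bit bridge identifier: the 16-bit configurable bridge priority occupies the top bits and the six address octets fill the low 48. A standalone sketch of that packing (the sample priority and MAC are made up):

#include <stdint.h>
#include <stdio.h>

/* Bridge id = priority << 48 | MAC, as assembled in bstp_reinit(). */
static uint64_t
bridge_id(uint16_t priority, const uint8_t mac[6])
{
	uint64_t id = (uint64_t)priority << 48;
	int i;

	for (i = 0; i < 6; i++)
		id |= (uint64_t)mac[i] << (40 - 8 * i);
	return (id);
}

int
main(void)
{
	const uint8_t mac[6] = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 };

	/* 0x8000 is BSTP_DEFAULT_BRIDGE_PRIORITY; prints 8000001122334455 */
	printf("%016jx\n", (uintmax_t)bridge_id(0x8000, mac));
	return (0);
}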
+ */ + IFNET_RLOCK_NOSLEEP(); + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { + if (ifp->if_type != IFT_ETHER) + continue; + + if (bstp_addr_cmp(IF_LLADDR(ifp), llzero) == 0) + continue; + + if (mif == NULL) { + mif = ifp; + continue; + } + if (bstp_addr_cmp(IF_LLADDR(ifp), IF_LLADDR(mif)) < 0) { + mif = ifp; + continue; + } + } + IFNET_RUNLOCK_NOSLEEP(); + + if (LIST_EMPTY(&bs->bs_bplist) || mif == NULL) { + /* Set the bridge and root id (lower bits) to zero */ + bs->bs_bridge_pv.pv_dbridge_id = + ((uint64_t)bs->bs_bridge_priority) << 48; + bs->bs_bridge_pv.pv_root_id = bs->bs_bridge_pv.pv_dbridge_id; + bs->bs_root_pv = bs->bs_bridge_pv; + /* Disable any remaining ports, they will have no MAC address */ + LIST_FOREACH(bp, &bs->bs_bplist, bp_next) { + bp->bp_infois = BSTP_INFO_DISABLED; + bstp_set_port_role(bp, BSTP_ROLE_DISABLED); + } + callout_stop(&bs->bs_bstpcallout); + return; + } + + e_addr = IF_LLADDR(mif); + bs->bs_bridge_pv.pv_dbridge_id = + (((uint64_t)bs->bs_bridge_priority) << 48) | + (((uint64_t)e_addr[0]) << 40) | + (((uint64_t)e_addr[1]) << 32) | + (((uint64_t)e_addr[2]) << 24) | + (((uint64_t)e_addr[3]) << 16) | + (((uint64_t)e_addr[4]) << 8) | + (((uint64_t)e_addr[5])); + + bs->bs_bridge_pv.pv_root_id = bs->bs_bridge_pv.pv_dbridge_id; + bs->bs_bridge_pv.pv_cost = 0; + bs->bs_bridge_pv.pv_dport_id = 0; + bs->bs_bridge_pv.pv_port_id = 0; + + if (bs->bs_running && callout_pending(&bs->bs_bstpcallout) == 0) + callout_reset(&bs->bs_bstpcallout, hz, bstp_tick, bs); + + LIST_FOREACH(bp, &bs->bs_bplist, bp_next) { + bp->bp_port_id = (bp->bp_priority << 8) | + (bp->bp_ifp->if_index & 0xfff); + bstp_ifupdstatus(bs, bp); + } + + bstp_assign_roles(bs); + bstp_timer_start(&bs->bs_link_timer, BSTP_LINK_TIMER); +} + +static int +bstp_modevent(module_t mod, int type, void *data) +{ + switch (type) { + case MOD_LOAD: + mtx_init(&bstp_list_mtx, "bridgestp list", NULL, MTX_DEF); + LIST_INIT(&bstp_list); + bstp_linkstate_p = bstp_linkstate; + break; + case MOD_UNLOAD: + bstp_linkstate_p = NULL; + mtx_destroy(&bstp_list_mtx); + break; + default: + return (EOPNOTSUPP); + } + return (0); +} + +static moduledata_t bstp_mod = { + "bridgestp", + bstp_modevent, + 0 +}; + +DECLARE_MODULE(bridgestp, bstp_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); +MODULE_VERSION(bridgestp, 1); + +void +bstp_attach(struct bstp_state *bs, struct bstp_cb_ops *cb) +{ + BSTP_LOCK_INIT(bs); + callout_init_mtx(&bs->bs_bstpcallout, &bs->bs_mtx, 0); + LIST_INIT(&bs->bs_bplist); + + bs->bs_bridge_max_age = BSTP_DEFAULT_MAX_AGE; + bs->bs_bridge_htime = BSTP_DEFAULT_HELLO_TIME; + bs->bs_bridge_fdelay = BSTP_DEFAULT_FORWARD_DELAY; + bs->bs_bridge_priority = BSTP_DEFAULT_BRIDGE_PRIORITY; + bs->bs_hold_time = BSTP_DEFAULT_HOLD_TIME; + bs->bs_migration_delay = BSTP_DEFAULT_MIGRATE_DELAY; + bs->bs_txholdcount = BSTP_DEFAULT_HOLD_COUNT; + bs->bs_protover = BSTP_PROTO_RSTP; + bs->bs_state_cb = cb->bcb_state; + bs->bs_rtage_cb = cb->bcb_rtage; + + getmicrotime(&bs->bs_last_tc_time); + + mtx_lock(&bstp_list_mtx); + LIST_INSERT_HEAD(&bstp_list, bs, bs_list); + mtx_unlock(&bstp_list_mtx); +} + +void +bstp_detach(struct bstp_state *bs) +{ + KASSERT(LIST_EMPTY(&bs->bs_bplist), ("bstp still active")); + + mtx_lock(&bstp_list_mtx); + LIST_REMOVE(bs, bs_list); + mtx_unlock(&bstp_list_mtx); + callout_drain(&bs->bs_bstpcallout); + BSTP_LOCK_DESTROY(bs); +} + +void +bstp_init(struct bstp_state *bs) +{ + BSTP_LOCK(bs); + callout_reset(&bs->bs_bstpcallout, hz, bstp_tick, bs); + bs->bs_running = 1; + bstp_reinit(bs); + BSTP_UNLOCK(bs); +} + +void +bstp_stop(struct 
bstp_state *bs) +{ + struct bstp_port *bp; + + BSTP_LOCK(bs); + + LIST_FOREACH(bp, &bs->bs_bplist, bp_next) + bstp_set_port_state(bp, BSTP_IFSTATE_DISCARDING); + + bs->bs_running = 0; + callout_stop(&bs->bs_bstpcallout); + BSTP_UNLOCK(bs); +} + +int +bstp_create(struct bstp_state *bs, struct bstp_port *bp, struct ifnet *ifp) +{ + bzero(bp, sizeof(struct bstp_port)); + + BSTP_LOCK(bs); + bp->bp_ifp = ifp; + bp->bp_bs = bs; + bp->bp_priority = BSTP_DEFAULT_PORT_PRIORITY; + TASK_INIT(&bp->bp_statetask, 0, bstp_notify_state, bp); + TASK_INIT(&bp->bp_rtagetask, 0, bstp_notify_rtage, bp); + + /* Init state */ + bp->bp_infois = BSTP_INFO_DISABLED; + bp->bp_flags = BSTP_PORT_AUTOEDGE|BSTP_PORT_AUTOPTP; + bstp_set_port_state(bp, BSTP_IFSTATE_DISCARDING); + bstp_set_port_proto(bp, bs->bs_protover); + bstp_set_port_role(bp, BSTP_ROLE_DISABLED); + bstp_set_port_tc(bp, BSTP_TCSTATE_INACTIVE); + bp->bp_path_cost = bstp_calc_path_cost(bp); + BSTP_UNLOCK(bs); + return (0); +} + +int +bstp_enable(struct bstp_port *bp) +{ + struct bstp_state *bs = bp->bp_bs; + struct ifnet *ifp = bp->bp_ifp; + + KASSERT(bp->bp_active == 0, ("already a bstp member")); + + switch (ifp->if_type) { + case IFT_ETHER: /* These can do spanning tree. */ + break; + default: + /* Nothing else can. */ + return (EINVAL); + } + + BSTP_LOCK(bs); + LIST_INSERT_HEAD(&bs->bs_bplist, bp, bp_next); + bp->bp_active = 1; + bp->bp_flags |= BSTP_PORT_NEWINFO; + bstp_reinit(bs); + bstp_update_roles(bs, bp); + BSTP_UNLOCK(bs); + return (0); +} + +void +bstp_disable(struct bstp_port *bp) +{ + struct bstp_state *bs = bp->bp_bs; + + KASSERT(bp->bp_active == 1, ("not a bstp member")); + + BSTP_LOCK(bs); + bstp_disable_port(bs, bp); + LIST_REMOVE(bp, bp_next); + bp->bp_active = 0; + bstp_reinit(bs); + BSTP_UNLOCK(bs); +} + +/* + * The bstp_port structure is about to be freed by the parent bridge. + */ +void +bstp_destroy(struct bstp_port *bp) +{ + KASSERT(bp->bp_active == 0, ("port is still attached")); + taskqueue_drain(taskqueue_swi, &bp->bp_statetask); + taskqueue_drain(taskqueue_swi, &bp->bp_rtagetask); +} diff --git a/freebsd/sys/net/bridgestp.h b/freebsd/sys/net/bridgestp.h new file mode 100644 index 00000000..91328900 --- /dev/null +++ b/freebsd/sys/net/bridgestp.h @@ -0,0 +1,396 @@ +/* $NetBSD: if_bridgevar.h,v 1.4 2003/07/08 07:13:50 itojun Exp $ */ + +/* + * Copyright 2001 Wasabi Systems, Inc. + * All rights reserved. + * + * Written by Jason R. Thorpe for Wasabi Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed for the NetBSD Project by + * Wasabi Systems, Inc. + * 4. The name of Wasabi Systems, Inc. may not be used to endorse + * or promote products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. 
``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright (c) 1999, 2000 Jason L. Wright (jason@thought.net) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Jason L. Wright + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * OpenBSD: if_bridge.h,v 1.14 2001/03/22 03:48:29 jason Exp + * + * $FreeBSD$ + */ + +/* + * Data structure and control definitions for STP interfaces. 
+ */
+
+#include <sys/callout.h>
+#include <sys/queue.h>
+
+/* STP port states */
+#define	BSTP_IFSTATE_DISABLED	0
+#define	BSTP_IFSTATE_LISTENING	1
+#define	BSTP_IFSTATE_LEARNING	2
+#define	BSTP_IFSTATE_FORWARDING	3
+#define	BSTP_IFSTATE_BLOCKING	4
+#define	BSTP_IFSTATE_DISCARDING	5
+
+#define	BSTP_TCSTATE_ACTIVE	1
+#define	BSTP_TCSTATE_DETECTED	2
+#define	BSTP_TCSTATE_INACTIVE	3
+#define	BSTP_TCSTATE_LEARNING	4
+#define	BSTP_TCSTATE_PROPAG	5
+#define	BSTP_TCSTATE_ACK	6
+#define	BSTP_TCSTATE_TC	7
+#define	BSTP_TCSTATE_TCN	8
+
+#define	BSTP_ROLE_DISABLED	0
+#define	BSTP_ROLE_ROOT	1
+#define	BSTP_ROLE_DESIGNATED	2
+#define	BSTP_ROLE_ALTERNATE	3
+#define	BSTP_ROLE_BACKUP	4
+
+#ifdef _KERNEL
+
+/* STP port flags */
+#define	BSTP_PORT_CANMIGRATE	0x0001
+#define	BSTP_PORT_NEWINFO	0x0002
+#define	BSTP_PORT_DISPUTED	0x0004
+#define	BSTP_PORT_ADMCOST	0x0008
+#define	BSTP_PORT_AUTOEDGE	0x0010
+#define	BSTP_PORT_AUTOPTP	0x0020
+#define	BSTP_PORT_ADMEDGE	0x0040
+#define	BSTP_PORT_PNDCOST	0x0080
+
+/* BPDU priority */
+#define	BSTP_PDU_SUPERIOR	1
+#define	BSTP_PDU_REPEATED	2
+#define	BSTP_PDU_INFERIOR	3
+#define	BSTP_PDU_INFERIORALT	4
+#define	BSTP_PDU_OTHER	5
+
+/* BPDU flags */
+#define	BSTP_PDU_PRMASK		0x0c	/* Port Role */
+#define	BSTP_PDU_PRSHIFT	2	/* Port Role offset */
+#define	BSTP_PDU_F_UNKN		0x00	/* Unknown port (00) */
+#define	BSTP_PDU_F_ALT		0x01	/* Alt/Backup port (01) */
+#define	BSTP_PDU_F_ROOT		0x02	/* Root port (10) */
+#define	BSTP_PDU_F_DESG		0x03	/* Designated port (11) */
+
+#define	BSTP_PDU_STPMASK	0x81	/* strip unused STP flags */
+#define	BSTP_PDU_RSTPMASK	0x7f	/* strip unused RSTP flags */
+#define	BSTP_PDU_F_TC		0x01	/* Topology change */
+#define	BSTP_PDU_F_P		0x02	/* Proposal flag */
+#define	BSTP_PDU_F_L		0x10	/* Learning flag */
+#define	BSTP_PDU_F_F		0x20	/* Forwarding flag */
+#define	BSTP_PDU_F_A		0x40	/* Agreement flag */
+#define	BSTP_PDU_F_TCA		0x80	/* Topology change ack */
+
+/*
+ * Spanning tree defaults.
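The port role travels in bits 2..3 of the RSTP BPDU flags octet, alongside the single-bit topology-change, proposal, learning, forwarding and agreement flags defined above. A small sketch of encoding and decoding that field with the masks above (macro names shortened for a standalone build):

#include <assert.h>
#include <stdint.h>

#define PDU_PRMASK  0x0c	/* BSTP_PDU_PRMASK: port role bits */
#define PDU_PRSHIFT 2		/* BSTP_PDU_PRSHIFT */
#define PDU_F_ROOT  0x02	/* BSTP_PDU_F_ROOT (role code, not a flag bit) */
#define PDU_F_P     0x02	/* BSTP_PDU_F_P: proposal */
#define PDU_F_TC    0x01	/* BSTP_PDU_F_TC: topology change */

static uint8_t
set_role(uint8_t flags, uint8_t role)
{
	return ((flags & ~PDU_PRMASK) | (role << PDU_PRSHIFT));
}

static uint8_t
get_role(uint8_t flags)
{
	return ((flags & PDU_PRMASK) >> PDU_PRSHIFT);
}

int
main(void)
{
	/* Root-port proposal with topology change set: 0x0b */
	uint8_t flags = set_role(PDU_F_P | PDU_F_TC, PDU_F_ROOT);

	assert(flags == 0x0b);
	assert(get_role(flags) == PDU_F_ROOT);
	return (0);
}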
+ */ +#define BSTP_DEFAULT_MAX_AGE (20 * 256) +#define BSTP_DEFAULT_HELLO_TIME (2 * 256) +#define BSTP_DEFAULT_FORWARD_DELAY (15 * 256) +#define BSTP_DEFAULT_HOLD_TIME (1 * 256) +#define BSTP_DEFAULT_MIGRATE_DELAY (3 * 256) +#define BSTP_DEFAULT_HOLD_COUNT 6 +#define BSTP_DEFAULT_BRIDGE_PRIORITY 0x8000 +#define BSTP_DEFAULT_PORT_PRIORITY 0x80 +#define BSTP_DEFAULT_PATH_COST 55 +#define BSTP_MIN_HELLO_TIME (1 * 256) +#define BSTP_MIN_MAX_AGE (6 * 256) +#define BSTP_MIN_FORWARD_DELAY (4 * 256) +#define BSTP_MIN_HOLD_COUNT 1 +#define BSTP_MAX_HELLO_TIME (2 * 256) +#define BSTP_MAX_MAX_AGE (40 * 256) +#define BSTP_MAX_FORWARD_DELAY (30 * 256) +#define BSTP_MAX_HOLD_COUNT 10 +#define BSTP_MAX_PRIORITY 61440 +#define BSTP_MAX_PORT_PRIORITY 240 +#define BSTP_MAX_PATH_COST 200000000 + +/* BPDU message types */ +#define BSTP_MSGTYPE_CFG 0x00 /* Configuration */ +#define BSTP_MSGTYPE_RSTP 0x02 /* Rapid STP */ +#define BSTP_MSGTYPE_TCN 0x80 /* Topology chg notification */ + +/* Protocol versions */ +#define BSTP_PROTO_ID 0x00 +#define BSTP_PROTO_STP 0x00 +#define BSTP_PROTO_RSTP 0x02 +#define BSTP_PROTO_MAX BSTP_PROTO_RSTP + +#define BSTP_INFO_RECEIVED 1 +#define BSTP_INFO_MINE 2 +#define BSTP_INFO_AGED 3 +#define BSTP_INFO_DISABLED 4 + + +#define BSTP_MESSAGE_AGE_INCR (1 * 256) /* in 256ths of a second */ +#define BSTP_TICK_VAL (1 * 256) /* in 256ths of a second */ +#define BSTP_LINK_TIMER (BSTP_TICK_VAL * 15) + +/* + * Driver callbacks for STP state changes + */ +typedef void (*bstp_state_cb_t)(struct ifnet *, int); +typedef void (*bstp_rtage_cb_t)(struct ifnet *, int); +struct bstp_cb_ops { + bstp_state_cb_t bcb_state; + bstp_rtage_cb_t bcb_rtage; +}; + +/* + * Because BPDU's do not make nicely aligned structures, two different + * declarations are used: bstp_?bpdu (wire representation, packed) and + * bstp_*_unit (internal, nicely aligned version). + */ + +/* configuration bridge protocol data unit */ +struct bstp_cbpdu { + uint8_t cbu_dsap; /* LLC: destination sap */ + uint8_t cbu_ssap; /* LLC: source sap */ + uint8_t cbu_ctl; /* LLC: control */ + uint16_t cbu_protoid; /* protocol id */ + uint8_t cbu_protover; /* protocol version */ + uint8_t cbu_bpdutype; /* message type */ + uint8_t cbu_flags; /* flags (below) */ + + /* root id */ + uint16_t cbu_rootpri; /* root priority */ + uint8_t cbu_rootaddr[6]; /* root address */ + + uint32_t cbu_rootpathcost; /* root path cost */ + + /* bridge id */ + uint16_t cbu_bridgepri; /* bridge priority */ + uint8_t cbu_bridgeaddr[6]; /* bridge address */ + + uint16_t cbu_portid; /* port id */ + uint16_t cbu_messageage; /* current message age */ + uint16_t cbu_maxage; /* maximum age */ + uint16_t cbu_hellotime; /* hello time */ + uint16_t cbu_forwarddelay; /* forwarding delay */ + uint8_t cbu_versionlen; /* version 1 length */ +} __packed; +#define BSTP_BPDU_STP_LEN (3 + 35) /* LLC + STP pdu */ +#define BSTP_BPDU_RSTP_LEN (3 + 36) /* LLC + RSTP pdu */ + +/* topology change notification bridge protocol data unit */ +struct bstp_tbpdu { + uint8_t tbu_dsap; /* LLC: destination sap */ + uint8_t tbu_ssap; /* LLC: source sap */ + uint8_t tbu_ctl; /* LLC: control */ + uint16_t tbu_protoid; /* protocol id */ + uint8_t tbu_protover; /* protocol version */ + uint8_t tbu_bpdutype; /* message type */ +} __packed; + +/* + * Timekeeping structure used in spanning tree code. 
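All of these times are stored in 256ths of a second, and the once-per-second bstp_tick() shown earlier retires them by subtracting BSTP_TICK_VAL on each pass. A standalone sketch of that countdown, mirroring bstp_timer_expired():

#include <stdio.h>

#define TICK_VAL (1 * 256)	/* BSTP_TICK_VAL: one second in 256ths */

struct my_timer {
	int active;
	int latched;
	int value;		/* remaining time in 256ths of a second */
};

/* Mirrors bstp_timer_expired(); latched timers never fire. */
static int
timer_expired(struct my_timer *t)
{
	if (t->active == 0 || t->latched)
		return (0);
	t->value -= TICK_VAL;
	if (t->value <= 0) {
		t->active = 0;
		t->value = 0;
		return (1);
	}
	return (0);
}

int
main(void)
{
	/* A 15-second timer (BSTP_LINK_TIMER) fires on the 15th tick. */
	struct my_timer t = { 1, 0, 15 * 256 };
	int ticks = 0;

	while (!timer_expired(&t))
		ticks++;
	printf("expired after %d ticks\n", ticks + 1);
	return (0);
}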
+ */ +struct bstp_timer { + int active; + int latched; + int value; +}; + +struct bstp_pri_vector { + uint64_t pv_root_id; + uint32_t pv_cost; + uint64_t pv_dbridge_id; + uint16_t pv_dport_id; + uint16_t pv_port_id; +}; + +struct bstp_config_unit { + struct bstp_pri_vector cu_pv; + uint16_t cu_message_age; + uint16_t cu_max_age; + uint16_t cu_forward_delay; + uint16_t cu_hello_time; + uint8_t cu_message_type; + uint8_t cu_topology_change_ack; + uint8_t cu_topology_change; + uint8_t cu_proposal; + uint8_t cu_agree; + uint8_t cu_learning; + uint8_t cu_forwarding; + uint8_t cu_role; +}; + +struct bstp_tcn_unit { + uint8_t tu_message_type; +}; + +struct bstp_port { + LIST_ENTRY(bstp_port) bp_next; + struct ifnet *bp_ifp; /* parent if */ + struct bstp_state *bp_bs; + uint8_t bp_active; + uint8_t bp_protover; + uint32_t bp_flags; + uint32_t bp_path_cost; + uint16_t bp_port_msg_age; + uint16_t bp_port_max_age; + uint16_t bp_port_fdelay; + uint16_t bp_port_htime; + uint16_t bp_desg_msg_age; + uint16_t bp_desg_max_age; + uint16_t bp_desg_fdelay; + uint16_t bp_desg_htime; + struct bstp_timer bp_edge_delay_timer; + struct bstp_timer bp_forward_delay_timer; + struct bstp_timer bp_hello_timer; + struct bstp_timer bp_message_age_timer; + struct bstp_timer bp_migrate_delay_timer; + struct bstp_timer bp_recent_backup_timer; + struct bstp_timer bp_recent_root_timer; + struct bstp_timer bp_tc_timer; + struct bstp_config_unit bp_msg_cu; + struct bstp_pri_vector bp_desg_pv; + struct bstp_pri_vector bp_port_pv; + uint16_t bp_port_id; + uint8_t bp_state; + uint8_t bp_tcstate; + uint8_t bp_role; + uint8_t bp_infois; + uint8_t bp_tc_ack; + uint8_t bp_tc_prop; + uint8_t bp_fdbflush; + uint8_t bp_priority; + uint8_t bp_ptp_link; + uint8_t bp_agree; + uint8_t bp_agreed; + uint8_t bp_sync; + uint8_t bp_synced; + uint8_t bp_proposing; + uint8_t bp_proposed; + uint8_t bp_operedge; + uint8_t bp_reroot; + uint8_t bp_rcvdtc; + uint8_t bp_rcvdtca; + uint8_t bp_rcvdtcn; + uint32_t bp_forward_transitions; + uint8_t bp_txcount; + struct task bp_statetask; + struct task bp_rtagetask; +}; + +/* + * Software state for each bridge STP. 
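The bstp_pri_vector above is RSTP's priority vector. Per IEEE 802.1D-2004 clause 17.6 the vectors are compared field by field, numerically lower being superior; the kernel open-codes these comparisons where it needs them, but the rule itself can be stated compactly. A sketch, with the field order assumed from the struct above:

#include <stdint.h>
#include <stdio.h>

struct pri_vector {		/* mirrors struct bstp_pri_vector */
	uint64_t pv_root_id;
	uint32_t pv_cost;
	uint64_t pv_dbridge_id;
	uint16_t pv_dport_id;
	uint16_t pv_port_id;
};

/* Returns <0 if a is superior to b, >0 if inferior, 0 if equal. */
static int
pv_compare(const struct pri_vector *a, const struct pri_vector *b)
{
	if (a->pv_root_id != b->pv_root_id)
		return (a->pv_root_id < b->pv_root_id ? -1 : 1);
	if (a->pv_cost != b->pv_cost)
		return (a->pv_cost < b->pv_cost ? -1 : 1);
	if (a->pv_dbridge_id != b->pv_dbridge_id)
		return (a->pv_dbridge_id < b->pv_dbridge_id ? -1 : 1);
	if (a->pv_dport_id != b->pv_dport_id)
		return (a->pv_dport_id < b->pv_dport_id ? -1 : 1);
	return (0);
}

int
main(void)
{
	struct pri_vector a = { 0x8000001122334455ULL, 200000, 0, 0, 0 };
	struct pri_vector b = { 0x8000001122334455ULL, 20000, 0, 0, 0 };

	/* Same root; the lower path cost wins, so b is superior. */
	printf("%d\n", pv_compare(&b, &a));	/* -1 */
	return (0);
}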
+ */ +struct bstp_state { + LIST_ENTRY(bstp_state) bs_list; + uint8_t bs_running; + struct mtx bs_mtx; + struct bstp_pri_vector bs_bridge_pv; + struct bstp_pri_vector bs_root_pv; + struct bstp_port *bs_root_port; + uint8_t bs_protover; + uint16_t bs_migration_delay; + uint16_t bs_edge_delay; + uint16_t bs_bridge_max_age; + uint16_t bs_bridge_fdelay; + uint16_t bs_bridge_htime; + uint16_t bs_root_msg_age; + uint16_t bs_root_max_age; + uint16_t bs_root_fdelay; + uint16_t bs_root_htime; + uint16_t bs_hold_time; + uint16_t bs_bridge_priority; + uint8_t bs_txholdcount; + uint8_t bs_allsynced; + struct callout bs_bstpcallout; /* STP callout */ + struct bstp_timer bs_link_timer; + struct timeval bs_last_tc_time; + LIST_HEAD(, bstp_port) bs_bplist; + bstp_state_cb_t bs_state_cb; + bstp_rtage_cb_t bs_rtage_cb; +}; + +#define BSTP_LOCK_INIT(_bs) mtx_init(&(_bs)->bs_mtx, "bstp", NULL, MTX_DEF) +#define BSTP_LOCK_DESTROY(_bs) mtx_destroy(&(_bs)->bs_mtx) +#define BSTP_LOCK(_bs) mtx_lock(&(_bs)->bs_mtx) +#define BSTP_UNLOCK(_bs) mtx_unlock(&(_bs)->bs_mtx) +#define BSTP_LOCK_ASSERT(_bs) mtx_assert(&(_bs)->bs_mtx, MA_OWNED) + +extern const uint8_t bstp_etheraddr[]; + +extern void (*bstp_linkstate_p)(struct ifnet *ifp, int state); + +void bstp_attach(struct bstp_state *, struct bstp_cb_ops *); +void bstp_detach(struct bstp_state *); +void bstp_init(struct bstp_state *); +void bstp_stop(struct bstp_state *); +int bstp_create(struct bstp_state *, struct bstp_port *, struct ifnet *); +int bstp_enable(struct bstp_port *); +void bstp_disable(struct bstp_port *); +void bstp_destroy(struct bstp_port *); +void bstp_linkstate(struct ifnet *, int); +int bstp_set_htime(struct bstp_state *, int); +int bstp_set_fdelay(struct bstp_state *, int); +int bstp_set_maxage(struct bstp_state *, int); +int bstp_set_holdcount(struct bstp_state *, int); +int bstp_set_protocol(struct bstp_state *, int); +int bstp_set_priority(struct bstp_state *, int); +int bstp_set_port_priority(struct bstp_port *, int); +int bstp_set_path_cost(struct bstp_port *, uint32_t); +int bstp_set_edge(struct bstp_port *, int); +int bstp_set_autoedge(struct bstp_port *, int); +int bstp_set_ptp(struct bstp_port *, int); +int bstp_set_autoptp(struct bstp_port *, int); +struct mbuf *bstp_input(struct bstp_port *, struct ifnet *, struct mbuf *); + +#endif /* _KERNEL */ diff --git a/freebsd/sys/net/ethernet.h b/freebsd/sys/net/ethernet.h new file mode 100644 index 00000000..17d022b2 --- /dev/null +++ b/freebsd/sys/net/ethernet.h @@ -0,0 +1,2 @@ +#include +#include diff --git a/freebsd/sys/net/fddi.h b/freebsd/sys/net/fddi.h new file mode 100644 index 00000000..03deabff --- /dev/null +++ b/freebsd/sys/net/fddi.h @@ -0,0 +1,105 @@ +/*- + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * Copyright (c) 1995 Matt Thomas (thomas@lkg.dec.com) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)if_fddi.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_IF_FDDI_HH_ +#define _NETINET_IF_FDDI_HH_ + +#define FDDIIPMTU 4352 +#define FDDIMTU 4470 +#define FDDIMIN 3 + +#define FDDIFC_C 0x80 /* 0b10000000 */ +#define FDDIFC_L 0x40 /* 0b01000000 */ +#define FDDIFC_F 0x30 /* 0b00110000 */ +#define FDDIFC_Z 0x0F /* 0b00001111 */ +#define FDDIFC_CLFF 0xF0 /* Class/Length/Format bits */ +#define FDDIFC_ZZZZ 0x0F /* Control bits */ + +/* + * FDDI Frame Control values. (48-bit addressing only). + */ +#define FDDIFC_VOID 0x40 /* Void frame */ +#define FDDIFC_NRT 0x80 /* Nonrestricted token */ +#define FDDIFC_RT 0xc0 /* Restricted token */ +#define FDDIFC_MAC_BEACON 0xc2 /* MAC Beacon frame */ +#define FDDIFC_MAC_CLAIM 0xc3 /* MAC Claim frame */ +#define FDDIFC_LLC_ASYNC 0x50 +#define FDDIFC_LLC_PRIO0 0 +#define FDDIFC_LLC_PRIO1 1 +#define FDDIFC_LLC_PRIO2 2 +#define FDDIFC_LLC_PRIO3 3 +#define FDDIFC_LLC_PRIO4 4 +#define FDDIFC_LLC_PRIO5 5 +#define FDDIFC_LLC_PRIO6 6 +#define FDDIFC_LLC_PRIO7 7 +#define FDDIFC_LLC_SYNC 0xd0 +#define FDDIFC_IMP_ASYNC 0x60 /* Implementor Async. */ +#define FDDIFC_IMP_SYNC 0xe0 /* Implementor Synch. */ +#define FDDIFC_SMT 0x40 +#define FDDIFC_SMT_INFO 0x41 /* SMT Info */ +#define FDDIFC_SMT_NSA 0x4F /* SMT Next station adrs */ +#define FDDIFC_MAC 0xc0 /* MAC frame */ + +#define FDDI_ADDR_LEN 6 +#define FDDI_HDR_LEN (sizeof(struct fddi_header)) + +/* + * Structure of an 100Mb/s FDDI header. + */ +struct fddi_header { + u_char fddi_fc; + u_char fddi_dhost[FDDI_ADDR_LEN]; + u_char fddi_shost[FDDI_ADDR_LEN]; +}; + +#if defined(_KERNEL) +#define fddi_ipmulticast_min ether_ipmulticast_min +#define fddi_ipmulticast_max ether_ipmulticast_max +#define fddi_addmulti ether_addmulti +#define fddi_delmulti ether_delmulti +#define fddi_sprintf ether_sprintf + +#define FDDI_BPF_UNSUPPORTED 0 +#define FDDI_BPF_SUPPORTED 1 + +void fddi_ifattach(struct ifnet *, const u_int8_t *, int); +void fddi_ifdetach(struct ifnet *, int); +int fddi_ioctl(struct ifnet *, u_long, caddr_t); + +#endif /* _KERNEL */ +#endif /* _NET_FDDI_HH_ */ diff --git a/freebsd/sys/net/firewire.h b/freebsd/sys/net/firewire.h new file mode 100644 index 00000000..5411dbf8 --- /dev/null +++ b/freebsd/sys/net/firewire.h @@ -0,0 +1,142 @@ +/*- + * Copyright (c) 2004 Doug Rabson + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _NET_FIREWIRE_HH_ +#define _NET_FIREWIRE_HH_ + +#define FW_ENCAP_UNFRAG 0 +#define FW_ENCAP_FIRST 1 +#define FW_ENCAP_LAST 2 +#define FW_ENCAP_NEXT 3 + +union fw_encap { + uint32_t ul[2]; + struct { +#if BYTE_ORDER == BIG_ENDIAN + uint32_t lf :2; + uint32_t reserved :14; + uint32_t ether_type :16; +#else + uint32_t ether_type :16; + uint32_t reserved :14; + uint32_t lf :2; +#endif + } unfrag; + struct { +#if BYTE_ORDER == BIG_ENDIAN + uint32_t lf :2; + uint32_t reserved1 :2; + uint32_t datagram_size :12; + uint32_t ether_type :16; + uint32_t dgl :16; + uint32_t reserved2 :16; +#else + uint32_t ether_type :16; + uint32_t datagram_size :12; + uint32_t reserved1 :2; + uint32_t lf :2; + uint32_t reserved2 :16; + uint32_t dgl :16; +#endif + } firstfrag; + struct { +#if BYTE_ORDER == BIG_ENDIAN + uint32_t lf :2; + uint32_t reserved1 :2; + uint32_t datagram_size :12; + uint32_t reserved2 :4; + uint32_t fragment_offset :12; + uint32_t dgl :16; + uint32_t reserved3 :16; +#else + uint32_t fragment_offset :12; + uint32_t reserved2 :4; + uint32_t datagram_size :12; + uint32_t reserved1 :2; + uint32_t lf :2; + uint32_t reserved3 :16; + uint32_t dgl :16; +#endif + } nextfrag; +}; + +#define MTAG_FIREWIRE 1394 +#define MTAG_FIREWIRE_HWADDR 0 +#define MTAG_FIREWIRE_SENDER_EUID 1 + +struct fw_hwaddr { + uint32_t sender_unique_ID_hi; + uint32_t sender_unique_ID_lo; + uint8_t sender_max_rec; + uint8_t sspd; + uint16_t sender_unicast_FIFO_hi; + uint32_t sender_unicast_FIFO_lo; +}; + +/* + * BPF wants to see one of these. + */ +struct fw_bpfhdr { + uint8_t firewire_dhost[8]; + uint8_t firewire_shost[8]; + uint16_t firewire_type; +}; + +#ifdef _KERNEL + +/* + * A structure to track the reassembly of a link-level fragmented + * datagram. 
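The fw_encap bitfields describe the RFC 2734 (IPv4 over IEEE 1394) encapsulation header: reading the big-endian branch of the union, lf occupies the top two bits of the first wire word and the EtherType the low sixteen. A hedged sketch that builds the unfragmented first word with plain shifts rather than bitfields (the bit positions are an inference from the struct above):

#include <stdint.h>
#include <stdio.h>

#define LF_UNFRAG 0	/* FW_ENCAP_UNFRAG */
#define LF_FIRST  1	/* FW_ENCAP_FIRST */

/* First encap word in host order: lf in bits 31..30, ether_type in 15..0. */
static uint32_t
encap_word0(unsigned lf, unsigned ether_type)
{
	return ((uint32_t)lf << 30 | (ether_type & 0xffff));
}

int
main(void)
{
	/* Unfragmented IPv4 datagram: lf = 0, ether_type = 0x0800. */
	printf("0x%08x\n", encap_word0(LF_UNFRAG, 0x0800));
	return (0);
}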
+ */ +struct fw_reass { + STAILQ_ENTRY(fw_reass) fr_link; + uint32_t fr_id; /* host+dgl */ + struct mbuf *fr_frags; /* chain of frags */ +}; +STAILQ_HEAD(fw_reass_list, fw_reass); + +struct fw_com { + struct ifnet *fc_ifp; + struct fw_hwaddr fc_hwaddr; + struct firewire_comm *fc_fc; + uint8_t fc_broadcast_channel; + uint8_t fc_speed; /* our speed */ + uint16_t fc_node; /* our nodeid */ + struct fw_reass_list fc_frags; /* partial datagrams */ +}; +#define IFP2FWC(ifp) ((struct fw_com *)(ifp)->if_l2com) + +extern void firewire_input(struct ifnet *ifp, struct mbuf *m, uint16_t src); +extern void firewire_ifattach(struct ifnet *, struct fw_hwaddr *); +extern void firewire_ifdetach(struct ifnet *); +extern void firewire_busreset(struct ifnet *); +extern int firewire_ioctl(struct ifnet *, u_long, caddr_t); + +#endif /* !_KERNEL */ + +#endif /* !_NET_FIREWIRE_HH_ */ diff --git a/freebsd/sys/net/flowtable.h b/freebsd/sys/net/flowtable.h new file mode 100644 index 00000000..c4a09659 --- /dev/null +++ b/freebsd/sys/net/flowtable.h @@ -0,0 +1,82 @@ +/************************************************************************** + +Copyright (c) 2008-2010, BitGravity Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the BitGravity Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +$FreeBSD$ + +***************************************************************************/ + +#ifndef _NET_FLOWTABLE_HH_ +#define _NET_FLOWTABLE_HH_ + +#ifdef _KERNEL + +#define FL_HASH_ALL (1<<0) /* hash 4-tuple + protocol */ +#define FL_PCPU (1<<1) /* pcpu cache */ +#define FL_NOAUTO (1<<2) /* don't automatically add flentry on miss */ + +#define FL_TCP (1<<11) +#define FL_SCTP (1<<12) +#define FL_UDP (1<<13) +#define FL_DEBUG (1<<14) +#define FL_DEBUG_ALL (1<<15) + +struct flowtable; +struct flentry; +struct route; +struct route_in6; + +VNET_DECLARE(struct flowtable *, ip_ft); +#define V_ip_ft VNET(ip_ft) + +VNET_DECLARE(struct flowtable *, ip6_ft); +#define V_ip6_ft VNET(ip6_ft) + +struct flowtable *flowtable_alloc(char *name, int nentry, int flags); + +/* + * Given a flow table, look up the L3 and L2 information and + * return it in the route. 
+ * + */ +struct flentry *flowtable_lookup_mbuf(struct flowtable *ft, struct mbuf *m, int af); + +struct flentry *flowtable_lookup(struct flowtable *ft, struct sockaddr_storage *ssa, + struct sockaddr_storage *dsa, uint32_t fibnum, int flags); + +int kern_flowtable_insert(struct flowtable *ft, struct sockaddr_storage *ssa, + struct sockaddr_storage *dsa, struct route *ro, uint32_t fibnum, int flags); + +void flow_invalidate(struct flentry *fl); +void flowtable_route_flush(struct flowtable *ft, struct rtentry *rt); + +void flow_to_route(struct flentry *fl, struct route *ro); + +void flow_to_route_in6(struct flentry *fl, struct route_in6 *ro); + + +#endif /* _KERNEL */ +#endif diff --git a/freebsd/sys/net/ieee8023ad_lacp.c b/freebsd/sys/net/ieee8023ad_lacp.c new file mode 100644 index 00000000..75c0d9ea --- /dev/null +++ b/freebsd/sys/net/ieee8023ad_lacp.c @@ -0,0 +1,1947 @@ +#include + +/* $NetBSD: ieee8023ad_lacp.c,v 1.3 2005/12/11 12:24:54 christos Exp $ */ + +/*- + * Copyright (c)2005 YAMAMOTO Takashi, + * Copyright (c)2008 Andrew Thompson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include /* hz */ +#include /* for net/if.h */ +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +/* + * actor system priority and port priority. + * XXX should be configurable. 
+ */ + +#define LACP_SYSTEM_PRIO 0x8000 +#define LACP_PORT_PRIO 0x8000 + +const uint8_t ethermulticastaddr_slowprotocols[ETHER_ADDR_LEN] = + { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x02 }; + +static const struct tlv_template lacp_info_tlv_template[] = { + { LACP_TYPE_ACTORINFO, + sizeof(struct tlvhdr) + sizeof(struct lacp_peerinfo) }, + { LACP_TYPE_PARTNERINFO, + sizeof(struct tlvhdr) + sizeof(struct lacp_peerinfo) }, + { LACP_TYPE_COLLECTORINFO, + sizeof(struct tlvhdr) + sizeof(struct lacp_collectorinfo) }, + { 0, 0 }, +}; + +static const struct tlv_template marker_info_tlv_template[] = { + { MARKER_TYPE_INFO, + sizeof(struct tlvhdr) + sizeof(struct lacp_markerinfo) }, + { 0, 0 }, +}; + +static const struct tlv_template marker_response_tlv_template[] = { + { MARKER_TYPE_RESPONSE, + sizeof(struct tlvhdr) + sizeof(struct lacp_markerinfo) }, + { 0, 0 }, +}; + +typedef void (*lacp_timer_func_t)(struct lacp_port *); + +static void lacp_fill_actorinfo(struct lacp_port *, struct lacp_peerinfo *); +static void lacp_fill_markerinfo(struct lacp_port *, + struct lacp_markerinfo *); + +static uint64_t lacp_aggregator_bandwidth(struct lacp_aggregator *); +static void lacp_suppress_distributing(struct lacp_softc *, + struct lacp_aggregator *); +static void lacp_transit_expire(void *); +static void lacp_update_portmap(struct lacp_softc *); +static void lacp_select_active_aggregator(struct lacp_softc *); +static uint16_t lacp_compose_key(struct lacp_port *); +static int tlv_check(const void *, size_t, const struct tlvhdr *, + const struct tlv_template *, boolean_t); +static void lacp_tick(void *); + +static void lacp_fill_aggregator_id(struct lacp_aggregator *, + const struct lacp_port *); +static void lacp_fill_aggregator_id_peer(struct lacp_peerinfo *, + const struct lacp_peerinfo *); +static int lacp_aggregator_is_compatible(const struct lacp_aggregator *, + const struct lacp_port *); +static int lacp_peerinfo_is_compatible(const struct lacp_peerinfo *, + const struct lacp_peerinfo *); + +static struct lacp_aggregator *lacp_aggregator_get(struct lacp_softc *, + struct lacp_port *); +static void lacp_aggregator_addref(struct lacp_softc *, + struct lacp_aggregator *); +static void lacp_aggregator_delref(struct lacp_softc *, + struct lacp_aggregator *); + +/* receive machine */ + +static int lacp_pdu_input(struct lacp_port *, struct mbuf *); +static int lacp_marker_input(struct lacp_port *, struct mbuf *); +static void lacp_sm_rx(struct lacp_port *, const struct lacpdu *); +static void lacp_sm_rx_timer(struct lacp_port *); +static void lacp_sm_rx_set_expired(struct lacp_port *); +static void lacp_sm_rx_update_ntt(struct lacp_port *, + const struct lacpdu *); +static void lacp_sm_rx_record_pdu(struct lacp_port *, + const struct lacpdu *); +static void lacp_sm_rx_update_selected(struct lacp_port *, + const struct lacpdu *); +static void lacp_sm_rx_record_default(struct lacp_port *); +static void lacp_sm_rx_update_default_selected(struct lacp_port *); +static void lacp_sm_rx_update_selected_from_peerinfo(struct lacp_port *, + const struct lacp_peerinfo *); + +/* mux machine */ + +static void lacp_sm_mux(struct lacp_port *); +static void lacp_set_mux(struct lacp_port *, enum lacp_mux_state); +static void lacp_sm_mux_timer(struct lacp_port *); + +/* periodic transmit machine */ + +static void lacp_sm_ptx_update_timeout(struct lacp_port *, uint8_t); +static void lacp_sm_ptx_tx_schedule(struct lacp_port *); +static void lacp_sm_ptx_timer(struct lacp_port *); + +/* transmit machine */ + +static void lacp_sm_tx(struct 
lacp_port *); +static void lacp_sm_assert_ntt(struct lacp_port *); + +static void lacp_run_timers(struct lacp_port *); +static int lacp_compare_peerinfo(const struct lacp_peerinfo *, + const struct lacp_peerinfo *); +static int lacp_compare_systemid(const struct lacp_systemid *, + const struct lacp_systemid *); +static void lacp_port_enable(struct lacp_port *); +static void lacp_port_disable(struct lacp_port *); +static void lacp_select(struct lacp_port *); +static void lacp_unselect(struct lacp_port *); +static void lacp_disable_collecting(struct lacp_port *); +static void lacp_enable_collecting(struct lacp_port *); +static void lacp_disable_distributing(struct lacp_port *); +static void lacp_enable_distributing(struct lacp_port *); +static int lacp_xmit_lacpdu(struct lacp_port *); +static int lacp_xmit_marker(struct lacp_port *); + +#if defined(LACP_DEBUG) +static void lacp_dump_lacpdu(const struct lacpdu *); +static const char *lacp_format_partner(const struct lacp_peerinfo *, char *, + size_t); +static const char *lacp_format_lagid(const struct lacp_peerinfo *, + const struct lacp_peerinfo *, char *, size_t); +static const char *lacp_format_lagid_aggregator(const struct lacp_aggregator *, + char *, size_t); +static const char *lacp_format_state(uint8_t, char *, size_t); +static const char *lacp_format_mac(const uint8_t *, char *, size_t); +static const char *lacp_format_systemid(const struct lacp_systemid *, char *, + size_t); +static const char *lacp_format_portid(const struct lacp_portid *, char *, + size_t); +static void lacp_dprintf(const struct lacp_port *, const char *, ...) + __attribute__((__format__(__printf__, 2, 3))); +#define LACP_DPRINTF(a) lacp_dprintf a +#else +#define LACP_DPRINTF(a) /* nothing */ +#endif + +/* + * partner administration variables. + * XXX should be configurable. 
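The lacp_format_state() helper prototyped above renders the 802.3ad actor/partner state octet for LACP_DPRINTF debugging. A rough userland equivalent; the bit positions follow IEEE 802.3ad 43.4.2.2 and ieee8023ad_lacp.h, which this diff does not include, so treat them as assumed:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Assumed 802.3ad state bits, LSB first. */
static const char *statebits[8] = {
	"ACTIVITY", "TIMEOUT", "AGGREGATION", "SYNC",
	"COLLECTING", "DISTRIBUTING", "DEFAULTED", "EXPIRED"
};

/* Roughly what lacp_format_state() produces. */
static const char *
format_state(uint8_t state, char *buf, size_t len)
{
	int i;

	buf[0] = '\0';
	for (i = 0; i < 8; i++) {
		if ((state & (1 << i)) == 0)
			continue;
		if (buf[0] != '\0')
			strncat(buf, ",", len - strlen(buf) - 1);
		strncat(buf, statebits[i], len - strlen(buf) - 1);
	}
	return (buf);
}

int
main(void)
{
	char buf[128];

	/* 0x3c: the "optimistic" partner default used just below. */
	puts(format_state(0x04 | 0x08 | 0x10 | 0x20, buf, sizeof(buf)));
	return (0);
}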
+ */ + +static const struct lacp_peerinfo lacp_partner_admin = { + .lip_systemid = { .lsi_prio = 0xffff }, + .lip_portid = { .lpi_prio = 0xffff }, +#if 1 + /* optimistic */ + .lip_state = LACP_STATE_SYNC | LACP_STATE_AGGREGATION | + LACP_STATE_COLLECTING | LACP_STATE_DISTRIBUTING, +#else + /* pessimistic */ + .lip_state = 0, +#endif +}; + +static const lacp_timer_func_t lacp_timer_funcs[LACP_NTIMER] = { + [LACP_TIMER_CURRENT_WHILE] = lacp_sm_rx_timer, + [LACP_TIMER_PERIODIC] = lacp_sm_ptx_timer, + [LACP_TIMER_WAIT_WHILE] = lacp_sm_mux_timer, +}; + +struct mbuf * +lacp_input(struct lagg_port *lgp, struct mbuf *m) +{ + struct lacp_port *lp = LACP_PORT(lgp); + uint8_t subtype; + + if (m->m_pkthdr.len < sizeof(struct ether_header) + sizeof(subtype)) { + m_freem(m); + return (NULL); + } + + m_copydata(m, sizeof(struct ether_header), sizeof(subtype), &subtype); + switch (subtype) { + case SLOWPROTOCOLS_SUBTYPE_LACP: + lacp_pdu_input(lp, m); + return (NULL); + + case SLOWPROTOCOLS_SUBTYPE_MARKER: + lacp_marker_input(lp, m); + return (NULL); + } + + /* Not a subtype we are interested in */ + return (m); +} + +/* + * lacp_pdu_input: process lacpdu + */ +static int +lacp_pdu_input(struct lacp_port *lp, struct mbuf *m) +{ + struct lacp_softc *lsc = lp->lp_lsc; + struct lacpdu *du; + int error = 0; + + if (m->m_pkthdr.len != sizeof(*du)) { + goto bad; + } + + if ((m->m_flags & M_MCAST) == 0) { + goto bad; + } + + if (m->m_len < sizeof(*du)) { + m = m_pullup(m, sizeof(*du)); + if (m == NULL) { + return (ENOMEM); + } + } + + du = mtod(m, struct lacpdu *); + + if (memcmp(&du->ldu_eh.ether_dhost, + ðermulticastaddr_slowprotocols, ETHER_ADDR_LEN)) { + goto bad; + } + + /* + * ignore the version for compatibility with + * the future protocol revisions. + */ +#if 0 + if (du->ldu_sph.sph_version != 1) { + goto bad; + } +#endif + + /* + * ignore tlv types for compatibility with + * the future protocol revisions. 
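Each LACPDU body is a fixed sequence of TLVs (type, length, value), and the tlv_check() call just below verifies the received sequence against the templates defined earlier. A compact restatement of that walk, with standalone types; tlv_check() in this file remains the authoritative version:

#include <stddef.h>
#include <stdint.h>

struct tlvhdr {			/* two bytes on the wire (__packed upstream) */
	uint8_t tlv_type;
	uint8_t tlv_length;
};

struct tlv_template {
	int tmpl_type;
	int tmpl_length;	/* header plus payload; {0,0} terminates */
};

/* Walks the expected TLV sequence; returns 0 if the PDU matches. */
static int
tlv_check_sketch(const void *pdu, size_t size, const struct tlvhdr *tlv,
    const struct tlv_template *tmpl, int check_type)
{
	while (tmpl->tmpl_length != 0) {
		if ((const char *)tlv + tmpl->tmpl_length >
		    (const char *)pdu + size)
			return (-1);	/* TLV runs past the PDU */
		if (check_type && tlv->tlv_type != tmpl->tmpl_type)
			return (-1);	/* relaxed for forward compat */
		if (tlv->tlv_length != tmpl->tmpl_length)
			return (-1);
		tlv = (const struct tlvhdr *)
		    ((const char *)tlv + tlv->tlv_length);
		tmpl++;
	}
	return (0);
}

int
main(void)
{
	/* Toy PDU: one 4-byte TLV (type 1) followed by the terminator. */
	static const struct tlv_template tmpl[] = { { 1, 4 }, { 0, 0 } };
	uint8_t pdu[4] = { 1, 4, 0xab, 0xcd };

	return (tlv_check_sketch(pdu, sizeof(pdu),
	    (const struct tlvhdr *)pdu, tmpl, 1) == 0 ? 0 : 1);
}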
+ */ + if (tlv_check(du, sizeof(*du), &du->ldu_tlv_actor, + lacp_info_tlv_template, FALSE)) { + goto bad; + } + +#if defined(LACP_DEBUG) + LACP_DPRINTF((lp, "lacpdu receive\n")); + lacp_dump_lacpdu(du); +#endif /* defined(LACP_DEBUG) */ + + LACP_LOCK(lsc); + lacp_sm_rx(lp, du); + LACP_UNLOCK(lsc); + + m_freem(m); + return (error); + +bad: + m_freem(m); + return (EINVAL); +} + +static void +lacp_fill_actorinfo(struct lacp_port *lp, struct lacp_peerinfo *info) +{ + struct lagg_port *lgp = lp->lp_lagg; + struct lagg_softc *sc = lgp->lp_softc; + + info->lip_systemid.lsi_prio = htons(LACP_SYSTEM_PRIO); + memcpy(&info->lip_systemid.lsi_mac, + IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN); + info->lip_portid.lpi_prio = htons(LACP_PORT_PRIO); + info->lip_portid.lpi_portno = htons(lp->lp_ifp->if_index); + info->lip_state = lp->lp_state; +} + +static void +lacp_fill_markerinfo(struct lacp_port *lp, struct lacp_markerinfo *info) +{ + struct ifnet *ifp = lp->lp_ifp; + + /* Fill in the port index and system id (encoded as the MAC) */ + info->mi_rq_port = htons(ifp->if_index); + memcpy(&info->mi_rq_system, lp->lp_systemid.lsi_mac, ETHER_ADDR_LEN); + info->mi_rq_xid = htonl(0); +} + +static int +lacp_xmit_lacpdu(struct lacp_port *lp) +{ + struct lagg_port *lgp = lp->lp_lagg; + struct mbuf *m; + struct lacpdu *du; + int error; + + LACP_LOCK_ASSERT(lp->lp_lsc); + + m = m_gethdr(M_DONTWAIT, MT_DATA); + if (m == NULL) { + return (ENOMEM); + } + m->m_len = m->m_pkthdr.len = sizeof(*du); + + du = mtod(m, struct lacpdu *); + memset(du, 0, sizeof(*du)); + + memcpy(&du->ldu_eh.ether_dhost, ethermulticastaddr_slowprotocols, + ETHER_ADDR_LEN); + memcpy(&du->ldu_eh.ether_shost, lgp->lp_lladdr, ETHER_ADDR_LEN); + du->ldu_eh.ether_type = htons(ETHERTYPE_SLOW); + + du->ldu_sph.sph_subtype = SLOWPROTOCOLS_SUBTYPE_LACP; + du->ldu_sph.sph_version = 1; + + TLV_SET(&du->ldu_tlv_actor, LACP_TYPE_ACTORINFO, sizeof(du->ldu_actor)); + du->ldu_actor = lp->lp_actor; + + TLV_SET(&du->ldu_tlv_partner, LACP_TYPE_PARTNERINFO, + sizeof(du->ldu_partner)); + du->ldu_partner = lp->lp_partner; + + TLV_SET(&du->ldu_tlv_collector, LACP_TYPE_COLLECTORINFO, + sizeof(du->ldu_collector)); + du->ldu_collector.lci_maxdelay = 0; + +#if defined(LACP_DEBUG) + LACP_DPRINTF((lp, "lacpdu transmit\n")); + lacp_dump_lacpdu(du); +#endif /* defined(LACP_DEBUG) */ + + m->m_flags |= M_MCAST; + + /* + * XXX should use higher priority queue. + * otherwise network congestion can break aggregation. 
+ */ + + error = lagg_enqueue(lp->lp_ifp, m); + return (error); +} + +static int +lacp_xmit_marker(struct lacp_port *lp) +{ + struct lagg_port *lgp = lp->lp_lagg; + struct mbuf *m; + struct markerdu *mdu; + int error; + + LACP_LOCK_ASSERT(lp->lp_lsc); + + m = m_gethdr(M_DONTWAIT, MT_DATA); + if (m == NULL) { + return (ENOMEM); + } + m->m_len = m->m_pkthdr.len = sizeof(*mdu); + + mdu = mtod(m, struct markerdu *); + memset(mdu, 0, sizeof(*mdu)); + + memcpy(&mdu->mdu_eh.ether_dhost, ethermulticastaddr_slowprotocols, + ETHER_ADDR_LEN); + memcpy(&mdu->mdu_eh.ether_shost, lgp->lp_lladdr, ETHER_ADDR_LEN); + mdu->mdu_eh.ether_type = htons(ETHERTYPE_SLOW); + + mdu->mdu_sph.sph_subtype = SLOWPROTOCOLS_SUBTYPE_MARKER; + mdu->mdu_sph.sph_version = 1; + + /* Bump the transaction id and copy over the marker info */ + lp->lp_marker.mi_rq_xid = htonl(ntohl(lp->lp_marker.mi_rq_xid) + 1); + TLV_SET(&mdu->mdu_tlv, MARKER_TYPE_INFO, sizeof(mdu->mdu_info)); + mdu->mdu_info = lp->lp_marker; + + LACP_DPRINTF((lp, "marker transmit, port=%u, sys=%6D, id=%u\n", + ntohs(mdu->mdu_info.mi_rq_port), mdu->mdu_info.mi_rq_system, ":", + ntohl(mdu->mdu_info.mi_rq_xid))); + + m->m_flags |= M_MCAST; + error = lagg_enqueue(lp->lp_ifp, m); + return (error); +} + +void +lacp_linkstate(struct lagg_port *lgp) +{ + struct lacp_port *lp = LACP_PORT(lgp); + struct lacp_softc *lsc = lp->lp_lsc; + struct ifnet *ifp = lgp->lp_ifp; + struct ifmediareq ifmr; + int error = 0; + u_int media; + uint8_t old_state; + uint16_t old_key; + + bzero((char *)&ifmr, sizeof(ifmr)); + error = (*ifp->if_ioctl)(ifp, SIOCGIFMEDIA, (caddr_t)&ifmr); + if (error != 0) + return; + + LACP_LOCK(lsc); + media = ifmr.ifm_active; + LACP_DPRINTF((lp, "media changed 0x%x -> 0x%x, ether = %d, fdx = %d, " + "link = %d\n", lp->lp_media, media, IFM_TYPE(media) == IFM_ETHER, + (media & IFM_FDX) != 0, ifp->if_link_state == LINK_STATE_UP)); + old_state = lp->lp_state; + old_key = lp->lp_key; + + lp->lp_media = media; + /* + * If the port is not an active full duplex Ethernet link then it can + * not be aggregated. 
+ */ + if (IFM_TYPE(media) != IFM_ETHER || (media & IFM_FDX) == 0 || + ifp->if_link_state != LINK_STATE_UP) { + lacp_port_disable(lp); + } else { + lacp_port_enable(lp); + } + lp->lp_key = lacp_compose_key(lp); + + if (old_state != lp->lp_state || old_key != lp->lp_key) { + LACP_DPRINTF((lp, "-> UNSELECTED\n")); + lp->lp_selected = LACP_UNSELECTED; + } + LACP_UNLOCK(lsc); +} + +static void +lacp_tick(void *arg) +{ + struct lacp_softc *lsc = arg; + struct lacp_port *lp; + + LIST_FOREACH(lp, &lsc->lsc_ports, lp_next) { + if ((lp->lp_state & LACP_STATE_AGGREGATION) == 0) + continue; + + lacp_run_timers(lp); + + lacp_select(lp); + lacp_sm_mux(lp); + lacp_sm_tx(lp); + lacp_sm_ptx_tx_schedule(lp); + } + callout_reset(&lsc->lsc_callout, hz, lacp_tick, lsc); +} + +int +lacp_port_create(struct lagg_port *lgp) +{ + struct lagg_softc *sc = lgp->lp_softc; + struct lacp_softc *lsc = LACP_SOFTC(sc); + struct lacp_port *lp; + struct ifnet *ifp = lgp->lp_ifp; + struct sockaddr_dl sdl; + struct ifmultiaddr *rifma = NULL; + int error; + + boolean_t active = TRUE; /* XXX should be configurable */ + boolean_t fast = FALSE; /* XXX should be configurable */ + + bzero((char *)&sdl, sizeof(sdl)); + sdl.sdl_len = sizeof(sdl); + sdl.sdl_family = AF_LINK; + sdl.sdl_index = ifp->if_index; + sdl.sdl_type = IFT_ETHER; + sdl.sdl_alen = ETHER_ADDR_LEN; + + bcopy(ðermulticastaddr_slowprotocols, + LLADDR(&sdl), ETHER_ADDR_LEN); + error = if_addmulti(ifp, (struct sockaddr *)&sdl, &rifma); + if (error) { + printf("%s: ADDMULTI failed on %s\n", __func__, lgp->lp_ifname); + return (error); + } + + lp = malloc(sizeof(struct lacp_port), + M_DEVBUF, M_NOWAIT|M_ZERO); + if (lp == NULL) + return (ENOMEM); + + LACP_LOCK(lsc); + lgp->lp_psc = (caddr_t)lp; + lp->lp_ifp = ifp; + lp->lp_lagg = lgp; + lp->lp_lsc = lsc; + lp->lp_ifma = rifma; + + LIST_INSERT_HEAD(&lsc->lsc_ports, lp, lp_next); + + lacp_fill_actorinfo(lp, &lp->lp_actor); + lacp_fill_markerinfo(lp, &lp->lp_marker); + lp->lp_state = + (active ? LACP_STATE_ACTIVITY : 0) | + (fast ? 
LACP_STATE_TIMEOUT : 0); + lp->lp_aggregator = NULL; + lacp_sm_rx_set_expired(lp); + LACP_UNLOCK(lsc); + lacp_linkstate(lgp); + + return (0); +} + +void +lacp_port_destroy(struct lagg_port *lgp) +{ + struct lacp_port *lp = LACP_PORT(lgp); + struct lacp_softc *lsc = lp->lp_lsc; + int i; + + LACP_LOCK(lsc); + for (i = 0; i < LACP_NTIMER; i++) { + LACP_TIMER_DISARM(lp, i); + } + + lacp_disable_collecting(lp); + lacp_disable_distributing(lp); + lacp_unselect(lp); + + /* The address may have already been removed by if_purgemaddrs() */ + if (!lgp->lp_detaching) + if_delmulti_ifma(lp->lp_ifma); + + LIST_REMOVE(lp, lp_next); + LACP_UNLOCK(lsc); + free(lp, M_DEVBUF); +} + +void +lacp_req(struct lagg_softc *sc, caddr_t data) +{ + struct lacp_opreq *req = (struct lacp_opreq *)data; + struct lacp_softc *lsc = LACP_SOFTC(sc); + struct lacp_aggregator *la = lsc->lsc_active_aggregator; + + LACP_LOCK(lsc); + bzero(req, sizeof(struct lacp_opreq)); + if (la != NULL) { + req->actor_prio = ntohs(la->la_actor.lip_systemid.lsi_prio); + memcpy(&req->actor_mac, &la->la_actor.lip_systemid.lsi_mac, + ETHER_ADDR_LEN); + req->actor_key = ntohs(la->la_actor.lip_key); + req->actor_portprio = ntohs(la->la_actor.lip_portid.lpi_prio); + req->actor_portno = ntohs(la->la_actor.lip_portid.lpi_portno); + req->actor_state = la->la_actor.lip_state; + + req->partner_prio = ntohs(la->la_partner.lip_systemid.lsi_prio); + memcpy(&req->partner_mac, &la->la_partner.lip_systemid.lsi_mac, + ETHER_ADDR_LEN); + req->partner_key = ntohs(la->la_partner.lip_key); + req->partner_portprio = ntohs(la->la_partner.lip_portid.lpi_prio); + req->partner_portno = ntohs(la->la_partner.lip_portid.lpi_portno); + req->partner_state = la->la_partner.lip_state; + } + LACP_UNLOCK(lsc); +} + +void +lacp_portreq(struct lagg_port *lgp, caddr_t data) +{ + struct lacp_opreq *req = (struct lacp_opreq *)data; + struct lacp_port *lp = LACP_PORT(lgp); + struct lacp_softc *lsc = lp->lp_lsc; + + LACP_LOCK(lsc); + req->actor_prio = ntohs(lp->lp_actor.lip_systemid.lsi_prio); + memcpy(&req->actor_mac, &lp->lp_actor.lip_systemid.lsi_mac, + ETHER_ADDR_LEN); + req->actor_key = ntohs(lp->lp_actor.lip_key); + req->actor_portprio = ntohs(lp->lp_actor.lip_portid.lpi_prio); + req->actor_portno = ntohs(lp->lp_actor.lip_portid.lpi_portno); + req->actor_state = lp->lp_actor.lip_state; + + req->partner_prio = ntohs(lp->lp_partner.lip_systemid.lsi_prio); + memcpy(&req->partner_mac, &lp->lp_partner.lip_systemid.lsi_mac, + ETHER_ADDR_LEN); + req->partner_key = ntohs(lp->lp_partner.lip_key); + req->partner_portprio = ntohs(lp->lp_partner.lip_portid.lpi_prio); + req->partner_portno = ntohs(lp->lp_partner.lip_portid.lpi_portno); + req->partner_state = lp->lp_partner.lip_state; + LACP_UNLOCK(lsc); +} + +static void +lacp_disable_collecting(struct lacp_port *lp) +{ + LACP_DPRINTF((lp, "collecting disabled\n")); + lp->lp_state &= ~LACP_STATE_COLLECTING; +} + +static void +lacp_enable_collecting(struct lacp_port *lp) +{ + LACP_DPRINTF((lp, "collecting enabled\n")); + lp->lp_state |= LACP_STATE_COLLECTING; +} + +static void +lacp_disable_distributing(struct lacp_port *lp) +{ + struct lacp_aggregator *la = lp->lp_aggregator; + struct lacp_softc *lsc = lp->lp_lsc; +#if defined(LACP_DEBUG) + char buf[LACP_LAGIDSTR_MAX+1]; +#endif /* defined(LACP_DEBUG) */ + + LACP_LOCK_ASSERT(lsc); + + if (la == NULL || (lp->lp_state & LACP_STATE_DISTRIBUTING) == 0) { + return; + } + + KASSERT(!TAILQ_EMPTY(&la->la_ports), ("no aggregator ports")); + KASSERT(la->la_nports > 0, ("nports invalid (%d)", 
la->la_nports)); + KASSERT(la->la_refcnt >= la->la_nports, ("aggregator refcnt invalid")); + + LACP_DPRINTF((lp, "disable distributing on aggregator %s, " + "nports %d -> %d\n", + lacp_format_lagid_aggregator(la, buf, sizeof(buf)), + la->la_nports, la->la_nports - 1)); + + TAILQ_REMOVE(&la->la_ports, lp, lp_dist_q); + la->la_nports--; + + if (lsc->lsc_active_aggregator == la) { + lacp_suppress_distributing(lsc, la); + lacp_select_active_aggregator(lsc); + /* regenerate the port map, the active aggregator has changed */ + lacp_update_portmap(lsc); + } + + lp->lp_state &= ~LACP_STATE_DISTRIBUTING; +} + +static void +lacp_enable_distributing(struct lacp_port *lp) +{ + struct lacp_aggregator *la = lp->lp_aggregator; + struct lacp_softc *lsc = lp->lp_lsc; +#if defined(LACP_DEBUG) + char buf[LACP_LAGIDSTR_MAX+1]; +#endif /* defined(LACP_DEBUG) */ + + LACP_LOCK_ASSERT(lsc); + + if ((lp->lp_state & LACP_STATE_DISTRIBUTING) != 0) { + return; + } + + LACP_DPRINTF((lp, "enable distributing on aggregator %s, " + "nports %d -> %d\n", + lacp_format_lagid_aggregator(la, buf, sizeof(buf)), + la->la_nports, la->la_nports + 1)); + + KASSERT(la->la_refcnt > la->la_nports, ("aggregator refcnt invalid")); + TAILQ_INSERT_HEAD(&la->la_ports, lp, lp_dist_q); + la->la_nports++; + + lp->lp_state |= LACP_STATE_DISTRIBUTING; + + if (lsc->lsc_active_aggregator == la) { + lacp_suppress_distributing(lsc, la); + lacp_update_portmap(lsc); + } else + /* try to become the active aggregator */ + lacp_select_active_aggregator(lsc); +} + +static void +lacp_transit_expire(void *vp) +{ + struct lacp_softc *lsc = vp; + + LACP_LOCK_ASSERT(lsc); + + LACP_DPRINTF((NULL, "%s\n", __func__)); + lsc->lsc_suppress_distributing = FALSE; +} + +int +lacp_attach(struct lagg_softc *sc) +{ + struct lacp_softc *lsc; + + lsc = malloc(sizeof(struct lacp_softc), + M_DEVBUF, M_NOWAIT|M_ZERO); + if (lsc == NULL) + return (ENOMEM); + + sc->sc_psc = (caddr_t)lsc; + lsc->lsc_softc = sc; + + lsc->lsc_hashkey = arc4random(); + lsc->lsc_active_aggregator = NULL; + LACP_LOCK_INIT(lsc); + TAILQ_INIT(&lsc->lsc_aggregators); + LIST_INIT(&lsc->lsc_ports); + + callout_init_mtx(&lsc->lsc_transit_callout, &lsc->lsc_mtx, 0); + callout_init_mtx(&lsc->lsc_callout, &lsc->lsc_mtx, 0); + + /* if the lagg is already up then do the same */ + if (sc->sc_ifp->if_drv_flags & IFF_DRV_RUNNING) + lacp_init(sc); + + return (0); +} + +int +lacp_detach(struct lagg_softc *sc) +{ + struct lacp_softc *lsc = LACP_SOFTC(sc); + + KASSERT(TAILQ_EMPTY(&lsc->lsc_aggregators), + ("aggregators still active")); + KASSERT(lsc->lsc_active_aggregator == NULL, + ("aggregator still attached")); + + sc->sc_psc = NULL; + callout_drain(&lsc->lsc_transit_callout); + callout_drain(&lsc->lsc_callout); + + LACP_LOCK_DESTROY(lsc); + free(lsc, M_DEVBUF); + return (0); +} + +void +lacp_init(struct lagg_softc *sc) +{ + struct lacp_softc *lsc = LACP_SOFTC(sc); + + LACP_LOCK(lsc); + callout_reset(&lsc->lsc_callout, hz, lacp_tick, lsc); + LACP_UNLOCK(lsc); +} + +void +lacp_stop(struct lagg_softc *sc) +{ + struct lacp_softc *lsc = LACP_SOFTC(sc); + + LACP_LOCK(lsc); + callout_stop(&lsc->lsc_transit_callout); + callout_stop(&lsc->lsc_callout); + LACP_UNLOCK(lsc); +} + +struct lagg_port * +lacp_select_tx_port(struct lagg_softc *sc, struct mbuf *m) +{ + struct lacp_softc *lsc = LACP_SOFTC(sc); + struct lacp_portmap *pm; + struct lacp_port *lp; + uint32_t hash; + + if (__predict_false(lsc->lsc_suppress_distributing)) { + LACP_DPRINTF((NULL, "%s: waiting transit\n", __func__)); + return (NULL); + } + + pm = 
&lsc->lsc_pmap[lsc->lsc_activemap]; + if (pm->pm_count == 0) { + LACP_DPRINTF((NULL, "%s: no active aggregator\n", __func__)); + return (NULL); + } + + if (m->m_flags & M_FLOWID) + hash = m->m_pkthdr.flowid; + else + hash = lagg_hashmbuf(m, lsc->lsc_hashkey); + hash %= pm->pm_count; + lp = pm->pm_map[hash]; + + KASSERT((lp->lp_state & LACP_STATE_DISTRIBUTING) != 0, + ("aggregated port is not distributing")); + + return (lp->lp_lagg); +} +/* + * lacp_suppress_distributing: drop transmit packets for a while + * to preserve packet ordering. + */ + +static void +lacp_suppress_distributing(struct lacp_softc *lsc, struct lacp_aggregator *la) +{ + struct lacp_port *lp; + + if (lsc->lsc_active_aggregator != la) { + return; + } + + LACP_DPRINTF((NULL, "%s\n", __func__)); + lsc->lsc_suppress_distributing = TRUE; + + /* send a marker frame down each port to verify the queues are empty */ + LIST_FOREACH(lp, &lsc->lsc_ports, lp_next) { + lp->lp_flags |= LACP_PORT_MARK; + lacp_xmit_marker(lp); + } + + /* set a timeout for the marker frames */ + callout_reset(&lsc->lsc_transit_callout, + LACP_TRANSIT_DELAY * hz / 1000, lacp_transit_expire, lsc); +} + +static int +lacp_compare_peerinfo(const struct lacp_peerinfo *a, + const struct lacp_peerinfo *b) +{ + return (memcmp(a, b, offsetof(struct lacp_peerinfo, lip_state))); +} + +static int +lacp_compare_systemid(const struct lacp_systemid *a, + const struct lacp_systemid *b) +{ + return (memcmp(a, b, sizeof(*a))); +} + +#if 0 /* unused */ +static int +lacp_compare_portid(const struct lacp_portid *a, + const struct lacp_portid *b) +{ + return (memcmp(a, b, sizeof(*a))); +} +#endif + +static uint64_t +lacp_aggregator_bandwidth(struct lacp_aggregator *la) +{ + struct lacp_port *lp; + uint64_t speed; + + lp = TAILQ_FIRST(&la->la_ports); + if (lp == NULL) { + return (0); + } + + speed = ifmedia_baudrate(lp->lp_media); + speed *= la->la_nports; + if (speed == 0) { + LACP_DPRINTF((lp, "speed 0? media=0x%x nports=%d\n", + lp->lp_media, la->la_nports)); + } + + return (speed); +} + +/* + * lacp_select_active_aggregator: select an aggregator to be used to transmit + * packets from lagg(4) interface. 
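Two pieces cooperate here: lacp_update_portmap() (below) rebuilds the inactive copy of the port map and publishes it with a single release store, while lacp_select_tx_port() above picks a port with flow hash modulo port count, so any one flow stays on one port and per-flow ordering is preserved. A standalone sketch of the pattern, using C11 atomics in place of the kernel's atomic_store_rel_int():

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define MAXPORTS 8

struct portmap {			/* stands in for struct lacp_portmap */
	int pm_count;
	int pm_map[MAXPORTS];		/* port numbers instead of pointers */
};

static struct portmap pmaps[2];
static _Atomic unsigned activemap;	/* lsc_activemap equivalent */

/* Writer: rebuild the inactive copy, then publish it atomically. */
static void
update_portmap(const int *ports, int count)
{
	unsigned newmap = 1 - atomic_load_explicit(&activemap,
	    memory_order_relaxed);
	struct portmap *p = &pmaps[newmap];

	memset(p, 0, sizeof(*p));
	memcpy(p->pm_map, ports, (size_t)count * sizeof(int));
	p->pm_count = count;
	atomic_store_explicit(&activemap, newmap, memory_order_release);
}

/* Reader: lockless, always sees a consistent map. */
static int
select_tx_port(uint32_t flowhash)
{
	const struct portmap *p = &pmaps[atomic_load_explicit(&activemap,
	    memory_order_acquire)];

	if (p->pm_count == 0)
		return (-1);
	return (p->pm_map[flowhash % p->pm_count]);
}

int
main(void)
{
	const int ports[] = { 10, 11, 12 };

	update_portmap(ports, 3);
	printf("flow 0xdeadbeef -> port %d\n", select_tx_port(0xdeadbeef));
	return (0);
}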
+ */
+
+static void
+lacp_select_active_aggregator(struct lacp_softc *lsc)
+{
+	struct lagg_softc *sc = lsc->lsc_softc;
+	struct lacp_aggregator *la;
+	struct lacp_aggregator *best_la = NULL;
+	uint64_t best_speed = 0;
+#if defined(LACP_DEBUG)
+	char buf[LACP_LAGIDSTR_MAX+1];
+#endif /* defined(LACP_DEBUG) */
+
+	LACP_DPRINTF((NULL, "%s:\n", __func__));
+
+	TAILQ_FOREACH(la, &lsc->lsc_aggregators, la_q) {
+		uint64_t speed;
+
+		if (la->la_nports == 0) {
+			continue;
+		}
+
+		speed = lacp_aggregator_bandwidth(la);
+		LACP_DPRINTF((NULL, "%s, speed=%jd, nports=%d\n",
+		    lacp_format_lagid_aggregator(la, buf, sizeof(buf)),
+		    speed, la->la_nports));
+
+		/* This aggregator is chosen if
+		 * the partner has a better (numerically lower) system priority,
+		 * or the total aggregated speed is higher,
+		 * or it is already the chosen aggregator.
+		 */
+		if ((best_la != NULL && LACP_SYS_PRI(la->la_partner) <
+		    LACP_SYS_PRI(best_la->la_partner)) ||
+		    speed > best_speed ||
+		    (speed == best_speed &&
+		    la == lsc->lsc_active_aggregator)) {
+			best_la = la;
+			best_speed = speed;
+		}
+	}
+
+	KASSERT(best_la == NULL || best_la->la_nports > 0,
+	    ("invalid aggregator refcnt"));
+	KASSERT(best_la == NULL || !TAILQ_EMPTY(&best_la->la_ports),
+	    ("invalid aggregator list"));
+
+#if defined(LACP_DEBUG)
+	if (lsc->lsc_active_aggregator != best_la) {
+		LACP_DPRINTF((NULL, "active aggregator changed\n"));
+		LACP_DPRINTF((NULL, "old %s\n",
+		    lacp_format_lagid_aggregator(lsc->lsc_active_aggregator,
+		    buf, sizeof(buf))));
+	} else {
+		LACP_DPRINTF((NULL, "active aggregator not changed\n"));
+	}
+	LACP_DPRINTF((NULL, "new %s\n",
+	    lacp_format_lagid_aggregator(best_la, buf, sizeof(buf))));
+#endif /* defined(LACP_DEBUG) */
+
+	if (lsc->lsc_active_aggregator != best_la) {
+		sc->sc_ifp->if_baudrate = best_speed;
+		lsc->lsc_active_aggregator = best_la;
+		lacp_update_portmap(lsc);
+		if (best_la) {
+			lacp_suppress_distributing(lsc, best_la);
+		}
+	}
+}
+
+/*
+ * Update the inactive portmap array with the new list of ports and
+ * make it live.
+ */
+static void
+lacp_update_portmap(struct lacp_softc *lsc)
+{
+	struct lacp_aggregator *la;
+	struct lacp_portmap *p;
+	struct lacp_port *lp;
+	u_int newmap;
+	int i;
+
+	newmap = lsc->lsc_activemap == 0 ? 1 : 0;
+	p = &lsc->lsc_pmap[newmap];
+	la = lsc->lsc_active_aggregator;
+	bzero(p, sizeof(struct lacp_portmap));
+
+	if (la != NULL && la->la_nports > 0) {
+		p->pm_count = la->la_nports;
+		i = 0;
+		TAILQ_FOREACH(lp, &la->la_ports, lp_dist_q)
+			p->pm_map[i++] = lp;
+		KASSERT(i == p->pm_count, ("Invalid port count"));
+	}
+
+	/* switch the active portmap over */
+	atomic_store_rel_int(&lsc->lsc_activemap, newmap);
+	LACP_DPRINTF((NULL, "Set table %d with %d ports\n",
+	    lsc->lsc_activemap,
+	    lsc->lsc_pmap[lsc->lsc_activemap].pm_count));
+}
+
+static uint16_t
+lacp_compose_key(struct lacp_port *lp)
+{
+	struct lagg_port *lgp = lp->lp_lagg;
+	struct lagg_softc *sc = lgp->lp_softc;
+	u_int media = lp->lp_media;
+	uint16_t key;
+
+	if ((lp->lp_state & LACP_STATE_AGGREGATION) == 0) {
+
+		/*
+		 * non-aggregatable links should have unique keys.
+		 *
+		 * XXX this isn't really unique as if_index is 16 bit.
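+		 *
+		 * For example (hypothetical indices): ports with
+		 * if_index 0x0005 and 0x8005 differ only in bit 15,
+		 * which is forced to 1 below, so both would end up
+		 * with the same key 0x8005.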
+ */ + + /* bit 0..14: (some bits of) if_index of this port */ + key = lp->lp_ifp->if_index; + /* bit 15: 1 */ + key |= 0x8000; + } else { + u_int subtype = IFM_SUBTYPE(media); + + KASSERT(IFM_TYPE(media) == IFM_ETHER, ("invalid media type")); + KASSERT((media & IFM_FDX) != 0, ("aggregating HDX interface")); + + /* bit 0..4: IFM_SUBTYPE */ + key = subtype; + /* bit 5..14: (some bits of) if_index of lagg device */ + key |= 0x7fe0 & ((sc->sc_ifp->if_index) << 5); + /* bit 15: 0 */ + } + return (htons(key)); +} + +static void +lacp_aggregator_addref(struct lacp_softc *lsc, struct lacp_aggregator *la) +{ +#if defined(LACP_DEBUG) + char buf[LACP_LAGIDSTR_MAX+1]; +#endif + + LACP_DPRINTF((NULL, "%s: lagid=%s, refcnt %d -> %d\n", + __func__, + lacp_format_lagid(&la->la_actor, &la->la_partner, + buf, sizeof(buf)), + la->la_refcnt, la->la_refcnt + 1)); + + KASSERT(la->la_refcnt > 0, ("refcount <= 0")); + la->la_refcnt++; + KASSERT(la->la_refcnt > la->la_nports, ("invalid refcount")); +} + +static void +lacp_aggregator_delref(struct lacp_softc *lsc, struct lacp_aggregator *la) +{ +#if defined(LACP_DEBUG) + char buf[LACP_LAGIDSTR_MAX+1]; +#endif + + LACP_DPRINTF((NULL, "%s: lagid=%s, refcnt %d -> %d\n", + __func__, + lacp_format_lagid(&la->la_actor, &la->la_partner, + buf, sizeof(buf)), + la->la_refcnt, la->la_refcnt - 1)); + + KASSERT(la->la_refcnt > la->la_nports, ("invalid refcnt")); + la->la_refcnt--; + if (la->la_refcnt > 0) { + return; + } + + KASSERT(la->la_refcnt == 0, ("refcount not zero")); + KASSERT(lsc->lsc_active_aggregator != la, ("aggregator active")); + + TAILQ_REMOVE(&lsc->lsc_aggregators, la, la_q); + + free(la, M_DEVBUF); +} + +/* + * lacp_aggregator_get: allocate an aggregator. + */ + +static struct lacp_aggregator * +lacp_aggregator_get(struct lacp_softc *lsc, struct lacp_port *lp) +{ + struct lacp_aggregator *la; + + la = malloc(sizeof(*la), M_DEVBUF, M_NOWAIT); + if (la) { + la->la_refcnt = 1; + la->la_nports = 0; + TAILQ_INIT(&la->la_ports); + la->la_pending = 0; + TAILQ_INSERT_TAIL(&lsc->lsc_aggregators, la, la_q); + } + + return (la); +} + +/* + * lacp_fill_aggregator_id: setup a newly allocated aggregator from a port. + */ + +static void +lacp_fill_aggregator_id(struct lacp_aggregator *la, const struct lacp_port *lp) +{ + lacp_fill_aggregator_id_peer(&la->la_partner, &lp->lp_partner); + lacp_fill_aggregator_id_peer(&la->la_actor, &lp->lp_actor); + + la->la_actor.lip_state = lp->lp_state & LACP_STATE_AGGREGATION; +} + +static void +lacp_fill_aggregator_id_peer(struct lacp_peerinfo *lpi_aggr, + const struct lacp_peerinfo *lpi_port) +{ + memset(lpi_aggr, 0, sizeof(*lpi_aggr)); + lpi_aggr->lip_systemid = lpi_port->lip_systemid; + lpi_aggr->lip_key = lpi_port->lip_key; +} + +/* + * lacp_aggregator_is_compatible: check if a port can join to an aggregator. 
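+ *
+ * In short: both the actor and the partner of the port must advertise
+ * LACP_STATE_AGGREGATION, and the (systemid, key) pairs recorded in
+ * the aggregator must match the port's actor and partner info.  The
+ * port id is deliberately not compared, which is what allows several
+ * ports of one LAG to share a single aggregator.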
+ */ + +static int +lacp_aggregator_is_compatible(const struct lacp_aggregator *la, + const struct lacp_port *lp) +{ + if (!(lp->lp_state & LACP_STATE_AGGREGATION) || + !(lp->lp_partner.lip_state & LACP_STATE_AGGREGATION)) { + return (0); + } + + if (!(la->la_actor.lip_state & LACP_STATE_AGGREGATION)) { + return (0); + } + + if (!lacp_peerinfo_is_compatible(&la->la_partner, &lp->lp_partner)) { + return (0); + } + + if (!lacp_peerinfo_is_compatible(&la->la_actor, &lp->lp_actor)) { + return (0); + } + + return (1); +} + +static int +lacp_peerinfo_is_compatible(const struct lacp_peerinfo *a, + const struct lacp_peerinfo *b) +{ + if (memcmp(&a->lip_systemid, &b->lip_systemid, + sizeof(a->lip_systemid))) { + return (0); + } + + if (memcmp(&a->lip_key, &b->lip_key, sizeof(a->lip_key))) { + return (0); + } + + return (1); +} + +static void +lacp_port_enable(struct lacp_port *lp) +{ + lp->lp_state |= LACP_STATE_AGGREGATION; +} + +static void +lacp_port_disable(struct lacp_port *lp) +{ + lacp_set_mux(lp, LACP_MUX_DETACHED); + + lp->lp_state &= ~LACP_STATE_AGGREGATION; + lp->lp_selected = LACP_UNSELECTED; + lacp_sm_rx_record_default(lp); + lp->lp_partner.lip_state &= ~LACP_STATE_AGGREGATION; + lp->lp_state &= ~LACP_STATE_EXPIRED; +} + +/* + * lacp_select: select an aggregator. create one if necessary. + */ +static void +lacp_select(struct lacp_port *lp) +{ + struct lacp_softc *lsc = lp->lp_lsc; + struct lacp_aggregator *la; +#if defined(LACP_DEBUG) + char buf[LACP_LAGIDSTR_MAX+1]; +#endif + + if (lp->lp_aggregator) { + return; + } + + KASSERT(!LACP_TIMER_ISARMED(lp, LACP_TIMER_WAIT_WHILE), + ("timer_wait_while still active")); + + LACP_DPRINTF((lp, "port lagid=%s\n", + lacp_format_lagid(&lp->lp_actor, &lp->lp_partner, + buf, sizeof(buf)))); + + TAILQ_FOREACH(la, &lsc->lsc_aggregators, la_q) { + if (lacp_aggregator_is_compatible(la, lp)) { + break; + } + } + + if (la == NULL) { + la = lacp_aggregator_get(lsc, lp); + if (la == NULL) { + LACP_DPRINTF((lp, "aggregator creation failed\n")); + + /* + * will retry on the next tick. + */ + + return; + } + lacp_fill_aggregator_id(la, lp); + LACP_DPRINTF((lp, "aggregator created\n")); + } else { + LACP_DPRINTF((lp, "compatible aggregator found\n")); + if (la->la_refcnt == LACP_MAX_PORTS) + return; + lacp_aggregator_addref(lsc, la); + } + + LACP_DPRINTF((lp, "aggregator lagid=%s\n", + lacp_format_lagid(&la->la_actor, &la->la_partner, + buf, sizeof(buf)))); + + lp->lp_aggregator = la; + lp->lp_selected = LACP_SELECTED; +} + +/* + * lacp_unselect: finish unselect/detach process. 
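+ *
+ * This only drops the port's reference on its aggregator;
+ * lacp_aggregator_delref() takes care of freeing the aggregator once
+ * the last selected port has let go of it.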
+ */ + +static void +lacp_unselect(struct lacp_port *lp) +{ + struct lacp_softc *lsc = lp->lp_lsc; + struct lacp_aggregator *la = lp->lp_aggregator; + + KASSERT(!LACP_TIMER_ISARMED(lp, LACP_TIMER_WAIT_WHILE), + ("timer_wait_while still active")); + + if (la == NULL) { + return; + } + + lp->lp_aggregator = NULL; + lacp_aggregator_delref(lsc, la); +} + +/* mux machine */ + +static void +lacp_sm_mux(struct lacp_port *lp) +{ + enum lacp_mux_state new_state; + boolean_t p_sync = + (lp->lp_partner.lip_state & LACP_STATE_SYNC) != 0; + boolean_t p_collecting = + (lp->lp_partner.lip_state & LACP_STATE_COLLECTING) != 0; + enum lacp_selected selected = lp->lp_selected; + struct lacp_aggregator *la; + + /* LACP_DPRINTF((lp, "%s: state %d\n", __func__, lp->lp_mux_state)); */ + +re_eval: + la = lp->lp_aggregator; + KASSERT(lp->lp_mux_state == LACP_MUX_DETACHED || la != NULL, + ("MUX not detached")); + new_state = lp->lp_mux_state; + switch (lp->lp_mux_state) { + case LACP_MUX_DETACHED: + if (selected != LACP_UNSELECTED) { + new_state = LACP_MUX_WAITING; + } + break; + case LACP_MUX_WAITING: + KASSERT(la->la_pending > 0 || + !LACP_TIMER_ISARMED(lp, LACP_TIMER_WAIT_WHILE), + ("timer_wait_while still active")); + if (selected == LACP_SELECTED && la->la_pending == 0) { + new_state = LACP_MUX_ATTACHED; + } else if (selected == LACP_UNSELECTED) { + new_state = LACP_MUX_DETACHED; + } + break; + case LACP_MUX_ATTACHED: + if (selected == LACP_SELECTED && p_sync) { + new_state = LACP_MUX_COLLECTING; + } else if (selected != LACP_SELECTED) { + new_state = LACP_MUX_DETACHED; + } + break; + case LACP_MUX_COLLECTING: + if (selected == LACP_SELECTED && p_sync && p_collecting) { + new_state = LACP_MUX_DISTRIBUTING; + } else if (selected != LACP_SELECTED || !p_sync) { + new_state = LACP_MUX_ATTACHED; + } + break; + case LACP_MUX_DISTRIBUTING: + if (selected != LACP_SELECTED || !p_sync || !p_collecting) { + new_state = LACP_MUX_COLLECTING; + } + break; + default: + panic("%s: unknown state", __func__); + } + + if (lp->lp_mux_state == new_state) { + return; + } + + lacp_set_mux(lp, new_state); + goto re_eval; +} + +static void +lacp_set_mux(struct lacp_port *lp, enum lacp_mux_state new_state) +{ + struct lacp_aggregator *la = lp->lp_aggregator; + + if (lp->lp_mux_state == new_state) { + return; + } + + switch (new_state) { + case LACP_MUX_DETACHED: + lp->lp_state &= ~LACP_STATE_SYNC; + lacp_disable_distributing(lp); + lacp_disable_collecting(lp); + lacp_sm_assert_ntt(lp); + /* cancel timer */ + if (LACP_TIMER_ISARMED(lp, LACP_TIMER_WAIT_WHILE)) { + KASSERT(la->la_pending > 0, + ("timer_wait_while not active")); + la->la_pending--; + } + LACP_TIMER_DISARM(lp, LACP_TIMER_WAIT_WHILE); + lacp_unselect(lp); + break; + case LACP_MUX_WAITING: + LACP_TIMER_ARM(lp, LACP_TIMER_WAIT_WHILE, + LACP_AGGREGATE_WAIT_TIME); + la->la_pending++; + break; + case LACP_MUX_ATTACHED: + lp->lp_state |= LACP_STATE_SYNC; + lacp_disable_collecting(lp); + lacp_sm_assert_ntt(lp); + break; + case LACP_MUX_COLLECTING: + lacp_enable_collecting(lp); + lacp_disable_distributing(lp); + lacp_sm_assert_ntt(lp); + break; + case LACP_MUX_DISTRIBUTING: + lacp_enable_distributing(lp); + break; + default: + panic("%s: unknown state", __func__); + } + + LACP_DPRINTF((lp, "mux_state %d -> %d\n", lp->lp_mux_state, new_state)); + + lp->lp_mux_state = new_state; +} + +static void +lacp_sm_mux_timer(struct lacp_port *lp) +{ + struct lacp_aggregator *la = lp->lp_aggregator; +#if defined(LACP_DEBUG) + char buf[LACP_LAGIDSTR_MAX+1]; +#endif + + KASSERT(la->la_pending > 0, 
("no pending event")); + + LACP_DPRINTF((lp, "%s: aggregator %s, pending %d -> %d\n", __func__, + lacp_format_lagid(&la->la_actor, &la->la_partner, + buf, sizeof(buf)), + la->la_pending, la->la_pending - 1)); + + la->la_pending--; +} + +/* periodic transmit machine */ + +static void +lacp_sm_ptx_update_timeout(struct lacp_port *lp, uint8_t oldpstate) +{ + if (LACP_STATE_EQ(oldpstate, lp->lp_partner.lip_state, + LACP_STATE_TIMEOUT)) { + return; + } + + LACP_DPRINTF((lp, "partner timeout changed\n")); + + /* + * FAST_PERIODIC -> SLOW_PERIODIC + * or + * SLOW_PERIODIC (-> PERIODIC_TX) -> FAST_PERIODIC + * + * let lacp_sm_ptx_tx_schedule to update timeout. + */ + + LACP_TIMER_DISARM(lp, LACP_TIMER_PERIODIC); + + /* + * if timeout has been shortened, assert NTT. + */ + + if ((lp->lp_partner.lip_state & LACP_STATE_TIMEOUT)) { + lacp_sm_assert_ntt(lp); + } +} + +static void +lacp_sm_ptx_tx_schedule(struct lacp_port *lp) +{ + int timeout; + + if (!(lp->lp_state & LACP_STATE_ACTIVITY) && + !(lp->lp_partner.lip_state & LACP_STATE_ACTIVITY)) { + + /* + * NO_PERIODIC + */ + + LACP_TIMER_DISARM(lp, LACP_TIMER_PERIODIC); + return; + } + + if (LACP_TIMER_ISARMED(lp, LACP_TIMER_PERIODIC)) { + return; + } + + timeout = (lp->lp_partner.lip_state & LACP_STATE_TIMEOUT) ? + LACP_FAST_PERIODIC_TIME : LACP_SLOW_PERIODIC_TIME; + + LACP_TIMER_ARM(lp, LACP_TIMER_PERIODIC, timeout); +} + +static void +lacp_sm_ptx_timer(struct lacp_port *lp) +{ + lacp_sm_assert_ntt(lp); +} + +static void +lacp_sm_rx(struct lacp_port *lp, const struct lacpdu *du) +{ + int timeout; + + /* + * check LACP_DISABLED first + */ + + if (!(lp->lp_state & LACP_STATE_AGGREGATION)) { + return; + } + + /* + * check loopback condition. + */ + + if (!lacp_compare_systemid(&du->ldu_actor.lip_systemid, + &lp->lp_actor.lip_systemid)) { + return; + } + + /* + * EXPIRED, DEFAULTED, CURRENT -> CURRENT + */ + + lacp_sm_rx_update_selected(lp, du); + lacp_sm_rx_update_ntt(lp, du); + lacp_sm_rx_record_pdu(lp, du); + + timeout = (lp->lp_state & LACP_STATE_TIMEOUT) ? + LACP_SHORT_TIMEOUT_TIME : LACP_LONG_TIMEOUT_TIME; + LACP_TIMER_ARM(lp, LACP_TIMER_CURRENT_WHILE, timeout); + + lp->lp_state &= ~LACP_STATE_EXPIRED; + + /* + * kick transmit machine without waiting the next tick. 
+ */ + + lacp_sm_tx(lp); +} + +static void +lacp_sm_rx_set_expired(struct lacp_port *lp) +{ + lp->lp_partner.lip_state &= ~LACP_STATE_SYNC; + lp->lp_partner.lip_state |= LACP_STATE_TIMEOUT; + LACP_TIMER_ARM(lp, LACP_TIMER_CURRENT_WHILE, LACP_SHORT_TIMEOUT_TIME); + lp->lp_state |= LACP_STATE_EXPIRED; +} + +static void +lacp_sm_rx_timer(struct lacp_port *lp) +{ + if ((lp->lp_state & LACP_STATE_EXPIRED) == 0) { + /* CURRENT -> EXPIRED */ + LACP_DPRINTF((lp, "%s: CURRENT -> EXPIRED\n", __func__)); + lacp_sm_rx_set_expired(lp); + } else { + /* EXPIRED -> DEFAULTED */ + LACP_DPRINTF((lp, "%s: EXPIRED -> DEFAULTED\n", __func__)); + lacp_sm_rx_update_default_selected(lp); + lacp_sm_rx_record_default(lp); + lp->lp_state &= ~LACP_STATE_EXPIRED; + } +} + +static void +lacp_sm_rx_record_pdu(struct lacp_port *lp, const struct lacpdu *du) +{ + boolean_t active; + uint8_t oldpstate; +#if defined(LACP_DEBUG) + char buf[LACP_STATESTR_MAX+1]; +#endif + + /* LACP_DPRINTF((lp, "%s\n", __func__)); */ + + oldpstate = lp->lp_partner.lip_state; + + active = (du->ldu_actor.lip_state & LACP_STATE_ACTIVITY) + || ((lp->lp_state & LACP_STATE_ACTIVITY) && + (du->ldu_partner.lip_state & LACP_STATE_ACTIVITY)); + + lp->lp_partner = du->ldu_actor; + if (active && + ((LACP_STATE_EQ(lp->lp_state, du->ldu_partner.lip_state, + LACP_STATE_AGGREGATION) && + !lacp_compare_peerinfo(&lp->lp_actor, &du->ldu_partner)) + || (du->ldu_partner.lip_state & LACP_STATE_AGGREGATION) == 0)) { + /* XXX nothing? */ + } else { + lp->lp_partner.lip_state &= ~LACP_STATE_SYNC; + } + + lp->lp_state &= ~LACP_STATE_DEFAULTED; + + if (oldpstate != lp->lp_partner.lip_state) { + LACP_DPRINTF((lp, "old pstate %s\n", + lacp_format_state(oldpstate, buf, sizeof(buf)))); + LACP_DPRINTF((lp, "new pstate %s\n", + lacp_format_state(lp->lp_partner.lip_state, buf, + sizeof(buf)))); + } + + lacp_sm_ptx_update_timeout(lp, oldpstate); +} + +static void +lacp_sm_rx_update_ntt(struct lacp_port *lp, const struct lacpdu *du) +{ + /* LACP_DPRINTF((lp, "%s\n", __func__)); */ + + if (lacp_compare_peerinfo(&lp->lp_actor, &du->ldu_partner) || + !LACP_STATE_EQ(lp->lp_state, du->ldu_partner.lip_state, + LACP_STATE_ACTIVITY | LACP_STATE_SYNC | LACP_STATE_AGGREGATION)) { + LACP_DPRINTF((lp, "%s: assert ntt\n", __func__)); + lacp_sm_assert_ntt(lp); + } +} + +static void +lacp_sm_rx_record_default(struct lacp_port *lp) +{ + uint8_t oldpstate; + + /* LACP_DPRINTF((lp, "%s\n", __func__)); */ + + oldpstate = lp->lp_partner.lip_state; + lp->lp_partner = lacp_partner_admin; + lp->lp_state |= LACP_STATE_DEFAULTED; + lacp_sm_ptx_update_timeout(lp, oldpstate); +} + +static void +lacp_sm_rx_update_selected_from_peerinfo(struct lacp_port *lp, + const struct lacp_peerinfo *info) +{ + /* LACP_DPRINTF((lp, "%s\n", __func__)); */ + + if (lacp_compare_peerinfo(&lp->lp_partner, info) || + !LACP_STATE_EQ(lp->lp_partner.lip_state, info->lip_state, + LACP_STATE_AGGREGATION)) { + lp->lp_selected = LACP_UNSELECTED; + /* mux machine will clean up lp->lp_aggregator */ + } +} + +static void +lacp_sm_rx_update_selected(struct lacp_port *lp, const struct lacpdu *du) +{ + /* LACP_DPRINTF((lp, "%s\n", __func__)); */ + + lacp_sm_rx_update_selected_from_peerinfo(lp, &du->ldu_actor); +} + +static void +lacp_sm_rx_update_default_selected(struct lacp_port *lp) +{ + /* LACP_DPRINTF((lp, "%s\n", __func__)); */ + + lacp_sm_rx_update_selected_from_peerinfo(lp, &lacp_partner_admin); +} + +/* transmit machine */ + +static void +lacp_sm_tx(struct lacp_port *lp) +{ + int error; + + if (!(lp->lp_state & 
LACP_STATE_AGGREGATION)
+#if 1
+	    || (!(lp->lp_state & LACP_STATE_ACTIVITY)
+	    && !(lp->lp_partner.lip_state & LACP_STATE_ACTIVITY))
+#endif
+	    ) {
+		lp->lp_flags &= ~LACP_PORT_NTT;
+	}
+
+	if (!(lp->lp_flags & LACP_PORT_NTT)) {
+		return;
+	}
+
+	/* Rate limit to 3 PDUs per LACP_FAST_PERIODIC_TIME */
+	if (ppsratecheck(&lp->lp_last_lacpdu, &lp->lp_lacpdu_sent,
+	    (3 / LACP_FAST_PERIODIC_TIME)) == 0) {
+		LACP_DPRINTF((lp, "rate limited pdu\n"));
+		return;
+	}
+
+	error = lacp_xmit_lacpdu(lp);
+
+	if (error == 0) {
+		lp->lp_flags &= ~LACP_PORT_NTT;
+	} else {
+		LACP_DPRINTF((lp, "lacpdu transmit failure, error %d\n",
+		    error));
+	}
+}
+
+static void
+lacp_sm_assert_ntt(struct lacp_port *lp)
+{
+
+	lp->lp_flags |= LACP_PORT_NTT;
+}
+
+static void
+lacp_run_timers(struct lacp_port *lp)
+{
+	int i;
+
+	for (i = 0; i < LACP_NTIMER; i++) {
+		KASSERT(lp->lp_timer[i] >= 0,
+		    ("invalid timer value %d", lp->lp_timer[i]));
+		if (lp->lp_timer[i] == 0) {
+			continue;
+		} else if (--lp->lp_timer[i] <= 0) {
+			if (lacp_timer_funcs[i]) {
+				(*lacp_timer_funcs[i])(lp);
+			}
+		}
+	}
+}
+
+int
+lacp_marker_input(struct lacp_port *lp, struct mbuf *m)
+{
+	struct lacp_softc *lsc = lp->lp_lsc;
+	struct lagg_port *lgp = lp->lp_lagg;
+	struct lacp_port *lp2;
+	struct markerdu *mdu;
+	int error = 0;
+	int pending = 0;
+
+	if (m->m_pkthdr.len != sizeof(*mdu)) {
+		goto bad;
+	}
+
+	if ((m->m_flags & M_MCAST) == 0) {
+		goto bad;
+	}
+
+	if (m->m_len < sizeof(*mdu)) {
+		m = m_pullup(m, sizeof(*mdu));
+		if (m == NULL) {
+			return (ENOMEM);
+		}
+	}
+
+	mdu = mtod(m, struct markerdu *);
+
+	if (memcmp(&mdu->mdu_eh.ether_dhost,
+	    &ethermulticastaddr_slowprotocols, ETHER_ADDR_LEN)) {
+		goto bad;
+	}
+
+	if (mdu->mdu_sph.sph_version != 1) {
+		goto bad;
+	}
+
+	switch (mdu->mdu_tlv.tlv_type) {
+	case MARKER_TYPE_INFO:
+		if (tlv_check(mdu, sizeof(*mdu), &mdu->mdu_tlv,
+		    marker_info_tlv_template, TRUE)) {
+			goto bad;
+		}
+		mdu->mdu_tlv.tlv_type = MARKER_TYPE_RESPONSE;
+		memcpy(&mdu->mdu_eh.ether_dhost,
+		    &ethermulticastaddr_slowprotocols, ETHER_ADDR_LEN);
+		memcpy(&mdu->mdu_eh.ether_shost,
+		    lgp->lp_lladdr, ETHER_ADDR_LEN);
+		error = lagg_enqueue(lp->lp_ifp, m);
+		break;
+
+	case MARKER_TYPE_RESPONSE:
+		if (tlv_check(mdu, sizeof(*mdu), &mdu->mdu_tlv,
+		    marker_response_tlv_template, TRUE)) {
+			goto bad;
+		}
+		LACP_DPRINTF((lp, "marker response, port=%u, sys=%6D, id=%u\n",
+		    ntohs(mdu->mdu_info.mi_rq_port), mdu->mdu_info.mi_rq_system,
+		    ":", ntohl(mdu->mdu_info.mi_rq_xid)));
+
+		/* Verify that it is the last marker we sent out */
+		if (memcmp(&mdu->mdu_info, &lp->lp_marker,
+		    sizeof(struct lacp_markerinfo)))
+			goto bad;
+
+		LACP_LOCK(lsc);
+		lp->lp_flags &= ~LACP_PORT_MARK;
+
+		if (lsc->lsc_suppress_distributing) {
+			/* Check if any ports are waiting for a response */
+			LIST_FOREACH(lp2, &lsc->lsc_ports, lp_next) {
+				if (lp2->lp_flags & LACP_PORT_MARK) {
+					pending = 1;
+					break;
+				}
+			}
+
+			if (pending == 0) {
+				/* All interface queues are clear */
+				LACP_DPRINTF((NULL, "queue flush complete\n"));
+				lsc->lsc_suppress_distributing = FALSE;
+			}
+		}
+		LACP_UNLOCK(lsc);
+		m_freem(m);
+		break;
+
+	default:
+		goto bad;
+	}
+
+	return (error);
+
+bad:
+	LACP_DPRINTF((lp, "bad marker frame\n"));
+	m_freem(m);
+	return (EINVAL);
+}
+
+static int
+tlv_check(const void *p, size_t size, const struct tlvhdr *tlv,
+    const struct tlv_template *tmpl, boolean_t check_type)
+{
+	while (/* CONSTCOND */ 1) {
+		if ((const char *)tlv - (const char *)p + sizeof(*tlv) > size) {
+			return (EINVAL);
+		}
+		if ((check_type && tlv->tlv_type != tmpl->tmpl_type) ||
tlv->tlv_length != tmpl->tmpl_length) { + return (EINVAL); + } + if (tmpl->tmpl_type == 0) { + break; + } + tlv = (const struct tlvhdr *) + ((const char *)tlv + tlv->tlv_length); + tmpl++; + } + + return (0); +} + +#if defined(LACP_DEBUG) +const char * +lacp_format_mac(const uint8_t *mac, char *buf, size_t buflen) +{ + snprintf(buf, buflen, "%02X-%02X-%02X-%02X-%02X-%02X", + (int)mac[0], + (int)mac[1], + (int)mac[2], + (int)mac[3], + (int)mac[4], + (int)mac[5]); + + return (buf); +} + +const char * +lacp_format_systemid(const struct lacp_systemid *sysid, + char *buf, size_t buflen) +{ + char macbuf[LACP_MACSTR_MAX+1]; + + snprintf(buf, buflen, "%04X,%s", + ntohs(sysid->lsi_prio), + lacp_format_mac(sysid->lsi_mac, macbuf, sizeof(macbuf))); + + return (buf); +} + +const char * +lacp_format_portid(const struct lacp_portid *portid, char *buf, size_t buflen) +{ + snprintf(buf, buflen, "%04X,%04X", + ntohs(portid->lpi_prio), + ntohs(portid->lpi_portno)); + + return (buf); +} + +const char * +lacp_format_partner(const struct lacp_peerinfo *peer, char *buf, size_t buflen) +{ + char sysid[LACP_SYSTEMIDSTR_MAX+1]; + char portid[LACP_PORTIDSTR_MAX+1]; + + snprintf(buf, buflen, "(%s,%04X,%s)", + lacp_format_systemid(&peer->lip_systemid, sysid, sizeof(sysid)), + ntohs(peer->lip_key), + lacp_format_portid(&peer->lip_portid, portid, sizeof(portid))); + + return (buf); +} + +const char * +lacp_format_lagid(const struct lacp_peerinfo *a, + const struct lacp_peerinfo *b, char *buf, size_t buflen) +{ + char astr[LACP_PARTNERSTR_MAX+1]; + char bstr[LACP_PARTNERSTR_MAX+1]; + +#if 0 + /* + * there's a convention to display small numbered peer + * in the left. + */ + + if (lacp_compare_peerinfo(a, b) > 0) { + const struct lacp_peerinfo *t; + + t = a; + a = b; + b = t; + } +#endif + + snprintf(buf, buflen, "[%s,%s]", + lacp_format_partner(a, astr, sizeof(astr)), + lacp_format_partner(b, bstr, sizeof(bstr))); + + return (buf); +} + +const char * +lacp_format_lagid_aggregator(const struct lacp_aggregator *la, + char *buf, size_t buflen) +{ + if (la == NULL) { + return ("(none)"); + } + + return (lacp_format_lagid(&la->la_actor, &la->la_partner, buf, buflen)); +} + +const char * +lacp_format_state(uint8_t state, char *buf, size_t buflen) +{ + snprintf(buf, buflen, "%b", state, LACP_STATE_BITS); + return (buf); +} + +static void +lacp_dump_lacpdu(const struct lacpdu *du) +{ + char buf[LACP_PARTNERSTR_MAX+1]; + char buf2[LACP_STATESTR_MAX+1]; + + printf("actor=%s\n", + lacp_format_partner(&du->ldu_actor, buf, sizeof(buf))); + printf("actor.state=%s\n", + lacp_format_state(du->ldu_actor.lip_state, buf2, sizeof(buf2))); + printf("partner=%s\n", + lacp_format_partner(&du->ldu_partner, buf, sizeof(buf))); + printf("partner.state=%s\n", + lacp_format_state(du->ldu_partner.lip_state, buf2, sizeof(buf2))); + + printf("maxdelay=%d\n", ntohs(du->ldu_collector.lci_maxdelay)); +} + +static void +lacp_dprintf(const struct lacp_port *lp, const char *fmt, ...) +{ + va_list va; + + if (lp) { + printf("%s: ", lp->lp_ifp->if_xname); + } + + va_start(va, fmt); + vprintf(fmt, va); + va_end(va); +} +#endif diff --git a/freebsd/sys/net/ieee8023ad_lacp.h b/freebsd/sys/net/ieee8023ad_lacp.h new file mode 100644 index 00000000..9cebc591 --- /dev/null +++ b/freebsd/sys/net/ieee8023ad_lacp.h @@ -0,0 +1,333 @@ +/* $NetBSD: ieee8023ad_impl.h,v 1.2 2005/12/10 23:21:39 elad Exp $ */ + +/*- + * Copyright (c)2005 YAMAMOTO Takashi, + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * IEEE802.3ad LACP + * + * implementation details. + */ + +#define LACP_TIMER_CURRENT_WHILE 0 +#define LACP_TIMER_PERIODIC 1 +#define LACP_TIMER_WAIT_WHILE 2 +#define LACP_NTIMER 3 + +#define LACP_TIMER_ARM(port, timer, val) \ + (port)->lp_timer[(timer)] = (val) +#define LACP_TIMER_DISARM(port, timer) \ + (port)->lp_timer[(timer)] = 0 +#define LACP_TIMER_ISARMED(port, timer) \ + ((port)->lp_timer[(timer)] > 0) + +/* + * IEEE802.3ad LACP + * + * protocol definitions. + */ + +#define LACP_STATE_ACTIVITY (1<<0) +#define LACP_STATE_TIMEOUT (1<<1) +#define LACP_STATE_AGGREGATION (1<<2) +#define LACP_STATE_SYNC (1<<3) +#define LACP_STATE_COLLECTING (1<<4) +#define LACP_STATE_DISTRIBUTING (1<<5) +#define LACP_STATE_DEFAULTED (1<<6) +#define LACP_STATE_EXPIRED (1<<7) + +#define LACP_PORT_NTT 0x00000001 +#define LACP_PORT_MARK 0x00000002 + +#define LACP_STATE_BITS \ + "\020" \ + "\001ACTIVITY" \ + "\002TIMEOUT" \ + "\003AGGREGATION" \ + "\004SYNC" \ + "\005COLLECTING" \ + "\006DISTRIBUTING" \ + "\007DEFAULTED" \ + "\010EXPIRED" + +/* + * IEEE802.3 slow protocols + * + * protocol (on-wire) definitions. + * + * XXX should be elsewhere. + */ + +#define SLOWPROTOCOLS_SUBTYPE_LACP 1 +#define SLOWPROTOCOLS_SUBTYPE_MARKER 2 + +struct slowprothdr { + uint8_t sph_subtype; + uint8_t sph_version; +} __packed; + +/* + * TLV on-wire structure. + */ + +struct tlvhdr { + uint8_t tlv_type; + uint8_t tlv_length; + /* uint8_t tlv_value[]; */ +} __packed; + +/* + * ... and our implementation. 
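+ *
+ * Note that TLV_SET() below stores the length of the whole TLV,
+ * header included; e.g. for the actor-information TLV this works out
+ * to sizeof(struct tlvhdr) + sizeof(struct lacp_peerinfo)
+ * = 2 + 18 = 20 octets, the tlv_length IEEE 802.3ad specifies.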
+ */ + +#define TLV_SET(tlv, type, length) \ + do { \ + (tlv)->tlv_type = (type); \ + (tlv)->tlv_length = sizeof(*tlv) + (length); \ + } while (/*CONSTCOND*/0) + +struct tlv_template { + uint8_t tmpl_type; + uint8_t tmpl_length; +}; + +struct lacp_systemid { + uint16_t lsi_prio; + uint8_t lsi_mac[6]; +} __packed; + +struct lacp_portid { + uint16_t lpi_prio; + uint16_t lpi_portno; +} __packed; + +struct lacp_peerinfo { + struct lacp_systemid lip_systemid; + uint16_t lip_key; + struct lacp_portid lip_portid; + uint8_t lip_state; + uint8_t lip_resv[3]; +} __packed; + +struct lacp_collectorinfo { + uint16_t lci_maxdelay; + uint8_t lci_resv[12]; +} __packed; + +struct lacpdu { + struct ether_header ldu_eh; + struct slowprothdr ldu_sph; + + struct tlvhdr ldu_tlv_actor; + struct lacp_peerinfo ldu_actor; + struct tlvhdr ldu_tlv_partner; + struct lacp_peerinfo ldu_partner; + struct tlvhdr ldu_tlv_collector; + struct lacp_collectorinfo ldu_collector; + struct tlvhdr ldu_tlv_term; + uint8_t ldu_resv[50]; +} __packed; + +/* + * IEEE802.3ad marker protocol + * + * protocol (on-wire) definitions. + */ +struct lacp_markerinfo { + uint16_t mi_rq_port; + uint8_t mi_rq_system[ETHER_ADDR_LEN]; + uint32_t mi_rq_xid; + uint8_t mi_pad[2]; +} __packed; + +struct markerdu { + struct ether_header mdu_eh; + struct slowprothdr mdu_sph; + + struct tlvhdr mdu_tlv; + struct lacp_markerinfo mdu_info; + struct tlvhdr mdu_tlv_term; + uint8_t mdu_resv[90]; +} __packed; + +#define MARKER_TYPE_INFO 0x01 +#define MARKER_TYPE_RESPONSE 0x02 + +enum lacp_selected { + LACP_UNSELECTED, + LACP_STANDBY, /* not used in this implementation */ + LACP_SELECTED, +}; + +enum lacp_mux_state { + LACP_MUX_DETACHED, + LACP_MUX_WAITING, + LACP_MUX_ATTACHED, + LACP_MUX_COLLECTING, + LACP_MUX_DISTRIBUTING, +}; + +#define LACP_MAX_PORTS 32 + +struct lacp_portmap { + int pm_count; + struct lacp_port *pm_map[LACP_MAX_PORTS]; +}; + +struct lacp_port { + TAILQ_ENTRY(lacp_port) lp_dist_q; + LIST_ENTRY(lacp_port) lp_next; + struct lacp_softc *lp_lsc; + struct lagg_port *lp_lagg; + struct ifnet *lp_ifp; + struct lacp_peerinfo lp_partner; + struct lacp_peerinfo lp_actor; + struct lacp_markerinfo lp_marker; +#define lp_state lp_actor.lip_state +#define lp_key lp_actor.lip_key +#define lp_systemid lp_actor.lip_systemid + struct timeval lp_last_lacpdu; + int lp_lacpdu_sent; + enum lacp_mux_state lp_mux_state; + enum lacp_selected lp_selected; + int lp_flags; + u_int lp_media; /* XXX redundant */ + int lp_timer[LACP_NTIMER]; + struct ifmultiaddr *lp_ifma; + + struct lacp_aggregator *lp_aggregator; +}; + +struct lacp_aggregator { + TAILQ_ENTRY(lacp_aggregator) la_q; + int la_refcnt; /* num of ports which selected us */ + int la_nports; /* num of distributing ports */ + TAILQ_HEAD(, lacp_port) la_ports; /* distributing ports */ + struct lacp_peerinfo la_partner; + struct lacp_peerinfo la_actor; + int la_pending; /* number of ports in wait_while */ +}; + +struct lacp_softc { + struct lagg_softc *lsc_softc; + struct mtx lsc_mtx; + struct lacp_aggregator *lsc_active_aggregator; + TAILQ_HEAD(, lacp_aggregator) lsc_aggregators; + boolean_t lsc_suppress_distributing; + struct callout lsc_transit_callout; + struct callout lsc_callout; + LIST_HEAD(, lacp_port) lsc_ports; + struct lacp_portmap lsc_pmap[2]; + volatile u_int lsc_activemap; + u_int32_t lsc_hashkey; +}; + +#define LACP_TYPE_ACTORINFO 1 +#define LACP_TYPE_PARTNERINFO 2 +#define LACP_TYPE_COLLECTORINFO 3 + +/* timeout values (in sec) */ +#define LACP_FAST_PERIODIC_TIME (1) +#define LACP_SLOW_PERIODIC_TIME 
(30) +#define LACP_SHORT_TIMEOUT_TIME (3 * LACP_FAST_PERIODIC_TIME) +#define LACP_LONG_TIMEOUT_TIME (3 * LACP_SLOW_PERIODIC_TIME) +#define LACP_CHURN_DETECTION_TIME (60) +#define LACP_AGGREGATE_WAIT_TIME (2) +#define LACP_TRANSIT_DELAY 3000 /* in msec */ + +#define LACP_STATE_EQ(s1, s2, mask) \ + ((((s1) ^ (s2)) & (mask)) == 0) + +#define LACP_SYS_PRI(peer) (peer).lip_systemid.lsi_prio + +#define LACP_PORT(_lp) ((struct lacp_port *)(_lp)->lp_psc) +#define LACP_SOFTC(_sc) ((struct lacp_softc *)(_sc)->sc_psc) + +#define LACP_LOCK_INIT(_lsc) mtx_init(&(_lsc)->lsc_mtx, \ + "lacp mtx", NULL, MTX_DEF) +#define LACP_LOCK_DESTROY(_lsc) mtx_destroy(&(_lsc)->lsc_mtx) +#define LACP_LOCK(_lsc) mtx_lock(&(_lsc)->lsc_mtx) +#define LACP_UNLOCK(_lsc) mtx_unlock(&(_lsc)->lsc_mtx) +#define LACP_LOCK_ASSERT(_lsc) mtx_assert(&(_lsc)->lsc_mtx, MA_OWNED) + +struct mbuf *lacp_input(struct lagg_port *, struct mbuf *); +struct lagg_port *lacp_select_tx_port(struct lagg_softc *, struct mbuf *); +int lacp_attach(struct lagg_softc *); +int lacp_detach(struct lagg_softc *); +void lacp_init(struct lagg_softc *); +void lacp_stop(struct lagg_softc *); +int lacp_port_create(struct lagg_port *); +void lacp_port_destroy(struct lagg_port *); +void lacp_linkstate(struct lagg_port *); +void lacp_req(struct lagg_softc *, caddr_t); +void lacp_portreq(struct lagg_port *, caddr_t); + +static __inline int +lacp_isactive(struct lagg_port *lgp) +{ + struct lacp_port *lp = LACP_PORT(lgp); + struct lacp_softc *lsc = lp->lp_lsc; + struct lacp_aggregator *la = lp->lp_aggregator; + + /* This port is joined to the active aggregator */ + if (la != NULL && la == lsc->lsc_active_aggregator) + return (1); + + return (0); +} + +static __inline int +lacp_iscollecting(struct lagg_port *lgp) +{ + struct lacp_port *lp = LACP_PORT(lgp); + + return ((lp->lp_state & LACP_STATE_COLLECTING) != 0); +} + +static __inline int +lacp_isdistributing(struct lagg_port *lgp) +{ + struct lacp_port *lp = LACP_PORT(lgp); + + return ((lp->lp_state & LACP_STATE_DISTRIBUTING) != 0); +} + +/* following constants don't include terminating NUL */ +#define LACP_MACSTR_MAX (2*6 + 5) +#define LACP_SYSTEMPRIOSTR_MAX (4) +#define LACP_SYSTEMIDSTR_MAX (LACP_SYSTEMPRIOSTR_MAX + 1 + LACP_MACSTR_MAX) +#define LACP_PORTPRIOSTR_MAX (4) +#define LACP_PORTNOSTR_MAX (4) +#define LACP_PORTIDSTR_MAX (LACP_PORTPRIOSTR_MAX + 1 + LACP_PORTNOSTR_MAX) +#define LACP_KEYSTR_MAX (4) +#define LACP_PARTNERSTR_MAX \ + (1 + LACP_SYSTEMIDSTR_MAX + 1 + LACP_KEYSTR_MAX + 1 \ + + LACP_PORTIDSTR_MAX + 1) +#define LACP_LAGIDSTR_MAX \ + (1 + LACP_PARTNERSTR_MAX + 1 + LACP_PARTNERSTR_MAX + 1) +#define LACP_STATESTR_MAX (255) /* XXX */ diff --git a/freebsd/sys/net/if.c b/freebsd/sys/net/if.c new file mode 100644 index 00000000..33d9ed9d --- /dev/null +++ b/freebsd/sys/net/if.c @@ -0,0 +1,3431 @@ +#include + +/*- + * Copyright (c) 1980, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)if.c 8.5 (Berkeley) 1/9/95 + * $FreeBSD$ + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(INET) || defined(INET6) +/*XXX*/ +#include +#include +#include +#ifdef INET6 +#include +#include +#endif +#endif +#ifdef INET +#include +#endif + +#include + +#ifdef COMPAT_FREEBSD32 +#include +#include +#endif + +struct ifindex_entry { + struct ifnet *ife_ifnet; +}; + +static int slowtimo_started; + +SYSCTL_NODE(_net, PF_LINK, link, CTLFLAG_RW, 0, "Link layers"); +SYSCTL_NODE(_net_link, 0, generic, CTLFLAG_RW, 0, "Generic link-management"); + +TUNABLE_INT("net.link.ifqmaxlen", &ifqmaxlen); +SYSCTL_UINT(_net_link, OID_AUTO, ifqmaxlen, CTLFLAG_RDTUN, + &ifqmaxlen, 0, "max send queue size"); + +/* Log link state change events */ +static int log_link_state_change = 1; + +SYSCTL_INT(_net_link, OID_AUTO, log_link_state_change, CTLFLAG_RW, + &log_link_state_change, 0, + "log interface link state change events"); + +/* Interface description */ +static unsigned int ifdescr_maxlen = 1024; +SYSCTL_UINT(_net, OID_AUTO, ifdescr_maxlen, CTLFLAG_RW, + &ifdescr_maxlen, 0, + "administrative maximum length for interface description"); + +MALLOC_DEFINE(M_IFDESCR, "ifdescr", "ifnet descriptions"); + +/* global sx for non-critical path ifdescr */ +static struct sx ifdescr_sx; +SX_SYSINIT(ifdescr_sx, &ifdescr_sx, "ifnet descr"); + +void (*bstp_linkstate_p)(struct ifnet *ifp, int state); +void (*ng_ether_link_state_p)(struct ifnet *ifp, int state); +void (*lagg_linkstate_p)(struct ifnet *ifp, int state); +/* These are external hooks for CARP. 
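+ * They presumably stay NULL until the carp(4) module is loaded and
+ * fills them in, so callers are expected to check for NULL before
+ * calling through them.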
+ */
+void (*carp_linkstate_p)(struct ifnet *ifp);
+#if defined(INET) || defined(INET6)
+struct ifnet *(*carp_forus_p)(struct ifnet *ifp, u_char *dhost);
+int (*carp_output_p)(struct ifnet *ifp, struct mbuf *m,
+    struct sockaddr *sa, struct rtentry *rt);
+#endif
+#ifdef INET
+int (*carp_iamatch_p)(struct ifnet *, struct in_ifaddr *, struct in_addr *,
+    u_int8_t **);
+#endif
+#ifdef INET6
+struct ifaddr *(*carp_iamatch6_p)(struct ifnet *ifp, struct in6_addr *taddr6);
+caddr_t (*carp_macmatch6_p)(struct ifnet *ifp, struct mbuf *m,
+    const struct in6_addr *taddr);
+#endif
+
+struct mbuf *(*tbr_dequeue_ptr)(struct ifaltq *, int) = NULL;
+
+/*
+ * XXX: Style; these should be sorted alphabetically, and unprototyped
+ * static functions should be prototyped.  Currently they are sorted by
+ * declaration order.
+ */
+static void	if_attachdomain(void *);
+static void	if_attachdomain1(struct ifnet *);
+static int	ifconf(u_long, caddr_t);
+static void	if_freemulti(struct ifmultiaddr *);
+static void	if_init(void *);
+static void	if_grow(void);
+static void	if_check(void *);
+static void	if_route(struct ifnet *, int flag, int fam);
+static int	if_setflag(struct ifnet *, int, int, int *, int);
+static void	if_slowtimo(void *);
+static int	if_transmit(struct ifnet *ifp, struct mbuf *m);
+static void	if_unroute(struct ifnet *, int flag, int fam);
+static void	link_rtrequest(int, struct rtentry *, struct rt_addrinfo *);
+static int	if_rtdel(struct radix_node *, void *);
+static int	ifhwioctl(u_long, struct ifnet *, caddr_t, struct thread *);
+static int	if_delmulti_locked(struct ifnet *, struct ifmultiaddr *, int);
+static void	do_link_state_change(void *, int);
+static int	if_getgroup(struct ifgroupreq *, struct ifnet *);
+static int	if_getgroupmembers(struct ifgroupreq *);
+static void	if_delgroups(struct ifnet *);
+static void	if_attach_internal(struct ifnet *, int);
+static void	if_detach_internal(struct ifnet *, int);
+
+#ifdef INET6
+/*
+ * XXX: declare here to avoid including many inet6 related files..
+ * should be more generalized?
+ */
+extern void	nd6_setmtu(struct ifnet *);
+#endif
+
+VNET_DEFINE(int, if_index);
+int	ifqmaxlen = IFQ_MAXLEN;
+VNET_DEFINE(struct ifnethead, ifnet);	/* depend on static init XXX */
+VNET_DEFINE(struct ifgrouphead, ifg_head);
+
+static VNET_DEFINE(int, if_indexlim) = 8;
+
+/* Table of ifnet by index. */
+VNET_DEFINE(struct ifindex_entry *, ifindex_table);
+
+#define	V_if_indexlim	VNET(if_indexlim)
+#define	V_ifindex_table	VNET(ifindex_table)
+
+/*
+ * The global network interface list (V_ifnet) and related state (such as
+ * if_index, if_indexlim, and ifindex_table) are protected by an sxlock and
+ * an rwlock.  Either may be acquired shared to stabilize the list, but both
+ * must be acquired writable to modify the list.  This model allows us to
+ * both stabilize the interface list during interrupt thread processing, but
+ * also to stabilize it over long-running ioctls, without introducing priority
+ * inversions and deadlocks.
+ */
+struct rwlock ifnet_rwlock;
+struct sx ifnet_sxlock;
+
+/*
+ * The allocation of network interfaces is a rather non-atomic affair; we
+ * need to select an index before we are ready to expose the interface for
+ * use, so we will use this pointer value to indicate reservation.
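+ * IFNET_HOLD below is that reservation marker: if_alloc() parks it in
+ * the slot it has claimed, ifnet_byindex_locked() reports a held slot
+ * as empty (NULL), and the real ifnet pointer only replaces the marker
+ * at the end of if_alloc().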
+ */ +#define IFNET_HOLD (void *)(uintptr_t)(-1) + +static if_com_alloc_t *if_com_alloc[256]; +static if_com_free_t *if_com_free[256]; + +/* + * System initialization + */ +SYSINIT(interface_check, SI_SUB_PROTO_IF, SI_ORDER_FIRST, if_check, NULL); + +MALLOC_DEFINE(M_IFNET, "ifnet", "interface internals"); +MALLOC_DEFINE(M_IFADDR, "ifaddr", "interface address"); +MALLOC_DEFINE(M_IFMADDR, "ether_multi", "link-level multicast address"); + +struct ifnet * +ifnet_byindex_locked(u_short idx) +{ + + if (idx > V_if_index) + return (NULL); + if (V_ifindex_table[idx].ife_ifnet == IFNET_HOLD) + return (NULL); + return (V_ifindex_table[idx].ife_ifnet); +} + +struct ifnet * +ifnet_byindex(u_short idx) +{ + struct ifnet *ifp; + + IFNET_RLOCK_NOSLEEP(); + ifp = ifnet_byindex_locked(idx); + IFNET_RUNLOCK_NOSLEEP(); + return (ifp); +} + +struct ifnet * +ifnet_byindex_ref(u_short idx) +{ + struct ifnet *ifp; + + IFNET_RLOCK_NOSLEEP(); + ifp = ifnet_byindex_locked(idx); + if (ifp == NULL || (ifp->if_flags & IFF_DYING)) { + IFNET_RUNLOCK_NOSLEEP(); + return (NULL); + } + if_ref(ifp); + IFNET_RUNLOCK_NOSLEEP(); + return (ifp); +} + +/* + * Allocate an ifindex array entry; return 0 on success or an error on + * failure. + */ +static int +ifindex_alloc_locked(u_short *idxp) +{ + u_short idx; + + IFNET_WLOCK_ASSERT(); + + /* + * Try to find an empty slot below V_if_index. If we fail, take the + * next slot. + */ + for (idx = 1; idx <= V_if_index; idx++) { + if (V_ifindex_table[idx].ife_ifnet == NULL) + break; + } + + /* Catch if_index overflow. */ + if (idx < 1) + return (ENOSPC); + if (idx > V_if_index) + V_if_index = idx; + if (V_if_index >= V_if_indexlim) + if_grow(); + *idxp = idx; + return (0); +} + +static void +ifindex_free_locked(u_short idx) +{ + + IFNET_WLOCK_ASSERT(); + + V_ifindex_table[idx].ife_ifnet = NULL; + while (V_if_index > 0 && + V_ifindex_table[V_if_index].ife_ifnet == NULL) + V_if_index--; +} + +static void +ifindex_free(u_short idx) +{ + + IFNET_WLOCK(); + ifindex_free_locked(idx); + IFNET_WUNLOCK(); +} + +static void +ifnet_setbyindex_locked(u_short idx, struct ifnet *ifp) +{ + + IFNET_WLOCK_ASSERT(); + + V_ifindex_table[idx].ife_ifnet = ifp; +} + +static void +ifnet_setbyindex(u_short idx, struct ifnet *ifp) +{ + + IFNET_WLOCK(); + ifnet_setbyindex_locked(idx, ifp); + IFNET_WUNLOCK(); +} + +struct ifaddr * +ifaddr_byindex(u_short idx) +{ + struct ifaddr *ifa; + + IFNET_RLOCK_NOSLEEP(); + ifa = ifnet_byindex_locked(idx)->if_addr; + if (ifa != NULL) + ifa_ref(ifa); + IFNET_RUNLOCK_NOSLEEP(); + return (ifa); +} + +/* + * Network interface utility routines. + * + * Routines with ifa_ifwith* names take sockaddr *'s as + * parameters. 
+ */
+
+static void
+vnet_if_init(const void *unused __unused)
+{
+
+	TAILQ_INIT(&V_ifnet);
+	TAILQ_INIT(&V_ifg_head);
+	if_grow();	/* create initial table */
+	vnet_if_clone_init();
+}
+VNET_SYSINIT(vnet_if_init, SI_SUB_INIT_IF, SI_ORDER_FIRST, vnet_if_init,
+    NULL);
+
+/* ARGSUSED*/
+static void
+if_init(void *dummy __unused)
+{
+
+	IFNET_LOCK_INIT();
+	if_clone_init();
+}
+SYSINIT(interfaces, SI_SUB_INIT_IF, SI_ORDER_SECOND, if_init, NULL);
+
+
+#ifdef VIMAGE
+static void
+vnet_if_uninit(const void *unused __unused)
+{
+
+	VNET_ASSERT(TAILQ_EMPTY(&V_ifnet));
+	VNET_ASSERT(TAILQ_EMPTY(&V_ifg_head));
+
+	free((caddr_t)V_ifindex_table, M_IFNET);
+}
+VNET_SYSUNINIT(vnet_if_uninit, SI_SUB_INIT_IF, SI_ORDER_FIRST,
+    vnet_if_uninit, NULL);
+#endif
+
+static void
+if_grow(void)
+{
+	u_int n;
+	struct ifindex_entry *e;
+
+	V_if_indexlim <<= 1;
+	n = V_if_indexlim * sizeof(*e);
+	e = malloc(n, M_IFNET, M_WAITOK | M_ZERO);
+	if (V_ifindex_table != NULL) {
+		memcpy((caddr_t)e, (caddr_t)V_ifindex_table, n/2);
+		free((caddr_t)V_ifindex_table, M_IFNET);
+	}
+	V_ifindex_table = e;
+}
+
+static void
+if_check(void *dummy __unused)
+{
+
+	/*
+	 * If at least one interface added during boot uses
+	 * if_watchdog then start the timer.
+	 */
+	if (slowtimo_started)
+		if_slowtimo(0);
+}
+
+/*
+ * Allocate a struct ifnet and an index for an interface.  A layer 2
+ * common structure will also be allocated if an allocation routine is
+ * registered for the passed type.
+ */
+struct ifnet *
+if_alloc(u_char type)
+{
+	struct ifnet *ifp;
+	u_short idx;
+
+	ifp = malloc(sizeof(struct ifnet), M_IFNET, M_WAITOK|M_ZERO);
+	IFNET_WLOCK();
+	if (ifindex_alloc_locked(&idx) != 0) {
+		IFNET_WUNLOCK();
+		free(ifp, M_IFNET);
+		return (NULL);
+	}
+	ifnet_setbyindex_locked(idx, IFNET_HOLD);
+	IFNET_WUNLOCK();
+	ifp->if_index = idx;
+	ifp->if_type = type;
+	ifp->if_alloctype = type;
+	if (if_com_alloc[type] != NULL) {
+		ifp->if_l2com = if_com_alloc[type](type, ifp);
+		if (ifp->if_l2com == NULL) {
+			free(ifp, M_IFNET);
+			ifindex_free(idx);
+			return (NULL);
+		}
+	}
+
+	IF_ADDR_LOCK_INIT(ifp);
+	TASK_INIT(&ifp->if_linktask, 0, do_link_state_change, ifp);
+	ifp->if_afdata_initialized = 0;
+	IF_AFDATA_LOCK_INIT(ifp);
+	TAILQ_INIT(&ifp->if_addrhead);
+	TAILQ_INIT(&ifp->if_prefixhead);
+	TAILQ_INIT(&ifp->if_multiaddrs);
+	TAILQ_INIT(&ifp->if_groups);
+#ifdef MAC
+	mac_ifnet_init(ifp);
+#endif
+	ifq_init(&ifp->if_snd, ifp);
+
+	refcount_init(&ifp->if_refcount, 1);	/* Index reference. */
+	ifnet_setbyindex(ifp->if_index, ifp);
+	return (ifp);
+}
+
+/*
+ * Do the actual work of freeing a struct ifnet, associated index, and layer
+ * 2 common structure.  This call is made when the last reference to an
+ * interface is released.
+ */
+static void
+if_free_internal(struct ifnet *ifp)
+{
+
+	KASSERT((ifp->if_flags & IFF_DYING),
+	    ("if_free_internal: interface not dying"));
+
+	IFNET_WLOCK();
+	KASSERT(ifp == ifnet_byindex_locked(ifp->if_index),
+	    ("%s: freeing unallocated ifnet", ifp->if_xname));
+
+	ifindex_free_locked(ifp->if_index);
+	IFNET_WUNLOCK();
+
+	if (if_com_free[ifp->if_alloctype] != NULL)
+		if_com_free[ifp->if_alloctype](ifp->if_l2com,
+		    ifp->if_alloctype);
+
+#ifdef MAC
+	mac_ifnet_destroy(ifp);
+#endif /* MAC */
+	if (ifp->if_description != NULL)
+		free(ifp->if_description, M_IFDESCR);
+	IF_AFDATA_DESTROY(ifp);
+	IF_ADDR_LOCK_DESTROY(ifp);
+	ifq_delete(&ifp->if_snd);
+	free(ifp, M_IFNET);
+}
+
+/*
+ * This version should only be called by interfaces that switch their type
+ * after calling if_alloc().
+ * if_free_type() will go away again now that we
+ * have if_alloctype to cache the original allocation type.  For now, assert
+ * that they match, since we require that in practice.
+ */
+void
+if_free_type(struct ifnet *ifp, u_char type)
+{
+
+	KASSERT(ifp->if_alloctype == type,
+	    ("if_free_type: type (%d) != alloctype (%d)", type,
+	    ifp->if_alloctype));
+
+	ifp->if_flags |= IFF_DYING;	/* XXX: Locking */
+	if (!refcount_release(&ifp->if_refcount))
+		return;
+	if_free_internal(ifp);
+}
+
+/*
+ * This is the normal version of if_free(), used by device drivers to free a
+ * detached network interface.  The contents of if_free_type() will move into
+ * here when if_free_type() goes away.
+ */
+void
+if_free(struct ifnet *ifp)
+{
+
+	if_free_type(ifp, ifp->if_alloctype);
+}
+
+/*
+ * Interfaces to keep an ifnet type-stable despite the possibility of the
+ * driver calling if_free().  If there are additional references, we defer
+ * freeing the underlying data structure.
+ */
+void
+if_ref(struct ifnet *ifp)
+{
+
+	/* We don't assert the ifnet list lock here, but arguably should. */
+	refcount_acquire(&ifp->if_refcount);
+}
+
+void
+if_rele(struct ifnet *ifp)
+{
+
+	if (!refcount_release(&ifp->if_refcount))
+		return;
+	if_free_internal(ifp);
+}
+
+void
+ifq_init(struct ifaltq *ifq, struct ifnet *ifp)
+{
+
+	mtx_init(&ifq->ifq_mtx, ifp->if_xname, "if send queue", MTX_DEF);
+
+	if (ifq->ifq_maxlen == 0)
+		ifq->ifq_maxlen = ifqmaxlen;
+
+	ifq->altq_type = 0;
+	ifq->altq_disc = NULL;
+	ifq->altq_flags &= ALTQF_CANTCHANGE;
+	ifq->altq_tbr = NULL;
+	ifq->altq_ifp = ifp;
+}
+
+void
+ifq_delete(struct ifaltq *ifq)
+{
+	mtx_destroy(&ifq->ifq_mtx);
+}
+
+/*
+ * Perform generic interface initialization tasks and attach the interface
+ * to the list of "active" interfaces.  If vmove flag is set on entry
+ * to if_attach_internal(), perform only a limited subset of initialization
+ * tasks, given that we are moving an ifnet which has already been fully
+ * initialized from one vnet to another.
+ *
+ * XXX:
+ *  - The decision to return void and thus require this function to
+ *    succeed is questionable.
+ *  - We should probably do more sanity checking.  For instance we don't
+ *    do anything to ensure if_xname is unique or non-empty.
+ */
+void
+if_attach(struct ifnet *ifp)
+{
+
+	if_attach_internal(ifp, 0);
+}
+
+static void
+if_attach_internal(struct ifnet *ifp, int vmove)
+{
+	unsigned socksize, ifasize;
+	int namelen, masklen;
+	struct sockaddr_dl *sdl;
+	struct ifaddr *ifa;
+
+	if (ifp->if_index == 0 || ifp != ifnet_byindex(ifp->if_index))
+		panic ("%s: BUG: if_attach called without if_alloc'd input()\n",
+		    ifp->if_xname);
+
+#ifdef VIMAGE
+	ifp->if_vnet = curvnet;
+	if (ifp->if_home_vnet == NULL)
+		ifp->if_home_vnet = curvnet;
+#endif
+
+	if_addgroup(ifp, IFG_ALL);
+
+	getmicrotime(&ifp->if_lastchange);
+	ifp->if_data.ifi_epoch = time_uptime;
+	ifp->if_data.ifi_datalen = sizeof(struct if_data);
+
+	KASSERT((ifp->if_transmit == NULL && ifp->if_qflush == NULL) ||
+	    (ifp->if_transmit != NULL && ifp->if_qflush != NULL),
+	    ("transmit and qflush must both either be set or both be NULL"));
+	if (ifp->if_transmit == NULL) {
+		ifp->if_transmit = if_transmit;
+		ifp->if_qflush = if_qflush;
+	}
+
+	if (!vmove) {
+#ifdef MAC
+		mac_ifnet_create(ifp);
+#endif
+
+		/*
+		 * Create a Link Level name for this device.
+		 */
+		namelen = strlen(ifp->if_xname);
+		/*
+		 * Always save enough space for any possible name so we
+		 * can do a rename in place later.
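+		 *
+		 * As a worked example (LP64 Ethernet; the exact numbers
+		 * are ABI-dependent): masklen = 8 + IFNAMSIZ (16) = 24
+		 * and socksize = 24 + 6 = 30, which is below
+		 * sizeof(*sdl) (54) and therefore bumped to 54, then
+		 * rounded up to 56, the next multiple of sizeof(long).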
+ */ + masklen = offsetof(struct sockaddr_dl, sdl_data[0]) + IFNAMSIZ; + socksize = masklen + ifp->if_addrlen; + if (socksize < sizeof(*sdl)) + socksize = sizeof(*sdl); + socksize = roundup2(socksize, sizeof(long)); + ifasize = sizeof(*ifa) + 2 * socksize; + ifa = malloc(ifasize, M_IFADDR, M_WAITOK | M_ZERO); + ifa_init(ifa); + sdl = (struct sockaddr_dl *)(ifa + 1); + sdl->sdl_len = socksize; + sdl->sdl_family = AF_LINK; + bcopy(ifp->if_xname, sdl->sdl_data, namelen); + sdl->sdl_nlen = namelen; + sdl->sdl_index = ifp->if_index; + sdl->sdl_type = ifp->if_type; + ifp->if_addr = ifa; + ifa->ifa_ifp = ifp; + ifa->ifa_rtrequest = link_rtrequest; + ifa->ifa_addr = (struct sockaddr *)sdl; + sdl = (struct sockaddr_dl *)(socksize + (caddr_t)sdl); + ifa->ifa_netmask = (struct sockaddr *)sdl; + sdl->sdl_len = masklen; + while (namelen != 0) + sdl->sdl_data[--namelen] = 0xff; + TAILQ_INSERT_HEAD(&ifp->if_addrhead, ifa, ifa_link); + /* Reliably crash if used uninitialized. */ + ifp->if_broadcastaddr = NULL; + } +#ifdef VIMAGE + else { + /* + * Update the interface index in the link layer address + * of the interface. + */ + for (ifa = ifp->if_addr; ifa != NULL; + ifa = TAILQ_NEXT(ifa, ifa_link)) { + if (ifa->ifa_addr->sa_family == AF_LINK) { + sdl = (struct sockaddr_dl *)ifa->ifa_addr; + sdl->sdl_index = ifp->if_index; + } + } + } +#endif + + IFNET_WLOCK(); + TAILQ_INSERT_TAIL(&V_ifnet, ifp, if_link); +#ifdef VIMAGE + curvnet->vnet_ifcnt++; +#endif + IFNET_WUNLOCK(); + + if (domain_init_status >= 2) + if_attachdomain1(ifp); + + EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp); + if (IS_DEFAULT_VNET(curvnet)) + devctl_notify("IFNET", ifp->if_xname, "ATTACH", NULL); + + /* Announce the interface. */ + rt_ifannouncemsg(ifp, IFAN_ARRIVAL); + + if (!vmove && ifp->if_watchdog != NULL) { + if_printf(ifp, + "WARNING: using obsoleted if_watchdog interface\n"); + + /* + * Note that we need if_slowtimo(). If this happens after + * boot, then call if_slowtimo() directly. + */ + if (atomic_cmpset_int(&slowtimo_started, 0, 1) && !cold) + if_slowtimo(0); + } +} + +static void +if_attachdomain(void *dummy) +{ + struct ifnet *ifp; + int s; + + s = splnet(); + TAILQ_FOREACH(ifp, &V_ifnet, if_link) + if_attachdomain1(ifp); + splx(s); +} +SYSINIT(domainifattach, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_SECOND, + if_attachdomain, NULL); + +static void +if_attachdomain1(struct ifnet *ifp) +{ + struct domain *dp; + int s; + + s = splnet(); + + /* + * Since dp->dom_ifattach calls malloc() with M_WAITOK, we + * cannot lock ifp->if_afdata initialization, entirely. + */ + if (IF_AFDATA_TRYLOCK(ifp) == 0) { + splx(s); + return; + } + if (ifp->if_afdata_initialized >= domain_init_status) { + IF_AFDATA_UNLOCK(ifp); + splx(s); + printf("if_attachdomain called more than once on %s\n", + ifp->if_xname); + return; + } + ifp->if_afdata_initialized = domain_init_status; + IF_AFDATA_UNLOCK(ifp); + + /* address family dependent data region */ + bzero(ifp->if_afdata, sizeof(ifp->if_afdata)); + for (dp = domains; dp; dp = dp->dom_next) { + if (dp->dom_ifattach) + ifp->if_afdata[dp->dom_family] = + (*dp->dom_ifattach)(ifp); + } + + splx(s); +} + +/* + * Remove any unicast or broadcast network addresses from an interface. + */ +void +if_purgeaddrs(struct ifnet *ifp) +{ + struct ifaddr *ifa, *next; + + TAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, next) { + if (ifa->ifa_addr->sa_family == AF_LINK) + continue; +#ifdef INET + /* XXX: Ugly!! 
ad hoc just for INET */
+		if (ifa->ifa_addr->sa_family == AF_INET) {
+			struct ifaliasreq ifr;
+
+			bzero(&ifr, sizeof(ifr));
+			ifr.ifra_addr = *ifa->ifa_addr;
+			if (ifa->ifa_dstaddr)
+				ifr.ifra_broadaddr = *ifa->ifa_dstaddr;
+			if (in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr, ifp,
+			    NULL) == 0)
+				continue;
+		}
+#endif /* INET */
+#ifdef INET6
+		if (ifa->ifa_addr->sa_family == AF_INET6) {
+			in6_purgeaddr(ifa);
+			/* ifp_addrhead is already updated */
+			continue;
+		}
+#endif /* INET6 */
+		TAILQ_REMOVE(&ifp->if_addrhead, ifa, ifa_link);
+		ifa_free(ifa);
+	}
+}
+
+/*
+ * Remove any multicast network addresses from an interface when an ifnet
+ * is going away.
+ */
+static void
+if_purgemaddrs(struct ifnet *ifp)
+{
+	struct ifmultiaddr *ifma;
+	struct ifmultiaddr *next;
+
+	IF_ADDR_LOCK(ifp);
+	TAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next)
+		if_delmulti_locked(ifp, ifma, 1);
+	IF_ADDR_UNLOCK(ifp);
+}
+
+/*
+ * Detach an interface, removing it from the list of "active" interfaces.
+ * If vmove flag is set on entry to if_detach_internal(), perform only a
+ * limited subset of cleanup tasks, given that we are moving an ifnet from
+ * one vnet to another, where it must be fully operational.
+ *
+ * XXXRW: There are some significant questions about event ordering, and
+ * how to prevent things from starting to use the interface during detach.
+ */
+void
+if_detach(struct ifnet *ifp)
+{
+
+	if_detach_internal(ifp, 0);
+}
+
+static void
+if_detach_internal(struct ifnet *ifp, int vmove)
+{
+	struct ifaddr *ifa;
+	struct radix_node_head *rnh;
+	int i, j;
+	struct domain *dp;
+	struct ifnet *iter;
+	int found = 0;
+
+	IFNET_WLOCK();
+	TAILQ_FOREACH(iter, &V_ifnet, if_link)
+		if (iter == ifp) {
+			TAILQ_REMOVE(&V_ifnet, ifp, if_link);
+			found = 1;
+			break;
+		}
+#ifdef VIMAGE
+	if (found)
+		curvnet->vnet_ifcnt--;
+#endif
+	IFNET_WUNLOCK();
+	if (!found) {
+		if (vmove)
+			panic("%s: ifp=%p not on the ifnet tailq %p",
+			    __func__, ifp, &V_ifnet);
+		else
+			return; /* XXX this should panic as well? */
+	}
+
+	/*
+	 * Remove/wait for pending events.
+	 */
+	taskqueue_drain(taskqueue_swi, &ifp->if_linktask);
+
+	/*
+	 * Remove routes and flush queues.
+	 */
+	if_down(ifp);
+#ifdef ALTQ
+	if (ALTQ_IS_ENABLED(&ifp->if_snd))
+		altq_disable(&ifp->if_snd);
+	if (ALTQ_IS_ATTACHED(&ifp->if_snd))
+		altq_detach(&ifp->if_snd);
+#endif
+
+	if_purgeaddrs(ifp);
+
+#ifdef INET
+	in_ifdetach(ifp);
+#endif
+
+#ifdef INET6
+	/*
+	 * Remove all IPv6 kernel structs related to ifp.  This should be done
+	 * before removing routing entries below, since IPv6 interface direct
+	 * routes are expected to be removed by the IPv6-specific kernel API.
+	 * Otherwise, the kernel will detect some inconsistency and complain
+	 * about it.
+	 */
+	in6_ifdetach(ifp);
+#endif
+	if_purgemaddrs(ifp);
+
+	if (!vmove) {
+		/*
+		 * Prevent further calls into the device driver via ifnet.
+		 */
+		if_dead(ifp);
+
+		/*
+		 * Remove link ifaddr pointer and maybe decrement if_index.
+		 * Clean up all addresses.
+		 */
+		ifp->if_addr = NULL;
+
+		/* We can now free link ifaddr. */
+		if (!TAILQ_EMPTY(&ifp->if_addrhead)) {
+			ifa = TAILQ_FIRST(&ifp->if_addrhead);
+			TAILQ_REMOVE(&ifp->if_addrhead, ifa, ifa_link);
+			ifa_free(ifa);
+		}
+	}
+
+	/*
+	 * Delete all remaining routes using this interface.
+	 * Unfortunately the only way to do this is to slog through
+	 * the entire routing table looking for routes which point
+	 * to this interface...oh well...
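+	 * Concretely, the loop below visits every address family's
+	 * radix head in every FIB and lets if_rtdel() delete the
+	 * entries whose rt_ifp points at this interface.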
+ */
+ for (i = 1; i <= AF_MAX; i++) {
+ for (j = 0; j < rt_numfibs; j++) {
+ rnh = rt_tables_get_rnh(j, i);
+ if (rnh == NULL)
+ continue;
+ RADIX_NODE_HEAD_LOCK(rnh);
+ (void) rnh->rnh_walktree(rnh, if_rtdel, ifp);
+ RADIX_NODE_HEAD_UNLOCK(rnh);
+ }
+ }
+
+ /* Announce that the interface is gone. */
+ rt_ifannouncemsg(ifp, IFAN_DEPARTURE);
+ EVENTHANDLER_INVOKE(ifnet_departure_event, ifp);
+ if (IS_DEFAULT_VNET(curvnet))
+ devctl_notify("IFNET", ifp->if_xname, "DETACH", NULL);
+ if_delgroups(ifp);
+
+ /*
+ * We cannot hold the lock over dom_ifdetach calls as they might
+ * sleep, for example trying to drain a callout, thus open up the
+ * theoretical race with re-attaching.
+ */
+ IF_AFDATA_LOCK(ifp);
+ i = ifp->if_afdata_initialized;
+ ifp->if_afdata_initialized = 0;
+ IF_AFDATA_UNLOCK(ifp);
+ for (dp = domains; i > 0 && dp; dp = dp->dom_next) {
+ if (dp->dom_ifdetach && ifp->if_afdata[dp->dom_family])
+ (*dp->dom_ifdetach)(ifp,
+ ifp->if_afdata[dp->dom_family]);
+ }
+}
+
+#ifdef VIMAGE
+/*
+ * if_vmove() performs a limited version of if_detach() in current
+ * vnet and if_attach()es the ifnet to the vnet specified as 2nd arg.
+ * An attempt is made to shrink if_index in current vnet, to find an
+ * unused if_index in the target vnet, to call if_grow() if necessary,
+ * and finally to find an unused if_xname for the target vnet.
+ */
+void
+if_vmove(struct ifnet *ifp, struct vnet *new_vnet)
+{
+ u_short idx;
+
+ /*
+ * Detach from current vnet, but preserve LLADDR info, do not
+ * mark as dead etc. so that the ifnet can be reattached later.
+ */
+ if_detach_internal(ifp, 1);
+
+ /*
+ * Unlink the ifnet from ifindex_table[] in current vnet, and shrink
+ * the if_index for that vnet if possible.
+ *
+ * NOTE: IFNET_WLOCK/IFNET_WUNLOCK() are assumed to be unvirtualized,
+ * or we'd lock on one vnet and unlock on another.
+ */
+ IFNET_WLOCK();
+ ifindex_free_locked(ifp->if_index);
+ IFNET_WUNLOCK();
+
+ /*
+ * Perform interface-specific reassignment tasks, if provided by
+ * the driver.
+ */
+ if (ifp->if_reassign != NULL)
+ ifp->if_reassign(ifp, new_vnet, NULL);
+
+ /*
+ * Switch to the context of the target vnet.
+ */
+ CURVNET_SET_QUIET(new_vnet);
+
+ IFNET_WLOCK();
+ if (ifindex_alloc_locked(&idx) != 0) {
+ IFNET_WUNLOCK();
+ panic("if_index overflow");
+ }
+ ifp->if_index = idx;
+ ifnet_setbyindex_locked(ifp->if_index, ifp);
+ IFNET_WUNLOCK();
+
+ if_attach_internal(ifp, 1);
+
+ CURVNET_RESTORE();
+}
+
+/*
+ * Move an ifnet to or from another child prison/vnet, specified by the jail id.
+ */
+static int
+if_vmove_loan(struct thread *td, struct ifnet *ifp, char *ifname, int jid)
+{
+ struct prison *pr;
+ struct ifnet *difp;
+
+ /* Try to find the prison within our visibility. */
+ sx_slock(&allprison_lock);
+ pr = prison_find_child(td->td_ucred->cr_prison, jid);
+ sx_sunlock(&allprison_lock);
+ if (pr == NULL)
+ return (ENXIO);
+ prison_hold_locked(pr);
+ mtx_unlock(&pr->pr_mtx);
+
+ /* Do not try to move the iface from and to the same prison. */
+ if (pr->pr_vnet == ifp->if_vnet) {
+ prison_free(pr);
+ return (EEXIST);
+ }
+
+ /* Make sure the named iface does not exist in the dst. prison/vnet. */
+ /* XXX Lock interfaces to avoid races. */
+ CURVNET_SET_QUIET(pr->pr_vnet);
+ difp = ifunit(ifname);
+ CURVNET_RESTORE();
+ if (difp != NULL) {
+ prison_free(pr);
+ return (EEXIST);
+ }
+
+ /* Move the interface into the child jail/vnet. */
+ if_vmove(ifp, pr->pr_vnet);
+
+ /* Report the new if_xname back to the userland.
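For reference, this loan path is driven from userland by the SIOCSIFVNET ioctl handled later in ifhwioctl(), which carries the target jail id in ifr_jid and reports the (possibly renamed) if_xname back. A minimal user-space sketch, assuming a VIMAGE kernel; the interface name and jail id are placeholders:

#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct ifreq ifr;
	int s = socket(AF_INET, SOCK_DGRAM, 0);

	if (s < 0)
		return (1);
	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, "epair0b", sizeof(ifr.ifr_name)); /* placeholder */
	ifr.ifr_jid = 42;			/* placeholder jail id */
	if (ioctl(s, SIOCSIFVNET, &ifr) == -1)
		perror("SIOCSIFVNET");
	else	/* the kernel wrote the new if_xname back into ifr_name */
		printf("interface is now %s\n", ifr.ifr_name);
	close(s);
	return (0);
}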
*/ + sprintf(ifname, "%s", ifp->if_xname); + + prison_free(pr); + return (0); +} + +static int +if_vmove_reclaim(struct thread *td, char *ifname, int jid) +{ + struct prison *pr; + struct vnet *vnet_dst; + struct ifnet *ifp; + + /* Try to find the prison within our visibility. */ + sx_slock(&allprison_lock); + pr = prison_find_child(td->td_ucred->cr_prison, jid); + sx_sunlock(&allprison_lock); + if (pr == NULL) + return (ENXIO); + prison_hold_locked(pr); + mtx_unlock(&pr->pr_mtx); + + /* Make sure the named iface exists in the source prison/vnet. */ + CURVNET_SET(pr->pr_vnet); + ifp = ifunit(ifname); /* XXX Lock to avoid races. */ + if (ifp == NULL) { + CURVNET_RESTORE(); + prison_free(pr); + return (ENXIO); + } + + /* Do not try to move the iface from and to the same prison. */ + vnet_dst = TD_TO_VNET(td); + if (vnet_dst == ifp->if_vnet) { + CURVNET_RESTORE(); + prison_free(pr); + return (EEXIST); + } + + /* Get interface back from child jail/vnet. */ + if_vmove(ifp, vnet_dst); + CURVNET_RESTORE(); + + /* Report the new if_xname back to the userland. */ + sprintf(ifname, "%s", ifp->if_xname); + + prison_free(pr); + return (0); +} +#endif /* VIMAGE */ + +/* + * Add a group to an interface + */ +int +if_addgroup(struct ifnet *ifp, const char *groupname) +{ + struct ifg_list *ifgl; + struct ifg_group *ifg = NULL; + struct ifg_member *ifgm; + + if (groupname[0] && groupname[strlen(groupname) - 1] >= '0' && + groupname[strlen(groupname) - 1] <= '9') + return (EINVAL); + + IFNET_WLOCK(); + TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) + if (!strcmp(ifgl->ifgl_group->ifg_group, groupname)) { + IFNET_WUNLOCK(); + return (EEXIST); + } + + if ((ifgl = (struct ifg_list *)malloc(sizeof(struct ifg_list), M_TEMP, + M_NOWAIT)) == NULL) { + IFNET_WUNLOCK(); + return (ENOMEM); + } + + if ((ifgm = (struct ifg_member *)malloc(sizeof(struct ifg_member), + M_TEMP, M_NOWAIT)) == NULL) { + free(ifgl, M_TEMP); + IFNET_WUNLOCK(); + return (ENOMEM); + } + + TAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) + if (!strcmp(ifg->ifg_group, groupname)) + break; + + if (ifg == NULL) { + if ((ifg = (struct ifg_group *)malloc(sizeof(struct ifg_group), + M_TEMP, M_NOWAIT)) == NULL) { + free(ifgl, M_TEMP); + free(ifgm, M_TEMP); + IFNET_WUNLOCK(); + return (ENOMEM); + } + strlcpy(ifg->ifg_group, groupname, sizeof(ifg->ifg_group)); + ifg->ifg_refcnt = 0; + TAILQ_INIT(&ifg->ifg_members); + EVENTHANDLER_INVOKE(group_attach_event, ifg); + TAILQ_INSERT_TAIL(&V_ifg_head, ifg, ifg_next); + } + + ifg->ifg_refcnt++; + ifgl->ifgl_group = ifg; + ifgm->ifgm_ifp = ifp; + + IF_ADDR_LOCK(ifp); + TAILQ_INSERT_TAIL(&ifg->ifg_members, ifgm, ifgm_next); + TAILQ_INSERT_TAIL(&ifp->if_groups, ifgl, ifgl_next); + IF_ADDR_UNLOCK(ifp); + + IFNET_WUNLOCK(); + + EVENTHANDLER_INVOKE(group_change_event, groupname); + + return (0); +} + +/* + * Remove a group from an interface + */ +int +if_delgroup(struct ifnet *ifp, const char *groupname) +{ + struct ifg_list *ifgl; + struct ifg_member *ifgm; + + IFNET_WLOCK(); + TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) + if (!strcmp(ifgl->ifgl_group->ifg_group, groupname)) + break; + if (ifgl == NULL) { + IFNET_WUNLOCK(); + return (ENOENT); + } + + IF_ADDR_LOCK(ifp); + TAILQ_REMOVE(&ifp->if_groups, ifgl, ifgl_next); + IF_ADDR_UNLOCK(ifp); + + TAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next) + if (ifgm->ifgm_ifp == ifp) + break; + + if (ifgm != NULL) { + TAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifgm_next); + free(ifgm, M_TEMP); + } + + if (--ifgl->ifgl_group->ifg_refcnt == 0) { + 
TAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_next); + EVENTHANDLER_INVOKE(group_detach_event, ifgl->ifgl_group); + free(ifgl->ifgl_group, M_TEMP); + } + IFNET_WUNLOCK(); + + free(ifgl, M_TEMP); + + EVENTHANDLER_INVOKE(group_change_event, groupname); + + return (0); +} + +/* + * Remove an interface from all groups + */ +static void +if_delgroups(struct ifnet *ifp) +{ + struct ifg_list *ifgl; + struct ifg_member *ifgm; + char groupname[IFNAMSIZ]; + + IFNET_WLOCK(); + while (!TAILQ_EMPTY(&ifp->if_groups)) { + ifgl = TAILQ_FIRST(&ifp->if_groups); + + strlcpy(groupname, ifgl->ifgl_group->ifg_group, IFNAMSIZ); + + IF_ADDR_LOCK(ifp); + TAILQ_REMOVE(&ifp->if_groups, ifgl, ifgl_next); + IF_ADDR_UNLOCK(ifp); + + TAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next) + if (ifgm->ifgm_ifp == ifp) + break; + + if (ifgm != NULL) { + TAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, + ifgm_next); + free(ifgm, M_TEMP); + } + + if (--ifgl->ifgl_group->ifg_refcnt == 0) { + TAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_next); + EVENTHANDLER_INVOKE(group_detach_event, + ifgl->ifgl_group); + free(ifgl->ifgl_group, M_TEMP); + } + IFNET_WUNLOCK(); + + free(ifgl, M_TEMP); + + EVENTHANDLER_INVOKE(group_change_event, groupname); + + IFNET_WLOCK(); + } + IFNET_WUNLOCK(); +} + +/* + * Stores all groups from an interface in memory pointed + * to by data + */ +static int +if_getgroup(struct ifgroupreq *data, struct ifnet *ifp) +{ + int len, error; + struct ifg_list *ifgl; + struct ifg_req ifgrq, *ifgp; + struct ifgroupreq *ifgr = data; + + if (ifgr->ifgr_len == 0) { + IF_ADDR_LOCK(ifp); + TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) + ifgr->ifgr_len += sizeof(struct ifg_req); + IF_ADDR_UNLOCK(ifp); + return (0); + } + + len = ifgr->ifgr_len; + ifgp = ifgr->ifgr_groups; + /* XXX: wire */ + IF_ADDR_LOCK(ifp); + TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) { + if (len < sizeof(ifgrq)) { + IF_ADDR_UNLOCK(ifp); + return (EINVAL); + } + bzero(&ifgrq, sizeof ifgrq); + strlcpy(ifgrq.ifgrq_group, ifgl->ifgl_group->ifg_group, + sizeof(ifgrq.ifgrq_group)); + if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) { + IF_ADDR_UNLOCK(ifp); + return (error); + } + len -= sizeof(ifgrq); + ifgp++; + } + IF_ADDR_UNLOCK(ifp); + + return (0); +} + +/* + * Stores all members of a group in memory pointed to by data + */ +static int +if_getgroupmembers(struct ifgroupreq *data) +{ + struct ifgroupreq *ifgr = data; + struct ifg_group *ifg; + struct ifg_member *ifgm; + struct ifg_req ifgrq, *ifgp; + int len, error; + + IFNET_RLOCK(); + TAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) + if (!strcmp(ifg->ifg_group, ifgr->ifgr_name)) + break; + if (ifg == NULL) { + IFNET_RUNLOCK(); + return (ENOENT); + } + + if (ifgr->ifgr_len == 0) { + TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) + ifgr->ifgr_len += sizeof(ifgrq); + IFNET_RUNLOCK(); + return (0); + } + + len = ifgr->ifgr_len; + ifgp = ifgr->ifgr_groups; + TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) { + if (len < sizeof(ifgrq)) { + IFNET_RUNLOCK(); + return (EINVAL); + } + bzero(&ifgrq, sizeof ifgrq); + strlcpy(ifgrq.ifgrq_member, ifgm->ifgm_ifp->if_xname, + sizeof(ifgrq.ifgrq_member)); + if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) { + IFNET_RUNLOCK(); + return (error); + } + len -= sizeof(ifgrq); + ifgp++; + } + IFNET_RUNLOCK(); + + return (0); +} + +/* + * Delete Routes for a Network Interface + * + * Called for each routing entry via the rnh->rnh_walktree() call above + * to delete all route entries referencing a detaching network interface. 
+ * + * Arguments: + * rn pointer to node in the routing table + * arg argument passed to rnh->rnh_walktree() - detaching interface + * + * Returns: + * 0 successful + * errno failed - reason indicated + * + */ +static int +if_rtdel(struct radix_node *rn, void *arg) +{ + struct rtentry *rt = (struct rtentry *)rn; + struct ifnet *ifp = arg; + int err; + + if (rt->rt_ifp == ifp) { + + /* + * Protect (sorta) against walktree recursion problems + * with cloned routes + */ + if ((rt->rt_flags & RTF_UP) == 0) + return (0); + + err = rtrequest_fib(RTM_DELETE, rt_key(rt), rt->rt_gateway, + rt_mask(rt), rt->rt_flags|RTF_RNH_LOCKED, + (struct rtentry **) NULL, rt->rt_fibnum); + if (err) { + log(LOG_WARNING, "if_rtdel: error %d\n", err); + } + } + + return (0); +} + +/* + * Wrapper functions for struct ifnet address list locking macros. These are + * used by kernel modules to avoid encoding programming interface or binary + * interface assumptions that may be violated when kernel-internal locking + * approaches change. + */ +void +if_addr_rlock(struct ifnet *ifp) +{ + + IF_ADDR_LOCK(ifp); +} + +void +if_addr_runlock(struct ifnet *ifp) +{ + + IF_ADDR_UNLOCK(ifp); +} + +void +if_maddr_rlock(struct ifnet *ifp) +{ + + IF_ADDR_LOCK(ifp); +} + +void +if_maddr_runlock(struct ifnet *ifp) +{ + + IF_ADDR_UNLOCK(ifp); +} + +/* + * Reference count functions for ifaddrs. + */ +void +ifa_init(struct ifaddr *ifa) +{ + + mtx_init(&ifa->ifa_mtx, "ifaddr", NULL, MTX_DEF); + refcount_init(&ifa->ifa_refcnt, 1); +} + +void +ifa_ref(struct ifaddr *ifa) +{ + + refcount_acquire(&ifa->ifa_refcnt); +} + +void +ifa_free(struct ifaddr *ifa) +{ + + if (refcount_release(&ifa->ifa_refcnt)) { + mtx_destroy(&ifa->ifa_mtx); + free(ifa, M_IFADDR); + } +} + +int +ifa_add_loopback_route(struct ifaddr *ifa, struct sockaddr *ia) +{ + int error = 0; + struct rtentry *rt = NULL; + struct rt_addrinfo info; + static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK}; + + bzero(&info, sizeof(info)); + info.rti_ifp = V_loif; + info.rti_flags = ifa->ifa_flags | RTF_HOST | RTF_STATIC; + info.rti_info[RTAX_DST] = ia; + info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&null_sdl; + error = rtrequest1_fib(RTM_ADD, &info, &rt, 0); + + if (error == 0 && rt != NULL) { + RT_LOCK(rt); + ((struct sockaddr_dl *)rt->rt_gateway)->sdl_type = + ifa->ifa_ifp->if_type; + ((struct sockaddr_dl *)rt->rt_gateway)->sdl_index = + ifa->ifa_ifp->if_index; + RT_REMREF(rt); + RT_UNLOCK(rt); + } else if (error != 0) + log(LOG_INFO, "ifa_add_loopback_route: insertion failed\n"); + + return (error); +} + +int +ifa_del_loopback_route(struct ifaddr *ifa, struct sockaddr *ia) +{ + int error = 0; + struct rt_addrinfo info; + struct sockaddr_dl null_sdl; + + bzero(&null_sdl, sizeof(null_sdl)); + null_sdl.sdl_len = sizeof(null_sdl); + null_sdl.sdl_family = AF_LINK; + null_sdl.sdl_type = ifa->ifa_ifp->if_type; + null_sdl.sdl_index = ifa->ifa_ifp->if_index; + bzero(&info, sizeof(info)); + info.rti_flags = ifa->ifa_flags | RTF_HOST | RTF_STATIC; + info.rti_info[RTAX_DST] = ia; + info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&null_sdl; + error = rtrequest1_fib(RTM_DELETE, &info, NULL, 0); + + if (error != 0) + log(LOG_INFO, "ifa_del_loopback_route: deletion failed\n"); + + return (error); +} + +/* + * XXX: Because sockaddr_dl has deeper structure than the sockaddr + * structs used to represent other address families, it is necessary + * to perform a different comparison. 
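The sa_dl_equal() macro defined just below compares only sdl_len and the link-layer address bytes, deliberately ignoring the interface name embedded in sdl_data. A small self-contained user-space illustration of that property (names and MAC bytes are arbitrary examples):

#include <sys/socket.h>
#include <net/if_dl.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	struct sockaddr_dl a, b;
	static const unsigned char mac[6] = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 };

	memset(&a, 0, sizeof(a));
	memset(&b, 0, sizeof(b));
	a.sdl_len = b.sdl_len = sizeof(a);
	a.sdl_family = b.sdl_family = AF_LINK;
	a.sdl_nlen = b.sdl_nlen = 3;
	memcpy(a.sdl_data, "em0", 3);	/* different names ... */
	memcpy(b.sdl_data, "re0", 3);
	a.sdl_alen = b.sdl_alen = 6;
	memcpy(LLADDR(&a), mac, 6);	/* ... but the same lladdr */
	memcpy(LLADDR(&b), mac, 6);

	/* Same test as sa_dl_equal(): prints "equal: 1". */
	printf("equal: %d\n", a.sdl_len == b.sdl_len &&
	    memcmp(LLADDR(&a), LLADDR(&b), a.sdl_alen) == 0);
	return (0);
}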
+ */
+
+#define sa_equal(a1, a2) \
+ (bcmp((a1), (a2), ((a1))->sa_len) == 0)
+
+#define sa_dl_equal(a1, a2) \
+ ((((struct sockaddr_dl *)(a1))->sdl_len == \
+ ((struct sockaddr_dl *)(a2))->sdl_len) && \
+ (bcmp(LLADDR((struct sockaddr_dl *)(a1)), \
+ LLADDR((struct sockaddr_dl *)(a2)), \
+ ((struct sockaddr_dl *)(a1))->sdl_alen) == 0))
+
+/*
+ * Locate an interface based on a complete address.
+ */
+/*ARGSUSED*/
+static struct ifaddr *
+ifa_ifwithaddr_internal(struct sockaddr *addr, int getref)
+{
+ struct ifnet *ifp;
+ struct ifaddr *ifa;
+
+ IFNET_RLOCK_NOSLEEP();
+ TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
+ IF_ADDR_LOCK(ifp);
+ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+ if (ifa->ifa_addr->sa_family != addr->sa_family)
+ continue;
+ if (sa_equal(addr, ifa->ifa_addr)) {
+ if (getref)
+ ifa_ref(ifa);
+ IF_ADDR_UNLOCK(ifp);
+ goto done;
+ }
+ /* IP6 doesn't have broadcast */
+ if ((ifp->if_flags & IFF_BROADCAST) &&
+ ifa->ifa_broadaddr &&
+ ifa->ifa_broadaddr->sa_len != 0 &&
+ sa_equal(ifa->ifa_broadaddr, addr)) {
+ if (getref)
+ ifa_ref(ifa);
+ IF_ADDR_UNLOCK(ifp);
+ goto done;
+ }
+ }
+ IF_ADDR_UNLOCK(ifp);
+ }
+ ifa = NULL;
+done:
+ IFNET_RUNLOCK_NOSLEEP();
+ return (ifa);
+}
+
+struct ifaddr *
+ifa_ifwithaddr(struct sockaddr *addr)
+{
+
+ return (ifa_ifwithaddr_internal(addr, 1));
+}
+
+int
+ifa_ifwithaddr_check(struct sockaddr *addr)
+{
+
+ return (ifa_ifwithaddr_internal(addr, 0) != NULL);
+}
+
+/*
+ * Locate an interface based on the broadcast address.
+ */
+/* ARGSUSED */
+struct ifaddr *
+ifa_ifwithbroadaddr(struct sockaddr *addr)
+{
+ struct ifnet *ifp;
+ struct ifaddr *ifa;
+
+ IFNET_RLOCK_NOSLEEP();
+ TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
+ IF_ADDR_LOCK(ifp);
+ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+ if (ifa->ifa_addr->sa_family != addr->sa_family)
+ continue;
+ if ((ifp->if_flags & IFF_BROADCAST) &&
+ ifa->ifa_broadaddr &&
+ ifa->ifa_broadaddr->sa_len != 0 &&
+ sa_equal(ifa->ifa_broadaddr, addr)) {
+ ifa_ref(ifa);
+ IF_ADDR_UNLOCK(ifp);
+ goto done;
+ }
+ }
+ IF_ADDR_UNLOCK(ifp);
+ }
+ ifa = NULL;
+done:
+ IFNET_RUNLOCK_NOSLEEP();
+ return (ifa);
+}
+
+/*
+ * Locate the point-to-point interface with a given destination address.
+ */
+/*ARGSUSED*/
+struct ifaddr *
+ifa_ifwithdstaddr(struct sockaddr *addr)
+{
+ struct ifnet *ifp;
+ struct ifaddr *ifa;
+
+ IFNET_RLOCK_NOSLEEP();
+ TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
+ if ((ifp->if_flags & IFF_POINTOPOINT) == 0)
+ continue;
+ IF_ADDR_LOCK(ifp);
+ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+ if (ifa->ifa_addr->sa_family != addr->sa_family)
+ continue;
+ if (ifa->ifa_dstaddr != NULL &&
+ sa_equal(addr, ifa->ifa_dstaddr)) {
+ ifa_ref(ifa);
+ IF_ADDR_UNLOCK(ifp);
+ goto done;
+ }
+ }
+ IF_ADDR_UNLOCK(ifp);
+ }
+ ifa = NULL;
+done:
+ IFNET_RUNLOCK_NOSLEEP();
+ return (ifa);
+}
+
+/*
+ * Find an interface on a specific network. If many match, the most
+ * specific one found is chosen.
+ */
+struct ifaddr *
+ifa_ifwithnet(struct sockaddr *addr, int ignore_ptp)
+{
+ struct ifnet *ifp;
+ struct ifaddr *ifa;
+ struct ifaddr *ifa_maybe = NULL;
+ u_int af = addr->sa_family;
+ char *addr_data = addr->sa_data, *cplim;
+
+ /*
+ * AF_LINK addresses can be looked up directly by their index number,
+ * so do that if we can.
+ */
+ if (af == AF_LINK) {
+ struct sockaddr_dl *sdl = (struct sockaddr_dl *)addr;
+ if (sdl->sdl_index && sdl->sdl_index <= V_if_index)
+ return (ifaddr_byindex(sdl->sdl_index));
+ }
+
+ /*
+ * Scan through each interface, looking for ones that have addresses
+ * in this address family.
Maintain a reference on ifa_maybe once
+ * we find one, as we release the IF_ADDR_LOCK() that kept it stable
+ * when we move onto the next interface.
+ */
+ IFNET_RLOCK_NOSLEEP();
+ TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
+ IF_ADDR_LOCK(ifp);
+ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+ char *cp, *cp2, *cp3;
+
+ if (ifa->ifa_addr->sa_family != af)
+next: continue;
+ if (af == AF_INET &&
+ ifp->if_flags & IFF_POINTOPOINT && !ignore_ptp) {
+ /*
+ * This is a bit broken as it doesn't
+ * take into account that the remote end may
+ * be a single node in the network we are
+ * looking for.
+ * The trouble is that we don't know the
+ * netmask for the remote end.
+ */
+ if (ifa->ifa_dstaddr != NULL &&
+ sa_equal(addr, ifa->ifa_dstaddr)) {
+ ifa_ref(ifa);
+ IF_ADDR_UNLOCK(ifp);
+ goto done;
+ }
+ } else {
+ /*
+ * if we have a special address handler,
+ * then use it instead of the generic one.
+ */
+ if (ifa->ifa_claim_addr) {
+ if ((*ifa->ifa_claim_addr)(ifa, addr)) {
+ ifa_ref(ifa);
+ IF_ADDR_UNLOCK(ifp);
+ goto done;
+ }
+ continue;
+ }
+
+ /*
+ * Scan all the bits in the ifa's address.
+ * If a bit disagrees with what we are
+ * looking for, mask it with the netmask
+ * to see if it really matters.
+ * (A byte at a time)
+ */
+ if (ifa->ifa_netmask == 0)
+ continue;
+ cp = addr_data;
+ cp2 = ifa->ifa_addr->sa_data;
+ cp3 = ifa->ifa_netmask->sa_data;
+ cplim = ifa->ifa_netmask->sa_len
+ + (char *)ifa->ifa_netmask;
+ while (cp3 < cplim)
+ if ((*cp++ ^ *cp2++) & *cp3++)
+ goto next; /* next address! */
+ /*
+ * If the netmask of what we just found
+ * is more specific than what we had before
+ * (if we had one) then remember the new one
+ * before continuing to search
+ * for an even better one.
+ */
+ if (ifa_maybe == NULL ||
+ rn_refines((caddr_t)ifa->ifa_netmask,
+ (caddr_t)ifa_maybe->ifa_netmask)) {
+ if (ifa_maybe != NULL)
+ ifa_free(ifa_maybe);
+ ifa_maybe = ifa;
+ ifa_ref(ifa_maybe);
+ }
+ }
+ }
+ IF_ADDR_UNLOCK(ifp);
+ }
+ ifa = ifa_maybe;
+ ifa_maybe = NULL;
+done:
+ IFNET_RUNLOCK_NOSLEEP();
+ if (ifa_maybe != NULL)
+ ifa_free(ifa_maybe);
+ return (ifa);
+}
+
+/*
+ * Find an interface address specific to an interface best matching
+ * a given address.
+ */
+struct ifaddr *
+ifaof_ifpforaddr(struct sockaddr *addr, struct ifnet *ifp)
+{
+ struct ifaddr *ifa;
+ char *cp, *cp2, *cp3;
+ char *cplim;
+ struct ifaddr *ifa_maybe = NULL;
+ u_int af = addr->sa_family;
+
+ if (af >= AF_MAX)
+ return (NULL);
+ IF_ADDR_LOCK(ifp);
+ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+ if (ifa->ifa_addr->sa_family != af)
+ continue;
+ if (ifa_maybe == NULL)
+ ifa_maybe = ifa;
+ if (ifa->ifa_netmask == 0) {
+ if (sa_equal(addr, ifa->ifa_addr) ||
+ (ifa->ifa_dstaddr &&
+ sa_equal(addr, ifa->ifa_dstaddr)))
+ goto done;
+ continue;
+ }
+ if (ifp->if_flags & IFF_POINTOPOINT) {
+ if (sa_equal(addr, ifa->ifa_dstaddr))
+ goto done;
+ } else {
+ cp = addr->sa_data;
+ cp2 = ifa->ifa_addr->sa_data;
+ cp3 = ifa->ifa_netmask->sa_data;
+ cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask;
+ for (; cp3 < cplim; cp3++)
+ if ((*cp++ ^ *cp2++) & *cp3)
+ break;
+ if (cp3 == cplim)
+ goto done;
+ }
+ }
+ ifa = ifa_maybe;
+done:
+ if (ifa != NULL)
+ ifa_ref(ifa);
+ IF_ADDR_UNLOCK(ifp);
+ return (ifa);
+}
+
+#include <net/if_llatbl.h>
+
+/*
+ * Default action when installing a route with a Link Level gateway.
+ * Lookup an appropriate real ifa to point to.
+ * This should be moved to /sys/net/link.c eventually.
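The byte-wise '(a ^ b) & mask' loop above is easy to lift out and test on its own. A user-space demonstration of the same matching idea for IPv4 (the addresses and mask are arbitrary examples):

#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>

/* Nonzero if addr lies on the network ifaddr/mask, using the same
 * byte-at-a-time comparison as ifa_ifwithnet(). */
static int
masked_match(const void *addr, const void *ifaddr, const void *mask, size_t len)
{
	const unsigned char *cp = addr, *cp2 = ifaddr, *cp3 = mask;

	while (len--)
		if ((*cp++ ^ *cp2++) & *cp3++)
			return (0);
	return (1);
}

int
main(void)
{
	struct in_addr a, b, m;

	inet_pton(AF_INET, "192.0.2.77", &a);
	inet_pton(AF_INET, "192.0.2.1", &b);
	inet_pton(AF_INET, "255.255.255.0", &m);
	printf("match: %d\n", masked_match(&a, &b, &m, sizeof(a)));
	return (0);
}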
+ */
+static void
+link_rtrequest(int cmd, struct rtentry *rt, struct rt_addrinfo *info)
+{
+ struct ifaddr *ifa, *oifa;
+ struct sockaddr *dst;
+ struct ifnet *ifp;
+
+ RT_LOCK_ASSERT(rt);
+
+ if (cmd != RTM_ADD || ((ifa = rt->rt_ifa) == 0) ||
+ ((ifp = ifa->ifa_ifp) == 0) || ((dst = rt_key(rt)) == 0))
+ return;
+ ifa = ifaof_ifpforaddr(dst, ifp);
+ if (ifa) {
+ oifa = rt->rt_ifa;
+ rt->rt_ifa = ifa;
+ ifa_free(oifa);
+ if (ifa->ifa_rtrequest && ifa->ifa_rtrequest != link_rtrequest)
+ ifa->ifa_rtrequest(cmd, rt, info);
+ }
+}
+
+/*
+ * Mark an interface down and notify protocols of
+ * the transition.
+ * NOTE: must be called at splnet or equivalent.
+ */
+static void
+if_unroute(struct ifnet *ifp, int flag, int fam)
+{
+ struct ifaddr *ifa;
+
+ KASSERT(flag == IFF_UP, ("if_unroute: flag != IFF_UP"));
+
+ ifp->if_flags &= ~flag;
+ getmicrotime(&ifp->if_lastchange);
+ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
+ if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family))
+ pfctlinput(PRC_IFDOWN, ifa->ifa_addr);
+ ifp->if_qflush(ifp);
+
+ if (ifp->if_carp)
+ (*carp_linkstate_p)(ifp);
+ rt_ifmsg(ifp);
+}
+
+/*
+ * Mark an interface up and notify protocols of
+ * the transition.
+ * NOTE: must be called at splnet or equivalent.
+ */
+static void
+if_route(struct ifnet *ifp, int flag, int fam)
+{
+ struct ifaddr *ifa;
+
+ KASSERT(flag == IFF_UP, ("if_route: flag != IFF_UP"));
+
+ ifp->if_flags |= flag;
+ getmicrotime(&ifp->if_lastchange);
+ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
+ if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family))
+ pfctlinput(PRC_IFUP, ifa->ifa_addr);
+ if (ifp->if_carp)
+ (*carp_linkstate_p)(ifp);
+ rt_ifmsg(ifp);
+#ifdef INET6
+ in6_if_up(ifp);
+#endif
+}
+
+void (*vlan_link_state_p)(struct ifnet *, int); /* XXX: private from if_vlan */
+void (*vlan_trunk_cap_p)(struct ifnet *); /* XXX: private from if_vlan */
+
+/*
+ * Handle a change in the interface link state. To avoid LORs
+ * between driver lock and upper layer locks, as well as possible
+ * recursions, we post the event to a taskqueue, and all the work
+ * is done in the static do_link_state_change().
+ */
+void
+if_link_state_change(struct ifnet *ifp, int link_state)
+{
+ /* Return if state hasn't changed. */
+ if (ifp->if_link_state == link_state)
+ return;
+
+ ifp->if_link_state = link_state;
+
+ taskqueue_enqueue(taskqueue_swi, &ifp->if_linktask);
+}
+
+static void
+do_link_state_change(void *arg, int pending)
+{
+ struct ifnet *ifp = (struct ifnet *)arg;
+ int link_state = ifp->if_link_state;
+ CURVNET_SET(ifp->if_vnet);
+
+ /* Notify that the link state has changed. */
+ rt_ifmsg(ifp);
+ if (ifp->if_vlantrunk != NULL)
+ (*vlan_link_state_p)(ifp, 0);
+
+ if ((ifp->if_type == IFT_ETHER || ifp->if_type == IFT_L2VLAN) &&
+ IFP2AC(ifp)->ac_netgraph != NULL)
+ (*ng_ether_link_state_p)(ifp, link_state);
+ if (ifp->if_carp)
+ (*carp_linkstate_p)(ifp);
+ if (ifp->if_bridge) {
+ KASSERT(bstp_linkstate_p != NULL,("if_bridge bstp not loaded!"));
+ (*bstp_linkstate_p)(ifp, link_state);
+ }
+ if (ifp->if_lagg) {
+ KASSERT(lagg_linkstate_p != NULL,("if_lagg not loaded!"));
+ (*lagg_linkstate_p)(ifp, link_state);
+ }
+
+ if (IS_DEFAULT_VNET(curvnet))
+ devctl_notify("IFNET", ifp->if_xname,
+ (link_state == LINK_STATE_UP) ? "LINK_UP" : "LINK_DOWN",
+ NULL);
+ if (pending > 1)
+ if_printf(ifp, "%d link states coalesced\n", pending);
+ if (log_link_state_change)
+ log(LOG_NOTICE, "%s: link state changed to %s\n", ifp->if_xname,
+ (link_state == LINK_STATE_UP) ?
"UP" : "DOWN" ); + CURVNET_RESTORE(); +} + +/* + * Mark an interface down and notify protocols of + * the transition. + * NOTE: must be called at splnet or eqivalent. + */ +void +if_down(struct ifnet *ifp) +{ + + if_unroute(ifp, IFF_UP, AF_UNSPEC); +} + +/* + * Mark an interface up and notify protocols of + * the transition. + * NOTE: must be called at splnet or eqivalent. + */ +void +if_up(struct ifnet *ifp) +{ + + if_route(ifp, IFF_UP, AF_UNSPEC); +} + +/* + * Flush an interface queue. + */ +void +if_qflush(struct ifnet *ifp) +{ + struct mbuf *m, *n; + struct ifaltq *ifq; + + ifq = &ifp->if_snd; + IFQ_LOCK(ifq); +#ifdef ALTQ + if (ALTQ_IS_ENABLED(ifq)) + ALTQ_PURGE(ifq); +#endif + n = ifq->ifq_head; + while ((m = n) != 0) { + n = m->m_act; + m_freem(m); + } + ifq->ifq_head = 0; + ifq->ifq_tail = 0; + ifq->ifq_len = 0; + IFQ_UNLOCK(ifq); +} + +/* + * Handle interface watchdog timer routines. Called + * from softclock, we decrement timers (if set) and + * call the appropriate interface routine on expiration. + * + * XXXRW: Note that because timeouts run with Giant, if_watchdog() is called + * holding Giant. + */ +static void +if_slowtimo(void *arg) +{ + VNET_ITERATOR_DECL(vnet_iter); + struct ifnet *ifp; + int s = splimp(); + + VNET_LIST_RLOCK_NOSLEEP(); + IFNET_RLOCK_NOSLEEP(); + VNET_FOREACH(vnet_iter) { + CURVNET_SET(vnet_iter); + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { + if (ifp->if_timer == 0 || --ifp->if_timer) + continue; + if (ifp->if_watchdog) + (*ifp->if_watchdog)(ifp); + } + CURVNET_RESTORE(); + } + IFNET_RUNLOCK_NOSLEEP(); + VNET_LIST_RUNLOCK_NOSLEEP(); + splx(s); + timeout(if_slowtimo, (void *)0, hz / IFNET_SLOWHZ); +} + +/* + * Map interface name to interface structure pointer, with or without + * returning a reference. + */ +struct ifnet * +ifunit_ref(const char *name) +{ + struct ifnet *ifp; + + IFNET_RLOCK_NOSLEEP(); + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { + if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0 && + !(ifp->if_flags & IFF_DYING)) + break; + } + if (ifp != NULL) + if_ref(ifp); + IFNET_RUNLOCK_NOSLEEP(); + return (ifp); +} + +struct ifnet * +ifunit(const char *name) +{ + struct ifnet *ifp; + + IFNET_RLOCK_NOSLEEP(); + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { + if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0) + break; + } + IFNET_RUNLOCK_NOSLEEP(); + return (ifp); +} + +/* + * Hardware specific interface ioctls. 
+ */ +static int +ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) +{ + struct ifreq *ifr; + struct ifstat *ifs; + int error = 0; + int new_flags, temp_flags; + size_t namelen, onamelen; + size_t descrlen; + char *descrbuf, *odescrbuf; + char new_name[IFNAMSIZ]; + struct ifaddr *ifa; + struct sockaddr_dl *sdl; + + ifr = (struct ifreq *)data; + switch (cmd) { + case SIOCGIFINDEX: + ifr->ifr_index = ifp->if_index; + break; + + case SIOCGIFFLAGS: + temp_flags = ifp->if_flags | ifp->if_drv_flags; + ifr->ifr_flags = temp_flags & 0xffff; + ifr->ifr_flagshigh = temp_flags >> 16; + break; + + case SIOCGIFCAP: + ifr->ifr_reqcap = ifp->if_capabilities; + ifr->ifr_curcap = ifp->if_capenable; + break; + +#ifdef MAC + case SIOCGIFMAC: + error = mac_ifnet_ioctl_get(td->td_ucred, ifr, ifp); + break; +#endif + + case SIOCGIFMETRIC: + ifr->ifr_metric = ifp->if_metric; + break; + + case SIOCGIFMTU: + ifr->ifr_mtu = ifp->if_mtu; + break; + + case SIOCGIFPHYS: + ifr->ifr_phys = ifp->if_physical; + break; + + case SIOCGIFDESCR: + error = 0; + sx_slock(&ifdescr_sx); + if (ifp->if_description == NULL) + error = ENOMSG; + else { + /* space for terminating nul */ + descrlen = strlen(ifp->if_description) + 1; + if (ifr->ifr_buffer.length < descrlen) + ifr->ifr_buffer.buffer = NULL; + else + error = copyout(ifp->if_description, + ifr->ifr_buffer.buffer, descrlen); + ifr->ifr_buffer.length = descrlen; + } + sx_sunlock(&ifdescr_sx); + break; + + case SIOCSIFDESCR: + error = priv_check(td, PRIV_NET_SETIFDESCR); + if (error) + return (error); + + /* + * Copy only (length-1) bytes to make sure that + * if_description is always nul terminated. The + * length parameter is supposed to count the + * terminating nul in. + */ + if (ifr->ifr_buffer.length > ifdescr_maxlen) + return (ENAMETOOLONG); + else if (ifr->ifr_buffer.length == 0) + descrbuf = NULL; + else { + descrbuf = malloc(ifr->ifr_buffer.length, M_IFDESCR, + M_WAITOK | M_ZERO); + error = copyin(ifr->ifr_buffer.buffer, descrbuf, + ifr->ifr_buffer.length - 1); + if (error) { + free(descrbuf, M_IFDESCR); + break; + } + } + + sx_xlock(&ifdescr_sx); + odescrbuf = ifp->if_description; + ifp->if_description = descrbuf; + sx_xunlock(&ifdescr_sx); + + getmicrotime(&ifp->if_lastchange); + free(odescrbuf, M_IFDESCR); + break; + + case SIOCSIFFLAGS: + error = priv_check(td, PRIV_NET_SETIFFLAGS); + if (error) + return (error); + /* + * Currently, no driver owned flags pass the IFF_CANTCHANGE + * check, so we don't need special handling here yet. + */ + new_flags = (ifr->ifr_flags & 0xffff) | + (ifr->ifr_flagshigh << 16); + if (ifp->if_flags & IFF_SMART) { + /* Smart drivers twiddle their own routes */ + } else if (ifp->if_flags & IFF_UP && + (new_flags & IFF_UP) == 0) { + int s = splimp(); + if_down(ifp); + splx(s); + } else if (new_flags & IFF_UP && + (ifp->if_flags & IFF_UP) == 0) { + int s = splimp(); + if_up(ifp); + splx(s); + } + /* See if permanently promiscuous mode bit is about to flip */ + if ((ifp->if_flags ^ new_flags) & IFF_PPROMISC) { + if (new_flags & IFF_PPROMISC) + ifp->if_flags |= IFF_PROMISC; + else if (ifp->if_pcount == 0) + ifp->if_flags &= ~IFF_PROMISC; + log(LOG_INFO, "%s: permanently promiscuous mode %s\n", + ifp->if_xname, + (new_flags & IFF_PPROMISC) ? 
"enabled" : "disabled"); + } + ifp->if_flags = (ifp->if_flags & IFF_CANTCHANGE) | + (new_flags &~ IFF_CANTCHANGE); + if (ifp->if_ioctl) { + (void) (*ifp->if_ioctl)(ifp, cmd, data); + } + getmicrotime(&ifp->if_lastchange); + break; + + case SIOCSIFCAP: + error = priv_check(td, PRIV_NET_SETIFCAP); + if (error) + return (error); + if (ifp->if_ioctl == NULL) + return (EOPNOTSUPP); + if (ifr->ifr_reqcap & ~ifp->if_capabilities) + return (EINVAL); + error = (*ifp->if_ioctl)(ifp, cmd, data); + if (error == 0) + getmicrotime(&ifp->if_lastchange); + break; + +#ifdef MAC + case SIOCSIFMAC: + error = mac_ifnet_ioctl_set(td->td_ucred, ifr, ifp); + break; +#endif + + case SIOCSIFNAME: + error = priv_check(td, PRIV_NET_SETIFNAME); + if (error) + return (error); + error = copyinstr(ifr->ifr_data, new_name, IFNAMSIZ, NULL); + if (error != 0) + return (error); + if (new_name[0] == '\0') + return (EINVAL); + if (ifunit(new_name) != NULL) + return (EEXIST); + + /* + * XXX: Locking. Nothing else seems to lock if_flags, + * and there are numerous other races with the + * ifunit() checks not being atomic with namespace + * changes (renames, vmoves, if_attach, etc). + */ + ifp->if_flags |= IFF_RENAMING; + + /* Announce the departure of the interface. */ + rt_ifannouncemsg(ifp, IFAN_DEPARTURE); + EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); + + log(LOG_INFO, "%s: changing name to '%s'\n", + ifp->if_xname, new_name); + + strlcpy(ifp->if_xname, new_name, sizeof(ifp->if_xname)); + ifa = ifp->if_addr; + IFA_LOCK(ifa); + sdl = (struct sockaddr_dl *)ifa->ifa_addr; + namelen = strlen(new_name); + onamelen = sdl->sdl_nlen; + /* + * Move the address if needed. This is safe because we + * allocate space for a name of length IFNAMSIZ when we + * create this in if_attach(). + */ + if (namelen != onamelen) { + bcopy(sdl->sdl_data + onamelen, + sdl->sdl_data + namelen, sdl->sdl_alen); + } + bcopy(new_name, sdl->sdl_data, namelen); + sdl->sdl_nlen = namelen; + sdl = (struct sockaddr_dl *)ifa->ifa_netmask; + bzero(sdl->sdl_data, onamelen); + while (namelen != 0) + sdl->sdl_data[--namelen] = 0xff; + IFA_UNLOCK(ifa); + + EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp); + /* Announce the return of the interface. */ + rt_ifannouncemsg(ifp, IFAN_ARRIVAL); + + ifp->if_flags &= ~IFF_RENAMING; + break; + +#ifdef VIMAGE + case SIOCSIFVNET: + error = priv_check(td, PRIV_NET_SETIFVNET); + if (error) + return (error); + error = if_vmove_loan(td, ifp, ifr->ifr_name, ifr->ifr_jid); + break; +#endif + + case SIOCSIFMETRIC: + error = priv_check(td, PRIV_NET_SETIFMETRIC); + if (error) + return (error); + ifp->if_metric = ifr->ifr_metric; + getmicrotime(&ifp->if_lastchange); + break; + + case SIOCSIFPHYS: + error = priv_check(td, PRIV_NET_SETIFPHYS); + if (error) + return (error); + if (ifp->if_ioctl == NULL) + return (EOPNOTSUPP); + error = (*ifp->if_ioctl)(ifp, cmd, data); + if (error == 0) + getmicrotime(&ifp->if_lastchange); + break; + + case SIOCSIFMTU: + { + u_long oldmtu = ifp->if_mtu; + + error = priv_check(td, PRIV_NET_SETIFMTU); + if (error) + return (error); + if (ifr->ifr_mtu < IF_MINMTU || ifr->ifr_mtu > IF_MAXMTU) + return (EINVAL); + if (ifp->if_ioctl == NULL) + return (EOPNOTSUPP); + error = (*ifp->if_ioctl)(ifp, cmd, data); + if (error == 0) { + getmicrotime(&ifp->if_lastchange); + rt_ifmsg(ifp); + } + /* + * If the link MTU changed, do network layer specific procedure. 
+ */ + if (ifp->if_mtu != oldmtu) { +#ifdef INET6 + nd6_setmtu(ifp); +#endif + } + break; + } + + case SIOCADDMULTI: + case SIOCDELMULTI: + if (cmd == SIOCADDMULTI) + error = priv_check(td, PRIV_NET_ADDMULTI); + else + error = priv_check(td, PRIV_NET_DELMULTI); + if (error) + return (error); + + /* Don't allow group membership on non-multicast interfaces. */ + if ((ifp->if_flags & IFF_MULTICAST) == 0) + return (EOPNOTSUPP); + + /* Don't let users screw up protocols' entries. */ + if (ifr->ifr_addr.sa_family != AF_LINK) + return (EINVAL); + + if (cmd == SIOCADDMULTI) { + struct ifmultiaddr *ifma; + + /* + * Userland is only permitted to join groups once + * via the if_addmulti() KPI, because it cannot hold + * struct ifmultiaddr * between calls. It may also + * lose a race while we check if the membership + * already exists. + */ + IF_ADDR_LOCK(ifp); + ifma = if_findmulti(ifp, &ifr->ifr_addr); + IF_ADDR_UNLOCK(ifp); + if (ifma != NULL) + error = EADDRINUSE; + else + error = if_addmulti(ifp, &ifr->ifr_addr, &ifma); + } else { + error = if_delmulti(ifp, &ifr->ifr_addr); + } + if (error == 0) + getmicrotime(&ifp->if_lastchange); + break; + + case SIOCSIFPHYADDR: + case SIOCDIFPHYADDR: +#ifdef INET6 + case SIOCSIFPHYADDR_IN6: +#endif + case SIOCSLIFPHYADDR: + case SIOCSIFMEDIA: + case SIOCSIFGENERIC: + error = priv_check(td, PRIV_NET_HWIOCTL); + if (error) + return (error); + if (ifp->if_ioctl == NULL) + return (EOPNOTSUPP); + error = (*ifp->if_ioctl)(ifp, cmd, data); + if (error == 0) + getmicrotime(&ifp->if_lastchange); + break; + + case SIOCGIFSTATUS: + ifs = (struct ifstat *)data; + ifs->ascii[0] = '\0'; + + case SIOCGIFPSRCADDR: + case SIOCGIFPDSTADDR: + case SIOCGLIFPHYADDR: + case SIOCGIFMEDIA: + case SIOCGIFGENERIC: + if (ifp->if_ioctl == NULL) + return (EOPNOTSUPP); + error = (*ifp->if_ioctl)(ifp, cmd, data); + break; + + case SIOCSIFLLADDR: + error = priv_check(td, PRIV_NET_SETLLADDR); + if (error) + return (error); + error = if_setlladdr(ifp, + ifr->ifr_addr.sa_data, ifr->ifr_addr.sa_len); + EVENTHANDLER_INVOKE(iflladdr_event, ifp); + break; + + case SIOCAIFGROUP: + { + struct ifgroupreq *ifgr = (struct ifgroupreq *)ifr; + + error = priv_check(td, PRIV_NET_ADDIFGROUP); + if (error) + return (error); + if ((error = if_addgroup(ifp, ifgr->ifgr_group))) + return (error); + break; + } + + case SIOCGIFGROUP: + if ((error = if_getgroup((struct ifgroupreq *)ifr, ifp))) + return (error); + break; + + case SIOCDIFGROUP: + { + struct ifgroupreq *ifgr = (struct ifgroupreq *)ifr; + + error = priv_check(td, PRIV_NET_DELIFGROUP); + if (error) + return (error); + if ((error = if_delgroup(ifp, ifgr->ifgr_group))) + return (error); + break; + } + + default: + error = ENOIOCTL; + break; + } + return (error); +} + +#ifdef COMPAT_FREEBSD32 +struct ifconf32 { + int32_t ifc_len; + union { + uint32_t ifcu_buf; + uint32_t ifcu_req; + } ifc_ifcu; +}; +#define SIOCGIFCONF32 _IOWR('i', 36, struct ifconf32) +#endif + +/* + * Interface ioctls. 
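As the SIOCADDMULTI case above requires, the address handed in from userland must be an AF_LINK sockaddr. A hedged sketch of building a sockaddr_dl in place inside ifr_addr (the interface name and group MAC are placeholders; the fixed sockaddr_dl header plus a 6-byte address just fits in ifr_addr):

#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <net/if_dl.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct ifreq ifr;
	struct sockaddr_dl *sdl = (struct sockaddr_dl *)&ifr.ifr_addr;
	static const unsigned char mac[6] =
	    { 0x01, 0x00, 0x5e, 0x00, 0x00, 0xfb };	/* example group */
	int s = socket(AF_INET, SOCK_DGRAM, 0);

	if (s < 0)
		return (1);
	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, "em0", sizeof(ifr.ifr_name));	/* placeholder */
	sdl->sdl_family = AF_LINK;	/* anything else gets EINVAL */
	sdl->sdl_alen = sizeof(mac);
	memcpy(LLADDR(sdl), mac, sizeof(mac));
	sdl->sdl_len = offsetof(struct sockaddr_dl, sdl_data) + sizeof(mac);
	if (ioctl(s, SIOCADDMULTI, &ifr) == -1)
		perror("SIOCADDMULTI");
	close(s);
	return (0);
}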
+ */ +int +ifioctl(struct socket *so, u_long cmd, caddr_t data, struct thread *td) +{ + struct ifnet *ifp; + struct ifreq *ifr; + int error; + int oif_flags; + + switch (cmd) { + case SIOCGIFCONF: + case OSIOCGIFCONF: + return (ifconf(cmd, data)); + +#ifdef COMPAT_FREEBSD32 + case SIOCGIFCONF32: + { + struct ifconf32 *ifc32; + struct ifconf ifc; + + ifc32 = (struct ifconf32 *)data; + ifc.ifc_len = ifc32->ifc_len; + ifc.ifc_buf = PTRIN(ifc32->ifc_buf); + + return (ifconf(SIOCGIFCONF, (void *)&ifc)); + } +#endif + } + ifr = (struct ifreq *)data; + + switch (cmd) { +#ifdef VIMAGE + case SIOCSIFRVNET: + error = priv_check(td, PRIV_NET_SETIFVNET); + if (error) + return (error); + return (if_vmove_reclaim(td, ifr->ifr_name, ifr->ifr_jid)); +#endif + case SIOCIFCREATE: + case SIOCIFCREATE2: + error = priv_check(td, PRIV_NET_IFCREATE); + if (error) + return (error); + return (if_clone_create(ifr->ifr_name, sizeof(ifr->ifr_name), + cmd == SIOCIFCREATE2 ? ifr->ifr_data : NULL)); + case SIOCIFDESTROY: + error = priv_check(td, PRIV_NET_IFDESTROY); + if (error) + return (error); + return if_clone_destroy(ifr->ifr_name); + + case SIOCIFGCLONERS: + return (if_clone_list((struct if_clonereq *)data)); + case SIOCGIFGMEMB: + return (if_getgroupmembers((struct ifgroupreq *)data)); + } + + ifp = ifunit_ref(ifr->ifr_name); + if (ifp == NULL) + return (ENXIO); + + error = ifhwioctl(cmd, ifp, data, td); + if (error != ENOIOCTL) { + if_rele(ifp); + return (error); + } + + oif_flags = ifp->if_flags; + if (so->so_proto == NULL) { + if_rele(ifp); + return (EOPNOTSUPP); + } +#ifndef COMPAT_43 + error = ((*so->so_proto->pr_usrreqs->pru_control)(so, cmd, + data, + ifp, td)); + if (error == EOPNOTSUPP && ifp != NULL && ifp->if_ioctl != NULL) + error = (*ifp->if_ioctl)(ifp, cmd, data); +#else + { + u_long ocmd = cmd; + + switch (cmd) { + + case SIOCSIFDSTADDR: + case SIOCSIFADDR: + case SIOCSIFBRDADDR: + case SIOCSIFNETMASK: +#if BYTE_ORDER != BIG_ENDIAN + if (ifr->ifr_addr.sa_family == 0 && + ifr->ifr_addr.sa_len < 16) { + ifr->ifr_addr.sa_family = ifr->ifr_addr.sa_len; + ifr->ifr_addr.sa_len = 16; + } +#else + if (ifr->ifr_addr.sa_len == 0) + ifr->ifr_addr.sa_len = 16; +#endif + break; + + case OSIOCGIFADDR: + cmd = SIOCGIFADDR; + break; + + case OSIOCGIFDSTADDR: + cmd = SIOCGIFDSTADDR; + break; + + case OSIOCGIFBRDADDR: + cmd = SIOCGIFBRDADDR; + break; + + case OSIOCGIFNETMASK: + cmd = SIOCGIFNETMASK; + } + error = ((*so->so_proto->pr_usrreqs->pru_control)(so, + cmd, + data, + ifp, td)); + if (error == EOPNOTSUPP && ifp != NULL && + ifp->if_ioctl != NULL) + error = (*ifp->if_ioctl)(ifp, cmd, data); + switch (ocmd) { + + case OSIOCGIFADDR: + case OSIOCGIFDSTADDR: + case OSIOCGIFBRDADDR: + case OSIOCGIFNETMASK: + *(u_short *)&ifr->ifr_addr = ifr->ifr_addr.sa_family; + + } + } +#endif /* COMPAT_43 */ + + if ((oif_flags ^ ifp->if_flags) & IFF_UP) { +#ifdef INET6 + if (ifp->if_flags & IFF_UP) { + int s = splimp(); + in6_if_up(ifp); + splx(s); + } +#endif + } + if_rele(ifp); + return (error); +} + +/* + * The code common to handling reference counted flags, + * e.g., in ifpromisc() and if_allmulti(). + * The "pflag" argument can specify a permanent mode flag to check, + * such as IFF_PPROMISC for promiscuous mode; should be 0 if none. + * + * Only to be used on stack-owned flags, not driver-owned flags. 
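An illustrative kernel-style fragment of how a consumer such as BPF drives this machinery through ifpromisc(); the matched on/off pair is the important part, since only the first and last calls actually toggle the flag:

	/* on open of a capture descriptor: */
	error = ifpromisc(ifp, 1);	/* first request sets IFF_PROMISC */
	/* ... */
	/* on close: */
	error = ifpromisc(ifp, 0);	/* last request clears it again */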
+ */
+static int
+if_setflag(struct ifnet *ifp, int flag, int pflag, int *refcount, int onswitch)
+{
+ struct ifreq ifr;
+ int error;
+ int oldflags, oldcount;
+
+ /* Sanity checks to catch programming errors */
+ KASSERT((flag & (IFF_DRV_OACTIVE|IFF_DRV_RUNNING)) == 0,
+ ("%s: setting driver-owned flag %d", __func__, flag));
+
+ if (onswitch)
+ KASSERT(*refcount >= 0,
+ ("%s: increment negative refcount %d for flag %d",
+ __func__, *refcount, flag));
+ else
+ KASSERT(*refcount > 0,
+ ("%s: decrement non-positive refcount %d for flag %d",
+ __func__, *refcount, flag));
+
+ /* In case this mode is permanent, just touch refcount */
+ if (ifp->if_flags & pflag) {
+ *refcount += onswitch ? 1 : -1;
+ return (0);
+ }
+
+ /* Save ifnet parameters in case if_ioctl() fails */
+ oldcount = *refcount;
+ oldflags = ifp->if_flags;
+
+ /*
+ * See if we are not the only one, so that touching the refcount
+ * is enough. Actually toggle the interface flag if we are the
+ * first or last.
+ */
+ if (onswitch) {
+ if ((*refcount)++)
+ return (0);
+ ifp->if_flags |= flag;
+ } else {
+ if (--(*refcount))
+ return (0);
+ ifp->if_flags &= ~flag;
+ }
+
+ /* Call down the driver since we've changed interface flags */
+ if (ifp->if_ioctl == NULL) {
+ error = EOPNOTSUPP;
+ goto recover;
+ }
+ ifr.ifr_flags = ifp->if_flags & 0xffff;
+ ifr.ifr_flagshigh = ifp->if_flags >> 16;
+ error = (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr);
+ if (error)
+ goto recover;
+ /* Notify userland that interface flags have changed */
+ rt_ifmsg(ifp);
+ return (0);
+
+recover:
+ /* Recover after driver error */
+ *refcount = oldcount;
+ ifp->if_flags = oldflags;
+ return (error);
+}
+
+/*
+ * Set/clear promiscuous mode on interface ifp based on the truth value
+ * of pswitch. The calls are reference counted so that only the first
+ * "on" request actually has an effect, as does the final "off" request.
+ * Results are undefined if the "off" and "on" requests are not matched.
+ */
+int
+ifpromisc(struct ifnet *ifp, int pswitch)
+{
+ int error;
+ int oldflags = ifp->if_flags;
+
+ error = if_setflag(ifp, IFF_PROMISC, IFF_PPROMISC,
+ &ifp->if_pcount, pswitch);
+ /* If promiscuous mode status has changed, log a message */
+ if (error == 0 && ((ifp->if_flags ^ oldflags) & IFF_PROMISC))
+ log(LOG_INFO, "%s: promiscuous mode %s\n",
+ ifp->if_xname,
+ (ifp->if_flags & IFF_PROMISC) ? "enabled" : "disabled");
+ return (error);
+}
+
+/*
+ * Return the interface configuration of the system. The list may be
+ * used in later ioctls (above) to get other information.
+ */
+/*ARGSUSED*/
+static int
+ifconf(u_long cmd, caddr_t data)
+{
+ struct ifconf *ifc = (struct ifconf *)data;
+ struct ifnet *ifp;
+ struct ifaddr *ifa;
+ struct ifreq ifr;
+ struct sbuf *sb;
+ int error, full = 0, valid_len, max_len;
+
+ /* Limit initial buffer size to MAXPHYS to avoid DoS from userspace. */
+ max_len = MAXPHYS - 1;
+
+ /* Prevent hostile input from being able to crash the system */
+ if (ifc->ifc_len <= 0)
+ return (EINVAL);
+
+again:
+ if (ifc->ifc_len <= max_len) {
+ max_len = ifc->ifc_len;
+ full = 1;
+ }
+ sb = sbuf_new(NULL, NULL, max_len + 1, SBUF_FIXEDLEN);
+ max_len = 0;
+ valid_len = 0;
+
+ IFNET_RLOCK();
+ TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
+ int addrs;
+
+ /*
+ * Zero the ifr_name buffer to make sure we don't
+ * disclose the contents of the stack.
+ */ + memset(ifr.ifr_name, 0, sizeof(ifr.ifr_name)); + + if (strlcpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name)) + >= sizeof(ifr.ifr_name)) { + sbuf_delete(sb); + IFNET_RUNLOCK(); + return (ENAMETOOLONG); + } + + addrs = 0; + IF_ADDR_LOCK(ifp); + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + struct sockaddr *sa = ifa->ifa_addr; + + if (prison_if(curthread->td_ucred, sa) != 0) + continue; + addrs++; +#ifdef COMPAT_43 + if (cmd == OSIOCGIFCONF) { + struct osockaddr *osa = + (struct osockaddr *)&ifr.ifr_addr; + ifr.ifr_addr = *sa; + osa->sa_family = sa->sa_family; + sbuf_bcat(sb, &ifr, sizeof(ifr)); + max_len += sizeof(ifr); + } else +#endif + if (sa->sa_len <= sizeof(*sa)) { + ifr.ifr_addr = *sa; + sbuf_bcat(sb, &ifr, sizeof(ifr)); + max_len += sizeof(ifr); + } else { + sbuf_bcat(sb, &ifr, + offsetof(struct ifreq, ifr_addr)); + max_len += offsetof(struct ifreq, ifr_addr); + sbuf_bcat(sb, sa, sa->sa_len); + max_len += sa->sa_len; + } + + if (!sbuf_overflowed(sb)) + valid_len = sbuf_len(sb); + } + IF_ADDR_UNLOCK(ifp); + if (addrs == 0) { + bzero((caddr_t)&ifr.ifr_addr, sizeof(ifr.ifr_addr)); + sbuf_bcat(sb, &ifr, sizeof(ifr)); + max_len += sizeof(ifr); + + if (!sbuf_overflowed(sb)) + valid_len = sbuf_len(sb); + } + } + IFNET_RUNLOCK(); + + /* + * If we didn't allocate enough space (uncommon), try again. If + * we have already allocated as much space as we are allowed, + * return what we've got. + */ + if (valid_len != max_len && !full) { + sbuf_delete(sb); + goto again; + } + + ifc->ifc_len = valid_len; + sbuf_finish(sb); + error = copyout(sbuf_data(sb), ifc->ifc_req, ifc->ifc_len); + sbuf_delete(sb); + return (error); +} + +/* + * Just like ifpromisc(), but for all-multicast-reception mode. + */ +int +if_allmulti(struct ifnet *ifp, int onswitch) +{ + + return (if_setflag(ifp, IFF_ALLMULTI, 0, &ifp->if_amcount, onswitch)); +} + +struct ifmultiaddr * +if_findmulti(struct ifnet *ifp, struct sockaddr *sa) +{ + struct ifmultiaddr *ifma; + + IF_ADDR_LOCK_ASSERT(ifp); + + TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { + if (sa->sa_family == AF_LINK) { + if (sa_dl_equal(ifma->ifma_addr, sa)) + break; + } else { + if (sa_equal(ifma->ifma_addr, sa)) + break; + } + } + + return ifma; +} + +/* + * Allocate a new ifmultiaddr and initialize based on passed arguments. We + * make copies of passed sockaddrs. The ifmultiaddr will not be added to + * the ifnet multicast address list here, so the caller must do that and + * other setup work (such as notifying the device driver). The reference + * count is initialized to 1. + */ +static struct ifmultiaddr * +if_allocmulti(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr *llsa, + int mflags) +{ + struct ifmultiaddr *ifma; + struct sockaddr *dupsa; + + ifma = malloc(sizeof *ifma, M_IFMADDR, mflags | + M_ZERO); + if (ifma == NULL) + return (NULL); + + dupsa = malloc(sa->sa_len, M_IFMADDR, mflags); + if (dupsa == NULL) { + free(ifma, M_IFMADDR); + return (NULL); + } + bcopy(sa, dupsa, sa->sa_len); + ifma->ifma_addr = dupsa; + + ifma->ifma_ifp = ifp; + ifma->ifma_refcount = 1; + ifma->ifma_protospec = NULL; + + if (llsa == NULL) { + ifma->ifma_lladdr = NULL; + return (ifma); + } + + dupsa = malloc(llsa->sa_len, M_IFMADDR, mflags); + if (dupsa == NULL) { + free(ifma->ifma_addr, M_IFMADDR); + free(ifma, M_IFMADDR); + return (NULL); + } + bcopy(llsa, dupsa, llsa->sa_len); + ifma->ifma_lladdr = dupsa; + + return (ifma); +} + +/* + * if_freemulti: free ifmultiaddr structure and possibly attached related + * addresses. 
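Consumed from userland, ifconf() backs the classic SIOCGIFCONF enumeration loop. A self-contained sketch; note the records are variable length, exactly as built above (the buffer size is an arbitrary example):

#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int
main(void)
{
	struct ifconf ifc;
	char *buf, *p;
	int s = socket(AF_INET, SOCK_DGRAM, 0);

	if (s < 0)
		return (1);
	ifc.ifc_len = 64 * 1024;	/* oversized on purpose; the kernel trims it */
	if ((buf = malloc(ifc.ifc_len)) == NULL)
		return (1);
	ifc.ifc_buf = buf;
	if (ioctl(s, SIOCGIFCONF, &ifc) == -1)
		return (1);
	for (p = buf; p < buf + ifc.ifc_len;) {
		struct ifreq *ifr = (struct ifreq *)p;
		size_t salen = ifr->ifr_addr.sa_len;

		printf("%s family %d\n", ifr->ifr_name, ifr->ifr_addr.sa_family);
		if (salen < sizeof(struct sockaddr))
			salen = sizeof(struct sockaddr);	/* fixed-size record */
		p += offsetof(struct ifreq, ifr_addr) + salen;
	}
	free(buf);
	close(s);
	return (0);
}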
The caller is responsible for implementing reference
+ * counting, notifying the driver, handling routing messages, and releasing
+ * any dependent link layer state.
+ */
+static void
+if_freemulti(struct ifmultiaddr *ifma)
+{
+
+ KASSERT(ifma->ifma_refcount == 0, ("if_freemulti: refcount %d",
+ ifma->ifma_refcount));
+ KASSERT(ifma->ifma_protospec == NULL,
+ ("if_freemulti: protospec not NULL"));
+
+ if (ifma->ifma_lladdr != NULL)
+ free(ifma->ifma_lladdr, M_IFMADDR);
+ free(ifma->ifma_addr, M_IFMADDR);
+ free(ifma, M_IFMADDR);
+}
+
+/*
+ * Register an additional multicast address with a network interface.
+ *
+ * - If the address is already present, bump the reference count on the
+ * address and return.
+ * - If the address is not link-layer, look up a link layer address.
+ * - Allocate address structures for one or both addresses, and attach to the
+ * multicast address list on the interface. If automatically adding a link
+ * layer address, the protocol address will own a reference to the link
+ * layer address, to be freed when it is freed.
+ * - Notify the network device driver of an addition to the multicast address
+ * list.
+ *
+ * 'sa' points to caller-owned memory with the desired multicast address.
+ *
+ * 'retifma' will be used to return a pointer to the resulting multicast
+ * address reference, if desired.
+ */
+int
+if_addmulti(struct ifnet *ifp, struct sockaddr *sa,
+ struct ifmultiaddr **retifma)
+{
+ struct ifmultiaddr *ifma, *ll_ifma;
+ struct sockaddr *llsa;
+ int error;
+
+ /*
+ * If the address is already present, return a new reference to it;
+ * otherwise, allocate storage and set up a new address.
+ */
+ IF_ADDR_LOCK(ifp);
+ ifma = if_findmulti(ifp, sa);
+ if (ifma != NULL) {
+ ifma->ifma_refcount++;
+ if (retifma != NULL)
+ *retifma = ifma;
+ IF_ADDR_UNLOCK(ifp);
+ return (0);
+ }
+
+ /*
+ * The address isn't already present; resolve the protocol address
+ * into a link layer address, and then look that up, bump its
+ * refcount or allocate an ifma for that also. If 'llsa' was
+ * returned, we will need to free it later.
+ */
+ llsa = NULL;
+ ll_ifma = NULL;
+ if (ifp->if_resolvemulti != NULL) {
+ error = ifp->if_resolvemulti(ifp, &llsa, sa);
+ if (error)
+ goto unlock_out;
+ }
+
+ /*
+ * Allocate the new address. Don't hook it up yet, as we may also
+ * need to allocate a link layer multicast address.
+ */
+ ifma = if_allocmulti(ifp, sa, llsa, M_NOWAIT);
+ if (ifma == NULL) {
+ error = ENOMEM;
+ goto free_llsa_out;
+ }
+
+ /*
+ * If a link layer address is found, we'll need to see if it's
+ * already present in the address list, or allocate it as well.
+ * When this block finishes, the link layer address will be on the
+ * list.
+ */
+ if (llsa != NULL) {
+ ll_ifma = if_findmulti(ifp, llsa);
+ if (ll_ifma == NULL) {
+ ll_ifma = if_allocmulti(ifp, llsa, NULL, M_NOWAIT);
+ if (ll_ifma == NULL) {
+ --ifma->ifma_refcount;
+ if_freemulti(ifma);
+ error = ENOMEM;
+ goto free_llsa_out;
+ }
+ TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ll_ifma,
+ ifma_link);
+ } else
+ ll_ifma->ifma_refcount++;
+ ifma->ifma_llifma = ll_ifma;
+ }
+
+ /*
+ * We now have a new multicast address, ifma, and possibly a new or
+ * referenced link layer address. Add the primary address to the
+ * ifnet address list.
+ */
+ TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link);
+
+ if (retifma != NULL)
+ *retifma = ifma;
+
+ /*
+ * Must generate the message while holding the lock so that 'ifma'
+ * pointer is still valid.
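An illustrative kernel-style fragment of how a network-layer protocol joins a group through this function, modeled loosely on the IPv4 path (the group address is an example):

	struct sockaddr_in gsin;
	struct ifmultiaddr *ifma;
	int error;

	bzero(&gsin, sizeof(gsin));
	gsin.sin_len = sizeof(gsin);
	gsin.sin_family = AF_INET;
	gsin.sin_addr.s_addr = htonl(0xe0000001);	/* 224.0.0.1, example */
	error = if_addmulti(ifp, (struct sockaddr *)&gsin, &ifma);
	/* on success, ifma holds the reference dropped later via
	 * if_delmulti_ifma() */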
+ */ + rt_newmaddrmsg(RTM_NEWMADDR, ifma); + IF_ADDR_UNLOCK(ifp); + + /* + * We are certain we have added something, so call down to the + * interface to let them know about it. + */ + if (ifp->if_ioctl != NULL) { + (void) (*ifp->if_ioctl)(ifp, SIOCADDMULTI, 0); + } + + if (llsa != NULL) + free(llsa, M_IFMADDR); + + return (0); + +free_llsa_out: + if (llsa != NULL) + free(llsa, M_IFMADDR); + +unlock_out: + IF_ADDR_UNLOCK(ifp); + return (error); +} + +/* + * Delete a multicast group membership by network-layer group address. + * + * Returns ENOENT if the entry could not be found. If ifp no longer + * exists, results are undefined. This entry point should only be used + * from subsystems which do appropriate locking to hold ifp for the + * duration of the call. + * Network-layer protocol domains must use if_delmulti_ifma(). + */ +int +if_delmulti(struct ifnet *ifp, struct sockaddr *sa) +{ + struct ifmultiaddr *ifma; + int lastref; +#ifdef INVARIANTS + struct ifnet *oifp; + + IFNET_RLOCK_NOSLEEP(); + TAILQ_FOREACH(oifp, &V_ifnet, if_link) + if (ifp == oifp) + break; + if (ifp != oifp) + ifp = NULL; + IFNET_RUNLOCK_NOSLEEP(); + + KASSERT(ifp != NULL, ("%s: ifnet went away", __func__)); +#endif + if (ifp == NULL) + return (ENOENT); + + IF_ADDR_LOCK(ifp); + lastref = 0; + ifma = if_findmulti(ifp, sa); + if (ifma != NULL) + lastref = if_delmulti_locked(ifp, ifma, 0); + IF_ADDR_UNLOCK(ifp); + + if (ifma == NULL) + return (ENOENT); + + if (lastref && ifp->if_ioctl != NULL) { + (void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0); + } + + return (0); +} + +/* + * Delete all multicast group membership for an interface. + * Should be used to quickly flush all multicast filters. + */ +void +if_delallmulti(struct ifnet *ifp) +{ + struct ifmultiaddr *ifma; + struct ifmultiaddr *next; + + IF_ADDR_LOCK(ifp); + TAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) + if_delmulti_locked(ifp, ifma, 0); + IF_ADDR_UNLOCK(ifp); +} + +/* + * Delete a multicast group membership by group membership pointer. + * Network-layer protocol domains must use this routine. + * + * It is safe to call this routine if the ifp disappeared. + */ +void +if_delmulti_ifma(struct ifmultiaddr *ifma) +{ + struct ifnet *ifp; + int lastref; + + ifp = ifma->ifma_ifp; +#ifdef DIAGNOSTIC + if (ifp == NULL) { + printf("%s: ifma_ifp seems to be detached\n", __func__); + } else { + struct ifnet *oifp; + + IFNET_RLOCK_NOSLEEP(); + TAILQ_FOREACH(oifp, &V_ifnet, if_link) + if (ifp == oifp) + break; + if (ifp != oifp) { + printf("%s: ifnet %p disappeared\n", __func__, ifp); + ifp = NULL; + } + IFNET_RUNLOCK_NOSLEEP(); + } +#endif + /* + * If and only if the ifnet instance exists: Acquire the address lock. + */ + if (ifp != NULL) + IF_ADDR_LOCK(ifp); + + lastref = if_delmulti_locked(ifp, ifma, 0); + + if (ifp != NULL) { + /* + * If and only if the ifnet instance exists: + * Release the address lock. + * If the group was left: update the hardware hash filter. + */ + IF_ADDR_UNLOCK(ifp); + if (lastref && ifp->if_ioctl != NULL) { + (void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0); + } + } +} + +/* + * Perform deletion of network-layer and/or link-layer multicast address. + * + * Return 0 if the reference count was decremented. + * Return 1 if the final reference was released, indicating that the + * hardware hash filter should be reprogrammed. 
+ */ +static int +if_delmulti_locked(struct ifnet *ifp, struct ifmultiaddr *ifma, int detaching) +{ + struct ifmultiaddr *ll_ifma; + + if (ifp != NULL && ifma->ifma_ifp != NULL) { + KASSERT(ifma->ifma_ifp == ifp, + ("%s: inconsistent ifp %p", __func__, ifp)); + IF_ADDR_LOCK_ASSERT(ifp); + } + + ifp = ifma->ifma_ifp; + + /* + * If the ifnet is detaching, null out references to ifnet, + * so that upper protocol layers will notice, and not attempt + * to obtain locks for an ifnet which no longer exists. The + * routing socket announcement must happen before the ifnet + * instance is detached from the system. + */ + if (detaching) { +#ifdef DIAGNOSTIC + printf("%s: detaching ifnet instance %p\n", __func__, ifp); +#endif + /* + * ifp may already be nulled out if we are being reentered + * to delete the ll_ifma. + */ + if (ifp != NULL) { + rt_newmaddrmsg(RTM_DELMADDR, ifma); + ifma->ifma_ifp = NULL; + } + } + + if (--ifma->ifma_refcount > 0) + return 0; + + /* + * If this ifma is a network-layer ifma, a link-layer ifma may + * have been associated with it. Release it first if so. + */ + ll_ifma = ifma->ifma_llifma; + if (ll_ifma != NULL) { + KASSERT(ifma->ifma_lladdr != NULL, + ("%s: llifma w/o lladdr", __func__)); + if (detaching) + ll_ifma->ifma_ifp = NULL; /* XXX */ + if (--ll_ifma->ifma_refcount == 0) { + if (ifp != NULL) { + TAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma, + ifma_link); + } + if_freemulti(ll_ifma); + } + } + + if (ifp != NULL) + TAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifma_link); + + if_freemulti(ifma); + + /* + * The last reference to this instance of struct ifmultiaddr + * was released; the hardware should be notified of this change. + */ + return 1; +} + +/* + * Set the link layer address on an interface. + * + * At this time we only support certain types of interfaces, + * and we don't allow the length of the address to change. + */ +int +if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len) +{ + struct sockaddr_dl *sdl; + struct ifaddr *ifa; + struct ifreq ifr; + + IF_ADDR_LOCK(ifp); + ifa = ifp->if_addr; + if (ifa == NULL) { + IF_ADDR_UNLOCK(ifp); + return (EINVAL); + } + ifa_ref(ifa); + IF_ADDR_UNLOCK(ifp); + sdl = (struct sockaddr_dl *)ifa->ifa_addr; + if (sdl == NULL) { + ifa_free(ifa); + return (EINVAL); + } + if (len != sdl->sdl_alen) { /* don't allow length to change */ + ifa_free(ifa); + return (EINVAL); + } + switch (ifp->if_type) { + case IFT_ETHER: + case IFT_FDDI: + case IFT_XETHER: + case IFT_ISO88025: + case IFT_L2VLAN: + case IFT_BRIDGE: + case IFT_ARCNET: + case IFT_IEEE8023ADLAG: + case IFT_IEEE80211: + bcopy(lladdr, LLADDR(sdl), len); + ifa_free(ifa); + break; + default: + ifa_free(ifa); + return (ENODEV); + } + + /* + * If the interface is already up, we need + * to re-init it in order to reprogram its + * address filter. + */ + if ((ifp->if_flags & IFF_UP) != 0) { + if (ifp->if_ioctl) { + ifp->if_flags &= ~IFF_UP; + ifr.ifr_flags = ifp->if_flags & 0xffff; + ifr.ifr_flagshigh = ifp->if_flags >> 16; + (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); + ifp->if_flags |= IFF_UP; + ifr.ifr_flags = ifp->if_flags & 0xffff; + ifr.ifr_flagshigh = ifp->if_flags >> 16; + (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); + } +#ifdef INET + /* + * Also send gratuitous ARPs to notify other nodes about + * the address change. 
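From userland this is the SIOCSIFLLADDR path seen earlier in ifhwioctl(): the new address travels in ifr_addr.sa_data with its length in sa_len, which must match the current sdl_alen. A minimal sketch (the interface name and MAC are placeholders):

#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct ifreq ifr;
	static const unsigned char mac[6] =
	    { 0x02, 0x00, 0x00, 0x00, 0x00, 0x01 };	/* placeholder MAC */
	int s = socket(AF_INET, SOCK_DGRAM, 0);

	if (s < 0)
		return (1);
	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, "em0", sizeof(ifr.ifr_name));	/* placeholder */
	ifr.ifr_addr.sa_len = sizeof(mac);	/* must equal the current sdl_alen */
	ifr.ifr_addr.sa_family = AF_LINK;
	memcpy(ifr.ifr_addr.sa_data, mac, sizeof(mac));
	if (ioctl(s, SIOCSIFLLADDR, &ifr) == -1)
		perror("SIOCSIFLLADDR");
	close(s);
	return (0);
}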
+ */ + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + if (ifa->ifa_addr->sa_family == AF_INET) + arp_ifinit(ifp, ifa); + } +#endif + } + return (0); +} + +/* + * The name argument must be a pointer to storage which will last as + * long as the interface does. For physical devices, the result of + * device_get_name(dev) is a good choice and for pseudo-devices a + * static string works well. + */ +void +if_initname(struct ifnet *ifp, const char *name, int unit) +{ + ifp->if_dname = name; + ifp->if_dunit = unit; + if (unit != IF_DUNIT_NONE) + snprintf(ifp->if_xname, IFNAMSIZ, "%s%d", name, unit); + else + strlcpy(ifp->if_xname, name, IFNAMSIZ); +} + +int +if_printf(struct ifnet *ifp, const char * fmt, ...) +{ + va_list ap; + int retval; + + retval = printf("%s: ", ifp->if_xname); + va_start(ap, fmt); + retval += vprintf(fmt, ap); + va_end(ap); + return (retval); +} + +void +if_start(struct ifnet *ifp) +{ + + (*(ifp)->if_start)(ifp); +} + +/* + * Backwards compatibility interface for drivers + * that have not implemented it + */ +static int +if_transmit(struct ifnet *ifp, struct mbuf *m) +{ + int error; + + IFQ_HANDOFF(ifp, m, error); + return (error); +} + +int +if_handoff(struct ifqueue *ifq, struct mbuf *m, struct ifnet *ifp, int adjust) +{ + int active = 0; + + IF_LOCK(ifq); + if (_IF_QFULL(ifq)) { + _IF_DROP(ifq); + IF_UNLOCK(ifq); + m_freem(m); + return (0); + } + if (ifp != NULL) { + ifp->if_obytes += m->m_pkthdr.len + adjust; + if (m->m_flags & (M_BCAST|M_MCAST)) + ifp->if_omcasts++; + active = ifp->if_drv_flags & IFF_DRV_OACTIVE; + } + _IF_ENQUEUE(ifq, m); + IF_UNLOCK(ifq); + if (ifp != NULL && !active) + (*(ifp)->if_start)(ifp); + return (1); +} + +void +if_register_com_alloc(u_char type, + if_com_alloc_t *a, if_com_free_t *f) +{ + + KASSERT(if_com_alloc[type] == NULL, + ("if_register_com_alloc: %d already registered", type)); + KASSERT(if_com_free[type] == NULL, + ("if_register_com_alloc: %d free already registered", type)); + + if_com_alloc[type] = a; + if_com_free[type] = f; +} + +void +if_deregister_com_alloc(u_char type) +{ + + KASSERT(if_com_alloc[type] != NULL, + ("if_deregister_com_alloc: %d not registered", type)); + KASSERT(if_com_free[type] != NULL, + ("if_deregister_com_alloc: %d free not registered", type)); + if_com_alloc[type] = NULL; + if_com_free[type] = NULL; +} diff --git a/freebsd/sys/net/if.h b/freebsd/sys/net/if.h new file mode 100644 index 00000000..6fbbb34a --- /dev/null +++ b/freebsd/sys/net/if.h @@ -0,0 +1,2 @@ +#include +#include diff --git a/freebsd/sys/net/if_arc.h b/freebsd/sys/net/if_arc.h new file mode 100644 index 00000000..6be5d4e1 --- /dev/null +++ b/freebsd/sys/net/if_arc.h @@ -0,0 +1,143 @@ +/* $NetBSD: if_arc.h,v 1.13 1999/11/19 20:41:19 thorpej Exp $ */ +/* $FreeBSD$ */ + +/*- + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: NetBSD: if_ether.h,v 1.10 1994/06/29 06:37:55 cgd Exp + * @(#)if_ether.h 8.1 (Berkeley) 6/10/93 + */ + +#ifndef _NET_IF_ARC_HH_ +#define _NET_IF_ARC_HH_ + +/* + * Arcnet address - 1 octets + * don't know who uses this. + */ +struct arc_addr { + u_int8_t arc_addr_octet[1]; +} __packed; + +/* + * Structure of a 2.5MB/s Arcnet header. + * as given to interface code. + */ +struct arc_header { + u_int8_t arc_shost; + u_int8_t arc_dhost; + u_int8_t arc_type; + /* + * only present for newstyle encoding with LL fragmentation. + * Don't use sizeof(anything), use ARC_HDR{,NEW}LEN instead. + */ + u_int8_t arc_flag; + u_int16_t arc_seqid; + + /* + * only present in exception packets (arc_flag == 0xff) + */ + u_int8_t arc_type2; /* same as arc_type */ + u_int8_t arc_flag2; /* real flag value */ + u_int16_t arc_seqid2; /* real seqid value */ +} __packed; + +#define ARC_ADDR_LEN 1 + +#define ARC_HDRLEN 3 +#define ARC_HDRNEWLEN 6 +#define ARC_HDRNEWLEN_EXC 10 + +/* these lengths are data link layer length - 2 * ARC_ADDR_LEN */ +#define ARC_MIN_LEN 1 +#define ARC_MIN_FORBID_LEN 254 +#define ARC_MAX_FORBID_LEN 256 +#define ARC_MAX_LEN 508 +#define ARC_MAX_DATA 504 + +/* RFC 1051 */ +#define ARCTYPE_IP_OLD 240 /* IP protocol */ +#define ARCTYPE_ARP_OLD 241 /* address resolution protocol */ + +/* RFC 1201 */ +#define ARCTYPE_IP 212 /* IP protocol */ +#define ARCTYPE_ARP 213 /* address resolution protocol */ +#define ARCTYPE_REVARP 214 /* reverse addr resolution protocol */ + +#define ARCTYPE_ATALK 221 /* Appletalk */ +#define ARCTYPE_BANIAN 247 /* Banyan Vines */ +#define ARCTYPE_IPX 250 /* Novell IPX */ + +#define ARCTYPE_INET6 0xc4 /* IPng */ +#define ARCTYPE_DIAGNOSE 0x80 /* as per ANSI/ATA 878.1 */ + +#define ARCMTU 507 +#define ARCMIN 0 + +#define ARC_PHDS_MAXMTU 60480 + +struct arccom { + struct ifnet *ac_ifp; /* network-visible interface */ + + u_int16_t ac_seqid; /* seq. id used by PHDS encap. */ + + u_int8_t arc_shost; + u_int8_t arc_dhost; + u_int8_t arc_type; + + u_int8_t dummy0; + u_int16_t dummy1; + int sflag, fsflag, rsflag; + struct mbuf *curr_frag; + + struct ac_frag { + u_int8_t af_maxflag; /* from first packet */ + u_int8_t af_lastseen; /* last split flag seen */ + u_int16_t af_seqid; + struct mbuf *af_packet; + } ac_fragtab[256]; /* indexed by sender ll address */ +}; + +#ifdef _KERNEL +extern u_int8_t arcbroadcastaddr; +extern int arc_ipmtu; /* XXX new ip only, no RFC 1051! 
*/ + +void arc_ifattach(struct ifnet *, u_int8_t); +void arc_ifdetach(struct ifnet *); +void arc_storelladdr(struct ifnet *, u_int8_t); +int arc_isphds(u_int8_t); +void arc_input(struct ifnet *, struct mbuf *); +int arc_output(struct ifnet *, struct mbuf *, + struct sockaddr *, struct route *); +int arc_ioctl(struct ifnet *, u_long, caddr_t); + +void arc_frag_init(struct ifnet *); +struct mbuf * arc_frag_next(struct ifnet *); +#endif + +#endif /* _NET_IF_ARC_HH_ */ diff --git a/freebsd/sys/net/if_arcsubr.c b/freebsd/sys/net/if_arcsubr.c new file mode 100644 index 00000000..8cd53a6d --- /dev/null +++ b/freebsd/sys/net/if_arcsubr.c @@ -0,0 +1,886 @@ +#include + +/* $NetBSD: if_arcsubr.c,v 1.36 2001/06/14 05:44:23 itojun Exp $ */ +/* $FreeBSD$ */ + +/*- + * Copyright (c) 1994, 1995 Ignatios Souvatzis + * Copyright (c) 1982, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * from: NetBSD: if_ethersubr.c,v 1.9 1994/06/29 06:36:11 cgd Exp + * @(#)if_ethersubr.c 8.1 (Berkeley) 6/10/93 + * + */ +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(INET) || defined(INET6) +#include +#include +#include +#endif + +#ifdef INET6 +#include +#endif + +#ifdef IPX +#include +#include +#endif + +#define ARCNET_ALLOW_BROKEN_ARP + +static struct mbuf *arc_defrag(struct ifnet *, struct mbuf *); +static int arc_resolvemulti(struct ifnet *, struct sockaddr **, + struct sockaddr *); + +u_int8_t arcbroadcastaddr = 0; + +#define ARC_LLADDR(ifp) (*(u_int8_t *)IF_LLADDR(ifp)) + +#define senderr(e) { error = (e); goto bad;} +#define SIN(s) ((struct sockaddr_in *)s) +#define SIPX(s) ((struct sockaddr_ipx *)s) + +/* + * ARCnet output routine. + * Encapsulate a packet of type family for the local net. + * Assumes that ifp is actually pointer to arccom structure. + */ +int +arc_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, + struct route *ro) +{ + struct arc_header *ah; + int error; + u_int8_t atype, adst; + int loop_copy = 0; + int isphds; +#if defined(INET) || defined(INET6) + struct llentry *lle; +#endif + + if (!((ifp->if_flags & IFF_UP) && + (ifp->if_drv_flags & IFF_DRV_RUNNING))) + return(ENETDOWN); /* m, m1 aren't initialized yet */ + + error = 0; + + switch (dst->sa_family) { +#ifdef INET + case AF_INET: + + /* + * For now, use the simple IP addr -> ARCnet addr mapping + */ + if (m->m_flags & (M_BCAST|M_MCAST)) + adst = arcbroadcastaddr; /* ARCnet broadcast address */ + else if (ifp->if_flags & IFF_NOARP) + adst = ntohl(SIN(dst)->sin_addr.s_addr) & 0xFF; + else { + error = arpresolve(ifp, ro ? ro->ro_rt : NULL, + m, dst, &adst, &lle); + if (error) + return (error == EWOULDBLOCK ? 0 : error); + } + + atype = (ifp->if_flags & IFF_LINK0) ? + ARCTYPE_IP_OLD : ARCTYPE_IP; + break; + case AF_ARP: + { + struct arphdr *ah; + ah = mtod(m, struct arphdr *); + ah->ar_hrd = htons(ARPHRD_ARCNET); + + loop_copy = -1; /* if this is for us, don't do it */ + + switch(ntohs(ah->ar_op)) { + case ARPOP_REVREQUEST: + case ARPOP_REVREPLY: + atype = ARCTYPE_REVARP; + break; + case ARPOP_REQUEST: + case ARPOP_REPLY: + default: + atype = ARCTYPE_ARP; + break; + } + + if (m->m_flags & M_BCAST) + bcopy(ifp->if_broadcastaddr, &adst, ARC_ADDR_LEN); + else + bcopy(ar_tha(ah), &adst, ARC_ADDR_LEN); + + } + break; +#endif +#ifdef INET6 + case AF_INET6: + error = nd6_storelladdr(ifp, m, dst, (u_char *)&adst, &lle); + if (error) + return (error); + atype = ARCTYPE_INET6; + break; +#endif +#ifdef IPX + case AF_IPX: + adst = SIPX(dst)->sipx_addr.x_host.c_host[5]; + atype = ARCTYPE_IPX; + if (adst == 0xff) + adst = arcbroadcastaddr; + break; +#endif + + case AF_UNSPEC: + loop_copy = -1; + ah = (struct arc_header *)dst->sa_data; + adst = ah->arc_dhost; + atype = ah->arc_type; + + if (atype == ARCTYPE_ARP) { + atype = (ifp->if_flags & IFF_LINK0) ? + ARCTYPE_ARP_OLD: ARCTYPE_ARP; + +#ifdef ARCNET_ALLOW_BROKEN_ARP + /* + * XXX It's not clear per RFC826 if this is needed, but + * "assigned numbers" say this is wrong. + * However, e.g., AmiTCP 3.0Beta used it... we make this + * switchable for emergency cases. Not perfect, but... 
+ */
+			if (ifp->if_flags & IFF_LINK2)
+				mtod(m, struct arphdr *)->ar_pro = atype - 1;
+#endif
+		}
+		break;
+
+	default:
+		if_printf(ifp, "can't handle af%d\n", dst->sa_family);
+		senderr(EAFNOSUPPORT);
+	}
+
+	isphds = arc_isphds(atype);
+	M_PREPEND(m, isphds ? ARC_HDRNEWLEN : ARC_HDRLEN, M_DONTWAIT);
+	if (m == 0)
+		senderr(ENOBUFS);
+	ah = mtod(m, struct arc_header *);
+	ah->arc_type = atype;
+	ah->arc_dhost = adst;
+	ah->arc_shost = ARC_LLADDR(ifp);
+	if (isphds) {
+		ah->arc_flag = 0;
+		ah->arc_seqid = 0;
+	}
+
+	if ((ifp->if_flags & IFF_SIMPLEX) && (loop_copy != -1)) {
+		if ((m->m_flags & M_BCAST) || (loop_copy > 0)) {
+			struct mbuf *n = m_copy(m, 0, (int)M_COPYALL);
+
+			(void) if_simloop(ifp, n, dst->sa_family, ARC_HDRLEN);
+		} else if (ah->arc_dhost == ah->arc_shost) {
+			(void) if_simloop(ifp, m, dst->sa_family, ARC_HDRLEN);
+			return (0);	/* XXX */
+		}
+	}
+
+	BPF_MTAP(ifp, m);
+
+	error = ifp->if_transmit(ifp, m);
+
+	return (error);
+
+bad:
+	if (m)
+		m_freem(m);
+	return (error);
+}
+
+void
+arc_frag_init(struct ifnet *ifp)
+{
+	struct arccom *ac;
+
+	ac = (struct arccom *)ifp->if_l2com;
+	ac->curr_frag = 0;
+}
+
+struct mbuf *
+arc_frag_next(struct ifnet *ifp)
+{
+	struct arccom *ac;
+	struct mbuf *m;
+	struct arc_header *ah;
+
+	ac = (struct arccom *)ifp->if_l2com;
+	if ((m = ac->curr_frag) == 0) {
+		int tfrags;
+
+		/* dequeue new packet */
+		IF_DEQUEUE(&ifp->if_snd, m);
+		if (m == 0)
+			return 0;
+
+		ah = mtod(m, struct arc_header *);
+		if (!arc_isphds(ah->arc_type))
+			return m;
+
+		++ac->ac_seqid;		/* make the seqid unique */
+		tfrags = (m->m_pkthdr.len + ARC_MAX_DATA - 1) / ARC_MAX_DATA;
+		ac->fsflag = 2 * tfrags - 3;
+		ac->sflag = 0;
+		ac->rsflag = ac->fsflag;
+		ac->arc_dhost = ah->arc_dhost;
+		ac->arc_shost = ah->arc_shost;
+		ac->arc_type = ah->arc_type;
+
+		m_adj(m, ARC_HDRNEWLEN);
+		ac->curr_frag = m;
+	}
+
+	/* split out next fragment and return it */
+	if (ac->sflag < ac->fsflag) {
+		/* we CAN'T have short packets here */
+		ac->curr_frag = m_split(m, ARC_MAX_DATA, M_DONTWAIT);
+		if (ac->curr_frag == 0) {
+			m_freem(m);
+			return 0;
+		}
+
+		M_PREPEND(m, ARC_HDRNEWLEN, M_DONTWAIT);
+		if (m == 0) {
+			m_freem(ac->curr_frag);
+			ac->curr_frag = 0;
+			return 0;
+		}
+
+		ah = mtod(m, struct arc_header *);
+		ah->arc_flag = ac->rsflag;
+		ah->arc_seqid = ac->ac_seqid;
+
+		ac->sflag += 2;
+		ac->rsflag = ac->sflag;
+	} else if ((m->m_pkthdr.len >=
+	    ARC_MIN_FORBID_LEN - ARC_HDRNEWLEN + 2) &&
+	    (m->m_pkthdr.len <=
+	    ARC_MAX_FORBID_LEN - ARC_HDRNEWLEN + 2)) {
+		ac->curr_frag = 0;
+
+		M_PREPEND(m, ARC_HDRNEWLEN_EXC, M_DONTWAIT);
+		if (m == 0)
+			return 0;
+
+		ah = mtod(m, struct arc_header *);
+		ah->arc_flag = 0xFF;
+		ah->arc_seqid = 0xFFFF;
+		ah->arc_type2 = ac->arc_type;
+		ah->arc_flag2 = ac->sflag;
+		ah->arc_seqid2 = ac->ac_seqid;
+	} else {
+		ac->curr_frag = 0;
+
+		M_PREPEND(m, ARC_HDRNEWLEN, M_DONTWAIT);
+		if (m == 0)
+			return 0;
+
+		ah = mtod(m, struct arc_header *);
+		ah->arc_flag = ac->sflag;
+		ah->arc_seqid = ac->ac_seqid;
+	}
+
+	ah->arc_dhost = ac->arc_dhost;
+	ah->arc_shost = ac->arc_shost;
+	ah->arc_type = ac->arc_type;
+
+	return m;
+}
+
+/*
+ * Defragmenter. Returns mbuf if last packet found, else
+ * NULL. Frees the incoming mbuf as necessary.
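+ *
+ * (Editor's note: this is the calling convention arc_input() below
+ * uses:
+ *
+ *	m = arc_defrag(ifp, m);
+ *	if (m == NULL)
+ *		return;		-- fragments still outstanding, or error
+ *
+ * Also note that the fragment count in arc_frag_next() above is a
+ * ceiling division: a 1500 byte PDU with ARC_MAX_DATA == 504 is sent
+ * as (1500 + 503) / 504 = 3 fragments.)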
+ */ + +static __inline struct mbuf * +arc_defrag(struct ifnet *ifp, struct mbuf *m) +{ + struct arc_header *ah, *ah1; + struct arccom *ac; + struct ac_frag *af; + struct mbuf *m1; + char *s; + int newflen; + u_char src,dst,typ; + + ac = (struct arccom *)ifp->if_l2com; + + if (m->m_len < ARC_HDRNEWLEN) { + m = m_pullup(m, ARC_HDRNEWLEN); + if (m == NULL) { + ++ifp->if_ierrors; + return NULL; + } + } + + ah = mtod(m, struct arc_header *); + typ = ah->arc_type; + + if (!arc_isphds(typ)) + return m; + + src = ah->arc_shost; + dst = ah->arc_dhost; + + if (ah->arc_flag == 0xff) { + m_adj(m, 4); + + if (m->m_len < ARC_HDRNEWLEN) { + m = m_pullup(m, ARC_HDRNEWLEN); + if (m == NULL) { + ++ifp->if_ierrors; + return NULL; + } + } + + ah = mtod(m, struct arc_header *); + } + + af = &ac->ac_fragtab[src]; + m1 = af->af_packet; + s = "debug code error"; + + if (ah->arc_flag & 1) { + /* + * first fragment. We always initialize, which is + * about the right thing to do, as we only want to + * accept one fragmented packet per src at a time. + */ + if (m1 != NULL) + m_freem(m1); + + af->af_packet = m; + m1 = m; + af->af_maxflag = ah->arc_flag; + af->af_lastseen = 0; + af->af_seqid = ah->arc_seqid; + + return NULL; + /* notreached */ + } else { + /* check for unfragmented packet */ + if (ah->arc_flag == 0) + return m; + + /* do we have a first packet from that src? */ + if (m1 == NULL) { + s = "no first frag"; + goto outofseq; + } + + ah1 = mtod(m1, struct arc_header *); + + if (ah->arc_seqid != ah1->arc_seqid) { + s = "seqid differs"; + goto outofseq; + } + + if (typ != ah1->arc_type) { + s = "type differs"; + goto outofseq; + } + + if (dst != ah1->arc_dhost) { + s = "dest host differs"; + goto outofseq; + } + + /* typ, seqid and dst are ok here. */ + + if (ah->arc_flag == af->af_lastseen) { + m_freem(m); + return NULL; + } + + if (ah->arc_flag == af->af_lastseen + 2) { + /* ok, this is next fragment */ + af->af_lastseen = ah->arc_flag; + m_adj(m,ARC_HDRNEWLEN); + + /* + * m_cat might free the first mbuf (with pkthdr) + * in 2nd chain; therefore: + */ + + newflen = m->m_pkthdr.len; + + m_cat(m1,m); + + m1->m_pkthdr.len += newflen; + + /* is it the last one? */ + if (af->af_lastseen > af->af_maxflag) { + af->af_packet = NULL; + return(m1); + } else + return NULL; + } + s = "other reason"; + /* if all else fails, it is out of sequence, too */ + } +outofseq: + if (m1) { + m_freem(m1); + af->af_packet = NULL; + } + + if (m) + m_freem(m); + + log(LOG_INFO,"%s: got out of seq. packet: %s\n", + ifp->if_xname, s); + + return NULL; +} + +/* + * return 1 if Packet Header Definition Standard, else 0. + * For now: old IP, old ARP aren't obviously. Lacking correct information, + * we guess that besides new IP and new ARP also IPX and APPLETALK are PHDS. + * (Apple and Novell corporations were involved, among others, in PHDS work). + * Easiest is to assume that everybody else uses that, too. + */ +int +arc_isphds(u_int8_t type) +{ + return (type != ARCTYPE_IP_OLD && + type != ARCTYPE_ARP_OLD && + type != ARCTYPE_DIAGNOSE); +} + +/* + * Process a received Arcnet packet; + * the packet is in the mbuf chain m with + * the ARCnet header. + */ +void +arc_input(struct ifnet *ifp, struct mbuf *m) +{ + struct arc_header *ah; + int isr; + u_int8_t atype; + + if ((ifp->if_flags & IFF_UP) == 0) { + m_freem(m); + return; + } + + /* possibly defragment: */ + m = arc_defrag(ifp, m); + if (m == NULL) + return; + + BPF_MTAP(ifp, m); + + ah = mtod(m, struct arc_header *); + /* does this belong to us? 
*/ + if ((ifp->if_flags & IFF_PROMISC) == 0 + && ah->arc_dhost != arcbroadcastaddr + && ah->arc_dhost != ARC_LLADDR(ifp)) { + m_freem(m); + return; + } + + ifp->if_ibytes += m->m_pkthdr.len; + + if (ah->arc_dhost == arcbroadcastaddr) { + m->m_flags |= M_BCAST|M_MCAST; + ifp->if_imcasts++; + } + + atype = ah->arc_type; + switch (atype) { +#ifdef INET + case ARCTYPE_IP: + m_adj(m, ARC_HDRNEWLEN); + if ((m = ip_fastforward(m)) == NULL) + return; + isr = NETISR_IP; + break; + + case ARCTYPE_IP_OLD: + m_adj(m, ARC_HDRLEN); + if ((m = ip_fastforward(m)) == NULL) + return; + isr = NETISR_IP; + break; + + case ARCTYPE_ARP: + if (ifp->if_flags & IFF_NOARP) { + /* Discard packet if ARP is disabled on interface */ + m_freem(m); + return; + } + m_adj(m, ARC_HDRNEWLEN); + isr = NETISR_ARP; +#ifdef ARCNET_ALLOW_BROKEN_ARP + mtod(m, struct arphdr *)->ar_pro = htons(ETHERTYPE_IP); +#endif + break; + + case ARCTYPE_ARP_OLD: + if (ifp->if_flags & IFF_NOARP) { + /* Discard packet if ARP is disabled on interface */ + m_freem(m); + return; + } + m_adj(m, ARC_HDRLEN); + isr = NETISR_ARP; +#ifdef ARCNET_ALLOW_BROKEN_ARP + mtod(m, struct arphdr *)->ar_pro = htons(ETHERTYPE_IP); +#endif + break; +#endif +#ifdef INET6 + case ARCTYPE_INET6: + m_adj(m, ARC_HDRNEWLEN); + isr = NETISR_IPV6; + break; +#endif +#ifdef IPX + case ARCTYPE_IPX: + m_adj(m, ARC_HDRNEWLEN); + isr = NETISR_IPX; + break; +#endif + default: + m_freem(m); + return; + } + netisr_dispatch(isr, m); +} + +/* + * Register (new) link level address. + */ +void +arc_storelladdr(struct ifnet *ifp, u_int8_t lla) +{ + ARC_LLADDR(ifp) = lla; +} + +/* + * Perform common duties while attaching to interface list + */ +void +arc_ifattach(struct ifnet *ifp, u_int8_t lla) +{ + struct ifaddr *ifa; + struct sockaddr_dl *sdl; + struct arccom *ac; + + if_attach(ifp); + ifp->if_addrlen = 1; + ifp->if_hdrlen = ARC_HDRLEN; + ifp->if_mtu = 1500; + ifp->if_resolvemulti = arc_resolvemulti; + if (ifp->if_baudrate == 0) + ifp->if_baudrate = 2500000; +#if __FreeBSD_version < 500000 + ifa = ifnet_addrs[ifp->if_index - 1]; +#else + ifa = ifp->if_addr; +#endif + KASSERT(ifa != NULL, ("%s: no lladdr!\n", __func__)); + sdl = (struct sockaddr_dl *)ifa->ifa_addr; + sdl->sdl_type = IFT_ARCNET; + sdl->sdl_alen = ifp->if_addrlen; + + if (ifp->if_flags & IFF_BROADCAST) + ifp->if_flags |= IFF_MULTICAST|IFF_ALLMULTI; + + ac = (struct arccom *)ifp->if_l2com; + ac->ac_seqid = (time_second) & 0xFFFF; /* try to make seqid unique */ + if (lla == 0) { + /* XXX this message isn't entirely clear, to me -- cgd */ + log(LOG_ERR,"%s: link address 0 reserved for broadcasts. 
Please change it and ifconfig %s down up\n", + ifp->if_xname, ifp->if_xname); + } + arc_storelladdr(ifp, lla); + + ifp->if_broadcastaddr = &arcbroadcastaddr; + + bpfattach(ifp, DLT_ARCNET, ARC_HDRLEN); +} + +void +arc_ifdetach(struct ifnet *ifp) +{ + bpfdetach(ifp); + if_detach(ifp); +} + +int +arc_ioctl(struct ifnet *ifp, u_long command, caddr_t data) +{ + struct ifaddr *ifa = (struct ifaddr *) data; + struct ifreq *ifr = (struct ifreq *) data; + int error = 0; + + switch (command) { + case SIOCSIFADDR: + ifp->if_flags |= IFF_UP; + switch (ifa->ifa_addr->sa_family) { +#ifdef INET + case AF_INET: + ifp->if_init(ifp->if_softc); /* before arpwhohas */ + arp_ifinit(ifp, ifa); + break; +#endif +#ifdef IPX + /* + * XXX This code is probably wrong + */ + case AF_IPX: + { + struct ipx_addr *ina = &(IA_SIPX(ifa)->sipx_addr); + + if (ipx_nullhost(*ina)) + ina->x_host.c_host[5] = ARC_LLADDR(ifp); + else + arc_storelladdr(ifp, ina->x_host.c_host[5]); + + /* + * Set new address + */ + ifp->if_init(ifp->if_softc); + break; + } +#endif + default: + ifp->if_init(ifp->if_softc); + break; + } + break; + + case SIOCGIFADDR: + { + struct sockaddr *sa; + + sa = (struct sockaddr *) &ifr->ifr_data; + *(u_int8_t *)sa->sa_data = ARC_LLADDR(ifp); + } + break; + + case SIOCADDMULTI: + case SIOCDELMULTI: + if (ifr == NULL) + error = EAFNOSUPPORT; + else { + switch (ifr->ifr_addr.sa_family) { + case AF_INET: + case AF_INET6: + error = 0; + break; + default: + error = EAFNOSUPPORT; + break; + } + } + break; + + case SIOCSIFMTU: + /* + * Set the interface MTU. + * mtu can't be larger than ARCMTU for RFC1051 + * and can't be larger than ARC_PHDS_MTU + */ + if (((ifp->if_flags & IFF_LINK0) && ifr->ifr_mtu > ARCMTU) || + ifr->ifr_mtu > ARC_PHDS_MAXMTU) + error = EINVAL; + else + ifp->if_mtu = ifr->ifr_mtu; + break; + } + + return (error); +} + +/* based on ether_resolvemulti() */ +int +arc_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa, + struct sockaddr *sa) +{ + struct sockaddr_dl *sdl; +#ifdef INET + struct sockaddr_in *sin; +#endif +#ifdef INET6 + struct sockaddr_in6 *sin6; +#endif + + switch(sa->sa_family) { + case AF_LINK: + /* + * No mapping needed. Just check that it's a valid MC address. + */ + sdl = (struct sockaddr_dl *)sa; + if (*LLADDR(sdl) != arcbroadcastaddr) + return EADDRNOTAVAIL; + *llsa = 0; + return 0; +#ifdef INET + case AF_INET: + sin = (struct sockaddr_in *)sa; + if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) + return EADDRNOTAVAIL; + sdl = malloc(sizeof *sdl, M_IFMADDR, + M_NOWAIT | M_ZERO); + if (sdl == NULL) + return ENOMEM; + sdl->sdl_len = sizeof *sdl; + sdl->sdl_family = AF_LINK; + sdl->sdl_index = ifp->if_index; + sdl->sdl_type = IFT_ARCNET; + sdl->sdl_alen = ARC_ADDR_LEN; + *LLADDR(sdl) = 0; + *llsa = (struct sockaddr *)sdl; + return 0; +#endif +#ifdef INET6 + case AF_INET6: + sin6 = (struct sockaddr_in6 *)sa; + if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { + /* + * An IP6 address of 0 means listen to all + * of the Ethernet multicast address used for IP6. + * (This is used for multicast routers.) 
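+			 *
+			 * (Editor's note: *llsa == NULL together with
+			 * IFF_ALLMULTI tells the caller that no single
+			 * link-level address maps this request; the usual
+			 * caller of the if_resolvemulti hook is
+			 * if_addmulti(), roughly:
+			 *
+			 *	error = ifp->if_resolvemulti(ifp, &llsa, sa);
+			 *
+			 * -- a sketch of the caller, not code in this file.)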
+ */ + ifp->if_flags |= IFF_ALLMULTI; + *llsa = 0; + return 0; + } + if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) + return EADDRNOTAVAIL; + sdl = malloc(sizeof *sdl, M_IFMADDR, + M_NOWAIT | M_ZERO); + if (sdl == NULL) + return ENOMEM; + sdl->sdl_len = sizeof *sdl; + sdl->sdl_family = AF_LINK; + sdl->sdl_index = ifp->if_index; + sdl->sdl_type = IFT_ARCNET; + sdl->sdl_alen = ARC_ADDR_LEN; + *LLADDR(sdl) = 0; + *llsa = (struct sockaddr *)sdl; + return 0; +#endif + + default: + /* + * Well, the text isn't quite right, but it's the name + * that counts... + */ + return EAFNOSUPPORT; + } +} + +MALLOC_DEFINE(M_ARCCOM, "arccom", "ARCNET interface internals"); + +static void* +arc_alloc(u_char type, struct ifnet *ifp) +{ + struct arccom *ac; + + ac = malloc(sizeof(struct arccom), M_ARCCOM, M_WAITOK | M_ZERO); + ac->ac_ifp = ifp; + + return (ac); +} + +static void +arc_free(void *com, u_char type) +{ + + free(com, M_ARCCOM); +} + +static int +arc_modevent(module_t mod, int type, void *data) +{ + + switch (type) { + case MOD_LOAD: + if_register_com_alloc(IFT_ARCNET, arc_alloc, arc_free); + break; + case MOD_UNLOAD: + if_deregister_com_alloc(IFT_ARCNET); + break; + default: + return EOPNOTSUPP; + } + + return (0); +} + +static moduledata_t arc_mod = { + "arcnet", + arc_modevent, + 0 +}; + +DECLARE_MODULE(arcnet, arc_mod, SI_SUB_INIT_IF, SI_ORDER_ANY); +MODULE_VERSION(arcnet, 1); diff --git a/freebsd/sys/net/if_arp.h b/freebsd/sys/net/if_arp.h new file mode 100644 index 00000000..2ad9fffb --- /dev/null +++ b/freebsd/sys/net/if_arp.h @@ -0,0 +1,2 @@ +#include +#include diff --git a/freebsd/sys/net/if_atm.h b/freebsd/sys/net/if_atm.h new file mode 100644 index 00000000..e8f69da0 --- /dev/null +++ b/freebsd/sys/net/if_atm.h @@ -0,0 +1,337 @@ +/* $NetBSD: if_atm.h,v 1.7 1996/11/09 23:02:27 chuck Exp $ */ +/* $FreeBSD$ */ + +/*- + * + * Copyright (c) 1996 Charles D. Cranor and Washington University. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Charles D. Cranor and + * Washington University. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * net/if_atm.h
+ */
+
+/*
+ * Classification of ATM cards.
+ */
+#define ATM_DEVICE_UNKNOWN	0
+#define ATM_DEVICE_PCA200E	1	/* Fore/Marconi PCA200-E */
+#define ATM_DEVICE_HE155	2	/* Fore/Marconi HE155 */
+#define ATM_DEVICE_HE622	3	/* Fore/Marconi HE622 */
+#define ATM_DEVICE_ENI155P	4	/* Efficient networks 155p */
+#define ATM_DEVICE_ADP155P	5	/* Adaptec 155p */
+#define ATM_DEVICE_FORELE25	6	/* ForeRunnerLE 25 */
+#define ATM_DEVICE_FORELE155	7	/* ForeRunnerLE 155 */
+#define ATM_DEVICE_NICSTAR25	8	/* other 77211 25.6MBit */
+#define ATM_DEVICE_NICSTAR155	9	/* other 77211 155MBit */
+#define ATM_DEVICE_IDTABR25	10	/* 77252 based card 25MBit */
+#define ATM_DEVICE_IDTABR155	11	/* 77252 based card 155MBit */
+#define ATM_DEVICE_PROATM25	12	/* 77252 based ProSum card 25MBit */
+#define ATM_DEVICE_PROATM155	13	/* 77252 based ProSum card 155MBit */
+#define ATM_DEVICE_VIRTUAL	14	/* virtual ATM device (netgraph) */
+
+/* map to strings and vendors */
+#define ATM_DEVICE_NAMES \
+	{ "Unknown", "Unknown" },		\
+	{ "PCA200-E", "Fore/Marconi" },		\
+	{ "HE155", "Fore/Marconi" },		\
+	{ "HE622", "Fore/Marconi" },		\
+	{ "ENI155p", "Efficient Networks" },	\
+	{ "ADP155p", "Adaptec" },		\
+	{ "ForeRunnerLE25", "Fore/Marconi" },	\
+	{ "ForeRunnerLE155", "Fore/Marconi" },	\
+	{ "IDT77211/25", "IDT" },		\
+	{ "IDT77211/155", "IDT" },		\
+	{ "IDT77252/25", "IDT" },		\
+	{ "IDT77252/155", "IDT" },		\
+	{ "ProATM/25", "ProSum" },		\
+	{ "ProATM/155", "ProSum" },		\
+	{ "Virtual", "NetGraph" },
+
+/*
+ * This is the common link layer MIB for all ATM interfaces. Much of the
+ * information here is needed for ILMI. This will be augmented by statistics
+ * at some point.
+ */
+struct ifatm_mib {
+	/* configuration data */
+	uint8_t		device;		/* type of card */
+	u_char		esi[6];		/* end system identifier (MAC) */
+	uint32_t	serial;		/* card serial number */
+	uint32_t	hw_version;	/* card version */
+	uint32_t	sw_version;	/* firmware version (if any) */
+	uint32_t	pcr;		/* supported peak cell rate */
+	uint32_t	media;		/* physical media */
+	uint8_t		vpi_bits;	/* number of used bits in VPI field */
+	uint8_t		vci_bits;	/* number of used bits in VCI field */
+	uint16_t	max_vpcs;	/* maximum number of VPCs */
+	uint32_t	max_vccs;	/* maximum number of VCCs */
+};
+
+/*
+ * Traffic parameters for ATM connections. This contains all parameters
+ * to accommodate UBR, UBR+MCR, CBR, VBR and ABR connections.
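+ *
+ * (Editor's note: only the members matching the connection's traffic
+ * class are meaningful.  Illustratively -- values hypothetical -- a
+ * plain CBR connection at 155 Mbit/s needs just the peak cell rate:
+ *
+ *	struct atmio_tparam t = { .pcr = ATM_RATE_155M };
+ *
+ * VBR additionally uses scr/mbs, and the remaining members apply to
+ * ABR, as the per-field comments below note.)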
+ * + * Keep in sync with ng_atm.h + */ +struct atmio_tparam { + uint32_t pcr; /* 24bit: Peak Cell Rate */ + uint32_t scr; /* 24bit: VBR Sustainable Cell Rate */ + uint32_t mbs; /* 24bit: VBR Maximum burst size */ + uint32_t mcr; /* 24bit: ABR/VBR/UBR+MCR MCR */ + uint32_t icr; /* 24bit: ABR ICR */ + uint32_t tbe; /* 24bit: ABR TBE (1...2^24-1) */ + uint8_t nrm; /* 3bit: ABR Nrm */ + uint8_t trm; /* 3bit: ABR Trm */ + uint16_t adtf; /* 10bit: ABR ADTF */ + uint8_t rif; /* 4bit: ABR RIF */ + uint8_t rdf; /* 4bit: ABR RDF */ + uint8_t cdf; /* 3bit: ABR CDF */ +}; + +/* + * VCC parameters + * + * Keep in sync with ng_atm.h + */ +struct atmio_vcc { + uint16_t flags; /* VCC flags */ + uint16_t vpi; + uint16_t vci; + uint16_t rmtu; /* maximum receive PDU */ + uint16_t tmtu; /* maximum transmit PDU */ + uint8_t aal; /* aal type */ + uint8_t traffic; /* traffic type */ + struct atmio_tparam tparam; /* traffic parameters */ +}; + +/* VCC flags */ +#define ATMIO_FLAG_LLCSNAP 0x0002 /* same as ATM_PH_LLCSNAP */ +#define ATMIO_FLAG_NG 0x0010 /* owned by netgraph */ +#define ATMIO_FLAG_HARP 0x0020 /* owned by HARP */ +#define ATMIO_FLAG_NORX 0x0100 /* not receiving on this VCC */ +#define ATMIO_FLAG_NOTX 0x0200 /* not transmitting on this VCC */ +#define ATMIO_FLAG_PVC 0x0400 /* this is a PVC */ +#define ATMIO_FLAG_ASYNC 0x0800 /* async open/close */ +#define ATMIO_FLAGS "\020\2LLCSNAP\5NG\6HARP\11NORX\12NOTX\13PVC\14ASYNC" + +#define ATMIO_AAL_0 0 /* pure cells */ +#define ATMIO_AAL_34 4 /* AAL3 and 4 */ +#define ATMIO_AAL_5 5 /* AAL5 */ +#define ATMIO_AAL_RAW 10 /* whatever the card does */ + +#define ATMIO_TRAFFIC_UBR 0 +#define ATMIO_TRAFFIC_CBR 1 +#define ATMIO_TRAFFIC_ABR 2 +#define ATMIO_TRAFFIC_VBR 3 + +/* + * VCC table + * + * Keep in sync with ng_atm.h + */ +struct atmio_vcctable { + uint32_t count; /* number of vccs */ + struct atmio_vcc vccs[0]; /* array of VCCs */ +}; + +/* + * Peak cell rates for various physical media. Note, that there are + * different opinions on what the correct values are. + */ +#define ATM_RATE_25_6M 59259 +#define ATM_RATE_155M 353208 +#define ATM_RATE_622M 1412830 +#define ATM_RATE_2_4G 5651320 + +#ifdef _KERNEL +/* + * Common fields for all ATM interfaces. Each driver's softc must start with + * this structure. + */ +struct ifatm { + struct ifnet *ifp; + struct ifatm_mib mib; /* exported data */ + void *phy; /* usually SUNI */ + void *ngpriv; /* netgraph link */ +}; +#define IFP2IFATM(ifp) ((struct ifatm *)(ifp)->if_l2com) +#endif + +/* + * Keep structures in sync with ng_atm.h + * + * These are used by netgraph/harp to call the driver + * NATM uses the atm_pseudoioctl instead. + */ +struct atmio_openvcc { + void *rxhand; /* handle argument */ + struct atmio_vcc param; /* parameters */ +}; + +struct atmio_closevcc { + uint16_t vpi; + uint16_t vci; +}; + +#if defined(__NetBSD__) || defined(__OpenBSD__) || defined(__bsdi__) +#define RTALLOC1(A,B) rtalloc1((A),(B)) +#elif defined(__FreeBSD__) +#define RTALLOC1(A,B) rtalloc1((A),(B),0UL) +#endif + +/* + * pseudo header for packet transmission + */ +struct atm_pseudohdr { + uint8_t atm_ph[4]; /* flags+VPI+VCI1(msb)+VCI2(lsb) */ +}; + +#define ATM_PH_FLAGS(X) ((X)->atm_ph[0]) +#define ATM_PH_VPI(X) ((X)->atm_ph[1]) +#define ATM_PH_VCI(X) ((((X)->atm_ph[2]) << 8) | ((X)->atm_ph[3])) +#define ATM_PH_SETVCI(X,V) { \ + (X)->atm_ph[2] = ((V) >> 8) & 0xff; \ + (X)->atm_ph[3] = ((V) & 0xff); \ +} + +/* use AAL5? 
(0 == aal0) */ +#define ATM_PH_AAL5 0x01 +/* use the LLC SNAP encoding (iff aal5) */ +#define ATM_PH_LLCSNAP ATMIO_FLAG_LLCSNAP + +#define ATM_PH_DRIVER7 0x40 /* reserve for driver's use */ +#define ATM_PH_DRIVER8 0x80 /* reserve for driver's use */ + +#define ATMMTU 9180 /* ATM MTU size for IP */ + /* XXX: could be 9188 with LLC/SNAP according + to comer */ + +#define SIOCATMGETVCCS _IOW('a', 125, struct atmio_vcctable) +#define SIOCATMOPENVCC _IOR('a', 126, struct atmio_openvcc) +#define SIOCATMCLOSEVCC _IOR('a', 127, struct atmio_closevcc) + +#define SIOCATMGVCCS _IOWR('i', 230, struct ifreq) + +/* + * XXX forget all the garbage in if_llc.h and do it the easy way + */ +#define ATMLLC_HDR "\252\252\3\0\0\0" +struct atmllc { + uint8_t llchdr[6]; /* aa.aa.03.00.00.00 */ + uint8_t type[2]; /* "ethernet" type */ +}; + +/* ATM_LLC macros: note type code in host byte order */ +#define ATM_LLC_TYPE(X) (((X)->type[0] << 8) | ((X)->type[1])) +#define ATM_LLC_SETTYPE(X, V) do { \ + (X)->type[0] = ((V) >> 8) & 0xff; \ + (X)->type[1] = ((V) & 0xff); \ + } while (0) + +/* + * Events that are emitted by the driver. Currently the only consumer + * of this is the netgraph node. + */ +#define ATMEV_FLOW_CONTROL 0x0001 /* channel busy state changed */ +#define ATMEV_IFSTATE_CHANGED 0x0002 /* up/down or carrier */ +#define ATMEV_VCC_CHANGED 0x0003 /* PVC deleted/create */ +#define ATMEV_ACR_CHANGED 0x0004 /* ABR ACR has changed */ + +struct atmev_flow_control { + uint16_t vpi; /* channel that is changed */ + uint16_t vci; + u_int busy : 1; /* != 0 -> ATM layer busy */ +}; + +struct atmev_ifstate_changed { + u_int running : 1; /* interface is running now */ + u_int carrier : 1; /* carrier detected (or not) */ +}; + +struct atmev_vcc_changed { + uint16_t vpi; /* channel that is changed */ + uint16_t vci; + u_int up : 1; /* 1 - created, 0 - deleted */ +}; + +struct atmev_acr_changed { + uint16_t vpi; /* channel that is changed */ + uint16_t vci; + uint32_t acr; /* new ACR */ +}; + +#ifdef _KERNEL +void atm_ifattach(struct ifnet *); +void atm_ifdetach(struct ifnet *); +void atm_input(struct ifnet *, struct atm_pseudohdr *, + struct mbuf *, void *); +int atm_output(struct ifnet *, struct mbuf *, struct sockaddr *, + struct route *); +struct atmio_vcctable *atm_getvccs(struct atmio_vcc **, u_int, u_int, + struct mtx *, int); + +void atm_event(struct ifnet *, u_int, void *); + +#define ATMEV_SEND_FLOW_CONTROL(ATMIF, VPI, VCI, BUSY) \ + do { \ + struct atmev_flow_control _arg; \ + _arg.vpi = (VPI); \ + _arg.vci = (VCI); \ + _arg.busy = (BUSY); \ + atm_event((ATMIF)->ifp, ATMEV_FLOW_CONTROL, &_arg); \ + } while (0) + +#define ATMEV_SEND_VCC_CHANGED(ATMIF, VPI, VCI, UP) \ + do { \ + struct atmev_vcc_changed _arg; \ + _arg.vpi = (VPI); \ + _arg.vci = (VCI); \ + _arg.up = (UP); \ + atm_event((ATMIF)->ifp, ATMEV_VCC_CHANGED, &_arg); \ + } while (0) + +#define ATMEV_SEND_IFSTATE_CHANGED(ATMIF, CARRIER) \ + do { \ + struct atmev_ifstate_changed _arg; \ + _arg.running = (((ATMIF)->ifp->if_drv_flags & \ + IFF_DRV_RUNNING) != 0); \ + _arg.carrier = ((CARRIER) != 0); \ + atm_event((ATMIF)->ifp, ATMEV_IFSTATE_CHANGED, &_arg); \ + } while (0) + +#define ATMEV_SEND_ACR_CHANGED(ATMIF, VPI, VCI, ACR) \ + do { \ + struct atmev_acr_changed _arg; \ + _arg.vpi = (VPI); \ + _arg.vci = (VCI); \ + _arg.acr= (ACR); \ + atm_event((ATMIF)->ifp, ATMEV_ACR_CHANGED, &_arg); \ + } while (0) +#endif diff --git a/freebsd/sys/net/if_atmsubr.c b/freebsd/sys/net/if_atmsubr.c new file mode 100644 index 00000000..7daa347f --- /dev/null +++ 
b/freebsd/sys/net/if_atmsubr.c
@@ -0,0 +1,504 @@
+#include
+
+/* $NetBSD: if_atmsubr.c,v 1.10 1997/03/11 23:19:51 chuck Exp $ */
+
+/*-
+ *
+ * Copyright (c) 1996 Charles D. Cranor and Washington University.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *      This product includes software developed by Charles D. Cranor and
+ *      Washington University.
+ * 4. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * if_atmsubr.c
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include /* XXX: for ETHERTYPE_* */
+#if defined(INET) || defined(INET6)
+#include
+#endif
+#ifdef NATM
+#include
+#endif
+
+#include
+
+/*
+ * Netgraph interface functions.
+ * These need not be protected by a lock, because ng_atm nodes are persistent.
+ * The ng_atm module can be unloaded only if all ATM interfaces have been
+ * unloaded, so nobody should be in the code paths accessing these function
+ * pointers.
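+ *
+ * (Editor's note: the convention is that the optional module assigns
+ * these hooks at MOD_LOAD and clears them at MOD_UNLOAD; a sketch
+ * with a hypothetical handler name follows.)
+ */
+#if 0
+static void
+example_ng_attach(struct ifnet *ifp)
+{
+	/* per-ifnet setup performed by the optional netgraph module */
+}
+
+/* in that module's modevent handler:                */
+/* MOD_LOAD:   ng_atm_attach_p = example_ng_attach;  */
+/* MOD_UNLOAD: ng_atm_attach_p = NULL;               */
+#endif
+/*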
+ */ +void (*ng_atm_attach_p)(struct ifnet *); +void (*ng_atm_detach_p)(struct ifnet *); +int (*ng_atm_output_p)(struct ifnet *, struct mbuf **); +void (*ng_atm_input_p)(struct ifnet *, struct mbuf **, + struct atm_pseudohdr *, void *); +void (*ng_atm_input_orphan_p)(struct ifnet *, struct mbuf *, + struct atm_pseudohdr *, void *); +void (*ng_atm_event_p)(struct ifnet *, uint32_t, void *); + +/* + * Harp pseudo interface hooks + */ +void (*atm_harp_input_p)(struct ifnet *ifp, struct mbuf **m, + struct atm_pseudohdr *ah, void *rxhand); +void (*atm_harp_attach_p)(struct ifnet *); +void (*atm_harp_detach_p)(struct ifnet *); +void (*atm_harp_event_p)(struct ifnet *, uint32_t, void *); + +SYSCTL_NODE(_hw, OID_AUTO, atm, CTLFLAG_RW, 0, "ATM hardware"); + +MALLOC_DEFINE(M_IFATM, "ifatm", "atm interface internals"); + +#ifndef ETHERTYPE_IPV6 +#define ETHERTYPE_IPV6 0x86dd +#endif + +#define senderr(e) do { error = (e); goto bad; } while (0) + +/* + * atm_output: ATM output routine + * inputs: + * "ifp" = ATM interface to output to + * "m0" = the packet to output + * "dst" = the sockaddr to send to (either IP addr, or raw VPI/VCI) + * "ro" = the route to use + * returns: error code [0 == ok] + * + * note: special semantic: if (dst == NULL) then we assume "m" already + * has an atm_pseudohdr on it and just send it directly. + * [for native mode ATM output] if dst is null, then + * ro->ro_rt must also be NULL. + */ +int +atm_output(struct ifnet *ifp, struct mbuf *m0, struct sockaddr *dst, + struct route *ro) +{ + u_int16_t etype = 0; /* if using LLC/SNAP */ + int error = 0, sz; + struct atm_pseudohdr atmdst, *ad; + struct mbuf *m = m0; + struct atmllc *atmllc; + struct atmllc *llc_hdr = NULL; + u_int32_t atm_flags; + +#ifdef MAC + error = mac_ifnet_check_transmit(ifp, m); + if (error) + senderr(error); +#endif + + if (!((ifp->if_flags & IFF_UP) && + (ifp->if_drv_flags & IFF_DRV_RUNNING))) + senderr(ENETDOWN); + + /* + * check for non-native ATM traffic (dst != NULL) + */ + if (dst) { + switch (dst->sa_family) { + +#if defined(INET) || defined(INET6) + case AF_INET: + case AF_INET6: + { + if (dst->sa_family == AF_INET6) + etype = ETHERTYPE_IPV6; + else + etype = ETHERTYPE_IP; + if (!atmresolve(ro->ro_rt, m, dst, &atmdst)) { + m = NULL; + /* XXX: atmresolve already free'd it */ + senderr(EHOSTUNREACH); + /* XXX: put ATMARP stuff here */ + /* XXX: watch who frees m on failure */ + } + } + break; +#endif /* INET || INET6 */ + + case AF_UNSPEC: + /* + * XXX: bpfwrite. 
assuming dst contains 12 bytes + * (atm pseudo header (4) + LLC/SNAP (8)) + */ + bcopy(dst->sa_data, &atmdst, sizeof(atmdst)); + llc_hdr = (struct atmllc *)(dst->sa_data + + sizeof(atmdst)); + break; + + default: + printf("%s: can't handle af%d\n", ifp->if_xname, + dst->sa_family); + senderr(EAFNOSUPPORT); + } + + /* + * must add atm_pseudohdr to data + */ + sz = sizeof(atmdst); + atm_flags = ATM_PH_FLAGS(&atmdst); + if (atm_flags & ATM_PH_LLCSNAP) + sz += 8; /* sizeof snap == 8 */ + M_PREPEND(m, sz, M_DONTWAIT); + if (m == 0) + senderr(ENOBUFS); + ad = mtod(m, struct atm_pseudohdr *); + *ad = atmdst; + if (atm_flags & ATM_PH_LLCSNAP) { + atmllc = (struct atmllc *)(ad + 1); + if (llc_hdr == NULL) { + bcopy(ATMLLC_HDR, atmllc->llchdr, + sizeof(atmllc->llchdr)); + /* note: in host order */ + ATM_LLC_SETTYPE(atmllc, etype); + } + else + bcopy(llc_hdr, atmllc, sizeof(struct atmllc)); + } + } + + if (ng_atm_output_p != NULL) { + if ((error = (*ng_atm_output_p)(ifp, &m)) != 0) { + if (m != NULL) + m_freem(m); + return (error); + } + if (m == NULL) + return (0); + } + + /* + * Queue message on interface, and start output if interface + * not yet active. + */ + if (!IF_HANDOFF_ADJ(&ifp->if_snd, m, ifp, + -(int)sizeof(struct atm_pseudohdr))) + return (ENOBUFS); + return (error); + +bad: + if (m) + m_freem(m); + return (error); +} + +/* + * Process a received ATM packet; + * the packet is in the mbuf chain m. + */ +void +atm_input(struct ifnet *ifp, struct atm_pseudohdr *ah, struct mbuf *m, + void *rxhand) +{ + int isr; + u_int16_t etype = ETHERTYPE_IP; /* default */ + + if ((ifp->if_flags & IFF_UP) == 0) { + m_freem(m); + return; + } +#ifdef MAC + mac_ifnet_create_mbuf(ifp, m); +#endif + ifp->if_ibytes += m->m_pkthdr.len; + + if (ng_atm_input_p != NULL) { + (*ng_atm_input_p)(ifp, &m, ah, rxhand); + if (m == NULL) + return; + } + + /* not eaten by ng_atm. Maybe it's a pseudo-harp PDU? */ + if (atm_harp_input_p != NULL) { + (*atm_harp_input_p)(ifp, &m, ah, rxhand); + if (m == NULL) + return; + } + + if (rxhand) { +#ifdef NATM + struct natmpcb *npcb; + + /* + * XXXRW: this use of 'rxhand' is not a very good idea, and + * was subject to races even before SMPng due to the release + * of spl here. + */ + NATM_LOCK(); + npcb = rxhand; + npcb->npcb_inq++; /* count # in queue */ + isr = NETISR_NATM; + m->m_pkthdr.rcvif = rxhand; /* XXX: overload */ + NATM_UNLOCK(); +#else + printf("atm_input: NATM detected but not " + "configured in kernel\n"); + goto dropit; +#endif + } else { + /* + * handle LLC/SNAP header, if present + */ + if (ATM_PH_FLAGS(ah) & ATM_PH_LLCSNAP) { + struct atmllc *alc; + + if (m->m_len < sizeof(*alc) && + (m = m_pullup(m, sizeof(*alc))) == 0) + return; /* failed */ + alc = mtod(m, struct atmllc *); + if (bcmp(alc, ATMLLC_HDR, 6)) { + printf("%s: recv'd invalid LLC/SNAP frame " + "[vp=%d,vc=%d]\n", ifp->if_xname, + ATM_PH_VPI(ah), ATM_PH_VCI(ah)); + m_freem(m); + return; + } + etype = ATM_LLC_TYPE(alc); + m_adj(m, sizeof(*alc)); + } + + switch (etype) { + +#ifdef INET + case ETHERTYPE_IP: + isr = NETISR_IP; + break; +#endif + +#ifdef INET6 + case ETHERTYPE_IPV6: + isr = NETISR_IPV6; + break; +#endif + default: +#ifndef NATM + dropit: +#endif + if (ng_atm_input_orphan_p != NULL) + (*ng_atm_input_orphan_p)(ifp, m, ah, rxhand); + else + m_freem(m); + return; + } + } + netisr_dispatch(isr, m); +} + +/* + * Perform common duties while attaching to interface list. 
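+ *
+ * (Editor's note: per if_atm.h the driver's softc must begin with
+ * struct ifatm.  A disabled sketch of a typical attach sequence
+ * follows; the softc type and the driver name "myatm" are
+ * hypothetical.)
+ */
+#if 0
+	struct mydrv_softc *sc;		/* first member: struct ifatm */
+	struct ifnet *ifp;
+
+	ifp = if_alloc(IFT_ATM);
+	ifp->if_softc = sc;
+	if_initname(ifp, "myatm", 0);
+	atm_ifattach(ifp);	/* sets ATMMTU, atm_output, link MIB */
+#endif
+/*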
+ */
+void
+atm_ifattach(struct ifnet *ifp)
+{
+	struct ifaddr *ifa;
+	struct sockaddr_dl *sdl;
+	struct ifatm *ifatm = ifp->if_l2com;
+
+	ifp->if_addrlen = 0;
+	ifp->if_hdrlen = 0;
+	if_attach(ifp);
+	ifp->if_mtu = ATMMTU;
+	ifp->if_output = atm_output;
+#if 0
+	ifp->if_input = atm_input;
+#endif
+	ifp->if_snd.ifq_maxlen = 50;	/* dummy */
+
+	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
+		if (ifa->ifa_addr->sa_family == AF_LINK) {
+			sdl = (struct sockaddr_dl *)ifa->ifa_addr;
+			sdl->sdl_type = IFT_ATM;
+			sdl->sdl_alen = ifp->if_addrlen;
+#ifdef notyet /* if using ATMARP, store hardware address using the next line */
+			bcopy(ifp->hw_addr, LLADDR(sdl), ifp->if_addrlen);
+#endif
+			break;
+		}
+
+	ifp->if_linkmib = &ifatm->mib;
+	ifp->if_linkmiblen = sizeof(ifatm->mib);
+
+	if(ng_atm_attach_p)
+		(*ng_atm_attach_p)(ifp);
+	if (atm_harp_attach_p)
+		(*atm_harp_attach_p)(ifp);
+}
+
+/*
+ * Common stuff for detaching an ATM interface
+ */
+void
+atm_ifdetach(struct ifnet *ifp)
+{
+	if (atm_harp_detach_p)
+		(*atm_harp_detach_p)(ifp);
+	if(ng_atm_detach_p)
+		(*ng_atm_detach_p)(ifp);
+	if_detach(ifp);
+}
+
+/*
+ * Support routine for the SIOCATMGVCCS ioctl().
+ *
+ * This routine assumes that the private VCC structures used by the driver
+ * begin with a struct atmio_vcc.
+ *
+ * Return a table of VCCs in a freshly allocated memory area.
+ * Here we have a problem: we first count how many vccs we need
+ * to return. Then we allocate the memory and finally fill it in.
+ * Because we cannot lock while calling malloc, the number of active
+ * vccs may change while we're in malloc. So we allocate a couple of
+ * extra vccs and, if the space still is not enough, re-iterate.
+ *
+ * We could use an sx lock for the vcc tables.
+ */
+struct atmio_vcctable *
+atm_getvccs(struct atmio_vcc **table, u_int size, u_int start,
+    struct mtx *lock, int waitok)
+{
+	u_int cid, alloc;
+	size_t len;
+	struct atmio_vcctable *vccs;
+	struct atmio_vcc *v;
+
+	alloc = start + 10;
+	vccs = NULL;
+
+	for (;;) {
+		len = sizeof(*vccs) + alloc * sizeof(vccs->vccs[0]);
+		vccs = reallocf(vccs, len, M_TEMP,
+		    waitok ? M_WAITOK : M_NOWAIT);
+		if (vccs == NULL)
+			return (NULL);
+		bzero(vccs, len);
+
+		vccs->count = 0;
+		v = vccs->vccs;
+
+		mtx_lock(lock);
+		for (cid = 0; cid < size; cid++)
+			if (table[cid] != NULL) {
+				if (++vccs->count == alloc)
+					/* too many - try again */
+					break;
+				*v++ = *table[cid];
+			}
+		mtx_unlock(lock);
+
+		if (cid == size)
+			break;
+
+		alloc *= 2;
+	}
+	return (vccs);
+}
+
+/*
+ * Driver or channel state has changed. Inform whoever is interested
+ * in these events.
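+ *
+ * (Editor's note: drivers normally raise these events through the
+ * ATMEV_SEND_*() convenience macros from if_atm.h rather than calling
+ * atm_event() directly, e.g.:)
+ */
+#if 0
+	/* report a carrier change; "ifp" is the driver's ifnet */
+	ATMEV_SEND_IFSTATE_CHANGED(IFP2IFATM(ifp), carrier);
+#endif
+/*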
+ */ +void +atm_event(struct ifnet *ifp, u_int event, void *arg) +{ + if (ng_atm_event_p != NULL) + (*ng_atm_event_p)(ifp, event, arg); + if (atm_harp_event_p != NULL) + (*atm_harp_event_p)(ifp, event, arg); +} + +static void * +atm_alloc(u_char type, struct ifnet *ifp) +{ + struct ifatm *ifatm; + + ifatm = malloc(sizeof(struct ifatm), M_IFATM, M_WAITOK | M_ZERO); + ifatm->ifp = ifp; + + return (ifatm); +} + +static void +atm_free(void *com, u_char type) +{ + + free(com, M_IFATM); +} + +static int +atm_modevent(module_t mod, int type, void *data) +{ + switch (type) { + case MOD_LOAD: + if_register_com_alloc(IFT_ATM, atm_alloc, atm_free); + break; + case MOD_UNLOAD: + if_deregister_com_alloc(IFT_ATM); + break; + default: + return (EOPNOTSUPP); + } + + return (0); +} + +static moduledata_t atm_mod = { + "atm", + atm_modevent, + 0 +}; + +DECLARE_MODULE(atm, atm_mod, SI_SUB_INIT_IF, SI_ORDER_ANY); +MODULE_VERSION(atm, 1); diff --git a/freebsd/sys/net/if_bridge.c b/freebsd/sys/net/if_bridge.c new file mode 100644 index 00000000..de7aea04 --- /dev/null +++ b/freebsd/sys/net/if_bridge.c @@ -0,0 +1,3458 @@ +#include + +/* $NetBSD: if_bridge.c,v 1.31 2005/06/01 19:45:34 jdc Exp $ */ + +/* + * Copyright 2001 Wasabi Systems, Inc. + * All rights reserved. + * + * Written by Jason R. Thorpe for Wasabi Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed for the NetBSD Project by + * Wasabi Systems, Inc. + * 4. The name of Wasabi Systems, Inc. may not be used to endorse + * or promote products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright (c) 1999, 2000 Jason L. Wright (jason@thought.net) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * OpenBSD: if_bridge.c,v 1.60 2001/06/15 03:38:33 itojun Exp
+ */
+
+/*
+ * Network interface bridge support.
+ *
+ * TODO:
+ *
+ * - Currently only supports Ethernet-like interfaces (Ethernet,
+ *   802.11, VLANs on Ethernet, etc.)  Figure out a nice way
+ *   to bridge other types of interfaces (FDDI-FDDI, and maybe
+ *   consider heterogeneous bridges).
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include /* for net/if.h */
+#include
+#include /* string functions */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include /* for struct arpcom */
+#include
+#include
+#include
+#include
+#ifdef INET6
+#include
+#include
+#endif
+#if defined(INET) || defined(INET6)
+#include
+#endif
+#include
+#include /* for struct arpcom */
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+/*
+ * Size of the route hash table.  Must be a power of two.
+ */
+#ifndef BRIDGE_RTHASH_SIZE
+#define BRIDGE_RTHASH_SIZE	1024
+#endif
+
+#define BRIDGE_RTHASH_MASK	(BRIDGE_RTHASH_SIZE - 1)
+
+/*
+ * Maximum number of addresses to cache.
+ */
+#ifndef BRIDGE_RTABLE_MAX
+#define BRIDGE_RTABLE_MAX	100
+#endif
+
+/*
+ * Timeout (in seconds) for entries learned dynamically.
+ */
+#ifndef BRIDGE_RTABLE_TIMEOUT
+#define BRIDGE_RTABLE_TIMEOUT	(20 * 60)	/* same as ARP */
+#endif
+
+/*
+ * Number of seconds between walks of the route list.
+ */
+#ifndef BRIDGE_RTABLE_PRUNE_PERIOD
+#define BRIDGE_RTABLE_PRUNE_PERIOD	(5 * 60)
+#endif
+
+/*
+ * List of capabilities to possibly mask on the member interface.
+ */
+#define BRIDGE_IFCAPS_MASK	(IFCAP_TOE|IFCAP_TSO|IFCAP_TXCSUM)
+
+/*
+ * List of capabilities to strip
+ */
+#define BRIDGE_IFCAPS_STRIP	IFCAP_LRO
+
+/*
+ * Bridge interface list entry.
+ */
+struct bridge_iflist {
+	LIST_ENTRY(bridge_iflist) bif_next;
+	struct ifnet		*bif_ifp;	/* member if */
+	struct bstp_port	bif_stp;	/* STP state */
+	uint32_t		bif_flags;	/* member if flags */
+	int			bif_savedcaps;	/* saved capabilities */
+	uint32_t		bif_addrmax;	/* max # of addresses */
+	uint32_t		bif_addrcnt;	/* cur. # of addresses */
+	uint32_t		bif_addrexceeded;/* # of address violations */
+};
+
+/*
+ * Bridge route node.
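+ *
+ * (Editor's note: each node is linked into both a hash table keyed
+ * on (address, vlan) and a flat list.  Bucket selection uses the
+ * power-of-two mask; a sketch follows, with bridge_rthash() as an
+ * assumed hash-helper name, not shown in this hunk.)
+ */
+#if 0
+	/* sketch: locate the hash bucket for a learned address */
+	idx = bridge_rthash(sc, addr) & BRIDGE_RTHASH_MASK;
+	LIST_FOREACH(brt, &sc->sc_rthash[idx], brt_hash)
+		if (memcmp(brt->brt_addr, addr, ETHER_ADDR_LEN) == 0)
+			break;
+#endif
+/*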
+ */ +struct bridge_rtnode { + LIST_ENTRY(bridge_rtnode) brt_hash; /* hash table linkage */ + LIST_ENTRY(bridge_rtnode) brt_list; /* list linkage */ + struct bridge_iflist *brt_dst; /* destination if */ + unsigned long brt_expire; /* expiration time */ + uint8_t brt_flags; /* address flags */ + uint8_t brt_addr[ETHER_ADDR_LEN]; + uint16_t brt_vlan; /* vlan id */ +}; +#define brt_ifp brt_dst->bif_ifp + +/* + * Software state for each bridge. + */ +struct bridge_softc { + struct ifnet *sc_ifp; /* make this an interface */ + LIST_ENTRY(bridge_softc) sc_list; + struct mtx sc_mtx; + struct cv sc_cv; + uint32_t sc_brtmax; /* max # of addresses */ + uint32_t sc_brtcnt; /* cur. # of addresses */ + uint32_t sc_brttimeout; /* rt timeout in seconds */ + struct callout sc_brcallout; /* bridge callout */ + uint32_t sc_iflist_ref; /* refcount for sc_iflist */ + uint32_t sc_iflist_xcnt; /* refcount for sc_iflist */ + LIST_HEAD(, bridge_iflist) sc_iflist; /* member interface list */ + LIST_HEAD(, bridge_rtnode) *sc_rthash; /* our forwarding table */ + LIST_HEAD(, bridge_rtnode) sc_rtlist; /* list version of above */ + uint32_t sc_rthash_key; /* key for hash */ + LIST_HEAD(, bridge_iflist) sc_spanlist; /* span ports list */ + struct bstp_state sc_stp; /* STP state */ + uint32_t sc_brtexceeded; /* # of cache drops */ + struct ifnet *sc_ifaddr; /* member mac copied from */ + u_char sc_defaddr[6]; /* Default MAC address */ +}; + +static struct mtx bridge_list_mtx; +eventhandler_tag bridge_detach_cookie = NULL; + +int bridge_rtable_prune_period = BRIDGE_RTABLE_PRUNE_PERIOD; + +uma_zone_t bridge_rtnode_zone; + +static int bridge_clone_create(struct if_clone *, int, caddr_t); +static void bridge_clone_destroy(struct ifnet *); + +static int bridge_ioctl(struct ifnet *, u_long, caddr_t); +static void bridge_mutecaps(struct bridge_softc *); +static void bridge_set_ifcap(struct bridge_softc *, struct bridge_iflist *, + int); +static void bridge_ifdetach(void *arg __unused, struct ifnet *); +static void bridge_init(void *); +static void bridge_dummynet(struct mbuf *, struct ifnet *); +static void bridge_stop(struct ifnet *, int); +static void bridge_start(struct ifnet *); +static struct mbuf *bridge_input(struct ifnet *, struct mbuf *); +static int bridge_output(struct ifnet *, struct mbuf *, struct sockaddr *, + struct rtentry *); +static void bridge_enqueue(struct bridge_softc *, struct ifnet *, + struct mbuf *); +static void bridge_rtdelete(struct bridge_softc *, struct ifnet *ifp, int); + +static void bridge_forward(struct bridge_softc *, struct bridge_iflist *, + struct mbuf *m); + +static void bridge_timer(void *); + +static void bridge_broadcast(struct bridge_softc *, struct ifnet *, + struct mbuf *, int); +static void bridge_span(struct bridge_softc *, struct mbuf *); + +static int bridge_rtupdate(struct bridge_softc *, const uint8_t *, + uint16_t, struct bridge_iflist *, int, uint8_t); +static struct ifnet *bridge_rtlookup(struct bridge_softc *, const uint8_t *, + uint16_t); +static void bridge_rttrim(struct bridge_softc *); +static void bridge_rtage(struct bridge_softc *); +static void bridge_rtflush(struct bridge_softc *, int); +static int bridge_rtdaddr(struct bridge_softc *, const uint8_t *, + uint16_t); + +static int bridge_rtable_init(struct bridge_softc *); +static void bridge_rtable_fini(struct bridge_softc *); + +static int bridge_rtnode_addr_cmp(const uint8_t *, const uint8_t *); +static struct bridge_rtnode *bridge_rtnode_lookup(struct bridge_softc *, + const uint8_t *, uint16_t); +static int 
bridge_rtnode_insert(struct bridge_softc *, + struct bridge_rtnode *); +static void bridge_rtnode_destroy(struct bridge_softc *, + struct bridge_rtnode *); +static void bridge_rtable_expire(struct ifnet *, int); +static void bridge_state_change(struct ifnet *, int); + +static struct bridge_iflist *bridge_lookup_member(struct bridge_softc *, + const char *name); +static struct bridge_iflist *bridge_lookup_member_if(struct bridge_softc *, + struct ifnet *ifp); +static void bridge_delete_member(struct bridge_softc *, + struct bridge_iflist *, int); +static void bridge_delete_span(struct bridge_softc *, + struct bridge_iflist *); + +static int bridge_ioctl_add(struct bridge_softc *, void *); +static int bridge_ioctl_del(struct bridge_softc *, void *); +static int bridge_ioctl_gifflags(struct bridge_softc *, void *); +static int bridge_ioctl_sifflags(struct bridge_softc *, void *); +static int bridge_ioctl_scache(struct bridge_softc *, void *); +static int bridge_ioctl_gcache(struct bridge_softc *, void *); +static int bridge_ioctl_gifs(struct bridge_softc *, void *); +static int bridge_ioctl_rts(struct bridge_softc *, void *); +static int bridge_ioctl_saddr(struct bridge_softc *, void *); +static int bridge_ioctl_sto(struct bridge_softc *, void *); +static int bridge_ioctl_gto(struct bridge_softc *, void *); +static int bridge_ioctl_daddr(struct bridge_softc *, void *); +static int bridge_ioctl_flush(struct bridge_softc *, void *); +static int bridge_ioctl_gpri(struct bridge_softc *, void *); +static int bridge_ioctl_spri(struct bridge_softc *, void *); +static int bridge_ioctl_ght(struct bridge_softc *, void *); +static int bridge_ioctl_sht(struct bridge_softc *, void *); +static int bridge_ioctl_gfd(struct bridge_softc *, void *); +static int bridge_ioctl_sfd(struct bridge_softc *, void *); +static int bridge_ioctl_gma(struct bridge_softc *, void *); +static int bridge_ioctl_sma(struct bridge_softc *, void *); +static int bridge_ioctl_sifprio(struct bridge_softc *, void *); +static int bridge_ioctl_sifcost(struct bridge_softc *, void *); +static int bridge_ioctl_sifmaxaddr(struct bridge_softc *, void *); +static int bridge_ioctl_addspan(struct bridge_softc *, void *); +static int bridge_ioctl_delspan(struct bridge_softc *, void *); +static int bridge_ioctl_gbparam(struct bridge_softc *, void *); +static int bridge_ioctl_grte(struct bridge_softc *, void *); +static int bridge_ioctl_gifsstp(struct bridge_softc *, void *); +static int bridge_ioctl_sproto(struct bridge_softc *, void *); +static int bridge_ioctl_stxhc(struct bridge_softc *, void *); +static int bridge_pfil(struct mbuf **, struct ifnet *, struct ifnet *, + int); +static int bridge_ip_checkbasic(struct mbuf **mp); +#ifdef INET6 +static int bridge_ip6_checkbasic(struct mbuf **mp); +#endif /* INET6 */ +static int bridge_fragment(struct ifnet *, struct mbuf *, + struct ether_header *, int, struct llc *); + +/* The default bridge vlan is 1 (IEEE 802.1Q-2003 Table 9-2) */ +#define VLANTAGOF(_m) \ + (_m->m_flags & M_VLANTAG) ? 
EVL_VLANOFTAG(_m->m_pkthdr.ether_vtag) : 1 + +static struct bstp_cb_ops bridge_ops = { + .bcb_state = bridge_state_change, + .bcb_rtage = bridge_rtable_expire +}; + +SYSCTL_DECL(_net_link); +SYSCTL_NODE(_net_link, IFT_BRIDGE, bridge, CTLFLAG_RW, 0, "Bridge"); + +static int pfil_onlyip = 1; /* only pass IP[46] packets when pfil is enabled */ +static int pfil_bridge = 1; /* run pfil hooks on the bridge interface */ +static int pfil_member = 1; /* run pfil hooks on the member interface */ +static int pfil_ipfw = 0; /* layer2 filter with ipfw */ +static int pfil_ipfw_arp = 0; /* layer2 filter with ipfw */ +static int pfil_local_phys = 0; /* run pfil hooks on the physical interface for + locally destined packets */ +static int log_stp = 0; /* log STP state changes */ +static int bridge_inherit_mac = 0; /* share MAC with first bridge member */ +SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_onlyip, CTLFLAG_RW, + &pfil_onlyip, 0, "Only pass IP packets when pfil is enabled"); +SYSCTL_INT(_net_link_bridge, OID_AUTO, ipfw_arp, CTLFLAG_RW, + &pfil_ipfw_arp, 0, "Filter ARP packets through IPFW layer2"); +SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_bridge, CTLFLAG_RW, + &pfil_bridge, 0, "Packet filter on the bridge interface"); +SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_member, CTLFLAG_RW, + &pfil_member, 0, "Packet filter on the member interface"); +SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_local_phys, CTLFLAG_RW, + &pfil_local_phys, 0, + "Packet filter on the physical interface for locally destined packets"); +SYSCTL_INT(_net_link_bridge, OID_AUTO, log_stp, CTLFLAG_RW, + &log_stp, 0, "Log STP state changes"); +SYSCTL_INT(_net_link_bridge, OID_AUTO, inherit_mac, CTLFLAG_RW, + &bridge_inherit_mac, 0, + "Inherit MAC address from the first bridge member"); + +struct bridge_control { + int (*bc_func)(struct bridge_softc *, void *); + int bc_argsize; + int bc_flags; +}; + +#define BC_F_COPYIN 0x01 /* copy arguments in */ +#define BC_F_COPYOUT 0x02 /* copy arguments out */ +#define BC_F_SUSER 0x04 /* do super-user check */ + +const struct bridge_control bridge_control_table[] = { + { bridge_ioctl_add, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + { bridge_ioctl_del, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_gifflags, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_COPYOUT }, + { bridge_ioctl_sifflags, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_scache, sizeof(struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + { bridge_ioctl_gcache, sizeof(struct ifbrparam), + BC_F_COPYOUT }, + + { bridge_ioctl_gifs, sizeof(struct ifbifconf), + BC_F_COPYIN|BC_F_COPYOUT }, + { bridge_ioctl_rts, sizeof(struct ifbaconf), + BC_F_COPYIN|BC_F_COPYOUT }, + + { bridge_ioctl_saddr, sizeof(struct ifbareq), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_sto, sizeof(struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + { bridge_ioctl_gto, sizeof(struct ifbrparam), + BC_F_COPYOUT }, + + { bridge_ioctl_daddr, sizeof(struct ifbareq), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_flush, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_gpri, sizeof(struct ifbrparam), + BC_F_COPYOUT }, + { bridge_ioctl_spri, sizeof(struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_ght, sizeof(struct ifbrparam), + BC_F_COPYOUT }, + { bridge_ioctl_sht, sizeof(struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_gfd, sizeof(struct ifbrparam), + BC_F_COPYOUT }, + { bridge_ioctl_sfd, sizeof(struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_gma, 
sizeof(struct ifbrparam), + BC_F_COPYOUT }, + { bridge_ioctl_sma, sizeof(struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_sifprio, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_sifcost, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_addspan, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + { bridge_ioctl_delspan, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_gbparam, sizeof(struct ifbropreq), + BC_F_COPYOUT }, + + { bridge_ioctl_grte, sizeof(struct ifbrparam), + BC_F_COPYOUT }, + + { bridge_ioctl_gifsstp, sizeof(struct ifbpstpconf), + BC_F_COPYIN|BC_F_COPYOUT }, + + { bridge_ioctl_sproto, sizeof(struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_stxhc, sizeof(struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_sifmaxaddr, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + +}; +const int bridge_control_table_size = + sizeof(bridge_control_table) / sizeof(bridge_control_table[0]); + +LIST_HEAD(, bridge_softc) bridge_list; + +IFC_SIMPLE_DECLARE(bridge, 0); + +static int +bridge_modevent(module_t mod, int type, void *data) +{ + + switch (type) { + case MOD_LOAD: + mtx_init(&bridge_list_mtx, "if_bridge list", NULL, MTX_DEF); + if_clone_attach(&bridge_cloner); + bridge_rtnode_zone = uma_zcreate("bridge_rtnode", + sizeof(struct bridge_rtnode), NULL, NULL, NULL, NULL, + UMA_ALIGN_PTR, 0); + LIST_INIT(&bridge_list); + bridge_input_p = bridge_input; + bridge_output_p = bridge_output; + bridge_dn_p = bridge_dummynet; + bridge_detach_cookie = EVENTHANDLER_REGISTER( + ifnet_departure_event, bridge_ifdetach, NULL, + EVENTHANDLER_PRI_ANY); + break; + case MOD_UNLOAD: + EVENTHANDLER_DEREGISTER(ifnet_departure_event, + bridge_detach_cookie); + if_clone_detach(&bridge_cloner); + uma_zdestroy(bridge_rtnode_zone); + bridge_input_p = NULL; + bridge_output_p = NULL; + bridge_dn_p = NULL; + mtx_destroy(&bridge_list_mtx); + break; + default: + return (EOPNOTSUPP); + } + return (0); +} + +static moduledata_t bridge_mod = { + "if_bridge", + bridge_modevent, + 0 +}; + +DECLARE_MODULE(if_bridge, bridge_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); +MODULE_DEPEND(if_bridge, bridgestp, 1, 1, 1); + +/* + * handler for net.link.bridge.pfil_ipfw + */ +static int +sysctl_pfil_ipfw(SYSCTL_HANDLER_ARGS) +{ + int enable = pfil_ipfw; + int error; + + error = sysctl_handle_int(oidp, &enable, 0, req); + enable = (enable) ? 1 : 0; + + if (enable != pfil_ipfw) { + pfil_ipfw = enable; + + /* + * Disable pfil so that ipfw doesnt run twice, if the user + * really wants both then they can re-enable pfil_bridge and/or + * pfil_member. Also allow non-ip packets as ipfw can filter by + * layer2 type. + */ + if (pfil_ipfw) { + pfil_onlyip = 0; + pfil_bridge = 0; + pfil_member = 0; + } + } + + return (error); +} +SYSCTL_PROC(_net_link_bridge, OID_AUTO, ipfw, CTLTYPE_INT|CTLFLAG_RW, + &pfil_ipfw, 0, &sysctl_pfil_ipfw, "I", "Layer2 filter with IPFW"); + +/* + * bridge_clone_create: + * + * Create a new bridge instance. + */ +static int +bridge_clone_create(struct if_clone *ifc, int unit, caddr_t params) +{ + struct bridge_softc *sc, *sc2; + struct ifnet *bifp, *ifp; + int retry; + + sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO); + ifp = sc->sc_ifp = if_alloc(IFT_ETHER); + if (ifp == NULL) { + free(sc, M_DEVBUF); + return (ENOSPC); + } + + BRIDGE_LOCK_INIT(sc); + sc->sc_brtmax = BRIDGE_RTABLE_MAX; + sc->sc_brttimeout = BRIDGE_RTABLE_TIMEOUT; + + /* Initialize our routing table. 
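+ */
+
+/*
+ * Editor's illustrative sketch, not part of the original source: the
+ * sysctl_pfil_ipfw() handler above normalizes the written value to 0/1
+ * and, only when it actually changes, forces the pfil_* knobs off so
+ * ipfw(4) does not filter the same frame twice.  The same
+ * read-normalize-couple pattern in plain C, with made-up variable
+ * names:
+ */
+#if 0
+static int my_ipfw, my_onlyip = 1, my_bridge = 1, my_member = 1;
+
+static void
+set_ipfw_knob(int value)
+{
+	value = (value != 0);		/* normalize, as the handler does */
+	if (value == my_ipfw)
+		return;			/* no change, nothing to couple */
+	my_ipfw = value;
+	if (my_ipfw) {
+		/* run only one layer2 filter; user may re-enable these */
+		my_onlyip = 0;
+		my_bridge = 0;
+		my_member = 0;
+	}
+}
+#endif
+
+/*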
*/ + bridge_rtable_init(sc); + + callout_init_mtx(&sc->sc_brcallout, &sc->sc_mtx, 0); + + LIST_INIT(&sc->sc_iflist); + LIST_INIT(&sc->sc_spanlist); + + ifp->if_softc = sc; + if_initname(ifp, ifc->ifc_name, unit); + ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; + ifp->if_ioctl = bridge_ioctl; + ifp->if_start = bridge_start; + ifp->if_init = bridge_init; + ifp->if_type = IFT_BRIDGE; + IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen); + ifp->if_snd.ifq_drv_maxlen = ifqmaxlen; + IFQ_SET_READY(&ifp->if_snd); + + /* + * Generate a random ethernet address with a locally administered + * address. + * + * Since we are using random ethernet addresses for the bridge, it is + * possible that we might have address collisions, so make sure that + * this hardware address isn't already in use on another bridge. + */ + for (retry = 1; retry != 0;) { + arc4rand(sc->sc_defaddr, ETHER_ADDR_LEN, 1); + sc->sc_defaddr[0] &= ~1; /* clear multicast bit */ + sc->sc_defaddr[0] |= 2; /* set the LAA bit */ + retry = 0; + mtx_lock(&bridge_list_mtx); + LIST_FOREACH(sc2, &bridge_list, sc_list) { + bifp = sc2->sc_ifp; + if (memcmp(sc->sc_defaddr, + IF_LLADDR(bifp), ETHER_ADDR_LEN) == 0) + retry = 1; + } + mtx_unlock(&bridge_list_mtx); + } + + bstp_attach(&sc->sc_stp, &bridge_ops); + ether_ifattach(ifp, sc->sc_defaddr); + /* Now undo some of the damage... */ + ifp->if_baudrate = 0; + ifp->if_type = IFT_BRIDGE; + + mtx_lock(&bridge_list_mtx); + LIST_INSERT_HEAD(&bridge_list, sc, sc_list); + mtx_unlock(&bridge_list_mtx); + + return (0); +} + +/* + * bridge_clone_destroy: + * + * Destroy a bridge instance. + */ +static void +bridge_clone_destroy(struct ifnet *ifp) +{ + struct bridge_softc *sc = ifp->if_softc; + struct bridge_iflist *bif; + + BRIDGE_LOCK(sc); + + bridge_stop(ifp, 1); + ifp->if_flags &= ~IFF_UP; + + while ((bif = LIST_FIRST(&sc->sc_iflist)) != NULL) + bridge_delete_member(sc, bif, 0); + + while ((bif = LIST_FIRST(&sc->sc_spanlist)) != NULL) { + bridge_delete_span(sc, bif); + } + + BRIDGE_UNLOCK(sc); + + callout_drain(&sc->sc_brcallout); + + mtx_lock(&bridge_list_mtx); + LIST_REMOVE(sc, sc_list); + mtx_unlock(&bridge_list_mtx); + + bstp_detach(&sc->sc_stp); + ether_ifdetach(ifp); + if_free_type(ifp, IFT_ETHER); + + /* Tear down the routing table. */ + bridge_rtable_fini(sc); + + BRIDGE_LOCK_DESTROY(sc); + free(sc, M_DEVBUF); +} + +/* + * bridge_ioctl: + * + * Handle a control request from the operator. 
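+ */
+
+/*
+ * Editor's illustrative sketch, not part of the original source: the
+ * loop in bridge_clone_create() above builds a random Ethernet address
+ * and then forces it to be unicast and locally administered by
+ * clearing bit 0 and setting bit 1 of the first octet.  Stand-alone
+ * version, assuming a BSD-style arc4random(3):
+ */
+#if 0
+#include <stdint.h>
+#include <stdlib.h>
+
+static void
+random_laa_mac(uint8_t mac[6])
+{
+	for (int i = 0; i < 6; i++)
+		mac[i] = arc4random() & 0xff;
+	mac[0] &= ~0x01;	/* clear the group (multicast) bit */
+	mac[0] |= 0x02;		/* set the locally-administered bit */
+}
+#endif
+
+/*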
+ */ +static int +bridge_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) +{ + struct bridge_softc *sc = ifp->if_softc; + struct ifreq *ifr = (struct ifreq *)data; + struct bridge_iflist *bif; + struct thread *td = curthread; + union { + struct ifbreq ifbreq; + struct ifbifconf ifbifconf; + struct ifbareq ifbareq; + struct ifbaconf ifbaconf; + struct ifbrparam ifbrparam; + struct ifbropreq ifbropreq; + } args; + struct ifdrv *ifd = (struct ifdrv *) data; + const struct bridge_control *bc; + int error = 0; + + switch (cmd) { + + case SIOCADDMULTI: + case SIOCDELMULTI: + break; + + case SIOCGDRVSPEC: + case SIOCSDRVSPEC: + if (ifd->ifd_cmd >= bridge_control_table_size) { + error = EINVAL; + break; + } + bc = &bridge_control_table[ifd->ifd_cmd]; + + if (cmd == SIOCGDRVSPEC && + (bc->bc_flags & BC_F_COPYOUT) == 0) { + error = EINVAL; + break; + } + else if (cmd == SIOCSDRVSPEC && + (bc->bc_flags & BC_F_COPYOUT) != 0) { + error = EINVAL; + break; + } + + if (bc->bc_flags & BC_F_SUSER) { + error = priv_check(td, PRIV_NET_BRIDGE); + if (error) + break; + } + + if (ifd->ifd_len != bc->bc_argsize || + ifd->ifd_len > sizeof(args)) { + error = EINVAL; + break; + } + + bzero(&args, sizeof(args)); + if (bc->bc_flags & BC_F_COPYIN) { + error = copyin(ifd->ifd_data, &args, ifd->ifd_len); + if (error) + break; + } + + BRIDGE_LOCK(sc); + error = (*bc->bc_func)(sc, &args); + BRIDGE_UNLOCK(sc); + if (error) + break; + + if (bc->bc_flags & BC_F_COPYOUT) + error = copyout(&args, ifd->ifd_data, ifd->ifd_len); + + break; + + case SIOCSIFFLAGS: + if (!(ifp->if_flags & IFF_UP) && + (ifp->if_drv_flags & IFF_DRV_RUNNING)) { + /* + * If interface is marked down and it is running, + * then stop and disable it. + */ + BRIDGE_LOCK(sc); + bridge_stop(ifp, 1); + BRIDGE_UNLOCK(sc); + } else if ((ifp->if_flags & IFF_UP) && + !(ifp->if_drv_flags & IFF_DRV_RUNNING)) { + /* + * If interface is marked up and it is stopped, then + * start it. + */ + (*ifp->if_init)(sc); + } + break; + + case SIOCSIFMTU: + if (ifr->ifr_mtu < 576) { + error = EINVAL; + break; + } + if (LIST_EMPTY(&sc->sc_iflist)) { + sc->sc_ifp->if_mtu = ifr->ifr_mtu; + break; + } + BRIDGE_LOCK(sc); + LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { + if (bif->bif_ifp->if_mtu != ifr->ifr_mtu) { + log(LOG_NOTICE, "%s: invalid MTU: %lu(%s)" + " != %d\n", sc->sc_ifp->if_xname, + bif->bif_ifp->if_mtu, + bif->bif_ifp->if_xname, ifr->ifr_mtu); + error = EINVAL; + break; + } + } + if (!error) + sc->sc_ifp->if_mtu = ifr->ifr_mtu; + BRIDGE_UNLOCK(sc); + break; + default: + /* + * drop the lock as ether_ioctl() will call bridge_start() and + * cause the lock to be recursed. 
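+ */
+
+/*
+ * Editor's illustrative sketch, not part of the original source: the
+ * SIOC[GS]DRVSPEC arms of bridge_ioctl() above treat ifd_cmd as an
+ * index into bridge_control_table and insist that ifd_len equal the
+ * entry's bc_argsize.  A user-space caller, assuming the BRDGGCACHE
+ * command number from net/if_bridgevar.h (which follows the table
+ * order) and any AF_LOCAL datagram socket for 's':
+ */
+#if 0
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/sockio.h>
+#include <net/if.h>
+#include <net/if_bridgevar.h>
+#include <stdint.h>
+#include <string.h>
+
+static int
+bridge_get_cache_size(int s, const char *bridge, uint32_t *size)
+{
+	struct ifbrparam param;
+	struct ifdrv ifd;
+
+	memset(&ifd, 0, sizeof(ifd));
+	strlcpy(ifd.ifd_name, bridge, sizeof(ifd.ifd_name));
+	ifd.ifd_cmd = BRDGGCACHE;	/* index into the control table */
+	ifd.ifd_len = sizeof(param);	/* must match bc_argsize */
+	ifd.ifd_data = &param;
+	if (ioctl(s, SIOCGDRVSPEC, &ifd) == -1)
+		return (-1);
+	*size = param.ifbrp_csize;	/* copied out: BC_F_COPYOUT */
+	return (0);
+}
+#endif
+
+/*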
+ */
+		error = ether_ioctl(ifp, cmd, data);
+		break;
+	}
+
+	return (error);
+}
+
+/*
+ * bridge_mutecaps:
+ *
+ *	Clear or restore unwanted capabilities on the member interface
+ */
+static void
+bridge_mutecaps(struct bridge_softc *sc)
+{
+	struct bridge_iflist *bif;
+	int enabled, mask;
+
+	/* Initial bitmask of capabilities to test */
+	mask = BRIDGE_IFCAPS_MASK;
+
+	LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
+		/* Every member must support it or it's disabled */
+		mask &= bif->bif_savedcaps;
+	}
+
+	LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
+		enabled = bif->bif_ifp->if_capenable;
+		enabled &= ~BRIDGE_IFCAPS_STRIP;
+		/* strip off mask bits and enable them again if allowed */
+		enabled &= ~BRIDGE_IFCAPS_MASK;
+		enabled |= mask;
+		bridge_set_ifcap(sc, bif, enabled);
+	}
+
+}
+
+static void
+bridge_set_ifcap(struct bridge_softc *sc, struct bridge_iflist *bif, int set)
+{
+	struct ifnet *ifp = bif->bif_ifp;
+	struct ifreq ifr;
+	int error;
+
+	bzero(&ifr, sizeof(ifr));
+	ifr.ifr_reqcap = set;
+
+	if (ifp->if_capenable != set) {
+		error = (*ifp->if_ioctl)(ifp, SIOCSIFCAP, (caddr_t)&ifr);
+		if (error)
+			if_printf(sc->sc_ifp,
+			    "error setting interface capabilities on %s\n",
+			    ifp->if_xname);
+	}
+}
+
+/*
+ * bridge_lookup_member:
+ *
+ *	Lookup a bridge member interface.
+ */
+static struct bridge_iflist *
+bridge_lookup_member(struct bridge_softc *sc, const char *name)
+{
+	struct bridge_iflist *bif;
+	struct ifnet *ifp;
+
+	BRIDGE_LOCK_ASSERT(sc);
+
+	LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
+		ifp = bif->bif_ifp;
+		if (strcmp(ifp->if_xname, name) == 0)
+			return (bif);
+	}
+
+	return (NULL);
+}
+
+/*
+ * bridge_lookup_member_if:
+ *
+ *	Lookup a bridge member interface by ifnet*.
+ */
+static struct bridge_iflist *
+bridge_lookup_member_if(struct bridge_softc *sc, struct ifnet *member_ifp)
+{
+	struct bridge_iflist *bif;
+
+	BRIDGE_LOCK_ASSERT(sc);
+
+	LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
+		if (bif->bif_ifp == member_ifp)
+			return (bif);
+	}
+
+	return (NULL);
+}
+
+/*
+ * bridge_delete_member:
+ *
+ *	Delete the specified member interface.
+ */
+static void
+bridge_delete_member(struct bridge_softc *sc, struct bridge_iflist *bif,
+    int gone)
+{
+	struct ifnet *ifs = bif->bif_ifp;
+	struct ifnet *fif = NULL;
+
+	BRIDGE_LOCK_ASSERT(sc);
+
+	if (bif->bif_flags & IFBIF_STP)
+		bstp_disable(&bif->bif_stp);
+
+	ifs->if_bridge = NULL;
+	BRIDGE_XLOCK(sc);
+	LIST_REMOVE(bif, bif_next);
+	BRIDGE_XDROP(sc);
+
+	/*
+	 * If removing the interface that gave the bridge its mac address, set
+	 * the mac address of the bridge to the address of the next member, or
+	 * to its default address if no members are left.
+	 */
+	if (bridge_inherit_mac && sc->sc_ifaddr == ifs) {
+		if (LIST_EMPTY(&sc->sc_iflist)) {
+			bcopy(sc->sc_defaddr,
+			    IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN);
+			sc->sc_ifaddr = NULL;
+		} else {
+			fif = LIST_FIRST(&sc->sc_iflist)->bif_ifp;
+			bcopy(IF_LLADDR(fif),
+			    IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN);
+			sc->sc_ifaddr = fif;
+		}
+		EVENTHANDLER_INVOKE(iflladdr_event, sc->sc_ifp);
+	}
+
+	bridge_mutecaps(sc);	/* recalculate now that this interface is removed */
+	bridge_rtdelete(sc, ifs, IFBF_FLUSHALL);
+	KASSERT(bif->bif_addrcnt == 0,
+	    ("%s: %d bridge routes referenced", __func__, bif->bif_addrcnt));
+
+	BRIDGE_UNLOCK(sc);
+	if (!gone) {
+		switch (ifs->if_type) {
+		case IFT_ETHER:
+		case IFT_L2VLAN:
+			/*
+			 * Take the interface out of promiscuous mode.
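+			 */
+
+/*
+ * Editor's illustrative sketch, not part of the original source:
+ * bridge_mutecaps() above computes the set of offload capabilities as
+ * the bitwise AND of every member's saved capability mask, so a
+ * feature survives only if all members support it.  The core of that
+ * computation, stand-alone:
+ */
+#if 0
+static int
+intersect_caps(const int *saved_caps, int nmembers, int full_mask)
+{
+	int mask = full_mask;		/* e.g. BRIDGE_IFCAPS_MASK */
+
+	for (int i = 0; i < nmembers; i++)
+		mask &= saved_caps[i];	/* unsupported anywhere: dropped */
+	return (mask);
+}
+#endif
+
+			/*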
+			 */
+			(void) ifpromisc(ifs, 0);
+			break;
+
+		case IFT_GIF:
+			break;
+
+		default:
+#ifdef DIAGNOSTIC
+			panic("bridge_delete_member: impossible");
+#endif
+			break;
+		}
+		/* re-enable any interface capabilities */
+		bridge_set_ifcap(sc, bif, bif->bif_savedcaps);
+	}
+	bstp_destroy(&bif->bif_stp);	/* prepare to free */
+	BRIDGE_LOCK(sc);
+	free(bif, M_DEVBUF);
+}
+
+/*
+ * bridge_delete_span:
+ *
+ *	Delete the specified span interface.
+ */
+static void
+bridge_delete_span(struct bridge_softc *sc, struct bridge_iflist *bif)
+{
+	BRIDGE_LOCK_ASSERT(sc);
+
+	KASSERT(bif->bif_ifp->if_bridge == NULL,
+	    ("%s: not a span interface", __func__));
+
+	LIST_REMOVE(bif, bif_next);
+	free(bif, M_DEVBUF);
+}
+
+static int
+bridge_ioctl_add(struct bridge_softc *sc, void *arg)
+{
+	struct ifbreq *req = arg;
+	struct bridge_iflist *bif = NULL;
+	struct ifnet *ifs;
+	int error = 0;
+
+	ifs = ifunit(req->ifbr_ifsname);
+	if (ifs == NULL)
+		return (ENOENT);
+	if (ifs->if_ioctl == NULL)	/* must be supported */
+		return (EINVAL);
+
+	/* If it's in the span list, it can't be a member. */
+	LIST_FOREACH(bif, &sc->sc_spanlist, bif_next)
+		if (ifs == bif->bif_ifp)
+			return (EBUSY);
+
+	if (ifs->if_bridge == sc)
+		return (EEXIST);
+
+	if (ifs->if_bridge != NULL)
+		return (EBUSY);
+
+	bif = malloc(sizeof(*bif), M_DEVBUF, M_NOWAIT|M_ZERO);
+	if (bif == NULL)
+		return (ENOMEM);
+
+	bif->bif_ifp = ifs;
+	bif->bif_flags = IFBIF_LEARNING | IFBIF_DISCOVER;
+	bif->bif_savedcaps = ifs->if_capenable;
+
+	switch (ifs->if_type) {
+	case IFT_ETHER:
+	case IFT_L2VLAN:
+	case IFT_GIF:
+		/* permitted interface types */
+		break;
+	default:
+		error = EINVAL;
+		goto out;
+	}
+
+	/* Allow the first Ethernet member to define the MTU */
+	if (LIST_EMPTY(&sc->sc_iflist))
+		sc->sc_ifp->if_mtu = ifs->if_mtu;
+	else if (sc->sc_ifp->if_mtu != ifs->if_mtu) {
+		if_printf(sc->sc_ifp, "invalid MTU: %lu(%s) != %lu\n",
+		    ifs->if_mtu, ifs->if_xname, sc->sc_ifp->if_mtu);
+		error = EINVAL;
+		goto out;
+	}
+
+	/*
+	 * Assign the interface's MAC address to the bridge if it's the first
+	 * member and the MAC address of the bridge has not been changed from
+	 * the default randomly generated one.
+	 */
+	if (bridge_inherit_mac && LIST_EMPTY(&sc->sc_iflist) &&
+	    !memcmp(IF_LLADDR(sc->sc_ifp), sc->sc_defaddr, ETHER_ADDR_LEN)) {
+		bcopy(IF_LLADDR(ifs), IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN);
+		sc->sc_ifaddr = ifs;
+		EVENTHANDLER_INVOKE(iflladdr_event, sc->sc_ifp);
+	}
+
+	ifs->if_bridge = sc;
+	bstp_create(&sc->sc_stp, &bif->bif_stp, bif->bif_ifp);
+	/*
+	 * XXX: XLOCK HERE!?!
+	 *
+	 * NOTE: insert_***HEAD*** should be safe for the traversals.
+	 */
+	LIST_INSERT_HEAD(&sc->sc_iflist, bif, bif_next);
+
+	/* Set interface capabilities to the intersection set of all members */
+	bridge_mutecaps(sc);
+
+	switch (ifs->if_type) {
+	case IFT_ETHER:
+	case IFT_L2VLAN:
+		/*
+		 * Place the interface into promiscuous mode.
+ */ + BRIDGE_UNLOCK(sc); + error = ifpromisc(ifs, 1); + BRIDGE_LOCK(sc); + break; + } + if (error) + bridge_delete_member(sc, bif, 0); +out: + if (error) { + if (bif != NULL) + free(bif, M_DEVBUF); + } + return (error); +} + +static int +bridge_ioctl_del(struct bridge_softc *sc, void *arg) +{ + struct ifbreq *req = arg; + struct bridge_iflist *bif; + + bif = bridge_lookup_member(sc, req->ifbr_ifsname); + if (bif == NULL) + return (ENOENT); + + bridge_delete_member(sc, bif, 0); + + return (0); +} + +static int +bridge_ioctl_gifflags(struct bridge_softc *sc, void *arg) +{ + struct ifbreq *req = arg; + struct bridge_iflist *bif; + struct bstp_port *bp; + + bif = bridge_lookup_member(sc, req->ifbr_ifsname); + if (bif == NULL) + return (ENOENT); + + bp = &bif->bif_stp; + req->ifbr_ifsflags = bif->bif_flags; + req->ifbr_state = bp->bp_state; + req->ifbr_priority = bp->bp_priority; + req->ifbr_path_cost = bp->bp_path_cost; + req->ifbr_portno = bif->bif_ifp->if_index & 0xfff; + req->ifbr_proto = bp->bp_protover; + req->ifbr_role = bp->bp_role; + req->ifbr_stpflags = bp->bp_flags; + req->ifbr_addrcnt = bif->bif_addrcnt; + req->ifbr_addrmax = bif->bif_addrmax; + req->ifbr_addrexceeded = bif->bif_addrexceeded; + + /* Copy STP state options as flags */ + if (bp->bp_operedge) + req->ifbr_ifsflags |= IFBIF_BSTP_EDGE; + if (bp->bp_flags & BSTP_PORT_AUTOEDGE) + req->ifbr_ifsflags |= IFBIF_BSTP_AUTOEDGE; + if (bp->bp_ptp_link) + req->ifbr_ifsflags |= IFBIF_BSTP_PTP; + if (bp->bp_flags & BSTP_PORT_AUTOPTP) + req->ifbr_ifsflags |= IFBIF_BSTP_AUTOPTP; + if (bp->bp_flags & BSTP_PORT_ADMEDGE) + req->ifbr_ifsflags |= IFBIF_BSTP_ADMEDGE; + if (bp->bp_flags & BSTP_PORT_ADMCOST) + req->ifbr_ifsflags |= IFBIF_BSTP_ADMCOST; + return (0); +} + +static int +bridge_ioctl_sifflags(struct bridge_softc *sc, void *arg) +{ + struct ifbreq *req = arg; + struct bridge_iflist *bif; + struct bstp_port *bp; + int error; + + bif = bridge_lookup_member(sc, req->ifbr_ifsname); + if (bif == NULL) + return (ENOENT); + bp = &bif->bif_stp; + + if (req->ifbr_ifsflags & IFBIF_SPAN) + /* SPAN is readonly */ + return (EINVAL); + + if (req->ifbr_ifsflags & IFBIF_STP) { + if ((bif->bif_flags & IFBIF_STP) == 0) { + error = bstp_enable(&bif->bif_stp); + if (error) + return (error); + } + } else { + if ((bif->bif_flags & IFBIF_STP) != 0) + bstp_disable(&bif->bif_stp); + } + + /* Pass on STP flags */ + bstp_set_edge(bp, req->ifbr_ifsflags & IFBIF_BSTP_EDGE ? 1 : 0); + bstp_set_autoedge(bp, req->ifbr_ifsflags & IFBIF_BSTP_AUTOEDGE ? 1 : 0); + bstp_set_ptp(bp, req->ifbr_ifsflags & IFBIF_BSTP_PTP ? 1 : 0); + bstp_set_autoptp(bp, req->ifbr_ifsflags & IFBIF_BSTP_AUTOPTP ? 
1 : 0); + + /* Save the bits relating to the bridge */ + bif->bif_flags = req->ifbr_ifsflags & IFBIFMASK; + + return (0); +} + +static int +bridge_ioctl_scache(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + + sc->sc_brtmax = param->ifbrp_csize; + bridge_rttrim(sc); + + return (0); +} + +static int +bridge_ioctl_gcache(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + + param->ifbrp_csize = sc->sc_brtmax; + + return (0); +} + +static int +bridge_ioctl_gifs(struct bridge_softc *sc, void *arg) +{ + struct ifbifconf *bifc = arg; + struct bridge_iflist *bif; + struct ifbreq breq; + char *buf, *outbuf; + int count, buflen, len, error = 0; + + count = 0; + LIST_FOREACH(bif, &sc->sc_iflist, bif_next) + count++; + LIST_FOREACH(bif, &sc->sc_spanlist, bif_next) + count++; + + buflen = sizeof(breq) * count; + if (bifc->ifbic_len == 0) { + bifc->ifbic_len = buflen; + return (0); + } + BRIDGE_UNLOCK(sc); + outbuf = malloc(buflen, M_TEMP, M_WAITOK | M_ZERO); + BRIDGE_LOCK(sc); + + count = 0; + buf = outbuf; + len = min(bifc->ifbic_len, buflen); + bzero(&breq, sizeof(breq)); + LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { + if (len < sizeof(breq)) + break; + + strlcpy(breq.ifbr_ifsname, bif->bif_ifp->if_xname, + sizeof(breq.ifbr_ifsname)); + /* Fill in the ifbreq structure */ + error = bridge_ioctl_gifflags(sc, &breq); + if (error) + break; + memcpy(buf, &breq, sizeof(breq)); + count++; + buf += sizeof(breq); + len -= sizeof(breq); + } + LIST_FOREACH(bif, &sc->sc_spanlist, bif_next) { + if (len < sizeof(breq)) + break; + + strlcpy(breq.ifbr_ifsname, bif->bif_ifp->if_xname, + sizeof(breq.ifbr_ifsname)); + breq.ifbr_ifsflags = bif->bif_flags; + breq.ifbr_portno = bif->bif_ifp->if_index & 0xfff; + memcpy(buf, &breq, sizeof(breq)); + count++; + buf += sizeof(breq); + len -= sizeof(breq); + } + + BRIDGE_UNLOCK(sc); + bifc->ifbic_len = sizeof(breq) * count; + error = copyout(outbuf, bifc->ifbic_req, bifc->ifbic_len); + BRIDGE_LOCK(sc); + free(outbuf, M_TEMP); + return (error); +} + +static int +bridge_ioctl_rts(struct bridge_softc *sc, void *arg) +{ + struct ifbaconf *bac = arg; + struct bridge_rtnode *brt; + struct ifbareq bareq; + char *buf, *outbuf; + int count, buflen, len, error = 0; + + if (bac->ifbac_len == 0) + return (0); + + count = 0; + LIST_FOREACH(brt, &sc->sc_rtlist, brt_list) + count++; + buflen = sizeof(bareq) * count; + + BRIDGE_UNLOCK(sc); + outbuf = malloc(buflen, M_TEMP, M_WAITOK | M_ZERO); + BRIDGE_LOCK(sc); + + count = 0; + buf = outbuf; + len = min(bac->ifbac_len, buflen); + bzero(&bareq, sizeof(bareq)); + LIST_FOREACH(brt, &sc->sc_rtlist, brt_list) { + if (len < sizeof(bareq)) + goto out; + strlcpy(bareq.ifba_ifsname, brt->brt_ifp->if_xname, + sizeof(bareq.ifba_ifsname)); + memcpy(bareq.ifba_dst, brt->brt_addr, sizeof(brt->brt_addr)); + bareq.ifba_vlan = brt->brt_vlan; + if ((brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC && + time_uptime < brt->brt_expire) + bareq.ifba_expire = brt->brt_expire - time_uptime; + else + bareq.ifba_expire = 0; + bareq.ifba_flags = brt->brt_flags; + + memcpy(buf, &bareq, sizeof(bareq)); + count++; + buf += sizeof(bareq); + len -= sizeof(bareq); + } +out: + BRIDGE_UNLOCK(sc); + bac->ifbac_len = sizeof(bareq) * count; + error = copyout(outbuf, bac->ifbac_req, bac->ifbac_len); + BRIDGE_LOCK(sc); + free(outbuf, M_TEMP); + return (error); +} + +static int +bridge_ioctl_saddr(struct bridge_softc *sc, void *arg) +{ + struct ifbareq *req = arg; + struct bridge_iflist *bif; + int error; + + bif = 
bridge_lookup_member(sc, req->ifba_ifsname); + if (bif == NULL) + return (ENOENT); + + error = bridge_rtupdate(sc, req->ifba_dst, req->ifba_vlan, bif, 1, + req->ifba_flags); + + return (error); +} + +static int +bridge_ioctl_sto(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + + sc->sc_brttimeout = param->ifbrp_ctime; + return (0); +} + +static int +bridge_ioctl_gto(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + + param->ifbrp_ctime = sc->sc_brttimeout; + return (0); +} + +static int +bridge_ioctl_daddr(struct bridge_softc *sc, void *arg) +{ + struct ifbareq *req = arg; + + return (bridge_rtdaddr(sc, req->ifba_dst, req->ifba_vlan)); +} + +static int +bridge_ioctl_flush(struct bridge_softc *sc, void *arg) +{ + struct ifbreq *req = arg; + + bridge_rtflush(sc, req->ifbr_ifsflags); + return (0); +} + +static int +bridge_ioctl_gpri(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + struct bstp_state *bs = &sc->sc_stp; + + param->ifbrp_prio = bs->bs_bridge_priority; + return (0); +} + +static int +bridge_ioctl_spri(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + + return (bstp_set_priority(&sc->sc_stp, param->ifbrp_prio)); +} + +static int +bridge_ioctl_ght(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + struct bstp_state *bs = &sc->sc_stp; + + param->ifbrp_hellotime = bs->bs_bridge_htime >> 8; + return (0); +} + +static int +bridge_ioctl_sht(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + + return (bstp_set_htime(&sc->sc_stp, param->ifbrp_hellotime)); +} + +static int +bridge_ioctl_gfd(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + struct bstp_state *bs = &sc->sc_stp; + + param->ifbrp_fwddelay = bs->bs_bridge_fdelay >> 8; + return (0); +} + +static int +bridge_ioctl_sfd(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + + return (bstp_set_fdelay(&sc->sc_stp, param->ifbrp_fwddelay)); +} + +static int +bridge_ioctl_gma(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + struct bstp_state *bs = &sc->sc_stp; + + param->ifbrp_maxage = bs->bs_bridge_max_age >> 8; + return (0); +} + +static int +bridge_ioctl_sma(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + + return (bstp_set_maxage(&sc->sc_stp, param->ifbrp_maxage)); +} + +static int +bridge_ioctl_sifprio(struct bridge_softc *sc, void *arg) +{ + struct ifbreq *req = arg; + struct bridge_iflist *bif; + + bif = bridge_lookup_member(sc, req->ifbr_ifsname); + if (bif == NULL) + return (ENOENT); + + return (bstp_set_port_priority(&bif->bif_stp, req->ifbr_priority)); +} + +static int +bridge_ioctl_sifcost(struct bridge_softc *sc, void *arg) +{ + struct ifbreq *req = arg; + struct bridge_iflist *bif; + + bif = bridge_lookup_member(sc, req->ifbr_ifsname); + if (bif == NULL) + return (ENOENT); + + return (bstp_set_path_cost(&bif->bif_stp, req->ifbr_path_cost)); +} + +static int +bridge_ioctl_sifmaxaddr(struct bridge_softc *sc, void *arg) +{ + struct ifbreq *req = arg; + struct bridge_iflist *bif; + + bif = bridge_lookup_member(sc, req->ifbr_ifsname); + if (bif == NULL) + return (ENOENT); + + bif->bif_addrmax = req->ifbr_addrmax; + return (0); +} + +static int +bridge_ioctl_addspan(struct bridge_softc *sc, void *arg) +{ + struct ifbreq *req = arg; + struct bridge_iflist *bif = NULL; + struct ifnet *ifs; + + ifs = ifunit(req->ifbr_ifsname); + if (ifs == NULL) + return (ENOENT); + + LIST_FOREACH(bif, 
&sc->sc_spanlist, bif_next) + if (ifs == bif->bif_ifp) + return (EBUSY); + + if (ifs->if_bridge != NULL) + return (EBUSY); + + switch (ifs->if_type) { + case IFT_ETHER: + case IFT_GIF: + case IFT_L2VLAN: + break; + default: + return (EINVAL); + } + + bif = malloc(sizeof(*bif), M_DEVBUF, M_NOWAIT|M_ZERO); + if (bif == NULL) + return (ENOMEM); + + bif->bif_ifp = ifs; + bif->bif_flags = IFBIF_SPAN; + + LIST_INSERT_HEAD(&sc->sc_spanlist, bif, bif_next); + + return (0); +} + +static int +bridge_ioctl_delspan(struct bridge_softc *sc, void *arg) +{ + struct ifbreq *req = arg; + struct bridge_iflist *bif; + struct ifnet *ifs; + + ifs = ifunit(req->ifbr_ifsname); + if (ifs == NULL) + return (ENOENT); + + LIST_FOREACH(bif, &sc->sc_spanlist, bif_next) + if (ifs == bif->bif_ifp) + break; + + if (bif == NULL) + return (ENOENT); + + bridge_delete_span(sc, bif); + + return (0); +} + +static int +bridge_ioctl_gbparam(struct bridge_softc *sc, void *arg) +{ + struct ifbropreq *req = arg; + struct bstp_state *bs = &sc->sc_stp; + struct bstp_port *root_port; + + req->ifbop_maxage = bs->bs_bridge_max_age >> 8; + req->ifbop_hellotime = bs->bs_bridge_htime >> 8; + req->ifbop_fwddelay = bs->bs_bridge_fdelay >> 8; + + root_port = bs->bs_root_port; + if (root_port == NULL) + req->ifbop_root_port = 0; + else + req->ifbop_root_port = root_port->bp_ifp->if_index; + + req->ifbop_holdcount = bs->bs_txholdcount; + req->ifbop_priority = bs->bs_bridge_priority; + req->ifbop_protocol = bs->bs_protover; + req->ifbop_root_path_cost = bs->bs_root_pv.pv_cost; + req->ifbop_bridgeid = bs->bs_bridge_pv.pv_dbridge_id; + req->ifbop_designated_root = bs->bs_root_pv.pv_root_id; + req->ifbop_designated_bridge = bs->bs_root_pv.pv_dbridge_id; + req->ifbop_last_tc_time.tv_sec = bs->bs_last_tc_time.tv_sec; + req->ifbop_last_tc_time.tv_usec = bs->bs_last_tc_time.tv_usec; + + return (0); +} + +static int +bridge_ioctl_grte(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + + param->ifbrp_cexceeded = sc->sc_brtexceeded; + return (0); +} + +static int +bridge_ioctl_gifsstp(struct bridge_softc *sc, void *arg) +{ + struct ifbpstpconf *bifstp = arg; + struct bridge_iflist *bif; + struct bstp_port *bp; + struct ifbpstpreq bpreq; + char *buf, *outbuf; + int count, buflen, len, error = 0; + + count = 0; + LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { + if ((bif->bif_flags & IFBIF_STP) != 0) + count++; + } + + buflen = sizeof(bpreq) * count; + if (bifstp->ifbpstp_len == 0) { + bifstp->ifbpstp_len = buflen; + return (0); + } + + BRIDGE_UNLOCK(sc); + outbuf = malloc(buflen, M_TEMP, M_WAITOK | M_ZERO); + BRIDGE_LOCK(sc); + + count = 0; + buf = outbuf; + len = min(bifstp->ifbpstp_len, buflen); + bzero(&bpreq, sizeof(bpreq)); + LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { + if (len < sizeof(bpreq)) + break; + + if ((bif->bif_flags & IFBIF_STP) == 0) + continue; + + bp = &bif->bif_stp; + bpreq.ifbp_portno = bif->bif_ifp->if_index & 0xfff; + bpreq.ifbp_fwd_trans = bp->bp_forward_transitions; + bpreq.ifbp_design_cost = bp->bp_desg_pv.pv_cost; + bpreq.ifbp_design_port = bp->bp_desg_pv.pv_port_id; + bpreq.ifbp_design_bridge = bp->bp_desg_pv.pv_dbridge_id; + bpreq.ifbp_design_root = bp->bp_desg_pv.pv_root_id; + + memcpy(buf, &bpreq, sizeof(bpreq)); + count++; + buf += sizeof(bpreq); + len -= sizeof(bpreq); + } + + BRIDGE_UNLOCK(sc); + bifstp->ifbpstp_len = sizeof(bpreq) * count; + error = copyout(outbuf, bifstp->ifbpstp_req, bifstp->ifbpstp_len); + BRIDGE_LOCK(sc); + free(outbuf, M_TEMP); + return (error); +} + +static int 
+bridge_ioctl_sproto(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + + return (bstp_set_protocol(&sc->sc_stp, param->ifbrp_proto)); +} + +static int +bridge_ioctl_stxhc(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + + return (bstp_set_holdcount(&sc->sc_stp, param->ifbrp_txhc)); +} + +/* + * bridge_ifdetach: + * + * Detach an interface from a bridge. Called when a member + * interface is detaching. + */ +static void +bridge_ifdetach(void *arg __unused, struct ifnet *ifp) +{ + struct bridge_softc *sc = ifp->if_bridge; + struct bridge_iflist *bif; + + /* Check if the interface is a bridge member */ + if (sc != NULL) { + BRIDGE_LOCK(sc); + + bif = bridge_lookup_member_if(sc, ifp); + if (bif != NULL) + bridge_delete_member(sc, bif, 1); + + BRIDGE_UNLOCK(sc); + return; + } + + /* Check if the interface is a span port */ + mtx_lock(&bridge_list_mtx); + LIST_FOREACH(sc, &bridge_list, sc_list) { + BRIDGE_LOCK(sc); + LIST_FOREACH(bif, &sc->sc_spanlist, bif_next) + if (ifp == bif->bif_ifp) { + bridge_delete_span(sc, bif); + break; + } + + BRIDGE_UNLOCK(sc); + } + mtx_unlock(&bridge_list_mtx); +} + +/* + * bridge_init: + * + * Initialize a bridge interface. + */ +static void +bridge_init(void *xsc) +{ + struct bridge_softc *sc = (struct bridge_softc *)xsc; + struct ifnet *ifp = sc->sc_ifp; + + if (ifp->if_drv_flags & IFF_DRV_RUNNING) + return; + + BRIDGE_LOCK(sc); + callout_reset(&sc->sc_brcallout, bridge_rtable_prune_period * hz, + bridge_timer, sc); + + ifp->if_drv_flags |= IFF_DRV_RUNNING; + bstp_init(&sc->sc_stp); /* Initialize Spanning Tree */ + + BRIDGE_UNLOCK(sc); +} + +/* + * bridge_stop: + * + * Stop the bridge interface. + */ +static void +bridge_stop(struct ifnet *ifp, int disable) +{ + struct bridge_softc *sc = ifp->if_softc; + + BRIDGE_LOCK_ASSERT(sc); + + if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) + return; + + callout_stop(&sc->sc_brcallout); + bstp_stop(&sc->sc_stp); + + bridge_rtflush(sc, IFBF_FLUSHDYN); + + ifp->if_drv_flags &= ~IFF_DRV_RUNNING; +} + +/* + * bridge_enqueue: + * + * Enqueue a packet on a bridge member interface. + * + */ +static void +bridge_enqueue(struct bridge_softc *sc, struct ifnet *dst_ifp, struct mbuf *m) +{ + int len, err = 0; + short mflags; + struct mbuf *m0; + + len = m->m_pkthdr.len; + mflags = m->m_flags; + + /* We may be sending a fragment so traverse the mbuf */ + for (; m; m = m0) { + m0 = m->m_nextpkt; + m->m_nextpkt = NULL; + + /* + * If underlying interface can not do VLAN tag insertion itself + * then attach a packet tag that holds it. + */ + if ((m->m_flags & M_VLANTAG) && + (dst_ifp->if_capenable & IFCAP_VLAN_HWTAGGING) == 0) { + m = ether_vlanencap(m, m->m_pkthdr.ether_vtag); + if (m == NULL) { + if_printf(dst_ifp, + "unable to prepend VLAN header\n"); + dst_ifp->if_oerrors++; + continue; + } + m->m_flags &= ~M_VLANTAG; + } + + if (err == 0) + dst_ifp->if_transmit(dst_ifp, m); + } + + if (err == 0) { + sc->sc_ifp->if_opackets++; + sc->sc_ifp->if_obytes += len; + if (mflags & M_MCAST) + sc->sc_ifp->if_omcasts++; + } +} + +/* + * bridge_dummynet: + * + * Receive a queued packet from dummynet and pass it on to the output + * interface. + * + * The mbuf has the Ethernet header already attached. + */ +static void +bridge_dummynet(struct mbuf *m, struct ifnet *ifp) +{ + struct bridge_softc *sc; + + sc = ifp->if_bridge; + + /* + * The packet didnt originate from a member interface. This should only + * ever happen if a member interface is removed while packets are + * queued for it. 
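+ */
+
+/*
+ * Editor's illustrative sketch, not part of the original source:
+ * bridge_enqueue() above walks a chain of packets linked through
+ * m_nextpkt and detaches each one before transmitting it, so a driver
+ * never sees the rest of the chain.  The same detach-then-process walk
+ * on a minimal stand-in structure:
+ */
+#if 0
+#include <stddef.h>
+
+struct pkt {
+	struct pkt *nextpkt;		/* stands in for m_nextpkt */
+};
+
+static void
+send_chain(struct pkt *p, void (*xmit)(struct pkt *))
+{
+	struct pkt *p0;
+
+	for (; p != NULL; p = p0) {
+		p0 = p->nextpkt;	/* save the rest of the chain */
+		p->nextpkt = NULL;	/* detach before handing off */
+		xmit(p);
+	}
+}
+#endif
+
+/*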
+ */ + if (sc == NULL) { + m_freem(m); + return; + } + + if (PFIL_HOOKED(&V_inet_pfil_hook) +#ifdef INET6 + || PFIL_HOOKED(&V_inet6_pfil_hook) +#endif + ) { + if (bridge_pfil(&m, sc->sc_ifp, ifp, PFIL_OUT) != 0) + return; + if (m == NULL) + return; + } + + bridge_enqueue(sc, ifp, m); +} + +/* + * bridge_output: + * + * Send output from a bridge member interface. This + * performs the bridging function for locally originated + * packets. + * + * The mbuf has the Ethernet header already attached. We must + * enqueue or free the mbuf before returning. + */ +static int +bridge_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *sa, + struct rtentry *rt) +{ + struct ether_header *eh; + struct ifnet *dst_if; + struct bridge_softc *sc; + uint16_t vlan; + + if (m->m_len < ETHER_HDR_LEN) { + m = m_pullup(m, ETHER_HDR_LEN); + if (m == NULL) + return (0); + } + + eh = mtod(m, struct ether_header *); + sc = ifp->if_bridge; + vlan = VLANTAGOF(m); + + BRIDGE_LOCK(sc); + + /* + * If bridge is down, but the original output interface is up, + * go ahead and send out that interface. Otherwise, the packet + * is dropped below. + */ + if ((sc->sc_ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { + dst_if = ifp; + goto sendunicast; + } + + /* + * If the packet is a multicast, or we don't know a better way to + * get there, send to all interfaces. + */ + if (ETHER_IS_MULTICAST(eh->ether_dhost)) + dst_if = NULL; + else + dst_if = bridge_rtlookup(sc, eh->ether_dhost, vlan); + if (dst_if == NULL) { + struct bridge_iflist *bif; + struct mbuf *mc; + int error = 0, used = 0; + + bridge_span(sc, m); + + BRIDGE_LOCK2REF(sc, error); + if (error) { + m_freem(m); + return (0); + } + + LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { + dst_if = bif->bif_ifp; + + if (dst_if->if_type == IFT_GIF) + continue; + if ((dst_if->if_drv_flags & IFF_DRV_RUNNING) == 0) + continue; + + /* + * If this is not the original output interface, + * and the interface is participating in spanning + * tree, make sure the port is in a state that + * allows forwarding. + */ + if (dst_if != ifp && (bif->bif_flags & IFBIF_STP) && + bif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING) + continue; + + if (LIST_NEXT(bif, bif_next) == NULL) { + used = 1; + mc = m; + } else { + mc = m_copypacket(m, M_DONTWAIT); + if (mc == NULL) { + sc->sc_ifp->if_oerrors++; + continue; + } + } + + bridge_enqueue(sc, dst_if, mc); + } + if (used == 0) + m_freem(m); + BRIDGE_UNREF(sc); + return (0); + } + +sendunicast: + /* + * XXX Spanning tree consideration here? + */ + + bridge_span(sc, m); + if ((dst_if->if_drv_flags & IFF_DRV_RUNNING) == 0) { + m_freem(m); + BRIDGE_UNLOCK(sc); + return (0); + } + + BRIDGE_UNLOCK(sc); + bridge_enqueue(sc, dst_if, m); + return (0); +} + +/* + * bridge_start: + * + * Start output on a bridge. + * + */ +static void +bridge_start(struct ifnet *ifp) +{ + struct bridge_softc *sc; + struct mbuf *m; + struct ether_header *eh; + struct ifnet *dst_if; + + sc = ifp->if_softc; + + ifp->if_drv_flags |= IFF_DRV_OACTIVE; + for (;;) { + IFQ_DEQUEUE(&ifp->if_snd, m); + if (m == 0) + break; + ETHER_BPF_MTAP(ifp, m); + + eh = mtod(m, struct ether_header *); + dst_if = NULL; + + BRIDGE_LOCK(sc); + if ((m->m_flags & (M_BCAST|M_MCAST)) == 0) { + dst_if = bridge_rtlookup(sc, eh->ether_dhost, 1); + } + + if (dst_if == NULL) + bridge_broadcast(sc, ifp, m, 0); + else { + BRIDGE_UNLOCK(sc); + bridge_enqueue(sc, dst_if, m); + } + } + ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; +} + +/* + * bridge_forward: + * + * The forwarding function of the bridge. 
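+ */
+
+/*
+ * Editor's illustrative sketch, not part of the original source: when
+ * bridge_output() above floods a frame it duplicates it for every
+ * member except the last, which consumes the original; the 'used'
+ * flag records whether the original was handed out at all.  The same
+ * pattern over a plain array of destinations:
+ */
+#if 0
+#include <stddef.h>
+
+static void
+flood(int ndst, void *orig, void *(*dup)(void *),
+    void (*enqueue)(int dst, void *pkt), void (*drop)(void *))
+{
+	int used = 0;
+
+	for (int i = 0; i < ndst; i++) {
+		void *pkt;
+
+		if (i == ndst - 1) {
+			pkt = orig;	/* last member takes the original */
+			used = 1;
+		} else {
+			pkt = dup(orig);
+			if (pkt == NULL)
+				continue;	/* copy failed; skip */
+		}
+		enqueue(i, pkt);
+	}
+	if (!used)
+		drop(orig);	/* no member took it; free the original */
+}
+#endif
+
+/*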
+ * + * NOTE: Releases the lock on return. + */ +static void +bridge_forward(struct bridge_softc *sc, struct bridge_iflist *sbif, + struct mbuf *m) +{ + struct bridge_iflist *dbif; + struct ifnet *src_if, *dst_if, *ifp; + struct ether_header *eh; + uint16_t vlan; + uint8_t *dst; + int error; + + src_if = m->m_pkthdr.rcvif; + ifp = sc->sc_ifp; + + ifp->if_ipackets++; + ifp->if_ibytes += m->m_pkthdr.len; + vlan = VLANTAGOF(m); + + if ((sbif->bif_flags & IFBIF_STP) && + sbif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING) + goto drop; + + eh = mtod(m, struct ether_header *); + dst = eh->ether_dhost; + + /* If the interface is learning, record the address. */ + if (sbif->bif_flags & IFBIF_LEARNING) { + error = bridge_rtupdate(sc, eh->ether_shost, vlan, + sbif, 0, IFBAF_DYNAMIC); + /* + * If the interface has addresses limits then deny any source + * that is not in the cache. + */ + if (error && sbif->bif_addrmax) + goto drop; + } + + if ((sbif->bif_flags & IFBIF_STP) != 0 && + sbif->bif_stp.bp_state == BSTP_IFSTATE_LEARNING) + goto drop; + + /* + * At this point, the port either doesn't participate + * in spanning tree or it is in the forwarding state. + */ + + /* + * If the packet is unicast, destined for someone on + * "this" side of the bridge, drop it. + */ + if ((m->m_flags & (M_BCAST|M_MCAST)) == 0) { + dst_if = bridge_rtlookup(sc, dst, vlan); + if (src_if == dst_if) + goto drop; + } else { + /* + * Check if its a reserved multicast address, any address + * listed in 802.1D section 7.12.6 may not be forwarded by the + * bridge. + * This is currently 01-80-C2-00-00-00 to 01-80-C2-00-00-0F + */ + if (dst[0] == 0x01 && dst[1] == 0x80 && + dst[2] == 0xc2 && dst[3] == 0x00 && + dst[4] == 0x00 && dst[5] <= 0x0f) + goto drop; + + /* ...forward it to all interfaces. */ + ifp->if_imcasts++; + dst_if = NULL; + } + + /* + * If we have a destination interface which is a member of our bridge, + * OR this is a unicast packet, push it through the bpf(4) machinery. + * For broadcast or multicast packets, don't bother because it will + * be reinjected into ether_input. We do this before we pass the packets + * through the pfil(9) framework, as it is possible that pfil(9) will + * drop the packet, or possibly modify it, making it difficult to debug + * firewall issues on the bridge. + */ + if (dst_if != NULL || (m->m_flags & (M_BCAST | M_MCAST)) == 0) + ETHER_BPF_MTAP(ifp, m); + + /* run the packet filter */ + if (PFIL_HOOKED(&V_inet_pfil_hook) +#ifdef INET6 + || PFIL_HOOKED(&V_inet6_pfil_hook) +#endif + ) { + BRIDGE_UNLOCK(sc); + if (bridge_pfil(&m, ifp, src_if, PFIL_IN) != 0) + return; + if (m == NULL) + return; + BRIDGE_LOCK(sc); + } + + if (dst_if == NULL) { + bridge_broadcast(sc, src_if, m, 1); + return; + } + + /* + * At this point, we're dealing with a unicast frame + * going to a different interface. + */ + if ((dst_if->if_drv_flags & IFF_DRV_RUNNING) == 0) + goto drop; + + dbif = bridge_lookup_member_if(sc, dst_if); + if (dbif == NULL) + /* Not a member of the bridge (anymore?) 
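+	 */
+
+/*
+ * Editor's illustrative sketch, not part of the original source: the
+ * reserved-group test in bridge_forward() above implements IEEE 802.1D
+ * section 7.12.6, which reserves 01:80:C2:00:00:00 through
+ * 01:80:C2:00:00:0F; frames addressed there are never relayed.  As a
+ * stand-alone predicate:
+ */
+#if 0
+#include <stdint.h>
+
+static int
+ether_is_8021d_reserved(const uint8_t dst[6])
+{
+	return (dst[0] == 0x01 && dst[1] == 0x80 && dst[2] == 0xc2 &&
+	    dst[3] == 0x00 && dst[4] == 0x00 && dst[5] <= 0x0f);
+}
+#endif
+
+	/*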
*/ + goto drop; + + /* Private segments can not talk to each other */ + if (sbif->bif_flags & dbif->bif_flags & IFBIF_PRIVATE) + goto drop; + + if ((dbif->bif_flags & IFBIF_STP) && + dbif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING) + goto drop; + + BRIDGE_UNLOCK(sc); + + if (PFIL_HOOKED(&V_inet_pfil_hook) +#ifdef INET6 + || PFIL_HOOKED(&V_inet6_pfil_hook) +#endif + ) { + if (bridge_pfil(&m, ifp, dst_if, PFIL_OUT) != 0) + return; + if (m == NULL) + return; + } + + bridge_enqueue(sc, dst_if, m); + return; + +drop: + BRIDGE_UNLOCK(sc); + m_freem(m); +} + +/* + * bridge_input: + * + * Receive input from a member interface. Queue the packet for + * bridging if it is not for us. + */ +static struct mbuf * +bridge_input(struct ifnet *ifp, struct mbuf *m) +{ + struct bridge_softc *sc = ifp->if_bridge; + struct bridge_iflist *bif, *bif2; + struct ifnet *bifp; + struct ether_header *eh; + struct mbuf *mc, *mc2; + uint16_t vlan; + int error; + + if ((sc->sc_ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) + return (m); + + bifp = sc->sc_ifp; + vlan = VLANTAGOF(m); + + /* + * Implement support for bridge monitoring. If this flag has been + * set on this interface, discard the packet once we push it through + * the bpf(4) machinery, but before we do, increment the byte and + * packet counters associated with this interface. + */ + if ((bifp->if_flags & IFF_MONITOR) != 0) { + m->m_pkthdr.rcvif = bifp; + ETHER_BPF_MTAP(bifp, m); + bifp->if_ipackets++; + bifp->if_ibytes += m->m_pkthdr.len; + m_freem(m); + return (NULL); + } + BRIDGE_LOCK(sc); + bif = bridge_lookup_member_if(sc, ifp); + if (bif == NULL) { + BRIDGE_UNLOCK(sc); + return (m); + } + + eh = mtod(m, struct ether_header *); + + bridge_span(sc, m); + + if (m->m_flags & (M_BCAST|M_MCAST)) { + /* Tap off 802.1D packets; they do not get forwarded. */ + if (memcmp(eh->ether_dhost, bstp_etheraddr, + ETHER_ADDR_LEN) == 0) { + m = bstp_input(&bif->bif_stp, ifp, m); + if (m == NULL) { + BRIDGE_UNLOCK(sc); + return (NULL); + } + } + + if ((bif->bif_flags & IFBIF_STP) && + bif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING) { + BRIDGE_UNLOCK(sc); + return (m); + } + + /* + * Make a deep copy of the packet and enqueue the copy + * for bridge processing; return the original packet for + * local processing. + */ + mc = m_dup(m, M_DONTWAIT); + if (mc == NULL) { + BRIDGE_UNLOCK(sc); + return (m); + } + + /* Perform the bridge forwarding function with the copy. */ + bridge_forward(sc, bif, mc); + + /* + * Reinject the mbuf as arriving on the bridge so we have a + * chance at claiming multicast packets. We can not loop back + * here from ether_input as a bridge is never a member of a + * bridge. + */ + KASSERT(bifp->if_bridge == NULL, + ("loop created in bridge_input")); + mc2 = m_dup(m, M_DONTWAIT); + if (mc2 != NULL) { + /* Keep the layer3 header aligned */ + int i = min(mc2->m_pkthdr.len, max_protohdr); + mc2 = m_copyup(mc2, i, ETHER_ALIGN); + } + if (mc2 != NULL) { + mc2->m_pkthdr.rcvif = bifp; + (*bifp->if_input)(bifp, mc2); + } + + /* Return the original packet for local processing. 
*/ + return (m); + } + + if ((bif->bif_flags & IFBIF_STP) && + bif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING) { + BRIDGE_UNLOCK(sc); + return (m); + } + +#if (defined(INET) || defined(INET6)) +# define OR_CARP_CHECK_WE_ARE_DST(iface) \ + || ((iface)->if_carp \ + && (*carp_forus_p)((iface), eh->ether_dhost)) +# define OR_CARP_CHECK_WE_ARE_SRC(iface) \ + || ((iface)->if_carp \ + && (*carp_forus_p)((iface), eh->ether_shost)) +#else +# define OR_CARP_CHECK_WE_ARE_DST(iface) +# define OR_CARP_CHECK_WE_ARE_SRC(iface) +#endif + +#ifdef INET6 +# define OR_PFIL_HOOKED_INET6 \ + || PFIL_HOOKED(&V_inet6_pfil_hook) +#else +# define OR_PFIL_HOOKED_INET6 +#endif + +#define GRAB_OUR_PACKETS(iface) \ + if ((iface)->if_type == IFT_GIF) \ + continue; \ + /* It is destined for us. */ \ + if (memcmp(IF_LLADDR((iface)), eh->ether_dhost, ETHER_ADDR_LEN) == 0 \ + OR_CARP_CHECK_WE_ARE_DST((iface)) \ + ) { \ + if ((iface)->if_type == IFT_BRIDGE) { \ + ETHER_BPF_MTAP(iface, m); \ + iface->if_ipackets++; \ + /* Filter on the physical interface. */ \ + if (pfil_local_phys && \ + (PFIL_HOOKED(&V_inet_pfil_hook) \ + OR_PFIL_HOOKED_INET6)) { \ + if (bridge_pfil(&m, NULL, ifp, \ + PFIL_IN) != 0 || m == NULL) { \ + BRIDGE_UNLOCK(sc); \ + return (NULL); \ + } \ + } \ + } \ + if (bif->bif_flags & IFBIF_LEARNING) { \ + error = bridge_rtupdate(sc, eh->ether_shost, \ + vlan, bif, 0, IFBAF_DYNAMIC); \ + if (error && bif->bif_addrmax) { \ + BRIDGE_UNLOCK(sc); \ + m_freem(m); \ + return (NULL); \ + } \ + } \ + m->m_pkthdr.rcvif = iface; \ + BRIDGE_UNLOCK(sc); \ + return (m); \ + } \ + \ + /* We just received a packet that we sent out. */ \ + if (memcmp(IF_LLADDR((iface)), eh->ether_shost, ETHER_ADDR_LEN) == 0 \ + OR_CARP_CHECK_WE_ARE_SRC((iface)) \ + ) { \ + BRIDGE_UNLOCK(sc); \ + m_freem(m); \ + return (NULL); \ + } + + /* + * Unicast. Make sure it's not for the bridge. + */ + do { GRAB_OUR_PACKETS(bifp) } while (0); + + /* + * Give a chance for ifp at first priority. This will help when the + * packet comes through the interface like VLAN's with the same MACs + * on several interfaces from the same bridge. This also will save + * some CPU cycles in case the destination interface and the input + * interface (eq ifp) are the same. + */ + do { GRAB_OUR_PACKETS(ifp) } while (0); + + /* Now check the all bridge members. */ + LIST_FOREACH(bif2, &sc->sc_iflist, bif_next) { + GRAB_OUR_PACKETS(bif2->bif_ifp) + } + +#undef OR_CARP_CHECK_WE_ARE_DST +#undef OR_CARP_CHECK_WE_ARE_SRC +#undef OR_PFIL_HOOKED_INET6 +#undef GRAB_OUR_PACKETS + + /* Perform the bridge forwarding function. */ + bridge_forward(sc, bif, m); + + return (NULL); +} + +/* + * bridge_broadcast: + * + * Send a frame to all interfaces that are members of + * the bridge, except for the one on which the packet + * arrived. + * + * NOTE: Releases the lock on return. 
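+ */
+
+/*
+ * Editor's illustrative sketch, not part of the original source: the
+ * GRAB_OUR_PACKETS() macro above is invoked as
+ * "do { GRAB_OUR_PACKETS(ifp) } while (0)" so that a long
+ * multi-statement macro expands to exactly one C statement.  Minimal
+ * demonstration of the idiom:
+ */
+#if 0
+#include <stdio.h>
+
+#define	LOG_AND_COUNT(msg, counter)	\
+do {					\
+	puts(msg);			\
+	(counter)++;			\
+} while (0)
+
+/* Safe even under an unbraced if/else: */
+static void
+maybe_log(int cond, int *ctr)
+{
+	if (cond)
+		LOG_AND_COUNT("hit", *ctr);
+	else
+		LOG_AND_COUNT("miss", *ctr);
+}
+#endif
+
+/*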
+ */ +static void +bridge_broadcast(struct bridge_softc *sc, struct ifnet *src_if, + struct mbuf *m, int runfilt) +{ + struct bridge_iflist *dbif, *sbif; + struct mbuf *mc; + struct ifnet *dst_if; + int error = 0, used = 0, i; + + sbif = bridge_lookup_member_if(sc, src_if); + + BRIDGE_LOCK2REF(sc, error); + if (error) { + m_freem(m); + return; + } + + /* Filter on the bridge interface before broadcasting */ + if (runfilt && (PFIL_HOOKED(&V_inet_pfil_hook) +#ifdef INET6 + || PFIL_HOOKED(&V_inet6_pfil_hook) +#endif + )) { + if (bridge_pfil(&m, sc->sc_ifp, NULL, PFIL_OUT) != 0) + goto out; + if (m == NULL) + goto out; + } + + LIST_FOREACH(dbif, &sc->sc_iflist, bif_next) { + dst_if = dbif->bif_ifp; + if (dst_if == src_if) + continue; + + /* Private segments can not talk to each other */ + if (sbif && (sbif->bif_flags & dbif->bif_flags & IFBIF_PRIVATE)) + continue; + + if ((dbif->bif_flags & IFBIF_STP) && + dbif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING) + continue; + + if ((dbif->bif_flags & IFBIF_DISCOVER) == 0 && + (m->m_flags & (M_BCAST|M_MCAST)) == 0) + continue; + + if ((dst_if->if_drv_flags & IFF_DRV_RUNNING) == 0) + continue; + + if (LIST_NEXT(dbif, bif_next) == NULL) { + mc = m; + used = 1; + } else { + mc = m_dup(m, M_DONTWAIT); + if (mc == NULL) { + sc->sc_ifp->if_oerrors++; + continue; + } + } + + /* + * Filter on the output interface. Pass a NULL bridge interface + * pointer so we do not redundantly filter on the bridge for + * each interface we broadcast on. + */ + if (runfilt && (PFIL_HOOKED(&V_inet_pfil_hook) +#ifdef INET6 + || PFIL_HOOKED(&V_inet6_pfil_hook) +#endif + )) { + if (used == 0) { + /* Keep the layer3 header aligned */ + i = min(mc->m_pkthdr.len, max_protohdr); + mc = m_copyup(mc, i, ETHER_ALIGN); + if (mc == NULL) { + sc->sc_ifp->if_oerrors++; + continue; + } + } + if (bridge_pfil(&mc, NULL, dst_if, PFIL_OUT) != 0) + continue; + if (mc == NULL) + continue; + } + + bridge_enqueue(sc, dst_if, mc); + } + if (used == 0) + m_freem(m); + +out: + BRIDGE_UNREF(sc); +} + +/* + * bridge_span: + * + * Duplicate a packet out one or more interfaces that are in span mode, + * the original mbuf is unmodified. + */ +static void +bridge_span(struct bridge_softc *sc, struct mbuf *m) +{ + struct bridge_iflist *bif; + struct ifnet *dst_if; + struct mbuf *mc; + + if (LIST_EMPTY(&sc->sc_spanlist)) + return; + + LIST_FOREACH(bif, &sc->sc_spanlist, bif_next) { + dst_if = bif->bif_ifp; + + if ((dst_if->if_drv_flags & IFF_DRV_RUNNING) == 0) + continue; + + mc = m_copypacket(m, M_DONTWAIT); + if (mc == NULL) { + sc->sc_ifp->if_oerrors++; + continue; + } + + bridge_enqueue(sc, dst_if, mc); + } +} + +/* + * bridge_rtupdate: + * + * Add a bridge routing entry. + */ +static int +bridge_rtupdate(struct bridge_softc *sc, const uint8_t *dst, uint16_t vlan, + struct bridge_iflist *bif, int setflags, uint8_t flags) +{ + struct bridge_rtnode *brt; + int error; + + BRIDGE_LOCK_ASSERT(sc); + + /* Check the source address is valid and not multicast. */ + if (ETHER_IS_MULTICAST(dst) || + (dst[0] == 0 && dst[1] == 0 && dst[2] == 0 && + dst[3] == 0 && dst[4] == 0 && dst[5] == 0) != 0) + return (EINVAL); + + /* 802.1p frames map to vlan 1 */ + if (vlan == 0) + vlan = 1; + + /* + * A route for this destination might already exist. If so, + * update it, otherwise create a new one. 
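+	 */
+
+/*
+ * Editor's illustrative sketch, not part of the original source: a
+ * learnable source address in bridge_rtupdate() above must be unicast
+ * (group bit clear) and not all-zero, and untagged 802.1p frames are
+ * folded into vlan 1.  The address test as a stand-alone predicate:
+ */
+#if 0
+#include <stdint.h>
+#include <string.h>
+
+static int
+learnable_addr(const uint8_t addr[6])
+{
+	static const uint8_t zero[6];
+
+	if (addr[0] & 0x01)		/* multicast/group address */
+		return (0);
+	if (memcmp(addr, zero, sizeof(zero)) == 0)
+		return (0);		/* all-zero address */
+	return (1);
+}
+#endif
+
+	/*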
+	 */
+	if ((brt = bridge_rtnode_lookup(sc, dst, vlan)) == NULL) {
+		if (sc->sc_brtcnt >= sc->sc_brtmax) {
+			sc->sc_brtexceeded++;
+			return (ENOSPC);
+		}
+		/* Check per interface address limits (if enabled) */
+		if (bif->bif_addrmax && bif->bif_addrcnt >= bif->bif_addrmax) {
+			bif->bif_addrexceeded++;
+			return (ENOSPC);
+		}
+
+		/*
+		 * Allocate a new bridge forwarding node, and
+		 * initialize the expiration time and Ethernet
+		 * address.
+		 */
+		brt = uma_zalloc(bridge_rtnode_zone, M_NOWAIT | M_ZERO);
+		if (brt == NULL)
+			return (ENOMEM);
+
+		if (bif->bif_flags & IFBIF_STICKY)
+			brt->brt_flags = IFBAF_STICKY;
+		else
+			brt->brt_flags = IFBAF_DYNAMIC;
+
+		memcpy(brt->brt_addr, dst, ETHER_ADDR_LEN);
+		brt->brt_vlan = vlan;
+
+		if ((error = bridge_rtnode_insert(sc, brt)) != 0) {
+			uma_zfree(bridge_rtnode_zone, brt);
+			return (error);
+		}
+		brt->brt_dst = bif;
+		bif->bif_addrcnt++;
+	}
+
+	if ((brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC &&
+	    brt->brt_dst != bif) {
+		brt->brt_dst->bif_addrcnt--;
+		brt->brt_dst = bif;
+		brt->brt_dst->bif_addrcnt++;
+	}
+
+	if ((flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC)
+		brt->brt_expire = time_uptime + sc->sc_brttimeout;
+	if (setflags)
+		brt->brt_flags = flags;
+
+	return (0);
+}
+
+/*
+ * bridge_rtlookup:
+ *
+ *	Lookup the destination interface for an address.
+ */
+static struct ifnet *
+bridge_rtlookup(struct bridge_softc *sc, const uint8_t *addr, uint16_t vlan)
+{
+	struct bridge_rtnode *brt;
+
+	BRIDGE_LOCK_ASSERT(sc);
+
+	if ((brt = bridge_rtnode_lookup(sc, addr, vlan)) == NULL)
+		return (NULL);
+
+	return (brt->brt_ifp);
+}
+
+/*
+ * bridge_rttrim:
+ *
+ *	Trim the routing table so that we have a number
+ *	of routing entries less than or equal to the
+ *	maximum number.
+ */
+static void
+bridge_rttrim(struct bridge_softc *sc)
+{
+	struct bridge_rtnode *brt, *nbrt;
+
+	BRIDGE_LOCK_ASSERT(sc);
+
+	/* Make sure we actually need to do this. */
+	if (sc->sc_brtcnt <= sc->sc_brtmax)
+		return;
+
+	/* Force an aging cycle; this might trim enough addresses. */
+	bridge_rtage(sc);
+	if (sc->sc_brtcnt <= sc->sc_brtmax)
+		return;
+
+	LIST_FOREACH_SAFE(brt, &sc->sc_rtlist, brt_list, nbrt) {
+		if ((brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC) {
+			bridge_rtnode_destroy(sc, brt);
+			if (sc->sc_brtcnt <= sc->sc_brtmax)
+				return;
+		}
+	}
+}
+
+/*
+ * bridge_timer:
+ *
+ *	Aging timer for the bridge.
+ */
+static void
+bridge_timer(void *arg)
+{
+	struct bridge_softc *sc = arg;
+
+	BRIDGE_LOCK_ASSERT(sc);
+
+	bridge_rtage(sc);
+
+	if (sc->sc_ifp->if_drv_flags & IFF_DRV_RUNNING)
+		callout_reset(&sc->sc_brcallout,
+		    bridge_rtable_prune_period * hz, bridge_timer, sc);
+}
+
+/*
+ * bridge_rtage:
+ *
+ *	Perform an aging cycle.
+ */
+static void
+bridge_rtage(struct bridge_softc *sc)
+{
+	struct bridge_rtnode *brt, *nbrt;
+
+	BRIDGE_LOCK_ASSERT(sc);
+
+	LIST_FOREACH_SAFE(brt, &sc->sc_rtlist, brt_list, nbrt) {
+		if ((brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC) {
+			if (time_uptime >= brt->brt_expire)
+				bridge_rtnode_destroy(sc, brt);
+		}
+	}
+}
+
+/*
+ * bridge_rtflush:
+ *
+ *	Remove all dynamic addresses from the bridge.
+ */
+static void
+bridge_rtflush(struct bridge_softc *sc, int full)
+{
+	struct bridge_rtnode *brt, *nbrt;
+
+	BRIDGE_LOCK_ASSERT(sc);
+
+	LIST_FOREACH_SAFE(brt, &sc->sc_rtlist, brt_list, nbrt) {
+		if (full || (brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC)
+			bridge_rtnode_destroy(sc, brt);
+	}
+}
+
+/*
+ * bridge_rtdaddr:
+ *
+ *	Remove an address from the table.
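+ */
+
+/*
+ * Editor's illustrative sketch, not part of the original source:
+ * bridge_rtage() and bridge_rtflush() above delete entries while
+ * walking the list, which is only safe with LIST_FOREACH_SAFE because
+ * the next pointer is saved before the current node is freed; expiry
+ * compares against time_uptime, a monotonic seconds counter immune to
+ * wall-clock steps.  The same pattern hand-rolled on a minimal singly
+ * linked list, with 'now' standing in for time_uptime:
+ */
+#if 0
+#include <stdint.h>
+#include <stdlib.h>
+
+struct node {
+	struct node *next;
+	uint64_t expire;		/* absolute expiry, uptime seconds */
+};
+
+static struct node *
+age_list(struct node *head, uint64_t now)
+{
+	struct node **pp = &head, *n, *next;
+
+	for (n = head; n != NULL; n = next) {
+		next = n->next;		/* save before freeing */
+		if (now >= n->expire) {
+			*pp = next;	/* unlink the expired node */
+			free(n);
+		} else
+			pp = &n->next;
+	}
+	return (head);
+}
+#endif
+
+/*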
+ */
+static int
+bridge_rtdaddr(struct bridge_softc *sc, const uint8_t *addr, uint16_t vlan)
+{
+	struct bridge_rtnode *brt;
+	int found = 0;
+
+	BRIDGE_LOCK_ASSERT(sc);
+
+	/*
+	 * If vlan is zero then we want to delete for all vlans so the lookup
+	 * may return more than one.
+	 */
+	while ((brt = bridge_rtnode_lookup(sc, addr, vlan)) != NULL) {
+		bridge_rtnode_destroy(sc, brt);
+		found = 1;
+	}
+
+	return (found ? 0 : ENOENT);
+}
+
+/*
+ * bridge_rtdelete:
+ *
+ *	Delete routes to a specific member interface.
+ */
+static void
+bridge_rtdelete(struct bridge_softc *sc, struct ifnet *ifp, int full)
+{
+	struct bridge_rtnode *brt, *nbrt;
+
+	BRIDGE_LOCK_ASSERT(sc);
+
+	LIST_FOREACH_SAFE(brt, &sc->sc_rtlist, brt_list, nbrt) {
+		if (brt->brt_ifp == ifp && (full ||
+		    (brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC))
+			bridge_rtnode_destroy(sc, brt);
+	}
+}
+
+/*
+ * bridge_rtable_init:
+ *
+ *	Initialize the route table for this bridge.
+ */
+static int
+bridge_rtable_init(struct bridge_softc *sc)
+{
+	int i;
+
+	sc->sc_rthash = malloc(sizeof(*sc->sc_rthash) * BRIDGE_RTHASH_SIZE,
+	    M_DEVBUF, M_NOWAIT);
+	if (sc->sc_rthash == NULL)
+		return (ENOMEM);
+
+	for (i = 0; i < BRIDGE_RTHASH_SIZE; i++)
+		LIST_INIT(&sc->sc_rthash[i]);
+
+	sc->sc_rthash_key = arc4random();
+
+	LIST_INIT(&sc->sc_rtlist);
+
+	return (0);
+}
+
+/*
+ * bridge_rtable_fini:
+ *
+ *	Deconstruct the route table for this bridge.
+ */
+static void
+bridge_rtable_fini(struct bridge_softc *sc)
+{
+
+	KASSERT(sc->sc_brtcnt == 0,
+	    ("%s: %d bridge routes referenced", __func__, sc->sc_brtcnt));
+	free(sc->sc_rthash, M_DEVBUF);
+}
+
+/*
+ * The following hash function is adapted from "Hash Functions" by Bob Jenkins
+ * ("Algorithm Alley", Dr. Dobbs Journal, September 1997).
+ */
+#define	mix(a, b, c)						\
+do {								\
+	a -= b; a -= c; a ^= (c >> 13);				\
+	b -= c; b -= a; b ^= (a << 8);				\
+	c -= a; c -= b; c ^= (b >> 13);				\
+	a -= b; a -= c; a ^= (c >> 12);				\
+	b -= c; b -= a; b ^= (a << 16);				\
+	c -= a; c -= b; c ^= (b >> 5);				\
+	a -= b; a -= c; a ^= (c >> 3);				\
+	b -= c; b -= a; b ^= (a << 10);				\
+	c -= a; c -= b; c ^= (b >> 15);				\
+} while (/*CONSTCOND*/0)
+
+static __inline uint32_t
+bridge_rthash(struct bridge_softc *sc, const uint8_t *addr)
+{
+	uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = sc->sc_rthash_key;
+
+	b += addr[5] << 8;
+	b += addr[4];
+	a += addr[3] << 24;
+	a += addr[2] << 16;
+	a += addr[1] << 8;
+	a += addr[0];
+
+	mix(a, b, c);
+
+	return (c & BRIDGE_RTHASH_MASK);
+}
+
+#undef mix
+
+static int
+bridge_rtnode_addr_cmp(const uint8_t *a, const uint8_t *b)
+{
+	int i, d;
+
+	for (i = 0, d = 0; i < ETHER_ADDR_LEN && d == 0; i++) {
+		d = ((int)a[i]) - ((int)b[i]);
+	}
+
+	return (d);
+}
+
+/*
+ * bridge_rtnode_lookup:
+ *
+ *	Look up a bridge route node for the specified destination. Compare the
+ *	vlan id or if zero then just return the first match.
+ */
+static struct bridge_rtnode *
+bridge_rtnode_lookup(struct bridge_softc *sc, const uint8_t *addr, uint16_t vlan)
+{
+	struct bridge_rtnode *brt;
+	uint32_t hash;
+	int dir;
+
+	BRIDGE_LOCK_ASSERT(sc);
+
+	hash = bridge_rthash(sc, addr);
+	LIST_FOREACH(brt, &sc->sc_rthash[hash], brt_hash) {
+		dir = bridge_rtnode_addr_cmp(addr, brt->brt_addr);
+		if (dir == 0 && (brt->brt_vlan == vlan || vlan == 0))
+			return (brt);
+		if (dir > 0)
+			return (NULL);
+	}
+
+	return (NULL);
+}
+
+/*
+ * bridge_rtnode_insert:
+ *
+ *	Insert the specified bridge node into the route table.  We
+ *	assume the entry is not already in the table.
+ */ +static int +bridge_rtnode_insert(struct bridge_softc *sc, struct bridge_rtnode *brt) +{ + struct bridge_rtnode *lbrt; + uint32_t hash; + int dir; + + BRIDGE_LOCK_ASSERT(sc); + + hash = bridge_rthash(sc, brt->brt_addr); + + lbrt = LIST_FIRST(&sc->sc_rthash[hash]); + if (lbrt == NULL) { + LIST_INSERT_HEAD(&sc->sc_rthash[hash], brt, brt_hash); + goto out; + } + + do { + dir = bridge_rtnode_addr_cmp(brt->brt_addr, lbrt->brt_addr); + if (dir == 0 && brt->brt_vlan == lbrt->brt_vlan) + return (EEXIST); + if (dir > 0) { + LIST_INSERT_BEFORE(lbrt, brt, brt_hash); + goto out; + } + if (LIST_NEXT(lbrt, brt_hash) == NULL) { + LIST_INSERT_AFTER(lbrt, brt, brt_hash); + goto out; + } + lbrt = LIST_NEXT(lbrt, brt_hash); + } while (lbrt != NULL); + +#ifdef DIAGNOSTIC + panic("bridge_rtnode_insert: impossible"); +#endif + +out: + LIST_INSERT_HEAD(&sc->sc_rtlist, brt, brt_list); + sc->sc_brtcnt++; + + return (0); +} + +/* + * bridge_rtnode_destroy: + * + * Destroy a bridge rtnode. + */ +static void +bridge_rtnode_destroy(struct bridge_softc *sc, struct bridge_rtnode *brt) +{ + BRIDGE_LOCK_ASSERT(sc); + + LIST_REMOVE(brt, brt_hash); + + LIST_REMOVE(brt, brt_list); + sc->sc_brtcnt--; + brt->brt_dst->bif_addrcnt--; + uma_zfree(bridge_rtnode_zone, brt); +} + +/* + * bridge_rtable_expire: + * + * Set the expiry time for all routes on an interface. + */ +static void +bridge_rtable_expire(struct ifnet *ifp, int age) +{ + struct bridge_softc *sc = ifp->if_bridge; + struct bridge_rtnode *brt; + + BRIDGE_LOCK(sc); + + /* + * If the age is zero then flush, otherwise set all the expiry times to + * age for the interface + */ + if (age == 0) + bridge_rtdelete(sc, ifp, IFBF_FLUSHDYN); + else { + LIST_FOREACH(brt, &sc->sc_rtlist, brt_list) { + /* Cap the expiry time to 'age' */ + if (brt->brt_ifp == ifp && + brt->brt_expire > time_uptime + age && + (brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC) + brt->brt_expire = time_uptime + age; + } + } + BRIDGE_UNLOCK(sc); +} + +/* + * bridge_state_change: + * + * Callback from the bridgestp code when a port changes states. + */ +static void +bridge_state_change(struct ifnet *ifp, int state) +{ + struct bridge_softc *sc = ifp->if_bridge; + static const char *stpstates[] = { + "disabled", + "listening", + "learning", + "forwarding", + "blocking", + "discarding" + }; + + if (log_stp) + log(LOG_NOTICE, "%s: state changed to %s on %s\n", + sc->sc_ifp->if_xname, stpstates[state], ifp->if_xname); +} + +/* + * Send bridge packets through pfil if they are one of the types pfil can deal + * with, or if they are ARP or REVARP. (pfil will pass ARP and REVARP without + * question.) If *bifp or *ifp are NULL then packet filtering is skipped for + * that interface. 
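+ * + * Editorial summary of the flow below (added comment): non-IP frames + * are dropped unless pfil_onlyip is cleared, the Ethernet (and any + * SNAP) header is pared off, the hooks run in in_if -> bridge_if -> + * out_if order, and the saved headers are prepended again before the + * frame continues through the bridge.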
+ */ +static int +bridge_pfil(struct mbuf **mp, struct ifnet *bifp, struct ifnet *ifp, int dir) +{ + int snap, error, i, hlen; + struct ether_header *eh1, eh2; + struct ip_fw_args args; + struct ip *ip; + struct llc llc1; + u_int16_t ether_type; + + snap = 0; + error = -1; /* Default error if not error == 0 */ + +#if 0 + /* we may return with the IP fields swapped, ensure its not shared */ + KASSERT(M_WRITABLE(*mp), ("%s: modifying a shared mbuf", __func__)); +#endif + + if (pfil_bridge == 0 && pfil_member == 0 && pfil_ipfw == 0) + return (0); /* filtering is disabled */ + + i = min((*mp)->m_pkthdr.len, max_protohdr); + if ((*mp)->m_len < i) { + *mp = m_pullup(*mp, i); + if (*mp == NULL) { + printf("%s: m_pullup failed\n", __func__); + return (-1); + } + } + + eh1 = mtod(*mp, struct ether_header *); + ether_type = ntohs(eh1->ether_type); + + /* + * Check for SNAP/LLC. + */ + if (ether_type < ETHERMTU) { + struct llc *llc2 = (struct llc *)(eh1 + 1); + + if ((*mp)->m_len >= ETHER_HDR_LEN + 8 && + llc2->llc_dsap == LLC_SNAP_LSAP && + llc2->llc_ssap == LLC_SNAP_LSAP && + llc2->llc_control == LLC_UI) { + ether_type = htons(llc2->llc_un.type_snap.ether_type); + snap = 1; + } + } + + /* + * If we're trying to filter bridge traffic, don't look at anything + * other than IP and ARP traffic. If the filter doesn't understand + * IPv6, don't allow IPv6 through the bridge either. This is lame + * since if we really wanted, say, an AppleTalk filter, we are hosed, + * but of course we don't have an AppleTalk filter to begin with. + * (Note that since pfil doesn't understand ARP it will pass *ALL* + * ARP traffic.) + */ + switch (ether_type) { + case ETHERTYPE_ARP: + case ETHERTYPE_REVARP: + if (pfil_ipfw_arp == 0) + return (0); /* Automatically pass */ + break; + + case ETHERTYPE_IP: +#ifdef INET6 + case ETHERTYPE_IPV6: +#endif /* INET6 */ + break; + default: + /* + * Check to see if the user wants to pass non-ip + * packets, these will not be checked by pfil(9) and + * passed unconditionally so the default is to drop. + */ + if (pfil_onlyip) + goto bad; + } + + /* Strip off the Ethernet header and keep a copy. */ + m_copydata(*mp, 0, ETHER_HDR_LEN, (caddr_t) &eh2); + m_adj(*mp, ETHER_HDR_LEN); + + /* Strip off snap header, if present */ + if (snap) { + m_copydata(*mp, 0, sizeof(struct llc), (caddr_t) &llc1); + m_adj(*mp, sizeof(struct llc)); + } + + /* + * Check the IP header for alignment and errors + */ + if (dir == PFIL_IN) { + switch (ether_type) { + case ETHERTYPE_IP: + error = bridge_ip_checkbasic(mp); + break; +#ifdef INET6 + case ETHERTYPE_IPV6: + error = bridge_ip6_checkbasic(mp); + break; +#endif /* INET6 */ + default: + error = 0; + } + if (error) + goto bad; + } + + /* XXX this section is also in if_ethersubr.c */ + // XXX PFIL_OUT or DIR_OUT ? + if (V_ip_fw_chk_ptr && pfil_ipfw != 0 && + dir == PFIL_OUT && ifp != NULL) { + struct m_tag *mtag; + + error = -1; + /* fetch the start point from existing tags, if any */ + mtag = m_tag_locate(*mp, MTAG_IPFW_RULE, 0, NULL); + if (mtag == NULL) { + args.rule.slot = 0; + } else { + struct ipfw_rule_ref *r; + + /* XXX can we free the tag after use ? */ + mtag->m_tag_id = PACKET_TAG_NONE; + r = (struct ipfw_rule_ref *)(mtag + 1); + /* packet already partially processed ? 
*/ + if (r->info & IPFW_ONEPASS) + goto ipfwpass; + args.rule = *r; + } + + args.m = *mp; + args.oif = ifp; + args.next_hop = NULL; + args.eh = &eh2; + args.inp = NULL; /* used by ipfw uid/gid/jail rules */ + i = V_ip_fw_chk_ptr(&args); + *mp = args.m; + + if (*mp == NULL) + return (error); + + if (ip_dn_io_ptr && (i == IP_FW_DUMMYNET)) { + + /* put the Ethernet header back on */ + M_PREPEND(*mp, ETHER_HDR_LEN, M_DONTWAIT); + if (*mp == NULL) + return (error); + bcopy(&eh2, mtod(*mp, caddr_t), ETHER_HDR_LEN); + + /* + * Pass the pkt to dummynet, which consumes it. The + * packet will return to us via bridge_dummynet(). + */ + args.oif = ifp; + ip_dn_io_ptr(mp, DIR_FWD | PROTO_IFB, &args); + return (error); + } + + if (i != IP_FW_PASS) /* drop */ + goto bad; + } + +ipfwpass: + error = 0; + + /* + * Run the packet through pfil + */ + switch (ether_type) { + case ETHERTYPE_IP: + /* + * before calling the firewall, swap fields the same as + * IP does. here we assume the header is contiguous + */ + ip = mtod(*mp, struct ip *); + + ip->ip_len = ntohs(ip->ip_len); + ip->ip_off = ntohs(ip->ip_off); + + /* + * Run pfil on the member interface and the bridge, both can + * be skipped by clearing pfil_member or pfil_bridge. + * + * Keep the order: + * in_if -> bridge_if -> out_if + */ + if (pfil_bridge && dir == PFIL_OUT && bifp != NULL) + error = pfil_run_hooks(&V_inet_pfil_hook, mp, bifp, + dir, NULL); + + if (*mp == NULL || error != 0) /* filter may consume */ + break; + + if (pfil_member && ifp != NULL) + error = pfil_run_hooks(&V_inet_pfil_hook, mp, ifp, + dir, NULL); + + if (*mp == NULL || error != 0) /* filter may consume */ + break; + + if (pfil_bridge && dir == PFIL_IN && bifp != NULL) + error = pfil_run_hooks(&V_inet_pfil_hook, mp, bifp, + dir, NULL); + + if (*mp == NULL || error != 0) /* filter may consume */ + break; + + /* check if we need to fragment the packet */ + if (pfil_member && ifp != NULL && dir == PFIL_OUT) { + i = (*mp)->m_pkthdr.len; + if (i > ifp->if_mtu) { + error = bridge_fragment(ifp, *mp, &eh2, snap, + &llc1); + return (error); + } + } + + /* Recalculate the ip checksum and restore byte ordering */ + ip = mtod(*mp, struct ip *); + hlen = ip->ip_hl << 2; + if (hlen < sizeof(struct ip)) + goto bad; + if (hlen > (*mp)->m_len) { + if ((*mp = m_pullup(*mp, hlen)) == 0) + goto bad; + ip = mtod(*mp, struct ip *); + if (ip == NULL) + goto bad; + } + ip->ip_len = htons(ip->ip_len); + ip->ip_off = htons(ip->ip_off); + ip->ip_sum = 0; + if (hlen == sizeof(struct ip)) + ip->ip_sum = in_cksum_hdr(ip); + else + ip->ip_sum = in_cksum(*mp, hlen); + + break; +#ifdef INET6 + case ETHERTYPE_IPV6: + if (pfil_bridge && dir == PFIL_OUT && bifp != NULL) + error = pfil_run_hooks(&V_inet6_pfil_hook, mp, bifp, + dir, NULL); + + if (*mp == NULL || error != 0) /* filter may consume */ + break; + + if (pfil_member && ifp != NULL) + error = pfil_run_hooks(&V_inet6_pfil_hook, mp, ifp, + dir, NULL); + + if (*mp == NULL || error != 0) /* filter may consume */ + break; + + if (pfil_bridge && dir == PFIL_IN && bifp != NULL) + error = pfil_run_hooks(&V_inet6_pfil_hook, mp, bifp, + dir, NULL); + break; +#endif + default: + error = 0; + break; + } + + if (*mp == NULL) + return (error); + if (error != 0) + goto bad; + + error = -1; + + /* + * Finally, put everything back the way it was and return + */ + if (snap) { + M_PREPEND(*mp, sizeof(struct llc), M_DONTWAIT); + if (*mp == NULL) + return (error); + bcopy(&llc1, mtod(*mp, caddr_t), sizeof(struct llc)); + } + + M_PREPEND(*mp, ETHER_HDR_LEN, M_DONTWAIT); + if 
(*mp == NULL) + return (error); + bcopy(&eh2, mtod(*mp, caddr_t), ETHER_HDR_LEN); + + return (0); + +bad: + m_freem(*mp); + *mp = NULL; + return (error); +} + +/* + * Perform basic checks on header size since + * pfil assumes ip_input has already processed + * it for it. Cut-and-pasted from ip_input.c. + * Given how simple the IPv6 version is, + * does the IPv4 version really need to be + * this complicated? + * + * XXX Should we update ipstat here, or not? + * XXX Right now we update ipstat but not + * XXX csum_counter. + */ +static int +bridge_ip_checkbasic(struct mbuf **mp) +{ + struct mbuf *m = *mp; + struct ip *ip; + int len, hlen; + u_short sum; + + if (*mp == NULL) + return (-1); + + if (IP_HDR_ALIGNED_P(mtod(m, caddr_t)) == 0) { + if ((m = m_copyup(m, sizeof(struct ip), + (max_linkhdr + 3) & ~3)) == NULL) { + /* XXXJRT new stat, please */ + KMOD_IPSTAT_INC(ips_toosmall); + goto bad; + } + } else if (__predict_false(m->m_len < sizeof (struct ip))) { + if ((m = m_pullup(m, sizeof (struct ip))) == NULL) { + KMOD_IPSTAT_INC(ips_toosmall); + goto bad; + } + } + ip = mtod(m, struct ip *); + if (ip == NULL) goto bad; + + if (ip->ip_v != IPVERSION) { + KMOD_IPSTAT_INC(ips_badvers); + goto bad; + } + hlen = ip->ip_hl << 2; + if (hlen < sizeof(struct ip)) { /* minimum header length */ + KMOD_IPSTAT_INC(ips_badhlen); + goto bad; + } + if (hlen > m->m_len) { + if ((m = m_pullup(m, hlen)) == 0) { + KMOD_IPSTAT_INC(ips_badhlen); + goto bad; + } + ip = mtod(m, struct ip *); + if (ip == NULL) goto bad; + } + + if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) { + sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID); + } else { + if (hlen == sizeof(struct ip)) { + sum = in_cksum_hdr(ip); + } else { + sum = in_cksum(m, hlen); + } + } + if (sum) { + KMOD_IPSTAT_INC(ips_badsum); + goto bad; + } + + /* Retrieve the packet length. */ + len = ntohs(ip->ip_len); + + /* + * Check for additional length bogosity + */ + if (len < hlen) { + KMOD_IPSTAT_INC(ips_badlen); + goto bad; + } + + /* + * Check that the amount of data in the buffers + * is at least as much as the IP header would have us expect. + * Drop packet if shorter than we expect. + */ + if (m->m_pkthdr.len < len) { + KMOD_IPSTAT_INC(ips_tooshort); + goto bad; + } + + /* Checks out, proceed */ + *mp = m; + return (0); + +bad: + *mp = m; + return (-1); +} + +#ifdef INET6 +/* + * Same as above, but for IPv6. + * Cut-and-pasted from ip6_input.c. + * XXX Should we update ip6stat, or not? + */ +static int +bridge_ip6_checkbasic(struct mbuf **mp) +{ + struct mbuf *m = *mp; + struct ip6_hdr *ip6; + + /* + * If the IPv6 header is not aligned, slurp it up into a new + * mbuf with space for link headers, in the event we forward + * it. Otherwise, if it is aligned, make sure the entire base + * IPv6 header is in the first mbuf of the chain.
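+ * + * Editorial note (assumption, not from the original source): the + * alignment test matters on strict-alignment CPUs, where dereferencing + * a misaligned struct ip6_hdr would fault; m_copyup() lays the header + * out again in a fresh mbuf with (max_linkhdr + 3) & ~3 bytes of + * leading space so a link-layer header can later be prepended without + * another allocation.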
+ */ + if (IP6_HDR_ALIGNED_P(mtod(m, caddr_t)) == 0) { + struct ifnet *inifp = m->m_pkthdr.rcvif; + if ((m = m_copyup(m, sizeof(struct ip6_hdr), + (max_linkhdr + 3) & ~3)) == NULL) { + /* XXXJRT new stat, please */ + V_ip6stat.ip6s_toosmall++; + in6_ifstat_inc(inifp, ifs6_in_hdrerr); + goto bad; + } + } else if (__predict_false(m->m_len < sizeof(struct ip6_hdr))) { + struct ifnet *inifp = m->m_pkthdr.rcvif; + if ((m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) { + V_ip6stat.ip6s_toosmall++; + in6_ifstat_inc(inifp, ifs6_in_hdrerr); + goto bad; + } + } + + ip6 = mtod(m, struct ip6_hdr *); + + if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { + V_ip6stat.ip6s_badvers++; + in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_hdrerr); + goto bad; + } + + /* Checks out, proceed */ + *mp = m; + return (0); + +bad: + *mp = m; + return (-1); +} +#endif /* INET6 */ + +/* + * bridge_fragment: + * + * Return a fragmented mbuf chain. + */ +static int +bridge_fragment(struct ifnet *ifp, struct mbuf *m, struct ether_header *eh, + int snap, struct llc *llc) +{ + struct mbuf *m0; + struct ip *ip; + int error = -1; + + if (m->m_len < sizeof(struct ip) && + (m = m_pullup(m, sizeof(struct ip))) == NULL) + goto out; + ip = mtod(m, struct ip *); + + error = ip_fragment(ip, &m, ifp->if_mtu, ifp->if_hwassist, + CSUM_DELAY_IP); + if (error) + goto out; + + /* walk the chain and re-add the Ethernet header */ + for (m0 = m; m0; m0 = m0->m_nextpkt) { + if (error == 0) { + if (snap) { + M_PREPEND(m0, sizeof(struct llc), M_DONTWAIT); + if (m0 == NULL) { + error = ENOBUFS; + continue; + } + bcopy(llc, mtod(m0, caddr_t), + sizeof(struct llc)); + } + M_PREPEND(m0, ETHER_HDR_LEN, M_DONTWAIT); + if (m0 == NULL) { + error = ENOBUFS; + continue; + } + bcopy(eh, mtod(m0, caddr_t), ETHER_HDR_LEN); + } else + m_freem(m); + } + + if (error == 0) + KMOD_IPSTAT_INC(ips_fragmented); + + return (error); + +out: + if (m != NULL) + m_freem(m); + return (error); +} diff --git a/freebsd/sys/net/if_bridgevar.h b/freebsd/sys/net/if_bridgevar.h new file mode 100644 index 00000000..642cc98d --- /dev/null +++ b/freebsd/sys/net/if_bridgevar.h @@ -0,0 +1,328 @@ +/* $NetBSD: if_bridgevar.h,v 1.4 2003/07/08 07:13:50 itojun Exp $ */ + +/* + * Copyright 2001 Wasabi Systems, Inc. + * All rights reserved. + * + * Written by Jason R. Thorpe for Wasabi Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed for the NetBSD Project by + * Wasabi Systems, Inc. + * 4. The name of Wasabi Systems, Inc. may not be used to endorse + * or promote products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL WASABI SYSTEMS, INC + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright (c) 1999, 2000 Jason L. Wright (jason@thought.net) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Jason L. Wright + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * OpenBSD: if_bridge.h,v 1.14 2001/03/22 03:48:29 jason Exp + * + * $FreeBSD$ + */ + +/* + * Data structure and control definitions for bridge interfaces. + */ + +#include <sys/callout.h> +#include <sys/queue.h> +#include <sys/condvar.h> + +/* + * Commands used in the SIOCSDRVSPEC ioctl. Note the lookup of the + * bridge interface itself is keyed off the ifdrv structure.
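+ * + * Illustrative userspace sketch (editorial addition; "bridge0", "em0" + * and the socket s are hypothetical, error handling omitted): + * + *	struct ifdrv ifd; + *	struct ifbreq req; + * + *	memset(&ifd, 0, sizeof(ifd)); + *	memset(&req, 0, sizeof(req)); + *	strlcpy(ifd.ifd_name, "bridge0", sizeof(ifd.ifd_name)); + *	strlcpy(req.ifbr_ifsname, "em0", sizeof(req.ifbr_ifsname)); + *	ifd.ifd_cmd = BRDGADD;	(add em0 as a bridge member) + *	ifd.ifd_len = sizeof(req); + *	ifd.ifd_data = &req; + *	ioctl(s, SIOCSDRVSPEC, &ifd);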
+ */ +#define BRDGADD 0 /* add bridge member (ifbreq) */ +#define BRDGDEL 1 /* delete bridge member (ifbreq) */ +#define BRDGGIFFLGS 2 /* get member if flags (ifbreq) */ +#define BRDGSIFFLGS 3 /* set member if flags (ifbreq) */ +#define BRDGSCACHE 4 /* set cache size (ifbrparam) */ +#define BRDGGCACHE 5 /* get cache size (ifbrparam) */ +#define BRDGGIFS 6 /* get member list (ifbifconf) */ +#define BRDGRTS 7 /* get address list (ifbaconf) */ +#define BRDGSADDR 8 /* set static address (ifbareq) */ +#define BRDGSTO 9 /* set cache timeout (ifbrparam) */ +#define BRDGGTO 10 /* get cache timeout (ifbrparam) */ +#define BRDGDADDR 11 /* delete address (ifbareq) */ +#define BRDGFLUSH 12 /* flush address cache (ifbreq) */ + +#define BRDGGPRI 13 /* get priority (ifbrparam) */ +#define BRDGSPRI 14 /* set priority (ifbrparam) */ +#define BRDGGHT 15 /* get hello time (ifbrparam) */ +#define BRDGSHT 16 /* set hello time (ifbrparam) */ +#define BRDGGFD 17 /* get forward delay (ifbrparam) */ +#define BRDGSFD 18 /* set forward delay (ifbrparam) */ +#define BRDGGMA 19 /* get max age (ifbrparam) */ +#define BRDGSMA 20 /* set max age (ifbrparam) */ +#define BRDGSIFPRIO 21 /* set if priority (ifbreq) */ +#define BRDGSIFCOST 22 /* set if path cost (ifbreq) */ +#define BRDGADDS 23 /* add bridge span member (ifbreq) */ +#define BRDGDELS 24 /* delete bridge span member (ifbreq) */ +#define BRDGPARAM 25 /* get bridge STP params (ifbropreq) */ +#define BRDGGRTE 26 /* get cache drops (ifbrparam) */ +#define BRDGGIFSSTP 27 /* get member STP params list + * (ifbpstpconf) */ +#define BRDGSPROTO 28 /* set protocol (ifbrparam) */ +#define BRDGSTXHC 29 /* set tx hold count (ifbrparam) */ +#define BRDGSIFAMAX 30 /* set max interface addrs (ifbreq) */ + +/* + * Generic bridge control request. + */ +struct ifbreq { + char ifbr_ifsname[IFNAMSIZ]; /* member if name */ + uint32_t ifbr_ifsflags; /* member if flags */ + uint32_t ifbr_stpflags; /* member if STP flags */ + uint32_t ifbr_path_cost; /* member if STP cost */ + uint8_t ifbr_portno; /* member if port number */ + uint8_t ifbr_priority; /* member if STP priority */ + uint8_t ifbr_proto; /* member if STP protocol */ + uint8_t ifbr_role; /* member if STP role */ + uint8_t ifbr_state; /* member if STP state */ + uint32_t ifbr_addrcnt; /* member if addr number */ + uint32_t ifbr_addrmax; /* member if addr max */ + uint32_t ifbr_addrexceeded; /* member if addr violations */ + uint8_t pad[32]; +}; + +/* BRDGGIFFLAGS, BRDGSIFFLAGS */ +#define IFBIF_LEARNING 0x0001 /* if can learn */ +#define IFBIF_DISCOVER 0x0002 /* if sends packets w/ unknown dest. 
*/ +#define IFBIF_STP 0x0004 /* if participates in spanning tree */ +#define IFBIF_SPAN 0x0008 /* if is a span port */ +#define IFBIF_STICKY 0x0010 /* if learned addresses stick */ +#define IFBIF_BSTP_EDGE 0x0020 /* member stp edge port */ +#define IFBIF_BSTP_AUTOEDGE 0x0040 /* member stp autoedge enabled */ +#define IFBIF_BSTP_PTP 0x0080 /* member stp point to point */ +#define IFBIF_BSTP_AUTOPTP 0x0100 /* member stp autoptp enabled */ +#define IFBIF_BSTP_ADMEDGE 0x0200 /* member stp admin edge enabled */ +#define IFBIF_BSTP_ADMCOST 0x0400 /* member stp admin path cost */ +#define IFBIF_PRIVATE 0x0800 /* if is a private segment */ + +#define IFBIFBITS "\020\001LEARNING\002DISCOVER\003STP\004SPAN" \ + "\005STICKY\014PRIVATE\006EDGE\007AUTOEDGE\010PTP" \ + "\011AUTOPTP" +#define IFBIFMASK ~(IFBIF_BSTP_EDGE|IFBIF_BSTP_AUTOEDGE|IFBIF_BSTP_PTP| \ + IFBIF_BSTP_AUTOPTP|IFBIF_BSTP_ADMEDGE| \ + IFBIF_BSTP_ADMCOST) /* not saved */ + +/* BRDGFLUSH */ +#define IFBF_FLUSHDYN 0x00 /* flush learned addresses only */ +#define IFBF_FLUSHALL 0x01 /* flush all addresses */ + +/* + * Interface list structure. + */ +struct ifbifconf { + uint32_t ifbic_len; /* buffer size */ + union { + caddr_t ifbicu_buf; + struct ifbreq *ifbicu_req; + } ifbic_ifbicu; +#define ifbic_buf ifbic_ifbicu.ifbicu_buf +#define ifbic_req ifbic_ifbicu.ifbicu_req +}; + +/* + * Bridge address request. + */ +struct ifbareq { + char ifba_ifsname[IFNAMSIZ]; /* member if name */ + unsigned long ifba_expire; /* address expire time */ + uint8_t ifba_flags; /* address flags */ + uint8_t ifba_dst[ETHER_ADDR_LEN];/* destination address */ + uint16_t ifba_vlan; /* vlan id */ +}; + +#define IFBAF_TYPEMASK 0x03 /* address type mask */ +#define IFBAF_DYNAMIC 0x00 /* dynamically learned address */ +#define IFBAF_STATIC 0x01 /* static address */ +#define IFBAF_STICKY 0x02 /* sticky address */ + +#define IFBAFBITS "\020\1STATIC\2STICKY" + +/* + * Address list structure. + */ +struct ifbaconf { + uint32_t ifbac_len; /* buffer size */ + union { + caddr_t ifbacu_buf; + struct ifbareq *ifbacu_req; + } ifbac_ifbacu; +#define ifbac_buf ifbac_ifbacu.ifbacu_buf +#define ifbac_req ifbac_ifbacu.ifbacu_req +}; + +/* + * Bridge parameter structure. + */ +struct ifbrparam { + union { + uint32_t ifbrpu_int32; + uint16_t ifbrpu_int16; + uint8_t ifbrpu_int8; + } ifbrp_ifbrpu; +}; +#define ifbrp_csize ifbrp_ifbrpu.ifbrpu_int32 /* cache size */ +#define ifbrp_ctime ifbrp_ifbrpu.ifbrpu_int32 /* cache time (sec) */ +#define ifbrp_prio ifbrp_ifbrpu.ifbrpu_int16 /* bridge priority */ +#define ifbrp_proto ifbrp_ifbrpu.ifbrpu_int8 /* bridge protocol */ +#define ifbrp_txhc ifbrp_ifbrpu.ifbrpu_int8 /* bpdu tx holdcount */ +#define ifbrp_hellotime ifbrp_ifbrpu.ifbrpu_int8 /* hello time (sec) */ +#define ifbrp_fwddelay ifbrp_ifbrpu.ifbrpu_int8 /* fwd time (sec) */ +#define ifbrp_maxage ifbrp_ifbrpu.ifbrpu_int8 /* max age (sec) */ +#define ifbrp_cexceeded ifbrp_ifbrpu.ifbrpu_int32 /* # of cache dropped + * addresses */ +/* + * Bridge current operational parameters structure. + */ +struct ifbropreq { + uint8_t ifbop_holdcount; + uint8_t ifbop_maxage; + uint8_t ifbop_hellotime; + uint8_t ifbop_fwddelay; + uint8_t ifbop_protocol; + uint16_t ifbop_priority; + uint16_t ifbop_root_port; + uint32_t ifbop_root_path_cost; + uint64_t ifbop_bridgeid; + uint64_t ifbop_designated_root; + uint64_t ifbop_designated_bridge; + struct timeval ifbop_last_tc_time; +}; + +/* + * Bridge member operational STP params structure.
+ */ +struct ifbpstpreq { + uint8_t ifbp_portno; /* bp STP port number */ + uint32_t ifbp_fwd_trans; /* bp STP fwd transitions */ + uint32_t ifbp_design_cost; /* bp STP designated cost */ + uint32_t ifbp_design_port; /* bp STP designated port */ + uint64_t ifbp_design_bridge; /* bp STP designated bridge */ + uint64_t ifbp_design_root; /* bp STP designated root */ +}; + +/* + * Bridge STP ports list structure. + */ +struct ifbpstpconf { + uint32_t ifbpstp_len; /* buffer size */ + union { + caddr_t ifbpstpu_buf; + struct ifbpstpreq *ifbpstpu_req; + } ifbpstp_ifbpstpu; +#define ifbpstp_buf ifbpstp_ifbpstpu.ifbpstpu_buf +#define ifbpstp_req ifbpstp_ifbpstpu.ifbpstpu_req +}; + +#ifdef _KERNEL + +#define BRIDGE_LOCK_INIT(_sc) do { \ + mtx_init(&(_sc)->sc_mtx, "if_bridge", NULL, MTX_DEF); \ + cv_init(&(_sc)->sc_cv, "if_bridge_cv"); \ +} while (0) +#define BRIDGE_LOCK_DESTROY(_sc) do { \ + mtx_destroy(&(_sc)->sc_mtx); \ + cv_destroy(&(_sc)->sc_cv); \ +} while (0) +#define BRIDGE_LOCK(_sc) mtx_lock(&(_sc)->sc_mtx) +#define BRIDGE_UNLOCK(_sc) mtx_unlock(&(_sc)->sc_mtx) +#define BRIDGE_LOCK_ASSERT(_sc) mtx_assert(&(_sc)->sc_mtx, MA_OWNED) +#define BRIDGE_LOCK2REF(_sc, _err) do { \ + mtx_assert(&(_sc)->sc_mtx, MA_OWNED); \ + if ((_sc)->sc_iflist_xcnt > 0) \ + (_err) = EBUSY; \ + else \ + (_sc)->sc_iflist_ref++; \ + mtx_unlock(&(_sc)->sc_mtx); \ +} while (0) +#define BRIDGE_UNREF(_sc) do { \ + mtx_lock(&(_sc)->sc_mtx); \ + (_sc)->sc_iflist_ref--; \ + if (((_sc)->sc_iflist_xcnt > 0) && ((_sc)->sc_iflist_ref == 0)) \ + cv_broadcast(&(_sc)->sc_cv); \ + mtx_unlock(&(_sc)->sc_mtx); \ +} while (0) +#define BRIDGE_XLOCK(_sc) do { \ + mtx_assert(&(_sc)->sc_mtx, MA_OWNED); \ + (_sc)->sc_iflist_xcnt++; \ + while ((_sc)->sc_iflist_ref > 0) \ + cv_wait(&(_sc)->sc_cv, &(_sc)->sc_mtx); \ +} while (0) +#define BRIDGE_XDROP(_sc) do { \ + mtx_assert(&(_sc)->sc_mtx, MA_OWNED); \ + (_sc)->sc_iflist_xcnt--; \ +} while (0) + +#define BRIDGE_INPUT(_ifp, _m) do { \ + KASSERT(bridge_input_p != NULL, \ + ("%s: if_bridge not loaded!", __func__)); \ + _m = (*bridge_input_p)(_ifp, _m); \ + if (_m != NULL) \ + _ifp = _m->m_pkthdr.rcvif; \ +} while (0) + +#define BRIDGE_OUTPUT(_ifp, _m, _err) do { \ + KASSERT(bridge_output_p != NULL, \ + ("%s: if_bridge not loaded!", __func__)); \ + _err = (*bridge_output_p)(_ifp, _m, NULL, NULL); \ +} while (0) + +extern struct mbuf *(*bridge_input_p)(struct ifnet *, struct mbuf *); +extern int (*bridge_output_p)(struct ifnet *, struct mbuf *, + struct sockaddr *, struct rtentry *); +extern void (*bridge_dn_p)(struct mbuf *, struct ifnet *); + +#endif /* _KERNEL */ diff --git a/freebsd/sys/net/if_clone.c b/freebsd/sys/net/if_clone.c new file mode 100644 index 00000000..aca1276f --- /dev/null +++ b/freebsd/sys/net/if_clone.c @@ -0,0 +1,617 @@ +#include + +/*- + * Copyright (c) 1980, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)if.c 8.5 (Berkeley) 1/9/95 + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/malloc.h> +#include <sys/limits.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/types.h> +#include <sys/socket.h> + +#include <net/if.h> +#include <net/if_clone.h> +#if 0 +#include <net/if_dl.h> +#endif +#include <net/if_types.h> +#include <net/if_var.h> +#include <net/radix.h> +#include <net/route.h> +#include <net/vnet.h> + +static void if_clone_free(struct if_clone *ifc); +static int if_clone_createif(struct if_clone *ifc, char *name, size_t len, + caddr_t params); + +static struct mtx if_cloners_mtx; +static VNET_DEFINE(int, if_cloners_count); +VNET_DEFINE(LIST_HEAD(, if_clone), if_cloners); + +#define V_if_cloners_count VNET(if_cloners_count) +#define V_if_cloners VNET(if_cloners) + +#define IF_CLONERS_LOCK_INIT() \ + mtx_init(&if_cloners_mtx, "if_cloners lock", NULL, MTX_DEF) +#define IF_CLONERS_LOCK_ASSERT() mtx_assert(&if_cloners_mtx, MA_OWNED) +#define IF_CLONERS_LOCK() mtx_lock(&if_cloners_mtx) +#define IF_CLONERS_UNLOCK() mtx_unlock(&if_cloners_mtx) + +#define IF_CLONE_LOCK_INIT(ifc) \ + mtx_init(&(ifc)->ifc_mtx, "if_clone lock", NULL, MTX_DEF) +#define IF_CLONE_LOCK_DESTROY(ifc) mtx_destroy(&(ifc)->ifc_mtx) +#define IF_CLONE_LOCK_ASSERT(ifc) mtx_assert(&(ifc)->ifc_mtx, MA_OWNED) +#define IF_CLONE_LOCK(ifc) mtx_lock(&(ifc)->ifc_mtx) +#define IF_CLONE_UNLOCK(ifc) mtx_unlock(&(ifc)->ifc_mtx) + +#define IF_CLONE_ADDREF(ifc) \ + do { \ + IF_CLONE_LOCK(ifc); \ + IF_CLONE_ADDREF_LOCKED(ifc); \ + IF_CLONE_UNLOCK(ifc); \ + } while (0) +#define IF_CLONE_ADDREF_LOCKED(ifc) \ + do { \ + IF_CLONE_LOCK_ASSERT(ifc); \ + KASSERT((ifc)->ifc_refcnt >= 0, \ + ("negative refcnt %ld", (ifc)->ifc_refcnt)); \ + (ifc)->ifc_refcnt++; \ + } while (0) +#define IF_CLONE_REMREF(ifc) \ + do { \ + IF_CLONE_LOCK(ifc); \ + IF_CLONE_REMREF_LOCKED(ifc); \ + } while (0) +#define IF_CLONE_REMREF_LOCKED(ifc) \ + do { \ + IF_CLONE_LOCK_ASSERT(ifc); \ + KASSERT((ifc)->ifc_refcnt > 0, \ + ("bogus refcnt %ld", (ifc)->ifc_refcnt)); \ + if (--(ifc)->ifc_refcnt == 0) { \ + IF_CLONE_UNLOCK(ifc); \ + if_clone_free(ifc); \ + } else { \ + /* silently free the lock */ \ + IF_CLONE_UNLOCK(ifc); \ + } \ + } while (0) + +#define IFC_IFLIST_INSERT(_ifc, _ifp) \ + LIST_INSERT_HEAD(&_ifc->ifc_iflist, _ifp, if_clones) +#define IFC_IFLIST_REMOVE(_ifc, _ifp) \ + LIST_REMOVE(_ifp, if_clones) + +static MALLOC_DEFINE(M_CLONE, "clone", "interface cloning framework"); + +void +vnet_if_clone_init(void) +{ + + LIST_INIT(&V_if_cloners); +} + +void +if_clone_init(void) +{ + + IF_CLONERS_LOCK_INIT(); +} + +/* + * Lookup and create a clone network interface.
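+ * + * Editorial example (added comment): "ifconfig disc0 create" arrives + * here with name = "disc0"; the loop below asks each registered + * cloner's ifc_match() to claim the name, and if_clone_createif() then + * instantiates the interface. For simple cloners, ifc_simple_match() + * accepts either a bare "<name>" or "<name><unit>".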
+ */ +int +if_clone_create(char *name, size_t len, caddr_t params) +{ + struct if_clone *ifc; + + /* Try to find an applicable cloner for this request */ + IF_CLONERS_LOCK(); + LIST_FOREACH(ifc, &V_if_cloners, ifc_list) { + if (ifc->ifc_match(ifc, name)) { + break; + } + } +#ifdef VIMAGE + if (ifc == NULL && !IS_DEFAULT_VNET(curvnet)) { + CURVNET_SET_QUIET(vnet0); + LIST_FOREACH(ifc, &V_if_cloners, ifc_list) { + if (ifc->ifc_match(ifc, name)) + break; + } + CURVNET_RESTORE(); + } +#endif + IF_CLONERS_UNLOCK(); + + if (ifc == NULL) + return (EINVAL); + + return (if_clone_createif(ifc, name, len, params)); +} + +/* + * Create a clone network interface. + */ +static int +if_clone_createif(struct if_clone *ifc, char *name, size_t len, caddr_t params) +{ + int err; + struct ifnet *ifp; + + if (ifunit(name) != NULL) + return (EEXIST); + + err = (*ifc->ifc_create)(ifc, name, len, params); + + if (!err) { + ifp = ifunit(name); + if (ifp == NULL) + panic("%s: lookup failed for %s", __func__, name); + + if_addgroup(ifp, ifc->ifc_name); + + IF_CLONE_LOCK(ifc); + IFC_IFLIST_INSERT(ifc, ifp); + IF_CLONE_UNLOCK(ifc); + } + + return (err); +} + +/* + * Lookup and destroy a clone network interface. + */ +int +if_clone_destroy(const char *name) +{ + int err; + struct if_clone *ifc; + struct ifnet *ifp; + + ifp = ifunit_ref(name); + if (ifp == NULL) + return (ENXIO); + + /* Find the cloner for this interface */ + IF_CLONERS_LOCK(); + LIST_FOREACH(ifc, &V_if_cloners, ifc_list) { + if (strcmp(ifc->ifc_name, ifp->if_dname) == 0) { + break; + } + } +#ifdef VIMAGE + if (ifc == NULL && !IS_DEFAULT_VNET(curvnet)) { + CURVNET_SET_QUIET(vnet0); + LIST_FOREACH(ifc, &V_if_cloners, ifc_list) { + if (ifc->ifc_match(ifc, name)) + break; + } + CURVNET_RESTORE(); + } +#endif + IF_CLONERS_UNLOCK(); + if (ifc == NULL) { + if_rele(ifp); + return (EINVAL); + } + + err = if_clone_destroyif(ifc, ifp); + if_rele(ifp); + return err; +} + +/* + * Destroy a clone network interface. + */ +int +if_clone_destroyif(struct if_clone *ifc, struct ifnet *ifp) +{ + int err; + struct ifnet *ifcifp; + + if (ifc->ifc_destroy == NULL) + return(EOPNOTSUPP); + + /* + * Given that the cloned ifnet might be attached to a different + * vnet from where its cloner was registered, we have to + * switch to the vnet context of the target vnet. + */ + CURVNET_SET_QUIET(ifp->if_vnet); + + IF_CLONE_LOCK(ifc); + LIST_FOREACH(ifcifp, &ifc->ifc_iflist, if_clones) { + if (ifcifp == ifp) { + IFC_IFLIST_REMOVE(ifc, ifp); + break; + } + } + IF_CLONE_UNLOCK(ifc); + if (ifcifp == NULL) { + CURVNET_RESTORE(); + return (ENXIO); /* ifp is not on the list. */ + } + + if_delgroup(ifp, ifc->ifc_name); + + err = (*ifc->ifc_destroy)(ifc, ifp); + + if (err != 0) { + if_addgroup(ifp, ifc->ifc_name); + + IF_CLONE_LOCK(ifc); + IFC_IFLIST_INSERT(ifc, ifp); + IF_CLONE_UNLOCK(ifc); + } + CURVNET_RESTORE(); + return (err); +} + +/* + * Register a network interface cloner. + */ +void +if_clone_attach(struct if_clone *ifc) +{ + int len, maxclone; + + /* + * Compute bitmap size and allocate it. 
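+ * + * Editorial example of the rounding below (added comment): with + * ifc_maxunit = 31, maxclone = 32 and len = 32 >> 3 = 4 bytes, exactly + * enough for 32 unit bits; with ifc_maxunit = 32, maxclone = 33 and + * 33 >> 3 = 4 falls short ((4 << 3) = 32 < 33), so len is bumped to 5.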
+ */ + maxclone = ifc->ifc_maxunit + 1; + len = maxclone >> 3; + if ((len << 3) < maxclone) + len++; + ifc->ifc_units = malloc(len, M_CLONE, M_WAITOK | M_ZERO); + ifc->ifc_bmlen = len; + IF_CLONE_LOCK_INIT(ifc); + IF_CLONE_ADDREF(ifc); + + IF_CLONERS_LOCK(); + LIST_INSERT_HEAD(&V_if_cloners, ifc, ifc_list); + V_if_cloners_count++; + IF_CLONERS_UNLOCK(); + + LIST_INIT(&ifc->ifc_iflist); + + if (ifc->ifc_attach != NULL) + (*ifc->ifc_attach)(ifc); + EVENTHANDLER_INVOKE(if_clone_event, ifc); +} + +/* + * Unregister a network interface cloner. + */ +void +if_clone_detach(struct if_clone *ifc) +{ + struct ifc_simple_data *ifcs = ifc->ifc_data; + + IF_CLONERS_LOCK(); + LIST_REMOVE(ifc, ifc_list); + V_if_cloners_count--; + IF_CLONERS_UNLOCK(); + + /* Allow all simples to be destroyed */ + if (ifc->ifc_attach == ifc_simple_attach) + ifcs->ifcs_minifs = 0; + + /* destroy all interfaces for this cloner */ + while (!LIST_EMPTY(&ifc->ifc_iflist)) + if_clone_destroyif(ifc, LIST_FIRST(&ifc->ifc_iflist)); + + IF_CLONE_REMREF(ifc); +} + +static void +if_clone_free(struct if_clone *ifc) +{ + for (int bytoff = 0; bytoff < ifc->ifc_bmlen; bytoff++) { + KASSERT(ifc->ifc_units[bytoff] == 0x00, + ("ifc_units[%d] is not empty", bytoff)); + } + + KASSERT(LIST_EMPTY(&ifc->ifc_iflist), + ("%s: ifc_iflist not empty", __func__)); + + IF_CLONE_LOCK_DESTROY(ifc); + free(ifc->ifc_units, M_CLONE); +} + +/* + * Provide list of interface cloners to userspace. + */ +int +if_clone_list(struct if_clonereq *ifcr) +{ + char *buf, *dst, *outbuf = NULL; + struct if_clone *ifc; + int buf_count, count, err = 0; + + if (ifcr->ifcr_count < 0) + return (EINVAL); + + IF_CLONERS_LOCK(); + /* + * Set our internal output buffer size. We could end up not + * reporting a cloner that is added between the unlock and lock + * below, but that's not a major problem. We cap our + * allocation at the number of cloners actually in the system + * because not doing so would let arbitrary users cause us to + * allocate arbitrary amounts of kernel memory. + */ + buf_count = (V_if_cloners_count < ifcr->ifcr_count) ? + V_if_cloners_count : ifcr->ifcr_count; + IF_CLONERS_UNLOCK(); + + outbuf = malloc(IFNAMSIZ*buf_count, M_CLONE, M_WAITOK | M_ZERO); + + IF_CLONERS_LOCK(); + + ifcr->ifcr_total = V_if_cloners_count; + if ((dst = ifcr->ifcr_buffer) == NULL) { + /* Just asking how many there are. */ + goto done; + } + count = (V_if_cloners_count < buf_count) ? + V_if_cloners_count : buf_count; + + for (ifc = LIST_FIRST(&V_if_cloners), buf = outbuf; + ifc != NULL && count != 0; + ifc = LIST_NEXT(ifc, ifc_list), count--, buf += IFNAMSIZ) { + strlcpy(buf, ifc->ifc_name, IFNAMSIZ); + } + +done: + IF_CLONERS_UNLOCK(); + if (err == 0) + err = copyout(outbuf, dst, buf_count*IFNAMSIZ); + if (outbuf != NULL) + free(outbuf, M_CLONE); + return (err); +} + +/* + * A utility function to extract unit numbers from interface names of + * the form name###. + * + * Returns 0 on success and an error on failure. + */ +int +ifc_name2unit(const char *name, int *unit) +{ + const char *cp; + int cutoff = INT_MAX / 10; + int cutlim = INT_MAX % 10; + + for (cp = name; *cp != '\0' && (*cp < '0' || *cp > '9'); cp++); + if (*cp == '\0') { + *unit = -1; + } else if (cp[0] == '0' && cp[1] != '\0') { + /* Disallow leading zeroes. */ + return (EINVAL); + } else { + for (*unit = 0; *cp != '\0'; cp++) { + if (*cp < '0' || *cp > '9') { + /* Bogus unit number.
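+ * + * Editorial note on the overflow guard just below (added + * comment): with a 32-bit int, cutoff = INT_MAX / 10 = + * 214748364 and cutlim = INT_MAX % 10 = 7, so a name such + * as "gif2147483648" is rejected before *unit can wrap + * past INT_MAX.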
*/ + return (EINVAL); + } + if (*unit > cutoff || + (*unit == cutoff && *cp - '0' > cutlim)) + return (EINVAL); + *unit = (*unit * 10) + (*cp - '0'); + } + } + + return (0); +} + +int +ifc_alloc_unit(struct if_clone *ifc, int *unit) +{ + int wildcard, bytoff, bitoff; + int err = 0; + + IF_CLONE_LOCK(ifc); + + bytoff = bitoff = 0; + wildcard = (*unit < 0); + /* + * Find a free unit if none was given. + */ + if (wildcard) { + while ((bytoff < ifc->ifc_bmlen) + && (ifc->ifc_units[bytoff] == 0xff)) + bytoff++; + if (bytoff >= ifc->ifc_bmlen) { + err = ENOSPC; + goto done; + } + while ((ifc->ifc_units[bytoff] & (1 << bitoff)) != 0) + bitoff++; + *unit = (bytoff << 3) + bitoff; + } + + if (*unit > ifc->ifc_maxunit) { + err = ENOSPC; + goto done; + } + + if (!wildcard) { + bytoff = *unit >> 3; + bitoff = *unit - (bytoff << 3); + } + + if((ifc->ifc_units[bytoff] & (1 << bitoff)) != 0) { + err = EEXIST; + goto done; + } + /* + * Allocate the unit in the bitmap. + */ + KASSERT((ifc->ifc_units[bytoff] & (1 << bitoff)) == 0, + ("%s: bit is already set", __func__)); + ifc->ifc_units[bytoff] |= (1 << bitoff); + IF_CLONE_ADDREF_LOCKED(ifc); + +done: + IF_CLONE_UNLOCK(ifc); + return (err); +} + +void +ifc_free_unit(struct if_clone *ifc, int unit) +{ + int bytoff, bitoff; + + + /* + * Compute offset in the bitmap and deallocate the unit. + */ + bytoff = unit >> 3; + bitoff = unit - (bytoff << 3); + + IF_CLONE_LOCK(ifc); + KASSERT((ifc->ifc_units[bytoff] & (1 << bitoff)) != 0, + ("%s: bit is already cleared", __func__)); + ifc->ifc_units[bytoff] &= ~(1 << bitoff); + IF_CLONE_REMREF_LOCKED(ifc); /* releases lock */ +} + +void +ifc_simple_attach(struct if_clone *ifc) +{ + int err; + int unit; + char name[IFNAMSIZ]; + struct ifc_simple_data *ifcs = ifc->ifc_data; + + KASSERT(ifcs->ifcs_minifs - 1 <= ifc->ifc_maxunit, + ("%s: %s requested more units than allowed (%d > %d)", + __func__, ifc->ifc_name, ifcs->ifcs_minifs, + ifc->ifc_maxunit + 1)); + + for (unit = 0; unit < ifcs->ifcs_minifs; unit++) { + snprintf(name, IFNAMSIZ, "%s%d", ifc->ifc_name, unit); + err = if_clone_createif(ifc, name, IFNAMSIZ, NULL); + KASSERT(err == 0, + ("%s: failed to create required interface %s", + __func__, name)); + } +} + +int +ifc_simple_match(struct if_clone *ifc, const char *name) +{ + const char *cp; + int i; + + /* Match the name */ + for (cp = name, i = 0; i < strlen(ifc->ifc_name); i++, cp++) { + if (ifc->ifc_name[i] != *cp) + return (0); + } + + /* Make sure there's a unit number or nothing after the name */ + for (; *cp != '\0'; cp++) { + if (*cp < '0' || *cp > '9') + return (0); + } + + return (1); +} + +int +ifc_simple_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) +{ + char *dp; + int wildcard; + int unit; + int err; + struct ifc_simple_data *ifcs = ifc->ifc_data; + + err = ifc_name2unit(name, &unit); + if (err != 0) + return (err); + + wildcard = (unit < 0); + + err = ifc_alloc_unit(ifc, &unit); + if (err != 0) + return (err); + + err = ifcs->ifcs_create(ifc, unit, params); + if (err != 0) { + ifc_free_unit(ifc, unit); + return (err); + } + + /* In the wildcard case, we need to update the name. */ + if (wildcard) { + for (dp = name; *dp != '\0'; dp++); + if (snprintf(dp, len - (dp-name), "%d", unit) > + len - (dp-name) - 1) { + /* + * This can only be a programmer error and + * there's no straightforward way to recover if + * it happens. 
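+ * + * Editorial example (added comment): creating "edsc" with a + * wildcard unit lets ifc_alloc_unit() pick, say, unit 0, and the + * snprintf() above rewrites the caller's buffer to "edsc0"; the + * panic fires only if the digits cannot fit in the + * caller-supplied len.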
+ */ + panic("if_clone_create(): interface name too long"); + } + + } + + return (0); +} + +int +ifc_simple_destroy(struct if_clone *ifc, struct ifnet *ifp) +{ + int unit; + struct ifc_simple_data *ifcs = ifc->ifc_data; + + unit = ifp->if_dunit; + + if (unit < ifcs->ifcs_minifs) + return (EINVAL); + + ifcs->ifcs_destroy(ifp); + + ifc_free_unit(ifc, unit); + + return (0); +} diff --git a/freebsd/sys/net/if_clone.h b/freebsd/sys/net/if_clone.h new file mode 100644 index 00000000..67de320b --- /dev/null +++ b/freebsd/sys/net/if_clone.h @@ -0,0 +1,116 @@ +/*- + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * From: @(#)if.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#ifndef _NET_IF_CLONE_HH_ +#define _NET_IF_CLONE_HH_ + +#ifdef _KERNEL + +#define IFC_CLONE_INITIALIZER(name, data, maxunit, \ + attach, match, create, destroy) \ + { { 0 }, name, maxunit, NULL, 0, data, attach, match, create, destroy } + +/* + * Structure describing a `cloning' interface. + * + * List of locks + * (c) const until freeing + * (d) driver specific data, may need external protection. + * (e) locked by if_cloners_mtx + * (i) locked by ifc_mtx mtx + */ +struct if_clone { + LIST_ENTRY(if_clone) ifc_list; /* (e) On list of cloners */ + const char *ifc_name; /* (c) Name of device, e.g. `gif' */ + int ifc_maxunit; /* (c) Maximum unit number */ + unsigned char *ifc_units; /* (i) Bitmap to handle units. */ + /* Considered private, access */ + /* via ifc_(alloc|free)_unit(). */ + int ifc_bmlen; /* (c) Bitmap length. */ + void *ifc_data; /* (*) Data for ifc_* functions. */ + + /* (c) Driver specific cloning functions. Called with no locks held. */ + void (*ifc_attach)(struct if_clone *); + int (*ifc_match)(struct if_clone *, const char *); + int (*ifc_create)(struct if_clone *, char *, size_t, caddr_t); + int (*ifc_destroy)(struct if_clone *, struct ifnet *); + + long ifc_refcnt; /* (i) Reference count. */ + struct mtx ifc_mtx; /* Mutex to protect members.
*/ + LIST_HEAD(, ifnet) ifc_iflist; /* (i) List of cloned interfaces */ +}; + +void if_clone_init(void); +void if_clone_attach(struct if_clone *); +void if_clone_detach(struct if_clone *); +void vnet_if_clone_init(void); + +int if_clone_create(char *, size_t, caddr_t); +int if_clone_destroy(const char *); +int if_clone_destroyif(struct if_clone *, struct ifnet *); +int if_clone_list(struct if_clonereq *); + +int ifc_name2unit(const char *name, int *unit); +int ifc_alloc_unit(struct if_clone *, int *); +void ifc_free_unit(struct if_clone *, int); + +/* + * The ifc_simple functions, structures, and macros implement basic + * cloning as in 5.[012]. + */ + +struct ifc_simple_data { + int ifcs_minifs; /* minimum number of interfaces */ + + int (*ifcs_create)(struct if_clone *, int, caddr_t); + void (*ifcs_destroy)(struct ifnet *); +}; + +/* interface clone event */ +typedef void (*if_clone_event_handler_t)(void *, struct if_clone *); +EVENTHANDLER_DECLARE(if_clone_event, if_clone_event_handler_t); + +#define IFC_SIMPLE_DECLARE(name, minifs) \ +struct ifc_simple_data name##_cloner_data = \ + {minifs, name##_clone_create, name##_clone_destroy}; \ +struct if_clone name##_cloner = \ + IFC_CLONE_INITIALIZER(#name, &name##_cloner_data, IF_MAXUNIT, \ + ifc_simple_attach, ifc_simple_match, ifc_simple_create, ifc_simple_destroy) + +void ifc_simple_attach(struct if_clone *); +int ifc_simple_match(struct if_clone *, const char *); +int ifc_simple_create(struct if_clone *, char *, size_t, caddr_t); +int ifc_simple_destroy(struct if_clone *, struct ifnet *); + +#endif /* _KERNEL */ + +#endif /* !_NET_IF_CLONE_HH_ */ diff --git a/freebsd/sys/net/if_dead.c b/freebsd/sys/net/if_dead.c new file mode 100644 index 00000000..dcceaf25 --- /dev/null +++ b/freebsd/sys/net/if_dead.c @@ -0,0 +1,116 @@ +#include + +/*- + * Copyright (c) 2009 Robert N. M. Watson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * When an interface has been detached but not yet freed, we set the various + * ifnet function pointers to "ifdead" versions. This prevents unexpected + * calls from the network stack into the device driver after if_detach() has + * returned. 
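+ * + * Editorial example (added comment): a thread that still holds a + * stale ifp and calls + * + *	error = (*ifp->if_output)(ifp, m, dst, ro); + * + * after the detach gets ENXIO back and the mbuf freed by + * ifdead_output() below, instead of jumping into freed driver code.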
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/mbuf.h> +#include <sys/socket.h> + +#include <net/if.h> +#include <net/if_var.h> + +static int +ifdead_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *sa, + struct route *ro) +{ + + m_freem(m); + return (ENXIO); +} + +static void +ifdead_input(struct ifnet *ifp, struct mbuf *m) +{ + + m_freem(m); +} + +static void +ifdead_start(struct ifnet *ifp) +{ + +} + +static int +ifdead_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) +{ + + return (ENXIO); +} + +static void +ifdead_watchdog(struct ifnet *ifp) +{ + +} + +static int +ifdead_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa, + struct sockaddr *sa) +{ + + *llsa = NULL; + return (ENXIO); +} + +static void +ifdead_qflush(struct ifnet *ifp) +{ + +} + +static int +ifdead_transmit(struct ifnet *ifp, struct mbuf *m) +{ + + m_freem(m); + return (ENXIO); +} + +void +if_dead(struct ifnet *ifp) +{ + + ifp->if_output = ifdead_output; + ifp->if_input = ifdead_input; + ifp->if_start = ifdead_start; + ifp->if_ioctl = ifdead_ioctl; + ifp->if_watchdog = ifdead_watchdog; + ifp->if_resolvemulti = ifdead_resolvemulti; + ifp->if_qflush = ifdead_qflush; + ifp->if_transmit = ifdead_transmit; +} diff --git a/freebsd/sys/net/if_disc.c b/freebsd/sys/net/if_disc.c new file mode 100644 index 00000000..09918bb4 --- /dev/null +++ b/freebsd/sys/net/if_disc.c @@ -0,0 +1,247 @@ +#include + +/*- + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * From: @(#)if_loop.c 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +/* + * Discard interface driver for protocol testing and timing. + * (Based on the loopback.)
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +#ifdef TINY_DSMTU +#define DSMTU (1024+512) +#else +#define DSMTU 65532 +#endif + +#define DISCNAME "disc" + +struct disc_softc { + struct ifnet *sc_ifp; +}; + +static int discoutput(struct ifnet *, struct mbuf *, + struct sockaddr *, struct route *); +static void discrtrequest(int, struct rtentry *, struct rt_addrinfo *); +static int discioctl(struct ifnet *, u_long, caddr_t); +static int disc_clone_create(struct if_clone *, int, caddr_t); +static void disc_clone_destroy(struct ifnet *); + +static MALLOC_DEFINE(M_DISC, DISCNAME, "Discard interface"); + +IFC_SIMPLE_DECLARE(disc, 0); + +static int +disc_clone_create(struct if_clone *ifc, int unit, caddr_t params) +{ + struct ifnet *ifp; + struct disc_softc *sc; + + sc = malloc(sizeof(struct disc_softc), M_DISC, M_WAITOK | M_ZERO); + ifp = sc->sc_ifp = if_alloc(IFT_LOOP); + if (ifp == NULL) { + free(sc, M_DISC); + return (ENOSPC); + } + + ifp->if_softc = sc; + if_initname(ifp, ifc->ifc_name, unit); + ifp->if_mtu = DSMTU; + /* + * IFF_LOOPBACK should not be removed from disc's flags because + * it controls what PF-specific routes are magically added when + * a network address is assigned to the interface. Things just + * won't work as intended w/o such routes because the output + * interface selection for a packet is totally route-driven. + * A valid alternative to IFF_LOOPBACK can be IFF_BROADCAST or + * IFF_POINTOPOINT, but it would result in different properties + * of the interface. + */ + ifp->if_flags = IFF_LOOPBACK | IFF_MULTICAST; + ifp->if_drv_flags = IFF_DRV_RUNNING; + ifp->if_ioctl = discioctl; + ifp->if_output = discoutput; + ifp->if_hdrlen = 0; + ifp->if_addrlen = 0; + ifp->if_snd.ifq_maxlen = 20; + if_attach(ifp); + bpfattach(ifp, DLT_NULL, sizeof(u_int32_t)); + + return (0); +} + +static void +disc_clone_destroy(struct ifnet *ifp) +{ + struct disc_softc *sc; + + sc = ifp->if_softc; + + bpfdetach(ifp); + if_detach(ifp); + if_free(ifp); + + free(sc, M_DISC); +} + +static int +disc_modevent(module_t mod, int type, void *data) +{ + + switch (type) { + case MOD_LOAD: + if_clone_attach(&disc_cloner); + break; + case MOD_UNLOAD: + if_clone_detach(&disc_cloner); + break; + default: + return (EOPNOTSUPP); + } + return (0); +} + +static moduledata_t disc_mod = { + "if_disc", + disc_modevent, + NULL +}; + +DECLARE_MODULE(if_disc, disc_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); + +static int +discoutput(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, + struct route *ro) +{ + u_int32_t af; + + M_ASSERTPKTHDR(m); + + /* BPF writes need to be handled specially. */ + if (dst->sa_family == AF_UNSPEC) { + bcopy(dst->sa_data, &af, sizeof(af)); + dst->sa_family = af; + } + + if (bpf_peers_present(ifp->if_bpf)) { + u_int af = dst->sa_family; + bpf_mtap2(ifp->if_bpf, &af, sizeof(af), m); + } + m->m_pkthdr.rcvif = ifp; + + ifp->if_opackets++; + ifp->if_obytes += m->m_pkthdr.len; + + m_freem(m); + return (0); +} + +/* ARGSUSED */ +static void +discrtrequest(int cmd, struct rtentry *rt, struct rt_addrinfo *info) +{ + RT_LOCK_ASSERT(rt); + rt->rt_rmx.rmx_mtu = DSMTU; +} + +/* + * Process an ioctl request. 
+ */ +static int +discioctl(struct ifnet *ifp, u_long cmd, caddr_t data) +{ + struct ifaddr *ifa; + struct ifreq *ifr = (struct ifreq *)data; + int error = 0; + + switch (cmd) { + + case SIOCSIFADDR: + ifp->if_flags |= IFF_UP; + ifa = (struct ifaddr *)data; + if (ifa != 0) + ifa->ifa_rtrequest = discrtrequest; + /* + * Everything else is done at a higher level. + */ + break; + + case SIOCADDMULTI: + case SIOCDELMULTI: + if (ifr == 0) { + error = EAFNOSUPPORT; /* XXX */ + break; + } + switch (ifr->ifr_addr.sa_family) { + +#ifdef INET + case AF_INET: + break; +#endif +#ifdef INET6 + case AF_INET6: + break; +#endif + + default: + error = EAFNOSUPPORT; + break; + } + break; + + case SIOCSIFMTU: + ifp->if_mtu = ifr->ifr_mtu; + break; + + default: + error = EINVAL; + } + return (error); +} diff --git a/freebsd/sys/net/if_dl.h b/freebsd/sys/net/if_dl.h new file mode 100644 index 00000000..ad29f1de --- /dev/null +++ b/freebsd/sys/net/if_dl.h @@ -0,0 +1,2 @@ +#include +#include diff --git a/freebsd/sys/net/if_edsc.c b/freebsd/sys/net/if_edsc.c new file mode 100644 index 00000000..89618ce5 --- /dev/null +++ b/freebsd/sys/net/if_edsc.c @@ -0,0 +1,356 @@ +#include + +/*- + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following edsclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following edsclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE EDSCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * From: @(#)if_loop.c 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +/* + * Discard interface driver for protocol testing and timing. + * Mimics an Ethernet device so that VLANs can be attached to it etc. 
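+ * + * Editorial usage sketch (hypothetical unit numbers): + * + *	ifconfig edsc0 create + *	ifconfig vlan0 create vlan 42 vlandev edsc0 + * + * Frames written out via vlan0 are then tagged and silently + * discarded by edsc0's edsc_start() below.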
+ */
+
+#include <sys/param.h>		/* types, important constants */
+#include <sys/kernel.h>		/* SYSINIT for load-time initializations */
+#include <sys/malloc.h>		/* malloc(9) */
+#include <sys/module.h>		/* module(9) */
+#include <sys/mbuf.h>		/* mbuf(9) */
+#include <sys/socket.h>		/* struct ifreq */
+#include <sys/sockio.h>		/* socket ioctl's */
+/* #include <sys/systm.h> if you need printf(9) or other all-purpose globals */
+
+#include <net/bpf.h>		/* bpf(9) */
+#include <net/ethernet.h>	/* Ethernet related constants and types */
+#include <net/if.h>		/* basic part of ifnet(9) */
+#include <net/if_clone.h>	/* network interface cloning */
+#include <net/if_types.h>	/* IFT_ETHER and friends */
+#include <net/if_var.h>		/* kernel-only part of ifnet(9) */
+
+/*
+ * Software configuration of an interface specific to this device type.
+ */
+struct edsc_softc {
+	struct ifnet	*sc_ifp;	/* ptr to generic interface configuration */
+
+	/*
+	 * A non-null driver can keep various things here, for instance,
+	 * the hardware revision, cached values of write-only registers, etc.
+	 */
+};
+
+/*
+ * Simple cloning methods.
+ * IFC_SIMPLE_DECLARE() expects precisely these names.
+ */
+static int	edsc_clone_create(struct if_clone *, int, caddr_t);
+static void	edsc_clone_destroy(struct ifnet *);
+
+/*
+ * Interface driver methods.
+ */
+static void	edsc_init(void *dummy);
+/* static void	edsc_input(struct ifnet *ifp, struct mbuf *m); would be here */
+static int	edsc_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data);
+static void	edsc_start(struct ifnet *ifp);
+
+/*
+ * We'll allocate softc instances from this.
+ */
+static MALLOC_DEFINE(M_EDSC, "edsc", "Ethernet discard interface");
+
+/*
+ * Attach to the interface cloning framework under the name of "edsc".
+ * The second argument is the number of units to be created from
+ * the outset.  It's also the minimum number of units allowed.
+ * We don't want any units created as soon as the driver is loaded.
+ */
+IFC_SIMPLE_DECLARE(edsc, 0);
+
+/*
+ * Create an interface instance.
+ */
+static int
+edsc_clone_create(struct if_clone *ifc, int unit, caddr_t params)
+{
+	struct edsc_softc *sc;
+	struct ifnet *ifp;
+	static u_char eaddr[ETHER_ADDR_LEN];	/* 0:0:0:0:0:0 */
+
+	/*
+	 * Allocate soft and ifnet structures.  Link each to the other.
+	 */
+	sc = malloc(sizeof(struct edsc_softc), M_EDSC, M_WAITOK | M_ZERO);
+	ifp = sc->sc_ifp = if_alloc(IFT_ETHER);
+	if (ifp == NULL) {
+		free(sc, M_EDSC);
+		return (ENOSPC);
+	}
+
+	ifp->if_softc = sc;
+
+	/*
+	 * Get a name for this particular interface in its ifnet structure.
+	 */
+	if_initname(ifp, ifc->ifc_name, unit);
+
+	/*
+	 * Typical Ethernet interface flags: we can do broadcast and
+	 * multicast but can't hear our own broadcasts or multicasts.
+	 */
+	ifp->if_flags = IFF_BROADCAST | IFF_MULTICAST | IFF_SIMPLEX;
+
+	/*
+	 * We can pretend we have the whole set of hardware features
+	 * because we just discard all packets we get from the upper layer.
+	 * However, the features are disabled initially.  They can be
+	 * enabled via edsc_ioctl() when needed.
+	 */
+	ifp->if_capabilities =
+	    IFCAP_VLAN_MTU | IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM |
+	    IFCAP_HWCSUM | IFCAP_TSO |
+	    IFCAP_JUMBO_MTU;
+	ifp->if_capenable = 0;
+
+	/*
+	 * Set the interface driver methods.
+	 */
+	ifp->if_init = edsc_init;
+	/* ifp->if_input = edsc_input; */
+	ifp->if_ioctl = edsc_ioctl;
+	ifp->if_start = edsc_start;
+
+	/*
+	 * Set the maximum output queue length from the global parameter.
+	 */
+	ifp->if_snd.ifq_maxlen = ifqmaxlen;
+
+	/*
+	 * Do ifnet initializations common to all Ethernet drivers
+	 * and attach to the network interface framework.
+	 * TODO: Pick a non-zero link level address.
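+ * One way to do that TODO, sketched after what if_epair elsewhere in
+ * this patch does (illustrative, not wired in): derive a locally
+ * administered unicast address from if_index,
+ *
+ *	eaddr[0] = 0x02;
+ *	eaddr[3] = (ifp->if_index >> 8) & 0xff;
+ *	eaddr[4] = ifp->if_index & 0xff;
+ *
+ * which is safe per unit because ether_ifattach() copies the address.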
+ */ + ether_ifattach(ifp, eaddr); + + /* + * Now we can mark the interface as running, i.e., ready + * for operation. + */ + ifp->if_drv_flags |= IFF_DRV_RUNNING; + + return (0); +} + +/* + * Destroy an interface instance. + */ +static void +edsc_clone_destroy(struct ifnet *ifp) +{ + struct edsc_softc *sc = ifp->if_softc; + + /* + * Detach from the network interface framework. + */ + ether_ifdetach(ifp); + + /* + * Free memory occupied by ifnet and softc. + */ + if_free(ifp); + free(sc, M_EDSC); +} + +/* + * This method is invoked from ether_ioctl() when it's time + * to bring up the hardware. + */ +static void +edsc_init(void *dummy) +{ +#if 0 /* what a hardware driver would do here... */ + struct edsc_soft *sc = (struct edsc_softc *)dummy; + struct ifnet *ifp = sc->sc_ifp; + + /* blah-blah-blah */ +#endif +} + +/* + * Network interfaces are controlled via the ioctl(2) syscall. + */ +static int +edsc_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) +{ + struct ifreq *ifr = (struct ifreq *)data; + + switch (cmd) { + case SIOCSIFCAP: +#if 1 + /* + * Just turn on any capabilities requested. + * The generic ifioctl() function has already made sure + * that they are supported, i.e., set in if_capabilities. + */ + ifp->if_capenable = ifr->ifr_reqcap; +#else + /* + * A h/w driver would need to analyze the requested + * bits and program the hardware, e.g.: + */ + mask = ifp->if_capenable ^ ifr->ifr_reqcap; + + if (mask & IFCAP_VLAN_HWTAGGING) { + ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING; + + if (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) + /* blah-blah-blah */ + else + /* etc-etc-etc */ + } +#endif + break; + + default: + /* + * Offload the rest onto the common Ethernet handler. + */ + return (ether_ioctl(ifp, cmd, data)); + } + + return (0); +} + +/* + * Process the output queue. + */ +static void +edsc_start(struct ifnet *ifp) +{ + struct mbuf *m; + + /* + * A hardware interface driver can set IFF_DRV_OACTIVE + * in ifp->if_drv_flags: + * + * ifp->if_drv_flags |= IFF_DRV_OACTIVE; + * + * to prevent if_start from being invoked again while the + * transmission is under way. The flag is to protect the + * device's transmitter, not the method itself. The output + * queue is locked and several threads can process it in + * parallel safely, so the driver can use other means to + * serialize access to the transmitter. + * + * If using IFF_DRV_OACTIVE, the driver should clear the flag + * not earlier than the current transmission is complete, e.g., + * upon an interrupt from the device, not just before returning + * from if_start. This method merely starts the transmission, + * which may proceed asynchronously. + */ + + /* + * We loop getting packets from the queue until it's empty. + * A h/w driver would loop until the device can accept more + * data into its buffer, or while there are free transmit + * descriptors, or whatever. + */ + for (;;) { + /* + * Try to dequeue one packet. Stop if the queue is empty. + * Use IF_DEQUEUE() here if ALTQ(9) support is unneeded. + */ + IFQ_DEQUEUE(&ifp->if_snd, m); + if (m == NULL) + break; + + /* + * Let bpf(9) at the packet. + */ + BPF_MTAP(ifp, m); + + /* + * Update the interface counters. + */ + ifp->if_obytes += m->m_pkthdr.len; + ifp->if_opackets++; + + /* + * Finally, just drop the packet. + * TODO: Reply to ARP requests unless IFF_NOARP is set. + */ + m_freem(m); + } + + /* + * ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; + * would be here only if the transmission were synchronous. 
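+ *
+ * In sketch form, with a hypothetical hw_transmit() that waits for
+ * completion standing in for real device access (this driver needs
+ * none of it):
+ *
+ *	ifp->if_drv_flags |= IFF_DRV_OACTIVE;
+ *	for (;;) {
+ *		IFQ_DEQUEUE(&ifp->if_snd, m);
+ *		if (m == NULL)
+ *			break;
+ *		hw_transmit(ifp->if_softc, m);
+ *		m_freem(m);
+ *	}
+ *	ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;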
+ */ +} + +/* + * This function provides handlers for module events, namely load and unload. + */ +static int +edsc_modevent(module_t mod, int type, void *data) +{ + + switch (type) { + case MOD_LOAD: + /* + * Connect to the network interface cloning framework. + */ + if_clone_attach(&edsc_cloner); + break; + + case MOD_UNLOAD: + /* + * Disconnect from the cloning framework. + * Existing interfaces will be disposed of properly. + */ + if_clone_detach(&edsc_cloner); + break; + + default: + /* + * There are other event types, but we don't handle them. + * See module(9). + */ + return (EOPNOTSUPP); + } + return (0); +} + +static moduledata_t edsc_mod = { + "if_edsc", /* name */ + edsc_modevent, /* event handler */ + NULL /* additional data */ +}; + +DECLARE_MODULE(if_edsc, edsc_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); diff --git a/freebsd/sys/net/if_ef.c b/freebsd/sys/net/if_ef.c new file mode 100644 index 00000000..8114806c --- /dev/null +++ b/freebsd/sys/net/if_ef.c @@ -0,0 +1,610 @@ +#include + +/*- + * Copyright (c) 1999, 2000 Boris Popov + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef INET +#include +#include +#include +#endif + +#ifdef IPX +#include +#include +#endif + +/* If none of the supported layers is enabled explicitly enable them all */ +#if !defined(ETHER_II) && !defined(ETHER_8023) && !defined(ETHER_8022) && \ + !defined(ETHER_SNAP) +#define ETHER_II 1 +#define ETHER_8023 1 +#define ETHER_8022 1 +#define ETHER_SNAP 1 +#endif + +/* internal frame types */ +#define ETHER_FT_EII 0 /* Ethernet_II - default */ +#define ETHER_FT_8023 1 /* 802.3 (Novell) */ +#define ETHER_FT_8022 2 /* 802.2 */ +#define ETHER_FT_SNAP 3 /* SNAP */ +#define EF_NFT 4 /* total number of frame types */ + +#ifdef EF_DEBUG +#define EFDEBUG(format, args...) printf("%s: "format, __func__ ,## args) +#else +#define EFDEBUG(format, args...) +#endif + +#define EFERROR(format, args...) 
printf("%s: "format, __func__ ,## args) + +struct efnet { + struct ifnet *ef_ifp; + struct ifnet *ef_pifp; + int ef_frametype; +}; + +struct ef_link { + SLIST_ENTRY(ef_link) el_next; + struct ifnet *el_ifp; /* raw device for this clones */ + struct efnet *el_units[EF_NFT]; /* our clones */ +}; + +static SLIST_HEAD(ef_link_head, ef_link) efdev = {NULL}; +static int efcount; + +extern int (*ef_inputp)(struct ifnet*, struct ether_header *eh, struct mbuf *m); +extern int (*ef_outputp)(struct ifnet *ifp, struct mbuf **mp, + struct sockaddr *dst, short *tp, int *hlen); + +/* +static void ef_reset (struct ifnet *); +*/ +static int ef_attach(struct efnet *sc); +static int ef_detach(struct efnet *sc); +static void ef_init(void *); +static int ef_ioctl(struct ifnet *, u_long, caddr_t); +static void ef_start(struct ifnet *); +static int ef_input(struct ifnet*, struct ether_header *, struct mbuf *); +static int ef_output(struct ifnet *ifp, struct mbuf **mp, + struct sockaddr *dst, short *tp, int *hlen); + +static int ef_load(void); +static int ef_unload(void); + +/* + * Install the interface, most of structure initialization done in ef_clone() + */ +static int +ef_attach(struct efnet *sc) +{ + struct ifnet *ifp = sc->ef_ifp; + + ifp->if_start = ef_start; + ifp->if_init = ef_init; + ifp->if_snd.ifq_maxlen = ifqmaxlen; + ifp->if_flags = (IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST); + /* + * Attach the interface + */ + ether_ifattach(ifp, IF_LLADDR(sc->ef_pifp)); + + ifp->if_resolvemulti = 0; + ifp->if_type = IFT_XETHER; + ifp->if_drv_flags |= IFF_DRV_RUNNING; + + EFDEBUG("%s: attached\n", ifp->if_xname); + return 1; +} + +/* + * This is for _testing_only_, just removes interface from interfaces list + */ +static int +ef_detach(struct efnet *sc) +{ + struct ifnet *ifp = sc->ef_ifp; + int s; + + s = splimp(); + + ether_ifdetach(ifp); + if_free(ifp); + + splx(s); + return 0; +} + +static void +ef_init(void *foo) { + return; +} + +static int +ef_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) +{ + struct efnet *sc = ifp->if_softc; + struct ifaddr *ifa = (struct ifaddr*)data; + int s, error; + + EFDEBUG("IOCTL %ld for %s\n", cmd, ifp->if_xname); + error = 0; + s = splimp(); + switch (cmd) { + case SIOCSIFFLAGS: + error = 0; + break; + case SIOCSIFADDR: + if (sc->ef_frametype == ETHER_FT_8023 && + ifa->ifa_addr->sa_family != AF_IPX) { + error = EAFNOSUPPORT; + break; + } + ifp->if_flags |= IFF_UP; + /* FALL THROUGH */ + default: + error = ether_ioctl(ifp, cmd, data); + break; + } + splx(s); + return error; +} + +/* + * Currently packet prepared in the ether_output(), but this can be a better + * place. 
+ */ +static void +ef_start(struct ifnet *ifp) +{ + struct efnet *sc = (struct efnet*)ifp->if_softc; + struct ifnet *p; + struct mbuf *m; + int error; + + ifp->if_drv_flags |= IFF_DRV_OACTIVE; + p = sc->ef_pifp; + + EFDEBUG("\n"); + for (;;) { + IF_DEQUEUE(&ifp->if_snd, m); + if (m == 0) + break; + BPF_MTAP(ifp, m); + error = p->if_transmit(p, m); + if (error) { + ifp->if_oerrors++; + continue; + } + ifp->if_opackets++; + } + ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; + return; +} + +/* + * Inline functions do not put additional overhead to procedure call or + * parameter passing but simplify the code + */ +static int __inline +ef_inputEII(struct mbuf *m, struct ether_header *eh, u_short ether_type) +{ + int isr; + + switch(ether_type) { +#ifdef IPX + case ETHERTYPE_IPX: + isr = NETISR_IPX; + break; +#endif +#ifdef INET + case ETHERTYPE_IP: + if ((m = ip_fastforward(m)) == NULL) + return (0); + isr = NETISR_IP; + break; + + case ETHERTYPE_ARP: + isr = NETISR_ARP; + break; +#endif + default: + return (EPROTONOSUPPORT); + } + netisr_dispatch(isr, m); + return (0); +} + +static int __inline +ef_inputSNAP(struct mbuf *m, struct ether_header *eh, struct llc* l, + u_short ether_type) +{ + int isr; + + switch(ether_type) { +#ifdef IPX + case ETHERTYPE_IPX: + m_adj(m, 8); + isr = NETISR_IPX; + break; +#endif + default: + return (EPROTONOSUPPORT); + } + netisr_dispatch(isr, m); + return (0); +} + +static int __inline +ef_input8022(struct mbuf *m, struct ether_header *eh, struct llc* l, + u_short ether_type) +{ + int isr; + + switch(ether_type) { +#ifdef IPX + case 0xe0: + m_adj(m, 3); + isr = NETISR_IPX; + break; +#endif + default: + return (EPROTONOSUPPORT); + } + netisr_dispatch(isr, m); + return (0); +} + +/* + * Called from ether_input() + */ +static int +ef_input(struct ifnet *ifp, struct ether_header *eh, struct mbuf *m) +{ + u_short ether_type; + int ft = -1; + struct efnet *efp; + struct ifnet *eifp; + struct llc *l; + struct ef_link *efl; + int isr; + + ether_type = ntohs(eh->ether_type); + l = NULL; + if (ether_type < ETHERMTU) { + l = mtod(m, struct llc*); + if (l->llc_dsap == 0xff && l->llc_ssap == 0xff) { + /* + * Novell's "802.3" frame + */ + ft = ETHER_FT_8023; + } else if (l->llc_dsap == 0xaa && l->llc_ssap == 0xaa) { + /* + * 802.2/SNAP + */ + ft = ETHER_FT_SNAP; + ether_type = ntohs(l->llc_un.type_snap.ether_type); + } else if (l->llc_dsap == l->llc_ssap) { + /* + * 802.3/802.2 + */ + ft = ETHER_FT_8022; + ether_type = l->llc_ssap; + } + } else + ft = ETHER_FT_EII; + + if (ft == -1) { + EFDEBUG("Unrecognised ether_type %x\n", ether_type); + return EPROTONOSUPPORT; + } + + /* + * Check if interface configured for the given frame + */ + efp = NULL; + SLIST_FOREACH(efl, &efdev, el_next) { + if (efl->el_ifp == ifp) { + efp = efl->el_units[ft]; + break; + } + } + if (efp == NULL) { + EFDEBUG("Can't find if for %d\n", ft); + return EPROTONOSUPPORT; + } + eifp = efp->ef_ifp; + if ((eifp->if_flags & IFF_UP) == 0) + return EPROTONOSUPPORT; + eifp->if_ibytes += m->m_pkthdr.len + sizeof (*eh); + m->m_pkthdr.rcvif = eifp; + + BPF_MTAP2(eifp, eh, ETHER_HDR_LEN, m); + /* + * Now we ready to adjust mbufs and pass them to protocol intr's + */ + switch(ft) { + case ETHER_FT_EII: + return (ef_inputEII(m, eh, ether_type)); +#ifdef IPX + case ETHER_FT_8023: /* only IPX can be here */ + isr = NETISR_IPX; + break; +#endif + case ETHER_FT_SNAP: + return (ef_inputSNAP(m, eh, l, ether_type)); + case ETHER_FT_8022: + return (ef_input8022(m, eh, l, ether_type)); + default: + EFDEBUG("No support for frame %d and 
proto %04x\n", + ft, ether_type); + return (EPROTONOSUPPORT); + } + netisr_dispatch(isr, m); + return (0); +} + +static int +ef_output(struct ifnet *ifp, struct mbuf **mp, struct sockaddr *dst, short *tp, + int *hlen) +{ + struct efnet *sc = (struct efnet*)ifp->if_softc; + struct mbuf *m = *mp; + u_char *cp; + short type; + + if (ifp->if_type != IFT_XETHER) + return ENETDOWN; + switch (sc->ef_frametype) { + case ETHER_FT_EII: +#ifdef IPX + type = htons(ETHERTYPE_IPX); +#else + return EPFNOSUPPORT; +#endif + break; + case ETHER_FT_8023: + type = htons(m->m_pkthdr.len); + break; + case ETHER_FT_8022: + M_PREPEND(m, ETHER_HDR_LEN + 3, M_WAIT); + /* + * Ensure that ethernet header and next three bytes + * will fit into single mbuf + */ + m = m_pullup(m, ETHER_HDR_LEN + 3); + if (m == NULL) { + *mp = NULL; + return ENOBUFS; + } + m_adj(m, ETHER_HDR_LEN); + type = htons(m->m_pkthdr.len); + cp = mtod(m, u_char *); + *cp++ = 0xE0; + *cp++ = 0xE0; + *cp++ = 0x03; + *hlen += 3; + break; + case ETHER_FT_SNAP: + M_PREPEND(m, 8, M_WAIT); + type = htons(m->m_pkthdr.len); + cp = mtod(m, u_char *); + bcopy("\xAA\xAA\x03\x00\x00\x00\x81\x37", cp, 8); + *hlen += 8; + break; + default: + return EPFNOSUPPORT; + } + *mp = m; + *tp = type; + return 0; +} + +/* + * Create clone from the given interface + */ +static int +ef_clone(struct ef_link *efl, int ft) +{ + struct efnet *efp; + struct ifnet *eifp; + struct ifnet *ifp = efl->el_ifp; + + efp = (struct efnet*)malloc(sizeof(struct efnet), M_IFADDR, + M_WAITOK | M_ZERO); + if (efp == NULL) + return ENOMEM; + efp->ef_pifp = ifp; + efp->ef_frametype = ft; + eifp = efp->ef_ifp = if_alloc(IFT_ETHER); + if (eifp == NULL) { + free(efp, M_IFADDR); + return (ENOSPC); + } + snprintf(eifp->if_xname, IFNAMSIZ, + "%sf%d", ifp->if_xname, efp->ef_frametype); + eifp->if_dname = "ef"; + eifp->if_dunit = IF_DUNIT_NONE; + eifp->if_softc = efp; + if (ifp->if_ioctl) + eifp->if_ioctl = ef_ioctl; + efl->el_units[ft] = efp; + return 0; +} + +static int +ef_load(void) +{ + VNET_ITERATOR_DECL(vnet_iter); + struct ifnet *ifp; + struct efnet *efp; + struct ef_link *efl = NULL, *efl_temp; + int error = 0, d; + + VNET_LIST_RLOCK(); + VNET_FOREACH(vnet_iter) { + CURVNET_SET(vnet_iter); + + /* + * XXXRW: The following loop walks the ifnet list while + * modifying it, something not well-supported by ifnet + * locking. To avoid lock upgrade/recursion issues, manually + * acquire a write lock of ifnet_sxlock here, rather than a + * read lock, so that when if_alloc() recurses the lock, we + * don't panic. This structure, in which if_ef automatically + * attaches to all ethernet interfaces, should be replaced + * with a model like that found in if_vlan, in which + * interfaces are explicitly configured, which would avoid + * this (and other) problems. 
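+ *
+ * (For contrast, a pure reader that allocated nothing could take the
+ * plain read lock -- sketch only, not what this function does:
+ *
+ *	IFNET_RLOCK();
+ *	TAILQ_FOREACH(ifp, &V_ifnet, if_link)
+ *		if (ifp->if_type == IFT_ETHER)
+ *			EFDEBUG("candidate %s\n", ifp->if_xname);
+ *	IFNET_RUNLOCK();
+ * )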
+ */ + sx_xlock(&ifnet_sxlock); + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { + if (ifp->if_type != IFT_ETHER) continue; + EFDEBUG("Found interface %s\n", ifp->if_xname); + efl = (struct ef_link*)malloc(sizeof(struct ef_link), + M_IFADDR, M_WAITOK | M_ZERO); + if (efl == NULL) { + error = ENOMEM; + break; + } + + efl->el_ifp = ifp; +#ifdef ETHER_II + error = ef_clone(efl, ETHER_FT_EII); + if (error) break; +#endif +#ifdef ETHER_8023 + error = ef_clone(efl, ETHER_FT_8023); + if (error) break; +#endif +#ifdef ETHER_8022 + error = ef_clone(efl, ETHER_FT_8022); + if (error) break; +#endif +#ifdef ETHER_SNAP + error = ef_clone(efl, ETHER_FT_SNAP); + if (error) break; +#endif + efcount++; + SLIST_INSERT_HEAD(&efdev, efl, el_next); + } + sx_xunlock(&ifnet_sxlock); + CURVNET_RESTORE(); + } + VNET_LIST_RUNLOCK(); + if (error) { + if (efl) + SLIST_INSERT_HEAD(&efdev, efl, el_next); + SLIST_FOREACH_SAFE(efl, &efdev, el_next, efl_temp) { + for (d = 0; d < EF_NFT; d++) + if (efl->el_units[d]) { + if (efl->el_units[d]->ef_pifp != NULL) + if_free(efl->el_units[d]->ef_pifp); + free(efl->el_units[d], M_IFADDR); + } + free(efl, M_IFADDR); + } + return error; + } + SLIST_FOREACH(efl, &efdev, el_next) { + for (d = 0; d < EF_NFT; d++) { + efp = efl->el_units[d]; + if (efp) + ef_attach(efp); + } + } + ef_inputp = ef_input; + ef_outputp = ef_output; + EFDEBUG("Loaded\n"); + return 0; +} + +static int +ef_unload(void) +{ + struct efnet *efp; + struct ef_link *efl; + int d; + + ef_inputp = NULL; + ef_outputp = NULL; + SLIST_FOREACH(efl, &efdev, el_next) { + for (d = 0; d < EF_NFT; d++) { + efp = efl->el_units[d]; + if (efp) { + ef_detach(efp); + } + } + } + EFDEBUG("Unloaded\n"); + return 0; +} + +static int +if_ef_modevent(module_t mod, int type, void *data) +{ + switch ((modeventtype_t)type) { + case MOD_LOAD: + return ef_load(); + case MOD_UNLOAD: + return ef_unload(); + default: + return EOPNOTSUPP; + } + return 0; +} + +static moduledata_t if_ef_mod = { + "if_ef", if_ef_modevent, NULL +}; + +DECLARE_MODULE(if_ef, if_ef_mod, SI_SUB_PSEUDO, SI_ORDER_MIDDLE); diff --git a/freebsd/sys/net/if_enc.c b/freebsd/sys/net/if_enc.c new file mode 100644 index 00000000..6bbb6ceb --- /dev/null +++ b/freebsd/sys/net/if_enc.c @@ -0,0 +1,375 @@ +#include + +/*- + * Copyright (c) 2006 The FreeBSD Project. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#ifdef INET6 +#include +#include +#endif + +#include +#include +#include + +#define ENCMTU (1024+512) + +/* XXX this define must have the same value as in OpenBSD */ +#define M_CONF 0x0400 /* payload was encrypted (ESP-transport) */ +#define M_AUTH 0x0800 /* payload was authenticated (AH or ESP auth) */ +#define M_AUTH_AH 0x2000 /* header was authenticated (AH) */ + +struct enchdr { + u_int32_t af; + u_int32_t spi; + u_int32_t flags; +}; + +struct ifnet *encif; +static struct mtx enc_mtx; + +struct enc_softc { + struct ifnet *sc_ifp; +}; + +static int enc_ioctl(struct ifnet *, u_long, caddr_t); +static int enc_output(struct ifnet *ifp, struct mbuf *m, + struct sockaddr *dst, struct route *ro); +static int enc_clone_create(struct if_clone *, int, caddr_t); +static void enc_clone_destroy(struct ifnet *); + +IFC_SIMPLE_DECLARE(enc, 1); + +/* + * Sysctls. + */ + +/* + * Before and after are relative to when we are stripping the + * outer IP header. + */ +SYSCTL_NODE(_net, OID_AUTO, enc, CTLFLAG_RW, 0, "enc sysctl"); + +SYSCTL_NODE(_net_enc, OID_AUTO, in, CTLFLAG_RW, 0, "enc input sysctl"); +static int ipsec_filter_mask_in = ENC_BEFORE; +SYSCTL_XINT(_net_enc_in, OID_AUTO, ipsec_filter_mask, CTLFLAG_RW, + &ipsec_filter_mask_in, 0, "IPsec input firewall filter mask"); +static int ipsec_bpf_mask_in = ENC_BEFORE; +SYSCTL_XINT(_net_enc_in, OID_AUTO, ipsec_bpf_mask, CTLFLAG_RW, + &ipsec_bpf_mask_in, 0, "IPsec input bpf mask"); + +SYSCTL_NODE(_net_enc, OID_AUTO, out, CTLFLAG_RW, 0, "enc output sysctl"); +static int ipsec_filter_mask_out = ENC_BEFORE; +SYSCTL_XINT(_net_enc_out, OID_AUTO, ipsec_filter_mask, CTLFLAG_RW, + &ipsec_filter_mask_out, 0, "IPsec output firewall filter mask"); +static int ipsec_bpf_mask_out = ENC_BEFORE|ENC_AFTER; +SYSCTL_XINT(_net_enc_out, OID_AUTO, ipsec_bpf_mask, CTLFLAG_RW, + &ipsec_bpf_mask_out, 0, "IPsec output bpf mask"); + +static void +enc_clone_destroy(struct ifnet *ifp) +{ + KASSERT(ifp != encif, ("%s: destroying encif", __func__)); + + bpfdetach(ifp); + if_detach(ifp); + if_free(ifp); +} + +static int +enc_clone_create(struct if_clone *ifc, int unit, caddr_t params) +{ + struct ifnet *ifp; + struct enc_softc *sc; + + sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO); + ifp = sc->sc_ifp = if_alloc(IFT_ENC); + if (ifp == NULL) { + free(sc, M_DEVBUF); + return (ENOSPC); + } + + if_initname(ifp, ifc->ifc_name, unit); + ifp->if_mtu = ENCMTU; + ifp->if_ioctl = enc_ioctl; + ifp->if_output = enc_output; + ifp->if_snd.ifq_maxlen = ifqmaxlen; + ifp->if_softc = sc; + if_attach(ifp); + bpfattach(ifp, DLT_ENC, sizeof(struct enchdr)); + + mtx_lock(&enc_mtx); + /* grab a pointer to enc0, ignore the rest */ + if (encif == NULL) + encif = ifp; + mtx_unlock(&enc_mtx); + + return (0); 
+} + +static int +enc_modevent(module_t mod, int type, void *data) +{ + switch (type) { + case MOD_LOAD: + mtx_init(&enc_mtx, "enc mtx", NULL, MTX_DEF); + if_clone_attach(&enc_cloner); + break; + case MOD_UNLOAD: + printf("enc module unload - not possible for this module\n"); + return (EINVAL); + default: + return (EOPNOTSUPP); + } + return (0); +} + +static moduledata_t enc_mod = { + "enc", + enc_modevent, + 0 +}; + +DECLARE_MODULE(enc, enc_mod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); + +static int +enc_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, + struct route *ro) +{ + m_freem(m); + return (0); +} + +/* + * Process an ioctl request. + */ +/* ARGSUSED */ +static int +enc_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) +{ + int error = 0; + + mtx_lock(&enc_mtx); + + switch (cmd) { + + case SIOCSIFFLAGS: + if (ifp->if_flags & IFF_UP) + ifp->if_drv_flags |= IFF_DRV_RUNNING; + else + ifp->if_drv_flags &= ~IFF_DRV_RUNNING; + + break; + + default: + error = EINVAL; + } + + mtx_unlock(&enc_mtx); + return (error); +} + +int +ipsec_filter(struct mbuf **mp, int dir, int flags) +{ + int error, i; + struct ip *ip; + + KASSERT(encif != NULL, ("%s: encif is null", __func__)); + KASSERT(flags & (ENC_IN|ENC_OUT), + ("%s: invalid flags: %04x", __func__, flags)); + + if ((encif->if_drv_flags & IFF_DRV_RUNNING) == 0) + return (0); + + if (flags & ENC_IN) { + if ((flags & ipsec_filter_mask_in) == 0) + return (0); + } else { + if ((flags & ipsec_filter_mask_out) == 0) + return (0); + } + + /* Skip pfil(9) if no filters are loaded */ + if (!(PFIL_HOOKED(&V_inet_pfil_hook) +#ifdef INET6 + || PFIL_HOOKED(&V_inet6_pfil_hook) +#endif + )) { + return (0); + } + + i = min((*mp)->m_pkthdr.len, max_protohdr); + if ((*mp)->m_len < i) { + *mp = m_pullup(*mp, i); + if (*mp == NULL) { + printf("%s: m_pullup failed\n", __func__); + return (-1); + } + } + + error = 0; + ip = mtod(*mp, struct ip *); + switch (ip->ip_v) { + case 4: + /* + * before calling the firewall, swap fields the same as + * IP does. here we assume the header is contiguous + */ + ip->ip_len = ntohs(ip->ip_len); + ip->ip_off = ntohs(ip->ip_off); + + error = pfil_run_hooks(&V_inet_pfil_hook, mp, + encif, dir, NULL); + + if (*mp == NULL || error != 0) + break; + + /* restore byte ordering */ + ip = mtod(*mp, struct ip *); + ip->ip_len = htons(ip->ip_len); + ip->ip_off = htons(ip->ip_off); + break; + +#ifdef INET6 + case 6: + error = pfil_run_hooks(&V_inet6_pfil_hook, mp, + encif, dir, NULL); + break; +#endif + default: + printf("%s: unknown IP version\n", __func__); + } + + /* + * If the mbuf was consumed by the filter for requeueing (dummynet, etc) + * then error will be zero but we still want to return an error to our + * caller so the null mbuf isn't forwarded further. 
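+ *
+ * Caller-side sketch of that contract (illustrative; the real call
+ * sites are in the IPsec input and output paths):
+ *
+ *	if (ipsec_filter(&m, PFIL_IN, ENC_IN|ENC_BEFORE) != 0)
+ *		return;		m is gone, do not touch it again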
+ */ + if (*mp == NULL && error == 0) + return (-1); /* Consumed by the filter */ + if (*mp == NULL) + return (error); + if (error != 0) + goto bad; + + return (error); + +bad: + m_freem(*mp); + *mp = NULL; + return (error); +} + +void +ipsec_bpf(struct mbuf *m, struct secasvar *sav, int af, int flags) +{ + int mflags; + struct enchdr hdr; + + KASSERT(encif != NULL, ("%s: encif is null", __func__)); + KASSERT(flags & (ENC_IN|ENC_OUT), + ("%s: invalid flags: %04x", __func__, flags)); + + if ((encif->if_drv_flags & IFF_DRV_RUNNING) == 0) + return; + + if (flags & ENC_IN) { + if ((flags & ipsec_bpf_mask_in) == 0) + return; + } else { + if ((flags & ipsec_bpf_mask_out) == 0) + return; + } + + if (bpf_peers_present(encif->if_bpf)) { + mflags = 0; + hdr.spi = 0; + if (!sav) { + struct m_tag *mtag; + mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL); + if (mtag != NULL) { + struct tdb_ident *tdbi; + tdbi = (struct tdb_ident *) (mtag + 1); + if (tdbi->alg_enc != SADB_EALG_NONE) + mflags |= M_CONF; + if (tdbi->alg_auth != SADB_AALG_NONE) + mflags |= M_AUTH; + hdr.spi = tdbi->spi; + } + } else { + if (sav->alg_enc != SADB_EALG_NONE) + mflags |= M_CONF; + if (sav->alg_auth != SADB_AALG_NONE) + mflags |= M_AUTH; + hdr.spi = sav->spi; + } + + /* + * We need to prepend the address family as a four byte + * field. Cons up a dummy header to pacify bpf. This + * is safe because bpf will only read from the mbuf + * (i.e., it won't try to free it or keep a pointer a + * to it). + */ + hdr.af = af; + /* hdr.spi already set above */ + hdr.flags = mflags; + + bpf_mtap2(encif->if_bpf, &hdr, sizeof(hdr), m); + } +} diff --git a/freebsd/sys/net/if_enc.h b/freebsd/sys/net/if_enc.h new file mode 100644 index 00000000..59a55fcf --- /dev/null +++ b/freebsd/sys/net/if_enc.h @@ -0,0 +1,35 @@ +/*- + * Copyright (c) 2008 The FreeBSD Project. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _NET_IF_ENC_H +#define _NET_IF_ENC_H + +extern struct ifnet *encif; + +#endif /* _NET_IF_ENC_H */ diff --git a/freebsd/sys/net/if_epair.c b/freebsd/sys/net/if_epair.c new file mode 100644 index 00000000..65baeab8 --- /dev/null +++ b/freebsd/sys/net/if_epair.c @@ -0,0 +1,955 @@ +#include + +/*- + * Copyright (c) 2008 The FreeBSD Foundation + * Copyright (c) 2009-2010 Bjoern A. Zeeb + * All rights reserved. + * + * This software was developed by CK Software GmbH under sponsorship + * from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * A pair of virtual back-to-back connected ethernet like interfaces + * (``two interfaces with a virtual cross-over cable''). + * + * This is mostly intended to be used to provide connectivity between + * different virtual network stack instances. + */ +/* + * Things to re-think once we have more experience: + * - ifp->if_reassign function once we can test with vimage. Depending on + * how if_vmove() is going to be improved. + * - Real random etheraddrs that are checked to be uniquish; we would need + * to re-do them in case we move the interface between network stacks + * in a private if_reassign function. + * In case we bridge to a real interface/network or between indepedent + * epairs on multiple stacks/machines, we may need this. + * For now let the user handle that case. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#define EPAIRNAME "epair" + +SYSCTL_DECL(_net_link); +SYSCTL_NODE(_net_link, OID_AUTO, epair, CTLFLAG_RW, 0, "epair sysctl"); + +#ifdef EPAIR_DEBUG +static int epair_debug = 0; +SYSCTL_INT(_net_link_epair, OID_AUTO, epair_debug, CTLFLAG_RW, + &epair_debug, 0, "if_epair(4) debugging."); +#define DPRINTF(fmt, arg...) \ + if (epair_debug) \ + printf("[%s:%d] " fmt, __func__, __LINE__, ##arg) +#else +#define DPRINTF(fmt, arg...) 
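+/*
+ * Note: with EPAIR_DEBUG unset, DPRINTF() expands to nothing, so the
+ * calls sprinkled through the hot paths below cost nothing; when it is
+ * set, every call is tagged with __func__ and __LINE__ by the variant
+ * above.
+ */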
+#endif + +static void epair_nh_sintr(struct mbuf *); +static struct mbuf *epair_nh_m2cpuid(struct mbuf *, uintptr_t, u_int *); +static void epair_nh_drainedcpu(u_int); + +static void epair_start_locked(struct ifnet *); + +static int epair_clone_match(struct if_clone *, const char *); +static int epair_clone_create(struct if_clone *, char *, size_t, caddr_t); +static int epair_clone_destroy(struct if_clone *, struct ifnet *); + +/* Netisr realted definitions and sysctl. */ +static struct netisr_handler epair_nh = { + .nh_name = EPAIRNAME, + .nh_proto = NETISR_EPAIR, + .nh_policy = NETISR_POLICY_CPU, + .nh_handler = epair_nh_sintr, + .nh_m2cpuid = epair_nh_m2cpuid, + .nh_drainedcpu = epair_nh_drainedcpu, +}; + +static int +sysctl_epair_netisr_maxqlen(SYSCTL_HANDLER_ARGS) +{ + int error, qlimit; + + netisr_getqlimit(&epair_nh, &qlimit); + error = sysctl_handle_int(oidp, &qlimit, 0, req); + if (error || !req->newptr) + return (error); + if (qlimit < 1) + return (EINVAL); + return (netisr_setqlimit(&epair_nh, qlimit)); +} +SYSCTL_PROC(_net_link_epair, OID_AUTO, netisr_maxqlen, CTLTYPE_INT|CTLFLAG_RW, + 0, 0, sysctl_epair_netisr_maxqlen, "I", + "Maximum if_epair(4) netisr \"hw\" queue length"); + +struct epair_softc { + struct ifnet *ifp; /* This ifp. */ + struct ifnet *oifp; /* other ifp of pair. */ + u_int refcount; /* # of mbufs in flight. */ + u_int cpuid; /* CPU ID assigned upon creation. */ + void (*if_qflush)(struct ifnet *); + /* Original if_qflush routine. */ +}; + +/* + * Per-CPU list of ifps with data in the ifq that needs to be flushed + * to the netisr ``hw'' queue before we allow any further direct queuing + * to the ``hw'' queue. + */ +struct epair_ifp_drain { + STAILQ_ENTRY(epair_ifp_drain) ifp_next; + struct ifnet *ifp; +}; +STAILQ_HEAD(eid_list, epair_ifp_drain); + +#define EPAIR_LOCK_INIT(dpcpu) mtx_init(&(dpcpu)->if_epair_mtx, \ + "if_epair", NULL, MTX_DEF) +#define EPAIR_LOCK_DESTROY(dpcpu) mtx_destroy(&(dpcpu)->if_epair_mtx) +#define EPAIR_LOCK_ASSERT(dpcpu) mtx_assert(&(dpcpu)->if_epair_mtx, \ + MA_OWNED) +#define EPAIR_LOCK(dpcpu) mtx_lock(&(dpcpu)->if_epair_mtx) +#define EPAIR_UNLOCK(dpcpu) mtx_unlock(&(dpcpu)->if_epair_mtx) + +#ifdef INVARIANTS +#define EPAIR_REFCOUNT_INIT(r, v) refcount_init((r), (v)) +#define EPAIR_REFCOUNT_AQUIRE(r) refcount_acquire((r)) +#define EPAIR_REFCOUNT_RELEASE(r) refcount_release((r)) +#define EPAIR_REFCOUNT_ASSERT(a, p) KASSERT(a, p) +#else +#define EPAIR_REFCOUNT_INIT(r, v) +#define EPAIR_REFCOUNT_AQUIRE(r) +#define EPAIR_REFCOUNT_RELEASE(r) +#define EPAIR_REFCOUNT_ASSERT(a, p) +#endif + +static MALLOC_DEFINE(M_EPAIR, EPAIRNAME, + "Pair of virtual cross-over connected Ethernet-like interfaces"); + +static struct if_clone epair_cloner = IFC_CLONE_INITIALIZER( + EPAIRNAME, NULL, IF_MAXUNIT, + NULL, epair_clone_match, epair_clone_create, epair_clone_destroy); + +/* + * DPCPU area and functions. + */ +struct epair_dpcpu { + struct mtx if_epair_mtx; /* Per-CPU locking. */ + int epair_drv_flags; /* Per-CPU ``hw'' drv flags. */ + struct eid_list epair_ifp_drain_list; /* Per-CPU list of ifps with + * data in the ifq. */ +}; +DPCPU_DEFINE(struct epair_dpcpu, epair_dpcpu); + +static void +epair_dpcpu_init(void) +{ + struct epair_dpcpu *epair_dpcpu; + struct eid_list *s; + u_int cpuid; + + for (cpuid = 0; cpuid <= mp_maxid; cpuid++) { + if (CPU_ABSENT(cpuid)) + continue; + + epair_dpcpu = DPCPU_ID_PTR(cpuid, epair_dpcpu); + + /* Initialize per-cpu lock. */ + EPAIR_LOCK_INIT(epair_dpcpu); + + /* Driver flags are per-cpu as are our netisr "hw" queues. 
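+ *
+ * The access pattern used throughout this file is, in sketch form:
+ *
+ *	struct epair_dpcpu *d = DPCPU_ID_PTR(cpuid, epair_dpcpu);
+ *
+ *	EPAIR_LOCK(d);
+ *	... read or update d->epair_drv_flags and the drain list ...
+ *	EPAIR_UNLOCK(d);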
*/ + epair_dpcpu->epair_drv_flags = 0; + + /* + * Initialize per-cpu drain list. + * Manually do what STAILQ_HEAD_INITIALIZER would do. + */ + s = &epair_dpcpu->epair_ifp_drain_list; + s->stqh_first = NULL; + s->stqh_last = &s->stqh_first; + } +} + +static void +epair_dpcpu_detach(void) +{ + struct epair_dpcpu *epair_dpcpu; + u_int cpuid; + + for (cpuid = 0; cpuid <= mp_maxid; cpuid++) { + if (CPU_ABSENT(cpuid)) + continue; + + epair_dpcpu = DPCPU_ID_PTR(cpuid, epair_dpcpu); + + /* Destroy per-cpu lock. */ + EPAIR_LOCK_DESTROY(epair_dpcpu); + } +} + +/* + * Helper functions. + */ +static u_int +cpuid_from_ifp(struct ifnet *ifp) +{ + struct epair_softc *sc; + + if (ifp == NULL) + return (0); + sc = ifp->if_softc; + + return (sc->cpuid); +} + +/* + * Netisr handler functions. + */ +static void +epair_nh_sintr(struct mbuf *m) +{ + struct ifnet *ifp; + struct epair_softc *sc; + + ifp = m->m_pkthdr.rcvif; + (*ifp->if_input)(ifp, m); + sc = ifp->if_softc; + EPAIR_REFCOUNT_RELEASE(&sc->refcount); + EPAIR_REFCOUNT_ASSERT((int)sc->refcount >= 1, + ("%s: ifp=%p sc->refcount not >= 1: %d", + __func__, ifp, sc->refcount)); + DPRINTF("ifp=%p refcount=%u\n", ifp, sc->refcount); +} + +static struct mbuf * +epair_nh_m2cpuid(struct mbuf *m, uintptr_t source, u_int *cpuid) +{ + + *cpuid = cpuid_from_ifp(m->m_pkthdr.rcvif); + + return (m); +} + +static void +epair_nh_drainedcpu(u_int cpuid) +{ + struct epair_dpcpu *epair_dpcpu; + struct epair_ifp_drain *elm, *tvar; + struct ifnet *ifp; + + epair_dpcpu = DPCPU_ID_PTR(cpuid, epair_dpcpu); + EPAIR_LOCK(epair_dpcpu); + /* + * Assume our "hw" queue and possibly ifq will be emptied + * again. In case we will overflow the "hw" queue while + * draining, epair_start_locked will set IFF_DRV_OACTIVE + * again and we will stop and return. + */ + STAILQ_FOREACH_SAFE(elm, &epair_dpcpu->epair_ifp_drain_list, + ifp_next, tvar) { + ifp = elm->ifp; + epair_dpcpu->epair_drv_flags &= ~IFF_DRV_OACTIVE; + ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; + epair_start_locked(ifp); + + IFQ_LOCK(&ifp->if_snd); + if (IFQ_IS_EMPTY(&ifp->if_snd)) { + struct epair_softc *sc; + + STAILQ_REMOVE(&epair_dpcpu->epair_ifp_drain_list, + elm, epair_ifp_drain, ifp_next); + /* The cached ifp goes off the list. */ + sc = ifp->if_softc; + EPAIR_REFCOUNT_RELEASE(&sc->refcount); + EPAIR_REFCOUNT_ASSERT((int)sc->refcount >= 1, + ("%s: ifp=%p sc->refcount not >= 1: %d", + __func__, ifp, sc->refcount)); + free(elm, M_EPAIR); + } + IFQ_UNLOCK(&ifp->if_snd); + + if ((ifp->if_drv_flags & IFF_DRV_OACTIVE) != 0) { + /* Our "hw"q overflew again. */ + epair_dpcpu->epair_drv_flags |= IFF_DRV_OACTIVE; + DPRINTF("hw queue length overflow at %u\n", + epair_nh.nh_qlimit); + break; + } + } + EPAIR_UNLOCK(epair_dpcpu); +} + +/* + * Network interface (`if') related functions. + */ +static void +epair_remove_ifp_from_draining(struct ifnet *ifp) +{ + struct epair_dpcpu *epair_dpcpu; + struct epair_ifp_drain *elm, *tvar; + u_int cpuid; + + for (cpuid = 0; cpuid <= mp_maxid; cpuid++) { + if (CPU_ABSENT(cpuid)) + continue; + + epair_dpcpu = DPCPU_ID_PTR(cpuid, epair_dpcpu); + EPAIR_LOCK(epair_dpcpu); + STAILQ_FOREACH_SAFE(elm, &epair_dpcpu->epair_ifp_drain_list, + ifp_next, tvar) { + if (ifp == elm->ifp) { + struct epair_softc *sc; + + STAILQ_REMOVE( + &epair_dpcpu->epair_ifp_drain_list, elm, + epair_ifp_drain, ifp_next); + /* The cached ifp goes off the list. 
*/ + sc = ifp->if_softc; + EPAIR_REFCOUNT_RELEASE(&sc->refcount); + EPAIR_REFCOUNT_ASSERT((int)sc->refcount >= 1, + ("%s: ifp=%p sc->refcount not >= 1: %d", + __func__, ifp, sc->refcount)); + free(elm, M_EPAIR); + } + } + EPAIR_UNLOCK(epair_dpcpu); + } +} + +static int +epair_add_ifp_for_draining(struct ifnet *ifp) +{ + struct epair_dpcpu *epair_dpcpu; + struct epair_softc *sc; + struct epair_ifp_drain *elm = NULL; + + sc = ifp->if_softc; + epair_dpcpu = DPCPU_ID_PTR(sc->cpuid, epair_dpcpu); + EPAIR_LOCK_ASSERT(epair_dpcpu); + STAILQ_FOREACH(elm, &epair_dpcpu->epair_ifp_drain_list, ifp_next) + if (elm->ifp == ifp) + break; + /* If the ifp is there already, return success. */ + if (elm != NULL) + return (0); + + elm = malloc(sizeof(struct epair_ifp_drain), M_EPAIR, M_NOWAIT|M_ZERO); + if (elm == NULL) + return (ENOMEM); + + elm->ifp = ifp; + /* Add a reference for the ifp pointer on the list. */ + EPAIR_REFCOUNT_AQUIRE(&sc->refcount); + STAILQ_INSERT_TAIL(&epair_dpcpu->epair_ifp_drain_list, elm, ifp_next); + + return (0); +} + +static void +epair_start_locked(struct ifnet *ifp) +{ + struct epair_dpcpu *epair_dpcpu; + struct mbuf *m; + struct epair_softc *sc; + struct ifnet *oifp; + int error; + + DPRINTF("ifp=%p\n", ifp); + sc = ifp->if_softc; + epair_dpcpu = DPCPU_ID_PTR(sc->cpuid, epair_dpcpu); + EPAIR_LOCK_ASSERT(epair_dpcpu); + + if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) + return; + if ((ifp->if_flags & IFF_UP) == 0) + return; + + /* + * We get patckets here from ether_output via if_handoff() + * and ned to put them into the input queue of the oifp + * and call oifp->if_input() via netisr/epair_sintr(). + */ + oifp = sc->oifp; + sc = oifp->if_softc; + for (;;) { + IFQ_DEQUEUE(&ifp->if_snd, m); + if (m == NULL) + break; + BPF_MTAP(ifp, m); + + /* + * In case the outgoing interface is not usable, + * drop the packet. + */ + if ((oifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || + (oifp->if_flags & IFF_UP) ==0) { + ifp->if_oerrors++; + m_freem(m); + continue; + } + DPRINTF("packet %s -> %s\n", ifp->if_xname, oifp->if_xname); + + /* + * Add a reference so the interface cannot go while the + * packet is in transit as we rely on rcvif to stay valid. + */ + EPAIR_REFCOUNT_AQUIRE(&sc->refcount); + m->m_pkthdr.rcvif = oifp; + CURVNET_SET_QUIET(oifp->if_vnet); + error = netisr_queue(NETISR_EPAIR, m); + CURVNET_RESTORE(); + if (!error) { + ifp->if_opackets++; + /* Someone else received the packet. */ + oifp->if_ipackets++; + } else { + /* The packet was freed already. */ + epair_dpcpu->epair_drv_flags |= IFF_DRV_OACTIVE; + ifp->if_drv_flags |= IFF_DRV_OACTIVE; + (void) epair_add_ifp_for_draining(ifp); + ifp->if_oerrors++; + EPAIR_REFCOUNT_RELEASE(&sc->refcount); + EPAIR_REFCOUNT_ASSERT((int)sc->refcount >= 1, + ("%s: ifp=%p sc->refcount not >= 1: %d", + __func__, oifp, sc->refcount)); + } + } +} + +static void +epair_start(struct ifnet *ifp) +{ + struct epair_dpcpu *epair_dpcpu; + + epair_dpcpu = DPCPU_ID_PTR(cpuid_from_ifp(ifp), epair_dpcpu); + EPAIR_LOCK(epair_dpcpu); + epair_start_locked(ifp); + EPAIR_UNLOCK(epair_dpcpu); +} + +static int +epair_transmit_locked(struct ifnet *ifp, struct mbuf *m) +{ + struct epair_dpcpu *epair_dpcpu; + struct epair_softc *sc; + struct ifnet *oifp; + int error, len; + short mflags; + + DPRINTF("ifp=%p m=%p\n", ifp, m); + sc = ifp->if_softc; + epair_dpcpu = DPCPU_ID_PTR(sc->cpuid, epair_dpcpu); + EPAIR_LOCK_ASSERT(epair_dpcpu); + + if (m == NULL) + return (0); + + /* + * We are not going to use the interface en/dequeue mechanism + * on the TX side. 
We are called from ether_output_frame() + * and will put the packet into the incoming queue of the + * other interface of our pair via the netsir. + */ + if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { + m_freem(m); + return (ENXIO); + } + if ((ifp->if_flags & IFF_UP) == 0) { + m_freem(m); + return (ENETDOWN); + } + + BPF_MTAP(ifp, m); + + /* + * In case the outgoing interface is not usable, + * drop the packet. + */ + oifp = sc->oifp; + if ((oifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || + (oifp->if_flags & IFF_UP) ==0) { + ifp->if_oerrors++; + m_freem(m); + return (0); + } + len = m->m_pkthdr.len; + mflags = m->m_flags; + DPRINTF("packet %s -> %s\n", ifp->if_xname, oifp->if_xname); + +#ifdef ALTQ + /* Support ALTQ via the clasic if_start() path. */ + IF_LOCK(&ifp->if_snd); + if (ALTQ_IS_ENABLED(&ifp->if_snd)) { + ALTQ_ENQUEUE(&ifp->if_snd, m, NULL, error); + if (error) + ifp->if_snd.ifq_drops++; + IF_UNLOCK(&ifp->if_snd); + if (!error) { + ifp->if_obytes += len; + if (mflags & (M_BCAST|M_MCAST)) + ifp->if_omcasts++; + + if ((ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) + epair_start_locked(ifp); + else + (void)epair_add_ifp_for_draining(ifp); + } + return (error); + } + IF_UNLOCK(&ifp->if_snd); +#endif + + if ((epair_dpcpu->epair_drv_flags & IFF_DRV_OACTIVE) != 0) { + /* + * Our hardware queue is full, try to fall back + * queuing to the ifq but do not call ifp->if_start. + * Either we are lucky or the packet is gone. + */ + IFQ_ENQUEUE(&ifp->if_snd, m, error); + if (!error) + (void)epair_add_ifp_for_draining(ifp); + return (error); + } + sc = oifp->if_softc; + /* + * Add a reference so the interface cannot go while the + * packet is in transit as we rely on rcvif to stay valid. + */ + EPAIR_REFCOUNT_AQUIRE(&sc->refcount); + m->m_pkthdr.rcvif = oifp; + CURVNET_SET_QUIET(oifp->if_vnet); + error = netisr_queue(NETISR_EPAIR, m); + CURVNET_RESTORE(); + if (!error) { + ifp->if_opackets++; + /* + * IFQ_HANDOFF_ADJ/ip_handoff() update statistics, + * but as we bypass all this we have to duplicate + * the logic another time. + */ + ifp->if_obytes += len; + if (mflags & (M_BCAST|M_MCAST)) + ifp->if_omcasts++; + /* Someone else received the packet. */ + oifp->if_ipackets++; + } else { + /* The packet was freed already. */ + epair_dpcpu->epair_drv_flags |= IFF_DRV_OACTIVE; + ifp->if_drv_flags |= IFF_DRV_OACTIVE; + ifp->if_oerrors++; + EPAIR_REFCOUNT_RELEASE(&sc->refcount); + EPAIR_REFCOUNT_ASSERT((int)sc->refcount >= 1, + ("%s: ifp=%p sc->refcount not >= 1: %d", + __func__, oifp, sc->refcount)); + } + + return (error); +} + +static int +epair_transmit(struct ifnet *ifp, struct mbuf *m) +{ + struct epair_dpcpu *epair_dpcpu; + int error; + + epair_dpcpu = DPCPU_ID_PTR(cpuid_from_ifp(ifp), epair_dpcpu); + EPAIR_LOCK(epair_dpcpu); + error = epair_transmit_locked(ifp, m); + EPAIR_UNLOCK(epair_dpcpu); + return (error); +} + +static void +epair_qflush(struct ifnet *ifp) +{ + struct epair_softc *sc; + + sc = ifp->if_softc; + KASSERT(sc != NULL, ("%s: ifp=%p, epair_softc gone? sc=%p\n", + __func__, ifp, sc)); + /* + * Remove this ifp from all backpointer lists. The interface will not + * usable for flushing anyway nor should it have anything to flush + * after if_qflush(). 
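+ *
+ * The hook itself was installed in epair_clone_create() (sketch):
+ *
+ *	sc->if_qflush = ifp->if_qflush;		(save stock method)
+ *	ifp->if_qflush = epair_qflush;		(interpose ourselves)
+ *
+ * so the chained call below always ends in the original flush.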
+ */ + epair_remove_ifp_from_draining(ifp); + + if (sc->if_qflush) + sc->if_qflush(ifp); +} + +static int +epair_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) +{ + struct ifreq *ifr; + int error; + + ifr = (struct ifreq *)data; + switch (cmd) { + case SIOCSIFFLAGS: + case SIOCADDMULTI: + case SIOCDELMULTI: + error = 0; + break; + + case SIOCSIFMTU: + /* We basically allow all kinds of MTUs. */ + ifp->if_mtu = ifr->ifr_mtu; + error = 0; + break; + + default: + /* Let the common ethernet handler process this. */ + error = ether_ioctl(ifp, cmd, data); + break; + } + + return (error); +} + +static void +epair_init(void *dummy __unused) +{ +} + + +/* + * Interface cloning functions. + * We use our private ones so that we can create/destroy our secondary + * device along with the primary one. + */ +static int +epair_clone_match(struct if_clone *ifc, const char *name) +{ + const char *cp; + + DPRINTF("name='%s'\n", name); + + /* + * Our base name is epair. + * Our interfaces will be named epair[ab]. + * So accept anything of the following list: + * - epair + * - epair + * but not the epair[ab] versions. + */ + if (strncmp(EPAIRNAME, name, sizeof(EPAIRNAME)-1) != 0) + return (0); + + for (cp = name + sizeof(EPAIRNAME) - 1; *cp != '\0'; cp++) { + if (*cp < '0' || *cp > '9') + return (0); + } + + return (1); +} + +static int +epair_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) +{ + struct epair_softc *sca, *scb; + struct ifnet *ifp; + char *dp; + int error, unit, wildcard; + uint8_t eaddr[ETHER_ADDR_LEN]; /* 00:00:00:00:00:00 */ + + /* + * We are abusing params to create our second interface. + * Actually we already created it and called if_clone_createif() + * for it to do the official insertion procedure the moment we knew + * it cannot fail anymore. So just do attach it here. + */ + if (params) { + scb = (struct epair_softc *)params; + ifp = scb->ifp; + /* Assign a hopefully unique, locally administered etheraddr. */ + eaddr[0] = 0x02; + eaddr[3] = (ifp->if_index >> 8) & 0xff; + eaddr[4] = ifp->if_index & 0xff; + eaddr[5] = 0x0b; + ether_ifattach(ifp, eaddr); + /* Correctly set the name for the cloner list. */ + strlcpy(name, scb->ifp->if_xname, len); + return (0); + } + + /* Try to see if a special unit was requested. */ + error = ifc_name2unit(name, &unit); + if (error != 0) + return (error); + wildcard = (unit < 0); + + error = ifc_alloc_unit(ifc, &unit); + if (error != 0) + return (error); + + /* + * If no unit had been given, we need to adjust the ifName. + * Also make sure there is space for our extra [ab] suffix. + */ + for (dp = name; *dp != '\0'; dp++); + if (wildcard) { + error = snprintf(dp, len - (dp - name), "%d", unit); + if (error > len - (dp - name) - 1) { + /* ifName too long. */ + ifc_free_unit(ifc, unit); + return (ENOSPC); + } + dp += error; + } + if (len - (dp - name) - 1 < 1) { + /* No space left for our [ab] suffix. */ + ifc_free_unit(ifc, unit); + return (ENOSPC); + } + *dp = 'a'; + /* Must not change dp so we can replace 'a' by 'b' later. 
*/ + *(dp+1) = '\0'; + + /* Allocate memory for both [ab] interfaces */ + sca = malloc(sizeof(struct epair_softc), M_EPAIR, M_WAITOK | M_ZERO); + EPAIR_REFCOUNT_INIT(&sca->refcount, 1); + sca->ifp = if_alloc(IFT_ETHER); + if (sca->ifp == NULL) { + free(sca, M_EPAIR); + ifc_free_unit(ifc, unit); + return (ENOSPC); + } + + scb = malloc(sizeof(struct epair_softc), M_EPAIR, M_WAITOK | M_ZERO); + EPAIR_REFCOUNT_INIT(&scb->refcount, 1); + scb->ifp = if_alloc(IFT_ETHER); + if (scb->ifp == NULL) { + free(scb, M_EPAIR); + if_free(sca->ifp); + free(sca, M_EPAIR); + ifc_free_unit(ifc, unit); + return (ENOSPC); + } + + /* + * Cross-reference the interfaces so we will be able to free both. + */ + sca->oifp = scb->ifp; + scb->oifp = sca->ifp; + + /* + * Calculate the cpuid for netisr queueing based on the + * ifIndex of the interfaces. As long as we cannot configure + * this or use cpuset information easily we cannot guarantee + * cache locality but we can at least allow parallelism. + */ + sca->cpuid = + netisr_get_cpuid(sca->ifp->if_index % netisr_get_cpucount()); + scb->cpuid = + netisr_get_cpuid(scb->ifp->if_index % netisr_get_cpucount()); + + /* Finish initialization of interface a. */ + ifp = sca->ifp; + ifp->if_softc = sca; + strlcpy(ifp->if_xname, name, IFNAMSIZ); + ifp->if_dname = ifc->ifc_name; + ifp->if_dunit = unit; + ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; + ifp->if_start = epair_start; + ifp->if_ioctl = epair_ioctl; + ifp->if_init = epair_init; + ifp->if_snd.ifq_maxlen = ifqmaxlen; + /* Assign a hopefully unique, locally administered etheraddr. */ + eaddr[0] = 0x02; + eaddr[3] = (ifp->if_index >> 8) & 0xff; + eaddr[4] = ifp->if_index & 0xff; + eaddr[5] = 0x0a; + ether_ifattach(ifp, eaddr); + sca->if_qflush = ifp->if_qflush; + ifp->if_qflush = epair_qflush; + ifp->if_transmit = epair_transmit; + ifp->if_baudrate = IF_Gbps(10UL); /* arbitrary maximum */ + + /* Swap the name and finish initialization of interface b. */ + *dp = 'b'; + + ifp = scb->ifp; + ifp->if_softc = scb; + strlcpy(ifp->if_xname, name, IFNAMSIZ); + ifp->if_dname = ifc->ifc_name; + ifp->if_dunit = unit; + ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; + ifp->if_start = epair_start; + ifp->if_ioctl = epair_ioctl; + ifp->if_init = epair_init; + ifp->if_snd.ifq_maxlen = ifqmaxlen; + /* We need to play some tricks here for the second interface. */ + strlcpy(name, EPAIRNAME, len); + error = if_clone_create(name, len, (caddr_t)scb); + if (error) + panic("%s: if_clone_createif() for our 2nd iface failed: %d", + __func__, error); + scb->if_qflush = ifp->if_qflush; + ifp->if_qflush = epair_qflush; + ifp->if_transmit = epair_transmit; + ifp->if_baudrate = IF_Gbps(10UL); /* arbitrary maximum */ + + /* + * Restore name to a as the ifp for this will go into the + * cloner list for the initial call. + */ + strlcpy(name, sca->ifp->if_xname, len); + DPRINTF("name='%s/%db' created sca=%p scb=%p\n", name, unit, sca, scb); + + /* Tell the world, that we are ready to rock. */ + sca->ifp->if_drv_flags |= IFF_DRV_RUNNING; + scb->ifp->if_drv_flags |= IFF_DRV_RUNNING; + if_link_state_change(sca->ifp, LINK_STATE_UP); + if_link_state_change(scb->ifp, LINK_STATE_UP); + + return (0); +} + +static int +epair_clone_destroy(struct if_clone *ifc, struct ifnet *ifp) +{ + struct ifnet *oifp; + struct epair_softc *sca, *scb; + int unit, error; + + DPRINTF("ifp=%p\n", ifp); + + /* + * In case we called into if_clone_destroyif() ourselves + * again to remove the second interface, the softc will be + * NULL. 
In that case so not do anything but return success. + */ + if (ifp->if_softc == NULL) + return (0); + + unit = ifp->if_dunit; + sca = ifp->if_softc; + oifp = sca->oifp; + scb = oifp->if_softc; + + DPRINTF("ifp=%p oifp=%p\n", ifp, oifp); + if_link_state_change(ifp, LINK_STATE_DOWN); + if_link_state_change(oifp, LINK_STATE_DOWN); + ifp->if_drv_flags &= ~IFF_DRV_RUNNING; + oifp->if_drv_flags &= ~IFF_DRV_RUNNING; + ether_ifdetach(oifp); + ether_ifdetach(ifp); + /* + * Wait for all packets to be dispatched to if_input. + * The numbers can only go down as the interfaces are + * detached so there is no need to use atomics. + */ + DPRINTF("sca refcnt=%u scb refcnt=%u\n", sca->refcount, scb->refcount); + EPAIR_REFCOUNT_ASSERT(sca->refcount == 1 && scb->refcount == 1, + ("%s: ifp=%p sca->refcount!=1: %d || ifp=%p scb->refcount!=1: %d", + __func__, ifp, sca->refcount, oifp, scb->refcount)); + + /* + * Get rid of our second half. + */ + oifp->if_softc = NULL; + error = if_clone_destroyif(ifc, oifp); + if (error) + panic("%s: if_clone_destroyif() for our 2nd iface failed: %d", + __func__, error); + + /* + * Finish cleaning up. Free them and release the unit. + * As the other of the two interfaces my reside in a different vnet, + * we need to switch before freeing them. + */ + CURVNET_SET_QUIET(oifp->if_vnet); + if_free(oifp); + CURVNET_RESTORE(); + if_free(ifp); + free(scb, M_EPAIR); + free(sca, M_EPAIR); + ifc_free_unit(ifc, unit); + + return (0); +} + +static int +epair_modevent(module_t mod, int type, void *data) +{ + int qlimit; + + switch (type) { + case MOD_LOAD: + /* For now limit us to one global mutex and one inq. */ + epair_dpcpu_init(); + epair_nh.nh_qlimit = 42 * ifqmaxlen; /* 42 shall be the number. */ +#ifndef __rtems__ + if (TUNABLE_INT_FETCH("net.link.epair.netisr_maxqlen", &qlimit)) + epair_nh.nh_qlimit = qlimit; +#endif + netisr_register(&epair_nh); + if_clone_attach(&epair_cloner); + if (bootverbose) + printf("%s initialized.\n", EPAIRNAME); + break; + case MOD_UNLOAD: + if_clone_detach(&epair_cloner); + netisr_unregister(&epair_nh); + epair_dpcpu_detach(); + if (bootverbose) + printf("%s unloaded.\n", EPAIRNAME); + break; + default: + return (EOPNOTSUPP); + } + return (0); +} + +static moduledata_t epair_mod = { + "if_epair", + epair_modevent, + 0 +}; + +DECLARE_MODULE(if_epair, epair_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); +MODULE_VERSION(if_epair, 1); diff --git a/freebsd/sys/net/if_ethersubr.c b/freebsd/sys/net/if_ethersubr.c new file mode 100644 index 00000000..d87ebbd0 --- /dev/null +++ b/freebsd/sys/net/if_ethersubr.c @@ -0,0 +1,1364 @@ +#include + +/*- + * Copyright (c) 1982, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)if_ethersubr.c 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(INET) || defined(INET6) +#include +#include +#include +#include +#include +#include +#include +#endif +#ifdef INET6 +#include +#endif + +#ifdef IPX +#include +#include +#endif + +int (*ef_inputp)(struct ifnet*, struct ether_header *eh, struct mbuf *m); +int (*ef_outputp)(struct ifnet *ifp, struct mbuf **mp, + struct sockaddr *dst, short *tp, int *hlen); + +#ifdef NETATALK +#include +#include +#include + +#define llc_snap_org_code llc_un.type_snap.org_code +#define llc_snap_ether_type llc_un.type_snap.ether_type + +extern u_char at_org_code[3]; +extern u_char aarp_org_code[3]; +#endif /* NETATALK */ + +#include + +#ifdef CTASSERT +CTASSERT(sizeof (struct ether_header) == ETHER_ADDR_LEN * 2 + 2); +CTASSERT(sizeof (struct ether_addr) == ETHER_ADDR_LEN); +#endif + +/* netgraph node hooks for ng_ether(4) */ +void (*ng_ether_input_p)(struct ifnet *ifp, struct mbuf **mp); +void (*ng_ether_input_orphan_p)(struct ifnet *ifp, struct mbuf *m); +int (*ng_ether_output_p)(struct ifnet *ifp, struct mbuf **mp); +void (*ng_ether_attach_p)(struct ifnet *ifp); +void (*ng_ether_detach_p)(struct ifnet *ifp); + +void (*vlan_input_p)(struct ifnet *, struct mbuf *); + +/* if_bridge(4) support */ +struct mbuf *(*bridge_input_p)(struct ifnet *, struct mbuf *); +int (*bridge_output_p)(struct ifnet *, struct mbuf *, + struct sockaddr *, struct rtentry *); +void (*bridge_dn_p)(struct mbuf *, struct ifnet *); + +/* if_lagg(4) support */ +struct mbuf *(*lagg_input_p)(struct ifnet *, struct mbuf *); + +static const u_char etherbroadcastaddr[ETHER_ADDR_LEN] = + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; + +static int ether_resolvemulti(struct ifnet *, struct sockaddr **, + struct sockaddr *); +#ifdef VIMAGE +static void ether_reassign(struct ifnet *, struct vnet *, char *); +#endif + +/* XXX: should be in an arp support file, not here */ +MALLOC_DEFINE(M_ARPCOM, "arpcom", "802.* interface internals"); + +#define ETHER_IS_BROADCAST(addr) \ + (bcmp(etherbroadcastaddr, (addr), ETHER_ADDR_LEN) == 0) + +#define senderr(e) do { error = (e); goto bad;} while (0) + +#if defined(INET) || defined(INET6) +int +ether_ipfw_chk(struct mbuf **m0, struct ifnet *dst, int shared); +static VNET_DEFINE(int, ether_ipfw); +#define V_ether_ipfw VNET(ether_ipfw) +#endif + + +/* + * Ethernet output routine. + * Encapsulate a packet of type family for the local net. 
+ * Use trailer local net encapsulation if enough data in first + * packet leaves a multiple of 512 bytes of data in remainder. + */ +int +ether_output(struct ifnet *ifp, struct mbuf *m, + struct sockaddr *dst, struct route *ro) +{ + short type; + int error = 0, hdrcmplt = 0; + u_char esrc[ETHER_ADDR_LEN], edst[ETHER_ADDR_LEN]; + struct llentry *lle = NULL; + struct rtentry *rt0 = NULL; + struct ether_header *eh; + struct pf_mtag *t; + int loop_copy = 1; + int hlen; /* link layer header length */ + + if (ro != NULL) { + if (!(m->m_flags & (M_BCAST | M_MCAST))) + lle = ro->ro_lle; + rt0 = ro->ro_rt; + } +#ifdef MAC + error = mac_ifnet_check_transmit(ifp, m); + if (error) + senderr(error); +#endif + + M_PROFILE(m); + if (ifp->if_flags & IFF_MONITOR) + senderr(ENETDOWN); + if (!((ifp->if_flags & IFF_UP) && + (ifp->if_drv_flags & IFF_DRV_RUNNING))) + senderr(ENETDOWN); + + hlen = ETHER_HDR_LEN; + switch (dst->sa_family) { +#ifdef INET + case AF_INET: + if (lle != NULL && (lle->la_flags & LLE_VALID)) + memcpy(edst, &lle->ll_addr.mac16, sizeof(edst)); + else + error = arpresolve(ifp, rt0, m, dst, edst, &lle); + if (error) + return (error == EWOULDBLOCK ? 0 : error); + type = htons(ETHERTYPE_IP); + break; + case AF_ARP: + { + struct arphdr *ah; + ah = mtod(m, struct arphdr *); + ah->ar_hrd = htons(ARPHRD_ETHER); + + loop_copy = 0; /* if this is for us, don't do it */ + + switch(ntohs(ah->ar_op)) { + case ARPOP_REVREQUEST: + case ARPOP_REVREPLY: + type = htons(ETHERTYPE_REVARP); + break; + case ARPOP_REQUEST: + case ARPOP_REPLY: + default: + type = htons(ETHERTYPE_ARP); + break; + } + + if (m->m_flags & M_BCAST) + bcopy(ifp->if_broadcastaddr, edst, ETHER_ADDR_LEN); + else + bcopy(ar_tha(ah), edst, ETHER_ADDR_LEN); + + } + break; +#endif +#ifdef INET6 + case AF_INET6: + if (lle != NULL && (lle->la_flags & LLE_VALID)) + memcpy(edst, &lle->ll_addr.mac16, sizeof(edst)); + else + error = nd6_storelladdr(ifp, m, dst, (u_char *)edst, &lle); + if (error) + return error; + type = htons(ETHERTYPE_IPV6); + break; +#endif +#ifdef IPX + case AF_IPX: + if (ef_outputp) { + error = ef_outputp(ifp, &m, dst, &type, &hlen); + if (error) + goto bad; + } else + type = htons(ETHERTYPE_IPX); + bcopy((caddr_t)&(((struct sockaddr_ipx *)dst)->sipx_addr.x_host), + (caddr_t)edst, sizeof (edst)); + break; +#endif +#ifdef NETATALK + case AF_APPLETALK: + { + struct at_ifaddr *aa; + + if ((aa = at_ifawithnet((struct sockaddr_at *)dst)) == NULL) + senderr(EHOSTUNREACH); /* XXX */ + if (!aarpresolve(ifp, m, (struct sockaddr_at *)dst, edst)) { + ifa_free(&aa->aa_ifa); + return (0); + } + /* + * In the phase 2 case, need to prepend an mbuf for the llc header. 
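+ * The encapsulation added here is the standard 8 byte 802.2
+ * LLC/SNAP header; as a sketch of the layout being built:
+ *
+ *	dsap 0xaa, ssap 0xaa, control 0x03 (UI),
+ *	3 byte organization code, 2 byte protocol type
+ *
+ * which is why LLC_SNAPFRAMELEN bytes are reserved for it below.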
+ */ + if ( aa->aa_flags & AFA_PHASE2 ) { + struct llc llc; + + ifa_free(&aa->aa_ifa); + M_PREPEND(m, LLC_SNAPFRAMELEN, M_DONTWAIT); + if (m == NULL) + senderr(ENOBUFS); + llc.llc_dsap = llc.llc_ssap = LLC_SNAP_LSAP; + llc.llc_control = LLC_UI; + bcopy(at_org_code, llc.llc_snap_org_code, sizeof(at_org_code)); + llc.llc_snap_ether_type = htons( ETHERTYPE_AT ); + bcopy(&llc, mtod(m, caddr_t), LLC_SNAPFRAMELEN); + type = htons(m->m_pkthdr.len); + hlen = LLC_SNAPFRAMELEN + ETHER_HDR_LEN; + } else { + ifa_free(&aa->aa_ifa); + type = htons(ETHERTYPE_AT); + } + break; + } +#endif /* NETATALK */ + + case pseudo_AF_HDRCMPLT: + hdrcmplt = 1; + eh = (struct ether_header *)dst->sa_data; + (void)memcpy(esrc, eh->ether_shost, sizeof (esrc)); + /* FALLTHROUGH */ + + case AF_UNSPEC: + loop_copy = 0; /* if this is for us, don't do it */ + eh = (struct ether_header *)dst->sa_data; + (void)memcpy(edst, eh->ether_dhost, sizeof (edst)); + type = eh->ether_type; + break; + + default: + if_printf(ifp, "can't handle af%d\n", dst->sa_family); + senderr(EAFNOSUPPORT); + } + + if (lle != NULL && (lle->la_flags & LLE_IFADDR)) { + int csum_flags = 0; + if (m->m_pkthdr.csum_flags & CSUM_IP) + csum_flags |= (CSUM_IP_CHECKED|CSUM_IP_VALID); + if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) + csum_flags |= (CSUM_DATA_VALID|CSUM_PSEUDO_HDR); + if (m->m_pkthdr.csum_flags & CSUM_SCTP) + csum_flags |= CSUM_SCTP_VALID; + m->m_pkthdr.csum_flags |= csum_flags; + m->m_pkthdr.csum_data = 0xffff; + return (if_simloop(ifp, m, dst->sa_family, 0)); + } + + /* + * Add local net header. If no space in first mbuf, + * allocate another. + */ + M_PREPEND(m, ETHER_HDR_LEN, M_DONTWAIT); + if (m == NULL) + senderr(ENOBUFS); + eh = mtod(m, struct ether_header *); + (void)memcpy(&eh->ether_type, &type, + sizeof(eh->ether_type)); + (void)memcpy(eh->ether_dhost, edst, sizeof (edst)); + if (hdrcmplt) + (void)memcpy(eh->ether_shost, esrc, + sizeof(eh->ether_shost)); + else + (void)memcpy(eh->ether_shost, IF_LLADDR(ifp), + sizeof(eh->ether_shost)); + + /* + * If a simplex interface, and the packet is being sent to our + * Ethernet address or a broadcast address, loopback a copy. + * XXX To make a simplex device behave exactly like a duplex + * device, we should copy in the case of sending to our own + * ethernet address (thus letting the original actually appear + * on the wire). However, we don't do that here for security + * reasons and compatibility with the original behavior. + */ + if ((ifp->if_flags & IFF_SIMPLEX) && loop_copy && + ((t = pf_find_mtag(m)) == NULL || !t->routed)) { + int csum_flags = 0; + + if (m->m_pkthdr.csum_flags & CSUM_IP) + csum_flags |= (CSUM_IP_CHECKED|CSUM_IP_VALID); + if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) + csum_flags |= (CSUM_DATA_VALID|CSUM_PSEUDO_HDR); + if (m->m_pkthdr.csum_flags & CSUM_SCTP) + csum_flags |= CSUM_SCTP_VALID; + + if (m->m_flags & M_BCAST) { + struct mbuf *n; + + /* + * Because if_simloop() modifies the packet, we need a + * writable copy through m_dup() instead of a readonly + * one as m_copy[m] would give us. The alternative would + * be to modify if_simloop() to handle the readonly mbuf, + * but performancewise it is mostly equivalent (trading + * extra data copying vs. extra locking). + * + * XXX This is a local workaround. A number of less + * often used kernel parts suffer from the same bug. + * See PR kern/105943 for a proposed general solution. 
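+ *
+ * To make the contrast concrete: a reference copy such as
+ *
+ *	n = m_copym(m, 0, M_COPYALL, M_DONTWAIT);
+ *
+ * would share the underlying storage with the original, whereas the
+ *
+ *	n = m_dup(m, M_DONTWAIT);
+ *
+ * used below duplicates the data, so if_simloop() may modify it.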
+ */ + if ((n = m_dup(m, M_DONTWAIT)) != NULL) { + n->m_pkthdr.csum_flags |= csum_flags; + if (csum_flags & CSUM_DATA_VALID) + n->m_pkthdr.csum_data = 0xffff; + (void)if_simloop(ifp, n, dst->sa_family, hlen); + } else + ifp->if_iqdrops++; + } else if (bcmp(eh->ether_dhost, eh->ether_shost, + ETHER_ADDR_LEN) == 0) { + m->m_pkthdr.csum_flags |= csum_flags; + if (csum_flags & CSUM_DATA_VALID) + m->m_pkthdr.csum_data = 0xffff; + (void) if_simloop(ifp, m, dst->sa_family, hlen); + return (0); /* XXX */ + } + } + + /* + * Bridges require special output handling. + */ + if (ifp->if_bridge) { + BRIDGE_OUTPUT(ifp, m, error); + return (error); + } + +#if defined(INET) || defined(INET6) + if (ifp->if_carp && + (error = (*carp_output_p)(ifp, m, dst, NULL))) + goto bad; +#endif + + /* Handle ng_ether(4) processing, if any */ + if (IFP2AC(ifp)->ac_netgraph != NULL) { + KASSERT(ng_ether_output_p != NULL, + ("ng_ether_output_p is NULL")); + if ((error = (*ng_ether_output_p)(ifp, &m)) != 0) { +bad: if (m != NULL) + m_freem(m); + return (error); + } + if (m == NULL) + return (0); + } + + /* Continue with link-layer output */ + return ether_output_frame(ifp, m); +} + +/* + * Ethernet link layer output routine to send a raw frame to the device. + * + * This assumes that the 14 byte Ethernet header is present and contiguous + * in the first mbuf (if BRIDGE'ing). + */ +int +ether_output_frame(struct ifnet *ifp, struct mbuf *m) +{ +#if defined(INET) || defined(INET6) + + if (V_ip_fw_chk_ptr && V_ether_ipfw != 0) { + if (ether_ipfw_chk(&m, ifp, 0) == 0) { + if (m) { + m_freem(m); + return EACCES; /* pkt dropped */ + } else + return 0; /* consumed e.g. in a pipe */ + } + } +#endif + + /* + * Queue message on interface, update output statistics if + * successful, and start output if interface not yet active. + */ + return ((ifp->if_transmit)(ifp, m)); +} + +#if defined(INET) || defined(INET6) +/* + * ipfw processing for ethernet packets (in and out). + * The second parameter is NULL from ether_demux, and ifp from + * ether_output_frame. + */ +int +ether_ipfw_chk(struct mbuf **m0, struct ifnet *dst, int shared) +{ + struct ether_header *eh; + struct ether_header save_eh; + struct mbuf *m; + int i; + struct ip_fw_args args; + struct m_tag *mtag; + + /* fetch start point from rule, if any */ + mtag = m_tag_locate(*m0, MTAG_IPFW_RULE, 0, NULL); + if (mtag == NULL) { + args.rule.slot = 0; + } else { + /* dummynet packet, already partially processed */ + struct ipfw_rule_ref *r; + + /* XXX can we free it after use ? */ + mtag->m_tag_id = PACKET_TAG_NONE; + r = (struct ipfw_rule_ref *)(mtag + 1); + if (r->info & IPFW_ONEPASS) + return (1); + args.rule = *r; + } + + /* + * I need some amt of data to be contiguous, and in case others need + * the packet (shared==1) also better be in the first mbuf. + */ + m = *m0; + i = min( m->m_pkthdr.len, max_protohdr); + if ( shared || m->m_len < i) { + m = m_pullup(m, i); + if (m == NULL) { + *m0 = m; + return 0; + } + } + eh = mtod(m, struct ether_header *); + save_eh = *eh; /* save copy for restore below */ + m_adj(m, ETHER_HDR_LEN); /* strip ethernet header */ + + args.m = m; /* the packet we are looking at */ + args.oif = dst; /* destination, if any */ + args.next_hop = NULL; /* we do not support forward yet */ + args.eh = &save_eh; /* MAC header for bridged/MAC packets */ + args.inp = NULL; /* used by ipfw uid/gid/jail rules */ + i = V_ip_fw_chk_ptr(&args); + m = args.m; + if (m != NULL) { + /* + * Restore Ethernet header, as needed, in case the + * mbuf chain was replaced by ipfw. 
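+ * Note that M_PREPEND() may hand back the original header location;
+ * the bcopy() below therefore only runs when the saved header has
+ * to be written into a freshly allocated first mbuf.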
+ */ + M_PREPEND(m, ETHER_HDR_LEN, M_DONTWAIT); + if (m == NULL) { + *m0 = m; + return 0; + } + if (eh != mtod(m, struct ether_header *)) + bcopy(&save_eh, mtod(m, struct ether_header *), + ETHER_HDR_LEN); + } + *m0 = m; + + if (i == IP_FW_DENY) /* drop */ + return 0; + + KASSERT(m != NULL, ("ether_ipfw_chk: m is NULL")); + + if (i == IP_FW_PASS) /* a PASS rule. */ + return 1; + + if (ip_dn_io_ptr && (i == IP_FW_DUMMYNET)) { + int dir; + /* + * Pass the pkt to dummynet, which consumes it. + * If shared, make a copy and keep the original. + */ + if (shared) { + m = m_copypacket(m, M_DONTWAIT); + if (m == NULL) + return 0; + } else { + /* + * Pass the original to dummynet and + * nothing back to the caller + */ + *m0 = NULL ; + } + dir = PROTO_LAYER2 | (dst ? DIR_OUT : DIR_IN); + ip_dn_io_ptr(&m, dir, &args); + return 0; + } + /* + * XXX at some point add support for divert/forward actions. + * If none of the above matches, we have to drop the pkt. + */ + return 0; +} +#endif + +/* + * Process a received Ethernet packet; the packet is in the + * mbuf chain m with the ethernet header at the front. + */ +static void +ether_input(struct ifnet *ifp, struct mbuf *m) +{ + struct ether_header *eh; + u_short etype; + + if ((ifp->if_flags & IFF_UP) == 0) { + m_freem(m); + return; + } +#ifdef DIAGNOSTIC + if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { + if_printf(ifp, "discard frame at !IFF_DRV_RUNNING\n"); + m_freem(m); + return; + } +#endif + /* + * Do consistency checks to verify assumptions + * made by code past this point. + */ + if ((m->m_flags & M_PKTHDR) == 0) { + if_printf(ifp, "discard frame w/o packet header\n"); + ifp->if_ierrors++; + m_freem(m); + return; + } + if (m->m_len < ETHER_HDR_LEN) { + /* XXX maybe should pullup? */ + if_printf(ifp, "discard frame w/o leading ethernet " + "header (len %u pkt len %u)\n", + m->m_len, m->m_pkthdr.len); + ifp->if_ierrors++; + m_freem(m); + return; + } + eh = mtod(m, struct ether_header *); + etype = ntohs(eh->ether_type); + if (m->m_pkthdr.rcvif == NULL) { + if_printf(ifp, "discard frame w/o interface pointer\n"); + ifp->if_ierrors++; + m_freem(m); + return; + } +#ifdef DIAGNOSTIC + if (m->m_pkthdr.rcvif != ifp) { + if_printf(ifp, "Warning, frame marked as received on %s\n", + m->m_pkthdr.rcvif->if_xname); + } +#endif + + CURVNET_SET_QUIET(ifp->if_vnet); + + if (ETHER_IS_MULTICAST(eh->ether_dhost)) { + if (ETHER_IS_BROADCAST(eh->ether_dhost)) + m->m_flags |= M_BCAST; + else + m->m_flags |= M_MCAST; + ifp->if_imcasts++; + } + +#ifdef MAC + /* + * Tag the mbuf with an appropriate MAC label before any other + * consumers can get to it. + */ + mac_ifnet_create_mbuf(ifp, m); +#endif + + /* + * Give bpf a chance at the packet. + */ + ETHER_BPF_MTAP(ifp, m); + + /* + * If the CRC is still on the packet, trim it off. We do this once + * and once only in case we are re-entered. Nothing else on the + * Ethernet receive path expects to see the FCS. + */ + if (m->m_flags & M_HASFCS) { + m_adj(m, -ETHER_CRC_LEN); + m->m_flags &= ~M_HASFCS; + } + + ifp->if_ibytes += m->m_pkthdr.len; + + /* Allow monitor mode to claim this frame, after stats are updated. 
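+ * A monitoring interface acts as a pure tap: the frame has been
+ * counted and passed to BPF above, and is now freed instead of
+ * being demultiplexed any further.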
*/ + if (ifp->if_flags & IFF_MONITOR) { + m_freem(m); + CURVNET_RESTORE(); + return; + } + + /* Handle input from a lagg(4) port */ + if (ifp->if_type == IFT_IEEE8023ADLAG) { + KASSERT(lagg_input_p != NULL, + ("%s: if_lagg not loaded!", __func__)); + m = (*lagg_input_p)(ifp, m); + if (m != NULL) + ifp = m->m_pkthdr.rcvif; + else + return; + } + + /* + * If the hardware did not process an 802.1Q tag, do this now, + * to allow 802.1P priority frames to be passed to the main input + * path correctly. + * TODO: Deal with Q-in-Q frames, but not arbitrary nesting levels. + */ + if ((m->m_flags & M_VLANTAG) == 0 && etype == ETHERTYPE_VLAN) { + struct ether_vlan_header *evl; + + if (m->m_len < sizeof(*evl) && + (m = m_pullup(m, sizeof(*evl))) == NULL) { +#ifdef DIAGNOSTIC + if_printf(ifp, "cannot pullup VLAN header\n"); +#endif + ifp->if_ierrors++; + m_freem(m); + return; + } + + evl = mtod(m, struct ether_vlan_header *); + m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag); + m->m_flags |= M_VLANTAG; + + bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN, + ETHER_HDR_LEN - ETHER_TYPE_LEN); + m_adj(m, ETHER_VLAN_ENCAP_LEN); + } + + /* Allow ng_ether(4) to claim this frame. */ + if (IFP2AC(ifp)->ac_netgraph != NULL) { + KASSERT(ng_ether_input_p != NULL, + ("%s: ng_ether_input_p is NULL", __func__)); + m->m_flags &= ~M_PROMISC; + (*ng_ether_input_p)(ifp, &m); + if (m == NULL) { + CURVNET_RESTORE(); + return; + } + } + + /* + * Allow if_bridge(4) to claim this frame. + * The BRIDGE_INPUT() macro will update ifp if the bridge changed it + * and the frame should be delivered locally. + */ + if (ifp->if_bridge != NULL) { + m->m_flags &= ~M_PROMISC; + BRIDGE_INPUT(ifp, m); + if (m == NULL) { + CURVNET_RESTORE(); + return; + } + } + +#if defined(INET) || defined(INET6) + /* + * Clear M_PROMISC on frame so that carp(4) will see it when the + * mbuf flows up to Layer 3. + * FreeBSD's implementation of carp(4) uses the inprotosw + * to dispatch IPPROTO_CARP. carp(4) also allocates its own + * Ethernet addresses of the form 00:00:5e:00:01:xx, which + * is outside the scope of the M_PROMISC test below. + * TODO: Maintain a hash table of ethernet addresses other than + * ether_dhost which may be active on this ifp. + */ + if (ifp->if_carp && (*carp_forus_p)(ifp, eh->ether_dhost)) { + m->m_flags &= ~M_PROMISC; + } else +#endif + { + /* + * If the frame received was not for our MAC address, set the + * M_PROMISC flag on the mbuf chain. The frame may need to + * be seen by the rest of the Ethernet input path in case of + * re-entry (e.g. bridge, vlan, netgraph) but should not be + * seen by upper protocol layers. + */ + if (!ETHER_IS_MULTICAST(eh->ether_dhost) && + bcmp(IF_LLADDR(ifp), eh->ether_dhost, ETHER_ADDR_LEN) != 0) + m->m_flags |= M_PROMISC; + } + + /* First chunk of an mbuf contains good entropy */ + if (harvest.ethernet) + random_harvest(m, 16, 3, 0, RANDOM_NET); + + ether_demux(ifp, m); + CURVNET_RESTORE(); +} + +/* + * Upper layer processing for a received Ethernet packet. + */ +void +ether_demux(struct ifnet *ifp, struct mbuf *m) +{ + struct ether_header *eh; + int isr; + u_short ether_type; +#if defined(NETATALK) + struct llc *l; +#endif + + KASSERT(ifp != NULL, ("%s: NULL interface pointer", __func__)); + +#if defined(INET) || defined(INET6) + /* + * Allow dummynet and/or ipfw to claim the frame. + * Do not do this for PROMISC frames in case we are re-entered. 
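+ * A return value of 0 from ether_ipfw_chk() means the caller is
+ * done with the packet: a non-NULL mbuf is a firewall drop to be
+ * freed here, a NULL mbuf was consumed elsewhere, e.g. by a
+ * dummynet pipe.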
+ */ + if (V_ip_fw_chk_ptr && V_ether_ipfw != 0 && !(m->m_flags & M_PROMISC)) { + if (ether_ipfw_chk(&m, NULL, 0) == 0) { + if (m) + m_freem(m); /* dropped; free mbuf chain */ + return; /* consumed */ + } + } +#endif + eh = mtod(m, struct ether_header *); + ether_type = ntohs(eh->ether_type); + + /* + * If this frame has a VLAN tag other than 0, call vlan_input() + * if its module is loaded. Otherwise, drop. + */ + if ((m->m_flags & M_VLANTAG) && + EVL_VLANOFTAG(m->m_pkthdr.ether_vtag) != 0) { + if (ifp->if_vlantrunk == NULL) { + ifp->if_noproto++; + m_freem(m); + return; + } + KASSERT(vlan_input_p != NULL,("%s: VLAN not loaded!", + __func__)); + /* Clear before possibly re-entering ether_input(). */ + m->m_flags &= ~M_PROMISC; + (*vlan_input_p)(ifp, m); + return; + } + + /* + * Pass promiscuously received frames to the upper layer if the user + * requested this by setting IFF_PPROMISC. Otherwise, drop them. + */ + if ((ifp->if_flags & IFF_PPROMISC) == 0 && (m->m_flags & M_PROMISC)) { + m_freem(m); + return; + } + + /* + * Reset layer specific mbuf flags to avoid confusing upper layers. + * Strip off Ethernet header. + */ + m->m_flags &= ~M_VLANTAG; + m->m_flags &= ~(M_PROTOFLAGS); + m_adj(m, ETHER_HDR_LEN); + + /* + * Dispatch frame to upper layer. + */ + switch (ether_type) { +#ifdef INET + case ETHERTYPE_IP: + if ((m = ip_fastforward(m)) == NULL) + return; + isr = NETISR_IP; + break; + + case ETHERTYPE_ARP: + if (ifp->if_flags & IFF_NOARP) { + /* Discard packet if ARP is disabled on interface */ + m_freem(m); + return; + } + isr = NETISR_ARP; + break; +#endif +#ifdef IPX + case ETHERTYPE_IPX: + if (ef_inputp && ef_inputp(ifp, eh, m) == 0) + return; + isr = NETISR_IPX; + break; +#endif +#ifdef INET6 + case ETHERTYPE_IPV6: + isr = NETISR_IPV6; + break; +#endif +#ifdef NETATALK + case ETHERTYPE_AT: + isr = NETISR_ATALK1; + break; + case ETHERTYPE_AARP: + isr = NETISR_AARP; + break; +#endif /* NETATALK */ + default: +#ifdef IPX + if (ef_inputp && ef_inputp(ifp, eh, m) == 0) + return; +#endif /* IPX */ +#if defined(NETATALK) + if (ether_type > ETHERMTU) + goto discard; + l = mtod(m, struct llc *); + if (l->llc_dsap == LLC_SNAP_LSAP && + l->llc_ssap == LLC_SNAP_LSAP && + l->llc_control == LLC_UI) { + if (bcmp(&(l->llc_snap_org_code)[0], at_org_code, + sizeof(at_org_code)) == 0 && + ntohs(l->llc_snap_ether_type) == ETHERTYPE_AT) { + m_adj(m, LLC_SNAPFRAMELEN); + isr = NETISR_ATALK2; + break; + } + if (bcmp(&(l->llc_snap_org_code)[0], aarp_org_code, + sizeof(aarp_org_code)) == 0 && + ntohs(l->llc_snap_ether_type) == ETHERTYPE_AARP) { + m_adj(m, LLC_SNAPFRAMELEN); + isr = NETISR_AARP; + break; + } + } +#endif /* NETATALK */ + goto discard; + } + netisr_dispatch(isr, m); + return; + +discard: + /* + * Packet is to be discarded. If netgraph is present, + * hand the packet to it for last chance processing; + * otherwise dispose of it. + */ + if (IFP2AC(ifp)->ac_netgraph != NULL) { + KASSERT(ng_ether_input_orphan_p != NULL, + ("ng_ether_input_orphan_p is NULL")); + /* + * Put back the ethernet header so netgraph has a + * consistent view of inbound packets. + */ + M_PREPEND(m, ETHER_HDR_LEN, M_DONTWAIT); + (*ng_ether_input_orphan_p)(ifp, m); + return; + } + m_freem(m); +} + +/* + * Convert Ethernet address to printable (loggable) representation. + * This routine is for compatibility; it's better to just use + * + * printf("%6D", , ":"); + * + * since there's no static buffer involved. 
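+ *
+ * For example, with lladdr standing in for any pointer to a six
+ * byte Ethernet address:
+ *
+ *	printf("%6D", lladdr, ":");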
+ */ +char * +ether_sprintf(const u_char *ap) +{ + static char etherbuf[18]; + snprintf(etherbuf, sizeof (etherbuf), "%6D", ap, ":"); + return (etherbuf); +} + +/* + * Perform common duties while attaching to interface list + */ +void +ether_ifattach(struct ifnet *ifp, const u_int8_t *lla) +{ + int i; + struct ifaddr *ifa; + struct sockaddr_dl *sdl; + + ifp->if_addrlen = ETHER_ADDR_LEN; + ifp->if_hdrlen = ETHER_HDR_LEN; + if_attach(ifp); + ifp->if_mtu = ETHERMTU; + ifp->if_output = ether_output; + ifp->if_input = ether_input; + ifp->if_resolvemulti = ether_resolvemulti; +#ifdef VIMAGE + ifp->if_reassign = ether_reassign; +#endif + if (ifp->if_baudrate == 0) + ifp->if_baudrate = IF_Mbps(10); /* just a default */ + ifp->if_broadcastaddr = etherbroadcastaddr; + + ifa = ifp->if_addr; + KASSERT(ifa != NULL, ("%s: no lladdr!\n", __func__)); + sdl = (struct sockaddr_dl *)ifa->ifa_addr; + sdl->sdl_type = IFT_ETHER; + sdl->sdl_alen = ifp->if_addrlen; + bcopy(lla, LLADDR(sdl), ifp->if_addrlen); + + bpfattach(ifp, DLT_EN10MB, ETHER_HDR_LEN); + if (ng_ether_attach_p != NULL) + (*ng_ether_attach_p)(ifp); + + /* Announce Ethernet MAC address if non-zero. */ + for (i = 0; i < ifp->if_addrlen; i++) + if (lla[i] != 0) + break; + if (i != ifp->if_addrlen) + if_printf(ifp, "Ethernet address: %6D\n", lla, ":"); +} + +/* + * Perform common duties while detaching an Ethernet interface + */ +void +ether_ifdetach(struct ifnet *ifp) +{ + if (IFP2AC(ifp)->ac_netgraph != NULL) { + KASSERT(ng_ether_detach_p != NULL, + ("ng_ether_detach_p is NULL")); + (*ng_ether_detach_p)(ifp); + } + + bpfdetach(ifp); + if_detach(ifp); +} + +#ifdef VIMAGE +void +ether_reassign(struct ifnet *ifp, struct vnet *new_vnet, char *unused __unused) +{ + + if (IFP2AC(ifp)->ac_netgraph != NULL) { + KASSERT(ng_ether_detach_p != NULL, + ("ng_ether_detach_p is NULL")); + (*ng_ether_detach_p)(ifp); + } + + if (ng_ether_attach_p != NULL) { + CURVNET_SET_QUIET(new_vnet); + (*ng_ether_attach_p)(ifp); + CURVNET_RESTORE(); + } +} +#endif + +SYSCTL_DECL(_net_link); +SYSCTL_NODE(_net_link, IFT_ETHER, ether, CTLFLAG_RW, 0, "Ethernet"); +#if defined(INET) || defined(INET6) +SYSCTL_VNET_INT(_net_link_ether, OID_AUTO, ipfw, CTLFLAG_RW, + &VNET_NAME(ether_ipfw), 0, "Pass ether pkts through firewall"); +#endif + +#if 0 +/* + * This is for reference. We have a table-driven version + * of the little-endian crc32 generator, which is faster + * than the double-loop. 
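+ * The table-driven form retires four bits at a time: after xoring
+ * the next input byte into the low bits of crc, each step
+ *
+ *	crc = (crc >> 4) ^ crctab[crc & 0xf];
+ *
+ * is equivalent to four rounds of the bit-at-a-time loop, with
+ * crctab[n] precomputed as the CRC of the nibble n.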
*/ +uint32_t +ether_crc32_le(const uint8_t *buf, size_t len) +{ + size_t i; + uint32_t crc, carry; + int bit; + uint8_t data; + + crc = 0xffffffff; /* initial value */ + + for (i = 0; i < len; i++) { + for (data = *buf++, bit = 0; bit < 8; bit++, data >>= 1) { + carry = (crc ^ data) & 1; + crc >>= 1; + if (carry) + crc = (crc ^ ETHER_CRC_POLY_LE); + } + } + + return (crc); +} +#else +uint32_t +ether_crc32_le(const uint8_t *buf, size_t len) +{ + static const uint32_t crctab[] = { + 0x00000000, 0x1db71064, 0x3b6e20c8, 0x26d930ac, + 0x76dc4190, 0x6b6b51f4, 0x4db26158, 0x5005713c, + 0xedb88320, 0xf00f9344, 0xd6d6a3e8, 0xcb61b38c, + 0x9b64c2b0, 0x86d3d2d4, 0xa00ae278, 0xbdbdf21c + }; + size_t i; + uint32_t crc; + + crc = 0xffffffff; /* initial value */ + + for (i = 0; i < len; i++) { + crc ^= buf[i]; + crc = (crc >> 4) ^ crctab[crc & 0xf]; + crc = (crc >> 4) ^ crctab[crc & 0xf]; + } + + return (crc); +} +#endif + +uint32_t +ether_crc32_be(const uint8_t *buf, size_t len) +{ + size_t i; + uint32_t crc, carry; + int bit; + uint8_t data; + + crc = 0xffffffff; /* initial value */ + + for (i = 0; i < len; i++) { + for (data = *buf++, bit = 0; bit < 8; bit++, data >>= 1) { + carry = ((crc & 0x80000000) ? 1 : 0) ^ (data & 0x01); + crc <<= 1; + if (carry) + crc = (crc ^ ETHER_CRC_POLY_BE) | carry; + } + } + + return (crc); +} + +int +ether_ioctl(struct ifnet *ifp, u_long command, caddr_t data) +{ + struct ifaddr *ifa = (struct ifaddr *) data; + struct ifreq *ifr = (struct ifreq *) data; + int error = 0; + + switch (command) { + case SIOCSIFADDR: + ifp->if_flags |= IFF_UP; + + switch (ifa->ifa_addr->sa_family) { +#ifdef INET + case AF_INET: + ifp->if_init(ifp->if_softc); /* before arpwhohas */ + arp_ifinit(ifp, ifa); + break; +#endif +#ifdef IPX + /* + * XXX - This code is probably wrong + */ + case AF_IPX: + { + struct ipx_addr *ina = &(IA_SIPX(ifa)->sipx_addr); + + if (ipx_nullhost(*ina)) + ina->x_host = + *(union ipx_host *) + IF_LLADDR(ifp); + else { + bcopy((caddr_t) ina->x_host.c_host, + (caddr_t) IF_LLADDR(ifp), + ETHER_ADDR_LEN); + } + + /* + * Set new address + */ + ifp->if_init(ifp->if_softc); + break; + } +#endif + default: + ifp->if_init(ifp->if_softc); + break; + } + break; + + case SIOCGIFADDR: + { + struct sockaddr *sa; + + sa = (struct sockaddr *) & ifr->ifr_data; + bcopy(IF_LLADDR(ifp), + (caddr_t) sa->sa_data, ETHER_ADDR_LEN); + } + break; + + case SIOCSIFMTU: + /* + * Set the interface MTU. + */ + if (ifr->ifr_mtu > ETHERMTU) { + error = EINVAL; + } else { + ifp->if_mtu = ifr->ifr_mtu; + } + break; + default: + error = EINVAL; /* XXX netbsd has ENOTTY??? */ + break; + } + return (error); +} + +static int +ether_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa, + struct sockaddr *sa) +{ + struct sockaddr_dl *sdl; +#ifdef INET + struct sockaddr_in *sin; +#endif +#ifdef INET6 + struct sockaddr_in6 *sin6; +#endif + u_char *e_addr; + + switch(sa->sa_family) { + case AF_LINK: + /* + * No mapping needed. Just check that it's a valid MC address.
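+ * A valid Ethernet multicast address has the least significant bit
+ * of its first octet set. The protocol cases below map IPv4 groups
+ * into 01:00:5e:xx:xx:xx (low 23 bits of the group address) and
+ * IPv6 groups into 33:33:xx:xx:xx:xx (low 32 bits) via the
+ * ETHER_MAP_IP_MULTICAST() and ETHER_MAP_IPV6_MULTICAST() macros.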
+ */ + sdl = (struct sockaddr_dl *)sa; + e_addr = LLADDR(sdl); + if (!ETHER_IS_MULTICAST(e_addr)) + return EADDRNOTAVAIL; + *llsa = 0; + return 0; + +#ifdef INET + case AF_INET: + sin = (struct sockaddr_in *)sa; + if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) + return EADDRNOTAVAIL; + sdl = malloc(sizeof *sdl, M_IFMADDR, + M_NOWAIT|M_ZERO); + if (sdl == NULL) + return ENOMEM; + sdl->sdl_len = sizeof *sdl; + sdl->sdl_family = AF_LINK; + sdl->sdl_index = ifp->if_index; + sdl->sdl_type = IFT_ETHER; + sdl->sdl_alen = ETHER_ADDR_LEN; + e_addr = LLADDR(sdl); + ETHER_MAP_IP_MULTICAST(&sin->sin_addr, e_addr); + *llsa = (struct sockaddr *)sdl; + return 0; +#endif +#ifdef INET6 + case AF_INET6: + sin6 = (struct sockaddr_in6 *)sa; + if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { + /* + * An IP6 address of 0 means listen to all + * of the Ethernet multicast address used for IP6. + * (This is used for multicast routers.) + */ + ifp->if_flags |= IFF_ALLMULTI; + *llsa = 0; + return 0; + } + if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) + return EADDRNOTAVAIL; + sdl = malloc(sizeof *sdl, M_IFMADDR, + M_NOWAIT|M_ZERO); + if (sdl == NULL) + return (ENOMEM); + sdl->sdl_len = sizeof *sdl; + sdl->sdl_family = AF_LINK; + sdl->sdl_index = ifp->if_index; + sdl->sdl_type = IFT_ETHER; + sdl->sdl_alen = ETHER_ADDR_LEN; + e_addr = LLADDR(sdl); + ETHER_MAP_IPV6_MULTICAST(&sin6->sin6_addr, e_addr); + *llsa = (struct sockaddr *)sdl; + return 0; +#endif + + default: + /* + * Well, the text isn't quite right, but it's the name + * that counts... + */ + return EAFNOSUPPORT; + } +} + +static void* +ether_alloc(u_char type, struct ifnet *ifp) +{ + struct arpcom *ac; + + ac = malloc(sizeof(struct arpcom), M_ARPCOM, M_WAITOK | M_ZERO); + ac->ac_ifp = ifp; + + return (ac); +} + +static void +ether_free(void *com, u_char type) +{ + + free(com, M_ARPCOM); +} + +static int +ether_modevent(module_t mod, int type, void *data) +{ + + switch (type) { + case MOD_LOAD: + if_register_com_alloc(IFT_ETHER, ether_alloc, ether_free); + break; + case MOD_UNLOAD: + if_deregister_com_alloc(IFT_ETHER); + break; + default: + return EOPNOTSUPP; + } + + return (0); +} + +static moduledata_t ether_mod = { + "ether", + ether_modevent, + 0 +}; + +void +ether_vlan_mtap(struct bpf_if *bp, struct mbuf *m, void *data, u_int dlen) +{ + struct ether_vlan_header vlan; + struct mbuf mv, mb; + + KASSERT((m->m_flags & M_VLANTAG) != 0, + ("%s: vlan information not present", __func__)); + KASSERT(m->m_len >= sizeof(struct ether_header), + ("%s: mbuf not large enough for header", __func__)); + bcopy(mtod(m, char *), &vlan, sizeof(struct ether_header)); + vlan.evl_proto = vlan.evl_encap_proto; + vlan.evl_encap_proto = htons(ETHERTYPE_VLAN); + vlan.evl_tag = htons(m->m_pkthdr.ether_vtag); + m->m_len -= sizeof(struct ether_header); + m->m_data += sizeof(struct ether_header); + /* + * If a data link has been supplied by the caller, then we will need to + * re-create a stack allocated mbuf chain with the following structure: + * + * (1) mbuf #1 will contain the supplied data link + * (2) mbuf #2 will contain the vlan header + * (3) mbuf #3 will contain the original mbuf's packet data + * + * Otherwise, submit the packet and vlan header via bpf_mtap2(). 
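+ *
+ * Callers normally reach this function through the ETHER_BPF_MTAP()
+ * macro, which roughly expands to:
+ *
+ *	if (bpf_peers_present(ifp->if_bpf)) {
+ *		if ((m->m_flags & M_VLANTAG) != 0)
+ *			ether_vlan_mtap(ifp->if_bpf, m, NULL, 0);
+ *		else
+ *			bpf_mtap(ifp->if_bpf, m);
+ *	}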
+ */ + if (data != NULL) { + mv.m_next = m; + mv.m_data = (caddr_t)&vlan; + mv.m_len = sizeof(vlan); + mb.m_next = &mv; + mb.m_data = data; + mb.m_len = dlen; + bpf_mtap(bp, &mb); + } else + bpf_mtap2(bp, &vlan, sizeof(vlan), m); + m->m_len += sizeof(struct ether_header); + m->m_data -= sizeof(struct ether_header); +} + +struct mbuf * +ether_vlanencap(struct mbuf *m, uint16_t tag) +{ + struct ether_vlan_header *evl; + + M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_DONTWAIT); + if (m == NULL) + return (NULL); + /* M_PREPEND takes care of m_len, m_pkthdr.len for us */ + + if (m->m_len < sizeof(*evl)) { + m = m_pullup(m, sizeof(*evl)); + if (m == NULL) + return (NULL); + } + + /* + * Transform the Ethernet header into an Ethernet header + * with 802.1Q encapsulation. + */ + evl = mtod(m, struct ether_vlan_header *); + bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN, + (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN); + evl->evl_encap_proto = htons(ETHERTYPE_VLAN); + evl->evl_tag = htons(tag); + return (m); +} + +DECLARE_MODULE(ether, ether_mod, SI_SUB_INIT_IF, SI_ORDER_ANY); +MODULE_VERSION(ether, 1); diff --git a/freebsd/sys/net/if_faith.c b/freebsd/sys/net/if_faith.c new file mode 100644 index 00000000..c8989922 --- /dev/null +++ b/freebsd/sys/net/if_faith.c @@ -0,0 +1,353 @@ +#include + +/* $KAME: if_faith.c,v 1.23 2001/12/17 13:55:29 sumikawa Exp $ */ + +/*- + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +/* + * derived from + * @(#)if_loop.c 8.1 (Berkeley) 6/10/93 + * Id: if_loop.c,v 1.22 1996/06/19 16:24:10 wollman Exp + */ + +/* + * Loopback interface driver for protocol testing and timing. 
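+ *
+ * In this file the loopback model serves faith(4): packets routed
+ * to a faith interface are fed back into the protocol stack so a
+ * userland translator can pick them up, see faithprefix() and
+ * ip6_keepfaith below.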
*/ +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#ifdef INET +#include +#include +#include +#include +#endif + +#ifdef INET6 +#ifndef INET +#include +#endif +#include +#include +#include +#endif + +#define FAITHNAME "faith" + +struct faith_softc { + struct ifnet *sc_ifp; +}; + +static int faithioctl(struct ifnet *, u_long, caddr_t); +int faithoutput(struct ifnet *, struct mbuf *, struct sockaddr *, + struct route *); +static void faithrtrequest(int, struct rtentry *, struct rt_addrinfo *); +#ifdef INET6 +static int faithprefix(struct in6_addr *); +#endif + +static int faithmodevent(module_t, int, void *); + +static MALLOC_DEFINE(M_FAITH, FAITHNAME, "Firewall Assisted Tunnel Interface"); + +static int faith_clone_create(struct if_clone *, int, caddr_t); +static void faith_clone_destroy(struct ifnet *); + +IFC_SIMPLE_DECLARE(faith, 0); + +#define FAITHMTU 1500 + +static int +faithmodevent(mod, type, data) + module_t mod; + int type; + void *data; +{ + + switch (type) { + case MOD_LOAD: + if_clone_attach(&faith_cloner); + +#ifdef INET6 + faithprefix_p = faithprefix; +#endif + + break; + case MOD_UNLOAD: +#ifdef INET6 + faithprefix_p = NULL; +#endif + + if_clone_detach(&faith_cloner); + break; + default: + return EOPNOTSUPP; + } + return 0; +} + +static moduledata_t faith_mod = { + "if_faith", + faithmodevent, + 0 +}; + +DECLARE_MODULE(if_faith, faith_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); +MODULE_VERSION(if_faith, 1); + +static int +faith_clone_create(ifc, unit, params) + struct if_clone *ifc; + int unit; + caddr_t params; +{ + struct ifnet *ifp; + struct faith_softc *sc; + + sc = malloc(sizeof(struct faith_softc), M_FAITH, M_WAITOK | M_ZERO); + ifp = sc->sc_ifp = if_alloc(IFT_FAITH); + if (ifp == NULL) { + free(sc, M_FAITH); + return (ENOSPC); + } + + ifp->if_softc = sc; + if_initname(sc->sc_ifp, ifc->ifc_name, unit); + + ifp->if_mtu = FAITHMTU; + /* Change to BROADCAST experimentally to announce its prefix. */ + ifp->if_flags = /* IFF_LOOPBACK */ IFF_BROADCAST | IFF_MULTICAST; + ifp->if_ioctl = faithioctl; + ifp->if_output = faithoutput; + ifp->if_hdrlen = 0; + ifp->if_addrlen = 0; + ifp->if_snd.ifq_maxlen = ifqmaxlen; + if_attach(ifp); + bpfattach(ifp, DLT_NULL, sizeof(u_int32_t)); + return (0); +} + +static void +faith_clone_destroy(ifp) + struct ifnet *ifp; +{ + struct faith_softc *sc = ifp->if_softc; + + bpfdetach(ifp); + if_detach(ifp); + if_free(ifp); + free(sc, M_FAITH); +} + +int +faithoutput(ifp, m, dst, ro) + struct ifnet *ifp; + struct mbuf *m; + struct sockaddr *dst; + struct route *ro; +{ + int isr; + u_int32_t af; + struct rtentry *rt = NULL; + + M_ASSERTPKTHDR(m); + + if (ro != NULL) + rt = ro->ro_rt; + /* BPF writes need to be handled specially. */ + if (dst->sa_family == AF_UNSPEC) { + bcopy(dst->sa_data, &af, sizeof(af)); + dst->sa_family = af; + } + + if (bpf_peers_present(ifp->if_bpf)) { + af = dst->sa_family; + bpf_mtap2(ifp->if_bpf, &af, sizeof(af), m); + } + + if (rt && rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) { + m_freem(m); + return (rt->rt_flags & RTF_BLACKHOLE ? 0 : + rt->rt_flags & RTF_HOST ?
EHOSTUNREACH : ENETUNREACH); + } + ifp->if_opackets++; + ifp->if_obytes += m->m_pkthdr.len; + switch (dst->sa_family) { +#ifdef INET + case AF_INET: + isr = NETISR_IP; + break; +#endif +#ifdef INET6 + case AF_INET6: + isr = NETISR_IPV6; + break; +#endif + default: + m_freem(m); + return EAFNOSUPPORT; + } + + /* XXX do we need more sanity checks? */ + + m->m_pkthdr.rcvif = ifp; + ifp->if_ipackets++; + ifp->if_ibytes += m->m_pkthdr.len; + netisr_dispatch(isr, m); + return (0); +} + +/* ARGSUSED */ +static void +faithrtrequest(cmd, rt, info) + int cmd; + struct rtentry *rt; + struct rt_addrinfo *info; +{ + RT_LOCK_ASSERT(rt); + rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu; +} + +/* + * Process an ioctl request. + */ +/* ARGSUSED */ +static int +faithioctl(ifp, cmd, data) + struct ifnet *ifp; + u_long cmd; + caddr_t data; +{ + struct ifaddr *ifa; + struct ifreq *ifr = (struct ifreq *)data; + int error = 0; + + switch (cmd) { + + case SIOCSIFADDR: + ifp->if_flags |= IFF_UP; + ifp->if_drv_flags |= IFF_DRV_RUNNING; + ifa = (struct ifaddr *)data; + ifa->ifa_rtrequest = faithrtrequest; + /* + * Everything else is done at a higher level. + */ + break; + + case SIOCADDMULTI: + case SIOCDELMULTI: + if (ifr == 0) { + error = EAFNOSUPPORT; /* XXX */ + break; + } + switch (ifr->ifr_addr.sa_family) { +#ifdef INET + case AF_INET: + break; +#endif +#ifdef INET6 + case AF_INET6: + break; +#endif + + default: + error = EAFNOSUPPORT; + break; + } + break; + +#ifdef SIOCSIFMTU + case SIOCSIFMTU: + ifp->if_mtu = ifr->ifr_mtu; + break; +#endif + + case SIOCSIFFLAGS: + break; + + default: + error = EINVAL; + } + return (error); +} + +#ifdef INET6 +/* + * XXX could be slow + * XXX could be layer violation to call sys/net from sys/netinet6 + */ +static int +faithprefix(in6) + struct in6_addr *in6; +{ + struct rtentry *rt; + struct sockaddr_in6 sin6; + int ret; + + if (V_ip6_keepfaith == 0) + return 0; + + bzero(&sin6, sizeof(sin6)); + sin6.sin6_family = AF_INET6; + sin6.sin6_len = sizeof(struct sockaddr_in6); + sin6.sin6_addr = *in6; + rt = rtalloc1((struct sockaddr *)&sin6, 0, 0UL); + if (rt && rt->rt_ifp && rt->rt_ifp->if_type == IFT_FAITH && + (rt->rt_ifp->if_flags & IFF_UP) != 0) + ret = 1; + else + ret = 0; + if (rt) + RTFREE_LOCKED(rt); + return ret; +} +#endif diff --git a/freebsd/sys/net/if_fddisubr.c b/freebsd/sys/net/if_fddisubr.c new file mode 100644 index 00000000..fc9f27e1 --- /dev/null +++ b/freebsd/sys/net/if_fddisubr.c @@ -0,0 +1,800 @@ +#include + +/*- + * Copyright (c) 1995, 1996 + * Matt Thomas . All rights reserved. + * Copyright (c) 1982, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: if_ethersubr.c,v 1.5 1994/12/13 22:31:45 wollman Exp + * $FreeBSD$ + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#if defined(INET) || defined(INET6) +#include +#include +#include +#endif +#ifdef INET6 +#include +#endif + +#ifdef IPX +#include +#include +#endif + +#ifdef DECNET +#include +#endif + +#ifdef NETATALK +#include +#include +#include + +extern u_char at_org_code[ 3 ]; +extern u_char aarp_org_code[ 3 ]; +#endif /* NETATALK */ + +#include + +static const u_char fddibroadcastaddr[FDDI_ADDR_LEN] = + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; + +static int fddi_resolvemulti(struct ifnet *, struct sockaddr **, + struct sockaddr *); +static int fddi_output(struct ifnet *, struct mbuf *, struct sockaddr *, + struct route *); +static void fddi_input(struct ifnet *ifp, struct mbuf *m); + +#define senderr(e) do { error = (e); goto bad; } while (0) + +/* + * FDDI output routine. + * Encapsulate a packet of type family for the local net. + * Use trailer local net encapsulation if enough data in first + * packet leaves a multiple of 512 bytes of data in remainder. + * Assumes that ifp is actually pointer to arpcom structure. + */ +static int +fddi_output(ifp, m, dst, ro) + struct ifnet *ifp; + struct mbuf *m; + struct sockaddr *dst; + struct route *ro; +{ + u_int16_t type; + int loop_copy = 0, error = 0, hdrcmplt = 0; + u_char esrc[FDDI_ADDR_LEN], edst[FDDI_ADDR_LEN]; + struct fddi_header *fh; +#if defined(INET) || defined(INET6) + struct llentry *lle; +#endif + +#ifdef MAC + error = mac_ifnet_check_transmit(ifp, m); + if (error) + senderr(error); +#endif + + if (ifp->if_flags & IFF_MONITOR) + senderr(ENETDOWN); + if (!((ifp->if_flags & IFF_UP) && + (ifp->if_drv_flags & IFF_DRV_RUNNING))) + senderr(ENETDOWN); + getmicrotime(&ifp->if_lastchange); + + switch (dst->sa_family) { +#ifdef INET + case AF_INET: { + struct rtentry *rt0 = NULL; + + if (ro != NULL) + rt0 = ro->ro_rt; + error = arpresolve(ifp, rt0, m, dst, edst, &lle); + if (error) + return (error == EWOULDBLOCK ? 
0 : error); + type = htons(ETHERTYPE_IP); + break; + } + case AF_ARP: + { + struct arphdr *ah; + ah = mtod(m, struct arphdr *); + ah->ar_hrd = htons(ARPHRD_ETHER); + + loop_copy = -1; /* if this is for us, don't do it */ + + switch (ntohs(ah->ar_op)) { + case ARPOP_REVREQUEST: + case ARPOP_REVREPLY: + type = htons(ETHERTYPE_REVARP); + break; + case ARPOP_REQUEST: + case ARPOP_REPLY: + default: + type = htons(ETHERTYPE_ARP); + break; + } + + if (m->m_flags & M_BCAST) + bcopy(ifp->if_broadcastaddr, edst, FDDI_ADDR_LEN); + else + bcopy(ar_tha(ah), edst, FDDI_ADDR_LEN); + + } + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + error = nd6_storelladdr(ifp, m, dst, (u_char *)edst, &lle); + if (error) + return (error); /* Something bad happened */ + type = htons(ETHERTYPE_IPV6); + break; +#endif /* INET6 */ +#ifdef IPX + case AF_IPX: + type = htons(ETHERTYPE_IPX); + bcopy((caddr_t)&(((struct sockaddr_ipx *)dst)->sipx_addr.x_host), + (caddr_t)edst, FDDI_ADDR_LEN); + break; +#endif /* IPX */ +#ifdef NETATALK + case AF_APPLETALK: { + struct at_ifaddr *aa; + if (!aarpresolve(ifp, m, (struct sockaddr_at *)dst, edst)) + return (0); + /* + * ifaddr is the first thing in at_ifaddr + */ + if ((aa = at_ifawithnet( (struct sockaddr_at *)dst)) == 0) + goto bad; + + /* + * In the phase 2 case, we need to prepend an mbuf for the llc header. + * Since we must preserve the value of m, which is passed to us by + * value, we m_copy() the first mbuf, and use it for our llc header. + */ + if (aa->aa_flags & AFA_PHASE2) { + struct llc llc; + + M_PREPEND(m, LLC_SNAPFRAMELEN, M_WAIT); + llc.llc_dsap = llc.llc_ssap = LLC_SNAP_LSAP; + llc.llc_control = LLC_UI; + bcopy(at_org_code, llc.llc_snap.org_code, sizeof(at_org_code)); + llc.llc_snap.ether_type = htons(ETHERTYPE_AT); + bcopy(&llc, mtod(m, caddr_t), LLC_SNAPFRAMELEN); + type = 0; + } else { + type = htons(ETHERTYPE_AT); + } + ifa_free(&aa->aa_ifa); + break; + } +#endif /* NETATALK */ + + case pseudo_AF_HDRCMPLT: + { + struct ether_header *eh; + hdrcmplt = 1; + eh = (struct ether_header *)dst->sa_data; + bcopy((caddr_t)eh->ether_shost, (caddr_t)esrc, FDDI_ADDR_LEN); + /* FALLTHROUGH */ + } + + case AF_UNSPEC: + { + struct ether_header *eh; + loop_copy = -1; + eh = (struct ether_header *)dst->sa_data; + bcopy((caddr_t)eh->ether_dhost, (caddr_t)edst, FDDI_ADDR_LEN); + if (*edst & 1) + m->m_flags |= (M_BCAST|M_MCAST); + type = eh->ether_type; + break; + } + + case AF_IMPLINK: + { + fh = mtod(m, struct fddi_header *); + error = EPROTONOSUPPORT; + switch (fh->fddi_fc & (FDDIFC_C|FDDIFC_L|FDDIFC_F)) { + case FDDIFC_LLC_ASYNC: { + /* legal priorities are 0 through 7 */ + if ((fh->fddi_fc & FDDIFC_Z) > 7) + goto bad; + break; + } + case FDDIFC_LLC_SYNC: { + /* FDDIFC_Z bits reserved, must be zero */ + if (fh->fddi_fc & FDDIFC_Z) + goto bad; + break; + } + case FDDIFC_SMT: { + /* FDDIFC_Z bits must be non zero */ + if ((fh->fddi_fc & FDDIFC_Z) == 0) + goto bad; + break; + } + default: { + /* anything else is too dangerous */ + goto bad; + } + } + error = 0; + if (fh->fddi_dhost[0] & 1) + m->m_flags |= (M_BCAST|M_MCAST); + goto queue_it; + } + default: + if_printf(ifp, "can't handle af%d\n", dst->sa_family); + senderr(EAFNOSUPPORT); + } + + /* + * Add LLC header. 
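+ * Every protocol that resolved to a non-zero type gets the 8 byte
+ * 802.2 LLC/SNAP encapsulation (dsap/ssap 0xaa, UI control, zero
+ * organization code, 16 bit ethertype); the FDDI MAC header itself
+ * is prepended in a second step further down.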
+ */ + if (type != 0) { + struct llc *l; + M_PREPEND(m, LLC_SNAPFRAMELEN, M_DONTWAIT); + if (m == 0) + senderr(ENOBUFS); + l = mtod(m, struct llc *); + l->llc_control = LLC_UI; + l->llc_dsap = l->llc_ssap = LLC_SNAP_LSAP; + l->llc_snap.org_code[0] = + l->llc_snap.org_code[1] = + l->llc_snap.org_code[2] = 0; + l->llc_snap.ether_type = htons(type); + } + + /* + * Add local net header. If no space in first mbuf, + * allocate another. + */ + M_PREPEND(m, FDDI_HDR_LEN, M_DONTWAIT); + if (m == 0) + senderr(ENOBUFS); + fh = mtod(m, struct fddi_header *); + fh->fddi_fc = FDDIFC_LLC_ASYNC|FDDIFC_LLC_PRIO4; + bcopy((caddr_t)edst, (caddr_t)fh->fddi_dhost, FDDI_ADDR_LEN); + queue_it: + if (hdrcmplt) + bcopy((caddr_t)esrc, (caddr_t)fh->fddi_shost, FDDI_ADDR_LEN); + else + bcopy(IF_LLADDR(ifp), (caddr_t)fh->fddi_shost, + FDDI_ADDR_LEN); + + /* + * If a simplex interface, and the packet is being sent to our + * Ethernet address or a broadcast address, loopback a copy. + * XXX To make a simplex device behave exactly like a duplex + * device, we should copy in the case of sending to our own + * ethernet address (thus letting the original actually appear + * on the wire). However, we don't do that here for security + * reasons and compatibility with the original behavior. + */ + if ((ifp->if_flags & IFF_SIMPLEX) && (loop_copy != -1)) { + if ((m->m_flags & M_BCAST) || (loop_copy > 0)) { + struct mbuf *n; + n = m_copy(m, 0, (int)M_COPYALL); + (void) if_simloop(ifp, n, dst->sa_family, + FDDI_HDR_LEN); + } else if (bcmp(fh->fddi_dhost, fh->fddi_shost, + FDDI_ADDR_LEN) == 0) { + (void) if_simloop(ifp, m, dst->sa_family, + FDDI_HDR_LEN); + return (0); /* XXX */ + } + } + + error = (ifp->if_transmit)(ifp, m); + if (error) + ifp->if_oerrors++; + + return (error); + +bad: + ifp->if_oerrors++; + if (m) + m_freem(m); + return (error); +} + +/* + * Process a received FDDI packet. + */ +static void +fddi_input(ifp, m) + struct ifnet *ifp; + struct mbuf *m; +{ + int isr; + struct llc *l; + struct fddi_header *fh; + + /* + * Do consistency checks to verify assumptions + * made by code past this point. + */ + if ((m->m_flags & M_PKTHDR) == 0) { + if_printf(ifp, "discard frame w/o packet header\n"); + ifp->if_ierrors++; + m_freem(m); + return; + } + if (m->m_pkthdr.rcvif == NULL) { + if_printf(ifp, "discard frame w/o interface pointer\n"); + ifp->if_ierrors++; + m_freem(m); + return; + } + + m = m_pullup(m, FDDI_HDR_LEN); + if (m == NULL) { + ifp->if_ierrors++; + goto dropanyway; + } + fh = mtod(m, struct fddi_header *); + m->m_pkthdr.header = (void *)fh; + + /* + * Discard packet if interface is not up. + */ + if (!((ifp->if_flags & IFF_UP) && + (ifp->if_drv_flags & IFF_DRV_RUNNING))) + goto dropanyway; + + /* + * Give bpf a chance at the packet. + */ + BPF_MTAP(ifp, m); + + /* + * Interface marked for monitoring; discard packet. + */ + if (ifp->if_flags & IFF_MONITOR) { + m_freem(m); + return; + } + +#ifdef MAC + mac_ifnet_create_mbuf(ifp, m); +#endif + + /* + * Update interface statistics. + */ + ifp->if_ibytes += m->m_pkthdr.len; + getmicrotime(&ifp->if_lastchange); + + /* + * Discard non local unicast packets when interface + * is in promiscuous mode. + */ + if ((ifp->if_flags & IFF_PROMISC) && ((fh->fddi_dhost[0] & 1) == 0) && + (bcmp(IF_LLADDR(ifp), (caddr_t)fh->fddi_dhost, + FDDI_ADDR_LEN) != 0)) + goto dropanyway; + + /* + * Set mbuf flags for bcast/mcast. 
+ */ + if (fh->fddi_dhost[0] & 1) { + if (bcmp(ifp->if_broadcastaddr, fh->fddi_dhost, + FDDI_ADDR_LEN) == 0) + m->m_flags |= M_BCAST; + else + m->m_flags |= M_MCAST; + ifp->if_imcasts++; + } + +#ifdef M_LINK0 + /* + * If this has a LLC priority of 0, then mark it so upper + * layers have a hint that it really came via a FDDI/Ethernet + * bridge. + */ + if ((fh->fddi_fc & FDDIFC_LLC_PRIO7) == FDDIFC_LLC_PRIO0) + m->m_flags |= M_LINK0; +#endif + + /* Strip off FDDI header. */ + m_adj(m, FDDI_HDR_LEN); + + m = m_pullup(m, LLC_SNAPFRAMELEN); + if (m == 0) { + ifp->if_ierrors++; + goto dropanyway; + } + l = mtod(m, struct llc *); + + switch (l->llc_dsap) { + case LLC_SNAP_LSAP: + { + u_int16_t type; + if ((l->llc_control != LLC_UI) || + (l->llc_ssap != LLC_SNAP_LSAP)) { + ifp->if_noproto++; + goto dropanyway; + } +#ifdef NETATALK + if (bcmp(&(l->llc_snap.org_code)[0], at_org_code, + sizeof(at_org_code)) == 0 && + ntohs(l->llc_snap.ether_type) == ETHERTYPE_AT) { + isr = NETISR_ATALK2; + m_adj(m, LLC_SNAPFRAMELEN); + break; + } + + if (bcmp(&(l->llc_snap.org_code)[0], aarp_org_code, + sizeof(aarp_org_code)) == 0 && + ntohs(l->llc_snap.ether_type) == ETHERTYPE_AARP) { + m_adj(m, LLC_SNAPFRAMELEN); + isr = NETISR_AARP; + break; + } +#endif /* NETATALK */ + if (l->llc_snap.org_code[0] != 0 || + l->llc_snap.org_code[1] != 0 || + l->llc_snap.org_code[2] != 0) { + ifp->if_noproto++; + goto dropanyway; + } + + type = ntohs(l->llc_snap.ether_type); + m_adj(m, LLC_SNAPFRAMELEN); + + switch (type) { +#ifdef INET + case ETHERTYPE_IP: + if ((m = ip_fastforward(m)) == NULL) + return; + isr = NETISR_IP; + break; + + case ETHERTYPE_ARP: + if (ifp->if_flags & IFF_NOARP) + goto dropanyway; + isr = NETISR_ARP; + break; +#endif +#ifdef INET6 + case ETHERTYPE_IPV6: + isr = NETISR_IPV6; + break; +#endif +#ifdef IPX + case ETHERTYPE_IPX: + isr = NETISR_IPX; + break; +#endif +#ifdef DECNET + case ETHERTYPE_DECNET: + isr = NETISR_DECNET; + break; +#endif +#ifdef NETATALK + case ETHERTYPE_AT: + isr = NETISR_ATALK1; + break; + case ETHERTYPE_AARP: + isr = NETISR_AARP; + break; +#endif /* NETATALK */ + default: + /* printf("fddi_input: unknown protocol 0x%x\n", type); */ + ifp->if_noproto++; + goto dropanyway; + } + break; + } + + default: + /* printf("fddi_input: unknown dsap 0x%x\n", l->llc_dsap); */ + ifp->if_noproto++; + goto dropanyway; + } + netisr_dispatch(isr, m); + return; + +dropanyway: + ifp->if_iqdrops++; + if (m) + m_freem(m); + return; +} + +/* + * Perform common duties while attaching to interface list + */ +void +fddi_ifattach(ifp, lla, bpf) + struct ifnet *ifp; + const u_int8_t *lla; + int bpf; +{ + struct ifaddr *ifa; + struct sockaddr_dl *sdl; + + ifp->if_type = IFT_FDDI; + ifp->if_addrlen = FDDI_ADDR_LEN; + ifp->if_hdrlen = 21; + + if_attach(ifp); /* Must be called before additional assignments */ + + ifp->if_mtu = FDDIMTU; + ifp->if_output = fddi_output; + ifp->if_input = fddi_input; + ifp->if_resolvemulti = fddi_resolvemulti; + ifp->if_broadcastaddr = fddibroadcastaddr; + ifp->if_baudrate = 100000000; +#ifdef IFF_NOTRAILERS + ifp->if_flags |= IFF_NOTRAILERS; +#endif + ifa = ifp->if_addr; + KASSERT(ifa != NULL, ("%s: no lladdr!\n", __func__)); + + sdl = (struct sockaddr_dl *)ifa->ifa_addr; + sdl->sdl_type = IFT_FDDI; + sdl->sdl_alen = ifp->if_addrlen; + bcopy(lla, LLADDR(sdl), ifp->if_addrlen); + + if (bpf) + bpfattach(ifp, DLT_FDDI, FDDI_HDR_LEN); + + return; +} + +void +fddi_ifdetach(ifp, bpf) + struct ifnet *ifp; + int bpf; +{ + + if (bpf) + bpfdetach(ifp); + + if_detach(ifp); + + return; +} + +int 
+fddi_ioctl (ifp, command, data) + struct ifnet *ifp; + u_long command; + caddr_t data; +{ + struct ifaddr *ifa; + struct ifreq *ifr; + int error; + + ifa = (struct ifaddr *) data; + ifr = (struct ifreq *) data; + error = 0; + + switch (command) { + case SIOCSIFADDR: + ifp->if_flags |= IFF_UP; + + switch (ifa->ifa_addr->sa_family) { +#ifdef INET + case AF_INET: /* before arpwhohas */ + ifp->if_init(ifp->if_softc); + arp_ifinit(ifp, ifa); + break; +#endif +#ifdef IPX + /* + * XXX - This code is probably wrong + */ + case AF_IPX: { + struct ipx_addr *ina; + + ina = &(IA_SIPX(ifa)->sipx_addr); + + if (ipx_nullhost(*ina)) { + ina->x_host = *(union ipx_host *) + IF_LLADDR(ifp); + } else { + bcopy((caddr_t) ina->x_host.c_host, + (caddr_t) IF_LLADDR(ifp), + ETHER_ADDR_LEN); + } + + /* + * Set new address + */ + ifp->if_init(ifp->if_softc); + } + break; +#endif + default: + ifp->if_init(ifp->if_softc); + break; + } + break; + case SIOCGIFADDR: { + struct sockaddr *sa; + + sa = (struct sockaddr *) & ifr->ifr_data; + bcopy(IF_LLADDR(ifp), + (caddr_t) sa->sa_data, FDDI_ADDR_LEN); + + } + break; + case SIOCSIFMTU: + /* + * Set the interface MTU. + */ + if (ifr->ifr_mtu > FDDIMTU) { + error = EINVAL; + } else { + ifp->if_mtu = ifr->ifr_mtu; + } + break; + default: + error = EINVAL; + break; + } + + return (error); +} + +static int +fddi_resolvemulti(ifp, llsa, sa) + struct ifnet *ifp; + struct sockaddr **llsa; + struct sockaddr *sa; +{ + struct sockaddr_dl *sdl; +#ifdef INET + struct sockaddr_in *sin; +#endif +#ifdef INET6 + struct sockaddr_in6 *sin6; +#endif + u_char *e_addr; + + switch(sa->sa_family) { + case AF_LINK: + /* + * No mapping needed. Just check that it's a valid MC address. + */ + sdl = (struct sockaddr_dl *)sa; + e_addr = LLADDR(sdl); + if ((e_addr[0] & 1) != 1) + return (EADDRNOTAVAIL); + *llsa = 0; + return (0); + +#ifdef INET + case AF_INET: + sin = (struct sockaddr_in *)sa; + if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) + return (EADDRNOTAVAIL); + sdl = malloc(sizeof *sdl, M_IFMADDR, + M_NOWAIT | M_ZERO); + if (sdl == NULL) + return (ENOMEM); + sdl->sdl_len = sizeof *sdl; + sdl->sdl_family = AF_LINK; + sdl->sdl_index = ifp->if_index; + sdl->sdl_type = IFT_FDDI; + sdl->sdl_nlen = 0; + sdl->sdl_alen = FDDI_ADDR_LEN; + sdl->sdl_slen = 0; + e_addr = LLADDR(sdl); + ETHER_MAP_IP_MULTICAST(&sin->sin_addr, e_addr); + *llsa = (struct sockaddr *)sdl; + return (0); +#endif +#ifdef INET6 + case AF_INET6: + sin6 = (struct sockaddr_in6 *)sa; + if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { + /* + * An IP6 address of 0 means listen to all + * of the Ethernet multicast address used for IP6. + * (This is used for multicast routers.) + */ + ifp->if_flags |= IFF_ALLMULTI; + *llsa = 0; + return (0); + } + if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) + return (EADDRNOTAVAIL); + sdl = malloc(sizeof *sdl, M_IFMADDR, + M_NOWAIT | M_ZERO); + if (sdl == NULL) + return (ENOMEM); + sdl->sdl_len = sizeof *sdl; + sdl->sdl_family = AF_LINK; + sdl->sdl_index = ifp->if_index; + sdl->sdl_type = IFT_FDDI; + sdl->sdl_nlen = 0; + sdl->sdl_alen = FDDI_ADDR_LEN; + sdl->sdl_slen = 0; + e_addr = LLADDR(sdl); + ETHER_MAP_IPV6_MULTICAST(&sin6->sin6_addr, e_addr); + *llsa = (struct sockaddr *)sdl; + return (0); +#endif + + default: + /* + * Well, the text isn't quite right, but it's the name + * that counts... 
+ */ + return (EAFNOSUPPORT); + } + + return (0); +} + +static moduledata_t fddi_mod = { + "fddi", /* module name */ + NULL, /* event handler */ + 0 /* extra data */ +}; + +DECLARE_MODULE(fddi, fddi_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); +MODULE_VERSION(fddi, 1); diff --git a/freebsd/sys/net/if_fwsubr.c b/freebsd/sys/net/if_fwsubr.c new file mode 100644 index 00000000..d084bea4 --- /dev/null +++ b/freebsd/sys/net/if_fwsubr.c @@ -0,0 +1,853 @@ +#include + +/*- + * Copyright (c) 2004 Doug Rabson + * Copyright (c) 1982, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(INET) || defined(INET6) +#include +#include +#include +#endif +#ifdef INET6 +#include +#endif + +#include + +MALLOC_DEFINE(M_FWCOM, "fw_com", "firewire interface internals"); + +struct fw_hwaddr firewire_broadcastaddr = { + 0xffffffff, + 0xffffffff, + 0xff, + 0xff, + 0xffff, + 0xffffffff +}; + +static int +firewire_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, + struct route *ro) +{ + struct fw_com *fc = IFP2FWC(ifp); + int error, type; + struct m_tag *mtag; + union fw_encap *enc; + struct fw_hwaddr *destfw; + uint8_t speed; + uint16_t psize, fsize, dsize; + struct mbuf *mtail; + int unicast, dgl, foff; + static int next_dgl; +#if defined(INET) || defined(INET6) + struct llentry *lle; +#endif + +#ifdef MAC + error = mac_ifnet_check_transmit(ifp, m); + if (error) + goto bad; +#endif + + if (!((ifp->if_flags & IFF_UP) && + (ifp->if_drv_flags & IFF_DRV_RUNNING))) { + error = ENETDOWN; + goto bad; + } + + /* + * For unicast, we make a tag to store the lladdr of the + * destination. 
This might not be the first time we have seen + * the packet (for instance, the arp code might be trying to + * re-send it after receiving an arp reply) so we only + * allocate a tag if there isn't one there already. For + * multicast, we will eventually use a different tag to store + * the channel number. + */ + unicast = !(m->m_flags & (M_BCAST | M_MCAST)); + if (unicast) { + mtag = m_tag_locate(m, MTAG_FIREWIRE, MTAG_FIREWIRE_HWADDR, NULL); + if (!mtag) { + mtag = m_tag_alloc(MTAG_FIREWIRE, MTAG_FIREWIRE_HWADDR, + sizeof (struct fw_hwaddr), M_NOWAIT); + if (!mtag) { + error = ENOMEM; + goto bad; + } + m_tag_prepend(m, mtag); + } + destfw = (struct fw_hwaddr *)(mtag + 1); + } else { + destfw = 0; + } + + switch (dst->sa_family) { +#ifdef INET + case AF_INET: + /* + * Only bother with arp for unicast. Allocation of + * channels etc. for firewire is quite different and + * doesn't fit into the arp model. + */ + if (unicast) { + error = arpresolve(ifp, ro ? ro->ro_rt : NULL, m, dst, (u_char *) destfw, &lle); + if (error) + return (error == EWOULDBLOCK ? 0 : error); + } + type = ETHERTYPE_IP; + break; + + case AF_ARP: + { + struct arphdr *ah; + ah = mtod(m, struct arphdr *); + ah->ar_hrd = htons(ARPHRD_IEEE1394); + type = ETHERTYPE_ARP; + if (unicast) + *destfw = *(struct fw_hwaddr *) ar_tha(ah); + + /* + * The standard arp code leaves a hole for the target + * hardware address which we need to close up. + */ + bcopy(ar_tpa(ah), ar_tha(ah), ah->ar_pln); + m_adj(m, -ah->ar_hln); + break; + } +#endif + +#ifdef INET6 + case AF_INET6: + if (unicast) { + error = nd6_storelladdr(fc->fc_ifp, m, dst, + (u_char *) destfw, &lle); + if (error) + return (error); + } + type = ETHERTYPE_IPV6; + break; +#endif + + default: + if_printf(ifp, "can't handle af%d\n", dst->sa_family); + error = EAFNOSUPPORT; + goto bad; + } + + /* + * Let BPF tap off a copy before we encapsulate. + */ + if (bpf_peers_present(ifp->if_bpf)) { + struct fw_bpfhdr h; + if (unicast) + bcopy(destfw, h.firewire_dhost, 8); + else + bcopy(&firewire_broadcastaddr, h.firewire_dhost, 8); + bcopy(&fc->fc_hwaddr, h.firewire_shost, 8); + h.firewire_type = htons(type); + bpf_mtap2(ifp->if_bpf, &h, sizeof(h), m); + } + + /* + * Punt on MCAP for now and send all multicast packets on the + * broadcast channel. + */ + if (m->m_flags & M_MCAST) + m->m_flags |= M_BCAST; + + /* + * Figure out what speed to use and what the largest supported + * packet size is. For unicast, this is the minimum of what we + * can speak and what they can hear. For broadcast, lets be + * conservative and use S100. We could possibly improve that + * by examining the bus manager's speed map or similar. We + * also reduce the packet size for broadcast to account for + * the GASP header. + */ + if (unicast) { + speed = min(fc->fc_speed, destfw->sspd); + psize = min(512 << speed, 2 << destfw->sender_max_rec); + } else { + speed = 0; + psize = 512 - 2*sizeof(uint32_t); + } + + /* + * Next, we encapsulate, possibly fragmenting the original + * datagram if it won't fit into a single packet. + */ + if (m->m_pkthdr.len <= psize - sizeof(uint32_t)) { + /* + * No fragmentation is necessary. + */ + M_PREPEND(m, sizeof(uint32_t), M_DONTWAIT); + if (!m) { + error = ENOBUFS; + goto bad; + } + enc = mtod(m, union fw_encap *); + enc->unfrag.ether_type = type; + enc->unfrag.lf = FW_ENCAP_UNFRAG; + enc->unfrag.reserved = 0; + + /* + * Byte swap the encapsulation header manually. 
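+ *
+ * The fields were filled in above through the host-order bitfields of
+ * union fw_encap; on the wire this is a single big-endian quadlet
+ * (RFC 2734), for the unfragmented case:
+ *
+ *	  31-30       29-16          15-0
+ *	+-------+--------------+--------------+
+ *	|  lf   |   reserved   |  ether_type  |
+ *	+-------+--------------+--------------+
+ *
+ * so one htonl() of ul[0] converts the whole header at once.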
+ */ + enc->ul[0] = htonl(enc->ul[0]); + + error = (ifp->if_transmit)(ifp, m); + return (error); + } else { + /* + * Fragment the datagram, making sure to leave enough + * space for the encapsulation header in each packet. + */ + fsize = psize - 2*sizeof(uint32_t); + dgl = next_dgl++; + dsize = m->m_pkthdr.len; + foff = 0; + while (m) { + if (m->m_pkthdr.len > fsize) { + /* + * Split off the tail segment from the + * datagram, copying our tags over. + */ + mtail = m_split(m, fsize, M_DONTWAIT); + m_tag_copy_chain(mtail, m, M_NOWAIT); + } else { + mtail = 0; + } + + /* + * Add our encapsulation header to this + * fragment and hand it off to the link. + */ + M_PREPEND(m, 2*sizeof(uint32_t), M_DONTWAIT); + if (!m) { + error = ENOBUFS; + goto bad; + } + enc = mtod(m, union fw_encap *); + if (foff == 0) { + enc->firstfrag.lf = FW_ENCAP_FIRST; + enc->firstfrag.reserved1 = 0; + enc->firstfrag.reserved2 = 0; + enc->firstfrag.datagram_size = dsize - 1; + enc->firstfrag.ether_type = type; + enc->firstfrag.dgl = dgl; + } else { + if (mtail) + enc->nextfrag.lf = FW_ENCAP_NEXT; + else + enc->nextfrag.lf = FW_ENCAP_LAST; + enc->nextfrag.reserved1 = 0; + enc->nextfrag.reserved2 = 0; + enc->nextfrag.reserved3 = 0; + enc->nextfrag.datagram_size = dsize - 1; + enc->nextfrag.fragment_offset = foff; + enc->nextfrag.dgl = dgl; + } + foff += m->m_pkthdr.len - 2*sizeof(uint32_t); + + /* + * Byte swap the encapsulation header manually. + */ + enc->ul[0] = htonl(enc->ul[0]); + enc->ul[1] = htonl(enc->ul[1]); + + error = (ifp->if_transmit)(ifp, m); + if (error) { + if (mtail) + m_freem(mtail); + return (ENOBUFS); + } + + m = mtail; + } + + return (0); + } + +bad: + if (m) + m_freem(m); + return (error); +} + +static struct mbuf * +firewire_input_fragment(struct fw_com *fc, struct mbuf *m, int src) +{ + union fw_encap *enc; + struct fw_reass *r; + struct mbuf *mf, *mprev; + int dsize; + int fstart, fend, start, end, islast; + uint32_t id; + + /* + * Find an existing reassembly buffer or create a new one. + */ + enc = mtod(m, union fw_encap *); + id = enc->firstfrag.dgl | (src << 16); + STAILQ_FOREACH(r, &fc->fc_frags, fr_link) + if (r->fr_id == id) + break; + if (!r) { + r = malloc(sizeof(struct fw_reass), M_TEMP, M_NOWAIT); + if (!r) { + m_freem(m); + return 0; + } + r->fr_id = id; + r->fr_frags = 0; + STAILQ_INSERT_HEAD(&fc->fc_frags, r, fr_link); + } + + /* + * If this fragment overlaps any other fragment, we must discard + * the partial reassembly and start again. + */ + if (enc->firstfrag.lf == FW_ENCAP_FIRST) + fstart = 0; + else + fstart = enc->nextfrag.fragment_offset; + fend = fstart + m->m_pkthdr.len - 2*sizeof(uint32_t); + dsize = enc->nextfrag.datagram_size; + islast = (enc->nextfrag.lf == FW_ENCAP_LAST); + + for (mf = r->fr_frags; mf; mf = mf->m_nextpkt) { + enc = mtod(mf, union fw_encap *); + if (enc->nextfrag.datagram_size != dsize) { + /* + * This fragment must be from a different + * packet. + */ + goto bad; + } + if (enc->firstfrag.lf == FW_ENCAP_FIRST) + start = 0; + else + start = enc->nextfrag.fragment_offset; + end = start + mf->m_pkthdr.len - 2*sizeof(uint32_t); + if ((fstart < end && fend > start) || + (islast && enc->nextfrag.lf == FW_ENCAP_LAST)) { + /* + * Overlap - discard reassembly buffer and start + * again with this fragment. + */ + goto bad; + } + } + + /* + * Find where to put this fragment in the list. 
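+ *
+ * Fragments sit on fr_frags sorted by fragment_offset.  As a sketch,
+ * for a datagram split at offsets 0, 1000 and 2000:
+ *
+ *	recv 1000  -> list: [1000]
+ *	recv 0     -> list: [0][1000], then merged into [0..2000)
+ *	recv 2000  -> appended and merged: datagram complete
+ *
+ * The scan below only picks the insertion point; the coalescing
+ * happens further down.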
+ */ + for (mf = r->fr_frags, mprev = NULL; mf; + mprev = mf, mf = mf->m_nextpkt) { + enc = mtod(mf, union fw_encap *); + if (enc->firstfrag.lf == FW_ENCAP_FIRST) + start = 0; + else + start = enc->nextfrag.fragment_offset; + if (start >= fend) + break; + } + + /* + * If this is a last fragment and we are not adding at the end + * of the list, discard the buffer. + */ + if (islast && mprev && mprev->m_nextpkt) + goto bad; + + if (mprev) { + m->m_nextpkt = mprev->m_nextpkt; + mprev->m_nextpkt = m; + + /* + * Coalesce forwards and see if we can make a whole + * datagram. + */ + enc = mtod(mprev, union fw_encap *); + if (enc->firstfrag.lf == FW_ENCAP_FIRST) + start = 0; + else + start = enc->nextfrag.fragment_offset; + end = start + mprev->m_pkthdr.len - 2*sizeof(uint32_t); + while (end == fstart) { + /* + * Strip off the encap header from m and + * append it to mprev, freeing m. + */ + m_adj(m, 2*sizeof(uint32_t)); + mprev->m_nextpkt = m->m_nextpkt; + mprev->m_pkthdr.len += m->m_pkthdr.len; + m_cat(mprev, m); + + if (mprev->m_pkthdr.len == dsize + 1 + 2*sizeof(uint32_t)) { + /* + * We have assembled a complete packet + * we must be finished. Make sure we have + * merged the whole chain. + */ + STAILQ_REMOVE(&fc->fc_frags, r, fw_reass, fr_link); + free(r, M_TEMP); + m = mprev->m_nextpkt; + while (m) { + mf = m->m_nextpkt; + m_freem(m); + m = mf; + } + mprev->m_nextpkt = NULL; + + return (mprev); + } + + /* + * See if we can continue merging forwards. + */ + end = fend; + m = mprev->m_nextpkt; + if (m) { + enc = mtod(m, union fw_encap *); + if (enc->firstfrag.lf == FW_ENCAP_FIRST) + fstart = 0; + else + fstart = enc->nextfrag.fragment_offset; + fend = fstart + m->m_pkthdr.len + - 2*sizeof(uint32_t); + } else { + break; + } + } + } else { + m->m_nextpkt = 0; + r->fr_frags = m; + } + + return (0); + +bad: + while (r->fr_frags) { + mf = r->fr_frags; + r->fr_frags = mf->m_nextpkt; + m_freem(mf); + } + m->m_nextpkt = 0; + r->fr_frags = m; + + return (0); +} + +void +firewire_input(struct ifnet *ifp, struct mbuf *m, uint16_t src) +{ + struct fw_com *fc = IFP2FWC(ifp); + union fw_encap *enc; + int type, isr; + + /* + * The caller has already stripped off the packet header + * (stream or wreqb) and marked the mbuf's M_BCAST flag + * appropriately. We de-encapsulate the IP packet and pass it + * up the line after handling link-level fragmentation. + */ + if (m->m_pkthdr.len < sizeof(uint32_t)) { + if_printf(ifp, "discarding frame without " + "encapsulation header (len %u pkt len %u)\n", + m->m_len, m->m_pkthdr.len); + } + + m = m_pullup(m, sizeof(uint32_t)); + if (m == NULL) + return; + enc = mtod(m, union fw_encap *); + + /* + * Byte swap the encapsulation header manually. 
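+ *
+ * On input the conversion runs the other way (ntohl() below) before
+ * any bitfield is read; lf then tells the header size apart:
+ *
+ *	lf == 0 (FW_ENCAP_UNFRAG): one quadlet, type in unfrag.ether_type
+ *	lf != 0 (a fragment):      two quadlets, reassembly required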
+ */ + enc->ul[0] = ntohl(enc->ul[0]); + + if (enc->unfrag.lf != 0) { + m = m_pullup(m, 2*sizeof(uint32_t)); + if (!m) + return; + enc = mtod(m, union fw_encap *); + enc->ul[1] = ntohl(enc->ul[1]); + m = firewire_input_fragment(fc, m, src); + if (!m) + return; + enc = mtod(m, union fw_encap *); + type = enc->firstfrag.ether_type; + m_adj(m, 2*sizeof(uint32_t)); + } else { + type = enc->unfrag.ether_type; + m_adj(m, sizeof(uint32_t)); + } + + if (m->m_pkthdr.rcvif == NULL) { + if_printf(ifp, "discard frame w/o interface pointer\n"); + ifp->if_ierrors++; + m_freem(m); + return; + } +#ifdef DIAGNOSTIC + if (m->m_pkthdr.rcvif != ifp) { + if_printf(ifp, "Warning, frame marked as received on %s\n", + m->m_pkthdr.rcvif->if_xname); + } +#endif + +#ifdef MAC + /* + * Tag the mbuf with an appropriate MAC label before any other + * consumers can get to it. + */ + mac_ifnet_create_mbuf(ifp, m); +#endif + + /* + * Give bpf a chance at the packet. The link-level driver + * should have left us a tag with the EUID of the sender. + */ + if (bpf_peers_present(ifp->if_bpf)) { + struct fw_bpfhdr h; + struct m_tag *mtag; + + mtag = m_tag_locate(m, MTAG_FIREWIRE, MTAG_FIREWIRE_SENDER_EUID, 0); + if (mtag) + bcopy(mtag + 1, h.firewire_shost, 8); + else + bcopy(&firewire_broadcastaddr, h.firewire_dhost, 8); + bcopy(&fc->fc_hwaddr, h.firewire_dhost, 8); + h.firewire_type = htons(type); + bpf_mtap2(ifp->if_bpf, &h, sizeof(h), m); + } + + if (ifp->if_flags & IFF_MONITOR) { + /* + * Interface marked for monitoring; discard packet. + */ + m_freem(m); + return; + } + + ifp->if_ibytes += m->m_pkthdr.len; + + /* Discard packet if interface is not up */ + if ((ifp->if_flags & IFF_UP) == 0) { + m_freem(m); + return; + } + + if (m->m_flags & (M_BCAST|M_MCAST)) + ifp->if_imcasts++; + + switch (type) { +#ifdef INET + case ETHERTYPE_IP: + if ((m = ip_fastforward(m)) == NULL) + return; + isr = NETISR_IP; + break; + + case ETHERTYPE_ARP: + { + struct arphdr *ah; + ah = mtod(m, struct arphdr *); + + /* + * Adjust the arp packet to insert an empty tha slot. + */ + m->m_len += ah->ar_hln; + m->m_pkthdr.len += ah->ar_hln; + bcopy(ar_tha(ah), ar_tpa(ah), ah->ar_pln); + isr = NETISR_ARP; + break; + } +#endif + +#ifdef INET6 + case ETHERTYPE_IPV6: + isr = NETISR_IPV6; + break; +#endif + + default: + m_freem(m); + return; + } + + netisr_dispatch(isr, m); +} + +int +firewire_ioctl(struct ifnet *ifp, u_long command, caddr_t data) +{ + struct ifaddr *ifa = (struct ifaddr *) data; + struct ifreq *ifr = (struct ifreq *) data; + int error = 0; + + switch (command) { + case SIOCSIFADDR: + ifp->if_flags |= IFF_UP; + + switch (ifa->ifa_addr->sa_family) { +#ifdef INET + case AF_INET: + ifp->if_init(ifp->if_softc); /* before arpwhohas */ + arp_ifinit(ifp, ifa); + break; +#endif + default: + ifp->if_init(ifp->if_softc); + break; + } + break; + + case SIOCGIFADDR: + { + struct sockaddr *sa; + + sa = (struct sockaddr *) & ifr->ifr_data; + bcopy(&IFP2FWC(ifp)->fc_hwaddr, + (caddr_t) sa->sa_data, sizeof(struct fw_hwaddr)); + } + break; + + case SIOCSIFMTU: + /* + * Set the interface MTU. + */ + if (ifr->ifr_mtu > 1500) { + error = EINVAL; + } else { + ifp->if_mtu = ifr->ifr_mtu; + } + break; + default: + error = EINVAL; /* XXX netbsd has ENOTTY??? 
*/ + break; + } + return (error); +} + +static int +firewire_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa, + struct sockaddr *sa) +{ +#ifdef INET + struct sockaddr_in *sin; +#endif +#ifdef INET6 + struct sockaddr_in6 *sin6; +#endif + + switch(sa->sa_family) { + case AF_LINK: + /* + * No mapping needed. + */ + *llsa = 0; + return 0; + +#ifdef INET + case AF_INET: + sin = (struct sockaddr_in *)sa; + if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) + return EADDRNOTAVAIL; + *llsa = 0; + return 0; +#endif +#ifdef INET6 + case AF_INET6: + sin6 = (struct sockaddr_in6 *)sa; + if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { + /* + * An IP6 address of 0 means listen to all + * of the Ethernet multicast address used for IP6. + * (This is used for multicast routers.) + */ + ifp->if_flags |= IFF_ALLMULTI; + *llsa = 0; + return 0; + } + if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) + return EADDRNOTAVAIL; + *llsa = 0; + return 0; +#endif + + default: + /* + * Well, the text isn't quite right, but it's the name + * that counts... + */ + return EAFNOSUPPORT; + } +} + +void +firewire_ifattach(struct ifnet *ifp, struct fw_hwaddr *llc) +{ + struct fw_com *fc = IFP2FWC(ifp); + struct ifaddr *ifa; + struct sockaddr_dl *sdl; + static const char* speeds[] = { + "S100", "S200", "S400", "S800", + "S1600", "S3200" + }; + + fc->fc_speed = llc->sspd; + STAILQ_INIT(&fc->fc_frags); + + ifp->if_addrlen = sizeof(struct fw_hwaddr); + ifp->if_hdrlen = 0; + if_attach(ifp); + ifp->if_mtu = 1500; /* XXX */ + ifp->if_output = firewire_output; + ifp->if_resolvemulti = firewire_resolvemulti; + ifp->if_broadcastaddr = (u_char *) &firewire_broadcastaddr; + + ifa = ifp->if_addr; + KASSERT(ifa != NULL, ("%s: no lladdr!\n", __func__)); + sdl = (struct sockaddr_dl *)ifa->ifa_addr; + sdl->sdl_type = IFT_IEEE1394; + sdl->sdl_alen = ifp->if_addrlen; + bcopy(llc, LLADDR(sdl), ifp->if_addrlen); + + bpfattach(ifp, DLT_APPLE_IP_OVER_IEEE1394, + sizeof(struct fw_hwaddr)); + + if_printf(ifp, "Firewire address: %8D @ 0x%04x%08x, %s, maxrec %d\n", + (uint8_t *) &llc->sender_unique_ID_hi, ":", + ntohs(llc->sender_unicast_FIFO_hi), + ntohl(llc->sender_unicast_FIFO_lo), + speeds[llc->sspd], + (2 << llc->sender_max_rec)); +} + +void +firewire_ifdetach(struct ifnet *ifp) +{ + bpfdetach(ifp); + if_detach(ifp); +} + +void +firewire_busreset(struct ifnet *ifp) +{ + struct fw_com *fc = IFP2FWC(ifp); + struct fw_reass *r; + struct mbuf *m; + + /* + * Discard any partial datagrams since the host ids may have changed. 
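+ * Reassembly buffers are keyed on (dgl | src << 16) and a bus reset
+ * renumbers the nodes, so keeping them around could merge fragments
+ * from two different senders.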
+ */ + while ((r = STAILQ_FIRST(&fc->fc_frags))) { + STAILQ_REMOVE_HEAD(&fc->fc_frags, fr_link); + while (r->fr_frags) { + m = r->fr_frags; + r->fr_frags = m->m_nextpkt; + m_freem(m); + } + free(r, M_TEMP); + } +} + +static void * +firewire_alloc(u_char type, struct ifnet *ifp) +{ + struct fw_com *fc; + + fc = malloc(sizeof(struct fw_com), M_FWCOM, M_WAITOK | M_ZERO); + fc->fc_ifp = ifp; + + return (fc); +} + +static void +firewire_free(void *com, u_char type) +{ + + free(com, M_FWCOM); +} + +static int +firewire_modevent(module_t mod, int type, void *data) +{ + + switch (type) { + case MOD_LOAD: + if_register_com_alloc(IFT_IEEE1394, + firewire_alloc, firewire_free); + break; + case MOD_UNLOAD: + if_deregister_com_alloc(IFT_IEEE1394); + break; + default: + return (EOPNOTSUPP); + } + + return (0); +} + +static moduledata_t firewire_mod = { + "if_firewire", + firewire_modevent, + 0 +}; + +DECLARE_MODULE(if_firewire, firewire_mod, SI_SUB_INIT_IF, SI_ORDER_ANY); +MODULE_VERSION(if_firewire, 1); diff --git a/freebsd/sys/net/if_gif.c b/freebsd/sys/net/if_gif.c new file mode 100644 index 00000000..be67500a --- /dev/null +++ b/freebsd/sys/net/if_gif.c @@ -0,0 +1,1025 @@ +#include + +/* $FreeBSD$ */ +/* $KAME: if_gif.c,v 1.87 2001/10/19 08:50:27 itojun Exp $ */ + +/*- + * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the project nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#ifdef INET +#include +#include +#include +#endif /* INET */ + +#ifdef INET6 +#ifndef INET +#include +#endif +#include +#include +#include +#include +#include +#include +#endif /* INET6 */ + +#include +#include +#include +#include + +#include + +#define GIFNAME "gif" + +/* + * gif_mtx protects the global gif_softc_list. 
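+ * (The list head itself is virtualized as V_gif_softc_list below;
+ * only the mutex is global.)  The usage pattern, as in
+ * gif_clone_create() and gif_clone_destroy():
+ *
+ *	mtx_lock(&gif_mtx);
+ *	LIST_INSERT_HEAD(&V_gif_softc_list, sc, gif_list);
+ *	mtx_unlock(&gif_mtx);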
+ */ +static struct mtx gif_mtx; +static MALLOC_DEFINE(M_GIF, "gif", "Generic Tunnel Interface"); +static VNET_DEFINE(LIST_HEAD(, gif_softc), gif_softc_list); +#define V_gif_softc_list VNET(gif_softc_list) + +void (*ng_gif_input_p)(struct ifnet *ifp, struct mbuf **mp, int af); +void (*ng_gif_input_orphan_p)(struct ifnet *ifp, struct mbuf *m, int af); +void (*ng_gif_attach_p)(struct ifnet *ifp); +void (*ng_gif_detach_p)(struct ifnet *ifp); + +static void gif_start(struct ifnet *); +static int gif_clone_create(struct if_clone *, int, caddr_t); +static void gif_clone_destroy(struct ifnet *); + +IFC_SIMPLE_DECLARE(gif, 0); + +static int gifmodevent(module_t, int, void *); + +SYSCTL_DECL(_net_link); +SYSCTL_NODE(_net_link, IFT_GIF, gif, CTLFLAG_RW, 0, + "Generic Tunnel Interface"); +#ifndef MAX_GIF_NEST +/* + * This macro controls the default upper limitation on nesting of gif tunnels. + * Since, setting a large value to this macro with a careless configuration + * may introduce system crash, we don't allow any nestings by default. + * If you need to configure nested gif tunnels, you can define this macro + * in your kernel configuration file. However, if you do so, please be + * careful to configure the tunnels so that it won't make a loop. + */ +#define MAX_GIF_NEST 1 +#endif +static VNET_DEFINE(int, max_gif_nesting) = MAX_GIF_NEST; +#define V_max_gif_nesting VNET(max_gif_nesting) +SYSCTL_VNET_INT(_net_link_gif, OID_AUTO, max_nesting, CTLFLAG_RW, + &VNET_NAME(max_gif_nesting), 0, "Max nested tunnels"); + +/* + * By default, we disallow creation of multiple tunnels between the same + * pair of addresses. Some applications require this functionality so + * we allow control over this check here. + */ +#ifdef XBONEHACK +static VNET_DEFINE(int, parallel_tunnels) = 1; +#else +static VNET_DEFINE(int, parallel_tunnels) = 0; +#endif +#define V_parallel_tunnels VNET(parallel_tunnels) +SYSCTL_VNET_INT(_net_link_gif, OID_AUTO, parallel_tunnels, CTLFLAG_RW, + &VNET_NAME(parallel_tunnels), 0, "Allow parallel tunnels?"); + +/* copy from src/sys/net/if_ethersubr.c */ +static const u_char etherbroadcastaddr[ETHER_ADDR_LEN] = + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; +#ifndef ETHER_IS_BROADCAST +#define ETHER_IS_BROADCAST(addr) \ + (bcmp(etherbroadcastaddr, (addr), ETHER_ADDR_LEN) == 0) +#endif + +static int +gif_clone_create(ifc, unit, params) + struct if_clone *ifc; + int unit; + caddr_t params; +{ + struct gif_softc *sc; + + sc = malloc(sizeof(struct gif_softc), M_GIF, M_WAITOK | M_ZERO); + sc->gif_fibnum = curthread->td_proc->p_fibnum; + GIF2IFP(sc) = if_alloc(IFT_GIF); + if (GIF2IFP(sc) == NULL) { + free(sc, M_GIF); + return (ENOSPC); + } + + GIF_LOCK_INIT(sc); + + GIF2IFP(sc)->if_softc = sc; + if_initname(GIF2IFP(sc), ifc->ifc_name, unit); + + sc->encap_cookie4 = sc->encap_cookie6 = NULL; + sc->gif_options = GIF_ACCEPT_REVETHIP; + + GIF2IFP(sc)->if_addrlen = 0; + GIF2IFP(sc)->if_mtu = GIF_MTU; + GIF2IFP(sc)->if_flags = IFF_POINTOPOINT | IFF_MULTICAST; +#if 0 + /* turn off ingress filter */ + GIF2IFP(sc)->if_flags |= IFF_LINK2; +#endif + GIF2IFP(sc)->if_ioctl = gif_ioctl; + GIF2IFP(sc)->if_start = gif_start; + GIF2IFP(sc)->if_output = gif_output; + GIF2IFP(sc)->if_snd.ifq_maxlen = ifqmaxlen; + if_attach(GIF2IFP(sc)); + bpfattach(GIF2IFP(sc), DLT_NULL, sizeof(u_int32_t)); + if (ng_gif_attach_p != NULL) + (*ng_gif_attach_p)(GIF2IFP(sc)); + + mtx_lock(&gif_mtx); + LIST_INSERT_HEAD(&V_gif_softc_list, sc, gif_list); + mtx_unlock(&gif_mtx); + + return (0); +} + +static void +gif_clone_destroy(ifp) + struct 
ifnet *ifp; +{ +#if defined(INET) || defined(INET6) + int err; +#endif + struct gif_softc *sc = ifp->if_softc; + + mtx_lock(&gif_mtx); + LIST_REMOVE(sc, gif_list); + mtx_unlock(&gif_mtx); + + gif_delete_tunnel(ifp); +#ifdef INET6 + if (sc->encap_cookie6 != NULL) { + err = encap_detach(sc->encap_cookie6); + KASSERT(err == 0, ("Unexpected error detaching encap_cookie6")); + } +#endif +#ifdef INET + if (sc->encap_cookie4 != NULL) { + err = encap_detach(sc->encap_cookie4); + KASSERT(err == 0, ("Unexpected error detaching encap_cookie4")); + } +#endif + + if (ng_gif_detach_p != NULL) + (*ng_gif_detach_p)(ifp); + bpfdetach(ifp); + if_detach(ifp); + if_free(ifp); + + GIF_LOCK_DESTROY(sc); + + free(sc, M_GIF); +} + +static void +vnet_gif_init(const void *unused __unused) +{ + + LIST_INIT(&V_gif_softc_list); +} +VNET_SYSINIT(vnet_gif_init, SI_SUB_PSEUDO, SI_ORDER_MIDDLE, vnet_gif_init, + NULL); + +static int +gifmodevent(mod, type, data) + module_t mod; + int type; + void *data; +{ + + switch (type) { + case MOD_LOAD: + mtx_init(&gif_mtx, "gif_mtx", NULL, MTX_DEF); + if_clone_attach(&gif_cloner); + break; + + case MOD_UNLOAD: + if_clone_detach(&gif_cloner); + mtx_destroy(&gif_mtx); + break; + default: + return EOPNOTSUPP; + } + return 0; +} + +static moduledata_t gif_mod = { + "if_gif", + gifmodevent, + 0 +}; + +DECLARE_MODULE(if_gif, gif_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); +MODULE_VERSION(if_gif, 1); + +int +gif_encapcheck(m, off, proto, arg) + const struct mbuf *m; + int off; + int proto; + void *arg; +{ + struct ip ip; + struct gif_softc *sc; + + sc = (struct gif_softc *)arg; + if (sc == NULL) + return 0; + + if ((GIF2IFP(sc)->if_flags & IFF_UP) == 0) + return 0; + + /* no physical address */ + if (!sc->gif_psrc || !sc->gif_pdst) + return 0; + + switch (proto) { +#ifdef INET + case IPPROTO_IPV4: + break; +#endif +#ifdef INET6 + case IPPROTO_IPV6: + break; +#endif + case IPPROTO_ETHERIP: + break; + + default: + return 0; + } + + /* Bail on short packets */ + if (m->m_pkthdr.len < sizeof(ip)) + return 0; + + m_copydata(m, 0, sizeof(ip), (caddr_t)&ip); + + switch (ip.ip_v) { +#ifdef INET + case 4: + if (sc->gif_psrc->sa_family != AF_INET || + sc->gif_pdst->sa_family != AF_INET) + return 0; + return gif_encapcheck4(m, off, proto, arg); +#endif +#ifdef INET6 + case 6: + if (m->m_pkthdr.len < sizeof(struct ip6_hdr)) + return 0; + if (sc->gif_psrc->sa_family != AF_INET6 || + sc->gif_pdst->sa_family != AF_INET6) + return 0; + return gif_encapcheck6(m, off, proto, arg); +#endif + default: + return 0; + } +} + +static void +gif_start(struct ifnet *ifp) +{ + struct gif_softc *sc; + struct mbuf *m; + + sc = ifp->if_softc; + + ifp->if_drv_flags |= IFF_DRV_OACTIVE; + for (;;) { + IFQ_DEQUEUE(&ifp->if_snd, m); + if (m == 0) + break; + + gif_output(ifp, m, sc->gif_pdst, NULL); + + } + ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; + + return; +} + +int +gif_output(ifp, m, dst, ro) + struct ifnet *ifp; + struct mbuf *m; + struct sockaddr *dst; + struct route *ro; +{ + struct gif_softc *sc = ifp->if_softc; + struct m_tag *mtag; + int error = 0; + int gif_called; + u_int32_t af; + +#ifdef MAC + error = mac_ifnet_check_transmit(ifp, m); + if (error) { + m_freem(m); + goto end; + } +#endif + + /* + * gif may cause infinite recursion calls when misconfigured. + * We'll prevent this by detecting loops. + * + * High nesting level may cause stack exhaustion. + * We'll prevent this by introducing upper limit. 
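+ *
+ * The limit is the net.link.gif.max_nesting sysctl declared above
+ * (default MAX_GIF_NEST, a single level).  To permit gif-in-gif:
+ *
+ *	# sysctl net.link.gif.max_nesting=2
+ *
+ * The loop below counts the MTAG_GIF tags already on the mbuf and
+ * bails out with EIO once the count exceeds the limit.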
+ */ + gif_called = 1; + mtag = m_tag_locate(m, MTAG_GIF, MTAG_GIF_CALLED, NULL); + while (mtag != NULL) { + if (*(struct ifnet **)(mtag + 1) == ifp) { + log(LOG_NOTICE, + "gif_output: loop detected on %s\n", + (*(struct ifnet **)(mtag + 1))->if_xname); + m_freem(m); + error = EIO; /* is there better errno? */ + goto end; + } + mtag = m_tag_locate(m, MTAG_GIF, MTAG_GIF_CALLED, mtag); + gif_called++; + } + if (gif_called > V_max_gif_nesting) { + log(LOG_NOTICE, + "gif_output: recursively called too many times(%d)\n", + gif_called); + m_freem(m); + error = EIO; /* is there better errno? */ + goto end; + } + mtag = m_tag_alloc(MTAG_GIF, MTAG_GIF_CALLED, sizeof(struct ifnet *), + M_NOWAIT); + if (mtag == NULL) { + m_freem(m); + error = ENOMEM; + goto end; + } + *(struct ifnet **)(mtag + 1) = ifp; + m_tag_prepend(m, mtag); + + m->m_flags &= ~(M_BCAST|M_MCAST); + + GIF_LOCK(sc); + + if (!(ifp->if_flags & IFF_UP) || + sc->gif_psrc == NULL || sc->gif_pdst == NULL) { + GIF_UNLOCK(sc); + m_freem(m); + error = ENETDOWN; + goto end; + } + + /* BPF writes need to be handled specially. */ + if (dst->sa_family == AF_UNSPEC) { + bcopy(dst->sa_data, &af, sizeof(af)); + dst->sa_family = af; + } + + af = dst->sa_family; + BPF_MTAP2(ifp, &af, sizeof(af), m); + ifp->if_opackets++; + ifp->if_obytes += m->m_pkthdr.len; + + /* override to IPPROTO_ETHERIP for bridged traffic */ + if (ifp->if_bridge) + af = AF_LINK; + + M_SETFIB(m, sc->gif_fibnum); + /* inner AF-specific encapsulation */ + + /* XXX should we check if our outer source is legal? */ + + /* dispatch to output logic based on outer AF */ + switch (sc->gif_psrc->sa_family) { +#ifdef INET + case AF_INET: + error = in_gif_output(ifp, af, m); + break; +#endif +#ifdef INET6 + case AF_INET6: + error = in6_gif_output(ifp, af, m); + break; +#endif + default: + m_freem(m); + error = ENETDOWN; + } + + GIF_UNLOCK(sc); + end: + if (error) + ifp->if_oerrors++; + return (error); +} + +void +gif_input(m, af, ifp) + struct mbuf *m; + int af; + struct ifnet *ifp; +{ + int isr, n; + struct gif_softc *sc = ifp->if_softc; + struct etherip_header *eip; + struct ether_header *eh; + struct ifnet *oldifp; + + if (ifp == NULL) { + /* just in case */ + m_freem(m); + return; + } + + m->m_pkthdr.rcvif = ifp; + +#ifdef MAC + mac_ifnet_create_mbuf(ifp, m); +#endif + + if (bpf_peers_present(ifp->if_bpf)) { + u_int32_t af1 = af; + bpf_mtap2(ifp->if_bpf, &af1, sizeof(af1), m); + } + + if (ng_gif_input_p != NULL) { + (*ng_gif_input_p)(ifp, &m, af); + if (m == NULL) + return; + } + + /* + * Put the packet to the network layer input queue according to the + * specified address family. + * Note: older versions of gif_input directly called network layer + * input functions, e.g. ip6_input, here. We changed the policy to + * prevent too many recursive calls of such input functions, which + * might cause kernel panic. But the change may introduce another + * problem; if the input queue is full, packets are discarded. + * The kernel stack overflow really happened, and we believed + * queue-full rarely occurs, so we changed the policy. 
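+ * In short, handing the packet to netisr below trades a possible
+ * queue-full drop for a bounded kernel stack depth.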
+ */ + switch (af) { +#ifdef INET + case AF_INET: + isr = NETISR_IP; + break; +#endif +#ifdef INET6 + case AF_INET6: + isr = NETISR_IPV6; + break; +#endif + case AF_LINK: + n = sizeof(struct etherip_header) + sizeof(struct ether_header); + if (n > m->m_len) { + m = m_pullup(m, n); + if (m == NULL) { + ifp->if_ierrors++; + return; + } + } + + eip = mtod(m, struct etherip_header *); + /* + * GIF_ACCEPT_REVETHIP (enabled by default) intentionally + * accepts an EtherIP packet with revered version field in + * the header. This is a knob for backward compatibility + * with FreeBSD 7.2R or prior. + */ + if (sc->gif_options & GIF_ACCEPT_REVETHIP) { + if (eip->eip_resvl != ETHERIP_VERSION + && eip->eip_ver != ETHERIP_VERSION) { + /* discard unknown versions */ + m_freem(m); + return; + } + } else { + if (eip->eip_ver != ETHERIP_VERSION) { + /* discard unknown versions */ + m_freem(m); + return; + } + } + m_adj(m, sizeof(struct etherip_header)); + + m->m_flags &= ~(M_BCAST|M_MCAST); + m->m_pkthdr.rcvif = ifp; + + if (ifp->if_bridge) { + oldifp = ifp; + eh = mtod(m, struct ether_header *); + if (ETHER_IS_MULTICAST(eh->ether_dhost)) { + if (ETHER_IS_BROADCAST(eh->ether_dhost)) + m->m_flags |= M_BCAST; + else + m->m_flags |= M_MCAST; + ifp->if_imcasts++; + } + BRIDGE_INPUT(ifp, m); + + if (m != NULL && ifp != oldifp) { + /* + * The bridge gave us back itself or one of the + * members for which the frame is addressed. + */ + ether_demux(ifp, m); + return; + } + } + if (m != NULL) + m_freem(m); + return; + + default: + if (ng_gif_input_orphan_p != NULL) + (*ng_gif_input_orphan_p)(ifp, m, af); + else + m_freem(m); + return; + } + + ifp->if_ipackets++; + ifp->if_ibytes += m->m_pkthdr.len; + netisr_dispatch(isr, m); +} + +/* XXX how should we handle IPv6 scope on SIOC[GS]IFPHYADDR? 
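+ * The SIOCSIFPHYADDR family handled below is what ifconfig(8) issues
+ * for its "tunnel" argument; an illustrative setup, using
+ * documentation-prefix addresses:
+ *
+ *	# ifconfig gif0 create
+ *	# ifconfig gif0 tunnel 192.0.2.1 198.51.100.1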
*/ +int +gif_ioctl(ifp, cmd, data) + struct ifnet *ifp; + u_long cmd; + caddr_t data; +{ + struct gif_softc *sc = ifp->if_softc; + struct ifreq *ifr = (struct ifreq*)data; + int error = 0, size; + u_int options; + struct sockaddr *dst, *src; +#ifdef SIOCSIFMTU /* xxx */ + u_long mtu; +#endif + + switch (cmd) { + case SIOCSIFADDR: + ifp->if_flags |= IFF_UP; + break; + + case SIOCSIFDSTADDR: + break; + + case SIOCADDMULTI: + case SIOCDELMULTI: + break; + +#ifdef SIOCSIFMTU /* xxx */ + case SIOCGIFMTU: + break; + + case SIOCSIFMTU: + mtu = ifr->ifr_mtu; + if (mtu < GIF_MTU_MIN || mtu > GIF_MTU_MAX) + return (EINVAL); + ifp->if_mtu = mtu; + break; +#endif /* SIOCSIFMTU */ + +#ifdef INET + case SIOCSIFPHYADDR: +#endif +#ifdef INET6 + case SIOCSIFPHYADDR_IN6: +#endif /* INET6 */ + case SIOCSLIFPHYADDR: + switch (cmd) { +#ifdef INET + case SIOCSIFPHYADDR: + src = (struct sockaddr *) + &(((struct in_aliasreq *)data)->ifra_addr); + dst = (struct sockaddr *) + &(((struct in_aliasreq *)data)->ifra_dstaddr); + break; +#endif +#ifdef INET6 + case SIOCSIFPHYADDR_IN6: + src = (struct sockaddr *) + &(((struct in6_aliasreq *)data)->ifra_addr); + dst = (struct sockaddr *) + &(((struct in6_aliasreq *)data)->ifra_dstaddr); + break; +#endif + case SIOCSLIFPHYADDR: + src = (struct sockaddr *) + &(((struct if_laddrreq *)data)->addr); + dst = (struct sockaddr *) + &(((struct if_laddrreq *)data)->dstaddr); + break; + default: + return EINVAL; + } + + /* sa_family must be equal */ + if (src->sa_family != dst->sa_family) + return EINVAL; + + /* validate sa_len */ + switch (src->sa_family) { +#ifdef INET + case AF_INET: + if (src->sa_len != sizeof(struct sockaddr_in)) + return EINVAL; + break; +#endif +#ifdef INET6 + case AF_INET6: + if (src->sa_len != sizeof(struct sockaddr_in6)) + return EINVAL; + break; +#endif + default: + return EAFNOSUPPORT; + } + switch (dst->sa_family) { +#ifdef INET + case AF_INET: + if (dst->sa_len != sizeof(struct sockaddr_in)) + return EINVAL; + break; +#endif +#ifdef INET6 + case AF_INET6: + if (dst->sa_len != sizeof(struct sockaddr_in6)) + return EINVAL; + break; +#endif + default: + return EAFNOSUPPORT; + } + + /* check sa_family looks sane for the cmd */ + switch (cmd) { + case SIOCSIFPHYADDR: + if (src->sa_family == AF_INET) + break; + return EAFNOSUPPORT; +#ifdef INET6 + case SIOCSIFPHYADDR_IN6: + if (src->sa_family == AF_INET6) + break; + return EAFNOSUPPORT; +#endif /* INET6 */ + case SIOCSLIFPHYADDR: + /* checks done in the above */ + break; + } + + error = gif_set_tunnel(GIF2IFP(sc), src, dst); + break; + +#ifdef SIOCDIFPHYADDR + case SIOCDIFPHYADDR: + gif_delete_tunnel(GIF2IFP(sc)); + break; +#endif + + case SIOCGIFPSRCADDR: +#ifdef INET6 + case SIOCGIFPSRCADDR_IN6: +#endif /* INET6 */ + if (sc->gif_psrc == NULL) { + error = EADDRNOTAVAIL; + goto bad; + } + src = sc->gif_psrc; + switch (cmd) { +#ifdef INET + case SIOCGIFPSRCADDR: + dst = &ifr->ifr_addr; + size = sizeof(ifr->ifr_addr); + break; +#endif /* INET */ +#ifdef INET6 + case SIOCGIFPSRCADDR_IN6: + dst = (struct sockaddr *) + &(((struct in6_ifreq *)data)->ifr_addr); + size = sizeof(((struct in6_ifreq *)data)->ifr_addr); + break; +#endif /* INET6 */ + default: + error = EADDRNOTAVAIL; + goto bad; + } + if (src->sa_len > size) + return EINVAL; + bcopy((caddr_t)src, (caddr_t)dst, src->sa_len); +#ifdef INET6 + if (dst->sa_family == AF_INET6) { + error = sa6_recoverscope((struct sockaddr_in6 *)dst); + if (error != 0) + return (error); + } +#endif + break; + + case SIOCGIFPDSTADDR: +#ifdef INET6 + case SIOCGIFPDSTADDR_IN6: 
+#endif /* INET6 */ + if (sc->gif_pdst == NULL) { + error = EADDRNOTAVAIL; + goto bad; + } + src = sc->gif_pdst; + switch (cmd) { +#ifdef INET + case SIOCGIFPDSTADDR: + dst = &ifr->ifr_addr; + size = sizeof(ifr->ifr_addr); + break; +#endif /* INET */ +#ifdef INET6 + case SIOCGIFPDSTADDR_IN6: + dst = (struct sockaddr *) + &(((struct in6_ifreq *)data)->ifr_addr); + size = sizeof(((struct in6_ifreq *)data)->ifr_addr); + break; +#endif /* INET6 */ + default: + error = EADDRNOTAVAIL; + goto bad; + } + if (src->sa_len > size) + return EINVAL; + bcopy((caddr_t)src, (caddr_t)dst, src->sa_len); +#ifdef INET6 + if (dst->sa_family == AF_INET6) { + error = sa6_recoverscope((struct sockaddr_in6 *)dst); + if (error != 0) + return (error); + } +#endif + break; + + case SIOCGLIFPHYADDR: + if (sc->gif_psrc == NULL || sc->gif_pdst == NULL) { + error = EADDRNOTAVAIL; + goto bad; + } + + /* copy src */ + src = sc->gif_psrc; + dst = (struct sockaddr *) + &(((struct if_laddrreq *)data)->addr); + size = sizeof(((struct if_laddrreq *)data)->addr); + if (src->sa_len > size) + return EINVAL; + bcopy((caddr_t)src, (caddr_t)dst, src->sa_len); + + /* copy dst */ + src = sc->gif_pdst; + dst = (struct sockaddr *) + &(((struct if_laddrreq *)data)->dstaddr); + size = sizeof(((struct if_laddrreq *)data)->dstaddr); + if (src->sa_len > size) + return EINVAL; + bcopy((caddr_t)src, (caddr_t)dst, src->sa_len); + break; + + case SIOCSIFFLAGS: + /* if_ioctl() takes care of it */ + break; + + case GIFGOPTS: + options = sc->gif_options; + error = copyout(&options, ifr->ifr_data, + sizeof(options)); + break; + + case GIFSOPTS: + if ((error = priv_check(curthread, PRIV_NET_GIF)) != 0) + break; + error = copyin(ifr->ifr_data, &options, sizeof(options)); + if (error) + break; + if (options & ~GIF_OPTMASK) + error = EINVAL; + else + sc->gif_options = options; + break; + + default: + error = EINVAL; + break; + } + bad: + return error; +} + +/* + * XXXRW: There's a general event-ordering issue here: the code to check + * if a given tunnel is already present happens before we perform a + * potentially blocking setup of the tunnel. This code needs to be + * re-ordered so that the check and replacement can be atomic using + * a mutex. + */ +int +gif_set_tunnel(ifp, src, dst) + struct ifnet *ifp; + struct sockaddr *src; + struct sockaddr *dst; +{ + struct gif_softc *sc = ifp->if_softc; + struct gif_softc *sc2; + struct sockaddr *osrc, *odst, *sa; + int error = 0; + + mtx_lock(&gif_mtx); + LIST_FOREACH(sc2, &V_gif_softc_list, gif_list) { + if (sc2 == sc) + continue; + if (!sc2->gif_pdst || !sc2->gif_psrc) + continue; + if (sc2->gif_pdst->sa_family != dst->sa_family || + sc2->gif_pdst->sa_len != dst->sa_len || + sc2->gif_psrc->sa_family != src->sa_family || + sc2->gif_psrc->sa_len != src->sa_len) + continue; + + /* + * Disallow parallel tunnels unless instructed + * otherwise. + */ + if (!V_parallel_tunnels && + bcmp(sc2->gif_pdst, dst, dst->sa_len) == 0 && + bcmp(sc2->gif_psrc, src, src->sa_len) == 0) { + error = EADDRNOTAVAIL; + mtx_unlock(&gif_mtx); + goto bad; + } + + /* XXX both end must be valid? 
(I mean, not 0.0.0.0) */ + } + mtx_unlock(&gif_mtx); + + /* XXX we can detach from both, but be polite just in case */ + if (sc->gif_psrc) + switch (sc->gif_psrc->sa_family) { +#ifdef INET + case AF_INET: + (void)in_gif_detach(sc); + break; +#endif +#ifdef INET6 + case AF_INET6: + (void)in6_gif_detach(sc); + break; +#endif + } + + osrc = sc->gif_psrc; + sa = (struct sockaddr *)malloc(src->sa_len, M_IFADDR, M_WAITOK); + bcopy((caddr_t)src, (caddr_t)sa, src->sa_len); + sc->gif_psrc = sa; + + odst = sc->gif_pdst; + sa = (struct sockaddr *)malloc(dst->sa_len, M_IFADDR, M_WAITOK); + bcopy((caddr_t)dst, (caddr_t)sa, dst->sa_len); + sc->gif_pdst = sa; + + switch (sc->gif_psrc->sa_family) { +#ifdef INET + case AF_INET: + error = in_gif_attach(sc); + break; +#endif +#ifdef INET6 + case AF_INET6: + /* + * Check validity of the scope zone ID of the addresses, and + * convert it into the kernel internal form if necessary. + */ + error = sa6_embedscope((struct sockaddr_in6 *)sc->gif_psrc, 0); + if (error != 0) + break; + error = sa6_embedscope((struct sockaddr_in6 *)sc->gif_pdst, 0); + if (error != 0) + break; + error = in6_gif_attach(sc); + break; +#endif + } + if (error) { + /* rollback */ + free((caddr_t)sc->gif_psrc, M_IFADDR); + free((caddr_t)sc->gif_pdst, M_IFADDR); + sc->gif_psrc = osrc; + sc->gif_pdst = odst; + goto bad; + } + + if (osrc) + free((caddr_t)osrc, M_IFADDR); + if (odst) + free((caddr_t)odst, M_IFADDR); + + bad: + if (sc->gif_psrc && sc->gif_pdst) + ifp->if_drv_flags |= IFF_DRV_RUNNING; + else + ifp->if_drv_flags &= ~IFF_DRV_RUNNING; + + return error; +} + +void +gif_delete_tunnel(ifp) + struct ifnet *ifp; +{ + struct gif_softc *sc = ifp->if_softc; + + if (sc->gif_psrc) { + free((caddr_t)sc->gif_psrc, M_IFADDR); + sc->gif_psrc = NULL; + } + if (sc->gif_pdst) { + free((caddr_t)sc->gif_pdst, M_IFADDR); + sc->gif_pdst = NULL; + } + /* it is safe to detach from both */ +#ifdef INET + (void)in_gif_detach(sc); +#endif +#ifdef INET6 + (void)in6_gif_detach(sc); +#endif + ifp->if_drv_flags &= ~IFF_DRV_RUNNING; +} diff --git a/freebsd/sys/net/if_gif.h b/freebsd/sys/net/if_gif.h new file mode 100644 index 00000000..14f06fd6 --- /dev/null +++ b/freebsd/sys/net/if_gif.h @@ -0,0 +1,130 @@ +/* $FreeBSD$ */ +/* $KAME: if_gif.h,v 1.17 2000/09/11 11:36:41 sumikawa Exp $ */ + +/*- + * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the project nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * if_gif.h + */ + +#ifndef _NET_IF_GIF_HH_ +#define _NET_IF_GIF_HH_ + + +#ifdef _KERNEL +#include +#include + +#include +/* xxx sigh, why route have struct route instead of pointer? */ + +struct encaptab; + +extern void (*ng_gif_input_p)(struct ifnet *ifp, struct mbuf **mp, + int af); +extern void (*ng_gif_input_orphan_p)(struct ifnet *ifp, struct mbuf *m, + int af); +extern int (*ng_gif_output_p)(struct ifnet *ifp, struct mbuf **mp); +extern void (*ng_gif_attach_p)(struct ifnet *ifp); +extern void (*ng_gif_detach_p)(struct ifnet *ifp); + +struct gif_softc { + struct ifnet *gif_ifp; + struct mtx gif_mtx; + struct sockaddr *gif_psrc; /* Physical src addr */ + struct sockaddr *gif_pdst; /* Physical dst addr */ + union { + struct route gifscr_ro; /* xxx */ +#ifdef INET6 + struct route_in6 gifscr_ro6; /* xxx */ +#endif + } gifsc_gifscr; + int gif_flags; + u_int gif_fibnum; + const struct encaptab *encap_cookie4; + const struct encaptab *encap_cookie6; + void *gif_netgraph; /* ng_gif(4) netgraph node info */ + u_int gif_options; + LIST_ENTRY(gif_softc) gif_list; /* all gif's are linked */ +}; +#define GIF2IFP(sc) ((sc)->gif_ifp) +#define GIF_LOCK_INIT(sc) mtx_init(&(sc)->gif_mtx, "gif softc", \ + NULL, MTX_DEF) +#define GIF_LOCK_DESTROY(sc) mtx_destroy(&(sc)->gif_mtx) +#define GIF_LOCK(sc) mtx_lock(&(sc)->gif_mtx) +#define GIF_UNLOCK(sc) mtx_unlock(&(sc)->gif_mtx) +#define GIF_LOCK_ASSERT(sc) mtx_assert(&(sc)->gif_mtx, MA_OWNED) + +#define gif_ro gifsc_gifscr.gifscr_ro +#ifdef INET6 +#define gif_ro6 gifsc_gifscr.gifscr_ro6 +#endif + +#define GIF_MTU (1280) /* Default MTU */ +#define GIF_MTU_MIN (1280) /* Minimum MTU */ +#define GIF_MTU_MAX (8192) /* Maximum MTU */ + +#define MTAG_GIF 1080679712 +#define MTAG_GIF_CALLED 0 + +struct etherip_header { +#if BYTE_ORDER == LITTLE_ENDIAN + u_int eip_resvl:4, /* reserved */ + eip_ver:4; /* version */ +#endif +#if BYTE_ORDER == BIG_ENDIAN + u_int eip_ver:4, /* version */ + eip_resvl:4; /* reserved */ +#endif + u_int8_t eip_resvh; /* reserved */ +} __packed; + +#define ETHERIP_VERSION 0x3 +/* mbuf adjust factor to force 32-bit alignment of IP header */ +#define ETHERIP_ALIGN 2 + +/* Prototypes */ +void gif_input(struct mbuf *, int, struct ifnet *); +int gif_output(struct ifnet *, struct mbuf *, struct sockaddr *, + struct route *); +int gif_ioctl(struct ifnet *, u_long, caddr_t); +int gif_set_tunnel(struct ifnet *, struct sockaddr *, struct sockaddr *); +void gif_delete_tunnel(struct ifnet *); +int gif_encapcheck(const struct mbuf *, int, int, void *); +#endif /* _KERNEL */ + +#define GIFGOPTS _IOWR('i', 150, struct ifreq) +#define GIFSOPTS _IOW('i', 151, struct ifreq) + +#define GIF_ACCEPT_REVETHIP 0x0001 +#define GIF_SEND_REVETHIP 0x0010 +#define GIF_OPTMASK (GIF_ACCEPT_REVETHIP|GIF_SEND_REVETHIP) + +#endif /* _NET_IF_GIF_HH_ */ diff --git a/freebsd/sys/net/if_gre.c b/freebsd/sys/net/if_gre.c new file mode 100644 index 00000000..4a42029b --- /dev/null +++ b/freebsd/sys/net/if_gre.c @@ -0,0 +1,909 @@ +#include + +/* 
$NetBSD: if_gre.c,v 1.49 2003/12/11 00:22:29 itojun Exp $ */ +/* $FreeBSD$ */ + +/*- + * Copyright (c) 1998 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Heiko W.Rupp + * + * IPv6-over-GRE contributed by Gert Doering + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the NetBSD + * Foundation, Inc. and its contributors. + * 4. Neither the name of The NetBSD Foundation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Encapsulate L3 protocols into IP + * See RFC 2784 (successor of RFC 1701 and 1702) for more details. + * If_gre is compatible with Cisco GRE tunnels, so you can + * have a NetBSD box as the other end of a tunnel interface of a Cisco + * router. See gre(4) for more details. + * Also supported: IP in IP encaps (proto 55) as of RFC 2004 + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#ifdef INET +#include +#include +#include +#include +#include +#include +#include +#else +#error "Huh? if_gre without inet?" +#endif + +#include + +#include + +/* + * It is not easy to calculate the right value for a GRE MTU. + * We leave this task to the admin and use the same default that + * other vendors use. + */ +#define GREMTU 1476 + +#define GRENAME "gre" + +/* + * gre_mtx protects all global variables in if_gre.c. + * XXX: gre_softc data not protected yet. 
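+ *
+ * The pattern used below in gre_clone_create() and gre_clone_destroy():
+ *
+ *	mtx_lock(&gre_mtx);
+ *	LIST_INSERT_HEAD(&gre_softc_list, sc, sc_list);
+ *	mtx_unlock(&gre_mtx);
+ *
+ * (GREMTU above is 1476: 1500 less 20 bytes of outer IP header and
+ * 4 bytes of basic GRE header.)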
+ */ +struct mtx gre_mtx; +static MALLOC_DEFINE(M_GRE, GRENAME, "Generic Routing Encapsulation"); + +struct gre_softc_head gre_softc_list; + +static int gre_clone_create(struct if_clone *, int, caddr_t); +static void gre_clone_destroy(struct ifnet *); +static int gre_ioctl(struct ifnet *, u_long, caddr_t); +static int gre_output(struct ifnet *, struct mbuf *, struct sockaddr *, + struct route *ro); + +IFC_SIMPLE_DECLARE(gre, 0); + +static int gre_compute_route(struct gre_softc *sc); + +static void greattach(void); + +#ifdef INET +extern struct domain inetdomain; +static const struct protosw in_gre_protosw = { + .pr_type = SOCK_RAW, + .pr_domain = &inetdomain, + .pr_protocol = IPPROTO_GRE, + .pr_flags = PR_ATOMIC|PR_ADDR, + .pr_input = gre_input, + .pr_output = (pr_output_t *)rip_output, + .pr_ctlinput = rip_ctlinput, + .pr_ctloutput = rip_ctloutput, + .pr_usrreqs = &rip_usrreqs +}; +static const struct protosw in_mobile_protosw = { + .pr_type = SOCK_RAW, + .pr_domain = &inetdomain, + .pr_protocol = IPPROTO_MOBILE, + .pr_flags = PR_ATOMIC|PR_ADDR, + .pr_input = gre_mobile_input, + .pr_output = (pr_output_t *)rip_output, + .pr_ctlinput = rip_ctlinput, + .pr_ctloutput = rip_ctloutput, + .pr_usrreqs = &rip_usrreqs +}; +#endif + +SYSCTL_DECL(_net_link); +SYSCTL_NODE(_net_link, IFT_TUNNEL, gre, CTLFLAG_RW, 0, + "Generic Routing Encapsulation"); +#ifndef MAX_GRE_NEST +/* + * This macro controls the default upper limitation on nesting of gre tunnels. + * Since, setting a large value to this macro with a careless configuration + * may introduce system crash, we don't allow any nestings by default. + * If you need to configure nested gre tunnels, you can define this macro + * in your kernel configuration file. However, if you do so, please be + * careful to configure the tunnels so that it won't make a loop. 
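+ *
+ * The compiled-in default can also be raised at run time through the
+ * net.link.gre.max_nesting sysctl declared below:
+ *
+ *	# sysctl net.link.gre.max_nesting=2
+ *
+ * gre_output() gives up with EIO once the limit is exceeded.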
+ */ +#define MAX_GRE_NEST 1 +#endif +static int max_gre_nesting = MAX_GRE_NEST; +SYSCTL_INT(_net_link_gre, OID_AUTO, max_nesting, CTLFLAG_RW, + &max_gre_nesting, 0, "Max nested tunnels"); + +/* ARGSUSED */ +static void +greattach(void) +{ + + mtx_init(&gre_mtx, "gre_mtx", NULL, MTX_DEF); + LIST_INIT(&gre_softc_list); + if_clone_attach(&gre_cloner); +} + +static int +gre_clone_create(ifc, unit, params) + struct if_clone *ifc; + int unit; + caddr_t params; +{ + struct gre_softc *sc; + + sc = malloc(sizeof(struct gre_softc), M_GRE, M_WAITOK | M_ZERO); + + GRE2IFP(sc) = if_alloc(IFT_TUNNEL); + if (GRE2IFP(sc) == NULL) { + free(sc, M_GRE); + return (ENOSPC); + } + + GRE2IFP(sc)->if_softc = sc; + if_initname(GRE2IFP(sc), ifc->ifc_name, unit); + + GRE2IFP(sc)->if_snd.ifq_maxlen = ifqmaxlen; + GRE2IFP(sc)->if_addrlen = 0; + GRE2IFP(sc)->if_hdrlen = 24; /* IP + GRE */ + GRE2IFP(sc)->if_mtu = GREMTU; + GRE2IFP(sc)->if_flags = IFF_POINTOPOINT|IFF_MULTICAST; + GRE2IFP(sc)->if_output = gre_output; + GRE2IFP(sc)->if_ioctl = gre_ioctl; + sc->g_dst.s_addr = sc->g_src.s_addr = INADDR_ANY; + sc->g_proto = IPPROTO_GRE; + GRE2IFP(sc)->if_flags |= IFF_LINK0; + sc->encap = NULL; + sc->called = 0; + sc->gre_fibnum = curthread->td_proc->p_fibnum; + sc->wccp_ver = WCCP_V1; + sc->key = 0; + if_attach(GRE2IFP(sc)); + bpfattach(GRE2IFP(sc), DLT_NULL, sizeof(u_int32_t)); + mtx_lock(&gre_mtx); + LIST_INSERT_HEAD(&gre_softc_list, sc, sc_list); + mtx_unlock(&gre_mtx); + return (0); +} + +static void +gre_clone_destroy(ifp) + struct ifnet *ifp; +{ + struct gre_softc *sc = ifp->if_softc; + + mtx_lock(&gre_mtx); + LIST_REMOVE(sc, sc_list); + mtx_unlock(&gre_mtx); + +#ifdef INET + if (sc->encap != NULL) + encap_detach(sc->encap); +#endif + bpfdetach(ifp); + if_detach(ifp); + if_free(ifp); + free(sc, M_GRE); +} + +/* + * The output routine. Takes a packet and encapsulates it in the protocol + * given by sc->g_proto. See also RFC 1701 and RFC 2004 + */ +static int +gre_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, + struct route *ro) +{ + int error = 0; + struct gre_softc *sc = ifp->if_softc; + struct greip *gh; + struct ip *ip; + u_short gre_ip_id = 0; + uint8_t gre_ip_tos = 0; + u_int16_t etype = 0; + struct mobile_h mob_h; + u_int32_t af; + int extra = 0; + + /* + * gre may cause infinite recursion calls when misconfigured. + * We'll prevent this by introducing upper limit. + */ + if (++(sc->called) > max_gre_nesting) { + printf("%s: gre_output: recursively called too many " + "times(%d)\n", if_name(GRE2IFP(sc)), sc->called); + m_freem(m); + error = EIO; /* is there better errno? */ + goto end; + } + + if (!((ifp->if_flags & IFF_UP) && + (ifp->if_drv_flags & IFF_DRV_RUNNING)) || + sc->g_src.s_addr == INADDR_ANY || sc->g_dst.s_addr == INADDR_ANY) { + m_freem(m); + error = ENETDOWN; + goto end; + } + + gh = NULL; + ip = NULL; + + /* BPF writes need to be handled specially. */ + if (dst->sa_family == AF_UNSPEC) { + bcopy(dst->sa_data, &af, sizeof(af)); + dst->sa_family = af; + } + + if (bpf_peers_present(ifp->if_bpf)) { + af = dst->sa_family; + bpf_mtap2(ifp->if_bpf, &af, sizeof(af), m); + } + + m->m_flags &= ~(M_BCAST|M_MCAST); + + if (sc->g_proto == IPPROTO_MOBILE) { + if (dst->sa_family == AF_INET) { + struct mbuf *m0; + int msiz; + + ip = mtod(m, struct ip *); + + /* + * RFC2004 specifies that fragmented diagrams shouldn't + * be encapsulated. + */ + if (ip->ip_off & (IP_MF | IP_OFFMASK)) { + _IF_DROP(&ifp->if_snd); + m_freem(m); + error = EINVAL; /* is there better errno? 
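+ * For reference, the minimal encapsulation header built next is
+ * 8 bytes (MOB_HH_SIZ_S) when the inner source equals g_src, or
+ * 12 bytes (MOB_HH_SIZ_L) when the original source must travel in
+ * osrc, flagged by the S bit in proto.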
*/ + goto end; + } + memset(&mob_h, 0, MOB_HH_SIZ_L); + mob_h.proto = (ip->ip_p) << 8; + mob_h.odst = ip->ip_dst.s_addr; + ip->ip_dst.s_addr = sc->g_dst.s_addr; + + /* + * If the packet comes from our host, we only change + * the destination address in the IP header. + * Else we also need to save and change the source + */ + if (in_hosteq(ip->ip_src, sc->g_src)) { + msiz = MOB_HH_SIZ_S; + } else { + mob_h.proto |= MOB_HH_SBIT; + mob_h.osrc = ip->ip_src.s_addr; + ip->ip_src.s_addr = sc->g_src.s_addr; + msiz = MOB_HH_SIZ_L; + } + mob_h.proto = htons(mob_h.proto); + mob_h.hcrc = gre_in_cksum((u_int16_t *)&mob_h, msiz); + + if ((m->m_data - msiz) < m->m_pktdat) { + /* need new mbuf */ + MGETHDR(m0, M_DONTWAIT, MT_DATA); + if (m0 == NULL) { + _IF_DROP(&ifp->if_snd); + m_freem(m); + error = ENOBUFS; + goto end; + } + m0->m_next = m; + m->m_data += sizeof(struct ip); + m->m_len -= sizeof(struct ip); + m0->m_pkthdr.len = m->m_pkthdr.len + msiz; + m0->m_len = msiz + sizeof(struct ip); + m0->m_data += max_linkhdr; + memcpy(mtod(m0, caddr_t), (caddr_t)ip, + sizeof(struct ip)); + m = m0; + } else { /* we have some space left in the old one */ + m->m_data -= msiz; + m->m_len += msiz; + m->m_pkthdr.len += msiz; + bcopy(ip, mtod(m, caddr_t), + sizeof(struct ip)); + } + ip = mtod(m, struct ip *); + memcpy((caddr_t)(ip + 1), &mob_h, (unsigned)msiz); + ip->ip_len = ntohs(ip->ip_len) + msiz; + } else { /* AF_INET */ + _IF_DROP(&ifp->if_snd); + m_freem(m); + error = EINVAL; + goto end; + } + } else if (sc->g_proto == IPPROTO_GRE) { + switch (dst->sa_family) { + case AF_INET: + ip = mtod(m, struct ip *); + gre_ip_tos = ip->ip_tos; + gre_ip_id = ip->ip_id; + if (sc->wccp_ver == WCCP_V2) { + extra = sizeof(uint32_t); + etype = WCCP_PROTOCOL_TYPE; + } else { + etype = ETHERTYPE_IP; + } + break; +#ifdef INET6 + case AF_INET6: + gre_ip_id = ip_newid(); + etype = ETHERTYPE_IPV6; + break; +#endif +#ifdef NETATALK + case AF_APPLETALK: + etype = ETHERTYPE_ATALK; + break; +#endif + default: + _IF_DROP(&ifp->if_snd); + m_freem(m); + error = EAFNOSUPPORT; + goto end; + } + + /* Reserve space for GRE header + optional GRE key */ + int hdrlen = sizeof(struct greip) + extra; + if (sc->key) + hdrlen += sizeof(uint32_t); + M_PREPEND(m, hdrlen, M_DONTWAIT); + } else { + _IF_DROP(&ifp->if_snd); + m_freem(m); + error = EINVAL; + goto end; + } + + if (m == NULL) { /* mbuf allocation failed */ + _IF_DROP(&ifp->if_snd); + error = ENOBUFS; + goto end; + } + + M_SETFIB(m, sc->gre_fibnum); /* The envelope may use a different FIB */ + + gh = mtod(m, struct greip *); + if (sc->g_proto == IPPROTO_GRE) { + uint32_t *options = gh->gi_options; + + memset((void *)gh, 0, sizeof(struct greip) + extra); + gh->gi_ptype = htons(etype); + gh->gi_flags = 0; + + /* Add key option */ + if (sc->key) + { + gh->gi_flags |= htons(GRE_KP); + *(options++) = htonl(sc->key); + } + } + + gh->gi_pr = sc->g_proto; + if (sc->g_proto != IPPROTO_MOBILE) { + gh->gi_src = sc->g_src; + gh->gi_dst = sc->g_dst; + ((struct ip*)gh)->ip_v = IPPROTO_IPV4; + ((struct ip*)gh)->ip_hl = (sizeof(struct ip)) >> 2; + ((struct ip*)gh)->ip_ttl = GRE_TTL; + ((struct ip*)gh)->ip_tos = gre_ip_tos; + ((struct ip*)gh)->ip_id = gre_ip_id; + gh->gi_len = m->m_pkthdr.len; + } + + ifp->if_opackets++; + ifp->if_obytes += m->m_pkthdr.len; + /* + * Send it off and with IP_FORWARD flag to prevent it from + * overwriting the ip_id again. ip_id is already set to the + * ip_id of the encapsulated packet. 
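+ * (The flag actually passed is IP_FORWARDING; it keeps ip_output()
+ * from refilling the header fields prepared above.)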
+ */ + error = ip_output(m, NULL, &sc->route, IP_FORWARDING, + (struct ip_moptions *)NULL, (struct inpcb *)NULL); + end: + sc->called = 0; + if (error) + ifp->if_oerrors++; + return (error); +} + +static int +gre_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) +{ + struct ifreq *ifr = (struct ifreq *)data; + struct if_laddrreq *lifr = (struct if_laddrreq *)data; + struct in_aliasreq *aifr = (struct in_aliasreq *)data; + struct gre_softc *sc = ifp->if_softc; + int s; + struct sockaddr_in si; + struct sockaddr *sa = NULL; + int error, adj; + struct sockaddr_in sp, sm, dp, dm; + uint32_t key; + + error = 0; + adj = 0; + + s = splnet(); + switch (cmd) { + case SIOCSIFADDR: + ifp->if_flags |= IFF_UP; + break; + case SIOCSIFDSTADDR: + break; + case SIOCSIFFLAGS: + /* + * XXXRW: Isn't this priv_check() redundant to the ifnet + * layer check? + */ + if ((error = priv_check(curthread, PRIV_NET_SETIFFLAGS)) != 0) + break; + if ((ifr->ifr_flags & IFF_LINK0) != 0) + sc->g_proto = IPPROTO_GRE; + else + sc->g_proto = IPPROTO_MOBILE; + if ((ifr->ifr_flags & IFF_LINK2) != 0) + sc->wccp_ver = WCCP_V2; + else + sc->wccp_ver = WCCP_V1; + goto recompute; + case SIOCSIFMTU: + /* + * XXXRW: Isn't this priv_check() redundant to the ifnet + * layer check? + */ + if ((error = priv_check(curthread, PRIV_NET_SETIFMTU)) != 0) + break; + if (ifr->ifr_mtu < 576) { + error = EINVAL; + break; + } + ifp->if_mtu = ifr->ifr_mtu; + break; + case SIOCGIFMTU: + ifr->ifr_mtu = GRE2IFP(sc)->if_mtu; + break; + case SIOCADDMULTI: + /* + * XXXRW: Isn't this priv_checkr() redundant to the ifnet + * layer check? + */ + if ((error = priv_check(curthread, PRIV_NET_ADDMULTI)) != 0) + break; + if (ifr == 0) { + error = EAFNOSUPPORT; + break; + } + switch (ifr->ifr_addr.sa_family) { +#ifdef INET + case AF_INET: + break; +#endif +#ifdef INET6 + case AF_INET6: + break; +#endif + default: + error = EAFNOSUPPORT; + break; + } + break; + case SIOCDELMULTI: + /* + * XXXRW: Isn't this priv_check() redundant to the ifnet + * layer check? + */ + if ((error = priv_check(curthread, PRIV_NET_DELIFGROUP)) != 0) + break; + if (ifr == 0) { + error = EAFNOSUPPORT; + break; + } + switch (ifr->ifr_addr.sa_family) { +#ifdef INET + case AF_INET: + break; +#endif +#ifdef INET6 + case AF_INET6: + break; +#endif + default: + error = EAFNOSUPPORT; + break; + } + break; + case GRESPROTO: + /* + * XXXRW: Isn't this priv_check() redundant to the ifnet + * layer check? 
+ */ + if ((error = priv_check(curthread, PRIV_NET_GRE)) != 0) + break; + sc->g_proto = ifr->ifr_flags; + switch (sc->g_proto) { + case IPPROTO_GRE: + ifp->if_flags |= IFF_LINK0; + break; + case IPPROTO_MOBILE: + ifp->if_flags &= ~IFF_LINK0; + break; + default: + error = EPROTONOSUPPORT; + break; + } + goto recompute; + case GREGPROTO: + ifr->ifr_flags = sc->g_proto; + break; + case GRESADDRS: + case GRESADDRD: + error = priv_check(curthread, PRIV_NET_GRE); + if (error) + return (error); + /* + * set tunnel endpoints, compute a less specific route + * to the remote end and mark if as up + */ + sa = &ifr->ifr_addr; + if (cmd == GRESADDRS) + sc->g_src = (satosin(sa))->sin_addr; + if (cmd == GRESADDRD) + sc->g_dst = (satosin(sa))->sin_addr; + recompute: +#ifdef INET + if (sc->encap != NULL) { + encap_detach(sc->encap); + sc->encap = NULL; + } +#endif + if ((sc->g_src.s_addr != INADDR_ANY) && + (sc->g_dst.s_addr != INADDR_ANY)) { + bzero(&sp, sizeof(sp)); + bzero(&sm, sizeof(sm)); + bzero(&dp, sizeof(dp)); + bzero(&dm, sizeof(dm)); + sp.sin_len = sm.sin_len = dp.sin_len = dm.sin_len = + sizeof(struct sockaddr_in); + sp.sin_family = sm.sin_family = dp.sin_family = + dm.sin_family = AF_INET; + sp.sin_addr = sc->g_src; + dp.sin_addr = sc->g_dst; + sm.sin_addr.s_addr = dm.sin_addr.s_addr = + INADDR_BROADCAST; +#ifdef INET + sc->encap = encap_attach(AF_INET, sc->g_proto, + sintosa(&sp), sintosa(&sm), sintosa(&dp), + sintosa(&dm), (sc->g_proto == IPPROTO_GRE) ? + &in_gre_protosw : &in_mobile_protosw, sc); + if (sc->encap == NULL) + printf("%s: unable to attach encap\n", + if_name(GRE2IFP(sc))); +#endif + if (sc->route.ro_rt != 0) /* free old route */ + RTFREE(sc->route.ro_rt); + if (gre_compute_route(sc) == 0) + ifp->if_drv_flags |= IFF_DRV_RUNNING; + else + ifp->if_drv_flags &= ~IFF_DRV_RUNNING; + } + break; + case GREGADDRS: + memset(&si, 0, sizeof(si)); + si.sin_family = AF_INET; + si.sin_len = sizeof(struct sockaddr_in); + si.sin_addr.s_addr = sc->g_src.s_addr; + sa = sintosa(&si); + ifr->ifr_addr = *sa; + break; + case GREGADDRD: + memset(&si, 0, sizeof(si)); + si.sin_family = AF_INET; + si.sin_len = sizeof(struct sockaddr_in); + si.sin_addr.s_addr = sc->g_dst.s_addr; + sa = sintosa(&si); + ifr->ifr_addr = *sa; + break; + case SIOCSIFPHYADDR: + /* + * XXXRW: Isn't this priv_check() redundant to the ifnet + * layer check? + */ + if ((error = priv_check(curthread, PRIV_NET_SETIFPHYS)) != 0) + break; + if (aifr->ifra_addr.sin_family != AF_INET || + aifr->ifra_dstaddr.sin_family != AF_INET) { + error = EAFNOSUPPORT; + break; + } + if (aifr->ifra_addr.sin_len != sizeof(si) || + aifr->ifra_dstaddr.sin_len != sizeof(si)) { + error = EINVAL; + break; + } + sc->g_src = aifr->ifra_addr.sin_addr; + sc->g_dst = aifr->ifra_dstaddr.sin_addr; + goto recompute; + case SIOCSLIFPHYADDR: + /* + * XXXRW: Isn't this priv_check() redundant to the ifnet + * layer check? + */ + if ((error = priv_check(curthread, PRIV_NET_SETIFPHYS)) != 0) + break; + if (lifr->addr.ss_family != AF_INET || + lifr->dstaddr.ss_family != AF_INET) { + error = EAFNOSUPPORT; + break; + } + if (lifr->addr.ss_len != sizeof(si) || + lifr->dstaddr.ss_len != sizeof(si)) { + error = EINVAL; + break; + } + sc->g_src = (satosin(&lifr->addr))->sin_addr; + sc->g_dst = + (satosin(&lifr->dstaddr))->sin_addr; + goto recompute; + case SIOCDIFPHYADDR: + /* + * XXXRW: Isn't this priv_check() redundant to the ifnet + * layer check? 
+         */
+        if ((error = priv_check(curthread, PRIV_NET_SETIFPHYS)) != 0)
+            break;
+        sc->g_src.s_addr = INADDR_ANY;
+        sc->g_dst.s_addr = INADDR_ANY;
+        goto recompute;
+    case SIOCGLIFPHYADDR:
+        if (sc->g_src.s_addr == INADDR_ANY ||
+            sc->g_dst.s_addr == INADDR_ANY) {
+            error = EADDRNOTAVAIL;
+            break;
+        }
+        memset(&si, 0, sizeof(si));
+        si.sin_family = AF_INET;
+        si.sin_len = sizeof(struct sockaddr_in);
+        si.sin_addr.s_addr = sc->g_src.s_addr;
+        memcpy(&lifr->addr, &si, sizeof(si));
+        si.sin_addr.s_addr = sc->g_dst.s_addr;
+        memcpy(&lifr->dstaddr, &si, sizeof(si));
+        break;
+    case SIOCGIFPSRCADDR:
+#ifdef INET6
+    case SIOCGIFPSRCADDR_IN6:
+#endif
+        if (sc->g_src.s_addr == INADDR_ANY) {
+            error = EADDRNOTAVAIL;
+            break;
+        }
+        memset(&si, 0, sizeof(si));
+        si.sin_family = AF_INET;
+        si.sin_len = sizeof(struct sockaddr_in);
+        si.sin_addr.s_addr = sc->g_src.s_addr;
+        bcopy(&si, &ifr->ifr_addr, sizeof(ifr->ifr_addr));
+        break;
+    case SIOCGIFPDSTADDR:
+#ifdef INET6
+    case SIOCGIFPDSTADDR_IN6:
+#endif
+        if (sc->g_dst.s_addr == INADDR_ANY) {
+            error = EADDRNOTAVAIL;
+            break;
+        }
+        memset(&si, 0, sizeof(si));
+        si.sin_family = AF_INET;
+        si.sin_len = sizeof(struct sockaddr_in);
+        si.sin_addr.s_addr = sc->g_dst.s_addr;
+        bcopy(&si, &ifr->ifr_addr, sizeof(ifr->ifr_addr));
+        break;
+    case GRESKEY:
+        error = priv_check(curthread, PRIV_NET_GRE);
+        if (error)
+            break;
+        error = copyin(ifr->ifr_data, &key, sizeof(key));
+        if (error)
+            break;
+        /* adjust the MTU for the option header */
+        if (key == 0 && sc->key != 0)        /* clear */
+            adj += sizeof(key);
+        else if (key != 0 && sc->key == 0)   /* set */
+            adj -= sizeof(key);
+
+        if (ifp->if_mtu + adj < 576) {
+            error = EINVAL;
+            break;
+        }
+        ifp->if_mtu += adj;
+        sc->key = key;
+        break;
+    case GREGKEY:
+        error = copyout(&sc->key, ifr->ifr_data, sizeof(sc->key));
+        break;
+
+    default:
+        error = EINVAL;
+        break;
+    }
+
+    splx(s);
+    return (error);
+}
+
+/*
+ * Computes a route to our destination that is not the one which would
+ * be taken by ip_output(), as that one will loop back to us.  If the
+ * interface is point-to-point as a--->b, a host routing entry for b
+ * exists.  If we now send a packet to b (e.g. ping b), it comes down
+ * here, gets src=a, dst=b tacked on, and ip_output() would send it
+ * straight back to if_gre.
+ * The goal here is to compute a route to b that is less specific than
+ * a-->b.  We know that such a route exists, as in normal operation we
+ * have at least a default route which matches.
+ */
+static int
+gre_compute_route(struct gre_softc *sc)
+{
+    struct route *ro;
+
+    ro = &sc->route;
+
+    memset(ro, 0, sizeof(struct route));
+    ((struct sockaddr_in *)&ro->ro_dst)->sin_addr = sc->g_dst;
+    ro->ro_dst.sa_family = AF_INET;
+    ro->ro_dst.sa_len = sizeof(ro->ro_dst);
+
+    /*
+     * Toggle the last bit so that our own interface route is not found,
+     * but a less specific one is.  We would rather specify a shorter
+     * mask, but that is not possible here.  Should work, though.  XXX
+     * XXX MRT Use a different FIB for the tunnel to solve this problem.
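+     * Example: with a tunnel destination of 192.0.2.1 the lookup key
+     * becomes 192.0.2.0, which can no longer match the host route
+     * through the tunnel itself but will still match a covering
+     * prefix or the default route.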
+ */ + if ((GRE2IFP(sc)->if_flags & IFF_LINK1) == 0) { + ((struct sockaddr_in *)&ro->ro_dst)->sin_addr.s_addr ^= + htonl(0x01); + } + +#ifdef DIAGNOSTIC + printf("%s: searching for a route to %s", if_name(GRE2IFP(sc)), + inet_ntoa(((struct sockaddr_in *)&ro->ro_dst)->sin_addr)); +#endif + + rtalloc_fib(ro, sc->gre_fibnum); + + /* + * check if this returned a route at all and this route is no + * recursion to ourself + */ + if (ro->ro_rt == NULL || ro->ro_rt->rt_ifp->if_softc == sc) { +#ifdef DIAGNOSTIC + if (ro->ro_rt == NULL) + printf(" - no route found!\n"); + else + printf(" - route loops back to ourself!\n"); +#endif + return EADDRNOTAVAIL; + } + + /* + * now change it back - else ip_output will just drop + * the route and search one to this interface ... + */ + if ((GRE2IFP(sc)->if_flags & IFF_LINK1) == 0) + ((struct sockaddr_in *)&ro->ro_dst)->sin_addr = sc->g_dst; + +#ifdef DIAGNOSTIC + printf(", choosing %s with gateway %s", if_name(ro->ro_rt->rt_ifp), + inet_ntoa(((struct sockaddr_in *)(ro->ro_rt->rt_gateway))->sin_addr)); + printf("\n"); +#endif + + return 0; +} + +/* + * do a checksum of a buffer - much like in_cksum, which operates on + * mbufs. + */ +u_int16_t +gre_in_cksum(u_int16_t *p, u_int len) +{ + u_int32_t sum = 0; + int nwords = len >> 1; + + while (nwords-- != 0) + sum += *p++; + + if (len & 1) { + union { + u_short w; + u_char c[2]; + } u; + u.c[0] = *(u_char *)p; + u.c[1] = 0; + sum += u.w; + } + + /* end-around-carry */ + sum = (sum >> 16) + (sum & 0xffff); + sum += (sum >> 16); + return (~sum); +} + +static int +gremodevent(module_t mod, int type, void *data) +{ + + switch (type) { + case MOD_LOAD: + greattach(); + break; + case MOD_UNLOAD: + if_clone_detach(&gre_cloner); + mtx_destroy(&gre_mtx); + break; + default: + return EOPNOTSUPP; + } + return 0; +} + +static moduledata_t gre_mod = { + "if_gre", + gremodevent, + 0 +}; + +DECLARE_MODULE(if_gre, gre_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); +MODULE_VERSION(if_gre, 1); diff --git a/freebsd/sys/net/if_gre.h b/freebsd/sys/net/if_gre.h new file mode 100644 index 00000000..ff9c63cf --- /dev/null +++ b/freebsd/sys/net/if_gre.h @@ -0,0 +1,194 @@ +/* $NetBSD: if_gre.h,v 1.13 2003/11/10 08:51:52 wiz Exp $ */ +/* $FreeBSD$ */ + +/*- + * Copyright (c) 1998 The NetBSD Foundation, Inc. + * All rights reserved + * + * This code is derived from software contributed to The NetBSD Foundation + * by Heiko W.Rupp + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the NetBSD + * Foundation, Inc. and its contributors. + * 4. Neither the name of The NetBSD Foundation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. 
AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _NET_IF_GRE_H
+#define _NET_IF_GRE_H
+
+#include <sys/ioccom.h>
+#ifdef _KERNEL
+#include <sys/queue.h>
+
+/*
+ * Version of the WCCP; needs to be configured manually, since the
+ * header for version 2 is the same but the IP payload is prepended
+ * with an additional 4-byte field.
+ */
+typedef enum {
+    WCCP_V1 = 0,
+    WCCP_V2
+} wccp_ver_t;
+
+struct gre_softc {
+    struct ifnet *sc_ifp;
+    LIST_ENTRY(gre_softc) sc_list;
+    int gre_unit;
+    int gre_flags;
+    u_int gre_fibnum;       /* use this fib for envelopes */
+    struct in_addr g_src;   /* source address of gre packets */
+    struct in_addr g_dst;   /* destination address of gre packets */
+    struct route route;     /* routing entry that determines where an
+                               encapsulated packet should go */
+    u_char g_proto;         /* protocol of encapsulator */
+
+    const struct encaptab *encap;   /* encapsulation cookie */
+
+    int called;             /* infinite recursion preventer */
+
+    uint32_t key;           /* key included in outgoing GRE packets */
+                            /* zero means none */
+
+    wccp_ver_t wccp_ver;    /* version of the WCCP */
+};
+#define GRE2IFP(sc)     ((sc)->sc_ifp)
+
+
+struct gre_h {
+    u_int16_t flags;        /* GRE flags */
+    u_int16_t ptype;        /* protocol type of payload, typically an
+                               Ethernet protocol type */
+    uint32_t options[0];    /* optional options */
+/*
+ * From here on the fields are optional; presence is indicated by flags:
+ *
+    u_int16_t checksum      checksum (ones' complement of the GRE
+                            header and payload).
+                            Present if (ck_pres | rt_pres == 1).
+                            Valid if (ck_pres == 1).
+    u_int16_t offset        offset from the start of the routing field to
+                            the first octet of the active SRE (see below).
+                            Present if (ck_pres | rt_pres == 1).
+                            Valid if (rt_pres == 1).
+    u_int32_t key           inserted by the encapsulator, e.g. for
+                            authentication.
+                            Present if (key_pres == 1).
+    u_int32_t seq_num       sequence number, to allow for packet ordering.
+                            Present if (seq_pres == 1).
+    struct gre_sre[] routing  routing fields (see below).
+                            Present if (rt_pres == 1).
+ */
+} __packed;
+
+struct greip {
+    struct ip gi_i;
+    struct gre_h gi_g;
+} __packed;
+
+#define gi_pr       gi_i.ip_p
+#define gi_len      gi_i.ip_len
+#define gi_src      gi_i.ip_src
+#define gi_dst      gi_i.ip_dst
+#define gi_ptype    gi_g.ptype
+#define gi_flags    gi_g.flags
+#define gi_options  gi_g.options
+
+#define GRE_CP      0x8000  /* Checksum Present */
+#define GRE_RP      0x4000  /* Routing Present */
+#define GRE_KP      0x2000  /* Key Present */
+#define GRE_SP      0x1000  /* Sequence Present */
+#define GRE_SS      0x0800  /* Strict Source Route */
+
+/*
+ * Cisco uses a special type for GRE tunnels created as part of a WCCP
+ * connection, while in fact those packets are just IPv4 encapsulated
+ * into GRE.
+ */
+#define WCCP_PROTOCOL_TYPE  0x883E
+
+/*
+ * gre_sre defines a Source route Entry.
These are needed if packets
+ * are to be routed over more than one tunnel, hop by hop.
+ */
+struct gre_sre {
+    u_int16_t sre_family;   /* address family */
+    u_char sre_offset;      /* offset to first octet of active entry */
+    u_char sre_length;      /* number of octets in the SRE.
+                               sre_length == 0 -> last entry. */
+    u_char *sre_rtinfo;     /* the routing information */
+};
+
+struct greioctl {
+    int unit;
+    struct in_addr addr;
+};
+
+/* for mobile encaps */
+
+struct mobile_h {
+    u_int16_t proto;        /* protocol and S-bit */
+    u_int16_t hcrc;         /* header checksum */
+    u_int32_t odst;         /* original destination address */
+    u_int32_t osrc;         /* original source addr, if S-bit set */
+} __packed;
+
+struct mobip_h {
+    struct ip mi;
+    struct mobile_h mh;
+} __packed;
+
+
+#define MOB_HH_SIZ_S    (sizeof(struct mobile_h) - sizeof(u_int32_t))
+#define MOB_HH_SIZ_L    (sizeof(struct mobile_h))
+#define MOB_HH_SBIT     0x0080
+
+#define GRE_TTL 30
+
+#endif /* _KERNEL */
+
+/*
+ * ioctls needed to manipulate the interface
+ */
+
+#define GRESADDRS   _IOW('i', 101, struct ifreq)
+#define GRESADDRD   _IOW('i', 102, struct ifreq)
+#define GREGADDRS   _IOWR('i', 103, struct ifreq)
+#define GREGADDRD   _IOWR('i', 104, struct ifreq)
+#define GRESPROTO   _IOW('i', 105, struct ifreq)
+#define GREGPROTO   _IOWR('i', 106, struct ifreq)
+#define GREGKEY     _IOWR('i', 107, struct ifreq)
+#define GRESKEY     _IOW('i', 108, struct ifreq)
+
+#ifdef _KERNEL
+LIST_HEAD(gre_softc_head, gre_softc);
+extern struct mtx gre_mtx;
+extern struct gre_softc_head gre_softc_list;
+
+u_int16_t gre_in_cksum(u_int16_t *, u_int);
+#endif /* _KERNEL */
+
+#endif
diff --git a/freebsd/sys/net/if_iso88025subr.c b/freebsd/sys/net/if_iso88025subr.c
new file mode 100644
index 00000000..87d3eb87
--- /dev/null
+++ b/freebsd/sys/net/if_iso88025subr.c
@@ -0,0 +1,831 @@
+#include 
+
+/*-
+ * Copyright (c) 1998, Larry Lile
+ * All rights reserved.
+ *
+ * For latest sources and information on this driver, please
+ * go to http://anarchy.stdio.com.
+ *
+ * Questions, comments or suggestions should be directed to
+ * Larry Lile.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * + * $FreeBSD$ + * + */ + +/* + * + * General ISO 802.5 (Token Ring) support routines + * + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#if defined(INET) || defined(INET6) +#include +#include +#include +#endif +#ifdef INET6 +#include +#endif + +#ifdef IPX +#include +#include +#endif + +#include + +static const u_char iso88025_broadcastaddr[ISO88025_ADDR_LEN] = + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; + +static int iso88025_resolvemulti (struct ifnet *, struct sockaddr **, + struct sockaddr *); + +#define senderr(e) do { error = (e); goto bad; } while (0) + +/* + * Perform common duties while attaching to interface list + */ +void +iso88025_ifattach(struct ifnet *ifp, const u_int8_t *lla, int bpf) +{ + struct ifaddr *ifa; + struct sockaddr_dl *sdl; + + ifa = NULL; + + ifp->if_type = IFT_ISO88025; + ifp->if_addrlen = ISO88025_ADDR_LEN; + ifp->if_hdrlen = ISO88025_HDR_LEN; + + if_attach(ifp); /* Must be called before additional assignments */ + + ifp->if_output = iso88025_output; + ifp->if_input = iso88025_input; + ifp->if_resolvemulti = iso88025_resolvemulti; + ifp->if_broadcastaddr = iso88025_broadcastaddr; + + if (ifp->if_baudrate == 0) + ifp->if_baudrate = TR_16MBPS; /* 16Mbit should be a safe default */ + if (ifp->if_mtu == 0) + ifp->if_mtu = ISO88025_DEFAULT_MTU; + + ifa = ifp->if_addr; + KASSERT(ifa != NULL, ("%s: no lladdr!\n", __func__)); + + sdl = (struct sockaddr_dl *)ifa->ifa_addr; + sdl->sdl_type = IFT_ISO88025; + sdl->sdl_alen = ifp->if_addrlen; + bcopy(lla, LLADDR(sdl), ifp->if_addrlen); + + if (bpf) + bpfattach(ifp, DLT_IEEE802, ISO88025_HDR_LEN); + + return; +} + +/* + * Perform common duties while detaching a Token Ring interface + */ +void +iso88025_ifdetach(ifp, bpf) + struct ifnet *ifp; + int bpf; +{ + + if (bpf) + bpfdetach(ifp); + + if_detach(ifp); + + return; +} + +int +iso88025_ioctl(struct ifnet *ifp, u_long command, caddr_t data) +{ + struct ifaddr *ifa; + struct ifreq *ifr; + int error; + + ifa = (struct ifaddr *) data; + ifr = (struct ifreq *) data; + error = 0; + + switch (command) { + case SIOCSIFADDR: + ifp->if_flags |= IFF_UP; + + switch (ifa->ifa_addr->sa_family) { +#ifdef INET + case AF_INET: + ifp->if_init(ifp->if_softc); /* before arpwhohas */ + arp_ifinit(ifp, ifa); + break; +#endif /* INET */ +#ifdef IPX + /* + * XXX - This code is probably wrong + */ + case AF_IPX: { + struct ipx_addr *ina; + + ina = &(IA_SIPX(ifa)->sipx_addr); + + if (ipx_nullhost(*ina)) + ina->x_host = *(union ipx_host *) + IF_LLADDR(ifp); + else + bcopy((caddr_t) ina->x_host.c_host, + (caddr_t) IF_LLADDR(ifp), + ISO88025_ADDR_LEN); + + /* + * Set new address + */ + ifp->if_init(ifp->if_softc); + } + break; +#endif /* IPX */ + default: + ifp->if_init(ifp->if_softc); + break; + } + break; + + case SIOCGIFADDR: { + struct sockaddr *sa; + + sa = (struct sockaddr *) & ifr->ifr_data; + bcopy(IF_LLADDR(ifp), + (caddr_t) sa->sa_data, ISO88025_ADDR_LEN); + } + break; + + case SIOCSIFMTU: + /* + * Set the interface MTU. + */ + if (ifr->ifr_mtu > ISO88025_MAX_MTU) { + error = EINVAL; + } else { + ifp->if_mtu = ifr->ifr_mtu; + } + break; + default: + error = EINVAL; /* XXX netbsd has ENOTTY??? 
*/ + break; + } + + return (error); +} + +/* + * ISO88025 encapsulation + */ +int +iso88025_output(ifp, m, dst, ro) + struct ifnet *ifp; + struct mbuf *m; + struct sockaddr *dst; + struct route *ro; +{ + u_int16_t snap_type = 0; + int loop_copy = 0, error = 0, rif_len = 0; + u_char edst[ISO88025_ADDR_LEN]; + struct iso88025_header *th; + struct iso88025_header gen_th; + struct sockaddr_dl *sdl = NULL; + struct rtentry *rt0 = NULL; +#if defined(INET) || defined(INET6) + struct llentry *lle; +#endif + + if (ro != NULL) + rt0 = ro->ro_rt; + +#ifdef MAC + error = mac_ifnet_check_transmit(ifp, m); + if (error) + senderr(error); +#endif + + if (ifp->if_flags & IFF_MONITOR) + senderr(ENETDOWN); + if (!((ifp->if_flags & IFF_UP) && + (ifp->if_drv_flags & IFF_DRV_RUNNING))) + senderr(ENETDOWN); + getmicrotime(&ifp->if_lastchange); + + /* Calculate routing info length based on arp table entry */ + /* XXX any better way to do this ? */ + + if (rt0 && (sdl = (struct sockaddr_dl *)rt0->rt_gateway)) + if (SDL_ISO88025(sdl)->trld_rcf != 0) + rif_len = TR_RCF_RIFLEN(SDL_ISO88025(sdl)->trld_rcf); + + /* Generate a generic 802.5 header for the packet */ + gen_th.ac = TR_AC; + gen_th.fc = TR_LLC_FRAME; + (void)memcpy((caddr_t)gen_th.iso88025_shost, IF_LLADDR(ifp), + ISO88025_ADDR_LEN); + if (rif_len) { + gen_th.iso88025_shost[0] |= TR_RII; + if (rif_len > 2) { + gen_th.rcf = SDL_ISO88025(sdl)->trld_rcf; + (void)memcpy((caddr_t)gen_th.rd, + (caddr_t)SDL_ISO88025(sdl)->trld_route, + rif_len - 2); + } + } + + switch (dst->sa_family) { +#ifdef INET + case AF_INET: + error = arpresolve(ifp, rt0, m, dst, edst, &lle); + if (error) + return (error == EWOULDBLOCK ? 0 : error); + snap_type = ETHERTYPE_IP; + break; + case AF_ARP: + { + struct arphdr *ah; + ah = mtod(m, struct arphdr *); + ah->ar_hrd = htons(ARPHRD_IEEE802); + + loop_copy = -1; /* if this is for us, don't do it */ + + switch(ntohs(ah->ar_op)) { + case ARPOP_REVREQUEST: + case ARPOP_REVREPLY: + snap_type = ETHERTYPE_REVARP; + break; + case ARPOP_REQUEST: + case ARPOP_REPLY: + default: + snap_type = ETHERTYPE_ARP; + break; + } + + if (m->m_flags & M_BCAST) + bcopy(ifp->if_broadcastaddr, edst, ISO88025_ADDR_LEN); + else + bcopy(ar_tha(ah), edst, ISO88025_ADDR_LEN); + + } + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + error = nd6_storelladdr(ifp, m, dst, (u_char *)edst, &lle); + if (error) + return (error); + snap_type = ETHERTYPE_IPV6; + break; +#endif /* INET6 */ +#ifdef IPX + case AF_IPX: + { + u_int8_t *cp; + + bcopy((caddr_t)&(satoipx_addr(dst).x_host), (caddr_t)edst, + ISO88025_ADDR_LEN); + + M_PREPEND(m, 3, M_WAIT); + m = m_pullup(m, 3); + if (m == 0) + senderr(ENOBUFS); + cp = mtod(m, u_int8_t *); + *cp++ = ETHERTYPE_IPX_8022; + *cp++ = ETHERTYPE_IPX_8022; + *cp++ = LLC_UI; + } + break; +#endif /* IPX */ + case AF_UNSPEC: + { + struct iso88025_sockaddr_data *sd; + /* + * For AF_UNSPEC sockaddr.sa_data must contain all of the + * mac information needed to send the packet. This allows + * full mac, llc, and source routing function to be controlled. + * llc and source routing information must already be in the + * mbuf provided, ac/fc are set in sa_data. 
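+         * As an illustration only (dst_mac and src_mac are hypothetical
+         * placeholders, not part of this file), a caller could set this
+         * up as:
+         *
+         *    struct sockaddr sa;
+         *    struct iso88025_sockaddr_data *sd =
+         *        (struct iso88025_sockaddr_data *)sa.sa_data;
+         *
+         *    sa.sa_family = AF_UNSPEC;
+         *    sa.sa_len = sizeof(sa);
+         *    sd->ac = TR_AC;
+         *    sd->fc = TR_LLC_FRAME;
+         *    memcpy(sd->ether_dhost, dst_mac, ISO88025_ADDR_LEN);
+         *    memcpy(sd->ether_shost, src_mac, ISO88025_ADDR_LEN);
+         *    ifp->if_output(ifp, m, &sa, NULL);
+         *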
sockaddr.sa_data + * should be an iso88025_sockaddr_data structure see iso88025.h + */ + loop_copy = -1; + sd = (struct iso88025_sockaddr_data *)dst->sa_data; + gen_th.ac = sd->ac; + gen_th.fc = sd->fc; + (void)memcpy((caddr_t)edst, (caddr_t)sd->ether_dhost, + ISO88025_ADDR_LEN); + (void)memcpy((caddr_t)gen_th.iso88025_shost, + (caddr_t)sd->ether_shost, ISO88025_ADDR_LEN); + rif_len = 0; + break; + } + default: + if_printf(ifp, "can't handle af%d\n", dst->sa_family); + senderr(EAFNOSUPPORT); + break; + } + + /* + * Add LLC header. + */ + if (snap_type != 0) { + struct llc *l; + M_PREPEND(m, LLC_SNAPFRAMELEN, M_DONTWAIT); + if (m == 0) + senderr(ENOBUFS); + l = mtod(m, struct llc *); + l->llc_control = LLC_UI; + l->llc_dsap = l->llc_ssap = LLC_SNAP_LSAP; + l->llc_snap.org_code[0] = + l->llc_snap.org_code[1] = + l->llc_snap.org_code[2] = 0; + l->llc_snap.ether_type = htons(snap_type); + } + + /* + * Add local net header. If no space in first mbuf, + * allocate another. + */ + M_PREPEND(m, ISO88025_HDR_LEN + rif_len, M_DONTWAIT); + if (m == 0) + senderr(ENOBUFS); + th = mtod(m, struct iso88025_header *); + bcopy((caddr_t)edst, (caddr_t)&gen_th.iso88025_dhost, ISO88025_ADDR_LEN); + + /* Copy as much of the generic header as is needed into the mbuf */ + memcpy(th, &gen_th, ISO88025_HDR_LEN + rif_len); + + /* + * If a simplex interface, and the packet is being sent to our + * Ethernet address or a broadcast address, loopback a copy. + * XXX To make a simplex device behave exactly like a duplex + * device, we should copy in the case of sending to our own + * ethernet address (thus letting the original actually appear + * on the wire). However, we don't do that here for security + * reasons and compatibility with the original behavior. + */ + if ((ifp->if_flags & IFF_SIMPLEX) && (loop_copy != -1)) { + if ((m->m_flags & M_BCAST) || (loop_copy > 0)) { + struct mbuf *n; + n = m_copy(m, 0, (int)M_COPYALL); + (void) if_simloop(ifp, n, dst->sa_family, + ISO88025_HDR_LEN); + } else if (bcmp(th->iso88025_dhost, th->iso88025_shost, + ETHER_ADDR_LEN) == 0) { + (void) if_simloop(ifp, m, dst->sa_family, + ISO88025_HDR_LEN); + return(0); /* XXX */ + } + } + + IFQ_HANDOFF_ADJ(ifp, m, ISO88025_HDR_LEN + LLC_SNAPFRAMELEN, error); + if (error) { + printf("iso88025_output: packet dropped QFULL.\n"); + ifp->if_oerrors++; + } + return (error); + +bad: + ifp->if_oerrors++; + if (m) + m_freem(m); + return (error); +} + +/* + * ISO 88025 de-encapsulation + */ +void +iso88025_input(ifp, m) + struct ifnet *ifp; + struct mbuf *m; +{ + struct iso88025_header *th; + struct llc *l; + int isr; + int mac_hdr_len; + + /* + * Do consistency checks to verify assumptions + * made by code past this point. + */ + if ((m->m_flags & M_PKTHDR) == 0) { + if_printf(ifp, "discard frame w/o packet header\n"); + ifp->if_ierrors++; + m_freem(m); + return; + } + if (m->m_pkthdr.rcvif == NULL) { + if_printf(ifp, "discard frame w/o interface pointer\n"); + ifp->if_ierrors++; + m_freem(m); + return; + } + + m = m_pullup(m, ISO88025_HDR_LEN); + if (m == NULL) { + ifp->if_ierrors++; + goto dropanyway; + } + th = mtod(m, struct iso88025_header *); + m->m_pkthdr.header = (void *)th; + + /* + * Discard packet if interface is not up. + */ + if (!((ifp->if_flags & IFF_UP) && + (ifp->if_drv_flags & IFF_DRV_RUNNING))) + goto dropanyway; + + /* + * Give bpf a chance at the packet. + */ + BPF_MTAP(ifp, m); + + /* + * Interface marked for monitoring; discard packet. 
+ */ + if (ifp->if_flags & IFF_MONITOR) { + m_freem(m); + return; + } + +#ifdef MAC + mac_ifnet_create_mbuf(ifp, m); +#endif + + /* + * Update interface statistics. + */ + ifp->if_ibytes += m->m_pkthdr.len; + getmicrotime(&ifp->if_lastchange); + + /* + * Discard non local unicast packets when interface + * is in promiscuous mode. + */ + if ((ifp->if_flags & IFF_PROMISC) && + ((th->iso88025_dhost[0] & 1) == 0) && + (bcmp(IF_LLADDR(ifp), (caddr_t) th->iso88025_dhost, + ISO88025_ADDR_LEN) != 0)) + goto dropanyway; + + /* + * Set mbuf flags for bcast/mcast. + */ + if (th->iso88025_dhost[0] & 1) { + if (bcmp(iso88025_broadcastaddr, th->iso88025_dhost, + ISO88025_ADDR_LEN) == 0) + m->m_flags |= M_BCAST; + else + m->m_flags |= M_MCAST; + ifp->if_imcasts++; + } + + mac_hdr_len = ISO88025_HDR_LEN; + /* Check for source routing info */ + if (th->iso88025_shost[0] & TR_RII) + mac_hdr_len += TR_RCF_RIFLEN(th->rcf); + + /* Strip off ISO88025 header. */ + m_adj(m, mac_hdr_len); + + m = m_pullup(m, LLC_SNAPFRAMELEN); + if (m == 0) { + ifp->if_ierrors++; + goto dropanyway; + } + l = mtod(m, struct llc *); + + switch (l->llc_dsap) { +#ifdef IPX + case ETHERTYPE_IPX_8022: /* Thanks a bunch Novell */ + if ((l->llc_control != LLC_UI) || + (l->llc_ssap != ETHERTYPE_IPX_8022)) { + ifp->if_noproto++; + goto dropanyway; + } + + th->iso88025_shost[0] &= ~(TR_RII); + m_adj(m, 3); + isr = NETISR_IPX; + break; +#endif /* IPX */ + case LLC_SNAP_LSAP: { + u_int16_t type; + if ((l->llc_control != LLC_UI) || + (l->llc_ssap != LLC_SNAP_LSAP)) { + ifp->if_noproto++; + goto dropanyway; + } + + if (l->llc_snap.org_code[0] != 0 || + l->llc_snap.org_code[1] != 0 || + l->llc_snap.org_code[2] != 0) { + ifp->if_noproto++; + goto dropanyway; + } + + type = ntohs(l->llc_snap.ether_type); + m_adj(m, LLC_SNAPFRAMELEN); + switch (type) { +#ifdef INET + case ETHERTYPE_IP: + th->iso88025_shost[0] &= ~(TR_RII); + if ((m = ip_fastforward(m)) == NULL) + return; + isr = NETISR_IP; + break; + + case ETHERTYPE_ARP: + if (ifp->if_flags & IFF_NOARP) + goto dropanyway; + isr = NETISR_ARP; + break; +#endif /* INET */ +#ifdef IPX_SNAP /* XXX: Not supported! 
*/ + case ETHERTYPE_IPX: + th->iso88025_shost[0] &= ~(TR_RII); + isr = NETISR_IPX; + break; +#endif /* IPX_SNAP */ +#ifdef INET6 + case ETHERTYPE_IPV6: + th->iso88025_shost[0] &= ~(TR_RII); + isr = NETISR_IPV6; + break; +#endif /* INET6 */ + default: + printf("iso88025_input: unexpected llc_snap ether_type 0x%02x\n", type); + ifp->if_noproto++; + goto dropanyway; + } + break; + } +#ifdef ISO + case LLC_ISO_LSAP: + switch (l->llc_control) { + case LLC_UI: + ifp->if_noproto++; + goto dropanyway; + break; + case LLC_XID: + case LLC_XID_P: + if(m->m_len < ISO88025_ADDR_LEN) + goto dropanyway; + l->llc_window = 0; + l->llc_fid = 9; + l->llc_class = 1; + l->llc_dsap = l->llc_ssap = 0; + /* Fall through to */ + case LLC_TEST: + case LLC_TEST_P: + { + struct sockaddr sa; + struct arpcom *ac; + struct iso88025_sockaddr_data *th2; + int i; + u_char c; + + c = l->llc_dsap; + + if (th->iso88025_shost[0] & TR_RII) { /* XXX */ + printf("iso88025_input: dropping source routed LLC_TEST\n"); + goto dropanyway; + } + l->llc_dsap = l->llc_ssap; + l->llc_ssap = c; + if (m->m_flags & (M_BCAST | M_MCAST)) + bcopy((caddr_t)IF_LLADDR(ifp), + (caddr_t)th->iso88025_dhost, + ISO88025_ADDR_LEN); + sa.sa_family = AF_UNSPEC; + sa.sa_len = sizeof(sa); + th2 = (struct iso88025_sockaddr_data *)sa.sa_data; + for (i = 0; i < ISO88025_ADDR_LEN; i++) { + th2->ether_shost[i] = c = th->iso88025_dhost[i]; + th2->ether_dhost[i] = th->iso88025_dhost[i] = + th->iso88025_shost[i]; + th->iso88025_shost[i] = c; + } + th2->ac = TR_AC; + th2->fc = TR_LLC_FRAME; + ifp->if_output(ifp, m, &sa, NULL); + return; + } + default: + printf("iso88025_input: unexpected llc control 0x%02x\n", l->llc_control); + ifp->if_noproto++; + goto dropanyway; + break; + } + break; +#endif /* ISO */ + default: + printf("iso88025_input: unknown dsap 0x%x\n", l->llc_dsap); + ifp->if_noproto++; + goto dropanyway; + break; + } + + netisr_dispatch(isr, m); + return; + +dropanyway: + ifp->if_iqdrops++; + if (m) + m_freem(m); + return; +} + +static int +iso88025_resolvemulti (ifp, llsa, sa) + struct ifnet *ifp; + struct sockaddr **llsa; + struct sockaddr *sa; +{ + struct sockaddr_dl *sdl; +#ifdef INET + struct sockaddr_in *sin; +#endif +#ifdef INET6 + struct sockaddr_in6 *sin6; +#endif + u_char *e_addr; + + switch(sa->sa_family) { + case AF_LINK: + /* + * No mapping needed. Just check that it's a valid MC address. + */ + sdl = (struct sockaddr_dl *)sa; + e_addr = LLADDR(sdl); + if ((e_addr[0] & 1) != 1) { + return (EADDRNOTAVAIL); + } + *llsa = 0; + return (0); + +#ifdef INET + case AF_INET: + sin = (struct sockaddr_in *)sa; + if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) { + return (EADDRNOTAVAIL); + } + sdl = malloc(sizeof *sdl, M_IFMADDR, + M_NOWAIT|M_ZERO); + if (sdl == NULL) + return (ENOMEM); + sdl->sdl_len = sizeof *sdl; + sdl->sdl_family = AF_LINK; + sdl->sdl_index = ifp->if_index; + sdl->sdl_type = IFT_ISO88025; + sdl->sdl_alen = ISO88025_ADDR_LEN; + e_addr = LLADDR(sdl); + ETHER_MAP_IP_MULTICAST(&sin->sin_addr, e_addr); + *llsa = (struct sockaddr *)sdl; + return (0); +#endif +#ifdef INET6 + case AF_INET6: + sin6 = (struct sockaddr_in6 *)sa; + if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { + /* + * An IP6 address of 0 means listen to all + * of the Ethernet multicast address used for IP6. + * (This is used for multicast routers.) 
+ */ + ifp->if_flags |= IFF_ALLMULTI; + *llsa = 0; + return (0); + } + if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) { + return (EADDRNOTAVAIL); + } + sdl = malloc(sizeof *sdl, M_IFMADDR, + M_NOWAIT|M_ZERO); + if (sdl == NULL) + return (ENOMEM); + sdl->sdl_len = sizeof *sdl; + sdl->sdl_family = AF_LINK; + sdl->sdl_index = ifp->if_index; + sdl->sdl_type = IFT_ISO88025; + sdl->sdl_alen = ISO88025_ADDR_LEN; + e_addr = LLADDR(sdl); + ETHER_MAP_IPV6_MULTICAST(&sin6->sin6_addr, e_addr); + *llsa = (struct sockaddr *)sdl; + return (0); +#endif + + default: + /* + * Well, the text isn't quite right, but it's the name + * that counts... + */ + return (EAFNOSUPPORT); + } + + return (0); +} + +MALLOC_DEFINE(M_ISO88025, "arpcom", "802.5 interface internals"); + +static void* +iso88025_alloc(u_char type, struct ifnet *ifp) +{ + struct arpcom *ac; + + ac = malloc(sizeof(struct arpcom), M_ISO88025, M_WAITOK | M_ZERO); + ac->ac_ifp = ifp; + + return (ac); +} + +static void +iso88025_free(void *com, u_char type) +{ + + free(com, M_ISO88025); +} + +static int +iso88025_modevent(module_t mod, int type, void *data) +{ + + switch (type) { + case MOD_LOAD: + if_register_com_alloc(IFT_ISO88025, iso88025_alloc, + iso88025_free); + break; + case MOD_UNLOAD: + if_deregister_com_alloc(IFT_ISO88025); + break; + default: + return EOPNOTSUPP; + } + + return (0); +} + +static moduledata_t iso88025_mod = { + "iso88025", + iso88025_modevent, + 0 +}; + +DECLARE_MODULE(iso88025, iso88025_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); +MODULE_VERSION(iso88025, 1); diff --git a/freebsd/sys/net/if_lagg.c b/freebsd/sys/net/if_lagg.c new file mode 100644 index 00000000..47c72ca2 --- /dev/null +++ b/freebsd/sys/net/if_lagg.c @@ -0,0 +1,1808 @@ +#include + +/* $OpenBSD: if_trunk.c,v 1.30 2007/01/31 06:20:19 reyk Exp $ */ + +/* + * Copyright (c) 2005, 2006 Reyk Floeter + * Copyright (c) 2007 Andrew Thompson + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef INET +#include +#include +#include +#include +#endif + +#ifdef INET6 +#include +#endif + +#include +#include +#include + +/* Special flags we should propagate to the lagg ports. 
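+ * Each entry pairs an interface flag with the function used to acquire
+ * or release a reference on it for a port; see lagg_setflag() below.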
*/ +static struct { + int flag; + int (*func)(struct ifnet *, int); +} lagg_pflags[] = { + {IFF_PROMISC, ifpromisc}, + {IFF_ALLMULTI, if_allmulti}, + {0, NULL} +}; + +SLIST_HEAD(__trhead, lagg_softc) lagg_list; /* list of laggs */ +static struct mtx lagg_list_mtx; +eventhandler_tag lagg_detach_cookie = NULL; + +static int lagg_clone_create(struct if_clone *, int, caddr_t); +static void lagg_clone_destroy(struct ifnet *); +static void lagg_lladdr(struct lagg_softc *, uint8_t *); +static void lagg_capabilities(struct lagg_softc *); +static void lagg_port_lladdr(struct lagg_port *, uint8_t *); +static void lagg_port_setlladdr(void *, int); +static int lagg_port_create(struct lagg_softc *, struct ifnet *); +static int lagg_port_destroy(struct lagg_port *, int); +static struct mbuf *lagg_input(struct ifnet *, struct mbuf *); +static void lagg_linkstate(struct lagg_softc *); +static void lagg_port_state(struct ifnet *, int); +static int lagg_port_ioctl(struct ifnet *, u_long, caddr_t); +static int lagg_port_output(struct ifnet *, struct mbuf *, + struct sockaddr *, struct route *); +static void lagg_port_ifdetach(void *arg __unused, struct ifnet *); +static int lagg_port_checkstacking(struct lagg_softc *); +static void lagg_port2req(struct lagg_port *, struct lagg_reqport *); +static void lagg_init(void *); +static void lagg_stop(struct lagg_softc *); +static int lagg_ioctl(struct ifnet *, u_long, caddr_t); +static int lagg_ether_setmulti(struct lagg_softc *); +static int lagg_ether_cmdmulti(struct lagg_port *, int); +static int lagg_setflag(struct lagg_port *, int, int, + int (*func)(struct ifnet *, int)); +static int lagg_setflags(struct lagg_port *, int status); +static void lagg_start(struct ifnet *); +static int lagg_media_change(struct ifnet *); +static void lagg_media_status(struct ifnet *, struct ifmediareq *); +static struct lagg_port *lagg_link_active(struct lagg_softc *, + struct lagg_port *); +static const void *lagg_gethdr(struct mbuf *, u_int, u_int, void *); + +IFC_SIMPLE_DECLARE(lagg, 0); + +/* Simple round robin */ +static int lagg_rr_attach(struct lagg_softc *); +static int lagg_rr_detach(struct lagg_softc *); +static int lagg_rr_start(struct lagg_softc *, struct mbuf *); +static struct mbuf *lagg_rr_input(struct lagg_softc *, struct lagg_port *, + struct mbuf *); + +/* Active failover */ +static int lagg_fail_attach(struct lagg_softc *); +static int lagg_fail_detach(struct lagg_softc *); +static int lagg_fail_start(struct lagg_softc *, struct mbuf *); +static struct mbuf *lagg_fail_input(struct lagg_softc *, struct lagg_port *, + struct mbuf *); + +/* Loadbalancing */ +static int lagg_lb_attach(struct lagg_softc *); +static int lagg_lb_detach(struct lagg_softc *); +static int lagg_lb_port_create(struct lagg_port *); +static void lagg_lb_port_destroy(struct lagg_port *); +static int lagg_lb_start(struct lagg_softc *, struct mbuf *); +static struct mbuf *lagg_lb_input(struct lagg_softc *, struct lagg_port *, + struct mbuf *); +static int lagg_lb_porttable(struct lagg_softc *, struct lagg_port *); + +/* 802.3ad LACP */ +static int lagg_lacp_attach(struct lagg_softc *); +static int lagg_lacp_detach(struct lagg_softc *); +static int lagg_lacp_start(struct lagg_softc *, struct mbuf *); +static struct mbuf *lagg_lacp_input(struct lagg_softc *, struct lagg_port *, + struct mbuf *); +static void lagg_lacp_lladdr(struct lagg_softc *); + +/* lagg protocol table */ +static const struct { + int ti_proto; + int (*ti_attach)(struct lagg_softc *); +} lagg_protos[] = { + { 
LAGG_PROTO_ROUNDROBIN, lagg_rr_attach }, + { LAGG_PROTO_FAILOVER, lagg_fail_attach }, + { LAGG_PROTO_LOADBALANCE, lagg_lb_attach }, + { LAGG_PROTO_ETHERCHANNEL, lagg_lb_attach }, + { LAGG_PROTO_LACP, lagg_lacp_attach }, + { LAGG_PROTO_NONE, NULL } +}; + +SYSCTL_DECL(_net_link); +SYSCTL_NODE(_net_link, OID_AUTO, lagg, CTLFLAG_RW, 0, "Link Aggregation"); + +static int lagg_failover_rx_all = 0; /* Allow input on any failover links */ +SYSCTL_INT(_net_link_lagg, OID_AUTO, failover_rx_all, CTLFLAG_RW, + &lagg_failover_rx_all, 0, + "Accept input from any interface in a failover lagg"); + +static int +lagg_modevent(module_t mod, int type, void *data) +{ + + switch (type) { + case MOD_LOAD: + mtx_init(&lagg_list_mtx, "if_lagg list", NULL, MTX_DEF); + SLIST_INIT(&lagg_list); + if_clone_attach(&lagg_cloner); + lagg_input_p = lagg_input; + lagg_linkstate_p = lagg_port_state; + lagg_detach_cookie = EVENTHANDLER_REGISTER( + ifnet_departure_event, lagg_port_ifdetach, NULL, + EVENTHANDLER_PRI_ANY); + break; + case MOD_UNLOAD: + EVENTHANDLER_DEREGISTER(ifnet_departure_event, + lagg_detach_cookie); + if_clone_detach(&lagg_cloner); + lagg_input_p = NULL; + lagg_linkstate_p = NULL; + mtx_destroy(&lagg_list_mtx); + break; + default: + return (EOPNOTSUPP); + } + return (0); +} + +static moduledata_t lagg_mod = { + "if_lagg", + lagg_modevent, + 0 +}; + +DECLARE_MODULE(if_lagg, lagg_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); + +#if __FreeBSD_version >= 800000 +/* + * This routine is run via an vlan + * config EVENT + */ +static void +lagg_register_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag) +{ + struct lagg_softc *sc = ifp->if_softc; + struct lagg_port *lp; + + if (ifp->if_softc != arg) /* Not our event */ + return; + + LAGG_RLOCK(sc); + if (!SLIST_EMPTY(&sc->sc_ports)) { + SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) + EVENTHANDLER_INVOKE(vlan_config, lp->lp_ifp, vtag); + } + LAGG_RUNLOCK(sc); +} + +/* + * This routine is run via an vlan + * unconfig EVENT + */ +static void +lagg_unregister_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag) +{ + struct lagg_softc *sc = ifp->if_softc; + struct lagg_port *lp; + + if (ifp->if_softc != arg) /* Not our event */ + return; + + LAGG_RLOCK(sc); + if (!SLIST_EMPTY(&sc->sc_ports)) { + SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) + EVENTHANDLER_INVOKE(vlan_unconfig, lp->lp_ifp, vtag); + } + LAGG_RUNLOCK(sc); +} +#endif + +static int +lagg_clone_create(struct if_clone *ifc, int unit, caddr_t params) +{ + struct lagg_softc *sc; + struct ifnet *ifp; + int i, error = 0; + static const u_char eaddr[6]; /* 00:00:00:00:00:00 */ + + sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO); + ifp = sc->sc_ifp = if_alloc(IFT_ETHER); + if (ifp == NULL) { + free(sc, M_DEVBUF); + return (ENOSPC); + } + + sc->sc_proto = LAGG_PROTO_NONE; + for (i = 0; lagg_protos[i].ti_proto != LAGG_PROTO_NONE; i++) { + if (lagg_protos[i].ti_proto == LAGG_PROTO_DEFAULT) { + sc->sc_proto = lagg_protos[i].ti_proto; + if ((error = lagg_protos[i].ti_attach(sc)) != 0) { + if_free_type(ifp, IFT_ETHER); + free(sc, M_DEVBUF); + return (error); + } + break; + } + } + LAGG_LOCK_INIT(sc); + SLIST_INIT(&sc->sc_ports); + TASK_INIT(&sc->sc_lladdr_task, 0, lagg_port_setlladdr, sc); + + /* Initialise pseudo media types */ + ifmedia_init(&sc->sc_media, 0, lagg_media_change, + lagg_media_status); + ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_AUTO, 0, NULL); + ifmedia_set(&sc->sc_media, IFM_ETHER | IFM_AUTO); + + if_initname(ifp, ifc->ifc_name, unit); + ifp->if_type = IFT_ETHER; + ifp->if_softc = sc; + ifp->if_start = lagg_start; 
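+    /*
+     * lagg_start dequeues frames from if_snd and hands each one to the
+     * active aggregation protocol via the sc_start hook.
+     */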
+ ifp->if_init = lagg_init; + ifp->if_ioctl = lagg_ioctl; + ifp->if_flags = IFF_SIMPLEX | IFF_BROADCAST | IFF_MULTICAST; + + IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen); + ifp->if_snd.ifq_drv_maxlen = ifqmaxlen; + IFQ_SET_READY(&ifp->if_snd); + + /* + * Attach as an ordinary ethernet device, childs will be attached + * as special device IFT_IEEE8023ADLAG. + */ + ether_ifattach(ifp, eaddr); + +#if __FreeBSD_version >= 800000 + sc->vlan_attach = EVENTHANDLER_REGISTER(vlan_config, + lagg_register_vlan, sc, EVENTHANDLER_PRI_FIRST); + sc->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig, + lagg_unregister_vlan, sc, EVENTHANDLER_PRI_FIRST); +#endif + + /* Insert into the global list of laggs */ + mtx_lock(&lagg_list_mtx); + SLIST_INSERT_HEAD(&lagg_list, sc, sc_entries); + mtx_unlock(&lagg_list_mtx); + + return (0); +} + +static void +lagg_clone_destroy(struct ifnet *ifp) +{ + struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; + struct lagg_port *lp; + + LAGG_WLOCK(sc); + + lagg_stop(sc); + ifp->if_flags &= ~IFF_UP; + +#if __FreeBSD_version >= 800000 + EVENTHANDLER_DEREGISTER(vlan_config, sc->vlan_attach); + EVENTHANDLER_DEREGISTER(vlan_unconfig, sc->vlan_detach); +#endif + + /* Shutdown and remove lagg ports */ + while ((lp = SLIST_FIRST(&sc->sc_ports)) != NULL) + lagg_port_destroy(lp, 1); + /* Unhook the aggregation protocol */ + (*sc->sc_detach)(sc); + + LAGG_WUNLOCK(sc); + + ifmedia_removeall(&sc->sc_media); + ether_ifdetach(ifp); + if_free_type(ifp, IFT_ETHER); + + mtx_lock(&lagg_list_mtx); + SLIST_REMOVE(&lagg_list, sc, lagg_softc, sc_entries); + mtx_unlock(&lagg_list_mtx); + + taskqueue_drain(taskqueue_swi, &sc->sc_lladdr_task); + LAGG_LOCK_DESTROY(sc); + free(sc, M_DEVBUF); +} + +static void +lagg_lladdr(struct lagg_softc *sc, uint8_t *lladdr) +{ + struct ifnet *ifp = sc->sc_ifp; + + if (memcmp(lladdr, IF_LLADDR(ifp), ETHER_ADDR_LEN) == 0) + return; + + bcopy(lladdr, IF_LLADDR(ifp), ETHER_ADDR_LEN); + /* Let the protocol know the MAC has changed */ + if (sc->sc_lladdr != NULL) + (*sc->sc_lladdr)(sc); + EVENTHANDLER_INVOKE(iflladdr_event, ifp); +} + +static void +lagg_capabilities(struct lagg_softc *sc) +{ + struct lagg_port *lp; + int cap = ~0, ena = ~0; + u_long hwa = ~0UL; + + LAGG_WLOCK_ASSERT(sc); + + /* Get capabilities from the lagg ports */ + SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { + cap &= lp->lp_ifp->if_capabilities; + ena &= lp->lp_ifp->if_capenable; + hwa &= lp->lp_ifp->if_hwassist; + } + cap = (cap == ~0 ? 0 : cap); + ena = (ena == ~0 ? 0 : ena); + hwa = (hwa == ~0 ? 
0 : hwa); + + if (sc->sc_ifp->if_capabilities != cap || + sc->sc_ifp->if_capenable != ena || + sc->sc_ifp->if_hwassist != hwa) { + sc->sc_ifp->if_capabilities = cap; + sc->sc_ifp->if_capenable = ena; + sc->sc_ifp->if_hwassist = hwa; + getmicrotime(&sc->sc_ifp->if_lastchange); + + if (sc->sc_ifflags & IFF_DEBUG) + if_printf(sc->sc_ifp, + "capabilities 0x%08x enabled 0x%08x\n", cap, ena); + } +} + +static void +lagg_port_lladdr(struct lagg_port *lp, uint8_t *lladdr) +{ + struct lagg_softc *sc = lp->lp_softc; + struct ifnet *ifp = lp->lp_ifp; + struct lagg_llq *llq; + int pending = 0; + + LAGG_WLOCK_ASSERT(sc); + + if (lp->lp_detaching || + memcmp(lladdr, IF_LLADDR(ifp), ETHER_ADDR_LEN) == 0) + return; + + /* Check to make sure its not already queued to be changed */ + SLIST_FOREACH(llq, &sc->sc_llq_head, llq_entries) { + if (llq->llq_ifp == ifp) { + pending = 1; + break; + } + } + + if (!pending) { + llq = malloc(sizeof(struct lagg_llq), M_DEVBUF, M_NOWAIT); + if (llq == NULL) /* XXX what to do */ + return; + } + + /* Update the lladdr even if pending, it may have changed */ + llq->llq_ifp = ifp; + bcopy(lladdr, llq->llq_lladdr, ETHER_ADDR_LEN); + + if (!pending) + SLIST_INSERT_HEAD(&sc->sc_llq_head, llq, llq_entries); + + taskqueue_enqueue(taskqueue_swi, &sc->sc_lladdr_task); +} + +/* + * Set the interface MAC address from a taskqueue to avoid a LOR. + */ +static void +lagg_port_setlladdr(void *arg, int pending) +{ + struct lagg_softc *sc = (struct lagg_softc *)arg; + struct lagg_llq *llq, *head; + struct ifnet *ifp; + int error; + + /* Grab a local reference of the queue and remove it from the softc */ + LAGG_WLOCK(sc); + head = SLIST_FIRST(&sc->sc_llq_head); + SLIST_FIRST(&sc->sc_llq_head) = NULL; + LAGG_WUNLOCK(sc); + + /* + * Traverse the queue and set the lladdr on each ifp. It is safe to do + * unlocked as we have the only reference to it. 
+ */ + for (llq = head; llq != NULL; llq = head) { + ifp = llq->llq_ifp; + + /* Set the link layer address */ + error = if_setlladdr(ifp, llq->llq_lladdr, ETHER_ADDR_LEN); + if (error) + printf("%s: setlladdr failed on %s\n", __func__, + ifp->if_xname); + + head = SLIST_NEXT(llq, llq_entries); + free(llq, M_DEVBUF); + } +} + +static int +lagg_port_create(struct lagg_softc *sc, struct ifnet *ifp) +{ + struct lagg_softc *sc_ptr; + struct lagg_port *lp; + int error = 0; + + LAGG_WLOCK_ASSERT(sc); + + /* Limit the maximal number of lagg ports */ + if (sc->sc_count >= LAGG_MAX_PORTS) + return (ENOSPC); + + /* Check if port has already been associated to a lagg */ + if (ifp->if_lagg != NULL) + return (EBUSY); + + /* XXX Disallow non-ethernet interfaces (this should be any of 802) */ + if (ifp->if_type != IFT_ETHER) + return (EPROTONOSUPPORT); + + /* Allow the first Ethernet member to define the MTU */ + if (SLIST_EMPTY(&sc->sc_ports)) + sc->sc_ifp->if_mtu = ifp->if_mtu; + else if (sc->sc_ifp->if_mtu != ifp->if_mtu) { + if_printf(sc->sc_ifp, "invalid MTU for %s\n", + ifp->if_xname); + return (EINVAL); + } + + if ((lp = malloc(sizeof(struct lagg_port), + M_DEVBUF, M_NOWAIT|M_ZERO)) == NULL) + return (ENOMEM); + + /* Check if port is a stacked lagg */ + mtx_lock(&lagg_list_mtx); + SLIST_FOREACH(sc_ptr, &lagg_list, sc_entries) { + if (ifp == sc_ptr->sc_ifp) { + mtx_unlock(&lagg_list_mtx); + free(lp, M_DEVBUF); + return (EINVAL); + /* XXX disable stacking for the moment, its untested + lp->lp_flags |= LAGG_PORT_STACK; + if (lagg_port_checkstacking(sc_ptr) >= + LAGG_MAX_STACKING) { + mtx_unlock(&lagg_list_mtx); + free(lp, M_DEVBUF); + return (E2BIG); + } + */ + } + } + mtx_unlock(&lagg_list_mtx); + + /* Change the interface type */ + lp->lp_iftype = ifp->if_type; + ifp->if_type = IFT_IEEE8023ADLAG; + ifp->if_lagg = lp; + lp->lp_ioctl = ifp->if_ioctl; + ifp->if_ioctl = lagg_port_ioctl; + lp->lp_output = ifp->if_output; + ifp->if_output = lagg_port_output; + + lp->lp_ifp = ifp; + lp->lp_softc = sc; + + /* Save port link layer address */ + bcopy(IF_LLADDR(ifp), lp->lp_lladdr, ETHER_ADDR_LEN); + + if (SLIST_EMPTY(&sc->sc_ports)) { + sc->sc_primary = lp; + lagg_lladdr(sc, IF_LLADDR(ifp)); + } else { + /* Update link layer address for this port */ + lagg_port_lladdr(lp, IF_LLADDR(sc->sc_ifp)); + } + + /* Insert into the list of ports */ + SLIST_INSERT_HEAD(&sc->sc_ports, lp, lp_entries); + sc->sc_count++; + + /* Update lagg capabilities */ + lagg_capabilities(sc); + lagg_linkstate(sc); + + /* Add multicast addresses and interface flags to this port */ + lagg_ether_cmdmulti(lp, 1); + lagg_setflags(lp, 1); + + if (sc->sc_port_create != NULL) + error = (*sc->sc_port_create)(lp); + if (error) { + /* remove the port again, without calling sc_port_destroy */ + lagg_port_destroy(lp, 0); + return (error); + } + + return (error); +} + +static int +lagg_port_checkstacking(struct lagg_softc *sc) +{ + struct lagg_softc *sc_ptr; + struct lagg_port *lp; + int m = 0; + + LAGG_WLOCK_ASSERT(sc); + + SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { + if (lp->lp_flags & LAGG_PORT_STACK) { + sc_ptr = (struct lagg_softc *)lp->lp_ifp->if_softc; + m = MAX(m, lagg_port_checkstacking(sc_ptr)); + } + } + + return (m + 1); +} + +static int +lagg_port_destroy(struct lagg_port *lp, int runpd) +{ + struct lagg_softc *sc = lp->lp_softc; + struct lagg_port *lp_ptr; + struct lagg_llq *llq; + struct ifnet *ifp = lp->lp_ifp; + + LAGG_WLOCK_ASSERT(sc); + + if (runpd && sc->sc_port_destroy != NULL) + (*sc->sc_port_destroy)(lp); + + /* + * Remove 
multicast addresses and interface flags from this port and + * reset the MAC address, skip if the interface is being detached. + */ + if (!lp->lp_detaching) { + lagg_ether_cmdmulti(lp, 0); + lagg_setflags(lp, 0); + lagg_port_lladdr(lp, lp->lp_lladdr); + } + + /* Restore interface */ + ifp->if_type = lp->lp_iftype; + ifp->if_ioctl = lp->lp_ioctl; + ifp->if_output = lp->lp_output; + ifp->if_lagg = NULL; + + /* Finally, remove the port from the lagg */ + SLIST_REMOVE(&sc->sc_ports, lp, lagg_port, lp_entries); + sc->sc_count--; + + /* Update the primary interface */ + if (lp == sc->sc_primary) { + uint8_t lladdr[ETHER_ADDR_LEN]; + + if ((lp_ptr = SLIST_FIRST(&sc->sc_ports)) == NULL) { + bzero(&lladdr, ETHER_ADDR_LEN); + } else { + bcopy(lp_ptr->lp_lladdr, + lladdr, ETHER_ADDR_LEN); + } + lagg_lladdr(sc, lladdr); + sc->sc_primary = lp_ptr; + + /* Update link layer address for each port */ + SLIST_FOREACH(lp_ptr, &sc->sc_ports, lp_entries) + lagg_port_lladdr(lp_ptr, lladdr); + } + + /* Remove any pending lladdr changes from the queue */ + if (lp->lp_detaching) { + SLIST_FOREACH(llq, &sc->sc_llq_head, llq_entries) { + if (llq->llq_ifp == ifp) { + SLIST_REMOVE(&sc->sc_llq_head, llq, lagg_llq, + llq_entries); + free(llq, M_DEVBUF); + break; /* Only appears once */ + } + } + } + + if (lp->lp_ifflags) + if_printf(ifp, "%s: lp_ifflags unclean\n", __func__); + + free(lp, M_DEVBUF); + + /* Update lagg capabilities */ + lagg_capabilities(sc); + lagg_linkstate(sc); + + return (0); +} + +static int +lagg_port_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) +{ + struct lagg_reqport *rp = (struct lagg_reqport *)data; + struct lagg_softc *sc; + struct lagg_port *lp = NULL; + int error = 0; + + /* Should be checked by the caller */ + if (ifp->if_type != IFT_IEEE8023ADLAG || + (lp = ifp->if_lagg) == NULL || (sc = lp->lp_softc) == NULL) + goto fallback; + + switch (cmd) { + case SIOCGLAGGPORT: + if (rp->rp_portname[0] == '\0' || + ifunit(rp->rp_portname) != ifp) { + error = EINVAL; + break; + } + + LAGG_RLOCK(sc); + if ((lp = ifp->if_lagg) == NULL || lp->lp_softc != sc) { + error = ENOENT; + LAGG_RUNLOCK(sc); + break; + } + + lagg_port2req(lp, rp); + LAGG_RUNLOCK(sc); + break; + + case SIOCSIFCAP: + if (lp->lp_ioctl == NULL) { + error = EINVAL; + break; + } + error = (*lp->lp_ioctl)(ifp, cmd, data); + if (error) + break; + + /* Update lagg interface capabilities */ + LAGG_WLOCK(sc); + lagg_capabilities(sc); + LAGG_WUNLOCK(sc); + break; + + case SIOCSIFMTU: + /* Do not allow the MTU to be changed once joined */ + error = EINVAL; + break; + + default: + goto fallback; + } + + return (error); + +fallback: + if (lp->lp_ioctl != NULL) + return ((*lp->lp_ioctl)(ifp, cmd, data)); + + return (EINVAL); +} + +static int +lagg_port_output(struct ifnet *ifp, struct mbuf *m, + struct sockaddr *dst, struct route *ro) +{ + struct lagg_port *lp = ifp->if_lagg; + struct ether_header *eh; + short type = 0; + + switch (dst->sa_family) { + case pseudo_AF_HDRCMPLT: + case AF_UNSPEC: + eh = (struct ether_header *)dst->sa_data; + type = eh->ether_type; + break; + } + + /* + * Only allow ethernet types required to initiate or maintain the link, + * aggregated frames take a different path. 
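+ * 802.1X/EAPOL traffic has to reach the physical port even while it is
+ * aggregated, hence the ETHERTYPE_PAE exception below; anything else
+ * is dropped with EBUSY.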
+ */ + switch (ntohs(type)) { + case ETHERTYPE_PAE: /* EAPOL PAE/802.1x */ + return ((*lp->lp_output)(ifp, m, dst, ro)); + } + + /* drop any other frames */ + m_freem(m); + return (EBUSY); +} + +static void +lagg_port_ifdetach(void *arg __unused, struct ifnet *ifp) +{ + struct lagg_port *lp; + struct lagg_softc *sc; + + if ((lp = ifp->if_lagg) == NULL) + return; + + sc = lp->lp_softc; + + LAGG_WLOCK(sc); + lp->lp_detaching = 1; + lagg_port_destroy(lp, 1); + LAGG_WUNLOCK(sc); +} + +static void +lagg_port2req(struct lagg_port *lp, struct lagg_reqport *rp) +{ + struct lagg_softc *sc = lp->lp_softc; + + strlcpy(rp->rp_ifname, sc->sc_ifname, sizeof(rp->rp_ifname)); + strlcpy(rp->rp_portname, lp->lp_ifp->if_xname, sizeof(rp->rp_portname)); + rp->rp_prio = lp->lp_prio; + rp->rp_flags = lp->lp_flags; + if (sc->sc_portreq != NULL) + (*sc->sc_portreq)(lp, (caddr_t)&rp->rp_psc); + + /* Add protocol specific flags */ + switch (sc->sc_proto) { + case LAGG_PROTO_FAILOVER: + if (lp == sc->sc_primary) + rp->rp_flags |= LAGG_PORT_MASTER; + if (lp == lagg_link_active(sc, sc->sc_primary)) + rp->rp_flags |= LAGG_PORT_ACTIVE; + break; + + case LAGG_PROTO_ROUNDROBIN: + case LAGG_PROTO_LOADBALANCE: + case LAGG_PROTO_ETHERCHANNEL: + if (LAGG_PORTACTIVE(lp)) + rp->rp_flags |= LAGG_PORT_ACTIVE; + break; + + case LAGG_PROTO_LACP: + /* LACP has a different definition of active */ + if (lacp_isactive(lp)) + rp->rp_flags |= LAGG_PORT_ACTIVE; + if (lacp_iscollecting(lp)) + rp->rp_flags |= LAGG_PORT_COLLECTING; + if (lacp_isdistributing(lp)) + rp->rp_flags |= LAGG_PORT_DISTRIBUTING; + break; + } + +} + +static void +lagg_init(void *xsc) +{ + struct lagg_softc *sc = (struct lagg_softc *)xsc; + struct lagg_port *lp; + struct ifnet *ifp = sc->sc_ifp; + + if (ifp->if_drv_flags & IFF_DRV_RUNNING) + return; + + LAGG_WLOCK(sc); + + ifp->if_drv_flags |= IFF_DRV_RUNNING; + /* Update the port lladdrs */ + SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) + lagg_port_lladdr(lp, IF_LLADDR(ifp)); + + if (sc->sc_init != NULL) + (*sc->sc_init)(sc); + + LAGG_WUNLOCK(sc); +} + +static void +lagg_stop(struct lagg_softc *sc) +{ + struct ifnet *ifp = sc->sc_ifp; + + LAGG_WLOCK_ASSERT(sc); + + if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) + return; + + ifp->if_drv_flags &= ~IFF_DRV_RUNNING; + + if (sc->sc_stop != NULL) + (*sc->sc_stop)(sc); +} + +static int +lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) +{ + struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; + struct lagg_reqall *ra = (struct lagg_reqall *)data; + struct lagg_reqport *rp = (struct lagg_reqport *)data, rpbuf; + struct ifreq *ifr = (struct ifreq *)data; + struct lagg_port *lp; + struct ifnet *tpif; + struct thread *td = curthread; + char *buf, *outbuf; + int count, buflen, len, error = 0; + + bzero(&rpbuf, sizeof(rpbuf)); + + switch (cmd) { + case SIOCGLAGG: + LAGG_RLOCK(sc); + count = 0; + SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) + count++; + buflen = count * sizeof(struct lagg_reqport); + LAGG_RUNLOCK(sc); + + outbuf = malloc(buflen, M_TEMP, M_WAITOK | M_ZERO); + + LAGG_RLOCK(sc); + ra->ra_proto = sc->sc_proto; + if (sc->sc_req != NULL) + (*sc->sc_req)(sc, (caddr_t)&ra->ra_psc); + + count = 0; + buf = outbuf; + len = min(ra->ra_size, buflen); + SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { + if (len < sizeof(rpbuf)) + break; + + lagg_port2req(lp, &rpbuf); + memcpy(buf, &rpbuf, sizeof(rpbuf)); + count++; + buf += sizeof(rpbuf); + len -= sizeof(rpbuf); + } + LAGG_RUNLOCK(sc); + ra->ra_ports = count; + ra->ra_size = count * sizeof(rpbuf); + error = 
copyout(outbuf, ra->ra_port, ra->ra_size); + free(outbuf, M_TEMP); + break; + case SIOCSLAGG: + error = priv_check(td, PRIV_NET_LAGG); + if (error) + break; + if (ra->ra_proto >= LAGG_PROTO_MAX) { + error = EPROTONOSUPPORT; + break; + } + if (sc->sc_proto != LAGG_PROTO_NONE) { + LAGG_WLOCK(sc); + error = sc->sc_detach(sc); + /* Reset protocol and pointers */ + sc->sc_proto = LAGG_PROTO_NONE; + sc->sc_detach = NULL; + sc->sc_start = NULL; + sc->sc_input = NULL; + sc->sc_port_create = NULL; + sc->sc_port_destroy = NULL; + sc->sc_linkstate = NULL; + sc->sc_init = NULL; + sc->sc_stop = NULL; + sc->sc_lladdr = NULL; + sc->sc_req = NULL; + sc->sc_portreq = NULL; + LAGG_WUNLOCK(sc); + } + if (error != 0) + break; + for (int i = 0; i < (sizeof(lagg_protos) / + sizeof(lagg_protos[0])); i++) { + if (lagg_protos[i].ti_proto == ra->ra_proto) { + if (sc->sc_ifflags & IFF_DEBUG) + printf("%s: using proto %u\n", + sc->sc_ifname, + lagg_protos[i].ti_proto); + LAGG_WLOCK(sc); + sc->sc_proto = lagg_protos[i].ti_proto; + if (sc->sc_proto != LAGG_PROTO_NONE) + error = lagg_protos[i].ti_attach(sc); + LAGG_WUNLOCK(sc); + return (error); + } + } + error = EPROTONOSUPPORT; + break; + case SIOCGLAGGPORT: + if (rp->rp_portname[0] == '\0' || + (tpif = ifunit(rp->rp_portname)) == NULL) { + error = EINVAL; + break; + } + + LAGG_RLOCK(sc); + if ((lp = (struct lagg_port *)tpif->if_lagg) == NULL || + lp->lp_softc != sc) { + error = ENOENT; + LAGG_RUNLOCK(sc); + break; + } + + lagg_port2req(lp, rp); + LAGG_RUNLOCK(sc); + break; + case SIOCSLAGGPORT: + error = priv_check(td, PRIV_NET_LAGG); + if (error) + break; + if (rp->rp_portname[0] == '\0' || + (tpif = ifunit(rp->rp_portname)) == NULL) { + error = EINVAL; + break; + } + LAGG_WLOCK(sc); + error = lagg_port_create(sc, tpif); + LAGG_WUNLOCK(sc); + break; + case SIOCSLAGGDELPORT: + error = priv_check(td, PRIV_NET_LAGG); + if (error) + break; + if (rp->rp_portname[0] == '\0' || + (tpif = ifunit(rp->rp_portname)) == NULL) { + error = EINVAL; + break; + } + + LAGG_WLOCK(sc); + if ((lp = (struct lagg_port *)tpif->if_lagg) == NULL || + lp->lp_softc != sc) { + error = ENOENT; + LAGG_WUNLOCK(sc); + break; + } + + error = lagg_port_destroy(lp, 1); + LAGG_WUNLOCK(sc); + break; + case SIOCSIFFLAGS: + /* Set flags on ports too */ + LAGG_WLOCK(sc); + SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { + lagg_setflags(lp, 1); + } + LAGG_WUNLOCK(sc); + + if (!(ifp->if_flags & IFF_UP) && + (ifp->if_drv_flags & IFF_DRV_RUNNING)) { + /* + * If interface is marked down and it is running, + * then stop and disable it. + */ + LAGG_WLOCK(sc); + lagg_stop(sc); + LAGG_WUNLOCK(sc); + } else if ((ifp->if_flags & IFF_UP) && + !(ifp->if_drv_flags & IFF_DRV_RUNNING)) { + /* + * If interface is marked up and it is stopped, then + * start it. + */ + (*ifp->if_init)(sc); + } + break; + case SIOCADDMULTI: + case SIOCDELMULTI: + LAGG_WLOCK(sc); + error = lagg_ether_setmulti(sc); + LAGG_WUNLOCK(sc); + break; + case SIOCSIFMEDIA: + case SIOCGIFMEDIA: + error = ifmedia_ioctl(ifp, ifr, &sc->sc_media, cmd); + break; + + case SIOCSIFCAP: + case SIOCSIFMTU: + /* Do not allow the MTU or caps to be directly changed */ + error = EINVAL; + break; + + default: + error = ether_ioctl(ifp, cmd, data); + break; + } + return (error); +} + +static int +lagg_ether_setmulti(struct lagg_softc *sc) +{ + struct lagg_port *lp; + + LAGG_WLOCK_ASSERT(sc); + + SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { + /* First, remove any existing filter entries. 
*/ + lagg_ether_cmdmulti(lp, 0); + /* copy all addresses from the lagg interface to the port */ + lagg_ether_cmdmulti(lp, 1); + } + return (0); +} + +static int +lagg_ether_cmdmulti(struct lagg_port *lp, int set) +{ + struct lagg_softc *sc = lp->lp_softc; + struct ifnet *ifp = lp->lp_ifp; + struct ifnet *scifp = sc->sc_ifp; + struct lagg_mc *mc; + struct ifmultiaddr *ifma, *rifma = NULL; + struct sockaddr_dl sdl; + int error; + + LAGG_WLOCK_ASSERT(sc); + + bzero((char *)&sdl, sizeof(sdl)); + sdl.sdl_len = sizeof(sdl); + sdl.sdl_family = AF_LINK; + sdl.sdl_type = IFT_ETHER; + sdl.sdl_alen = ETHER_ADDR_LEN; + sdl.sdl_index = ifp->if_index; + + if (set) { + TAILQ_FOREACH(ifma, &scifp->if_multiaddrs, ifma_link) { + if (ifma->ifma_addr->sa_family != AF_LINK) + continue; + bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr), + LLADDR(&sdl), ETHER_ADDR_LEN); + + error = if_addmulti(ifp, (struct sockaddr *)&sdl, &rifma); + if (error) + return (error); + mc = malloc(sizeof(struct lagg_mc), M_DEVBUF, M_NOWAIT); + if (mc == NULL) + return (ENOMEM); + mc->mc_ifma = rifma; + SLIST_INSERT_HEAD(&lp->lp_mc_head, mc, mc_entries); + } + } else { + while ((mc = SLIST_FIRST(&lp->lp_mc_head)) != NULL) { + SLIST_REMOVE(&lp->lp_mc_head, mc, lagg_mc, mc_entries); + if_delmulti_ifma(mc->mc_ifma); + free(mc, M_DEVBUF); + } + } + return (0); +} + +/* Handle a ref counted flag that should be set on the lagg port as well */ +static int +lagg_setflag(struct lagg_port *lp, int flag, int status, + int (*func)(struct ifnet *, int)) +{ + struct lagg_softc *sc = lp->lp_softc; + struct ifnet *scifp = sc->sc_ifp; + struct ifnet *ifp = lp->lp_ifp; + int error; + + LAGG_WLOCK_ASSERT(sc); + + status = status ? (scifp->if_flags & flag) : 0; + /* Now "status" contains the flag value or 0 */ + + /* + * See if the recorded port status differs from what we want it + * to be. If it does, flip it. We record the port's status in + * lp_ifflags so that we won't clear a port flag we haven't set. + * In fact, we don't clear or set port flags directly, but get or + * release references to them. That's why we can be sure that the + * recorded flags still agree with the actual port flags. + */ + if (status != (lp->lp_ifflags & flag)) { + error = (*func)(ifp, status); + if (error) + return (error); + lp->lp_ifflags &= ~flag; + lp->lp_ifflags |= status; + } + return (0); +} + +/* + * Handle IFF_* flags that require certain changes on the lagg port: + * if "status" is true, update the port's flags to match the lagg; + * if "status" is false, forcibly clear the flags set on the port.
+ */ +static int +lagg_setflags(struct lagg_port *lp, int status) +{ + int error, i; + + for (i = 0; lagg_pflags[i].flag; i++) { + error = lagg_setflag(lp, lagg_pflags[i].flag, + status, lagg_pflags[i].func); + if (error) + return (error); + } + return (0); +} + +static void +lagg_start(struct ifnet *ifp) +{ + struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; + struct mbuf *m; + int error = 0; + + LAGG_RLOCK(sc); + /* We need a Tx algorithm and at least one port */ + if (sc->sc_proto == LAGG_PROTO_NONE || sc->sc_count == 0) { + IF_DRAIN(&ifp->if_snd); + LAGG_RUNLOCK(sc); + return; + } + + for (;; error = 0) { + IFQ_DEQUEUE(&ifp->if_snd, m); + if (m == NULL) + break; + + ETHER_BPF_MTAP(ifp, m); + + error = (*sc->sc_start)(sc, m); + if (error == 0) + ifp->if_opackets++; + else + ifp->if_oerrors++; + } + LAGG_RUNLOCK(sc); +} + +static struct mbuf * +lagg_input(struct ifnet *ifp, struct mbuf *m) +{ + struct lagg_port *lp = ifp->if_lagg; + struct lagg_softc *sc = lp->lp_softc; + struct ifnet *scifp = sc->sc_ifp; + + if ((scifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || + (lp->lp_flags & LAGG_PORT_DISABLED) || + sc->sc_proto == LAGG_PROTO_NONE) { + m_freem(m); + return (NULL); + } + + LAGG_RLOCK(sc); + ETHER_BPF_MTAP(scifp, m); + + m = (*sc->sc_input)(sc, lp, m); + + if (m != NULL) { + scifp->if_ipackets++; + scifp->if_ibytes += m->m_pkthdr.len; + + if (scifp->if_flags & IFF_MONITOR) { + m_freem(m); + m = NULL; + } + } + + LAGG_RUNLOCK(sc); + return (m); +} + +static int +lagg_media_change(struct ifnet *ifp) +{ + struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; + + if (sc->sc_ifflags & IFF_DEBUG) + printf("%s\n", __func__); + + /* Ignore */ + return (0); +} + +static void +lagg_media_status(struct ifnet *ifp, struct ifmediareq *imr) +{ + struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; + struct lagg_port *lp; + + imr->ifm_status = IFM_AVALID; + imr->ifm_active = IFM_ETHER | IFM_AUTO; + + LAGG_RLOCK(sc); + SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { + if (LAGG_PORTACTIVE(lp)) + imr->ifm_status |= IFM_ACTIVE; + } + LAGG_RUNLOCK(sc); +} + +static void +lagg_linkstate(struct lagg_softc *sc) +{ + struct lagg_port *lp; + int new_link = LINK_STATE_DOWN; + uint64_t speed; + + /* Our link is considered up if at least one of our ports is active */ + SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { + if (lp->lp_link_state == LINK_STATE_UP) { + new_link = LINK_STATE_UP; + break; + } + } + if_link_state_change(sc->sc_ifp, new_link); + + /* Update if_baudrate to reflect the max possible speed */ + switch (sc->sc_proto) { + case LAGG_PROTO_FAILOVER: + sc->sc_ifp->if_baudrate = sc->sc_primary != NULL ? 
+ sc->sc_primary->lp_ifp->if_baudrate : 0; + break; + case LAGG_PROTO_ROUNDROBIN: + case LAGG_PROTO_LOADBALANCE: + case LAGG_PROTO_ETHERCHANNEL: + speed = 0; + SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) + speed += lp->lp_ifp->if_baudrate; + sc->sc_ifp->if_baudrate = speed; + break; + case LAGG_PROTO_LACP: + /* LACP updates if_baudrate itself */ + break; + } +} + +static void +lagg_port_state(struct ifnet *ifp, int state) +{ + struct lagg_port *lp = (struct lagg_port *)ifp->if_lagg; + struct lagg_softc *sc = NULL; + + if (lp != NULL) + sc = lp->lp_softc; + if (sc == NULL) + return; + + LAGG_WLOCK(sc); + lagg_linkstate(sc); + if (sc->sc_linkstate != NULL) + (*sc->sc_linkstate)(lp); + LAGG_WUNLOCK(sc); +} + +struct lagg_port * +lagg_link_active(struct lagg_softc *sc, struct lagg_port *lp) +{ + struct lagg_port *lp_next, *rval = NULL; + // int new_link = LINK_STATE_DOWN; + + LAGG_RLOCK_ASSERT(sc); + /* + * Search a port which reports an active link state. + */ + + if (lp == NULL) + goto search; + if (LAGG_PORTACTIVE(lp)) { + rval = lp; + goto found; + } + if ((lp_next = SLIST_NEXT(lp, lp_entries)) != NULL && + LAGG_PORTACTIVE(lp_next)) { + rval = lp_next; + goto found; + } + +search: + SLIST_FOREACH(lp_next, &sc->sc_ports, lp_entries) { + if (LAGG_PORTACTIVE(lp_next)) { + rval = lp_next; + goto found; + } + } + +found: + if (rval != NULL) { + /* + * The IEEE 802.1D standard assumes that a lagg with + * multiple ports is always full duplex. This is valid + * for load sharing laggs and if at least two links + * are active. Unfortunately, checking the latter would + * be too expensive at this point. + XXX + if ((sc->sc_capabilities & IFCAP_LAGG_FULLDUPLEX) && + (sc->sc_count > 1)) + new_link = LINK_STATE_FULL_DUPLEX; + else + new_link = rval->lp_link_state; + */ + } + + return (rval); +} + +static const void * +lagg_gethdr(struct mbuf *m, u_int off, u_int len, void *buf) +{ + if (m->m_pkthdr.len < (off + len)) { + return (NULL); + } else if (m->m_len < (off + len)) { + m_copydata(m, off, len, buf); + return (buf); + } + return (mtod(m, char *) + off); +} + +uint32_t +lagg_hashmbuf(struct mbuf *m, uint32_t key) +{ + uint16_t etype; + uint32_t p = 0; + int off; + struct ether_header *eh; + struct ether_vlan_header vlanbuf; + const struct ether_vlan_header *vlan; +#ifdef INET + const struct ip *ip; + struct ip ipbuf; +#endif +#ifdef INET6 + const struct ip6_hdr *ip6; + struct ip6_hdr ip6buf; + uint32_t flow; +#endif + + off = sizeof(*eh); + if (m->m_len < off) + goto out; + eh = mtod(m, struct ether_header *); + etype = ntohs(eh->ether_type); + p = hash32_buf(&eh->ether_shost, ETHER_ADDR_LEN, key); + p = hash32_buf(&eh->ether_dhost, ETHER_ADDR_LEN, p); + + /* Special handling for encapsulating VLAN frames */ + if (m->m_flags & M_VLANTAG) { + p = hash32_buf(&m->m_pkthdr.ether_vtag, + sizeof(m->m_pkthdr.ether_vtag), p); + } else if (etype == ETHERTYPE_VLAN) { + vlan = lagg_gethdr(m, off, sizeof(*vlan), &vlanbuf); + if (vlan == NULL) + goto out; + + p = hash32_buf(&vlan->evl_tag, sizeof(vlan->evl_tag), p); + etype = ntohs(vlan->evl_proto); + off += sizeof(*vlan) - sizeof(*eh); + } + + switch (etype) { +#ifdef INET + case ETHERTYPE_IP: + ip = lagg_gethdr(m, off, sizeof(*ip), &ipbuf); + if (ip == NULL) + goto out; + + p = hash32_buf(&ip->ip_src, sizeof(struct in_addr), p); + p = hash32_buf(&ip->ip_dst, sizeof(struct in_addr), p); + break; +#endif +#ifdef INET6 + case ETHERTYPE_IPV6: + ip6 = lagg_gethdr(m, off, sizeof(*ip6), &ip6buf); + if (ip6 == NULL) + goto out; + + p = hash32_buf(&ip6->ip6_src, 
sizeof(struct in6_addr), p); + p = hash32_buf(&ip6->ip6_dst, sizeof(struct in6_addr), p); + flow = ip6->ip6_flow & IPV6_FLOWLABEL_MASK; + p = hash32_buf(&flow, sizeof(flow), p); /* IPv6 flow label */ + break; +#endif + } +out: + return (p); +} + +int +lagg_enqueue(struct ifnet *ifp, struct mbuf *m) +{ + + return (ifp->if_transmit)(ifp, m); +} + +/* + * Simple round robin aggregation + */ + +static int +lagg_rr_attach(struct lagg_softc *sc) +{ + sc->sc_detach = lagg_rr_detach; + sc->sc_start = lagg_rr_start; + sc->sc_input = lagg_rr_input; + sc->sc_port_create = NULL; + sc->sc_capabilities = IFCAP_LAGG_FULLDUPLEX; + sc->sc_seq = 0; + + return (0); +} + +static int +lagg_rr_detach(struct lagg_softc *sc) +{ + return (0); +} + +static int +lagg_rr_start(struct lagg_softc *sc, struct mbuf *m) +{ + struct lagg_port *lp; + uint32_t p; + + p = atomic_fetchadd_32(&sc->sc_seq, 1); + p %= sc->sc_count; + lp = SLIST_FIRST(&sc->sc_ports); + while (p--) + lp = SLIST_NEXT(lp, lp_entries); + + /* + * Check the port's link state. This will return the next active + * port if the link is down or the port is NULL. + */ + if ((lp = lagg_link_active(sc, lp)) == NULL) { + m_freem(m); + return (ENOENT); + } + + /* Send mbuf */ + return (lagg_enqueue(lp->lp_ifp, m)); +} + +static struct mbuf * +lagg_rr_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) +{ + struct ifnet *ifp = sc->sc_ifp; + + /* Just pass in the packet to our lagg device */ + m->m_pkthdr.rcvif = ifp; + + return (m); +} + +/* + * Active failover + */ + +static int +lagg_fail_attach(struct lagg_softc *sc) +{ + sc->sc_detach = lagg_fail_detach; + sc->sc_start = lagg_fail_start; + sc->sc_input = lagg_fail_input; + sc->sc_port_create = NULL; + sc->sc_port_destroy = NULL; + + return (0); +} + +static int +lagg_fail_detach(struct lagg_softc *sc) +{ + return (0); +} + +static int +lagg_fail_start(struct lagg_softc *sc, struct mbuf *m) +{ + struct lagg_port *lp; + + /* Use the master port if active or the next available port */ + if ((lp = lagg_link_active(sc, sc->sc_primary)) == NULL) { + m_freem(m); + return (ENOENT); + } + + /* Send mbuf */ + return (lagg_enqueue(lp->lp_ifp, m)); +} + +static struct mbuf * +lagg_fail_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) +{ + struct ifnet *ifp = sc->sc_ifp; + struct lagg_port *tmp_tp; + + if (lp == sc->sc_primary || lagg_failover_rx_all) { + m->m_pkthdr.rcvif = ifp; + return (m); + } + + if (!LAGG_PORTACTIVE(sc->sc_primary)) { + tmp_tp = lagg_link_active(sc, sc->sc_primary); + /* + * If tmp_tp is NULL, we've received a packet when all + * our links are down. Weird, but process it anyway.
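 */

/*
 * Illustrative sketch, not part of the imported sources: lagg_rr_start()
 * above spreads packets with a free-running sequence counter taken
 * modulo the port count.  A user-space miniature of that selection:
 */
#include <stdio.h>
#include <stdatomic.h>

static atomic_uint rr_seq;	/* stand-in for sc->sc_seq */

static unsigned
rr_pick(unsigned nports)	/* nports stands in for sc->sc_count */
{
	/* Same idea as atomic_fetchadd_32(&sc->sc_seq, 1) % sc->sc_count */
	return (atomic_fetch_add(&rr_seq, 1) % nports);
}

int
main(void)
{
	for (int i = 0; i < 6; i++)
		printf("packet %d -> port %u\n", i, rr_pick(3));
	return (0);
}

/*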
+ */ + if ((tmp_tp == NULL || tmp_tp == lp)) { + m->m_pkthdr.rcvif = ifp; + return (m); + } + } + + m_freem(m); + return (NULL); +} + +/* + * Loadbalancing + */ + +static int +lagg_lb_attach(struct lagg_softc *sc) +{ + struct lagg_port *lp; + struct lagg_lb *lb; + + if ((lb = (struct lagg_lb *)malloc(sizeof(struct lagg_lb), + M_DEVBUF, M_NOWAIT|M_ZERO)) == NULL) + return (ENOMEM); + + sc->sc_detach = lagg_lb_detach; + sc->sc_start = lagg_lb_start; + sc->sc_input = lagg_lb_input; + sc->sc_port_create = lagg_lb_port_create; + sc->sc_port_destroy = lagg_lb_port_destroy; + sc->sc_capabilities = IFCAP_LAGG_FULLDUPLEX; + + lb->lb_key = arc4random(); + sc->sc_psc = (caddr_t)lb; + + SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) + lagg_lb_port_create(lp); + + return (0); +} + +static int +lagg_lb_detach(struct lagg_softc *sc) +{ + struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc; + if (lb != NULL) + free(lb, M_DEVBUF); + return (0); +} + +static int +lagg_lb_porttable(struct lagg_softc *sc, struct lagg_port *lp) +{ + struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc; + struct lagg_port *lp_next; + int i = 0; + + bzero(&lb->lb_ports, sizeof(lb->lb_ports)); + SLIST_FOREACH(lp_next, &sc->sc_ports, lp_entries) { + if (lp_next == lp) + continue; + if (i >= LAGG_MAX_PORTS) + return (EINVAL); + if (sc->sc_ifflags & IFF_DEBUG) + printf("%s: port %s at index %d\n", + sc->sc_ifname, lp_next->lp_ifname, i); + lb->lb_ports[i++] = lp_next; + } + + return (0); +} + +static int +lagg_lb_port_create(struct lagg_port *lp) +{ + struct lagg_softc *sc = lp->lp_softc; + return (lagg_lb_porttable(sc, NULL)); +} + +static void +lagg_lb_port_destroy(struct lagg_port *lp) +{ + struct lagg_softc *sc = lp->lp_softc; + lagg_lb_porttable(sc, lp); +} + +static int +lagg_lb_start(struct lagg_softc *sc, struct mbuf *m) +{ + struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc; + struct lagg_port *lp = NULL; + uint32_t p = 0; + + if (m->m_flags & M_FLOWID) + p = m->m_pkthdr.flowid; + else + p = lagg_hashmbuf(m, lb->lb_key); + p %= sc->sc_count; + lp = lb->lb_ports[p]; + + /* + * Check the port's link state. This will return the next active + * port if the link is down or the port is NULL. 
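 */

/*
 * Illustrative sketch, not part of the imported sources: lagg_lb_start()
 * above keys each flow to a port by hashing (or by the mbuf's flowid).
 * The kernel uses hash32_buf() from <sys/hash.h>; the hash below is a
 * user-space stand-in of the same general shape, not the exact kernel
 * function.
 */
#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

static uint32_t
hash32(const void *buf, size_t len, uint32_t hash)
{
	const uint8_t *p = buf;

	while (len--)
		hash = (hash << 5) + hash + *p++;	/* h = h * 33 + byte */
	return (hash);
}

int
main(void)
{
	const uint8_t shost[6] = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 };
	const uint8_t dhost[6] = { 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb };
	uint32_t key = 0x5bf03635;	/* arbitrary; lb_key is arc4random() */
	uint32_t p;

	p = hash32(shost, sizeof(shost), key);
	p = hash32(dhost, sizeof(dhost), p);
	printf("flow -> port %u of 3\n", p % 3);	/* p %= sc->sc_count */
	return (0);
}

/*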
+ */ + if ((lp = lagg_link_active(sc, lp)) == NULL) { + m_freem(m); + return (ENOENT); + } + + /* Send mbuf */ + return (lagg_enqueue(lp->lp_ifp, m)); +} + +static struct mbuf * +lagg_lb_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) +{ + struct ifnet *ifp = sc->sc_ifp; + + /* Just pass in the packet to our lagg device */ + m->m_pkthdr.rcvif = ifp; + + return (m); +} + +/* + * 802.3ad LACP + */ + +static int +lagg_lacp_attach(struct lagg_softc *sc) +{ + struct lagg_port *lp; + int error; + + sc->sc_detach = lagg_lacp_detach; + sc->sc_port_create = lacp_port_create; + sc->sc_port_destroy = lacp_port_destroy; + sc->sc_linkstate = lacp_linkstate; + sc->sc_start = lagg_lacp_start; + sc->sc_input = lagg_lacp_input; + sc->sc_init = lacp_init; + sc->sc_stop = lacp_stop; + sc->sc_lladdr = lagg_lacp_lladdr; + sc->sc_req = lacp_req; + sc->sc_portreq = lacp_portreq; + + error = lacp_attach(sc); + if (error) + return (error); + + SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) + lacp_port_create(lp); + + return (error); +} + +static int +lagg_lacp_detach(struct lagg_softc *sc) +{ + struct lagg_port *lp; + int error; + + SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) + lacp_port_destroy(lp); + + /* unlocking is safe here */ + LAGG_WUNLOCK(sc); + error = lacp_detach(sc); + LAGG_WLOCK(sc); + + return (error); +} + +static void +lagg_lacp_lladdr(struct lagg_softc *sc) +{ + struct lagg_port *lp; + + /* purge all the lacp ports */ + SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) + lacp_port_destroy(lp); + + /* add them back in */ + SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) + lacp_port_create(lp); +} + +static int +lagg_lacp_start(struct lagg_softc *sc, struct mbuf *m) +{ + struct lagg_port *lp; + + lp = lacp_select_tx_port(sc, m); + if (lp == NULL) { + m_freem(m); + return (EBUSY); + } + + /* Send mbuf */ + return (lagg_enqueue(lp->lp_ifp, m)); +} + +static struct mbuf * +lagg_lacp_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) +{ + struct ifnet *ifp = sc->sc_ifp; + struct ether_header *eh; + u_short etype; + + eh = mtod(m, struct ether_header *); + etype = ntohs(eh->ether_type); + + /* Tap off LACP control messages */ + if (etype == ETHERTYPE_SLOW) { + m = lacp_input(lp, m); + if (m == NULL) + return (NULL); + } + + /* + * If the port is not collecting or not in the active aggregator then + * free and return. + */ + if (lacp_iscollecting(lp) == 0 || lacp_isactive(lp) == 0) { + m_freem(m); + return (NULL); + } + + m->m_pkthdr.rcvif = ifp; + return (m); +} diff --git a/freebsd/sys/net/if_lagg.h b/freebsd/sys/net/if_lagg.h new file mode 100644 index 00000000..0034c617 --- /dev/null +++ b/freebsd/sys/net/if_lagg.h @@ -0,0 +1,247 @@ +/* $OpenBSD: if_trunk.h,v 1.11 2007/01/31 06:20:19 reyk Exp $ */ + +/* + * Copyright (c) 2005, 2006 Reyk Floeter + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ * + * $FreeBSD$ + */ + +#ifndef _NET_LAGG_H +#define _NET_LAGG_H + +/* + * Global definitions + */ + +#define LAGG_MAX_PORTS 32 /* logically */ +#define LAGG_MAX_NAMESIZE 32 /* name of a protocol */ +#define LAGG_MAX_STACKING 4 /* maximum number of stacked laggs */ + +/* Port flags */ +#define LAGG_PORT_SLAVE 0x00000000 /* normal enslaved port */ +#define LAGG_PORT_MASTER 0x00000001 /* primary port */ +#define LAGG_PORT_STACK 0x00000002 /* stacked lagg port */ +#define LAGG_PORT_ACTIVE 0x00000004 /* port is active */ +#define LAGG_PORT_COLLECTING 0x00000008 /* port is receiving frames */ +#define LAGG_PORT_DISTRIBUTING 0x00000010 /* port is sending frames */ +#define LAGG_PORT_DISABLED 0x00000020 /* port is disabled */ +#define LAGG_PORT_BITS "\20\01MASTER\02STACK\03ACTIVE\04COLLECTING" \ + "\05DISTRIBUTING\06DISABLED" + +/* Supported lagg PROTOs */ +#define LAGG_PROTO_NONE 0 /* no lagg protocol defined */ +#define LAGG_PROTO_ROUNDROBIN 1 /* simple round robin */ +#define LAGG_PROTO_FAILOVER 2 /* active failover */ +#define LAGG_PROTO_LOADBALANCE 3 /* loadbalance */ +#define LAGG_PROTO_LACP 4 /* 802.3ad lacp */ +#define LAGG_PROTO_ETHERCHANNEL 5 /* Cisco FEC */ +#define LAGG_PROTO_MAX 6 + +struct lagg_protos { + const char *lpr_name; + int lpr_proto; +}; + +#define LAGG_PROTO_DEFAULT LAGG_PROTO_FAILOVER +#define LAGG_PROTOS { \ + { "failover", LAGG_PROTO_FAILOVER }, \ + { "fec", LAGG_PROTO_ETHERCHANNEL }, \ + { "lacp", LAGG_PROTO_LACP }, \ + { "loadbalance", LAGG_PROTO_LOADBALANCE }, \ + { "roundrobin", LAGG_PROTO_ROUNDROBIN }, \ + { "none", LAGG_PROTO_NONE }, \ + { "default", LAGG_PROTO_DEFAULT } \ +} + +/* + * lagg ioctls. + */ + +/* + * LACP current operational parameters structure. + */ +struct lacp_opreq { + uint16_t actor_prio; + uint8_t actor_mac[ETHER_ADDR_LEN]; + uint16_t actor_key; + uint16_t actor_portprio; + uint16_t actor_portno; + uint8_t actor_state; + uint16_t partner_prio; + uint8_t partner_mac[ETHER_ADDR_LEN]; + uint16_t partner_key; + uint16_t partner_portprio; + uint16_t partner_portno; + uint8_t partner_state; +}; + +/* lagg port settings */ +struct lagg_reqport { + char rp_ifname[IFNAMSIZ]; /* name of the lagg */ + char rp_portname[IFNAMSIZ]; /* name of the port */ + u_int32_t rp_prio; /* port priority */ + u_int32_t rp_flags; /* port flags */ + union { + struct lacp_opreq rpsc_lacp; + } rp_psc; +#define rp_lacpreq rp_psc.rpsc_lacp +}; + +#define SIOCGLAGGPORT _IOWR('i', 140, struct lagg_reqport) +#define SIOCSLAGGPORT _IOW('i', 141, struct lagg_reqport) +#define SIOCSLAGGDELPORT _IOW('i', 142, struct lagg_reqport) + +/* lagg, ports and options */ +struct lagg_reqall { + char ra_ifname[IFNAMSIZ]; /* name of the lagg */ + u_int ra_proto; /* lagg protocol */ + + size_t ra_size; /* size of buffer */ + struct lagg_reqport *ra_port; /* allocated buffer */ + int ra_ports; /* total port count */ + union { + struct lacp_opreq rpsc_lacp; + } ra_psc; +#define ra_lacpreq ra_psc.rpsc_lacp +}; + +#define SIOCGLAGG _IOWR('i', 143, struct lagg_reqall) +#define SIOCSLAGG _IOW('i', 144, struct lagg_reqall) + +#ifdef _KERNEL +/* + * Internal kernel part + */ + +#define lp_ifname lp_ifp->if_xname /* interface name */ +#define lp_link_state lp_ifp->if_link_state /* link state */ + +#define LAGG_PORTACTIVE(_tp) ( \ + ((_tp)->lp_link_state == LINK_STATE_UP) && \ + ((_tp)->lp_ifp->if_flags & IFF_UP) \ +) + +struct lagg_ifreq { + union { + struct ifreq ifreq; + struct { + char ifr_name[IFNAMSIZ]; + struct sockaddr_storage ifr_ss; + } ifreq_storage; + } ifreq; +}; + +#define sc_ifflags 
sc_ifp->if_flags /* flags */ +#define sc_ifname sc_ifp->if_xname /* name */ +#define sc_capabilities sc_ifp->if_capabilities /* capabilities */ + +#define IFCAP_LAGG_MASK 0xffff0000 /* private capabilities */ +#define IFCAP_LAGG_FULLDUPLEX 0x00010000 /* full duplex with >1 ports */ + +/* Private data used by the loadbalancing protocol */ +struct lagg_lb { + u_int32_t lb_key; + struct lagg_port *lb_ports[LAGG_MAX_PORTS]; +}; + +struct lagg_mc { + struct ifmultiaddr *mc_ifma; + SLIST_ENTRY(lagg_mc) mc_entries; +}; + +/* List of interfaces to have the MAC address modified */ +struct lagg_llq { + struct ifnet *llq_ifp; + uint8_t llq_lladdr[ETHER_ADDR_LEN]; + SLIST_ENTRY(lagg_llq) llq_entries; +}; + +struct lagg_softc { + struct ifnet *sc_ifp; /* virtual interface */ + struct rwlock sc_mtx; + int sc_proto; /* lagg protocol */ + u_int sc_count; /* number of ports */ + struct lagg_port *sc_primary; /* primary port */ + struct ifmedia sc_media; /* media config */ + caddr_t sc_psc; /* protocol data */ + uint32_t sc_seq; /* sequence counter */ + + SLIST_HEAD(__tplhd, lagg_port) sc_ports; /* list of interfaces */ + SLIST_ENTRY(lagg_softc) sc_entries; + + struct task sc_lladdr_task; + SLIST_HEAD(__llqhd, lagg_llq) sc_llq_head; /* interfaces to program + the lladdr on */ + + /* lagg protocol callbacks */ + int (*sc_detach)(struct lagg_softc *); + int (*sc_start)(struct lagg_softc *, struct mbuf *); + struct mbuf *(*sc_input)(struct lagg_softc *, struct lagg_port *, + struct mbuf *); + int (*sc_port_create)(struct lagg_port *); + void (*sc_port_destroy)(struct lagg_port *); + void (*sc_linkstate)(struct lagg_port *); + void (*sc_init)(struct lagg_softc *); + void (*sc_stop)(struct lagg_softc *); + void (*sc_lladdr)(struct lagg_softc *); + void (*sc_req)(struct lagg_softc *, caddr_t); + void (*sc_portreq)(struct lagg_port *, caddr_t); +#if __FreeBSD_version >= 800000 + eventhandler_tag vlan_attach; + eventhandler_tag vlan_detach; +#endif +}; + +struct lagg_port { + struct ifnet *lp_ifp; /* physical interface */ + struct lagg_softc *lp_softc; /* parent lagg */ + uint8_t lp_lladdr[ETHER_ADDR_LEN]; + + u_char lp_iftype; /* interface type */ + uint32_t lp_prio; /* port priority */ + uint32_t lp_flags; /* port flags */ + int lp_ifflags; /* saved ifp flags */ + void *lh_cookie; /* if state hook */ + caddr_t lp_psc; /* protocol data */ + int lp_detaching; /* ifnet is detaching */ + + SLIST_HEAD(__mclhd, lagg_mc) lp_mc_head; /* multicast addresses */ + + /* Redirected callbacks */ + int (*lp_ioctl)(struct ifnet *, u_long, caddr_t); + int (*lp_output)(struct ifnet *, struct mbuf *, struct sockaddr *, + struct route *); + + SLIST_ENTRY(lagg_port) lp_entries; +}; + +#define LAGG_LOCK_INIT(_sc) rw_init(&(_sc)->sc_mtx, "if_lagg rwlock") +#define LAGG_LOCK_DESTROY(_sc) rw_destroy(&(_sc)->sc_mtx) +#define LAGG_RLOCK(_sc) rw_rlock(&(_sc)->sc_mtx) +#define LAGG_WLOCK(_sc) rw_wlock(&(_sc)->sc_mtx) +#define LAGG_RUNLOCK(_sc) rw_runlock(&(_sc)->sc_mtx) +#define LAGG_WUNLOCK(_sc) rw_wunlock(&(_sc)->sc_mtx) +#define LAGG_RLOCK_ASSERT(_sc) rw_assert(&(_sc)->sc_mtx, RA_RLOCKED) +#define LAGG_WLOCK_ASSERT(_sc) rw_assert(&(_sc)->sc_mtx, RA_WLOCKED) + +extern struct mbuf *(*lagg_input_p)(struct ifnet *, struct mbuf *); +extern void (*lagg_linkstate_p)(struct ifnet *, int ); + +int lagg_enqueue(struct ifnet *, struct mbuf *); +uint32_t lagg_hashmbuf(struct mbuf *, uint32_t); + +#endif /* _KERNEL */ + +#endif /* _NET_LAGG_H */ diff --git a/freebsd/sys/net/if_llatbl.c b/freebsd/sys/net/if_llatbl.c new file mode 100644 index 
00000000..b9f78a71 --- /dev/null +++ b/freebsd/sys/net/if_llatbl.c @@ -0,0 +1,528 @@ +#include + +/* + * Copyright (c) 2004 Luigi Rizzo, Alessandro Cerri. All rights reserved. + * Copyright (c) 2004-2008 Qing Li. All rights reserved. + * Copyright (c) 2008 Kip Macy. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef DDB +#include +#endif + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MALLOC_DEFINE(M_LLTABLE, "lltable", "link level address tables"); + +static VNET_DEFINE(SLIST_HEAD(, lltable), lltables); +#define V_lltables VNET(lltables) + +extern void arprequest(struct ifnet *, struct in_addr *, struct in_addr *, + u_char *); + +static void vnet_lltable_init(void); + +struct rwlock lltable_rwlock; +RW_SYSINIT(lltable_rwlock, &lltable_rwlock, "lltable_rwlock"); + +/* + * Dump arp state for a specific address family. + */ +int +lltable_sysctl_dumparp(int af, struct sysctl_req *wr) +{ + struct lltable *llt; + int error = 0; + + LLTABLE_RLOCK(); + SLIST_FOREACH(llt, &V_lltables, llt_link) { + if (llt->llt_af == af) { + error = llt->llt_dump(llt, wr); + if (error != 0) + goto done; + } + } +done: + LLTABLE_RUNLOCK(); + return (error); +} + +/* + * Deletes an address from the address table. + * This function is called by the timer functions + * such as arptimer() and nd6_llinfo_timer(), and + * the caller does the locking. 
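 */

/*
 * Illustrative kernel-context sketch, not part of the imported sources:
 * the contract described above in action.  "llt", "ifp" and "dst" are
 * assumed to be in scope; lla_lookup() with LLE_EXCLUSIVE returns the
 * entry write-locked, which is what llentry_free() asserts.
 *
 *	struct llentry *lle;
 *
 *	IF_AFDATA_LOCK(ifp);
 *	lle = lla_lookup(llt, LLE_EXCLUSIVE, dst);
 *	IF_AFDATA_UNLOCK(ifp);
 *	if (LLE_IS_VALID(lle))
 *		llentry_free(lle);	(unlinks and drops the reference)
 */

/*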
+ */ +void +llentry_free(struct llentry *lle) +{ + + LLE_WLOCK_ASSERT(lle); + LIST_REMOVE(lle, lle_next); + + if (lle->la_hold != NULL) + m_freem(lle->la_hold); + + LLE_FREE_LOCKED(lle); +} + +/* + * Update an llentry for address dst (equivalent to rtalloc for new-arp) + * Caller must pass in a valid struct llentry * (or NULL) + * + * if found the llentry * is returned referenced and unlocked + */ +int +llentry_update(struct llentry **llep, struct lltable *lt, + struct sockaddr_storage *dst, struct ifnet *ifp) +{ + struct llentry *la; + + IF_AFDATA_RLOCK(ifp); + la = lla_lookup(lt, LLE_EXCLUSIVE, + (struct sockaddr *)dst); + IF_AFDATA_RUNLOCK(ifp); + if ((la == NULL) && + (ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) == 0) { + IF_AFDATA_WLOCK(ifp); + la = lla_lookup(lt, + (LLE_CREATE | LLE_EXCLUSIVE), + (struct sockaddr *)dst); + IF_AFDATA_WUNLOCK(ifp); + } + if (la != NULL && (*llep != la)) { + if (*llep != NULL) + LLE_FREE(*llep); + LLE_ADDREF(la); + LLE_WUNLOCK(la); + *llep = la; + } else if (la != NULL) + LLE_WUNLOCK(la); + + if (la == NULL) + return (ENOENT); + + return (0); +} + +/* + * Free all entries from given table and free itself. + */ +void +lltable_free(struct lltable *llt) +{ + struct llentry *lle, *next; + int i; + + KASSERT(llt != NULL, ("%s: llt is NULL", __func__)); + + LLTABLE_WLOCK(); + SLIST_REMOVE(&V_lltables, llt, lltable, llt_link); + LLTABLE_WUNLOCK(); + + for (i=0; i < LLTBL_HASHTBL_SIZE; i++) { + LIST_FOREACH_SAFE(lle, &llt->lle_head[i], lle_next, next) { + int canceled; + + canceled = callout_drain(&lle->la_timer); + LLE_WLOCK(lle); + if (canceled) + LLE_REMREF(lle); + llentry_free(lle); + } + } + + free(llt, M_LLTABLE); +} + +#if 0 +void +lltable_drain(int af) +{ + struct lltable *llt; + struct llentry *lle; + register int i; + + LLTABLE_RLOCK(); + SLIST_FOREACH(llt, &V_lltables, llt_link) { + if (llt->llt_af != af) + continue; + + for (i=0; i < LLTBL_HASHTBL_SIZE; i++) { + LIST_FOREACH(lle, &llt->lle_head[i], lle_next) { + LLE_WLOCK(lle); + if (lle->la_hold) { + m_freem(lle->la_hold); + lle->la_hold = NULL; + } + LLE_WUNLOCK(lle); + } + } + } + LLTABLE_RUNLOCK(); +} +#endif + +void +lltable_prefix_free(int af, struct sockaddr *prefix, struct sockaddr *mask) +{ + struct lltable *llt; + + LLTABLE_RLOCK(); + SLIST_FOREACH(llt, &V_lltables, llt_link) { + if (llt->llt_af != af) + continue; + + llt->llt_prefix_free(llt, prefix, mask); + } + LLTABLE_RUNLOCK(); +} + + + +/* + * Create a new lltable. + */ +struct lltable * +lltable_init(struct ifnet *ifp, int af) +{ + struct lltable *llt; + register int i; + + llt = malloc(sizeof(struct lltable), M_LLTABLE, M_WAITOK); + + llt->llt_af = af; + llt->llt_ifp = ifp; + for (i = 0; i < LLTBL_HASHTBL_SIZE; i++) + LIST_INIT(&llt->lle_head[i]); + + LLTABLE_WLOCK(); + SLIST_INSERT_HEAD(&V_lltables, llt, llt_link); + LLTABLE_WUNLOCK(); + + return (llt); +} + +/* + * Called in route_output when adding/deleting a route to an interface. 
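 */

/*
 * Illustrative sketch, not part of the imported sources: the routing
 * message types map onto LLE_* actions as in lla_rt_output() below.
 * The X_-prefixed constants are copied from <net/route.h> and
 * if_llatbl.h so the example stands alone.
 */
#include <stdio.h>

#define X_RTM_ADD	0x1
#define X_RTM_DELETE	0x2
#define X_LLE_PUB	0x0020
#define X_LLE_DELETE	0x4000
#define X_LLE_CREATE	0x8000

static unsigned
rtm_to_lle_flags(int rtm_type, int announce)
{
	unsigned flags = 0;

	switch (rtm_type) {
	case X_RTM_ADD:
		if (announce)
			flags |= X_LLE_PUB;	/* published (proxy) entry */
		flags |= X_LLE_CREATE;
		break;
	case X_RTM_DELETE:
		flags |= X_LLE_DELETE;
		break;
	}
	return (flags);
}

int
main(void)
{
	printf("RTM_ADD + RTF_ANNOUNCE -> 0x%x\n", rtm_to_lle_flags(X_RTM_ADD, 1));
	printf("RTM_DELETE             -> 0x%x\n", rtm_to_lle_flags(X_RTM_DELETE, 0));
	return (0);
}

/*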
+ */ +int +lla_rt_output(struct rt_msghdr *rtm, struct rt_addrinfo *info) +{ + struct sockaddr_dl *dl = + (struct sockaddr_dl *)info->rti_info[RTAX_GATEWAY]; + struct sockaddr *dst = (struct sockaddr *)info->rti_info[RTAX_DST]; + struct ifnet *ifp; + struct lltable *llt; + struct llentry *lle; + u_int laflags = 0, flags = 0; + int error = 0; + + if (dl == NULL || dl->sdl_family != AF_LINK) { + log(LOG_INFO, "%s: invalid dl\n", __func__); + return EINVAL; + } + ifp = ifnet_byindex(dl->sdl_index); + if (ifp == NULL) { + log(LOG_INFO, "%s: invalid ifp (sdl_index %d)\n", + __func__, dl->sdl_index); + return EINVAL; + } + + switch (rtm->rtm_type) { + case RTM_ADD: + if (rtm->rtm_flags & RTF_ANNOUNCE) { + flags |= LLE_PUB; +#ifdef INET + if (dst->sa_family == AF_INET && + ((struct sockaddr_inarp *)dst)->sin_other != 0) { + struct rtentry *rt; + ((struct sockaddr_inarp *)dst)->sin_other = 0; + rt = rtalloc1(dst, 0, 0); + if (rt == NULL || !(rt->rt_flags & RTF_HOST)) { + log(LOG_INFO, "%s: RTM_ADD publish " + "(proxy only) is invalid\n", + __func__); + if (rt) + RTFREE_LOCKED(rt); + return EINVAL; + } + RTFREE_LOCKED(rt); + + flags |= LLE_PROXY; + } +#endif + } + flags |= LLE_CREATE; + break; + + case RTM_DELETE: + flags |= LLE_DELETE; + break; + + case RTM_CHANGE: + break; + + default: + return EINVAL; /* XXX not implemented yet */ + } + + /* XXX linked list may be too expensive */ + LLTABLE_RLOCK(); + SLIST_FOREACH(llt, &V_lltables, llt_link) { + if (llt->llt_af == dst->sa_family && + llt->llt_ifp == ifp) + break; + } + LLTABLE_RUNLOCK(); + KASSERT(llt != NULL, ("Yep, ugly hacks are bad\n")); + + if (flags & LLE_CREATE) + flags |= LLE_EXCLUSIVE; + + IF_AFDATA_LOCK(ifp); + lle = lla_lookup(llt, flags, dst); + IF_AFDATA_UNLOCK(ifp); + if (LLE_IS_VALID(lle)) { + if (flags & LLE_CREATE) { + /* + * If we delay the delete, then a subsequent + * "arp add" should look up this entry, reset the + * LLE_DELETED flag, and reset the expiration timer + */ + bcopy(LLADDR(dl), &lle->ll_addr, ifp->if_addrlen); + lle->la_flags |= (flags & (LLE_PUB | LLE_PROXY)); + lle->la_flags |= LLE_VALID; + lle->la_flags &= ~LLE_DELETED; +#ifdef INET6 + /* + * ND6 + */ + if (dst->sa_family == AF_INET6) + lle->ln_state = ND6_LLINFO_REACHABLE; +#endif + /* + * NB: arp and ndp always set (RTF_STATIC | RTF_HOST) + */ + + if (rtm->rtm_rmx.rmx_expire == 0) { + lle->la_flags |= LLE_STATIC; + lle->la_expire = 0; + } else + lle->la_expire = rtm->rtm_rmx.rmx_expire; + laflags = lle->la_flags; + LLE_WUNLOCK(lle); +#ifdef INET + /* gratuitous ARP */ + if ((laflags & LLE_PUB) && dst->sa_family == AF_INET) { + arprequest(ifp, + &((struct sockaddr_in *)dst)->sin_addr, + &((struct sockaddr_in *)dst)->sin_addr, + ((laflags & LLE_PROXY) ? 
+ (u_char *)IF_LLADDR(ifp) : + (u_char *)LLADDR(dl))); + } +#endif + } else { + if (flags & LLE_EXCLUSIVE) + LLE_WUNLOCK(lle); + else + LLE_RUNLOCK(lle); + } + } else if ((lle == NULL) && (flags & LLE_DELETE)) + error = EINVAL; + + + return (error); +} + +static void +vnet_lltable_init() +{ + + SLIST_INIT(&V_lltables); +} +VNET_SYSINIT(vnet_lltable_init, SI_SUB_PSEUDO, SI_ORDER_FIRST, + vnet_lltable_init, NULL); + +#ifdef DDB +struct llentry_sa { + struct llentry base; + struct sockaddr l3_addr; +}; + +static void +llatbl_lle_show(struct llentry_sa *la) +{ + struct llentry *lle; + uint8_t octet[6]; + + lle = &la->base; + db_printf("lle=%p\n", lle); + db_printf(" lle_next=%p\n", lle->lle_next.le_next); + db_printf(" lle_lock=%p\n", &lle->lle_lock); + db_printf(" lle_tbl=%p\n", lle->lle_tbl); + db_printf(" lle_head=%p\n", lle->lle_head); + db_printf(" la_hold=%p\n", lle->la_hold); + db_printf(" la_expire=%ju\n", (uintmax_t)lle->la_expire); + db_printf(" la_flags=0x%04x\n", lle->la_flags); + db_printf(" la_asked=%u\n", lle->la_asked); + db_printf(" la_preempt=%u\n", lle->la_preempt); + db_printf(" ln_byhint=%u\n", lle->ln_byhint); + db_printf(" ln_state=%d\n", lle->ln_state); + db_printf(" ln_router=%u\n", lle->ln_router); + db_printf(" ln_ntick=%ju\n", (uintmax_t)lle->ln_ntick); + db_printf(" lle_refcnt=%d\n", lle->lle_refcnt); + bcopy(&lle->ll_addr.mac16, octet, sizeof(octet)); + db_printf(" ll_addr=%02x:%02x:%02x:%02x:%02x:%02x\n", + octet[0], octet[1], octet[2], octet[3], octet[4], octet[5]); + db_printf(" la_timer=%p\n", &lle->la_timer); + + switch (la->l3_addr.sa_family) { +#ifdef INET + case AF_INET: + { + struct sockaddr_in *sin; + char l3s[INET_ADDRSTRLEN]; + + sin = (struct sockaddr_in *)&la->l3_addr; + inet_ntoa_r(sin->sin_addr, l3s); + db_printf(" l3_addr=%s\n", l3s); + break; + } +#endif +#ifdef INET6 + case AF_INET6: + { + struct sockaddr_in6 *sin6; + char l3s[INET6_ADDRSTRLEN]; + + sin6 = (struct sockaddr_in6 *)&la->l3_addr; + ip6_sprintf(l3s, &sin6->sin6_addr); + db_printf(" l3_addr=%s\n", l3s); + break; + } +#endif + default: + db_printf(" l3_addr=N/A (af=%d)\n", la->l3_addr.sa_family); + break; + } +} + +DB_SHOW_COMMAND(llentry, db_show_llentry) +{ + + if (!have_addr) { + db_printf("usage: show llentry \n"); + return; + } + + llatbl_lle_show((struct llentry_sa *)addr); +} + +static void +llatbl_llt_show(struct lltable *llt) +{ + int i; + struct llentry *lle; + + db_printf("llt=%p llt_af=%d llt_ifp=%p\n", + llt, llt->llt_af, llt->llt_ifp); + + for (i = 0; i < LLTBL_HASHTBL_SIZE; i++) { + LIST_FOREACH(lle, &llt->lle_head[i], lle_next) { + + llatbl_lle_show((struct llentry_sa *)lle); + if (db_pager_quit) + return; + } + } +} + +DB_SHOW_COMMAND(lltable, db_show_lltable) +{ + + if (!have_addr) { + db_printf("usage: show lltable \n"); + return; + } + + llatbl_llt_show((struct lltable *)addr); +} + +DB_SHOW_ALL_COMMAND(lltables, db_show_all_lltables) +{ + VNET_ITERATOR_DECL(vnet_iter); + struct lltable *llt; + + VNET_FOREACH(vnet_iter) { + CURVNET_SET_QUIET(vnet_iter); +#ifdef VIMAGE + db_printf("vnet=%p\n", curvnet); +#endif + SLIST_FOREACH(llt, &V_lltables, llt_link) { + db_printf("llt=%p llt_af=%d llt_ifp=%p(%s)\n", + llt, llt->llt_af, llt->llt_ifp, + (llt->llt_ifp != NULL) ? 
+ llt->llt_ifp->if_xname : "?"); + if (have_addr && addr != 0) /* verbose */ + llatbl_llt_show(llt); + if (db_pager_quit) { + CURVNET_RESTORE(); + return; + } + } + CURVNET_RESTORE(); + } +} +#endif diff --git a/freebsd/sys/net/if_llatbl.h b/freebsd/sys/net/if_llatbl.h new file mode 100644 index 00000000..9e12362b --- /dev/null +++ b/freebsd/sys/net/if_llatbl.h @@ -0,0 +1,208 @@ +/* + * Copyright (c) 2004 Luigi Rizzo, Alessandro Cerri. All rights reserved. + * Copyright (c) 2004-2008 Qing Li. All rights reserved. + * Copyright (c) 2008 Kip Macy. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +#include +__FBSDID("$FreeBSD$"); + +#ifndef _NET_IF_LLATBL_HH_ +#define _NET_IF_LLATBL_HH_ + +#include +#include + +struct ifnet; +struct sysctl_req; +struct rt_msghdr; +struct rt_addrinfo; + +struct llentry; +LIST_HEAD(llentries, llentry); + +extern struct rwlock lltable_rwlock; +#define LLTABLE_RLOCK() rw_rlock(&lltable_rwlock) +#define LLTABLE_RUNLOCK() rw_runlock(&lltable_rwlock) +#define LLTABLE_WLOCK() rw_wlock(&lltable_rwlock) +#define LLTABLE_WUNLOCK() rw_wunlock(&lltable_rwlock) +#define LLTABLE_LOCK_ASSERT() rw_assert(&lltable_rwlock, RA_LOCKED) + +/* + * Code referencing llentry must at least hold + * a shared lock + */ +struct llentry { + LIST_ENTRY(llentry) lle_next; + struct rwlock lle_lock; + struct lltable *lle_tbl; + struct llentries *lle_head; + struct mbuf *la_hold; + time_t la_expire; + uint16_t la_flags; + uint16_t la_asked; + uint16_t la_preempt; + uint16_t ln_byhint; + int16_t ln_state; /* IPv6 has ND6_LLINFO_NOSTATE == -2 */ + uint16_t ln_router; + time_t ln_ntick; + int lle_refcnt; + + union { + uint64_t mac_aligned; + uint16_t mac16[3]; + } ll_addr; + + /* XXX af-private? 
*/ + union { + struct callout ln_timer_ch; + struct callout la_timer; + } lle_timer; + /* NB: struct sockaddr must immediately follow */ +}; + +#define LLE_WLOCK(lle) rw_wlock(&(lle)->lle_lock) +#define LLE_RLOCK(lle) rw_rlock(&(lle)->lle_lock) +#define LLE_WUNLOCK(lle) rw_wunlock(&(lle)->lle_lock) +#define LLE_RUNLOCK(lle) rw_runlock(&(lle)->lle_lock) +#define LLE_DOWNGRADE(lle) rw_downgrade(&(lle)->lle_lock) +#define LLE_TRY_UPGRADE(lle) rw_try_upgrade(&(lle)->lle_lock) +#define LLE_LOCK_INIT(lle) rw_init_flags(&(lle)->lle_lock, "lle", RW_DUPOK) +#define LLE_LOCK_DESTROY(lle) rw_destroy(&(lle)->lle_lock) +#define LLE_WLOCK_ASSERT(lle) rw_assert(&(lle)->lle_lock, RA_WLOCKED) + +#define LLE_IS_VALID(lle) (((lle) != NULL) && ((lle) != (void *)-1)) + +#define LLE_ADDREF(lle) do { \ + LLE_WLOCK_ASSERT(lle); \ + KASSERT((lle)->lle_refcnt >= 0, \ + ("negative refcnt %d", (lle)->lle_refcnt)); \ + (lle)->lle_refcnt++; \ +} while (0) + +#define LLE_REMREF(lle) do { \ + LLE_WLOCK_ASSERT(lle); \ + KASSERT((lle)->lle_refcnt > 1, \ + ("bogus refcnt %d", (lle)->lle_refcnt)); \ + (lle)->lle_refcnt--; \ +} while (0) + +#define LLE_FREE_LOCKED(lle) do { \ + if ((lle)->lle_refcnt <= 1) \ + (lle)->lle_tbl->llt_free((lle)->lle_tbl, (lle));\ + else { \ + (lle)->lle_refcnt--; \ + LLE_WUNLOCK(lle); \ + } \ + /* guard against invalid refs */ \ + lle = 0; \ +} while (0) + +#define LLE_FREE(lle) do { \ + LLE_WLOCK(lle); \ + if ((lle)->lle_refcnt <= 1) \ + (lle)->lle_tbl->llt_free((lle)->lle_tbl, (lle));\ + else { \ + (lle)->lle_refcnt--; \ + LLE_WUNLOCK(lle); \ + } \ + /* guard against invalid refs */ \ + lle = NULL; \ +} while (0) + + +#define ln_timer_ch lle_timer.ln_timer_ch +#define la_timer lle_timer.la_timer + +/* XXX bad name */ +#define L3_ADDR(lle) ((struct sockaddr *)(&lle[1])) +#define L3_ADDR_LEN(lle) (((struct sockaddr *)(&lle[1]))->sa_len) + +#ifndef LLTBL_HASHTBL_SIZE +#define LLTBL_HASHTBL_SIZE 32 /* default 32 ? */ +#endif + +#ifndef LLTBL_HASHMASK +#define LLTBL_HASHMASK (LLTBL_HASHTBL_SIZE - 1) +#endif + +struct lltable { + SLIST_ENTRY(lltable) llt_link; + struct llentries lle_head[LLTBL_HASHTBL_SIZE]; + int llt_af; + struct ifnet *llt_ifp; + + struct llentry * (*llt_new)(const struct sockaddr *, u_int); + void (*llt_free)(struct lltable *, struct llentry *); + void (*llt_prefix_free)(struct lltable *, + const struct sockaddr *prefix, + const struct sockaddr *mask); + struct llentry * (*llt_lookup)(struct lltable *, u_int flags, + const struct sockaddr *l3addr); + int (*llt_rtcheck)(struct ifnet *, u_int flags, + const struct sockaddr *); + int (*llt_dump)(struct lltable *, + struct sysctl_req *); +}; +MALLOC_DECLARE(M_LLTABLE); + +/* + * flags to be passed to arplookup. + */ +#define LLE_DELETED 0x0001 /* entry must be deleted */ +#define LLE_STATIC 0x0002 /* entry is static */ +#define LLE_IFADDR 0x0004 /* entry is interface addr */ +#define LLE_VALID 0x0008 /* ll_addr is valid */ +#define LLE_PROXY 0x0010 /* proxy entry ??? */ +#define LLE_PUB 0x0020 /* publish entry ??? 
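 */

/*
 * Illustrative sketch, not part of the imported sources: the policy
 * behind LLE_FREE_LOCKED()/LLE_FREE() above is "the last reference
 * frees, anyone else just decrements", with the caller's pointer
 * nulled to guard against stale references.  Locking is omitted in
 * this user-space miniature.
 */
#include <stdio.h>
#include <stdlib.h>

struct obj {
	int refcnt;
};

static void
obj_release(struct obj **op)
{
	struct obj *o = *op;

	if (o->refcnt <= 1)
		free(o);		/* llt_free() in the real macro */
	else
		o->refcnt--;
	*op = NULL;			/* guard against invalid refs */
}

int
main(void)
{
	struct obj *o = malloc(sizeof(*o));
	struct obj *ref = o;

	o->refcnt = 2;
	obj_release(&ref);		/* drops to 1, object survives */
	printf("refcnt now %d\n", o->refcnt);
	obj_release(&o);		/* last reference, object freed */
	return (0);
}

/*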
*/ +#define LLE_DELETE 0x4000 /* delete on a lookup - match LLE_IFADDR */ +#define LLE_CREATE 0x8000 /* create on a lookup miss */ +#define LLE_EXCLUSIVE 0x2000 /* return lle xlocked */ + +#define LLATBL_HASH(key, mask) \ + (((((((key >> 8) ^ key) >> 8) ^ key) >> 8) ^ key) & mask) + +struct lltable *lltable_init(struct ifnet *, int); +void lltable_free(struct lltable *); +void lltable_prefix_free(int, struct sockaddr *, + struct sockaddr *); +#if 0 +void lltable_drain(int); +#endif +int lltable_sysctl_dumparp(int, struct sysctl_req *); + +void llentry_free(struct llentry *); +int llentry_update(struct llentry **, struct lltable *, + struct sockaddr_storage *, struct ifnet *); + +/* + * Generic link layer address lookup function. + */ +static __inline struct llentry * +lla_lookup(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) +{ + return llt->llt_lookup(llt, flags, l3addr); +} + +int lla_rt_output(struct rt_msghdr *, struct rt_addrinfo *); +#endif /* _NET_IF_LLATBL_HH_ */ diff --git a/freebsd/sys/net/if_llc.h b/freebsd/sys/net/if_llc.h new file mode 100644 index 00000000..b72f21bc --- /dev/null +++ b/freebsd/sys/net/if_llc.h @@ -0,0 +1,161 @@ +/* $NetBSD: if_llc.h,v 1.12 1999/11/19 20:41:19 thorpej Exp $ */ + +/*- + * Copyright (c) 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)if_llc.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#ifndef _NET_IF_LLC_HH_ +#define _NET_IF_LLC_HH_ + +/* + * IEEE 802.2 Link Level Control headers, for use in conjunction with + * 802.{3,4,5} media access control methods. + * + * Headers here do not use bit fields due to shortcomings in many + * compilers.
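 */

/*
 * Illustrative sketch, not part of the imported sources: decoding a
 * SNAP header of the shape described by struct llc below.  The
 * X_-prefixed LLC constants are copied so the example stands alone.
 */
#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

#define X_LLC_SNAP_LSAP		0xaa
#define X_LLC_UI		0x03
#define X_LLC_SNAPFRAMELEN	8

static int
snap_ethertype(const uint8_t *llc, size_t len, uint16_t *etype)
{
	if (len < X_LLC_SNAPFRAMELEN)
		return (-1);
	if (llc[0] != X_LLC_SNAP_LSAP || llc[1] != X_LLC_SNAP_LSAP ||
	    llc[2] != X_LLC_UI)
		return (-1);
	/* bytes 3-5 are the OUI, 6-7 the encapsulated ethertype */
	*etype = (uint16_t)((llc[6] << 8) | llc[7]);
	return (0);
}

int
main(void)
{
	/* SNAP header carrying IPv4 (ethertype 0x0800), zero OUI */
	const uint8_t hdr[8] = { 0xaa, 0xaa, 0x03, 0, 0, 0, 0x08, 0x00 };
	uint16_t etype;

	if (snap_ethertype(hdr, sizeof(hdr), &etype) == 0)
		printf("ethertype 0x%04x\n", etype);
	return (0);
}

/*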
+ */ + +struct llc { + u_int8_t llc_dsap; + u_int8_t llc_ssap; + union { + struct { + u_int8_t control; + u_int8_t format_id; + u_int8_t class; + u_int8_t window_x2; + } __packed type_u; + struct { + u_int8_t num_snd_x2; + u_int8_t num_rcv_x2; + } __packed type_i; + struct { + u_int8_t control; + u_int8_t num_rcv_x2; + } __packed type_s; + struct { + u_int8_t control; + /* + * We cannot put the following fields in a structure because + * the structure rounding might cause padding. + */ + u_int8_t frmr_rej_pdu0; + u_int8_t frmr_rej_pdu1; + u_int8_t frmr_control; + u_int8_t frmr_control_ext; + u_int8_t frmr_cause; + } __packed type_frmr; + struct { + u_int8_t control; + u_int8_t org_code[3]; + u_int16_t ether_type; + } __packed type_snap; + struct { + u_int8_t control; + u_int8_t control_ext; + } __packed type_raw; + } __packed llc_un; +} __packed; + +struct frmrinfo { + u_int8_t frmr_rej_pdu0; + u_int8_t frmr_rej_pdu1; + u_int8_t frmr_control; + u_int8_t frmr_control_ext; + u_int8_t frmr_cause; +} __packed; + +#define llc_control llc_un.type_u.control +#define llc_control_ext llc_un.type_raw.control_ext +#define llc_fid llc_un.type_u.format_id +#define llc_class llc_un.type_u.class +#define llc_window llc_un.type_u.window_x2 +#define llc_frmrinfo llc_un.type_frmr.frmr_rej_pdu0 +#define llc_frmr_pdu0 llc_un.type_frmr.frmr_rej_pdu0 +#define llc_frmr_pdu1 llc_un.type_frmr.frmr_rej_pdu1 +#define llc_frmr_control llc_un.type_frmr.frmr_control +#define llc_frmr_control_ext llc_un.type_frmr.frmr_control_ext +#define llc_frmr_cause llc_un.type_frmr.frmr_cause +#define llc_snap llc_un.type_snap + +/* + * Don't use sizeof(struct llc_un) for LLC header sizes + */ +#define LLC_ISFRAMELEN 4 +#define LLC_UFRAMELEN 3 +#define LLC_FRMRLEN 7 +#define LLC_SNAPFRAMELEN 8 + +#ifdef CTASSERT +CTASSERT(sizeof (struct llc) == LLC_SNAPFRAMELEN); +#endif + +/* + * Unnumbered LLC format commands + */ +#define LLC_UI 0x3 +#define LLC_UI_P 0x13 +#define LLC_DISC 0x43 +#define LLC_DISC_P 0x53 +#define LLC_UA 0x63 +#define LLC_UA_P 0x73 +#define LLC_TEST 0xe3 +#define LLC_TEST_P 0xf3 +#define LLC_FRMR 0x87 +#define LLC_FRMR_P 0x97 +#define LLC_DM 0x0f +#define LLC_DM_P 0x1f +#define LLC_XID 0xaf +#define LLC_XID_P 0xbf +#define LLC_SABME 0x6f +#define LLC_SABME_P 0x7f + +/* + * Supervisory LLC commands + */ +#define LLC_RR 0x01 +#define LLC_RNR 0x05 +#define LLC_REJ 0x09 + +/* + * Info format - dummy only + */ +#define LLC_INFO 0x00 + +/* + * ISO PDTR 10178 contains among others + */ +#define LLC_8021D_LSAP 0x42 +#define LLC_X25_LSAP 0x7e +#define LLC_SNAP_LSAP 0xaa +#define LLC_ISO_LSAP 0xfe + +#endif /* _NET_IF_LLC_HH_ */ diff --git a/freebsd/sys/net/if_loop.c b/freebsd/sys/net/if_loop.c new file mode 100644 index 00000000..d80bfdad --- /dev/null +++ b/freebsd/sys/net/if_loop.c @@ -0,0 +1,451 @@ +#include + +/*- + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)if_loop.c 8.2 (Berkeley) 1/9/95 + * $FreeBSD$ + */ + +/* + * Loopback interface driver for protocol testing and timing. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#ifdef INET +#include +#include +#endif + +#ifdef IPX +#include +#include +#endif + +#ifdef INET6 +#ifndef INET +#include +#endif +#include +#include +#endif + +#ifdef NETATALK +#include +#include +#endif + +#include + +#ifdef TINY_LOMTU +#define LOMTU (1024+512) +#elif defined(LARGE_LOMTU) +#define LOMTU 131072 +#else +#define LOMTU 16384 +#endif + +#define LO_CSUM_FEATURES (CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_SCTP) +#define LO_CSUM_SET (CSUM_DATA_VALID | CSUM_PSEUDO_HDR | \ + CSUM_IP_CHECKED | CSUM_IP_VALID | \ + CSUM_SCTP_VALID) + +int loioctl(struct ifnet *, u_long, caddr_t); +static void lortrequest(int, struct rtentry *, struct rt_addrinfo *); +int looutput(struct ifnet *ifp, struct mbuf *m, + struct sockaddr *dst, struct route *ro); +static int lo_clone_create(struct if_clone *, int, caddr_t); +static void lo_clone_destroy(struct ifnet *); + +VNET_DEFINE(struct ifnet *, loif); /* Used externally */ + +#ifdef VIMAGE +static VNET_DEFINE(struct ifc_simple_data, lo_cloner_data); +static VNET_DEFINE(struct if_clone, lo_cloner); +#define V_lo_cloner_data VNET(lo_cloner_data) +#define V_lo_cloner VNET(lo_cloner) +#endif + +IFC_SIMPLE_DECLARE(lo, 1); + +static void +lo_clone_destroy(struct ifnet *ifp) +{ + +#ifndef VIMAGE + /* XXX: destroying lo0 will lead to panics. 
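 */

/*
 * Illustrative sketch, not part of the imported sources: the cloner
 * that lo_clone_create()/lo_clone_destroy() below implement is driven
 * from user space with SIOCIFCREATE/SIOCIFDESTROY.  The unit name
 * "lo1" is hypothetical and error handling is minimal.
 */
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <net/if.h>

int
create_loopback_clone(void)
{
	struct ifreq ifr;
	int s;

	if ((s = socket(AF_INET, SOCK_DGRAM, 0)) < 0)
		return (-1);
	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, "lo1", sizeof(ifr.ifr_name));
	if (ioctl(s, SIOCIFCREATE, &ifr) == -1) {
		close(s);
		return (-1);
	}
	close(s);
	return (0);
}

/*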
*/ + KASSERT(V_loif != ifp, ("%s: destroying lo0", __func__)); +#endif + + bpfdetach(ifp); + if_detach(ifp); + if_free(ifp); +} + +static int +lo_clone_create(struct if_clone *ifc, int unit, caddr_t params) +{ + struct ifnet *ifp; + + ifp = if_alloc(IFT_LOOP); + if (ifp == NULL) + return (ENOSPC); + + if_initname(ifp, ifc->ifc_name, unit); + ifp->if_mtu = LOMTU; + ifp->if_flags = IFF_LOOPBACK | IFF_MULTICAST; + ifp->if_ioctl = loioctl; + ifp->if_output = looutput; + ifp->if_snd.ifq_maxlen = ifqmaxlen; + ifp->if_capabilities = ifp->if_capenable = IFCAP_HWCSUM; + ifp->if_hwassist = LO_CSUM_FEATURES; + if_attach(ifp); + bpfattach(ifp, DLT_NULL, sizeof(u_int32_t)); + if (V_loif == NULL) + V_loif = ifp; + + return (0); +} + +static void +vnet_loif_init(const void *unused __unused) +{ + +#ifdef VIMAGE + V_lo_cloner = lo_cloner; + V_lo_cloner_data = lo_cloner_data; + V_lo_cloner.ifc_data = &V_lo_cloner_data; + if_clone_attach(&V_lo_cloner); +#else + if_clone_attach(&lo_cloner); +#endif +} +VNET_SYSINIT(vnet_loif_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, + vnet_loif_init, NULL); + +#ifdef VIMAGE +static void +vnet_loif_uninit(const void *unused __unused) +{ + + if_clone_detach(&V_lo_cloner); + V_loif = NULL; +} +VNET_SYSUNINIT(vnet_loif_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, + vnet_loif_uninit, NULL); +#endif + +static int +loop_modevent(module_t mod, int type, void *data) +{ + + switch (type) { + case MOD_LOAD: + break; + + case MOD_UNLOAD: + printf("loop module unload - not possible for this module type\n"); + return (EINVAL); + + default: + return (EOPNOTSUPP); + } + return (0); +} + +static moduledata_t loop_mod = { + "if_lo", + loop_modevent, + 0 +}; + +DECLARE_MODULE(if_lo, loop_mod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); + +int +looutput(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, + struct route *ro) +{ + u_int32_t af; + struct rtentry *rt = NULL; +#ifdef MAC + int error; +#endif + + M_ASSERTPKTHDR(m); /* check if we have the packet header */ + + if (ro != NULL) + rt = ro->ro_rt; +#ifdef MAC + error = mac_ifnet_check_transmit(ifp, m); + if (error) { + m_freem(m); + return (error); + } +#endif + + if (rt && rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) { + m_freem(m); + return (rt->rt_flags & RTF_BLACKHOLE ? 0 : + rt->rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH); + } + + ifp->if_opackets++; + ifp->if_obytes += m->m_pkthdr.len; + + /* BPF writes need to be handled specially. */ + if (dst->sa_family == AF_UNSPEC) { + bcopy(dst->sa_data, &af, sizeof(af)); + dst->sa_family = af; + } + +#if 1 /* XXX */ + switch (dst->sa_family) { + case AF_INET: + if (ifp->if_capenable & IFCAP_RXCSUM) { + m->m_pkthdr.csum_data = 0xffff; + m->m_pkthdr.csum_flags = LO_CSUM_SET; + } + m->m_pkthdr.csum_flags &= ~LO_CSUM_FEATURES; + case AF_INET6: + case AF_IPX: + case AF_APPLETALK: + break; + default: + printf("looutput: af=%d unexpected\n", dst->sa_family); + m_freem(m); + return (EAFNOSUPPORT); + } +#endif + return (if_simloop(ifp, m, dst->sa_family, 0)); +} + +/* + * if_simloop() + * + * This function is to support software emulation of hardware loopback, + * i.e., for interfaces with the IFF_SIMPLEX attribute. Since they can't + * hear their own broadcasts, we create a copy of the packet that we + * would normally receive via a hardware loopback. + * + * This function expects the packet to include the media header of length hlen. 
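 */

/*
 * Illustrative sketch, not part of the imported sources: lo0's BPF is
 * DLT_NULL, so each captured frame starts with a 4-byte address family
 * in host byte order -- exactly what the bpf_mtap2() call in
 * if_simloop() below prepends.  AF_INET is 2 and AF_INET6 is 28 on
 * FreeBSD.
 */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

int
main(void)
{
	uint8_t rec[4 + 20] = { 0 };	/* pseudo-header + payload */
	uint32_t af = 2, af_out;	/* AF_INET */

	memcpy(rec, &af, sizeof(af));	/* build the DLT_NULL pseudo-header */
	memcpy(&af_out, rec, sizeof(af_out));	/* ...and read it back */
	switch (af_out) {
	case 2:
		printf("AF_INET packet\n");	/* NETISR_IP path */
		break;
	case 28:
		printf("AF_INET6 packet\n");	/* NETISR_IPV6 path */
		break;
	default:
		printf("unhandled af=%u\n", af_out);
	}
	return (0);
}

/*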
+ */ +int +if_simloop(struct ifnet *ifp, struct mbuf *m, int af, int hlen) +{ + int isr; + + M_ASSERTPKTHDR(m); + m_tag_delete_nonpersistent(m); + m->m_pkthdr.rcvif = ifp; + +#ifdef MAC + mac_ifnet_create_mbuf(ifp, m); +#endif + + /* + * Let BPF see incoming packet in the following manner: + * - Emulated packet loopback for a simplex interface + * (net/if_ethersubr.c) + * -> passes it to ifp's BPF + * - IPv4/v6 multicast packet loopback (netinet(6)/ip(6)_output.c) + * -> not passes it to any BPF + * - Normal packet loopback from myself to myself (net/if_loop.c) + * -> passes to lo0's BPF (even in case of IPv6, where ifp!=lo0) + */ + if (hlen > 0) { + if (bpf_peers_present(ifp->if_bpf)) { + bpf_mtap(ifp->if_bpf, m); + } + } else { + if (bpf_peers_present(V_loif->if_bpf)) { + if ((m->m_flags & M_MCAST) == 0 || V_loif == ifp) { + /* XXX beware sizeof(af) != 4 */ + u_int32_t af1 = af; + + /* + * We need to prepend the address family. + */ + bpf_mtap2(V_loif->if_bpf, &af1, sizeof(af1), m); + } + } + } + + /* Strip away media header */ + if (hlen > 0) { + m_adj(m, hlen); +#ifndef __NO_STRICT_ALIGNMENT + /* + * Some archs do not like unaligned data, so + * we move data down in the first mbuf. + */ + if (mtod(m, vm_offset_t) & 3) { + KASSERT(hlen >= 3, ("if_simloop: hlen too small")); + bcopy(m->m_data, + (char *)(mtod(m, vm_offset_t) + - (mtod(m, vm_offset_t) & 3)), + m->m_len); + m->m_data -= (mtod(m,vm_offset_t) & 3); + } +#endif + } + + /* Deliver to upper layer protocol */ + switch (af) { +#ifdef INET + case AF_INET: + isr = NETISR_IP; + break; +#endif +#ifdef INET6 + case AF_INET6: + m->m_flags |= M_LOOP; + isr = NETISR_IPV6; + break; +#endif +#ifdef IPX + case AF_IPX: + isr = NETISR_IPX; + break; +#endif +#ifdef NETATALK + case AF_APPLETALK: + isr = NETISR_ATALK2; + break; +#endif + default: + printf("if_simloop: can't handle af=%d\n", af); + m_freem(m); + return (EAFNOSUPPORT); + } + ifp->if_ipackets++; + ifp->if_ibytes += m->m_pkthdr.len; + netisr_queue(isr, m); /* mbuf is free'd on failure. */ + return (0); +} + +/* ARGSUSED */ +static void +lortrequest(int cmd, struct rtentry *rt, struct rt_addrinfo *info) +{ + + RT_LOCK_ASSERT(rt); + rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu; +} + +/* + * Process an ioctl request. + */ +/* ARGSUSED */ +int +loioctl(struct ifnet *ifp, u_long cmd, caddr_t data) +{ + struct ifaddr *ifa; + struct ifreq *ifr = (struct ifreq *)data; + int error = 0, mask; + + switch (cmd) { + case SIOCSIFADDR: + ifp->if_flags |= IFF_UP; + ifp->if_drv_flags |= IFF_DRV_RUNNING; + ifa = (struct ifaddr *)data; + ifa->ifa_rtrequest = lortrequest; + /* + * Everything else is done at a higher level. 
+ */ + break; + + case SIOCADDMULTI: + case SIOCDELMULTI: + if (ifr == 0) { + error = EAFNOSUPPORT; /* XXX */ + break; + } + switch (ifr->ifr_addr.sa_family) { + +#ifdef INET + case AF_INET: + break; +#endif +#ifdef INET6 + case AF_INET6: + break; +#endif + + default: + error = EAFNOSUPPORT; + break; + } + break; + + case SIOCSIFMTU: + ifp->if_mtu = ifr->ifr_mtu; + break; + + case SIOCSIFFLAGS: + break; + + case SIOCSIFCAP: + mask = ifp->if_capenable ^ ifr->ifr_reqcap; + if ((mask & IFCAP_RXCSUM) != 0) + ifp->if_capenable ^= IFCAP_RXCSUM; + if ((mask & IFCAP_TXCSUM) != 0) + ifp->if_capenable ^= IFCAP_TXCSUM; + if (ifp->if_capenable & IFCAP_TXCSUM) + ifp->if_hwassist = LO_CSUM_FEATURES; + else + ifp->if_hwassist = 0; + break; + + default: + error = EINVAL; + } + return (error); +} diff --git a/freebsd/sys/net/if_media.c b/freebsd/sys/net/if_media.c new file mode 100644 index 00000000..3da5090a --- /dev/null +++ b/freebsd/sys/net/if_media.c @@ -0,0 +1,566 @@ +#include + +/* $NetBSD: if_media.c,v 1.1 1997/03/17 02:55:15 thorpej Exp $ */ +/* $FreeBSD$ */ + +/*- + * Copyright (c) 1997 + * Jonathan Stone and Jason R. Thorpe. All rights reserved. + * + * This software is derived from information provided by Matt Thomas. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Jonathan Stone + * and Jason R. Thorpe for the NetBSD Project. + * 4. The names of the authors may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * BSD/OS-compatible network interface media selection. + * + * Where it is safe to do so, this code strays slightly from the BSD/OS + * design. Software which uses the API (device drivers, basically) + * shouldn't notice any difference. + * + * Many thanks to Matt Thomas for providing the information necessary + * to implement this interface. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/* + * Compile-time options: + * IFMEDIA_DEBUG: + * turn on implementation-level debug printfs. + * Useful for debugging newly-ported drivers. 
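+ *
+ *	With IFMEDIA_DEBUG compiled in, the messages can also be
+ *	toggled at run time through the debug.ifmedia sysctl declared
+ *	below; from userland, e.g. (illustrative invocation):
+ *
+ *		sysctl debug.ifmedia=1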
+ */ + +static struct ifmedia_entry *ifmedia_match(struct ifmedia *ifm, + int flags, int mask); + +#ifdef IFMEDIA_DEBUG +int ifmedia_debug = 0; +SYSCTL_INT(_debug, OID_AUTO, ifmedia, CTLFLAG_RW, &ifmedia_debug, + 0, "if_media debugging msgs"); +static void ifmedia_printword(int); +#endif + +/* + * Initialize if_media struct for a specific interface instance. + */ +void +ifmedia_init(ifm, dontcare_mask, change_callback, status_callback) + struct ifmedia *ifm; + int dontcare_mask; + ifm_change_cb_t change_callback; + ifm_stat_cb_t status_callback; +{ + + LIST_INIT(&ifm->ifm_list); + ifm->ifm_cur = NULL; + ifm->ifm_media = 0; + ifm->ifm_mask = dontcare_mask; /* IF don't-care bits */ + ifm->ifm_change = change_callback; + ifm->ifm_status = status_callback; +} + +void +ifmedia_removeall(ifm) + struct ifmedia *ifm; +{ + struct ifmedia_entry *entry; + + for (entry = LIST_FIRST(&ifm->ifm_list); entry; + entry = LIST_FIRST(&ifm->ifm_list)) { + LIST_REMOVE(entry, ifm_list); + free(entry, M_IFADDR); + } +} + +/* + * Add a media configuration to the list of supported media + * for a specific interface instance. + */ +void +ifmedia_add(ifm, mword, data, aux) + struct ifmedia *ifm; + int mword; + int data; + void *aux; +{ + register struct ifmedia_entry *entry; + +#ifdef IFMEDIA_DEBUG + if (ifmedia_debug) { + if (ifm == NULL) { + printf("ifmedia_add: null ifm\n"); + return; + } + printf("Adding entry for "); + ifmedia_printword(mword); + } +#endif + + entry = malloc(sizeof(*entry), M_IFADDR, M_NOWAIT); + if (entry == NULL) + panic("ifmedia_add: can't malloc entry"); + + entry->ifm_media = mword; + entry->ifm_data = data; + entry->ifm_aux = aux; + + LIST_INSERT_HEAD(&ifm->ifm_list, entry, ifm_list); +} + +/* + * Add an array of media configurations to the list of + * supported media for a specific interface instance. + */ +void +ifmedia_list_add(ifm, lp, count) + struct ifmedia *ifm; + struct ifmedia_entry *lp; + int count; +{ + int i; + + for (i = 0; i < count; i++) + ifmedia_add(ifm, lp[i].ifm_media, lp[i].ifm_data, + lp[i].ifm_aux); +} + +/* + * Set the default active media. + * + * Called by device-specific code which is assumed to have already + * selected the default media in hardware. We do _not_ call the + * media-change callback. + */ +void +ifmedia_set(ifm, target) + struct ifmedia *ifm; + int target; + +{ + struct ifmedia_entry *match; + + match = ifmedia_match(ifm, target, ifm->ifm_mask); + + if (match == NULL) { + printf("ifmedia_set: no match for 0x%x/0x%x\n", + target, ~ifm->ifm_mask); + panic("ifmedia_set"); + } + ifm->ifm_cur = match; + +#ifdef IFMEDIA_DEBUG + if (ifmedia_debug) { + printf("ifmedia_set: target "); + ifmedia_printword(target); + printf("ifmedia_set: setting to "); + ifmedia_printword(ifm->ifm_cur->ifm_media); + } +#endif +} + +/* + * Device-independent media ioctl support function. + */ +int +ifmedia_ioctl(ifp, ifr, ifm, cmd) + struct ifnet *ifp; + struct ifreq *ifr; + struct ifmedia *ifm; + u_long cmd; +{ + struct ifmedia_entry *match; + struct ifmediareq *ifmr = (struct ifmediareq *) ifr; + int error = 0, sticky; + + if (ifp == NULL || ifr == NULL || ifm == NULL) + return(EINVAL); + + switch (cmd) { + + /* + * Set the current media. 
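+	 * (Drivers normally hand both media ioctls to this function
+	 * unchanged from their own ioctl routine; an illustrative
+	 * fragment, where "sc" is a hypothetical softc holding an
+	 * ifmedia named sc_media:
+	 *
+	 *	case SIOCSIFMEDIA:
+	 *	case SIOCGIFMEDIA:
+	 *		error = ifmedia_ioctl(ifp, ifr, &sc->sc_media, cmd);
+	 *		break;
+	 * )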
+ */
+	case SIOCSIFMEDIA:
+	{
+		struct ifmedia_entry *oldentry;
+		int oldmedia;
+		int newmedia = ifr->ifr_media;
+
+		match = ifmedia_match(ifm, newmedia, ifm->ifm_mask);
+		if (match == NULL) {
+#ifdef IFMEDIA_DEBUG
+			if (ifmedia_debug) {
+				printf(
+				"ifmedia_ioctl: no media found for 0x%x\n",
+				    newmedia);
+			}
+#endif
+			return (ENXIO);
+		}
+
+		/*
+		 * If no change, we're done.
+		 * XXX Automedia may involve software intervention.
+		 *     Keep going in case the connected media changed.
+		 *     Similarly, if best match changed (kernel debugger?).
+		 */
+		if ((IFM_SUBTYPE(newmedia) != IFM_AUTO) &&
+		    (newmedia == ifm->ifm_media) &&
+		    (match == ifm->ifm_cur))
+			return 0;
+
+		/*
+		 * We found a match, now make the driver switch to it.
+		 * Make sure to preserve our old media type in case the
+		 * driver can't switch.
+		 */
+#ifdef IFMEDIA_DEBUG
+		if (ifmedia_debug) {
+			printf("ifmedia_ioctl: switching %s to ",
+			    ifp->if_xname);
+			ifmedia_printword(match->ifm_media);
+		}
+#endif
+		oldentry = ifm->ifm_cur;
+		oldmedia = ifm->ifm_media;
+		ifm->ifm_cur = match;
+		ifm->ifm_media = newmedia;
+		error = (*ifm->ifm_change)(ifp);
+		if (error) {
+			ifm->ifm_cur = oldentry;
+			ifm->ifm_media = oldmedia;
+		}
+		break;
+	}
+
+	/*
+	 * Get list of available media and current media on interface.
+	 */
+	case SIOCGIFMEDIA:
+	{
+		struct ifmedia_entry *ep;
+		int *kptr, count;
+		int usermax;	/* user requested max */
+
+		kptr = NULL;	/* XXX gcc */
+
+		ifmr->ifm_active = ifmr->ifm_current = ifm->ifm_cur ?
+		    ifm->ifm_cur->ifm_media : IFM_NONE;
+		ifmr->ifm_mask = ifm->ifm_mask;
+		ifmr->ifm_status = 0;
+		(*ifm->ifm_status)(ifp, ifmr);
+
+		count = 0;
+		usermax = 0;
+
+		/*
+		 * If there are more media entries on the list, count
+		 * them.  This allows the caller to set ifmr->ifm_count
+		 * to 0 on the first call to know how much space to
+		 * allocate.
+		 */
+		LIST_FOREACH(ep, &ifm->ifm_list, ifm_list)
+			usermax++;
+
+		/*
+		 * Don't allow the user to ask for too many
+		 * or a negative number.
+		 */
+		if (ifmr->ifm_count > usermax)
+			ifmr->ifm_count = usermax;
+		else if (ifmr->ifm_count < 0)
+			return (EINVAL);
+
+		if (ifmr->ifm_count != 0) {
+			kptr = (int *)malloc(ifmr->ifm_count * sizeof(int),
+			    M_TEMP, M_NOWAIT);
+
+			if (kptr == NULL)
+				return (ENOMEM);
+			/*
+			 * Get the media words from the interface's list.
+			 */
+			ep = LIST_FIRST(&ifm->ifm_list);
+			for (; ep != NULL && count < ifmr->ifm_count;
+			    ep = LIST_NEXT(ep, ifm_list), count++)
+				kptr[count] = ep->ifm_media;
+
+			if (ep != NULL)
+				error = E2BIG;	/* oops! */
+		} else {
+			count = usermax;
+		}
+
+		/*
+		 * We do the copyout on E2BIG, because that's
+		 * just our way of telling userland that there
+		 * are more.  This is the behavior I've observed
+		 * under BSD/OS 3.0.
+		 */
+		sticky = error;
+		if ((error == 0 || error == E2BIG) && ifmr->ifm_count != 0) {
+			error = copyout((caddr_t)kptr,
+			    (caddr_t)ifmr->ifm_ulist,
+			    ifmr->ifm_count * sizeof(int));
+		}
+
+		if (error == 0)
+			error = sticky;
+
+		if (ifmr->ifm_count != 0)
+			free(kptr, M_TEMP);
+
+		ifmr->ifm_count = count;
+		break;
+	}
+
+	default:
+		return (EINVAL);
+	}
+
+	return (error);
+}
+
+/*
+ * Find media entry matching a given ifm word.
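+ * (The mask argument names the don't-care bits: an entry matches when
+ * it agrees with the target word on every bit outside the mask.  For
+ * example, including IFM_IMASK in the mask lets an entry stored for
+ * instance 0 match a target word naming instance 1.)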
+ * + */ +static struct ifmedia_entry * +ifmedia_match(ifm, target, mask) + struct ifmedia *ifm; + int target; + int mask; +{ + struct ifmedia_entry *match, *next; + + match = NULL; + mask = ~mask; + + LIST_FOREACH(next, &ifm->ifm_list, ifm_list) { + if ((next->ifm_media & mask) == (target & mask)) { +#if defined(IFMEDIA_DEBUG) || defined(DIAGNOSTIC) + if (match) { + printf("ifmedia_match: multiple match for " + "0x%x/0x%x\n", target, mask); + } +#endif + match = next; + } + } + + return match; +} + +/* + * Compute the interface `baudrate' from the media, for the interface + * metrics (used by routing daemons). + */ +static const struct ifmedia_baudrate ifmedia_baudrate_descriptions[] = + IFM_BAUDRATE_DESCRIPTIONS; + +uint64_t +ifmedia_baudrate(int mword) +{ + int i; + + for (i = 0; ifmedia_baudrate_descriptions[i].ifmb_word != 0; i++) { + if ((mword & (IFM_NMASK|IFM_TMASK)) == + ifmedia_baudrate_descriptions[i].ifmb_word) + return (ifmedia_baudrate_descriptions[i].ifmb_baudrate); + } + + /* Not known. */ + return (0); +} + +#ifdef IFMEDIA_DEBUG +struct ifmedia_description ifm_type_descriptions[] = + IFM_TYPE_DESCRIPTIONS; + +struct ifmedia_description ifm_subtype_ethernet_descriptions[] = + IFM_SUBTYPE_ETHERNET_DESCRIPTIONS; + +struct ifmedia_description ifm_subtype_ethernet_option_descriptions[] = + IFM_SUBTYPE_ETHERNET_OPTION_DESCRIPTIONS; + +struct ifmedia_description ifm_subtype_tokenring_descriptions[] = + IFM_SUBTYPE_TOKENRING_DESCRIPTIONS; + +struct ifmedia_description ifm_subtype_tokenring_option_descriptions[] = + IFM_SUBTYPE_TOKENRING_OPTION_DESCRIPTIONS; + +struct ifmedia_description ifm_subtype_fddi_descriptions[] = + IFM_SUBTYPE_FDDI_DESCRIPTIONS; + +struct ifmedia_description ifm_subtype_fddi_option_descriptions[] = + IFM_SUBTYPE_FDDI_OPTION_DESCRIPTIONS; + +struct ifmedia_description ifm_subtype_ieee80211_descriptions[] = + IFM_SUBTYPE_IEEE80211_DESCRIPTIONS; + +struct ifmedia_description ifm_subtype_ieee80211_option_descriptions[] = + IFM_SUBTYPE_IEEE80211_OPTION_DESCRIPTIONS; + +struct ifmedia_description ifm_subtype_ieee80211_mode_descriptions[] = + IFM_SUBTYPE_IEEE80211_MODE_DESCRIPTIONS; + +struct ifmedia_description ifm_subtype_atm_descriptions[] = + IFM_SUBTYPE_ATM_DESCRIPTIONS; + +struct ifmedia_description ifm_subtype_atm_option_descriptions[] = + IFM_SUBTYPE_ATM_OPTION_DESCRIPTIONS; + +struct ifmedia_description ifm_subtype_shared_descriptions[] = + IFM_SUBTYPE_SHARED_DESCRIPTIONS; + +struct ifmedia_description ifm_shared_option_descriptions[] = + IFM_SHARED_OPTION_DESCRIPTIONS; + +struct ifmedia_type_to_subtype { + struct ifmedia_description *subtypes; + struct ifmedia_description *options; + struct ifmedia_description *modes; +}; + +/* must be in the same order as IFM_TYPE_DESCRIPTIONS */ +struct ifmedia_type_to_subtype ifmedia_types_to_subtypes[] = { + { + &ifm_subtype_ethernet_descriptions[0], + &ifm_subtype_ethernet_option_descriptions[0], + NULL, + }, + { + &ifm_subtype_tokenring_descriptions[0], + &ifm_subtype_tokenring_option_descriptions[0], + NULL, + }, + { + &ifm_subtype_fddi_descriptions[0], + &ifm_subtype_fddi_option_descriptions[0], + NULL, + }, + { + &ifm_subtype_ieee80211_descriptions[0], + &ifm_subtype_ieee80211_option_descriptions[0], + &ifm_subtype_ieee80211_mode_descriptions[0] + }, + { + &ifm_subtype_atm_descriptions[0], + &ifm_subtype_atm_option_descriptions[0], + NULL, + }, +}; + +/* + * print a media word. 
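+ * (The output is a single line in the style used by ifconfig(8),
+ * e.g. "Ethernet 100baseTX <full-duplex>" -- an illustrative word
+ * assembled from the description tables in if_media.h.)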
+ */ +static void +ifmedia_printword(ifmw) + int ifmw; +{ + struct ifmedia_description *desc; + struct ifmedia_type_to_subtype *ttos; + int seen_option = 0; + + /* Find the top-level interface type. */ + for (desc = ifm_type_descriptions, ttos = ifmedia_types_to_subtypes; + desc->ifmt_string != NULL; desc++, ttos++) + if (IFM_TYPE(ifmw) == desc->ifmt_word) + break; + if (desc->ifmt_string == NULL) { + printf("\n"); + return; + } + printf(desc->ifmt_string); + + /* Any mode. */ + for (desc = ttos->modes; desc && desc->ifmt_string != NULL; desc++) + if (IFM_MODE(ifmw) == desc->ifmt_word) { + if (desc->ifmt_string != NULL) + printf(" mode %s", desc->ifmt_string); + break; + } + + /* + * Check for the shared subtype descriptions first, then the + * type-specific ones. + */ + for (desc = ifm_subtype_shared_descriptions; + desc->ifmt_string != NULL; desc++) + if (IFM_SUBTYPE(ifmw) == desc->ifmt_word) + goto got_subtype; + + for (desc = ttos->subtypes; desc->ifmt_string != NULL; desc++) + if (IFM_SUBTYPE(ifmw) == desc->ifmt_word) + break; + if (desc->ifmt_string == NULL) { + printf(" \n"); + return; + } + + got_subtype: + printf(" %s", desc->ifmt_string); + + /* + * Look for shared options. + */ + for (desc = ifm_shared_option_descriptions; + desc->ifmt_string != NULL; desc++) { + if (ifmw & desc->ifmt_word) { + if (seen_option == 0) + printf(" <"); + printf("%s%s", seen_option++ ? "," : "", + desc->ifmt_string); + } + } + + /* + * Look for subtype-specific options. + */ + for (desc = ttos->options; desc->ifmt_string != NULL; desc++) { + if (ifmw & desc->ifmt_word) { + if (seen_option == 0) + printf(" <"); + printf("%s%s", seen_option++ ? "," : "", + desc->ifmt_string); + } + } + printf("%s\n", seen_option ? ">" : ""); +} +#endif /* IFMEDIA_DEBUG */ diff --git a/freebsd/sys/net/if_media.h b/freebsd/sys/net/if_media.h new file mode 100644 index 00000000..26a3c417 --- /dev/null +++ b/freebsd/sys/net/if_media.h @@ -0,0 +1,692 @@ +/* $NetBSD: if_media.h,v 1.3 1997/03/26 01:19:27 thorpej Exp $ */ +/* $FreeBSD$ */ + +/*- + * Copyright (c) 1997 + * Jonathan Stone and Jason R. Thorpe. All rights reserved. + * + * This software is derived from information provided by Matt Thomas. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Jonathan Stone + * and Jason R. Thorpe for the NetBSD Project. + * 4. The names of the authors may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _NET_IF_MEDIA_HH_ +#define _NET_IF_MEDIA_HH_ + +/* + * Prototypes and definitions for BSD/OS-compatible network interface + * media selection. + * + * Where it is safe to do so, this code strays slightly from the BSD/OS + * design. Software which uses the API (device drivers, basically) + * shouldn't notice any difference. + * + * Many thanks to Matt Thomas for providing the information necessary + * to implement this interface. + */ + +#ifdef _KERNEL + +#include + +/* + * Driver callbacks for media status and change requests. + */ +typedef int (*ifm_change_cb_t)(struct ifnet *ifp); +typedef void (*ifm_stat_cb_t)(struct ifnet *ifp, struct ifmediareq *req); + +/* + * In-kernel representation of a single supported media type. + */ +struct ifmedia_entry { + LIST_ENTRY(ifmedia_entry) ifm_list; + int ifm_media; /* description of this media attachment */ + int ifm_data; /* for driver-specific use */ + void *ifm_aux; /* for driver-specific use */ +}; + +/* + * One of these goes into a network interface's softc structure. + * It is used to keep general media state. + */ +struct ifmedia { + int ifm_mask; /* mask of changes we don't care about */ + int ifm_media; /* current user-set media word */ + struct ifmedia_entry *ifm_cur; /* currently selected media */ + LIST_HEAD(, ifmedia_entry) ifm_list; /* list of all supported media */ + ifm_change_cb_t ifm_change; /* media change driver callback */ + ifm_stat_cb_t ifm_status; /* media status driver callback */ +}; + +/* Initialize an interface's struct if_media field. */ +void ifmedia_init(struct ifmedia *ifm, int dontcare_mask, + ifm_change_cb_t change_callback, ifm_stat_cb_t status_callback); + +/* Remove all mediums from a struct ifmedia. */ +void ifmedia_removeall( struct ifmedia *ifm); + +/* Add one supported medium to a struct ifmedia. */ +void ifmedia_add(struct ifmedia *ifm, int mword, int data, void *aux); + +/* Add an array (of ifmedia_entry) media to a struct ifmedia. */ +void ifmedia_list_add(struct ifmedia *mp, struct ifmedia_entry *lp, + int count); + +/* Set default media type on initialization. */ +void ifmedia_set(struct ifmedia *ifm, int mword); + +/* Common ioctl function for getting/setting media, called by driver. */ +int ifmedia_ioctl(struct ifnet *ifp, struct ifreq *ifr, + struct ifmedia *ifm, u_long cmd); + +/* Compute baudrate for a given media. 
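+   Returns 0 for a media word that is not in the table; e.g.
+   (illustrative) ifmedia_baudrate(IFM_ETHER | IFM_100_TX) yields
+   IF_Mbps(100) per the IFM_BAUDRATE_DESCRIPTIONS table below.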
*/ +uint64_t ifmedia_baudrate(int); + +#endif /*_KERNEL */ + +/* + * if_media Options word: + * Bits Use + * ---- ------- + * 0-4 Media variant + * 5-7 Media type + * 8-15 Type specific options + * 16-18 Mode (for multi-mode devices) + * 19 RFU + * 20-27 Shared (global) options + * 28-31 Instance + */ + +/* + * Ethernet + */ +#define IFM_ETHER 0x00000020 +#define IFM_10_T 3 /* 10BaseT - RJ45 */ +#define IFM_10_2 4 /* 10Base2 - Thinnet */ +#define IFM_10_5 5 /* 10Base5 - AUI */ +#define IFM_100_TX 6 /* 100BaseTX - RJ45 */ +#define IFM_100_FX 7 /* 100BaseFX - Fiber */ +#define IFM_100_T4 8 /* 100BaseT4 - 4 pair cat 3 */ +#define IFM_100_VG 9 /* 100VG-AnyLAN */ +#define IFM_100_T2 10 /* 100BaseT2 */ +#define IFM_1000_SX 11 /* 1000BaseSX - multi-mode fiber */ +#define IFM_10_STP 12 /* 10BaseT over shielded TP */ +#define IFM_10_FL 13 /* 10BaseFL - Fiber */ +#define IFM_1000_LX 14 /* 1000baseLX - single-mode fiber */ +#define IFM_1000_CX 15 /* 1000baseCX - 150ohm STP */ +#define IFM_1000_T 16 /* 1000baseT - 4 pair cat 5 */ +#define IFM_HPNA_1 17 /* HomePNA 1.0 (1Mb/s) */ +#define IFM_10G_LR 18 /* 10GBase-LR 1310nm Single-mode */ +#define IFM_10G_SR 19 /* 10GBase-SR 850nm Multi-mode */ +#define IFM_10G_CX4 20 /* 10GBase CX4 copper */ +#define IFM_2500_SX 21 /* 2500BaseSX - multi-mode fiber */ +#define IFM_10G_TWINAX 22 /* 10GBase Twinax copper */ +#define IFM_10G_TWINAX_LONG 23 /* 10GBase Twinax Long copper */ +#define IFM_10G_LRM 24 /* 10GBase-LRM 850nm Multi-mode */ +#define IFM_UNKNOWN 25 /* media types not defined yet */ +#define IFM_10G_T 26 /* 10GBase-T - RJ45 */ + + +/* note 31 is the max! */ + +#define IFM_ETH_MASTER 0x00000100 /* master mode (1000baseT) */ +#define IFM_ETH_RXPAUSE 0x00000200 /* receive PAUSE frames */ +#define IFM_ETH_TXPAUSE 0x00000400 /* transmit PAUSE frames */ + +/* + * Token ring + */ +#define IFM_TOKEN 0x00000040 +#define IFM_TOK_STP4 3 /* Shielded twisted pair 4m - DB9 */ +#define IFM_TOK_STP16 4 /* Shielded twisted pair 16m - DB9 */ +#define IFM_TOK_UTP4 5 /* Unshielded twisted pair 4m - RJ45 */ +#define IFM_TOK_UTP16 6 /* Unshielded twisted pair 16m - RJ45 */ +#define IFM_TOK_STP100 7 /* Shielded twisted pair 100m - DB9 */ +#define IFM_TOK_UTP100 8 /* Unshielded twisted pair 100m - RJ45 */ +#define IFM_TOK_ETR 0x00000200 /* Early token release */ +#define IFM_TOK_SRCRT 0x00000400 /* Enable source routing features */ +#define IFM_TOK_ALLR 0x00000800 /* All routes / Single route bcast */ +#define IFM_TOK_DTR 0x00002000 /* Dedicated token ring */ +#define IFM_TOK_CLASSIC 0x00004000 /* Classic token ring */ +#define IFM_TOK_AUTO 0x00008000 /* Automatic Dedicate/Classic token ring */ + +/* + * FDDI + */ +#define IFM_FDDI 0x00000060 +#define IFM_FDDI_SMF 3 /* Single-mode fiber */ +#define IFM_FDDI_MMF 4 /* Multi-mode fiber */ +#define IFM_FDDI_UTP 5 /* CDDI / UTP */ +#define IFM_FDDI_DA 0x00000100 /* Dual attach / single attach */ + +/* + * IEEE 802.11 Wireless + */ +#define IFM_IEEE80211 0x00000080 +/* NB: 0,1,2 are auto, manual, none defined below */ +#define IFM_IEEE80211_FH1 3 /* Frequency Hopping 1Mbps */ +#define IFM_IEEE80211_FH2 4 /* Frequency Hopping 2Mbps */ +#define IFM_IEEE80211_DS1 5 /* Direct Sequence 1Mbps */ +#define IFM_IEEE80211_DS2 6 /* Direct Sequence 2Mbps */ +#define IFM_IEEE80211_DS5 7 /* Direct Sequence 5.5Mbps */ +#define IFM_IEEE80211_DS11 8 /* Direct Sequence 11Mbps */ +#define IFM_IEEE80211_DS22 9 /* Direct Sequence 22Mbps */ +#define IFM_IEEE80211_OFDM6 10 /* OFDM 6Mbps */ +#define IFM_IEEE80211_OFDM9 11 /* OFDM 9Mbps */ +#define 
IFM_IEEE80211_OFDM12 12 /* OFDM 12Mbps */ +#define IFM_IEEE80211_OFDM18 13 /* OFDM 18Mbps */ +#define IFM_IEEE80211_OFDM24 14 /* OFDM 24Mbps */ +#define IFM_IEEE80211_OFDM36 15 /* OFDM 36Mbps */ +#define IFM_IEEE80211_OFDM48 16 /* OFDM 48Mbps */ +#define IFM_IEEE80211_OFDM54 17 /* OFDM 54Mbps */ +#define IFM_IEEE80211_OFDM72 18 /* OFDM 72Mbps */ +#define IFM_IEEE80211_DS354k 19 /* Direct Sequence 354Kbps */ +#define IFM_IEEE80211_DS512k 20 /* Direct Sequence 512Kbps */ +#define IFM_IEEE80211_OFDM3 21 /* OFDM 3Mbps */ +#define IFM_IEEE80211_OFDM4 22 /* OFDM 4.5Mbps */ +#define IFM_IEEE80211_OFDM27 23 /* OFDM 27Mbps */ +/* NB: not enough bits to express MCS fully */ +#define IFM_IEEE80211_MCS 24 /* HT MCS rate */ + +#define IFM_IEEE80211_ADHOC 0x00000100 /* Operate in Adhoc mode */ +#define IFM_IEEE80211_HOSTAP 0x00000200 /* Operate in Host AP mode */ +#define IFM_IEEE80211_IBSS 0x00000400 /* Operate in IBSS mode */ +#define IFM_IEEE80211_WDS 0x00000800 /* Operate in WDS mode */ +#define IFM_IEEE80211_TURBO 0x00001000 /* Operate in turbo mode */ +#define IFM_IEEE80211_MONITOR 0x00002000 /* Operate in monitor mode */ +#define IFM_IEEE80211_MBSS 0x00004000 /* Operate in MBSS mode */ + +/* operating mode for multi-mode devices */ +#define IFM_IEEE80211_11A 0x00010000 /* 5Ghz, OFDM mode */ +#define IFM_IEEE80211_11B 0x00020000 /* Direct Sequence mode */ +#define IFM_IEEE80211_11G 0x00030000 /* 2Ghz, CCK mode */ +#define IFM_IEEE80211_FH 0x00040000 /* 2Ghz, GFSK mode */ +#define IFM_IEEE80211_11NA 0x00050000 /* 5Ghz, HT mode */ +#define IFM_IEEE80211_11NG 0x00060000 /* 2Ghz, HT mode */ + +/* + * ATM + */ +#define IFM_ATM 0x000000a0 +#define IFM_ATM_UNKNOWN 3 +#define IFM_ATM_UTP_25 4 +#define IFM_ATM_TAXI_100 5 +#define IFM_ATM_TAXI_140 6 +#define IFM_ATM_MM_155 7 +#define IFM_ATM_SM_155 8 +#define IFM_ATM_UTP_155 9 +#define IFM_ATM_MM_622 10 +#define IFM_ATM_SM_622 11 +#define IFM_ATM_VIRTUAL 12 +#define IFM_ATM_SDH 0x00000100 /* SDH instead of SONET */ +#define IFM_ATM_NOSCRAMB 0x00000200 /* no scrambling */ +#define IFM_ATM_UNASSIGNED 0x00000400 /* unassigned cells */ + +/* + * CARP Common Address Redundancy Protocol + */ +#define IFM_CARP 0x000000c0 + +/* + * Shared media sub-types + */ +#define IFM_AUTO 0 /* Autoselect best media */ +#define IFM_MANUAL 1 /* Jumper/dipswitch selects media */ +#define IFM_NONE 2 /* Deselect all media */ + +/* + * Shared options + */ +#define IFM_FDX 0x00100000 /* Force full duplex */ +#define IFM_HDX 0x00200000 /* Force half duplex */ +#define IFM_FLOW 0x00400000 /* enable hardware flow control */ +#define IFM_FLAG0 0x01000000 /* Driver defined flag */ +#define IFM_FLAG1 0x02000000 /* Driver defined flag */ +#define IFM_FLAG2 0x04000000 /* Driver defined flag */ +#define IFM_LOOP 0x08000000 /* Put hardware in loopback */ + +/* + * Masks + */ +#define IFM_NMASK 0x000000e0 /* Network type */ +#define IFM_TMASK 0x0000001f /* Media sub-type */ +#define IFM_IMASK 0xf0000000 /* Instance */ +#define IFM_ISHIFT 28 /* Instance shift */ +#define IFM_OMASK 0x0000ff00 /* Type specific options */ +#define IFM_MMASK 0x00070000 /* Mode */ +#define IFM_MSHIFT 16 /* Mode shift */ +#define IFM_GMASK 0x0ff00000 /* Global options */ + +/* Ethernet flow control mask */ +#define IFM_ETH_FMASK (IFM_FLOW | IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE) + +/* + * Status bits + */ +#define IFM_AVALID 0x00000001 /* Active bit valid */ +#define IFM_ACTIVE 0x00000002 /* Interface attached to working net */ + +/* Mask of "status valid" bits, for ifconfig(8). 
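+   A status bit is meaningful only when its valid bit is set, so a
+   caller checks e.g. (ifmr.ifm_status & IFM_AVALID) before testing
+   IFM_ACTIVE.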
*/ +#define IFM_STATUS_VALID IFM_AVALID + +/* List of "status valid" bits, for ifconfig(8). */ +#define IFM_STATUS_VALID_LIST { \ + IFM_AVALID, \ + 0 \ +} + +/* + * Macros to extract various bits of information from the media word. + */ +#define IFM_TYPE(x) ((x) & IFM_NMASK) +#define IFM_SUBTYPE(x) ((x) & IFM_TMASK) +#define IFM_TYPE_OPTIONS(x) ((x) & IFM_OMASK) +#define IFM_INST(x) (((x) & IFM_IMASK) >> IFM_ISHIFT) +#define IFM_OPTIONS(x) ((x) & (IFM_OMASK|IFM_GMASK)) +#define IFM_MODE(x) ((x) & IFM_MMASK) + +#define IFM_INST_MAX IFM_INST(IFM_IMASK) + +/* + * Macro to create a media word. + */ +#define IFM_MAKEWORD(type, subtype, options, instance) \ + ((type) | (subtype) | (options) | ((instance) << IFM_ISHIFT)) +#define IFM_MAKEMODE(mode) \ + (((mode) << IFM_MSHIFT) & IFM_MMASK) + +/* + * NetBSD extension not defined in the BSDI API. This is used in various + * places to get the canonical description for a given type/subtype. + * + * NOTE: all but the top-level type descriptions must contain NO whitespace! + * Otherwise, parsing these in ifconfig(8) would be a nightmare. + */ +struct ifmedia_description { + int ifmt_word; /* word value; may be masked */ + const char *ifmt_string; /* description */ +}; + +#define IFM_TYPE_DESCRIPTIONS { \ + { IFM_ETHER, "Ethernet" }, \ + { IFM_TOKEN, "Token ring" }, \ + { IFM_FDDI, "FDDI" }, \ + { IFM_IEEE80211, "IEEE 802.11 Wireless Ethernet" }, \ + { IFM_ATM, "ATM" }, \ + { IFM_CARP, "Common Address Redundancy Protocol" }, \ + { 0, NULL }, \ +} + +#define IFM_SUBTYPE_ETHERNET_DESCRIPTIONS { \ + { IFM_10_T, "10baseT/UTP" }, \ + { IFM_10_2, "10base2/BNC" }, \ + { IFM_10_5, "10base5/AUI" }, \ + { IFM_100_TX, "100baseTX" }, \ + { IFM_100_FX, "100baseFX" }, \ + { IFM_100_T4, "100baseT4" }, \ + { IFM_100_VG, "100baseVG" }, \ + { IFM_100_T2, "100baseT2" }, \ + { IFM_10_STP, "10baseSTP" }, \ + { IFM_10_FL, "10baseFL" }, \ + { IFM_1000_SX, "1000baseSX" }, \ + { IFM_1000_LX, "1000baseLX" }, \ + { IFM_1000_CX, "1000baseCX" }, \ + { IFM_1000_T, "1000baseT" }, \ + { IFM_HPNA_1, "homePNA" }, \ + { IFM_10G_LR, "10Gbase-LR" }, \ + { IFM_10G_SR, "10Gbase-SR" }, \ + { IFM_10G_CX4, "10Gbase-CX4" }, \ + { IFM_2500_SX, "2500BaseSX" }, \ + { IFM_10G_LRM, "10Gbase-LRM" }, \ + { IFM_10G_TWINAX, "10Gbase-Twinax" }, \ + { IFM_10G_TWINAX_LONG, "10Gbase-Twinax-Long" }, \ + { IFM_UNKNOWN, "Unknown" }, \ + { IFM_10G_T, "10Gbase-T" }, \ + { 0, NULL }, \ +} + +#define IFM_SUBTYPE_ETHERNET_ALIASES { \ + { IFM_10_T, "UTP" }, \ + { IFM_10_T, "10UTP" }, \ + { IFM_10_2, "BNC" }, \ + { IFM_10_2, "10BNC" }, \ + { IFM_10_5, "AUI" }, \ + { IFM_10_5, "10AUI" }, \ + { IFM_100_TX, "100TX" }, \ + { IFM_100_T4, "100T4" }, \ + { IFM_100_VG, "100VG" }, \ + { IFM_100_T2, "100T2" }, \ + { IFM_10_STP, "10STP" }, \ + { IFM_10_FL, "10FL" }, \ + { IFM_1000_SX, "1000SX" }, \ + { IFM_1000_LX, "1000LX" }, \ + { IFM_1000_CX, "1000CX" }, \ + { IFM_1000_T, "1000baseTX" }, \ + { IFM_1000_T, "1000TX" }, \ + { IFM_1000_T, "1000T" }, \ + { IFM_2500_SX, "2500SX" }, \ + { 0, NULL }, \ +} + +#define IFM_SUBTYPE_ETHERNET_OPTION_DESCRIPTIONS { \ + { IFM_ETH_MASTER, "master" }, \ + { IFM_ETH_RXPAUSE, "rxpause" }, \ + { IFM_ETH_TXPAUSE, "txpause" }, \ + { 0, NULL }, \ +} + +#define IFM_SUBTYPE_TOKENRING_DESCRIPTIONS { \ + { IFM_TOK_STP4, "DB9/4Mbit" }, \ + { IFM_TOK_STP16, "DB9/16Mbit" }, \ + { IFM_TOK_UTP4, "UTP/4Mbit" }, \ + { IFM_TOK_UTP16, "UTP/16Mbit" }, \ + { IFM_TOK_STP100, "STP/100Mbit" }, \ + { IFM_TOK_UTP100, "UTP/100Mbit" }, \ + { 0, NULL }, \ +} + +#define IFM_SUBTYPE_TOKENRING_ALIASES { \ + { IFM_TOK_STP4, 
"4STP" }, \ + { IFM_TOK_STP16, "16STP" }, \ + { IFM_TOK_UTP4, "4UTP" }, \ + { IFM_TOK_UTP16, "16UTP" }, \ + { IFM_TOK_STP100, "100STP" }, \ + { IFM_TOK_UTP100, "100UTP" }, \ + { 0, NULL }, \ +} + +#define IFM_SUBTYPE_TOKENRING_OPTION_DESCRIPTIONS { \ + { IFM_TOK_ETR, "EarlyTokenRelease" }, \ + { IFM_TOK_SRCRT, "SourceRouting" }, \ + { IFM_TOK_ALLR, "AllRoutes" }, \ + { IFM_TOK_DTR, "Dedicated" }, \ + { IFM_TOK_CLASSIC,"Classic" }, \ + { IFM_TOK_AUTO, " " }, \ + { 0, NULL }, \ +} + +#define IFM_SUBTYPE_FDDI_DESCRIPTIONS { \ + { IFM_FDDI_SMF, "Single-mode" }, \ + { IFM_FDDI_MMF, "Multi-mode" }, \ + { IFM_FDDI_UTP, "UTP" }, \ + { 0, NULL }, \ +} + +#define IFM_SUBTYPE_FDDI_ALIASES { \ + { IFM_FDDI_SMF, "SMF" }, \ + { IFM_FDDI_MMF, "MMF" }, \ + { IFM_FDDI_UTP, "CDDI" }, \ + { 0, NULL }, \ +} + +#define IFM_SUBTYPE_FDDI_OPTION_DESCRIPTIONS { \ + { IFM_FDDI_DA, "Dual-attach" }, \ + { 0, NULL }, \ +} + +#define IFM_SUBTYPE_IEEE80211_DESCRIPTIONS { \ + { IFM_IEEE80211_FH1, "FH/1Mbps" }, \ + { IFM_IEEE80211_FH2, "FH/2Mbps" }, \ + { IFM_IEEE80211_DS1, "DS/1Mbps" }, \ + { IFM_IEEE80211_DS2, "DS/2Mbps" }, \ + { IFM_IEEE80211_DS5, "DS/5.5Mbps" }, \ + { IFM_IEEE80211_DS11, "DS/11Mbps" }, \ + { IFM_IEEE80211_DS22, "DS/22Mbps" }, \ + { IFM_IEEE80211_OFDM6, "OFDM/6Mbps" }, \ + { IFM_IEEE80211_OFDM9, "OFDM/9Mbps" }, \ + { IFM_IEEE80211_OFDM12, "OFDM/12Mbps" }, \ + { IFM_IEEE80211_OFDM18, "OFDM/18Mbps" }, \ + { IFM_IEEE80211_OFDM24, "OFDM/24Mbps" }, \ + { IFM_IEEE80211_OFDM36, "OFDM/36Mbps" }, \ + { IFM_IEEE80211_OFDM48, "OFDM/48Mbps" }, \ + { IFM_IEEE80211_OFDM54, "OFDM/54Mbps" }, \ + { IFM_IEEE80211_OFDM72, "OFDM/72Mbps" }, \ + { IFM_IEEE80211_DS354k, "DS/354Kbps" }, \ + { IFM_IEEE80211_DS512k, "DS/512Kbps" }, \ + { IFM_IEEE80211_OFDM3, "OFDM/3Mbps" }, \ + { IFM_IEEE80211_OFDM4, "OFDM/4.5Mbps" }, \ + { IFM_IEEE80211_OFDM27, "OFDM/27Mbps" }, \ + { 0, NULL }, \ +} + +#define IFM_SUBTYPE_IEEE80211_ALIASES { \ + { IFM_IEEE80211_FH1, "FH1" }, \ + { IFM_IEEE80211_FH2, "FH2" }, \ + { IFM_IEEE80211_FH1, "FrequencyHopping/1Mbps" }, \ + { IFM_IEEE80211_FH2, "FrequencyHopping/2Mbps" }, \ + { IFM_IEEE80211_DS1, "DS1" }, \ + { IFM_IEEE80211_DS2, "DS2" }, \ + { IFM_IEEE80211_DS5, "DS5.5" }, \ + { IFM_IEEE80211_DS11, "DS11" }, \ + { IFM_IEEE80211_DS22, "DS22" }, \ + { IFM_IEEE80211_DS1, "DirectSequence/1Mbps" }, \ + { IFM_IEEE80211_DS2, "DirectSequence/2Mbps" }, \ + { IFM_IEEE80211_DS5, "DirectSequence/5.5Mbps" }, \ + { IFM_IEEE80211_DS11, "DirectSequence/11Mbps" }, \ + { IFM_IEEE80211_DS22, "DirectSequence/22Mbps" }, \ + { IFM_IEEE80211_OFDM6, "OFDM6" }, \ + { IFM_IEEE80211_OFDM9, "OFDM9" }, \ + { IFM_IEEE80211_OFDM12, "OFDM12" }, \ + { IFM_IEEE80211_OFDM18, "OFDM18" }, \ + { IFM_IEEE80211_OFDM24, "OFDM24" }, \ + { IFM_IEEE80211_OFDM36, "OFDM36" }, \ + { IFM_IEEE80211_OFDM48, "OFDM48" }, \ + { IFM_IEEE80211_OFDM54, "OFDM54" }, \ + { IFM_IEEE80211_OFDM72, "OFDM72" }, \ + { IFM_IEEE80211_DS1, "CCK1" }, \ + { IFM_IEEE80211_DS2, "CCK2" }, \ + { IFM_IEEE80211_DS5, "CCK5.5" }, \ + { IFM_IEEE80211_DS11, "CCK11" }, \ + { IFM_IEEE80211_DS354k, "DS354K" }, \ + { IFM_IEEE80211_DS354k, "DirectSequence/354Kbps" }, \ + { IFM_IEEE80211_DS512k, "DS512K" }, \ + { IFM_IEEE80211_DS512k, "DirectSequence/512Kbps" }, \ + { IFM_IEEE80211_OFDM3, "OFDM3" }, \ + { IFM_IEEE80211_OFDM4, "OFDM4.5" }, \ + { IFM_IEEE80211_OFDM27, "OFDM27" }, \ + { 0, NULL }, \ +} + +#define IFM_SUBTYPE_IEEE80211_OPTION_DESCRIPTIONS { \ + { IFM_IEEE80211_ADHOC, "adhoc" }, \ + { IFM_IEEE80211_HOSTAP, "hostap" }, \ + { IFM_IEEE80211_IBSS, "ibss" }, \ + { 
IFM_IEEE80211_WDS, "wds" }, \ + { IFM_IEEE80211_TURBO, "turbo" }, \ + { IFM_IEEE80211_MONITOR, "monitor" }, \ + { IFM_IEEE80211_MBSS, "mesh" }, \ + { 0, NULL }, \ +} + +#define IFM_SUBTYPE_IEEE80211_MODE_DESCRIPTIONS { \ + { IFM_AUTO, "autoselect" }, \ + { IFM_IEEE80211_11A, "11a" }, \ + { IFM_IEEE80211_11B, "11b" }, \ + { IFM_IEEE80211_11G, "11g" }, \ + { IFM_IEEE80211_FH, "fh" }, \ + { IFM_IEEE80211_11NA, "11na" }, \ + { IFM_IEEE80211_11NG, "11ng" }, \ + { 0, NULL }, \ +} + +#define IFM_SUBTYPE_IEEE80211_MODE_ALIASES { \ + { IFM_AUTO, "auto" }, \ + { 0, NULL }, \ +} + +# define IFM_SUBTYPE_ATM_DESCRIPTIONS { \ + { IFM_ATM_UNKNOWN, "Unknown" }, \ + { IFM_ATM_UTP_25, "UTP/25.6MBit" }, \ + { IFM_ATM_TAXI_100, "Taxi/100MBit" }, \ + { IFM_ATM_TAXI_140, "Taxi/140MBit" }, \ + { IFM_ATM_MM_155, "Multi-mode/155MBit" }, \ + { IFM_ATM_SM_155, "Single-mode/155MBit" }, \ + { IFM_ATM_UTP_155, "UTP/155MBit" }, \ + { IFM_ATM_MM_622, "Multi-mode/622MBit" }, \ + { IFM_ATM_SM_622, "Single-mode/622MBit" }, \ + { IFM_ATM_VIRTUAL, "Virtual" }, \ + { 0, NULL }, \ +} + +# define IFM_SUBTYPE_ATM_ALIASES { \ + { IFM_ATM_UNKNOWN, "UNKNOWN" }, \ + { IFM_ATM_UTP_25, "UTP-25" }, \ + { IFM_ATM_TAXI_100, "TAXI-100" }, \ + { IFM_ATM_TAXI_140, "TAXI-140" }, \ + { IFM_ATM_MM_155, "MM-155" }, \ + { IFM_ATM_SM_155, "SM-155" }, \ + { IFM_ATM_UTP_155, "UTP-155" }, \ + { IFM_ATM_MM_622, "MM-622" }, \ + { IFM_ATM_SM_622, "SM-622" }, \ + { IFM_ATM_VIRTUAL, "VIRTUAL" }, \ + { 0, NULL }, \ +} + +#define IFM_SUBTYPE_ATM_OPTION_DESCRIPTIONS { \ + { IFM_ATM_SDH, "SDH" }, \ + { IFM_ATM_NOSCRAMB, "Noscramb" }, \ + { IFM_ATM_UNASSIGNED, "Unassigned" }, \ + { 0, NULL }, \ +} + + +#define IFM_SUBTYPE_SHARED_DESCRIPTIONS { \ + { IFM_AUTO, "autoselect" }, \ + { IFM_MANUAL, "manual" }, \ + { IFM_NONE, "none" }, \ + { 0, NULL }, \ +} + +#define IFM_SUBTYPE_SHARED_ALIASES { \ + { IFM_AUTO, "auto" }, \ + { 0, NULL }, \ +} + +#define IFM_SHARED_OPTION_DESCRIPTIONS { \ + { IFM_FDX, "full-duplex" }, \ + { IFM_HDX, "half-duplex" }, \ + { IFM_FLOW, "flowcontrol" }, \ + { IFM_FLAG0, "flag0" }, \ + { IFM_FLAG1, "flag1" }, \ + { IFM_FLAG2, "flag2" }, \ + { IFM_LOOP, "hw-loopback" }, \ + { 0, NULL }, \ +} + +/* + * Baudrate descriptions for the various media types. 
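+ * The IFM_BAUDRATE_DESCRIPTIONS table below is terminated by an
+ * all-zero entry; ifmedia_baudrate() (in if_media.c) scans it for an
+ * entry whose ifmb_word equals the type and subtype bits
+ * (IFM_NMASK|IFM_TMASK) of the queried media word.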
+ */ +struct ifmedia_baudrate { + int ifmb_word; /* media word */ + uint64_t ifmb_baudrate; /* corresponding baudrate */ +}; + +#define IFM_BAUDRATE_DESCRIPTIONS { \ + { IFM_ETHER | IFM_10_T, IF_Mbps(10) }, \ + { IFM_ETHER | IFM_10_2, IF_Mbps(10) }, \ + { IFM_ETHER | IFM_10_5, IF_Mbps(10) }, \ + { IFM_ETHER | IFM_100_TX, IF_Mbps(100) }, \ + { IFM_ETHER | IFM_100_FX, IF_Mbps(100) }, \ + { IFM_ETHER | IFM_100_T4, IF_Mbps(100) }, \ + { IFM_ETHER | IFM_100_VG, IF_Mbps(100) }, \ + { IFM_ETHER | IFM_100_T2, IF_Mbps(100) }, \ + { IFM_ETHER | IFM_1000_SX, IF_Mbps(1000) }, \ + { IFM_ETHER | IFM_10_STP, IF_Mbps(10) }, \ + { IFM_ETHER | IFM_10_FL, IF_Mbps(10) }, \ + { IFM_ETHER | IFM_1000_LX, IF_Mbps(1000) }, \ + { IFM_ETHER | IFM_1000_CX, IF_Mbps(1000) }, \ + { IFM_ETHER | IFM_1000_T, IF_Mbps(1000) }, \ + { IFM_ETHER | IFM_HPNA_1, IF_Mbps(1) }, \ + { IFM_ETHER | IFM_10G_LR, IF_Gbps(10ULL) }, \ + { IFM_ETHER | IFM_10G_SR, IF_Gbps(10ULL) }, \ + { IFM_ETHER | IFM_10G_CX4, IF_Gbps(10ULL) }, \ + { IFM_ETHER | IFM_2500_SX, IF_Mbps(2500ULL) }, \ + { IFM_ETHER | IFM_10G_TWINAX, IF_Gbps(10ULL) }, \ + { IFM_ETHER | IFM_10G_TWINAX_LONG, IF_Gbps(10ULL) }, \ + { IFM_ETHER | IFM_10G_LRM, IF_Gbps(10ULL) }, \ + { IFM_ETHER | IFM_10G_T, IF_Gbps(10ULL) }, \ + \ + { IFM_TOKEN | IFM_TOK_STP4, IF_Mbps(4) }, \ + { IFM_TOKEN | IFM_TOK_STP16, IF_Mbps(16) }, \ + { IFM_TOKEN | IFM_TOK_UTP4, IF_Mbps(4) }, \ + { IFM_TOKEN | IFM_TOK_UTP16, IF_Mbps(16) }, \ + \ + { IFM_FDDI | IFM_FDDI_SMF, IF_Mbps(100) }, \ + { IFM_FDDI | IFM_FDDI_MMF, IF_Mbps(100) }, \ + { IFM_FDDI | IFM_FDDI_UTP, IF_Mbps(100) }, \ + \ + { IFM_IEEE80211 | IFM_IEEE80211_FH1, IF_Mbps(1) }, \ + { IFM_IEEE80211 | IFM_IEEE80211_FH2, IF_Mbps(2) }, \ + { IFM_IEEE80211 | IFM_IEEE80211_DS2, IF_Mbps(2) }, \ + { IFM_IEEE80211 | IFM_IEEE80211_DS5, IF_Kbps(5500) }, \ + { IFM_IEEE80211 | IFM_IEEE80211_DS11, IF_Mbps(11) }, \ + { IFM_IEEE80211 | IFM_IEEE80211_DS1, IF_Mbps(1) }, \ + { IFM_IEEE80211 | IFM_IEEE80211_DS22, IF_Mbps(22) }, \ + { IFM_IEEE80211 | IFM_IEEE80211_OFDM6, IF_Mbps(6) }, \ + { IFM_IEEE80211 | IFM_IEEE80211_OFDM9, IF_Mbps(9) }, \ + { IFM_IEEE80211 | IFM_IEEE80211_OFDM12, IF_Mbps(12) }, \ + { IFM_IEEE80211 | IFM_IEEE80211_OFDM18, IF_Mbps(18) }, \ + { IFM_IEEE80211 | IFM_IEEE80211_OFDM24, IF_Mbps(24) }, \ + { IFM_IEEE80211 | IFM_IEEE80211_OFDM36, IF_Mbps(36) }, \ + { IFM_IEEE80211 | IFM_IEEE80211_OFDM48, IF_Mbps(48) }, \ + { IFM_IEEE80211 | IFM_IEEE80211_OFDM54, IF_Mbps(54) }, \ + { IFM_IEEE80211 | IFM_IEEE80211_OFDM72, IF_Mbps(72) }, \ + \ + { 0, 0 }, \ +} + +/* + * Status descriptions for the various media types. + */ +struct ifmedia_status_description { + int ifms_type; + int ifms_valid; + int ifms_bit; + const char *ifms_string[2]; +}; + +#define IFM_STATUS_DESC(ifms, bit) \ + (ifms)->ifms_string[((ifms)->ifms_bit & (bit)) ? 
1 : 0] + +#define IFM_STATUS_DESCRIPTIONS { \ + { IFM_ETHER, IFM_AVALID, IFM_ACTIVE, \ + { "no carrier", "active" } }, \ + { IFM_FDDI, IFM_AVALID, IFM_ACTIVE, \ + { "no ring", "inserted" } }, \ + { IFM_TOKEN, IFM_AVALID, IFM_ACTIVE, \ + { "no ring", "inserted" } }, \ + { IFM_IEEE80211, IFM_AVALID, IFM_ACTIVE, \ + { "no network", "active" } }, \ + { IFM_ATM, IFM_AVALID, IFM_ACTIVE, \ + { "no network", "active" } }, \ + { IFM_CARP, IFM_AVALID, IFM_ACTIVE, \ + { "backup", "master" } }, \ + { 0, 0, 0, \ + { NULL, NULL } } \ +} +#endif /* _NET_IF_MEDIA_HH_ */ diff --git a/freebsd/sys/net/if_mib.c b/freebsd/sys/net/if_mib.c new file mode 100644 index 00000000..ddc75df3 --- /dev/null +++ b/freebsd/sys/net/if_mib.c @@ -0,0 +1,171 @@ +#include + +/*- + * Copyright 1996 Massachusetts Institute of Technology + * + * Permission to use, copy, modify, and distribute this software and + * its documentation for any purpose and without fee is hereby + * granted, provided that both the above copyright notice and this + * permission notice appear in all copies, that both the above + * copyright notice and this permission notice appear in all + * supporting documentation, and that the name of M.I.T. not be used + * in advertising or publicity pertaining to distribution of the + * software without specific, written prior permission. M.I.T. makes + * no representations about the suitability of this software for any + * purpose. It is provided "as is" without express or implied + * warranty. + * + * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS + * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT + * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +/* + * A sysctl(3) MIB for generic interface information. This information + * is exported in the net.link.generic branch, which has the following + * structure: + * + * net.link.generic .system - system-wide control variables + * and statistics (node) + * .ifdata..general + * - what's in `struct ifdata' + * plus some other info + * .ifdata..linkspecific + * - a link-type-specific data + * structure (as might be used + * by an SNMP agent + * + * Perhaps someday we will make addresses accessible via this interface + * as well (then there will be four such...). The reason that the + * index comes before the last element in the name is because it + * seems more orthogonal that way, particularly with the possibility + * of other per-interface data living down here as well (e.g., integrated + * services stuff). 
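+ *
+ * An illustrative userland sketch (hedged: minimal error handling,
+ * interface index 1 assumed) of reading the general ifdata row:
+ *
+ *	int name[6] = { CTL_NET, PF_LINK, NETLINK_GENERIC,
+ *	    IFMIB_IFDATA, 1, IFDATA_GENERAL };
+ *	struct ifmibdata ifmd;
+ *	size_t len = sizeof(ifmd);
+ *
+ *	if (sysctl(name, 6, &ifmd, &len, NULL, 0) < 0)
+ *		err(1, "sysctl");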
+ */ + +SYSCTL_DECL(_net_link_generic); +SYSCTL_NODE(_net_link_generic, IFMIB_SYSTEM, system, CTLFLAG_RW, 0, + "Variables global to all interfaces"); + +SYSCTL_VNET_INT(_net_link_generic_system, IFMIB_IFCOUNT, ifcount, CTLFLAG_RD, + &VNET_NAME(if_index), 0, + "Number of configured interfaces"); + +static int +sysctl_ifdata(SYSCTL_HANDLER_ARGS) /* XXX bad syntax! */ +{ + int *name = (int *)arg1; + int error; + u_int namelen = arg2; + struct ifnet *ifp; + struct ifmibdata ifmd; + size_t dlen; + char *dbuf; + + if (namelen != 2) + return EINVAL; + if (name[0] <= 0) + return (ENOENT); + ifp = ifnet_byindex_ref(name[0]); + if (ifp == NULL) + return (ENOENT); + + switch(name[1]) { + default: + error = ENOENT; + goto out; + + case IFDATA_GENERAL: + bzero(&ifmd, sizeof(ifmd)); + strlcpy(ifmd.ifmd_name, ifp->if_xname, sizeof(ifmd.ifmd_name)); + +#define COPY(fld) ifmd.ifmd_##fld = ifp->if_##fld + COPY(pcount); + COPY(data); +#undef COPY + ifmd.ifmd_flags = ifp->if_flags | ifp->if_drv_flags; + ifmd.ifmd_snd_len = ifp->if_snd.ifq_len; + ifmd.ifmd_snd_maxlen = ifp->if_snd.ifq_maxlen; + ifmd.ifmd_snd_drops = ifp->if_snd.ifq_drops; + + error = SYSCTL_OUT(req, &ifmd, sizeof ifmd); + if (error || !req->newptr) + goto out; + + error = SYSCTL_IN(req, &ifmd, sizeof ifmd); + if (error) + goto out; + +#define DONTCOPY(fld) ifmd.ifmd_data.ifi_##fld = ifp->if_data.ifi_##fld + DONTCOPY(type); + DONTCOPY(physical); + DONTCOPY(addrlen); + DONTCOPY(hdrlen); + DONTCOPY(mtu); + DONTCOPY(metric); + DONTCOPY(baudrate); +#undef DONTCOPY +#define COPY(fld) ifp->if_##fld = ifmd.ifmd_##fld + COPY(data); + ifp->if_snd.ifq_maxlen = ifmd.ifmd_snd_maxlen; + ifp->if_snd.ifq_drops = ifmd.ifmd_snd_drops; +#undef COPY + break; + + case IFDATA_LINKSPECIFIC: + error = SYSCTL_OUT(req, ifp->if_linkmib, ifp->if_linkmiblen); + if (error || !req->newptr) + goto out; + + error = SYSCTL_IN(req, ifp->if_linkmib, ifp->if_linkmiblen); + if (error) + goto out; + break; + + case IFDATA_DRIVERNAME: + /* 20 is enough for 64bit ints */ + dlen = strlen(ifp->if_dname) + 20 + 1; + if ((dbuf = malloc(dlen, M_TEMP, M_NOWAIT)) == NULL) { + error = ENOMEM; + goto out; + } + if (ifp->if_dunit == IF_DUNIT_NONE) + strcpy(dbuf, ifp->if_dname); + else + sprintf(dbuf, "%s%d", ifp->if_dname, ifp->if_dunit); + + error = SYSCTL_OUT(req, dbuf, strlen(dbuf) + 1); + if (error == 0 && req->newptr != NULL) + error = EPERM; + free(dbuf, M_TEMP); + goto out; + } +out: + if_rele(ifp); + return error; +} + +SYSCTL_NODE(_net_link_generic, IFMIB_IFDATA, ifdata, CTLFLAG_RW, + sysctl_ifdata, "Interface table"); + diff --git a/freebsd/sys/net/if_mib.h b/freebsd/sys/net/if_mib.h new file mode 100644 index 00000000..e2b80c87 --- /dev/null +++ b/freebsd/sys/net/if_mib.h @@ -0,0 +1,171 @@ +/*- + * Copyright 1996 Massachusetts Institute of Technology + * + * Permission to use, copy, modify, and distribute this software and + * its documentation for any purpose and without fee is hereby + * granted, provided that both the above copyright notice and this + * permission notice appear in all copies, that both the above + * copyright notice and this permission notice appear in all + * supporting documentation, and that the name of M.I.T. not be used + * in advertising or publicity pertaining to distribution of the + * software without specific, written prior permission. M.I.T. makes + * no representations about the suitability of this software for any + * purpose. It is provided "as is" without express or implied + * warranty. + * + * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. 
DISCLAIMS + * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT + * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _NET_IF_MIB_H +#define _NET_IF_MIB_H 1 + +struct ifmibdata { + char ifmd_name[IFNAMSIZ]; /* name of interface */ + int ifmd_pcount; /* number of promiscuous listeners */ + int ifmd_flags; /* interface flags */ + int ifmd_snd_len; /* instantaneous length of send queue */ + int ifmd_snd_maxlen; /* maximum length of send queue */ + int ifmd_snd_drops; /* number of drops in send queue */ + int ifmd_filler[4]; /* for future expansion */ + struct if_data ifmd_data; /* generic information and statistics */ +}; + +/* + * sysctl MIB tags at the net.link.generic level + */ +#define IFMIB_SYSTEM 1 /* non-interface-specific */ +#define IFMIB_IFDATA 2 /* per-interface data table */ + +/* + * MIB tags for the various net.link.generic.ifdata tables + */ +#define IFDATA_GENERAL 1 /* generic stats for all kinds of ifaces */ +#define IFDATA_LINKSPECIFIC 2 /* specific to the type of interface */ +#define IFDATA_DRIVERNAME 3 /* driver name and unit */ + +/* + * MIB tags at the net.link.generic.system level + */ +#define IFMIB_IFCOUNT 1 /* number of interfaces configured */ + +/* + * MIB tags as the net.link level + * All of the other values are IFT_* names defined in if_types.h. + */ +#define NETLINK_GENERIC 0 /* functions not specific to a type of iface */ + +/* + * The reason why the IFDATA_LINKSPECIFIC stuff is not under the + * net.link. branches is twofold: + * 1) It's easier to code this way, and doesn't require duplication. + * 2) The fourth level under net.link. is ; that is to say, + * the net.link. tree instruments the adaptation layers between + * and a particular protocol family (e.g., net.link.ether.inet + * instruments ARP). This does not really leave room for anything else + * that needs to have a well-known number. + */ + +/* + * Link-specific MIB structures for various link types. + */ + +/* For IFT_ETHER, IFT_ISO88023, and IFT_STARLAN, as used by RFC 1650 */ +struct ifmib_iso_8802_3 { + u_int32_t dot3StatsAlignmentErrors; + u_int32_t dot3StatsFCSErrors; + u_int32_t dot3StatsSingleCollisionFrames; + u_int32_t dot3StatsMultipleCollisionFrames; + u_int32_t dot3StatsSQETestErrors; + u_int32_t dot3StatsDeferredTransmissions; + u_int32_t dot3StatsLateCollisions; + u_int32_t dot3StatsExcessiveCollisions; + u_int32_t dot3StatsInternalMacTransmitErrors; + u_int32_t dot3StatsCarrierSenseErrors; + u_int32_t dot3StatsFrameTooLongs; + u_int32_t dot3StatsInternalMacReceiveErrors; + u_int32_t dot3StatsEtherChipSet; + /* Matt Thomas wants this one, not included in RFC 1650: */ + u_int32_t dot3StatsMissedFrames; + + u_int32_t dot3StatsCollFrequencies[16]; /* NB: index origin */ + + u_int32_t dot3Compliance; +#define DOT3COMPLIANCE_STATS 1 +#define DOT3COMPLIANCE_COLLS 2 +}; + +/* + * Chipset identifiers are normally part of the vendor's enterprise MIB. 
+ * However, we don't want to be trying to represent arbitrary-length + * OBJECT IDENTIFIERs here (ick!), and the right value is not necessarily + * obvious to the driver implementor. So, we define our own identification + * mechanism here, and let the agent writer deal with the translation. + */ +#define DOT3CHIPSET_VENDOR(x) ((x) >> 16) +#define DOT3CHIPSET_PART(x) ((x) & 0xffff) +#define DOT3CHIPSET(v,p) (((v) << 16) + ((p) & 0xffff)) + +/* Driver writers! Add your vendors here! */ +enum dot3Vendors { + dot3VendorAMD = 1, + dot3VendorIntel = 2, + dot3VendorNational = 4, + dot3VendorFujitsu = 5, + dot3VendorDigital = 6, + dot3VendorWesternDigital = 7 +}; + +/* Driver writers! Add your chipsets here! */ +enum { + dot3ChipSetAMD7990 = 1, + dot3ChipSetAMD79900 = 2, + dot3ChipSetAMD79C940 = 3 +}; + +enum { + dot3ChipSetIntel82586 = 1, + dot3ChipSetIntel82596 = 2, + dot3ChipSetIntel82557 = 3 +}; + +enum { + dot3ChipSetNational8390 = 1, + dot3ChipSetNationalSonic = 2 +}; + +enum { + dot3ChipSetFujitsu86950 = 1 +}; + +enum { + dot3ChipSetDigitalDC21040 = 1, + dot3ChipSetDigitalDC21140 = 2, + dot3ChipSetDigitalDC21041 = 3, + dot3ChipSetDigitalDC21140A = 4, + dot3ChipSetDigitalDC21142 = 5 +}; + +enum { + dot3ChipSetWesternDigital83C690 = 1, + dot3ChipSetWesternDigital83C790 = 2 +}; +/* END of Ethernet-link MIB stuff */ + +/* + * Put other types of interface MIBs here, or in interface-specific + * header files if convenient ones already exist. + */ +#endif /* _NET_IF_MIB_H */ diff --git a/freebsd/sys/net/if_sppp.h b/freebsd/sys/net/if_sppp.h new file mode 100644 index 00000000..ed406b55 --- /dev/null +++ b/freebsd/sys/net/if_sppp.h @@ -0,0 +1,234 @@ +/* + * Defines for synchronous PPP/Cisco/Frame Relay link level subroutines. + */ +/*- + * Copyright (C) 1994-2000 Cronyx Engineering. + * Author: Serge Vakulenko, + * + * Heavily revamped to conform to RFC 1661. + * Copyright (C) 1997, Joerg Wunsch. + * + * This software is distributed with NO WARRANTIES, not even the implied + * warranties for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Authors grant any other persons or organizations permission to use + * or modify this software as long as this message is kept with the software, + * all derivative works or modified versions. 
+ *
+ * From: Version 2.0, Fri Oct 6 20:39:21 MSK 1995
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NET_IF_SPPP_HH_
+#define _NET_IF_SPPP_HH_ 1
+
+#define IDX_LCP 0		/* idx into state table */
+
+struct slcp {
+	u_long	opts;		/* LCP options to send (bitfield) */
+	u_long	magic;		/* local magic number */
+	u_long	mru;		/* our max receive unit */
+	u_long	their_mru;	/* their max receive unit */
+	u_long	protos;		/* bitmask of protos that are started */
+	u_char	echoid;		/* id of last keepalive echo request */
+	/* restart max values, see RFC 1661 */
+	int	timeout;
+	int	max_terminate;
+	int	max_configure;
+	int	max_failure;
+};
+
+#define IDX_IPCP 1		/* idx into state table */
+#define IDX_IPV6CP 2		/* idx into state table */
+
+struct sipcp {
+	u_long	opts;		/* IPCP options to send (bitfield) */
+	u_int	flags;
+#define IPCP_HISADDR_SEEN 1	/* have seen his address already */
+#define IPCP_MYADDR_DYN   2	/* my address is dynamically assigned */
+#define IPCP_MYADDR_SEEN  4	/* have seen my address already */
+#ifdef notdef
+#define IPV6CP_MYIFID_DYN 8	/* my ifid is dynamically assigned */
+#endif
+#define IPV6CP_MYIFID_SEEN 0x10	/* have seen his ifid already */
+#define IPCP_VJ		0x20	/* can use VJ compression */
+	int	max_state;	/* VJ: Max-Slot-Id */
+	int	compress_cid;	/* VJ: Comp-Slot-Id */
+};
+
+#define AUTHNAMELEN 64
+#define AUTHKEYLEN 16
+
+struct sauth {
+	u_short	proto;			/* authentication protocol to use */
+	u_short	flags;
+#define AUTHFLAG_NOCALLOUT	1	/* do not require authentication on */
+					/* callouts */
+#define AUTHFLAG_NORECHALLENGE	2	/* do not re-challenge CHAP */
+	u_char	name[AUTHNAMELEN];	/* system identification name */
+	u_char	secret[AUTHKEYLEN];	/* secret password */
+	u_char	challenge[AUTHKEYLEN];	/* random challenge */
+};
+
+#define IDX_PAP		3
+#define IDX_CHAP	4
+
+#define IDX_COUNT (IDX_CHAP + 1)	/* bump this when adding cp's! */
+
+/*
+ * Don't change the order of this.  Ordering the phases this way allows
+ * for a comparison of ``pp_phase >= PHASE_AUTHENTICATE'' in order to
+ * know whether LCP is up.
+ */
+enum ppp_phase {
+	PHASE_DEAD, PHASE_ESTABLISH, PHASE_TERMINATE,
+	PHASE_AUTHENTICATE, PHASE_NETWORK
+};
+
+#define PP_MTU		1500	/* default/minimal MRU */
+#define PP_MAX_MRU	2048	/* maximal MRU we want to negotiate */
+
+/*
+ * This is a cut down struct sppp (see below) that can easily be
+ * exported to / imported from userland without the need to include
+ * dozens of kernel-internal header files.  It is used by the
+ * SPPPIO[GS]DEFS ioctl commands below.
+ */
+struct sppp_parms {
+	enum ppp_phase pp_phase;	/* phase we're currently in */
+	int	enable_vj;		/* VJ header compression enabled */
+	int	enable_ipv6;		/*
+					 * Enable IPv6 negotiations -- only
+					 * needed since each IPv4 i/f auto-
+					 * matically gets an IPv6 address
+					 * assigned, so we can't use this as
+					 * a decision.
+					 */
+	struct slcp lcp;		/* LCP params */
+	struct sipcp ipcp;		/* IPCP params */
+	struct sipcp ipv6cp;		/* IPv6CP params */
+	struct sauth myauth;		/* auth params, i'm peer */
+	struct sauth hisauth;		/* auth params, i'm authenticator */
+};
+
+/*
+ * Definitions to pass struct sppp_parms data down into the kernel
+ * using the SIOC[SG]IFGENERIC ioctl interface.
+ *
+ * In order to use this, create a struct spppreq, fill in the cmd
+ * field with SPPPIOGDEFS, and put the address of this structure into
+ * the ifr_data portion of a struct ifreq.  Pass this struct to a
+ * SIOCGIFGENERIC ioctl.
Then replace the cmd field by SPPPIOSDEFS, + * modify the defs field as desired, and pass the struct ifreq now + * to a SIOCSIFGENERIC ioctl. + */ + +#define SPPPIOGDEFS ((caddr_t)(('S' << 24) + (1 << 16) +\ + sizeof(struct sppp_parms))) +#define SPPPIOSDEFS ((caddr_t)(('S' << 24) + (2 << 16) +\ + sizeof(struct sppp_parms))) + +struct spppreq { + int cmd; + struct sppp_parms defs; +}; + +#ifdef _KERNEL +struct sppp { + struct ifnet *pp_ifp; /* network interface data */ + struct ifqueue pp_fastq; /* fast output queue */ + struct ifqueue pp_cpq; /* PPP control protocol queue */ + struct sppp *pp_next; /* next interface in keepalive list */ + u_int pp_mode; /* major protocol modes (cisco/ppp/...) */ + u_int pp_flags; /* sub modes */ + u_short pp_alivecnt; /* keepalive packets counter */ + u_short pp_loopcnt; /* loopback detection counter */ + u_long pp_seq[IDX_COUNT]; /* local sequence number */ + u_long pp_rseq[IDX_COUNT]; /* remote sequence number */ + enum ppp_phase pp_phase; /* phase we're currently in */ + int state[IDX_COUNT]; /* state machine */ + u_char confid[IDX_COUNT]; /* id of last configuration request */ + int rst_counter[IDX_COUNT]; /* restart counter */ + int fail_counter[IDX_COUNT]; /* negotiation failure counter */ + int confflags; /* administrative configuration flags */ +#define CONF_ENABLE_VJ 0x01 /* VJ header compression enabled */ +#define CONF_ENABLE_IPV6 0x02 /* IPv6 administratively enabled */ + time_t pp_last_recv; /* time last packet has been received */ + time_t pp_last_sent; /* time last packet has been sent */ + struct callout ch[IDX_COUNT]; /* per-proto and if callouts */ + struct callout pap_my_to_ch; /* PAP needs one more... */ + struct callout keepalive_callout; /* keepalive callout */ + struct slcp lcp; /* LCP params */ + struct sipcp ipcp; /* IPCP params */ + struct sipcp ipv6cp; /* IPv6CP params */ + struct sauth myauth; /* auth params, i'm peer */ + struct sauth hisauth; /* auth params, i'm authenticator */ + struct slcompress *pp_comp; /* for VJ compression */ + u_short fr_dlci; /* Frame Relay DLCI number, 16..1023 */ + u_char fr_status; /* PVC status, active/new/delete */ + /* + * These functions are filled in by sppp_attach(), and are + * expected to be used by the lower layer (hardware) drivers + * in order to communicate the (un)availability of the + * communication link. Lower layer drivers that are always + * ready to communicate (like hardware HDLC) can shortcut + * pp_up from pp_tls, and pp_down from pp_tlf. + */ + void (*pp_up)(struct sppp *sp); + void (*pp_down)(struct sppp *sp); + /* + * These functions need to be filled in by the lower layer + * (hardware) drivers if they request notification from the + * PPP layer whether the link is actually required. They + * correspond to the tls and tlf actions. + */ + void (*pp_tls)(struct sppp *sp); + void (*pp_tlf)(struct sppp *sp); + /* + * These (optional) functions may be filled by the hardware + * driver if any notification of established connections + * (currently: IPCP up) is desired (pp_con) or any internal + * state change of the interface state machine should be + * signaled for monitoring purposes (pp_chg). 
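+ *
+ * [Editor's sketch -- a hypothetical lower-layer driver that is always
+ * ready to communicate, using the pp_tls/pp_tlf shortcut described
+ * above; the mydrv_* names are invented and this is not original code:
+ *
+ *	static void mydrv_tls(struct sppp *sp) { sp->pp_up(sp); }
+ *	static void mydrv_tlf(struct sppp *sp) { sp->pp_down(sp); }
+ *	...
+ *	sppp_attach(ifp);
+ *	IFP2SP(ifp)->pp_tls = mydrv_tls;
+ *	IFP2SP(ifp)->pp_tlf = mydrv_tlf;
+ * ]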
+ */ + void (*pp_con)(struct sppp *sp); + void (*pp_chg)(struct sppp *sp, int new_state); + /* These two fields are for use by the lower layer */ + void *pp_lowerp; + int pp_loweri; + /* Lock */ + struct mtx mtx; + /* if_start () wrapper */ + void (*if_start) (struct ifnet *); + struct callout ifstart_callout; /* if_start () scheduler */ +}; +#define IFP2SP(ifp) ((struct sppp *)(ifp)->if_l2com) +#define SP2IFP(sp) ((sp)->pp_ifp) + +/* bits for pp_flags */ +#define PP_KEEPALIVE 0x01 /* use keepalive protocol */ +#define PP_FR 0x04 /* use Frame Relay protocol instead of PPP */ + /* 0x04 was PP_TIMO */ +#define PP_CALLIN 0x08 /* we are being called */ +#define PP_NEEDAUTH 0x10 /* remote requested authentication */ + +void sppp_attach (struct ifnet *ifp); +void sppp_detach (struct ifnet *ifp); +void sppp_input (struct ifnet *ifp, struct mbuf *m); +int sppp_ioctl (struct ifnet *ifp, u_long cmd, void *data); +struct mbuf *sppp_dequeue (struct ifnet *ifp); +struct mbuf *sppp_pick(struct ifnet *ifp); +int sppp_isempty (struct ifnet *ifp); +void sppp_flush (struct ifnet *ifp); + +/* Internal functions */ +void sppp_fr_input (struct sppp *sp, struct mbuf *m); +struct mbuf *sppp_fr_header (struct sppp *sp, struct mbuf *m, int fam); +void sppp_fr_keepalive (struct sppp *sp); +void sppp_get_ip_addrs(struct sppp *sp, u_long *src, u_long *dst, + u_long *srcmask); + +#endif + +#endif /* _NET_IF_SPPP_HH_ */ diff --git a/freebsd/sys/net/if_spppfr.c b/freebsd/sys/net/if_spppfr.c new file mode 100644 index 00000000..fa912363 --- /dev/null +++ b/freebsd/sys/net/if_spppfr.c @@ -0,0 +1,636 @@ +#include + +/*- + * Synchronous Frame Relay link level subroutines. + * ANSI T1.617-compaible link management signaling + * implemented for Frame Relay mode. + * Cisco-type Frame Relay framing added, thanks Alex Tutubalin. + * Only one DLCI per channel for now. + * + * Copyright (C) 1994-2000 Cronyx Engineering. + * Author: Serge Vakulenko, + * + * Copyright (C) 1999-2004 Cronyx Engineering. + * Author: Kurakin Roman, + * + * This software is distributed with NO WARRANTIES, not even the implied + * warranties for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Authors grant any other persons or organisations a permission to use, + * modify and redistribute this software in source and binary forms, + * as long as this message is kept with the software, all derivative + * works or modified versions. + * + * $Cronyx Id: if_spppfr.c,v 1.1.2.10 2004/06/29 09:02:30 rik Exp $ + * $FreeBSD$ + */ + +#include + +#if defined(__FreeBSD__) && __FreeBSD__ >= 3 +#include +#include +#include +#endif + +#ifdef NetBSD1_3 +# if NetBSD1_3 > 6 +# include "opt_inet.h" +# include "opt_inet6.h" +# include "opt_iso.h" +# endif +#endif + +#include +#include +#include +#include +#include +#include +#if defined(__FreeBSD__) && __FreeBSD__ >= 3 +#include +#endif +#include +#include + +#if defined (__OpenBSD__) +#include +#else +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined (__NetBSD__) || defined (__OpenBSD__) +#include /* XXX for softnet */ +#endif + +#include + +#include +#ifdef INET +#include +#include +#endif + +#if defined (__FreeBSD__) || defined (__OpenBSD__) +# include +#else +# include +#endif + +#ifdef IPX +#include +#include +#endif + +#include + +/* + * Frame Relay. 
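+ *
+ * [Editor's note -- a worked example, not original text: the two Q.922
+ * address octets carry the 10-bit DLCI split 6/4 plus the EA bit, which
+ * the input path below decodes as
+ *	dlci = (h[0] << 2 & 0x3f0) | (h[1] >> 4 & 0x0f);
+ * and the output path encodes as
+ *	h[0] = dlci >> 2 & 0xfc;  h[1] = dlci << 4 | 1;
+ * so DLCI 16 appears on the wire as the bytes 0x04 0x01.]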
+ */ +#define FR_UI 0x03 /* Unnumbered Information */ +#define FR_IP 0xCC /* IP protocol identifier */ +#define FR_PADDING 0x00 /* NLPID padding */ +#define FR_SIGNALING 0x08 /* Q.933/T1.617 signaling identifier */ +#define FR_SNAP 0x80 /* NLPID snap */ + +/* + * Header flags. + */ +#define FR_DE 0x02 /* discard eligibility */ +#define FR_FECN 0x04 /* forward notification */ +#define FR_BECN 0x08 /* backward notification */ + +/* + * Signaling message types. + */ +#define FR_MSG_ENQUIRY 0x75 /* status enquiry */ +#define FR_MSG_STATUS 0x7d /* status */ + +#define FR_ENQUIRY_SIZE 14 + +/* + * Message field types. + */ +#define FR_FLD_RTYPE 0x01 /* report type */ +#define FR_FLD_VERIFY 0x03 /* link verification */ +#define FR_FLD_PVC 0x07 /* PVC status */ +#define FR_FLD_LSHIFT5 0x95 /* locking shift 5 */ + +/* + * Report types. + */ +#define FR_RTYPE_FULL 0 /* full status */ +#define FR_RTYPE_SHORT 1 /* link verification only */ +#define FR_RTYPE_SINGLE 2 /* single PVC status */ + +/* PVC status field. */ +#define FR_DLCI_DELETE 0x04 /* PVC is deleted */ +#define FR_DLCI_ACTIVE 0x02 /* PVC is operational */ +#define FR_DLCI_NEW 0x08 /* PVC is new */ + +struct arp_req { + unsigned short htype; /* hardware type = ARPHRD_FRELAY */ + unsigned short ptype; /* protocol type = ETHERTYPE_IP */ + unsigned char halen; /* hardware address length = 2 */ + unsigned char palen; /* protocol address length = 4 */ + unsigned short op; /* ARP/RARP/InARP request/reply */ + unsigned short hsource; /* hardware source address */ + unsigned short psource1; /* protocol source */ + unsigned short psource2; + unsigned short htarget; /* hardware target address */ + unsigned short ptarget1; /* protocol target */ + unsigned short ptarget2; +} __packed; + +#if defined(__FreeBSD__) && __FreeBSD__ >= 3 && __FreeBSD_version < 501113 +#define SPP_FMT "%s%d: " +#define SPP_ARGS(ifp) (ifp)->if_name, (ifp)->if_unit +#else +#define SPP_FMT "%s: " +#define SPP_ARGS(ifp) (ifp)->if_xname +#endif + +/* almost every function needs these */ +#define STDDCL \ + struct ifnet *ifp = SP2IFP(sp); \ + int debug = ifp->if_flags & IFF_DEBUG + +static void sppp_fr_arp (struct sppp *sp, struct arp_req *req, u_short addr); +static void sppp_fr_signal (struct sppp *sp, unsigned char *h, int len); + +void sppp_fr_input (struct sppp *sp, struct mbuf *m) +{ + STDDCL; + u_char *h = mtod (m, u_char*); + int isr = -1; + int dlci, hlen, proto; + + /* Get the DLCI number. */ + if (m->m_pkthdr.len < 10) { +bad: m_freem (m); + return; + } + dlci = (h[0] << 2 & 0x3f0) | (h[1] >> 4 & 0x0f); + + /* Process signaling packets. */ + if (dlci == 0) { + sppp_fr_signal (sp, h, m->m_pkthdr.len); + m_freem (m); + return; + } + + if (dlci != sp->fr_dlci) { + if (debug) + printf (SPP_FMT "Received packet from invalid DLCI %d\n", + SPP_ARGS(ifp), dlci); + goto bad; + } + + /* Process the packet. */ + if (ntohs (*(short*) (h+2)) == ETHERTYPE_IP) { + /* Prehistoric IP framing? 
*/ + h[2] = FR_UI; + h[3] = FR_IP; + } + if (h[2] != FR_UI) { + if (debug) + printf (SPP_FMT "Invalid frame relay header flag 0x%02x\n", + SPP_ARGS(ifp), h[2]); + goto bad; + } + switch (h[3]) { + default: + if (debug) + printf (SPP_FMT "Unsupported NLPID 0x%02x\n", + SPP_ARGS(ifp), h[3]); + goto bad; + + case FR_PADDING: + if (h[4] != FR_SNAP) { + if (debug) + printf (SPP_FMT "Bad NLPID 0x%02x\n", + SPP_ARGS(ifp), h[4]); + goto bad; + } + if (h[5] || h[6] || h[7]) { + if (debug) + printf (SPP_FMT "Bad OID 0x%02x-0x%02x-0x%02x\n", + SPP_ARGS(ifp), + h[5], h[6], h[7]); + goto bad; + } + proto = ntohs (*(short*) (h+8)); + if (proto == ETHERTYPE_ARP) { + /* Process the ARP request. */ + if (m->m_pkthdr.len != 10 + sizeof (struct arp_req)) { + if (debug) + printf (SPP_FMT "Bad ARP request size = %d bytes\n", + SPP_ARGS(ifp), + m->m_pkthdr.len); + goto bad; + } + sppp_fr_arp (sp, (struct arp_req*) (h + 10), + h[0] << 8 | h[1]); + m_freem (m); + return; + } + hlen = 10; + break; + + case FR_IP: + proto = ETHERTYPE_IP; + hlen = 4; + break; + } + + /* Remove frame relay header. */ + m_adj (m, hlen); + + switch (proto) { + default: + ++ifp->if_noproto; +drop: ++ifp->if_ierrors; + ++ifp->if_iqdrops; + m_freem (m); + return; +#ifdef INET + case ETHERTYPE_IP: + isr = NETISR_IP; + break; +#endif +#ifdef IPX + case ETHERTYPE_IPX: + isr = NETISR_IPX; + break; +#endif +#ifdef NETATALK + case ETHERTYPE_AT: + isr = NETISR_ATALK; + break; +#endif + } + + if (! (ifp->if_flags & IFF_UP)) + goto drop; + + /* Check queue. */ + if (netisr_queue(isr, m)) { /* (0) on success. */ + if (debug) + log(LOG_DEBUG, SPP_FMT "protocol queue overflow\n", + SPP_ARGS(ifp)); + } +} + +/* + * Add the frame relay header to the packet. + * For IP the header length is 4 bytes, + * for all other protocols - 10 bytes (RFC 1490). + */ +struct mbuf *sppp_fr_header (struct sppp *sp, struct mbuf *m, + int family) +{ + STDDCL; + u_char *h; + int type, hlen; + + /* Prepend the space for Frame Relay header. */ + hlen = (family == AF_INET) ? 4 : 10; + M_PREPEND (m, hlen, M_DONTWAIT); + if (! m) + return 0; + h = mtod (m, u_char*); + + /* Fill the header. */ + h[0] = sp->fr_dlci >> 2 & 0xfc; + h[1] = sp->fr_dlci << 4 | 1; + h[2] = FR_UI; + + switch (family) { + default: + if (debug) + printf (SPP_FMT "Cannot handle address family %d\n", + SPP_ARGS(ifp), family); + m_freem (m); + return 0; +#ifdef INET + case AF_INET: +#if 0 /* Crashes on fragmented packets */ + /* + * Set the discard eligibility bit, if: + * 1) no fragmentation + * 2) length > 400 bytes + * 3a) the protocol is UDP or + * 3b) TCP data (no control bits) + */ + { + struct ip *ip = (struct ip*) (h + hlen); + struct tcphdr *tcp = (struct tcphdr*) ((long*)ip + ip->ip_hl); + + if (! (ip->ip_off & ~IP_DF) && ip->ip_len > 400 && + (ip->ip_p == IPPROTO_UDP || + ip->ip_p == IPPROTO_TCP && ! tcp->th_flags)) + h[1] |= FR_DE; + } +#endif + h[3] = FR_IP; + return m; +#endif +#ifdef IPX + case AF_IPX: + type = ETHERTYPE_IPX; + break; +#endif +#ifdef NS + case AF_NS: + type = 0x8137; + break; +#endif +#ifdef NETATALK + case AF_APPLETALK: + type = ETHERTYPE_AT; + break; +#endif + } + h[3] = FR_PADDING; + h[4] = FR_SNAP; + h[5] = 0; + h[6] = 0; + h[7] = 0; + *(short*) (h+8) = htons(type); + return m; +} + +/* + * Send periodical frame relay link verification messages via DLCI 0. + * Called every 10 seconds (default value of T391 timer is 10 sec). + * Every 6-th message is a full status request + * (default value of N391 counter is 6). 
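+ *
+ * [Editor's note -- the 14-byte (FR_ENQUIRY_SIZE) enquiry assembled
+ * below looks like this (derived from the code; illustrative only):
+ *
+ *	00 01 03 08 00 75 95 01 01 rt 03 02 seq rseq
+ *
+ * i.e. DLCI 0, UI, signaling NLPID, zero-length call reference, status
+ * enquiry, locking shift 5, a 1-byte report-type field rt (short
+ * verification, or full status for every 6th message) and a 2-byte
+ * link verification field carrying seq/rseq.]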
+ */ +void sppp_fr_keepalive (struct sppp *sp) +{ + STDDCL; + unsigned char *h, *p; + struct mbuf *m; + + MGETHDR (m, M_DONTWAIT, MT_DATA); + if (! m) + return; + m->m_pkthdr.rcvif = 0; + + h = mtod (m, u_char*); + p = h; + *p++ = 0; /* DLCI = 0 */ + *p++ = 1; + *p++ = FR_UI; + *p++ = FR_SIGNALING; /* NLPID = UNI call control */ + + *p++ = 0; /* call reference length = 0 */ + *p++ = FR_MSG_ENQUIRY; /* message type = status enquiry */ + + *p++ = FR_FLD_LSHIFT5; /* locking shift 5 */ + + *p++ = FR_FLD_RTYPE; /* report type field */ + *p++ = 1; /* report type length = 1 */ + if (sp->pp_seq[IDX_LCP] % 6) + *p++ = FR_RTYPE_SHORT; /* link verification only */ + else + *p++ = FR_RTYPE_FULL; /* full status needed */ + + if (sp->pp_seq[IDX_LCP] >= 255) + sp->pp_seq[IDX_LCP] = 0; + *p++ = FR_FLD_VERIFY; /* link verification type field */ + *p++ = 2; /* link verification field length = 2 */ + *p++ = ++sp->pp_seq[IDX_LCP]; /* our sequence number */ + *p++ = sp->pp_rseq[IDX_LCP]; /* last received sequence number */ + + m->m_pkthdr.len = m->m_len = p - h; + if (debug) + printf (SPP_FMT "send lmi packet, seq=%d, rseq=%d\n", + SPP_ARGS(ifp), (u_char) sp->pp_seq[IDX_LCP], + (u_char) sp->pp_rseq[IDX_LCP]); + + if (! IF_HANDOFF_ADJ(&sp->pp_cpq, m, ifp, 3)) + ++ifp->if_oerrors; +} + +/* + * Process the frame relay Inverse ARP request. + */ +static void sppp_fr_arp (struct sppp *sp, struct arp_req *req, + u_short his_hardware_address) +{ + STDDCL; + struct mbuf *m; + struct arp_req *reply; + u_char *h; + u_short my_hardware_address; + u_long his_ip_address, my_ip_address; + + if ((ntohs (req->htype) != ARPHRD_FRELAY || + ntohs (req->htype) != 16) || /* for BayNetworks routers */ + ntohs (req->ptype) != ETHERTYPE_IP) { + if (debug) + printf (SPP_FMT "Invalid ARP hardware/protocol type = 0x%x/0x%x\n", + SPP_ARGS(ifp), + ntohs (req->htype), ntohs (req->ptype)); + return; + } + if (req->halen != 2 || req->palen != 4) { + if (debug) + printf (SPP_FMT "Invalid ARP hardware/protocol address length = %d/%d\n", + SPP_ARGS(ifp), + req->halen, req->palen); + return; + } + switch (ntohs (req->op)) { + default: + if (debug) + printf (SPP_FMT "Invalid ARP op = 0x%x\n", + SPP_ARGS(ifp), ntohs (req->op)); + return; + + case ARPOP_INVREPLY: + /* Ignore. */ + return; + + case ARPOP_INVREQUEST: + my_hardware_address = ntohs (req->htarget); + his_ip_address = ntohs (req->psource1) << 16 | + ntohs (req->psource2); + my_ip_address = ntohs (req->ptarget1) << 16 | + ntohs (req->ptarget2); + break; + } + if (debug) + printf (SPP_FMT "got ARP request, source=0x%04x/%d.%d.%d.%d, target=0x%04x/%d.%d.%d.%d\n", + SPP_ARGS(ifp), ntohs (req->hsource), + (unsigned char) (his_ip_address >> 24), + (unsigned char) (his_ip_address >> 16), + (unsigned char) (his_ip_address >> 8), + (unsigned char) his_ip_address, + my_hardware_address, + (unsigned char) (my_ip_address >> 24), + (unsigned char) (my_ip_address >> 16), + (unsigned char) (my_ip_address >> 8), + (unsigned char) my_ip_address); + + sppp_get_ip_addrs (sp, &my_ip_address, 0, 0); + if (! 
my_ip_address) + return; /* nothing to reply */ + + if (debug) + printf (SPP_FMT "send ARP reply, source=0x%04x/%d.%d.%d.%d, target=0x%04x/%d.%d.%d.%d\n", + SPP_ARGS(ifp), my_hardware_address, + (unsigned char) (my_ip_address >> 24), + (unsigned char) (my_ip_address >> 16), + (unsigned char) (my_ip_address >> 8), + (unsigned char) my_ip_address, + his_hardware_address, + (unsigned char) (his_ip_address >> 24), + (unsigned char) (his_ip_address >> 16), + (unsigned char) (his_ip_address >> 8), + (unsigned char) his_ip_address); + + /* Send the Inverse ARP reply. */ + MGETHDR (m, M_DONTWAIT, MT_DATA); + if (! m) + return; + m->m_pkthdr.len = m->m_len = 10 + sizeof (*reply); + m->m_pkthdr.rcvif = 0; + + h = mtod (m, u_char*); + reply = (struct arp_req*) (h + 10); + + h[0] = his_hardware_address >> 8; + h[1] = his_hardware_address; + h[2] = FR_UI; + h[3] = FR_PADDING; + h[4] = FR_SNAP; + h[5] = 0; + h[6] = 0; + h[7] = 0; + *(short*) (h+8) = htons (ETHERTYPE_ARP); + + reply->htype = htons (ARPHRD_FRELAY); + reply->ptype = htons (ETHERTYPE_IP); + reply->halen = 2; + reply->palen = 4; + reply->op = htons (ARPOP_INVREPLY); + reply->hsource = htons (my_hardware_address); + reply->psource1 = htonl (my_ip_address); + reply->psource2 = htonl (my_ip_address) >> 16; + reply->htarget = htons (his_hardware_address); + reply->ptarget1 = htonl (his_ip_address); + reply->ptarget2 = htonl (his_ip_address) >> 16; + + if (! IF_HANDOFF_ADJ(&sp->pp_cpq, m, ifp, 3)) + ++ifp->if_oerrors; +} + +/* + * Process the input signaling packet (DLCI 0). + * The implemented protocol is ANSI T1.617 Annex D. + */ +static void sppp_fr_signal (struct sppp *sp, unsigned char *h, int len) +{ + STDDCL; + u_char *p; + int dlci; + + if (h[2] != FR_UI || h[3] != FR_SIGNALING || h[4] != 0) { + if (debug) + printf (SPP_FMT "Invalid signaling header\n", + SPP_ARGS(ifp)); +bad: if (debug) { + printf ("%02x", *h++); + while (--len > 0) + printf ("-%02x", *h++); + printf ("\n"); + } + return; + } + if (h[5] == FR_MSG_ENQUIRY) { + if (len == FR_ENQUIRY_SIZE && + h[12] == (u_char) sp->pp_seq[IDX_LCP]) { + sp->pp_seq[IDX_LCP] = random(); + printf (SPP_FMT "loopback detected\n", + SPP_ARGS(ifp)); + } + return; + } + if (h[5] != FR_MSG_STATUS) { + if (debug) + printf (SPP_FMT "Unknown signaling message: 0x%02x\n", + SPP_ARGS(ifp), h[5]); + goto bad; + } + + /* Parse message fields. */ + for (p=h+6; p<h+len; ) { + switch (*p) { + default: + if (debug) + printf (SPP_FMT "Unknown signaling field 0x%x\n", + SPP_ARGS(ifp), *p); + break; + case FR_FLD_LSHIFT5: + case FR_FLD_RTYPE: + /* Ignore. */ + break; + case FR_FLD_VERIFY: + if (p[1] != 2) { + if (debug) + printf (SPP_FMT "Invalid signaling verify field length %d\n", + SPP_ARGS(ifp), p[1]); + break; + } + sp->pp_rseq[IDX_LCP] = p[2]; + if (debug) { + printf (SPP_FMT "got lmi reply rseq=%d, seq=%d", + SPP_ARGS(ifp), p[2], p[3]); + if (p[3] != (u_char) sp->pp_seq[IDX_LCP]) + printf (" (really %d)", + (u_char) sp->pp_seq[IDX_LCP]); + printf ("\n"); + } + break; + case FR_FLD_PVC: + if (p[1] < 3) { + if (debug) + printf (SPP_FMT "Invalid PVC status length %d\n", + SPP_ARGS(ifp), p[1]); + break; + } + dlci = (p[2] << 4 & 0x3f0) | (p[3] >> 3 & 0x0f); + if (! sp->fr_dlci) + sp->fr_dlci = dlci; + if (sp->fr_status != p[4]) + printf (SPP_FMT "DLCI %d %s%s\n", + SPP_ARGS(ifp), dlci, + p[4] & FR_DLCI_DELETE ? "deleted" : + p[4] & FR_DLCI_ACTIVE ? "active" : "passive", + p[4] & FR_DLCI_NEW ?
", new" : ""); + sp->fr_status = p[4]; + break; + } + if (*p & 0x80) + ++p; + else if (p < h+len+1 && p[1]) + p += 2 + p[1]; + else { + if (debug) + printf (SPP_FMT "Invalid signaling field 0x%x\n", + SPP_ARGS(ifp), *p); + goto bad; + } + } +} diff --git a/freebsd/sys/net/if_spppsubr.c b/freebsd/sys/net/if_spppsubr.c new file mode 100644 index 00000000..235ef7c0 --- /dev/null +++ b/freebsd/sys/net/if_spppsubr.c @@ -0,0 +1,5492 @@ +#include + +/* + * Synchronous PPP/Cisco/Frame Relay link level subroutines. + * Keepalive protocol implemented in both Cisco and PPP modes. + */ +/*- + * Copyright (C) 1994-2000 Cronyx Engineering. + * Author: Serge Vakulenko, + * + * Heavily revamped to conform to RFC 1661. + * Copyright (C) 1997, 2001 Joerg Wunsch. + * + * This software is distributed with NO WARRANTIES, not even the implied + * warranties for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Authors grant any other persons or organisations permission to use + * or modify this software as long as this message is kept with the software, + * all derivative works or modified versions. + * + * From: Version 2.4, Thu Apr 30 17:17:21 MSD 1997 + * + * $FreeBSD$ + */ + +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#ifdef INET +#include +#include +#endif + +#ifdef INET6 +#include +#endif + +#include + +#ifdef IPX +#include +#include +#endif + +#include + +#define IOCTL_CMD_T u_long +#define MAXALIVECNT 3 /* max. alive packets */ + +/* + * Interface flags that can be set in an ifconfig command. + * + * Setting link0 will make the link passive, i.e. it will be marked + * as being administratively openable, but won't be opened to begin + * with. Incoming calls will be answered, or subsequent calls with + * -link1 will cause the administrative open of the LCP layer. + * + * Setting link1 will cause the link to auto-dial only as packets + * arrive to be sent. + * + * Setting IFF_DEBUG will syslog the option negotiation and state + * transitions at level kern.debug. Note: all logs consistently look + * like + * + * <if-name><unit>: <proto-name> <additional info...> + * + * with <if-name><unit> being something like "bppp0", and <proto-name> + * being one of "lcp", "ipcp", "cisco", "chap", "pap", etc.
+ */ + +#define IFF_PASSIVE IFF_LINK0 /* wait passively for connection */ +#define IFF_AUTO IFF_LINK1 /* auto-dial on output */ +#define IFF_CISCO IFF_LINK2 /* auto-dial on output */ + +#define PPP_ALLSTATIONS 0xff /* All-Stations broadcast address */ +#define PPP_UI 0x03 /* Unnumbered Information */ +#define PPP_IP 0x0021 /* Internet Protocol */ +#define PPP_ISO 0x0023 /* ISO OSI Protocol */ +#define PPP_XNS 0x0025 /* Xerox NS Protocol */ +#define PPP_IPX 0x002b /* Novell IPX Protocol */ +#define PPP_VJ_COMP 0x002d /* VJ compressed TCP/IP */ +#define PPP_VJ_UCOMP 0x002f /* VJ uncompressed TCP/IP */ +#define PPP_IPV6 0x0057 /* Internet Protocol Version 6 */ +#define PPP_LCP 0xc021 /* Link Control Protocol */ +#define PPP_PAP 0xc023 /* Password Authentication Protocol */ +#define PPP_CHAP 0xc223 /* Challenge-Handshake Auth Protocol */ +#define PPP_IPCP 0x8021 /* Internet Protocol Control Protocol */ +#define PPP_IPV6CP 0x8057 /* IPv6 Control Protocol */ + +#define CONF_REQ 1 /* PPP configure request */ +#define CONF_ACK 2 /* PPP configure acknowledge */ +#define CONF_NAK 3 /* PPP configure negative ack */ +#define CONF_REJ 4 /* PPP configure reject */ +#define TERM_REQ 5 /* PPP terminate request */ +#define TERM_ACK 6 /* PPP terminate acknowledge */ +#define CODE_REJ 7 /* PPP code reject */ +#define PROTO_REJ 8 /* PPP protocol reject */ +#define ECHO_REQ 9 /* PPP echo request */ +#define ECHO_REPLY 10 /* PPP echo reply */ +#define DISC_REQ 11 /* PPP discard request */ + +#define LCP_OPT_MRU 1 /* maximum receive unit */ +#define LCP_OPT_ASYNC_MAP 2 /* async control character map */ +#define LCP_OPT_AUTH_PROTO 3 /* authentication protocol */ +#define LCP_OPT_QUAL_PROTO 4 /* quality protocol */ +#define LCP_OPT_MAGIC 5 /* magic number */ +#define LCP_OPT_RESERVED 6 /* reserved */ +#define LCP_OPT_PROTO_COMP 7 /* protocol field compression */ +#define LCP_OPT_ADDR_COMP 8 /* address/control field compression */ + +#define IPCP_OPT_ADDRESSES 1 /* both IP addresses; deprecated */ +#define IPCP_OPT_COMPRESSION 2 /* IP compression protocol (VJ) */ +#define IPCP_OPT_ADDRESS 3 /* local IP address */ + +#define IPV6CP_OPT_IFID 1 /* interface identifier */ +#define IPV6CP_OPT_COMPRESSION 2 /* IPv6 compression protocol */ + +#define IPCP_COMP_VJ 0x2d /* Code for VJ compression */ + +#define PAP_REQ 1 /* PAP name/password request */ +#define PAP_ACK 2 /* PAP acknowledge */ +#define PAP_NAK 3 /* PAP fail */ + +#define CHAP_CHALLENGE 1 /* CHAP challenge request */ +#define CHAP_RESPONSE 2 /* CHAP challenge response */ +#define CHAP_SUCCESS 3 /* CHAP response ok */ +#define CHAP_FAILURE 4 /* CHAP response failed */ + +#define CHAP_MD5 5 /* hash algorithm - MD5 */ + +#define CISCO_MULTICAST 0x8f /* Cisco multicast address */ +#define CISCO_UNICAST 0x0f /* Cisco unicast address */ +#define CISCO_KEEPALIVE 0x8035 /* Cisco keepalive protocol */ +#define CISCO_ADDR_REQ 0 /* Cisco address request */ +#define CISCO_ADDR_REPLY 1 /* Cisco address reply */ +#define CISCO_KEEPALIVE_REQ 2 /* Cisco keepalive request */ + +/* states are named and numbered according to RFC 1661 */ +#define STATE_INITIAL 0 +#define STATE_STARTING 1 +#define STATE_CLOSED 2 +#define STATE_STOPPED 3 +#define STATE_CLOSING 4 +#define STATE_STOPPING 5 +#define STATE_REQ_SENT 6 +#define STATE_ACK_RCVD 7 +#define STATE_ACK_SENT 8 +#define STATE_OPENED 9 + +MALLOC_DEFINE(M_SPPP, "sppp", "synchronous PPP interface internals"); + +struct ppp_header { + u_char address; + u_char control; + u_short protocol; +} __packed; +#define PPP_HEADER_LEN sizeof 
(struct ppp_header) + +struct lcp_header { + u_char type; + u_char ident; + u_short len; +} __packed; +#define LCP_HEADER_LEN sizeof (struct lcp_header) + +struct cisco_packet { + u_long type; + u_long par1; + u_long par2; + u_short rel; + u_short time0; + u_short time1; +} __packed; +#define CISCO_PACKET_LEN sizeof (struct cisco_packet) + +/* + * We follow the spelling and capitalization of RFC 1661 here, to make + * it easier comparing with the standard. Please refer to this RFC in + * case you can't make sense out of these abbreviation; it will also + * explain the semantics related to the various events and actions. + */ +struct cp { + u_short proto; /* PPP control protocol number */ + u_char protoidx; /* index into state table in struct sppp */ + u_char flags; +#define CP_LCP 0x01 /* this is the LCP */ +#define CP_AUTH 0x02 /* this is an authentication protocol */ +#define CP_NCP 0x04 /* this is a NCP */ +#define CP_QUAL 0x08 /* this is a quality reporting protocol */ + const char *name; /* name of this control protocol */ + /* event handlers */ + void (*Up)(struct sppp *sp); + void (*Down)(struct sppp *sp); + void (*Open)(struct sppp *sp); + void (*Close)(struct sppp *sp); + void (*TO)(void *sp); + int (*RCR)(struct sppp *sp, struct lcp_header *h, int len); + void (*RCN_rej)(struct sppp *sp, struct lcp_header *h, int len); + void (*RCN_nak)(struct sppp *sp, struct lcp_header *h, int len); + /* actions */ + void (*tlu)(struct sppp *sp); + void (*tld)(struct sppp *sp); + void (*tls)(struct sppp *sp); + void (*tlf)(struct sppp *sp); + void (*scr)(struct sppp *sp); +}; + +#define SPP_FMT "%s: " +#define SPP_ARGS(ifp) (ifp)->if_xname + +#define SPPP_LOCK(sp) mtx_lock (&(sp)->mtx) +#define SPPP_UNLOCK(sp) mtx_unlock (&(sp)->mtx) +#define SPPP_LOCK_ASSERT(sp) mtx_assert (&(sp)->mtx, MA_OWNED) +#define SPPP_LOCK_OWNED(sp) mtx_owned (&(sp)->mtx) + +#ifdef INET +/* + * The following disgusting hack gets around the problem that IP TOS + * can't be set yet. We want to put "interactive" traffic on a high + * priority queue. To decide if traffic is interactive, we check that + * a) it is TCP and b) one of its ports is telnet, rlogin or ftp control. + * + * XXX is this really still necessary? 
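+ * [Editor's aside -- how the table below works, for illustration:
+ * INTERACTIVE(p) hashes the port into eight slots, and the three
+ * interesting ports land collision-free: 513 & 7 == 1, 21 & 7 == 5,
+ * 23 & 7 == 7, so interactive_ports[] holds 513 (rlogin), 21 (ftp
+ * control) and 23 (telnet) exactly at those indices, while any other
+ * port hits a zero slot and fails the comparison.]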
- joerg - + */ +static const u_short interactive_ports[8] = { + 0, 513, 0, 0, + 0, 21, 0, 23, +}; +#define INTERACTIVE(p) (interactive_ports[(p) & 7] == (p)) +#endif + +/* almost every function needs these */ +#define STDDCL \ + struct ifnet *ifp = SP2IFP(sp); \ + int debug = ifp->if_flags & IFF_DEBUG + +static int sppp_output(struct ifnet *ifp, struct mbuf *m, + struct sockaddr *dst, struct route *ro); + +static void sppp_cisco_send(struct sppp *sp, int type, long par1, long par2); +static void sppp_cisco_input(struct sppp *sp, struct mbuf *m); + +static void sppp_cp_input(const struct cp *cp, struct sppp *sp, + struct mbuf *m); +static void sppp_cp_send(struct sppp *sp, u_short proto, u_char type, + u_char ident, u_short len, void *data); +/* static void sppp_cp_timeout(void *arg); */ +static void sppp_cp_change_state(const struct cp *cp, struct sppp *sp, + int newstate); +static void sppp_auth_send(const struct cp *cp, + struct sppp *sp, unsigned int type, unsigned int id, + ...); + +static void sppp_up_event(const struct cp *cp, struct sppp *sp); +static void sppp_down_event(const struct cp *cp, struct sppp *sp); +static void sppp_open_event(const struct cp *cp, struct sppp *sp); +static void sppp_close_event(const struct cp *cp, struct sppp *sp); +static void sppp_to_event(const struct cp *cp, struct sppp *sp); + +static void sppp_null(struct sppp *sp); + +static void sppp_pp_up(struct sppp *sp); +static void sppp_pp_down(struct sppp *sp); + +static void sppp_lcp_init(struct sppp *sp); +static void sppp_lcp_up(struct sppp *sp); +static void sppp_lcp_down(struct sppp *sp); +static void sppp_lcp_open(struct sppp *sp); +static void sppp_lcp_close(struct sppp *sp); +static void sppp_lcp_TO(void *sp); +static int sppp_lcp_RCR(struct sppp *sp, struct lcp_header *h, int len); +static void sppp_lcp_RCN_rej(struct sppp *sp, struct lcp_header *h, int len); +static void sppp_lcp_RCN_nak(struct sppp *sp, struct lcp_header *h, int len); +static void sppp_lcp_tlu(struct sppp *sp); +static void sppp_lcp_tld(struct sppp *sp); +static void sppp_lcp_tls(struct sppp *sp); +static void sppp_lcp_tlf(struct sppp *sp); +static void sppp_lcp_scr(struct sppp *sp); +static void sppp_lcp_check_and_close(struct sppp *sp); +static int sppp_ncp_check(struct sppp *sp); + +static void sppp_ipcp_init(struct sppp *sp); +static void sppp_ipcp_up(struct sppp *sp); +static void sppp_ipcp_down(struct sppp *sp); +static void sppp_ipcp_open(struct sppp *sp); +static void sppp_ipcp_close(struct sppp *sp); +static void sppp_ipcp_TO(void *sp); +static int sppp_ipcp_RCR(struct sppp *sp, struct lcp_header *h, int len); +static void sppp_ipcp_RCN_rej(struct sppp *sp, struct lcp_header *h, int len); +static void sppp_ipcp_RCN_nak(struct sppp *sp, struct lcp_header *h, int len); +static void sppp_ipcp_tlu(struct sppp *sp); +static void sppp_ipcp_tld(struct sppp *sp); +static void sppp_ipcp_tls(struct sppp *sp); +static void sppp_ipcp_tlf(struct sppp *sp); +static void sppp_ipcp_scr(struct sppp *sp); + +static void sppp_ipv6cp_init(struct sppp *sp); +static void sppp_ipv6cp_up(struct sppp *sp); +static void sppp_ipv6cp_down(struct sppp *sp); +static void sppp_ipv6cp_open(struct sppp *sp); +static void sppp_ipv6cp_close(struct sppp *sp); +static void sppp_ipv6cp_TO(void *sp); +static int sppp_ipv6cp_RCR(struct sppp *sp, struct lcp_header *h, int len); +static void sppp_ipv6cp_RCN_rej(struct sppp *sp, struct lcp_header *h, int len); +static void sppp_ipv6cp_RCN_nak(struct sppp *sp, struct lcp_header *h, int len); +static void 
sppp_ipv6cp_tlu(struct sppp *sp); +static void sppp_ipv6cp_tld(struct sppp *sp); +static void sppp_ipv6cp_tls(struct sppp *sp); +static void sppp_ipv6cp_tlf(struct sppp *sp); +static void sppp_ipv6cp_scr(struct sppp *sp); + +static void sppp_pap_input(struct sppp *sp, struct mbuf *m); +static void sppp_pap_init(struct sppp *sp); +static void sppp_pap_open(struct sppp *sp); +static void sppp_pap_close(struct sppp *sp); +static void sppp_pap_TO(void *sp); +static void sppp_pap_my_TO(void *sp); +static void sppp_pap_tlu(struct sppp *sp); +static void sppp_pap_tld(struct sppp *sp); +static void sppp_pap_scr(struct sppp *sp); + +static void sppp_chap_input(struct sppp *sp, struct mbuf *m); +static void sppp_chap_init(struct sppp *sp); +static void sppp_chap_open(struct sppp *sp); +static void sppp_chap_close(struct sppp *sp); +static void sppp_chap_TO(void *sp); +static void sppp_chap_tlu(struct sppp *sp); +static void sppp_chap_tld(struct sppp *sp); +static void sppp_chap_scr(struct sppp *sp); + +static const char *sppp_auth_type_name(u_short proto, u_char type); +static const char *sppp_cp_type_name(u_char type); +#ifdef INET +static const char *sppp_dotted_quad(u_long addr); +static const char *sppp_ipcp_opt_name(u_char opt); +#endif +#ifdef INET6 +static const char *sppp_ipv6cp_opt_name(u_char opt); +#endif +static const char *sppp_lcp_opt_name(u_char opt); +static const char *sppp_phase_name(enum ppp_phase phase); +static const char *sppp_proto_name(u_short proto); +static const char *sppp_state_name(int state); +static int sppp_params(struct sppp *sp, u_long cmd, void *data); +static int sppp_strnlen(u_char *p, int max); +static void sppp_keepalive(void *dummy); +static void sppp_phase_network(struct sppp *sp); +static void sppp_print_bytes(const u_char *p, u_short len); +static void sppp_print_string(const char *p, u_short len); +static void sppp_qflush(struct ifqueue *ifq); +#ifdef INET +static void sppp_set_ip_addr(struct sppp *sp, u_long src); +#endif +#ifdef INET6 +static void sppp_get_ip6_addrs(struct sppp *sp, struct in6_addr *src, + struct in6_addr *dst, struct in6_addr *srcmask); +#ifdef IPV6CP_MYIFID_DYN +static void sppp_set_ip6_addr(struct sppp *sp, const struct in6_addr *src); +static void sppp_gen_ip6_addr(struct sppp *sp, const struct in6_addr *src); +#endif +static void sppp_suggest_ip6_addr(struct sppp *sp, struct in6_addr *src); +#endif + +/* if_start () wrapper */ +static void sppp_ifstart (struct ifnet *ifp); + +/* our control protocol descriptors */ +static const struct cp lcp = { + PPP_LCP, IDX_LCP, CP_LCP, "lcp", + sppp_lcp_up, sppp_lcp_down, sppp_lcp_open, sppp_lcp_close, + sppp_lcp_TO, sppp_lcp_RCR, sppp_lcp_RCN_rej, sppp_lcp_RCN_nak, + sppp_lcp_tlu, sppp_lcp_tld, sppp_lcp_tls, sppp_lcp_tlf, + sppp_lcp_scr +}; + +static const struct cp ipcp = { + PPP_IPCP, IDX_IPCP, +#ifdef INET /* don't run IPCP if there's no IPv4 support */ + CP_NCP, +#else + 0, +#endif + "ipcp", + sppp_ipcp_up, sppp_ipcp_down, sppp_ipcp_open, sppp_ipcp_close, + sppp_ipcp_TO, sppp_ipcp_RCR, sppp_ipcp_RCN_rej, sppp_ipcp_RCN_nak, + sppp_ipcp_tlu, sppp_ipcp_tld, sppp_ipcp_tls, sppp_ipcp_tlf, + sppp_ipcp_scr +}; + +static const struct cp ipv6cp = { + PPP_IPV6CP, IDX_IPV6CP, +#ifdef INET6 /*don't run IPv6CP if there's no IPv6 support*/ + CP_NCP, +#else + 0, +#endif + "ipv6cp", + sppp_ipv6cp_up, sppp_ipv6cp_down, sppp_ipv6cp_open, sppp_ipv6cp_close, + sppp_ipv6cp_TO, sppp_ipv6cp_RCR, sppp_ipv6cp_RCN_rej, sppp_ipv6cp_RCN_nak, + sppp_ipv6cp_tlu, sppp_ipv6cp_tld, sppp_ipv6cp_tls, sppp_ipv6cp_tlf, + 
sppp_ipv6cp_scr +}; + +static const struct cp pap = { + PPP_PAP, IDX_PAP, CP_AUTH, "pap", + sppp_null, sppp_null, sppp_pap_open, sppp_pap_close, + sppp_pap_TO, 0, 0, 0, + sppp_pap_tlu, sppp_pap_tld, sppp_null, sppp_null, + sppp_pap_scr +}; + +static const struct cp chap = { + PPP_CHAP, IDX_CHAP, CP_AUTH, "chap", + sppp_null, sppp_null, sppp_chap_open, sppp_chap_close, + sppp_chap_TO, 0, 0, 0, + sppp_chap_tlu, sppp_chap_tld, sppp_null, sppp_null, + sppp_chap_scr +}; + +static const struct cp *cps[IDX_COUNT] = { + &lcp, /* IDX_LCP */ + &ipcp, /* IDX_IPCP */ + &ipv6cp, /* IDX_IPV6CP */ + &pap, /* IDX_PAP */ + &chap, /* IDX_CHAP */ +}; + +static void* +sppp_alloc(u_char type, struct ifnet *ifp) +{ + struct sppp *sp; + + sp = malloc(sizeof(struct sppp), M_SPPP, M_WAITOK | M_ZERO); + sp->pp_ifp = ifp; + + return (sp); +} + +static void +sppp_free(void *com, u_char type) +{ + + free(com, M_SPPP); +} + +static int +sppp_modevent(module_t mod, int type, void *unused) +{ + switch (type) { + case MOD_LOAD: + /* + * XXX: should probably be IFT_SPPP, but it's fairly + * harmless to allocate struct sppp's for non-sppp + * interfaces. + */ + + if_register_com_alloc(IFT_PPP, sppp_alloc, sppp_free); + break; + case MOD_UNLOAD: + /* if_deregister_com_alloc(IFT_PPP); */ + return EACCES; + default: + return EOPNOTSUPP; + } + return 0; +} +static moduledata_t spppmod = { + "sppp", + sppp_modevent, + 0 +}; +MODULE_VERSION(sppp, 1); +DECLARE_MODULE(sppp, spppmod, SI_SUB_DRIVERS, SI_ORDER_ANY); + +/* + * Exported functions, comprising our interface to the lower layer. + */ + +/* + * Process the received packet. + */ +void +sppp_input(struct ifnet *ifp, struct mbuf *m) +{ + struct ppp_header *h; + int isr = -1; + struct sppp *sp = IFP2SP(ifp); + int debug, do_account = 0; +#ifdef INET + int hlen, vjlen; + u_char *iphdr; +#endif + + SPPP_LOCK(sp); + debug = ifp->if_flags & IFF_DEBUG; + + if (ifp->if_flags & IFF_UP) + /* Count received bytes, add FCS and one flag */ + ifp->if_ibytes += m->m_pkthdr.len + 3; + + if (m->m_pkthdr.len <= PPP_HEADER_LEN) { + /* Too small packet, drop it. */ + if (debug) + log(LOG_DEBUG, + SPP_FMT "input packet is too small, %d bytes\n", + SPP_ARGS(ifp), m->m_pkthdr.len); + drop: + m_freem (m); + SPPP_UNLOCK(sp); + drop2: + ++ifp->if_ierrors; + ++ifp->if_iqdrops; + return; + } + + if (sp->pp_mode == PP_FR) { + sppp_fr_input (sp, m); + SPPP_UNLOCK(sp); + return; + } + + /* Get PPP header. 
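+ * [Editor's note, illustrative: a PPP frame starts with the HDLC-style
+ * bytes ff 03 (PPP_ALLSTATIONS, PPP_UI) followed by the 16-bit
+ * protocol, e.g. ff 03 c0 21 for an LCP packet; these are the fields
+ * the dispatch below switches on.]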
*/ + h = mtod (m, struct ppp_header*); + m_adj (m, PPP_HEADER_LEN); + + switch (h->address) { + case PPP_ALLSTATIONS: + if (h->control != PPP_UI) + goto invalid; + if (sp->pp_mode == IFF_CISCO) { + if (debug) + log(LOG_DEBUG, + SPP_FMT "PPP packet in Cisco mode " + "<addr=0x%x ctrl=0x%x proto=0x%x>\n", + SPP_ARGS(ifp), + h->address, h->control, ntohs(h->protocol)); + goto drop; + } + switch (ntohs (h->protocol)) { + default: + if (debug) + log(LOG_DEBUG, + SPP_FMT "rejecting protocol " + "<addr=0x%x ctrl=0x%x proto=0x%x>\n", + SPP_ARGS(ifp), + h->address, h->control, ntohs(h->protocol)); + if (sp->state[IDX_LCP] == STATE_OPENED) + sppp_cp_send (sp, PPP_LCP, PROTO_REJ, + ++sp->pp_seq[IDX_LCP], m->m_pkthdr.len + 2, + &h->protocol); + ++ifp->if_noproto; + goto drop; + case PPP_LCP: + sppp_cp_input(&lcp, sp, m); + m_freem (m); + SPPP_UNLOCK(sp); + return; + case PPP_PAP: + if (sp->pp_phase >= PHASE_AUTHENTICATE) + sppp_pap_input(sp, m); + m_freem (m); + SPPP_UNLOCK(sp); + return; + case PPP_CHAP: + if (sp->pp_phase >= PHASE_AUTHENTICATE) + sppp_chap_input(sp, m); + m_freem (m); + SPPP_UNLOCK(sp); + return; +#ifdef INET + case PPP_IPCP: + if (sp->pp_phase == PHASE_NETWORK) + sppp_cp_input(&ipcp, sp, m); + m_freem (m); + SPPP_UNLOCK(sp); + return; + case PPP_IP: + if (sp->state[IDX_IPCP] == STATE_OPENED) { + isr = NETISR_IP; + } + do_account++; + break; + case PPP_VJ_COMP: + if (sp->state[IDX_IPCP] == STATE_OPENED) { + if ((vjlen = + sl_uncompress_tcp_core(mtod(m, u_char *), + m->m_len, m->m_len, + TYPE_COMPRESSED_TCP, + sp->pp_comp, + &iphdr, &hlen)) <= 0) { + if (debug) + log(LOG_INFO, + SPP_FMT "VJ uncompress failed on compressed packet\n", + SPP_ARGS(ifp)); + goto drop; + } + + /* + * Trim the VJ header off the packet, and prepend + * the uncompressed IP header (which will usually + * end up in two chained mbufs since there's not + * enough leading space in the existing mbuf). + */ + m_adj(m, vjlen); + M_PREPEND(m, hlen, M_DONTWAIT); + if (m == NULL) { + SPPP_UNLOCK(sp); + goto drop2; + } + bcopy(iphdr, mtod(m, u_char *), hlen); + isr = NETISR_IP; + } + do_account++; + break; + case PPP_VJ_UCOMP: + if (sp->state[IDX_IPCP] == STATE_OPENED) { + if (sl_uncompress_tcp_core(mtod(m, u_char *), + m->m_len, m->m_len, + TYPE_UNCOMPRESSED_TCP, + sp->pp_comp, + &iphdr, &hlen) != 0) { + if (debug) + log(LOG_INFO, + SPP_FMT "VJ uncompress failed on uncompressed packet\n", + SPP_ARGS(ifp)); + goto drop; + } + isr = NETISR_IP; + } + do_account++; + break; +#endif +#ifdef INET6 + case PPP_IPV6CP: + if (sp->pp_phase == PHASE_NETWORK) + sppp_cp_input(&ipv6cp, sp, m); + m_freem (m); + SPPP_UNLOCK(sp); + return; + + case PPP_IPV6: + if (sp->state[IDX_IPV6CP] == STATE_OPENED) + isr = NETISR_IPV6; + do_account++; + break; +#endif +#ifdef IPX + case PPP_IPX: + /* IPX IPXCP not implemented yet */ + if (sp->pp_phase == PHASE_NETWORK) + isr = NETISR_IPX; + do_account++; + break; +#endif + } + break; + case CISCO_MULTICAST: + case CISCO_UNICAST: + /* Don't check the control field here (RFC 1547).
*/ + if (sp->pp_mode != IFF_CISCO) { + if (debug) + log(LOG_DEBUG, + SPP_FMT "Cisco packet in PPP mode " + "<addr=0x%x ctrl=0x%x proto=0x%x>\n", + SPP_ARGS(ifp), + h->address, h->control, ntohs(h->protocol)); + goto drop; + } + switch (ntohs (h->protocol)) { + default: + ++ifp->if_noproto; + goto invalid; + case CISCO_KEEPALIVE: + sppp_cisco_input (sp, m); + m_freem (m); + SPPP_UNLOCK(sp); + return; +#ifdef INET + case ETHERTYPE_IP: + isr = NETISR_IP; + do_account++; + break; +#endif +#ifdef INET6 + case ETHERTYPE_IPV6: + isr = NETISR_IPV6; + do_account++; + break; +#endif +#ifdef IPX + case ETHERTYPE_IPX: + isr = NETISR_IPX; + do_account++; + break; +#endif + } + break; + default: /* Invalid PPP packet. */ + invalid: + if (debug) + log(LOG_DEBUG, + SPP_FMT "invalid input packet " + "<addr=0x%x ctrl=0x%x proto=0x%x>\n", + SPP_ARGS(ifp), + h->address, h->control, ntohs(h->protocol)); + goto drop; + } + + if (! (ifp->if_flags & IFF_UP) || isr == -1) + goto drop; + + SPPP_UNLOCK(sp); + /* Check queue. */ + if (netisr_queue(isr, m)) { /* (0) on success. */ + if (debug) + log(LOG_DEBUG, SPP_FMT "protocol queue overflow\n", + SPP_ARGS(ifp)); + goto drop2; + } + + if (do_account) + /* + * Do only account for network packets, not for control + * packets. This is used by some subsystems to detect + * idle lines. + */ + sp->pp_last_recv = time_uptime; +} + +static void +sppp_ifstart_sched(void *dummy) +{ + struct sppp *sp = dummy; + + sp->if_start(SP2IFP(sp)); +} + +/* if_start () wrapper function. We use it to schedule real if_start () for + * execution. We can't call it directly + */ +static void +sppp_ifstart(struct ifnet *ifp) +{ + struct sppp *sp = IFP2SP(ifp); + + if (SPPP_LOCK_OWNED(sp)) { + if (callout_pending(&sp->ifstart_callout)) + return; + callout_reset(&sp->ifstart_callout, 1, sppp_ifstart_sched, + (void *)sp); + } else { + sp->if_start(ifp); + } +} + +/* + * Enqueue transmit packet. + */ +static int +sppp_output(struct ifnet *ifp, struct mbuf *m, + struct sockaddr *dst, struct route *ro) +{ + struct sppp *sp = IFP2SP(ifp); + struct ppp_header *h; + struct ifqueue *ifq = NULL; + int s, error, rv = 0; +#ifdef INET + int ipproto = PPP_IP; +#endif + int debug = ifp->if_flags & IFF_DEBUG; + + s = splimp(); + SPPP_LOCK(sp); + + if (!(ifp->if_flags & IFF_UP) || + (!(ifp->if_flags & IFF_AUTO) && + !(ifp->if_drv_flags & IFF_DRV_RUNNING))) { +#ifdef INET6 + drop: +#endif + m_freem (m); + SPPP_UNLOCK(sp); + splx (s); + return (ENETDOWN); + } + + if ((ifp->if_flags & IFF_AUTO) && + !(ifp->if_drv_flags & IFF_DRV_RUNNING)) { +#ifdef INET6 + /* + * XXX + * + * Hack to prevent the initialization-time generated + * IPv6 multicast packet to erroneously cause a + * dialout event in case IPv6 has been + * administratively disabled on that interface. + */ + if (dst->sa_family == AF_INET6 && + !(sp->confflags & CONF_ENABLE_IPV6)) + goto drop; +#endif + /* + * Interface is not yet running, but auto-dial. Need + * to start LCP for it. + */ + ifp->if_drv_flags |= IFF_DRV_RUNNING; + splx(s); + lcp.Open(sp); + s = splimp(); + } + +#ifdef INET + if (dst->sa_family == AF_INET) { + /* XXX Check mbuf length here? */ + struct ip *ip = mtod (m, struct ip*); + struct tcphdr *tcp = (struct tcphdr*) ((long*)ip + ip->ip_hl); + + /* + * When using dynamic local IP address assignment by using + * 0.0.0.0 as a local address, the first TCP session will + * not connect because the local TCP checksum is computed + * using 0.0.0.0 which will later become our real IP address + * so the TCP checksum computed at the remote end will + * become invalid.
So we + * - don't let packets with src ip addr 0 thru + * - we flag TCP packets with src ip 0 as an error + */ + + if(ip->ip_src.s_addr == INADDR_ANY) /* -hm */ + { + m_freem(m); + SPPP_UNLOCK(sp); + splx(s); + if(ip->ip_p == IPPROTO_TCP) + return(EADDRNOTAVAIL); + else + return(0); + } + + /* + * Put low delay, telnet, rlogin and ftp control packets + * in front of the queue or let ALTQ take care. + */ + if (ALTQ_IS_ENABLED(&ifp->if_snd)) + ; + else if (_IF_QFULL(&sp->pp_fastq)) + ; + else if (ip->ip_tos & IPTOS_LOWDELAY) + ifq = &sp->pp_fastq; + else if (m->m_len < sizeof *ip + sizeof *tcp) + ; + else if (ip->ip_p != IPPROTO_TCP) + ; + else if (INTERACTIVE (ntohs (tcp->th_sport))) + ifq = &sp->pp_fastq; + else if (INTERACTIVE (ntohs (tcp->th_dport))) + ifq = &sp->pp_fastq; + + /* + * Do IP Header compression + */ + if (sp->pp_mode != IFF_CISCO && sp->pp_mode != PP_FR && + (sp->ipcp.flags & IPCP_VJ) && ip->ip_p == IPPROTO_TCP) + switch (sl_compress_tcp(m, ip, sp->pp_comp, + sp->ipcp.compress_cid)) { + case TYPE_COMPRESSED_TCP: + ipproto = PPP_VJ_COMP; + break; + case TYPE_UNCOMPRESSED_TCP: + ipproto = PPP_VJ_UCOMP; + break; + case TYPE_IP: + ipproto = PPP_IP; + break; + default: + m_freem(m); + SPPP_UNLOCK(sp); + splx(s); + return (EINVAL); + } + } +#endif + +#ifdef INET6 + if (dst->sa_family == AF_INET6) { + /* XXX do something tricky here? */ + } +#endif + + if (sp->pp_mode == PP_FR) { + /* Add frame relay header. */ + m = sppp_fr_header (sp, m, dst->sa_family); + if (! m) + goto nobufs; + goto out; + } + + /* + * Prepend general data packet PPP header. For now, IP only. + */ + M_PREPEND (m, PPP_HEADER_LEN, M_DONTWAIT); + if (! m) { +nobufs: if (debug) + log(LOG_DEBUG, SPP_FMT "no memory for transmit header\n", + SPP_ARGS(ifp)); + ++ifp->if_oerrors; + SPPP_UNLOCK(sp); + splx (s); + return (ENOBUFS); + } + /* + * May want to check size of packet + * (albeit due to the implementation it's always enough) + */ + h = mtod (m, struct ppp_header*); + if (sp->pp_mode == IFF_CISCO) { + h->address = CISCO_UNICAST; /* unicast address */ + h->control = 0; + } else { + h->address = PPP_ALLSTATIONS; /* broadcast address */ + h->control = PPP_UI; /* Unnumbered Info */ + } + + switch (dst->sa_family) { +#ifdef INET + case AF_INET: /* Internet Protocol */ + if (sp->pp_mode == IFF_CISCO) + h->protocol = htons (ETHERTYPE_IP); + else { + /* + * Don't choke with an ENETDOWN early. It's + * possible that we just started dialing out, + * so don't drop the packet immediately. If + * we notice that we run out of buffer space + * below, we will however remember that we are + * not ready to carry IP packets, and return + * ENETDOWN, as opposed to ENOBUFS. + */ + h->protocol = htons(ipproto); + if (sp->state[IDX_IPCP] != STATE_OPENED) + rv = ENETDOWN; + } + break; +#endif +#ifdef INET6 + case AF_INET6: /* Internet Protocol */ + if (sp->pp_mode == IFF_CISCO) + h->protocol = htons (ETHERTYPE_IPV6); + else { + /* + * Don't choke with an ENETDOWN early. It's + * possible that we just started dialing out, + * so don't drop the packet immediately. If + * we notice that we run out of buffer space + * below, we will however remember that we are + * not ready to carry IP packets, and return + * ENETDOWN, as opposed to ENOBUFS. + */ + h->protocol = htons(PPP_IPV6); + if (sp->state[IDX_IPV6CP] != STATE_OPENED) + rv = ENETDOWN; + } + break; +#endif +#ifdef IPX + case AF_IPX: /* Novell IPX Protocol */ + h->protocol = htons (sp->pp_mode == IFF_CISCO ? 
+ ETHERTYPE_IPX : PPP_IPX); + break; +#endif + default: + m_freem (m); + ++ifp->if_oerrors; + SPPP_UNLOCK(sp); + splx (s); + return (EAFNOSUPPORT); + } + + /* + * Queue message on interface, and start output if interface + * not yet active. + */ +out: + if (ifq != NULL) + error = !(IF_HANDOFF_ADJ(ifq, m, ifp, 3)); + else + IFQ_HANDOFF_ADJ(ifp, m, 3, error); + if (error) { + ++ifp->if_oerrors; + SPPP_UNLOCK(sp); + splx (s); + return (rv? rv: ENOBUFS); + } + SPPP_UNLOCK(sp); + splx (s); + /* + * Unlike in sppp_input(), we can always bump the timestamp + * here since sppp_output() is only called on behalf of + * network-layer traffic; control-layer traffic is handled + * by sppp_cp_send(). + */ + sp->pp_last_sent = time_uptime; + return (0); +} + +void +sppp_attach(struct ifnet *ifp) +{ + struct sppp *sp = IFP2SP(ifp); + + /* Initialize mtx lock */ + mtx_init(&sp->mtx, "sppp", MTX_NETWORK_LOCK, MTX_DEF | MTX_RECURSE); + + /* Initialize keepalive handler. */ + callout_init(&sp->keepalive_callout, CALLOUT_MPSAFE); + callout_reset(&sp->keepalive_callout, hz * 10, sppp_keepalive, + (void *)sp); + + ifp->if_mtu = PP_MTU; + ifp->if_flags = IFF_POINTOPOINT | IFF_MULTICAST; + ifp->if_output = sppp_output; +#if 0 + sp->pp_flags = PP_KEEPALIVE; +#endif + ifp->if_snd.ifq_maxlen = 32; + sp->pp_fastq.ifq_maxlen = 32; + sp->pp_cpq.ifq_maxlen = 20; + sp->pp_loopcnt = 0; + sp->pp_alivecnt = 0; + bzero(&sp->pp_seq[0], sizeof(sp->pp_seq)); + bzero(&sp->pp_rseq[0], sizeof(sp->pp_rseq)); + sp->pp_phase = PHASE_DEAD; + sp->pp_up = sppp_pp_up; + sp->pp_down = sppp_pp_down; + if(!mtx_initialized(&sp->pp_cpq.ifq_mtx)) + mtx_init(&sp->pp_cpq.ifq_mtx, "sppp_cpq", NULL, MTX_DEF); + if(!mtx_initialized(&sp->pp_fastq.ifq_mtx)) + mtx_init(&sp->pp_fastq.ifq_mtx, "sppp_fastq", NULL, MTX_DEF); + sp->pp_last_recv = sp->pp_last_sent = time_uptime; + sp->confflags = 0; +#ifdef INET + sp->confflags |= CONF_ENABLE_VJ; +#endif +#ifdef INET6 + sp->confflags |= CONF_ENABLE_IPV6; +#endif + callout_init(&sp->ifstart_callout, CALLOUT_MPSAFE); + sp->if_start = ifp->if_start; + ifp->if_start = sppp_ifstart; + sp->pp_comp = malloc(sizeof(struct slcompress), M_TEMP, M_WAITOK); + sl_compress_init(sp->pp_comp, -1); + sppp_lcp_init(sp); + sppp_ipcp_init(sp); + sppp_ipv6cp_init(sp); + sppp_pap_init(sp); + sppp_chap_init(sp); +} + +void +sppp_detach(struct ifnet *ifp) +{ + struct sppp *sp = IFP2SP(ifp); + int i; + + KASSERT(mtx_initialized(&sp->mtx), ("sppp mutex is not initialized")); + + /* Stop keepalive handler. */ + if (!callout_drain(&sp->keepalive_callout)) + callout_stop(&sp->keepalive_callout); + + for (i = 0; i < IDX_COUNT; i++) { + if (!callout_drain(&sp->ch[i])) + callout_stop(&sp->ch[i]); + } + if (!callout_drain(&sp->pap_my_to_ch)) + callout_stop(&sp->pap_my_to_ch); + mtx_destroy(&sp->pp_cpq.ifq_mtx); + mtx_destroy(&sp->pp_fastq.ifq_mtx); + mtx_destroy(&sp->mtx); +} + +/* + * Flush the interface output queue. + */ +static void +sppp_flush_unlocked(struct ifnet *ifp) +{ + struct sppp *sp = IFP2SP(ifp); + + sppp_qflush ((struct ifqueue *)&SP2IFP(sp)->if_snd); + sppp_qflush (&sp->pp_fastq); + sppp_qflush (&sp->pp_cpq); +} + +void +sppp_flush(struct ifnet *ifp) +{ + struct sppp *sp = IFP2SP(ifp); + + SPPP_LOCK(sp); + sppp_flush_unlocked (ifp); + SPPP_UNLOCK(sp); +} + +/* + * Check if the output queue is empty. 
+ */ +int +sppp_isempty(struct ifnet *ifp) +{ + struct sppp *sp = IFP2SP(ifp); + int empty, s; + + s = splimp(); + SPPP_LOCK(sp); + empty = !sp->pp_fastq.ifq_head && !sp->pp_cpq.ifq_head && + !SP2IFP(sp)->if_snd.ifq_head; + SPPP_UNLOCK(sp); + splx(s); + return (empty); +} + +/* + * Get next packet to send. + */ +struct mbuf * +sppp_dequeue(struct ifnet *ifp) +{ + struct sppp *sp = IFP2SP(ifp); + struct mbuf *m; + int s; + + s = splimp(); + SPPP_LOCK(sp); + /* + * Process only the control protocol queue until we have at + * least one NCP open. + * + * Do always serve all three queues in Cisco mode. + */ + IF_DEQUEUE(&sp->pp_cpq, m); + if (m == NULL && + (sppp_ncp_check(sp) || sp->pp_mode == IFF_CISCO || + sp->pp_mode == PP_FR)) { + IF_DEQUEUE(&sp->pp_fastq, m); + if (m == NULL) + IF_DEQUEUE (&SP2IFP(sp)->if_snd, m); + } + SPPP_UNLOCK(sp); + splx(s); + return m; +} + +/* + * Pick the next packet, do not remove it from the queue. + */ +struct mbuf * +sppp_pick(struct ifnet *ifp) +{ + struct sppp *sp = IFP2SP(ifp); + struct mbuf *m; + int s; + + s = splimp (); + SPPP_LOCK(sp); + + m = sp->pp_cpq.ifq_head; + if (m == NULL && + (sp->pp_phase == PHASE_NETWORK || + sp->pp_mode == IFF_CISCO || + sp->pp_mode == PP_FR)) + if ((m = sp->pp_fastq.ifq_head) == NULL) + m = SP2IFP(sp)->if_snd.ifq_head; + SPPP_UNLOCK(sp); + splx (s); + return (m); +} + +/* + * Process an ioctl request. Called on low priority level. + */ +int +sppp_ioctl(struct ifnet *ifp, IOCTL_CMD_T cmd, void *data) +{ + struct ifreq *ifr = (struct ifreq*) data; + struct sppp *sp = IFP2SP(ifp); + int s, rv, going_up, going_down, newmode; + + s = splimp(); + SPPP_LOCK(sp); + rv = 0; + switch (cmd) { + case SIOCAIFADDR: + case SIOCSIFDSTADDR: + break; + + case SIOCSIFADDR: + /* set the interface "up" when assigning an IP address */ + ifp->if_flags |= IFF_UP; + /* FALLTHROUGH */ + + case SIOCSIFFLAGS: + going_up = ifp->if_flags & IFF_UP && + (ifp->if_drv_flags & IFF_DRV_RUNNING) == 0; + going_down = (ifp->if_flags & IFF_UP) == 0 && + ifp->if_drv_flags & IFF_DRV_RUNNING; + + newmode = ifp->if_flags & IFF_PASSIVE; + if (!newmode) + newmode = ifp->if_flags & IFF_AUTO; + if (!newmode) + newmode = ifp->if_flags & IFF_CISCO; + ifp->if_flags &= ~(IFF_PASSIVE | IFF_AUTO | IFF_CISCO); + ifp->if_flags |= newmode; + + if (!newmode) + newmode = sp->pp_flags & PP_FR; + + if (newmode != sp->pp_mode) { + going_down = 1; + if (!going_up) + going_up = ifp->if_drv_flags & IFF_DRV_RUNNING; + } + + if (going_down) { + if (sp->pp_mode != IFF_CISCO && + sp->pp_mode != PP_FR) + lcp.Close(sp); + else if (sp->pp_tlf) + (sp->pp_tlf)(sp); + sppp_flush_unlocked(ifp); + ifp->if_drv_flags &= ~IFF_DRV_RUNNING; + sp->pp_mode = newmode; + } + + if (going_up) { + if (sp->pp_mode != IFF_CISCO && + sp->pp_mode != PP_FR) + lcp.Close(sp); + sp->pp_mode = newmode; + if (sp->pp_mode == 0) { + ifp->if_drv_flags |= IFF_DRV_RUNNING; + lcp.Open(sp); + } + if ((sp->pp_mode == IFF_CISCO) || + (sp->pp_mode == PP_FR)) { + if (sp->pp_tls) + (sp->pp_tls)(sp); + ifp->if_drv_flags |= IFF_DRV_RUNNING; + } + } + + break; + +#ifdef SIOCSIFMTU +#ifndef ifr_mtu +#define ifr_mtu ifr_metric +#endif + case SIOCSIFMTU: + if (ifr->ifr_mtu < 128 || ifr->ifr_mtu > sp->lcp.their_mru) + return (EINVAL); + ifp->if_mtu = ifr->ifr_mtu; + break; +#endif +#ifdef SLIOCSETMTU + case SLIOCSETMTU: + if (*(short*)data < 128 || *(short*)data > sp->lcp.their_mru) + return (EINVAL); + ifp->if_mtu = *(short*)data; + break; +#endif +#ifdef SIOCGIFMTU + case SIOCGIFMTU: + ifr->ifr_mtu = ifp->if_mtu; + break; +#endif 
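+	/*
+	 * [Editor's note -- usage illustration, not part of the original
+	 * code: an administrator's "ifconfig bppp0 mtu 296" arrives via
+	 * the SIOCSIFMTU case above and succeeds only while
+	 * 128 <= mtu <= sp->lcp.their_mru, i.e. the peer's negotiated
+	 * MRU caps the administrative MTU.]
+	 */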
+#ifdef SLIOCGETMTU + case SLIOCGETMTU: + *(short*)data = ifp->if_mtu; + break; +#endif + case SIOCADDMULTI: + case SIOCDELMULTI: + break; + + case SIOCGIFGENERIC: + case SIOCSIFGENERIC: + rv = sppp_params(sp, cmd, data); + break; + + default: + rv = ENOTTY; + } + SPPP_UNLOCK(sp); + splx(s); + return rv; +} + +/* + * Cisco framing implementation. + */ + +/* + * Handle incoming Cisco keepalive protocol packets. + */ +static void +sppp_cisco_input(struct sppp *sp, struct mbuf *m) +{ + STDDCL; + struct cisco_packet *h; + u_long me, mymask; + + if (m->m_pkthdr.len < CISCO_PACKET_LEN) { + if (debug) + log(LOG_DEBUG, + SPP_FMT "cisco invalid packet length: %d bytes\n", + SPP_ARGS(ifp), m->m_pkthdr.len); + return; + } + h = mtod (m, struct cisco_packet*); + if (debug) + log(LOG_DEBUG, + SPP_FMT "cisco input: %d bytes " + "<0x%lx 0x%lx 0x%lx 0x%x 0x%x-0x%x>\n", + SPP_ARGS(ifp), m->m_pkthdr.len, + (u_long)ntohl (h->type), (u_long)h->par1, (u_long)h->par2, (u_int)h->rel, + (u_int)h->time0, (u_int)h->time1); + switch (ntohl (h->type)) { + default: + if (debug) + log(-1, SPP_FMT "cisco unknown packet type: 0x%lx\n", + SPP_ARGS(ifp), (u_long)ntohl (h->type)); + break; + case CISCO_ADDR_REPLY: + /* Reply on address request, ignore */ + break; + case CISCO_KEEPALIVE_REQ: + sp->pp_alivecnt = 0; + sp->pp_rseq[IDX_LCP] = ntohl (h->par1); + if (sp->pp_seq[IDX_LCP] == sp->pp_rseq[IDX_LCP]) { + /* Local and remote sequence numbers are equal. + * Probably, the line is in loopback mode. */ + if (sp->pp_loopcnt >= MAXALIVECNT) { + printf (SPP_FMT "loopback\n", + SPP_ARGS(ifp)); + sp->pp_loopcnt = 0; + if (ifp->if_flags & IFF_UP) { + if_down (ifp); + sppp_qflush (&sp->pp_cpq); + } + } + ++sp->pp_loopcnt; + + /* Generate new local sequence number */ + sp->pp_seq[IDX_LCP] = random(); + break; + } + sp->pp_loopcnt = 0; + if (! (ifp->if_flags & IFF_UP) && + (ifp->if_drv_flags & IFF_DRV_RUNNING)) { + if_up(ifp); + printf (SPP_FMT "up\n", SPP_ARGS(ifp)); + } + break; + case CISCO_ADDR_REQ: + sppp_get_ip_addrs(sp, &me, 0, &mymask); + if (me != 0L) + sppp_cisco_send(sp, CISCO_ADDR_REPLY, me, mymask); + break; + } +} + +/* + * Send Cisco keepalive packet. + */ +static void +sppp_cisco_send(struct sppp *sp, int type, long par1, long par2) +{ + STDDCL; + struct ppp_header *h; + struct cisco_packet *ch; + struct mbuf *m; + struct timeval tv; + + getmicrouptime(&tv); + + MGETHDR (m, M_DONTWAIT, MT_DATA); + if (! m) + return; + m->m_pkthdr.len = m->m_len = PPP_HEADER_LEN + CISCO_PACKET_LEN; + m->m_pkthdr.rcvif = 0; + + h = mtod (m, struct ppp_header*); + h->address = CISCO_MULTICAST; + h->control = 0; + h->protocol = htons (CISCO_KEEPALIVE); + + ch = (struct cisco_packet*) (h + 1); + ch->type = htonl (type); + ch->par1 = htonl (par1); + ch->par2 = htonl (par2); + ch->rel = -1; + + ch->time0 = htons ((u_short) (tv.tv_sec >> 16)); + ch->time1 = htons ((u_short) tv.tv_sec); + + if (debug) + log(LOG_DEBUG, + SPP_FMT "cisco output: <0x%lx 0x%lx 0x%lx 0x%x 0x%x-0x%x>\n", + SPP_ARGS(ifp), (u_long)ntohl (ch->type), (u_long)ch->par1, + (u_long)ch->par2, (u_int)ch->rel, (u_int)ch->time0, (u_int)ch->time1); + + if (! IF_HANDOFF_ADJ(&sp->pp_cpq, m, ifp, 3)) + ifp->if_oerrors++; +} + +/* + * PPP protocol implementation. + */ + +/* + * Send PPP control protocol packet. 
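+ *
+ * [Editor's illustration, modelled on the keepalive handler in this
+ * file; not part of the original comment.  An LCP echo request
+ * carrying the local magic number is sent as:
+ *
+ *	long nmagic = htonl (sp->lcp.magic);
+ *	sp->lcp.echoid = ++sp->pp_seq[IDX_LCP];
+ *	sppp_cp_send (sp, PPP_LCP, ECHO_REQ, sp->lcp.echoid, 4, &nmagic);
+ * ]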
+ */ +static void +sppp_cp_send(struct sppp *sp, u_short proto, u_char type, + u_char ident, u_short len, void *data) +{ + STDDCL; + struct ppp_header *h; + struct lcp_header *lh; + struct mbuf *m; + + if (len > MHLEN - PPP_HEADER_LEN - LCP_HEADER_LEN) + len = MHLEN - PPP_HEADER_LEN - LCP_HEADER_LEN; + MGETHDR (m, M_DONTWAIT, MT_DATA); + if (! m) + return; + m->m_pkthdr.len = m->m_len = PPP_HEADER_LEN + LCP_HEADER_LEN + len; + m->m_pkthdr.rcvif = 0; + + h = mtod (m, struct ppp_header*); + h->address = PPP_ALLSTATIONS; /* broadcast address */ + h->control = PPP_UI; /* Unnumbered Info */ + h->protocol = htons (proto); /* Link Control Protocol */ + + lh = (struct lcp_header*) (h + 1); + lh->type = type; + lh->ident = ident; + lh->len = htons (LCP_HEADER_LEN + len); + if (len) + bcopy (data, lh+1, len); + + if (debug) { + log(LOG_DEBUG, SPP_FMT "%s output <%s id=0x%x len=%d", + SPP_ARGS(ifp), + sppp_proto_name(proto), + sppp_cp_type_name (lh->type), lh->ident, + ntohs (lh->len)); + sppp_print_bytes ((u_char*) (lh+1), len); + log(-1, ">\n"); + } + if (! IF_HANDOFF_ADJ(&sp->pp_cpq, m, ifp, 3)) + ifp->if_oerrors++; +} + +/* + * Handle incoming PPP control protocol packets. + */ +static void +sppp_cp_input(const struct cp *cp, struct sppp *sp, struct mbuf *m) +{ + STDDCL; + struct lcp_header *h; + int len = m->m_pkthdr.len; + int rv; + u_char *p; + + if (len < 4) { + if (debug) + log(LOG_DEBUG, + SPP_FMT "%s invalid packet length: %d bytes\n", + SPP_ARGS(ifp), cp->name, len); + return; + } + h = mtod (m, struct lcp_header*); + if (debug) { + log(LOG_DEBUG, + SPP_FMT "%s input(%s): <%s id=0x%x len=%d", + SPP_ARGS(ifp), cp->name, + sppp_state_name(sp->state[cp->protoidx]), + sppp_cp_type_name (h->type), h->ident, ntohs (h->len)); + sppp_print_bytes ((u_char*) (h+1), len-4); + log(-1, ">\n"); + } + if (len > ntohs (h->len)) + len = ntohs (h->len); + p = (u_char *)(h + 1); + switch (h->type) { + case CONF_REQ: + if (len < 4) { + if (debug) + log(-1, SPP_FMT "%s invalid conf-req length %d\n", + SPP_ARGS(ifp), cp->name, + len); + ++ifp->if_ierrors; + break; + } + /* handle states where RCR doesn't get a SCA/SCN */ + switch (sp->state[cp->protoidx]) { + case STATE_CLOSING: + case STATE_STOPPING: + return; + case STATE_CLOSED: + sppp_cp_send(sp, cp->proto, TERM_ACK, h->ident, + 0, 0); + return; + } + rv = (cp->RCR)(sp, h, len); + switch (sp->state[cp->protoidx]) { + case STATE_OPENED: + (cp->tld)(sp); + (cp->scr)(sp); + /* FALLTHROUGH */ + case STATE_ACK_SENT: + case STATE_REQ_SENT: + /* + * sppp_cp_change_state() have the side effect of + * restarting the timeouts. We want to avoid that + * if the state don't change, otherwise we won't + * ever timeout and resend a configuration request + * that got lost. + */ + if (sp->state[cp->protoidx] == (rv ? STATE_ACK_SENT: + STATE_REQ_SENT)) + break; + sppp_cp_change_state(cp, sp, rv? + STATE_ACK_SENT: STATE_REQ_SENT); + break; + case STATE_STOPPED: + sp->rst_counter[cp->protoidx] = sp->lcp.max_configure; + (cp->scr)(sp); + sppp_cp_change_state(cp, sp, rv? 
+ STATE_ACK_SENT: STATE_REQ_SENT); + break; + case STATE_ACK_RCVD: + if (rv) { + sppp_cp_change_state(cp, sp, STATE_OPENED); + if (debug) + log(LOG_DEBUG, SPP_FMT "%s tlu\n", + SPP_ARGS(ifp), + cp->name); + (cp->tlu)(sp); + } else + sppp_cp_change_state(cp, sp, STATE_ACK_RCVD); + break; + default: + printf(SPP_FMT "%s illegal %s in state %s\n", + SPP_ARGS(ifp), cp->name, + sppp_cp_type_name(h->type), + sppp_state_name(sp->state[cp->protoidx])); + ++ifp->if_ierrors; + } + break; + case CONF_ACK: + if (h->ident != sp->confid[cp->protoidx]) { + if (debug) + log(-1, SPP_FMT "%s id mismatch 0x%x != 0x%x\n", + SPP_ARGS(ifp), cp->name, + h->ident, sp->confid[cp->protoidx]); + ++ifp->if_ierrors; + break; + } + switch (sp->state[cp->protoidx]) { + case STATE_CLOSED: + case STATE_STOPPED: + sppp_cp_send(sp, cp->proto, TERM_ACK, h->ident, 0, 0); + break; + case STATE_CLOSING: + case STATE_STOPPING: + break; + case STATE_REQ_SENT: + sp->rst_counter[cp->protoidx] = sp->lcp.max_configure; + sppp_cp_change_state(cp, sp, STATE_ACK_RCVD); + break; + case STATE_OPENED: + (cp->tld)(sp); + /* FALLTHROUGH */ + case STATE_ACK_RCVD: + (cp->scr)(sp); + sppp_cp_change_state(cp, sp, STATE_REQ_SENT); + break; + case STATE_ACK_SENT: + sp->rst_counter[cp->protoidx] = sp->lcp.max_configure; + sppp_cp_change_state(cp, sp, STATE_OPENED); + if (debug) + log(LOG_DEBUG, SPP_FMT "%s tlu\n", + SPP_ARGS(ifp), cp->name); + (cp->tlu)(sp); + break; + default: + printf(SPP_FMT "%s illegal %s in state %s\n", + SPP_ARGS(ifp), cp->name, + sppp_cp_type_name(h->type), + sppp_state_name(sp->state[cp->protoidx])); + ++ifp->if_ierrors; + } + break; + case CONF_NAK: + case CONF_REJ: + if (h->ident != sp->confid[cp->protoidx]) { + if (debug) + log(-1, SPP_FMT "%s id mismatch 0x%x != 0x%x\n", + SPP_ARGS(ifp), cp->name, + h->ident, sp->confid[cp->protoidx]); + ++ifp->if_ierrors; + break; + } + if (h->type == CONF_NAK) + (cp->RCN_nak)(sp, h, len); + else /* CONF_REJ */ + (cp->RCN_rej)(sp, h, len); + + switch (sp->state[cp->protoidx]) { + case STATE_CLOSED: + case STATE_STOPPED: + sppp_cp_send(sp, cp->proto, TERM_ACK, h->ident, 0, 0); + break; + case STATE_REQ_SENT: + case STATE_ACK_SENT: + sp->rst_counter[cp->protoidx] = sp->lcp.max_configure; + /* + * Slow things down a bit if we think we might be + * in loopback. Depend on the timeout to send the + * next configuration request. + */ + if (sp->pp_loopcnt) + break; + (cp->scr)(sp); + break; + case STATE_OPENED: + (cp->tld)(sp); + /* FALLTHROUGH */ + case STATE_ACK_RCVD: + sppp_cp_change_state(cp, sp, STATE_REQ_SENT); + (cp->scr)(sp); + break; + case STATE_CLOSING: + case STATE_STOPPING: + break; + default: + printf(SPP_FMT "%s illegal %s in state %s\n", + SPP_ARGS(ifp), cp->name, + sppp_cp_type_name(h->type), + sppp_state_name(sp->state[cp->protoidx])); + ++ifp->if_ierrors; + } + break; + + case TERM_REQ: + switch (sp->state[cp->protoidx]) { + case STATE_ACK_RCVD: + case STATE_ACK_SENT: + sppp_cp_change_state(cp, sp, STATE_REQ_SENT); + /* FALLTHROUGH */ + case STATE_CLOSED: + case STATE_STOPPED: + case STATE_CLOSING: + case STATE_STOPPING: + case STATE_REQ_SENT: + sta: + /* Send Terminate-Ack packet. 
*/ + if (debug) + log(LOG_DEBUG, SPP_FMT "%s send terminate-ack\n", + SPP_ARGS(ifp), cp->name); + sppp_cp_send(sp, cp->proto, TERM_ACK, h->ident, 0, 0); + break; + case STATE_OPENED: + (cp->tld)(sp); + sp->rst_counter[cp->protoidx] = 0; + sppp_cp_change_state(cp, sp, STATE_STOPPING); + goto sta; + break; + default: + printf(SPP_FMT "%s illegal %s in state %s\n", + SPP_ARGS(ifp), cp->name, + sppp_cp_type_name(h->type), + sppp_state_name(sp->state[cp->protoidx])); + ++ifp->if_ierrors; + } + break; + case TERM_ACK: + switch (sp->state[cp->protoidx]) { + case STATE_CLOSED: + case STATE_STOPPED: + case STATE_REQ_SENT: + case STATE_ACK_SENT: + break; + case STATE_CLOSING: + sppp_cp_change_state(cp, sp, STATE_CLOSED); + (cp->tlf)(sp); + break; + case STATE_STOPPING: + sppp_cp_change_state(cp, sp, STATE_STOPPED); + (cp->tlf)(sp); + break; + case STATE_ACK_RCVD: + sppp_cp_change_state(cp, sp, STATE_REQ_SENT); + break; + case STATE_OPENED: + (cp->tld)(sp); + (cp->scr)(sp); + sppp_cp_change_state(cp, sp, STATE_ACK_RCVD); + break; + default: + printf(SPP_FMT "%s illegal %s in state %s\n", + SPP_ARGS(ifp), cp->name, + sppp_cp_type_name(h->type), + sppp_state_name(sp->state[cp->protoidx])); + ++ifp->if_ierrors; + } + break; + case CODE_REJ: + /* XXX catastrophic rejects (RXJ-) aren't handled yet. */ + log(LOG_INFO, + SPP_FMT "%s: ignoring RXJ (%s) for proto 0x%x, " + "danger will robinson\n", + SPP_ARGS(ifp), cp->name, + sppp_cp_type_name(h->type), ntohs(*((u_short *)p))); + switch (sp->state[cp->protoidx]) { + case STATE_CLOSED: + case STATE_STOPPED: + case STATE_REQ_SENT: + case STATE_ACK_SENT: + case STATE_CLOSING: + case STATE_STOPPING: + case STATE_OPENED: + break; + case STATE_ACK_RCVD: + sppp_cp_change_state(cp, sp, STATE_REQ_SENT); + break; + default: + printf(SPP_FMT "%s illegal %s in state %s\n", + SPP_ARGS(ifp), cp->name, + sppp_cp_type_name(h->type), + sppp_state_name(sp->state[cp->protoidx])); + ++ifp->if_ierrors; + } + break; + case PROTO_REJ: + { + int catastrophic; + const struct cp *upper; + int i; + u_int16_t proto; + + catastrophic = 0; + upper = NULL; + proto = ntohs(*((u_int16_t *)p)); + for (i = 0; i < IDX_COUNT; i++) { + if (cps[i]->proto == proto) { + upper = cps[i]; + break; + } + } + if (upper == NULL) + catastrophic++; + + if (catastrophic || debug) + log(catastrophic? LOG_INFO: LOG_DEBUG, + SPP_FMT "%s: RXJ%c (%s) for proto 0x%x (%s/%s)\n", + SPP_ARGS(ifp), cp->name, catastrophic ? '-' : '+', + sppp_cp_type_name(h->type), proto, + upper ? upper->name : "unknown", + upper ? sppp_state_name(sp->state[upper->protoidx]) : "?"); + + /* + * if we got RXJ+ against conf-req, the peer does not implement + * this particular protocol type. terminate the protocol. + */ + if (upper && !catastrophic) { + if (sp->state[upper->protoidx] == STATE_REQ_SENT) { + upper->Close(sp); + break; + } + } + + /* XXX catastrophic rejects (RXJ-) aren't handled yet. */ + switch (sp->state[cp->protoidx]) { + case STATE_CLOSED: + case STATE_STOPPED: + case STATE_REQ_SENT: + case STATE_ACK_SENT: + case STATE_CLOSING: + case STATE_STOPPING: + case STATE_OPENED: + break; + case STATE_ACK_RCVD: + sppp_cp_change_state(cp, sp, STATE_REQ_SENT); + break; + default: + printf(SPP_FMT "%s illegal %s in state %s\n", + SPP_ARGS(ifp), cp->name, + sppp_cp_type_name(h->type), + sppp_state_name(sp->state[cp->protoidx])); + ++ifp->if_ierrors; + } + break; + } + case DISC_REQ: + if (cp->proto != PPP_LCP) + goto illegal; + /* Discard the packet. 
*/ + break; + case ECHO_REQ: + if (cp->proto != PPP_LCP) + goto illegal; + if (sp->state[cp->protoidx] != STATE_OPENED) { + if (debug) + log(-1, SPP_FMT "lcp echo req but lcp closed\n", + SPP_ARGS(ifp)); + ++ifp->if_ierrors; + break; + } + if (len < 8) { + if (debug) + log(-1, SPP_FMT "invalid lcp echo request " + "packet length: %d bytes\n", + SPP_ARGS(ifp), len); + break; + } + if ((sp->lcp.opts & (1 << LCP_OPT_MAGIC)) && + ntohl (*(long*)(h+1)) == sp->lcp.magic) { + /* Line loopback mode detected. */ + printf(SPP_FMT "loopback\n", SPP_ARGS(ifp)); + sp->pp_loopcnt = MAXALIVECNT * 5; + if_down (ifp); + sppp_qflush (&sp->pp_cpq); + + /* Shut down the PPP link. */ + /* XXX */ + lcp.Down(sp); + lcp.Up(sp); + break; + } + *(long*)(h+1) = htonl (sp->lcp.magic); + if (debug) + log(-1, SPP_FMT "got lcp echo req, sending echo rep\n", + SPP_ARGS(ifp)); + sppp_cp_send (sp, PPP_LCP, ECHO_REPLY, h->ident, len-4, h+1); + break; + case ECHO_REPLY: + if (cp->proto != PPP_LCP) + goto illegal; + if (h->ident != sp->lcp.echoid) { + ++ifp->if_ierrors; + break; + } + if (len < 8) { + if (debug) + log(-1, SPP_FMT "lcp invalid echo reply " + "packet length: %d bytes\n", + SPP_ARGS(ifp), len); + break; + } + if (debug) + log(-1, SPP_FMT "lcp got echo rep\n", + SPP_ARGS(ifp)); + if (!(sp->lcp.opts & (1 << LCP_OPT_MAGIC)) || + ntohl (*(long*)(h+1)) != sp->lcp.magic) + sp->pp_alivecnt = 0; + break; + default: + /* Unknown packet type -- send Code-Reject packet. */ + illegal: + if (debug) + log(-1, SPP_FMT "%s send code-rej for 0x%x\n", + SPP_ARGS(ifp), cp->name, h->type); + sppp_cp_send(sp, cp->proto, CODE_REJ, + ++sp->pp_seq[cp->protoidx], m->m_pkthdr.len, h); + ++ifp->if_ierrors; + } +} + + +/* + * The generic part of all Up/Down/Open/Close/TO event handlers. + * Basically, the state transition handling in the automaton. 
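+ *
+ * The states and events are those of the RFC 1661 option negotiation
+ * automaton.  As a rough excerpt of its transition table (a few rows
+ * only; irc = initialize restart counter, scr/str = send
+ * configure/terminate request):
+ *
+ *	Up:    Initial -> Closed;  Starting -> Req-Sent (irc, scr)
+ *	Down:  Closed -> Initial;  Stopped -> Starting (tls)
+ *	Open:  Initial -> Starting (tls);  Closed -> Req-Sent (irc, scr)
+ *	Close: Opened -> Closing (tld, irc, str)
+ *	TO+:   Req-Sent -> Req-Sent (scr);  Closing -> Closing (str)
+ *	TO-:   Req-Sent -> Stopped (tlf);  Closing -> Closed (tlf)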
+ */ +static void +sppp_up_event(const struct cp *cp, struct sppp *sp) +{ + STDDCL; + + if (debug) + log(LOG_DEBUG, SPP_FMT "%s up(%s)\n", + SPP_ARGS(ifp), cp->name, + sppp_state_name(sp->state[cp->protoidx])); + + switch (sp->state[cp->protoidx]) { + case STATE_INITIAL: + sppp_cp_change_state(cp, sp, STATE_CLOSED); + break; + case STATE_STARTING: + sp->rst_counter[cp->protoidx] = sp->lcp.max_configure; + (cp->scr)(sp); + sppp_cp_change_state(cp, sp, STATE_REQ_SENT); + break; + default: + printf(SPP_FMT "%s illegal up in state %s\n", + SPP_ARGS(ifp), cp->name, + sppp_state_name(sp->state[cp->protoidx])); + } +} + +static void +sppp_down_event(const struct cp *cp, struct sppp *sp) +{ + STDDCL; + + if (debug) + log(LOG_DEBUG, SPP_FMT "%s down(%s)\n", + SPP_ARGS(ifp), cp->name, + sppp_state_name(sp->state[cp->protoidx])); + + switch (sp->state[cp->protoidx]) { + case STATE_CLOSED: + case STATE_CLOSING: + sppp_cp_change_state(cp, sp, STATE_INITIAL); + break; + case STATE_STOPPED: + sppp_cp_change_state(cp, sp, STATE_STARTING); + (cp->tls)(sp); + break; + case STATE_STOPPING: + case STATE_REQ_SENT: + case STATE_ACK_RCVD: + case STATE_ACK_SENT: + sppp_cp_change_state(cp, sp, STATE_STARTING); + break; + case STATE_OPENED: + (cp->tld)(sp); + sppp_cp_change_state(cp, sp, STATE_STARTING); + break; + default: + printf(SPP_FMT "%s illegal down in state %s\n", + SPP_ARGS(ifp), cp->name, + sppp_state_name(sp->state[cp->protoidx])); + } +} + + +static void +sppp_open_event(const struct cp *cp, struct sppp *sp) +{ + STDDCL; + + if (debug) + log(LOG_DEBUG, SPP_FMT "%s open(%s)\n", + SPP_ARGS(ifp), cp->name, + sppp_state_name(sp->state[cp->protoidx])); + + switch (sp->state[cp->protoidx]) { + case STATE_INITIAL: + sppp_cp_change_state(cp, sp, STATE_STARTING); + (cp->tls)(sp); + break; + case STATE_STARTING: + break; + case STATE_CLOSED: + sp->rst_counter[cp->protoidx] = sp->lcp.max_configure; + (cp->scr)(sp); + sppp_cp_change_state(cp, sp, STATE_REQ_SENT); + break; + case STATE_STOPPED: + /* + * Try escaping stopped state. This seems to bite + * people occasionally, in particular for IPCP, + * presumably following previous IPCP negotiation + * aborts. Somehow, we must have missed a Down event + * which would have caused a transition into starting + * state, so as a bandaid we force the Down event now. + * This effectively implements (something like the) + * `restart' option mentioned in the state transition + * table of RFC 1661. 
+ */ + sppp_cp_change_state(cp, sp, STATE_STARTING); + (cp->tls)(sp); + break; + case STATE_STOPPING: + case STATE_REQ_SENT: + case STATE_ACK_RCVD: + case STATE_ACK_SENT: + case STATE_OPENED: + break; + case STATE_CLOSING: + sppp_cp_change_state(cp, sp, STATE_STOPPING); + break; + } +} + + +static void +sppp_close_event(const struct cp *cp, struct sppp *sp) +{ + STDDCL; + + if (debug) + log(LOG_DEBUG, SPP_FMT "%s close(%s)\n", + SPP_ARGS(ifp), cp->name, + sppp_state_name(sp->state[cp->protoidx])); + + switch (sp->state[cp->protoidx]) { + case STATE_INITIAL: + case STATE_CLOSED: + case STATE_CLOSING: + break; + case STATE_STARTING: + sppp_cp_change_state(cp, sp, STATE_INITIAL); + (cp->tlf)(sp); + break; + case STATE_STOPPED: + sppp_cp_change_state(cp, sp, STATE_CLOSED); + break; + case STATE_STOPPING: + sppp_cp_change_state(cp, sp, STATE_CLOSING); + break; + case STATE_OPENED: + (cp->tld)(sp); + /* FALLTHROUGH */ + case STATE_REQ_SENT: + case STATE_ACK_RCVD: + case STATE_ACK_SENT: + sp->rst_counter[cp->protoidx] = sp->lcp.max_terminate; + sppp_cp_send(sp, cp->proto, TERM_REQ, + ++sp->pp_seq[cp->protoidx], 0, 0); + sppp_cp_change_state(cp, sp, STATE_CLOSING); + break; + } +} + +static void +sppp_to_event(const struct cp *cp, struct sppp *sp) +{ + STDDCL; + int s; + + s = splimp(); + SPPP_LOCK(sp); + if (debug) + log(LOG_DEBUG, SPP_FMT "%s TO(%s) rst_counter = %d\n", + SPP_ARGS(ifp), cp->name, + sppp_state_name(sp->state[cp->protoidx]), + sp->rst_counter[cp->protoidx]); + + if (--sp->rst_counter[cp->protoidx] < 0) + /* TO- event */ + switch (sp->state[cp->protoidx]) { + case STATE_CLOSING: + sppp_cp_change_state(cp, sp, STATE_CLOSED); + (cp->tlf)(sp); + break; + case STATE_STOPPING: + sppp_cp_change_state(cp, sp, STATE_STOPPED); + (cp->tlf)(sp); + break; + case STATE_REQ_SENT: + case STATE_ACK_RCVD: + case STATE_ACK_SENT: + sppp_cp_change_state(cp, sp, STATE_STOPPED); + (cp->tlf)(sp); + break; + } + else + /* TO+ event */ + switch (sp->state[cp->protoidx]) { + case STATE_CLOSING: + case STATE_STOPPING: + sppp_cp_send(sp, cp->proto, TERM_REQ, + ++sp->pp_seq[cp->protoidx], 0, 0); + callout_reset(&sp->ch[cp->protoidx], sp->lcp.timeout, + cp->TO, (void *)sp); + break; + case STATE_REQ_SENT: + case STATE_ACK_RCVD: + (cp->scr)(sp); + /* sppp_cp_change_state() will restart the timer */ + sppp_cp_change_state(cp, sp, STATE_REQ_SENT); + break; + case STATE_ACK_SENT: + (cp->scr)(sp); + callout_reset(&sp->ch[cp->protoidx], sp->lcp.timeout, + cp->TO, (void *)sp); + break; + } + + SPPP_UNLOCK(sp); + splx(s); +} + +/* + * Change the state of a control protocol in the state automaton. + * Takes care of starting/stopping the restart timer. + */ +static void +sppp_cp_change_state(const struct cp *cp, struct sppp *sp, int newstate) +{ + sp->state[cp->protoidx] = newstate; + + callout_stop (&sp->ch[cp->protoidx]); + + switch (newstate) { + case STATE_INITIAL: + case STATE_STARTING: + case STATE_CLOSED: + case STATE_STOPPED: + case STATE_OPENED: + break; + case STATE_CLOSING: + case STATE_STOPPING: + case STATE_REQ_SENT: + case STATE_ACK_RCVD: + case STATE_ACK_SENT: + callout_reset(&sp->ch[cp->protoidx], sp->lcp.timeout, + cp->TO, (void *)sp); + break; + } +} + +/* + *--------------------------------------------------------------------------* + * * + * The LCP implementation. 
* + * * + *--------------------------------------------------------------------------* + */ +static void +sppp_pp_up(struct sppp *sp) +{ + SPPP_LOCK(sp); + lcp.Up(sp); + SPPP_UNLOCK(sp); +} + +static void +sppp_pp_down(struct sppp *sp) +{ + SPPP_LOCK(sp); + lcp.Down(sp); + SPPP_UNLOCK(sp); +} + +static void +sppp_lcp_init(struct sppp *sp) +{ + sp->lcp.opts = (1 << LCP_OPT_MAGIC); + sp->lcp.magic = 0; + sp->state[IDX_LCP] = STATE_INITIAL; + sp->fail_counter[IDX_LCP] = 0; + sp->pp_seq[IDX_LCP] = 0; + sp->pp_rseq[IDX_LCP] = 0; + sp->lcp.protos = 0; + sp->lcp.mru = sp->lcp.their_mru = PP_MTU; + + /* Note that these values are relevant for all control protocols */ + sp->lcp.timeout = 3 * hz; + sp->lcp.max_terminate = 2; + sp->lcp.max_configure = 10; + sp->lcp.max_failure = 10; + callout_init(&sp->ch[IDX_LCP], CALLOUT_MPSAFE); +} + +static void +sppp_lcp_up(struct sppp *sp) +{ + STDDCL; + + sp->pp_alivecnt = 0; + sp->lcp.opts = (1 << LCP_OPT_MAGIC); + sp->lcp.magic = 0; + sp->lcp.protos = 0; + sp->lcp.mru = sp->lcp.their_mru = PP_MTU; + /* + * If we are authenticator, negotiate LCP_AUTH + */ + if (sp->hisauth.proto != 0) + sp->lcp.opts |= (1 << LCP_OPT_AUTH_PROTO); + else + sp->lcp.opts &= ~(1 << LCP_OPT_AUTH_PROTO); + sp->pp_flags &= ~PP_NEEDAUTH; + /* + * If this interface is passive or dial-on-demand, and we are + * still in Initial state, it means we've got an incoming + * call. Activate the interface. + */ + if ((ifp->if_flags & (IFF_AUTO | IFF_PASSIVE)) != 0) { + if (debug) + log(LOG_DEBUG, + SPP_FMT "Up event", SPP_ARGS(ifp)); + ifp->if_drv_flags |= IFF_DRV_RUNNING; + if (sp->state[IDX_LCP] == STATE_INITIAL) { + if (debug) + log(-1, "(incoming call)\n"); + sp->pp_flags |= PP_CALLIN; + lcp.Open(sp); + } else if (debug) + log(-1, "\n"); + } else if ((ifp->if_flags & (IFF_AUTO | IFF_PASSIVE)) == 0 && + (sp->state[IDX_LCP] == STATE_INITIAL)) { + ifp->if_drv_flags |= IFF_DRV_RUNNING; + lcp.Open(sp); + } + + sppp_up_event(&lcp, sp); +} + +static void +sppp_lcp_down(struct sppp *sp) +{ + STDDCL; + + sppp_down_event(&lcp, sp); + + /* + * If this is neither a dial-on-demand nor a passive + * interface, simulate an ``ifconfig down'' action, so the + * administrator can force a redial by another ``ifconfig + * up''. XXX For leased line operation, should we immediately + * try to reopen the connection here? + */ + if ((ifp->if_flags & (IFF_AUTO | IFF_PASSIVE)) == 0) { + log(LOG_INFO, + SPP_FMT "Down event, taking interface down.\n", + SPP_ARGS(ifp)); + if_down(ifp); + } else { + if (debug) + log(LOG_DEBUG, + SPP_FMT "Down event (carrier loss)\n", + SPP_ARGS(ifp)); + sp->pp_flags &= ~PP_CALLIN; + if (sp->state[IDX_LCP] != STATE_INITIAL) + lcp.Close(sp); + ifp->if_drv_flags &= ~IFF_DRV_RUNNING; + } +} + +static void +sppp_lcp_open(struct sppp *sp) +{ + sppp_open_event(&lcp, sp); +} + +static void +sppp_lcp_close(struct sppp *sp) +{ + sppp_close_event(&lcp, sp); +} + +static void +sppp_lcp_TO(void *cookie) +{ + sppp_to_event(&lcp, (struct sppp *)cookie); +} + +/* + * Analyze a configure request. Return true if it was agreeable, and + * caused action sca, false if it has been rejected or nak'ed, and + * caused action scn. (The return value is used to make the state + * transition decision in the state automaton.) + */ +static int +sppp_lcp_RCR(struct sppp *sp, struct lcp_header *h, int len) +{ + STDDCL; + u_char *buf, *r, *p; + int origlen, rlen; + u_long nmagic; + u_short authproto; + + len -= 4; + origlen = len; + buf = r = malloc (len, M_TEMP, M_NOWAIT); + if (! 
buf) + return (0); + + if (debug) + log(LOG_DEBUG, SPP_FMT "lcp parse opts: ", + SPP_ARGS(ifp)); + + /* pass 1: check for things that need to be rejected */ + p = (void*) (h+1); + for (rlen=0; len >= 2 && p[1] >= 2 && len >= p[1]; + len-=p[1], p+=p[1]) { + if (debug) + log(-1, " %s ", sppp_lcp_opt_name(*p)); + switch (*p) { + case LCP_OPT_MAGIC: + /* Magic number. */ + if (len >= 6 && p[1] == 6) + continue; + if (debug) + log(-1, "[invalid] "); + break; + case LCP_OPT_ASYNC_MAP: + /* Async control character map. */ + if (len >= 6 && p[1] == 6) + continue; + if (debug) + log(-1, "[invalid] "); + break; + case LCP_OPT_MRU: + /* Maximum receive unit. */ + if (len >= 4 && p[1] == 4) + continue; + if (debug) + log(-1, "[invalid] "); + break; + case LCP_OPT_AUTH_PROTO: + if (len < 4) { + if (debug) + log(-1, "[invalid] "); + break; + } + authproto = (p[2] << 8) + p[3]; + if (authproto == PPP_CHAP && p[1] != 5) { + if (debug) + log(-1, "[invalid chap len] "); + break; + } + if (sp->myauth.proto == 0) { + /* we are not configured to do auth */ + if (debug) + log(-1, "[not configured] "); + break; + } + /* + * Remote want us to authenticate, remember this, + * so we stay in PHASE_AUTHENTICATE after LCP got + * up. + */ + sp->pp_flags |= PP_NEEDAUTH; + continue; + default: + /* Others not supported. */ + if (debug) + log(-1, "[rej] "); + break; + } + /* Add the option to rejected list. */ + bcopy (p, r, p[1]); + r += p[1]; + rlen += p[1]; + } + if (rlen) { + if (debug) + log(-1, " send conf-rej\n"); + sppp_cp_send (sp, PPP_LCP, CONF_REJ, h->ident, rlen, buf); + return 0; + } else if (debug) + log(-1, "\n"); + + /* + * pass 2: check for option values that are unacceptable and + * thus require to be nak'ed. + */ + if (debug) + log(LOG_DEBUG, SPP_FMT "lcp parse opt values: ", + SPP_ARGS(ifp)); + + p = (void*) (h+1); + len = origlen; + for (rlen=0; len >= 2 && p[1] >= 2 && len >= p[1]; + len-=p[1], p+=p[1]) { + if (debug) + log(-1, " %s ", sppp_lcp_opt_name(*p)); + switch (*p) { + case LCP_OPT_MAGIC: + /* Magic number -- extract. */ + nmagic = (u_long)p[2] << 24 | + (u_long)p[3] << 16 | p[4] << 8 | p[5]; + if (nmagic != sp->lcp.magic) { + sp->pp_loopcnt = 0; + if (debug) + log(-1, "0x%lx ", nmagic); + continue; + } + if (debug && sp->pp_loopcnt < MAXALIVECNT*5) + log(-1, "[glitch] "); + ++sp->pp_loopcnt; + /* + * We negate our magic here, and NAK it. If + * we see it later in an NAK packet, we + * suggest a new one. + */ + nmagic = ~sp->lcp.magic; + /* Gonna NAK it. */ + p[2] = nmagic >> 24; + p[3] = nmagic >> 16; + p[4] = nmagic >> 8; + p[5] = nmagic; + break; + + case LCP_OPT_ASYNC_MAP: + /* + * Async control character map -- just ignore it. + * + * Quote from RFC 1662, chapter 6: + * To enable this functionality, synchronous PPP + * implementations MUST always respond to the + * Async-Control-Character-Map Configuration + * Option with the LCP Configure-Ack. However, + * acceptance of the Configuration Option does + * not imply that the synchronous implementation + * will do any ACCM mapping. Instead, all such + * octet mapping will be performed by the + * asynchronous-to-synchronous converter. + */ + continue; + + case LCP_OPT_MRU: + /* + * Maximum receive unit. Always agreeable, + * but ignored by now. 
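+ *
+ * Like every LCP option this is a type-length-value triple: p[0]
+ * is the type, p[1] the total option length including those two
+ * octets, and the 16-bit MRU payload sits in p[2] and p[3], most
+ * significant octet first -- hence p[2] * 256 + p[3] below.  As a
+ * worked example, an option asking for an MRU of 1500 octets is
+ * the four octets 01 04 05 dc.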
+ */ + sp->lcp.their_mru = p[2] * 256 + p[3]; + if (debug) + log(-1, "%lu ", sp->lcp.their_mru); + continue; + + case LCP_OPT_AUTH_PROTO: + authproto = (p[2] << 8) + p[3]; + if (sp->myauth.proto != authproto) { + /* not agreed, nak */ + if (debug) + log(-1, "[mine %s != his %s] ", + sppp_proto_name(sp->hisauth.proto), + sppp_proto_name(authproto)); + p[2] = sp->myauth.proto >> 8; + p[3] = sp->myauth.proto; + break; + } + if (authproto == PPP_CHAP && p[4] != CHAP_MD5) { + if (debug) + log(-1, "[chap not MD5] "); + p[4] = CHAP_MD5; + break; + } + continue; + } + /* Add the option to nak'ed list. */ + bcopy (p, r, p[1]); + r += p[1]; + rlen += p[1]; + } + if (rlen) { + /* + * Local and remote magics equal -- loopback? + */ + if (sp->pp_loopcnt >= MAXALIVECNT*5) { + if (sp->pp_loopcnt == MAXALIVECNT*5) + printf (SPP_FMT "loopback\n", + SPP_ARGS(ifp)); + if (ifp->if_flags & IFF_UP) { + if_down(ifp); + sppp_qflush(&sp->pp_cpq); + /* XXX ? */ + lcp.Down(sp); + lcp.Up(sp); + } + } else if (!sp->pp_loopcnt && + ++sp->fail_counter[IDX_LCP] >= sp->lcp.max_failure) { + if (debug) + log(-1, " max_failure (%d) exceeded, " + "send conf-rej\n", + sp->lcp.max_failure); + sppp_cp_send(sp, PPP_LCP, CONF_REJ, h->ident, rlen, buf); + } else { + if (debug) + log(-1, " send conf-nak\n"); + sppp_cp_send (sp, PPP_LCP, CONF_NAK, h->ident, rlen, buf); + } + } else { + if (debug) + log(-1, " send conf-ack\n"); + sp->fail_counter[IDX_LCP] = 0; + sp->pp_loopcnt = 0; + sppp_cp_send (sp, PPP_LCP, CONF_ACK, + h->ident, origlen, h+1); + } + + free (buf, M_TEMP); + return (rlen == 0); +} + +/* + * Analyze the LCP Configure-Reject option list, and adjust our + * negotiation. + */ +static void +sppp_lcp_RCN_rej(struct sppp *sp, struct lcp_header *h, int len) +{ + STDDCL; + u_char *buf, *p; + + len -= 4; + buf = malloc (len, M_TEMP, M_NOWAIT); + if (!buf) + return; + + if (debug) + log(LOG_DEBUG, SPP_FMT "lcp rej opts: ", + SPP_ARGS(ifp)); + + p = (void*) (h+1); + for (; len >= 2 && p[1] >= 2 && len >= p[1]; + len -= p[1], p += p[1]) { + if (debug) + log(-1, " %s ", sppp_lcp_opt_name(*p)); + switch (*p) { + case LCP_OPT_MAGIC: + /* Magic number -- can't use it, use 0 */ + sp->lcp.opts &= ~(1 << LCP_OPT_MAGIC); + sp->lcp.magic = 0; + break; + case LCP_OPT_MRU: + /* + * Should not be rejected anyway, since we only + * negotiate a MRU if explicitly requested by + * peer. + */ + sp->lcp.opts &= ~(1 << LCP_OPT_MRU); + break; + case LCP_OPT_AUTH_PROTO: + /* + * Peer doesn't want to authenticate himself, + * deny unless this is a dialout call, and + * AUTHFLAG_NOCALLOUT is set. + */ + if ((sp->pp_flags & PP_CALLIN) == 0 && + (sp->hisauth.flags & AUTHFLAG_NOCALLOUT) != 0) { + if (debug) + log(-1, "[don't insist on auth " + "for callout]"); + sp->lcp.opts &= ~(1 << LCP_OPT_AUTH_PROTO); + break; + } + if (debug) + log(-1, "[access denied]\n"); + lcp.Close(sp); + break; + } + } + if (debug) + log(-1, "\n"); + free (buf, M_TEMP); + return; +} + +/* + * Analyze the LCP Configure-NAK option list, and adjust our + * negotiation. 
+ */ +static void +sppp_lcp_RCN_nak(struct sppp *sp, struct lcp_header *h, int len) +{ + STDDCL; + u_char *buf, *p; + u_long magic; + + len -= 4; + buf = malloc (len, M_TEMP, M_NOWAIT); + if (!buf) + return; + + if (debug) + log(LOG_DEBUG, SPP_FMT "lcp nak opts: ", + SPP_ARGS(ifp)); + + p = (void*) (h+1); + for (; len >= 2 && p[1] >= 2 && len >= p[1]; + len -= p[1], p += p[1]) { + if (debug) + log(-1, " %s ", sppp_lcp_opt_name(*p)); + switch (*p) { + case LCP_OPT_MAGIC: + /* Magic number -- renegotiate */ + if ((sp->lcp.opts & (1 << LCP_OPT_MAGIC)) && + len >= 6 && p[1] == 6) { + magic = (u_long)p[2] << 24 | + (u_long)p[3] << 16 | p[4] << 8 | p[5]; + /* + * If the remote magic is our negated one, + * this looks like a loopback problem. + * Suggest a new magic to make sure. + */ + if (magic == ~sp->lcp.magic) { + if (debug) + log(-1, "magic glitch "); + sp->lcp.magic = random(); + } else { + sp->lcp.magic = magic; + if (debug) + log(-1, "%lu ", magic); + } + } + break; + case LCP_OPT_MRU: + /* + * Peer wants to advise us to negotiate an MRU. + * Agree on it if it's reasonable, or use + * default otherwise. + */ + if (len >= 4 && p[1] == 4) { + u_int mru = p[2] * 256 + p[3]; + if (debug) + log(-1, "%d ", mru); + if (mru < PP_MTU || mru > PP_MAX_MRU) + mru = PP_MTU; + sp->lcp.mru = mru; + sp->lcp.opts |= (1 << LCP_OPT_MRU); + } + break; + case LCP_OPT_AUTH_PROTO: + /* + * Peer doesn't like our authentication method, + * deny. + */ + if (debug) + log(-1, "[access denied]\n"); + lcp.Close(sp); + break; + } + } + if (debug) + log(-1, "\n"); + free (buf, M_TEMP); + return; +} + +static void +sppp_lcp_tlu(struct sppp *sp) +{ + STDDCL; + int i; + u_long mask; + + /* XXX ? */ + if (! (ifp->if_flags & IFF_UP) && + (ifp->if_drv_flags & IFF_DRV_RUNNING)) { + /* Coming out of loopback mode. */ + if_up(ifp); + printf (SPP_FMT "up\n", SPP_ARGS(ifp)); + } + + for (i = 0; i < IDX_COUNT; i++) + if ((cps[i])->flags & CP_QUAL) + (cps[i])->Open(sp); + + if ((sp->lcp.opts & (1 << LCP_OPT_AUTH_PROTO)) != 0 || + (sp->pp_flags & PP_NEEDAUTH) != 0) + sp->pp_phase = PHASE_AUTHENTICATE; + else + sp->pp_phase = PHASE_NETWORK; + + if (debug) + log(LOG_DEBUG, SPP_FMT "phase %s\n", SPP_ARGS(ifp), + sppp_phase_name(sp->pp_phase)); + + /* + * Open all authentication protocols. This is even required + * if we already proceeded to network phase, since it might be + * that remote wants us to authenticate, so we might have to + * send a PAP request. Undesired authentication protocols + * don't do anything when they get an Open event. + */ + for (i = 0; i < IDX_COUNT; i++) + if ((cps[i])->flags & CP_AUTH) + (cps[i])->Open(sp); + + if (sp->pp_phase == PHASE_NETWORK) { + /* Notify all NCPs. */ + for (i = 0; i < IDX_COUNT; i++) + if (((cps[i])->flags & CP_NCP) && + /* + * XXX + * Hack to administratively disable IPv6 if + * not desired. Perhaps we should have another + * flag for this, but right now, we can make + * all struct cp's read/only. + */ + (cps[i] != &ipv6cp || + (sp->confflags & CONF_ENABLE_IPV6))) + (cps[i])->Open(sp); + } + + /* Send Up events to all started protos. 
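+ *
+ * sp->lcp.protos is a bitmask indexed by the IDX_* constants; an
+ * NCP sets its bit in its tls action and clears it in tlf.  For
+ * example, once sppp_ipcp_tls() has run,
+ *
+ *	(sp->lcp.protos & (1 << IDX_IPCP)) != 0
+ *
+ * and the mask loop below will deliver an Up event to IPCP.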
*/ + for (i = 0, mask = 1; i < IDX_COUNT; i++, mask <<= 1) + if ((sp->lcp.protos & mask) && ((cps[i])->flags & CP_LCP) == 0) + (cps[i])->Up(sp); + + /* notify low-level driver of state change */ + if (sp->pp_chg) + sp->pp_chg(sp, (int)sp->pp_phase); + + if (sp->pp_phase == PHASE_NETWORK) + /* if no NCP is starting, close down */ + sppp_lcp_check_and_close(sp); +} + +static void +sppp_lcp_tld(struct sppp *sp) +{ + STDDCL; + int i; + u_long mask; + + sp->pp_phase = PHASE_TERMINATE; + + if (debug) + log(LOG_DEBUG, SPP_FMT "phase %s\n", SPP_ARGS(ifp), + sppp_phase_name(sp->pp_phase)); + + /* + * Take upper layers down. We send the Down event first and + * the Close second to prevent the upper layers from sending + * ``a flurry of terminate-request packets'', as the RFC + * describes it. + */ + for (i = 0, mask = 1; i < IDX_COUNT; i++, mask <<= 1) + if ((sp->lcp.protos & mask) && ((cps[i])->flags & CP_LCP) == 0) { + (cps[i])->Down(sp); + (cps[i])->Close(sp); + } +} + +static void +sppp_lcp_tls(struct sppp *sp) +{ + STDDCL; + + sp->pp_phase = PHASE_ESTABLISH; + + if (debug) + log(LOG_DEBUG, SPP_FMT "phase %s\n", SPP_ARGS(ifp), + sppp_phase_name(sp->pp_phase)); + + /* Notify lower layer if desired. */ + if (sp->pp_tls) + (sp->pp_tls)(sp); + else + (sp->pp_up)(sp); +} + +static void +sppp_lcp_tlf(struct sppp *sp) +{ + STDDCL; + + sp->pp_phase = PHASE_DEAD; + if (debug) + log(LOG_DEBUG, SPP_FMT "phase %s\n", SPP_ARGS(ifp), + sppp_phase_name(sp->pp_phase)); + + /* Notify lower layer if desired. */ + if (sp->pp_tlf) + (sp->pp_tlf)(sp); + else + (sp->pp_down)(sp); +} + +static void +sppp_lcp_scr(struct sppp *sp) +{ + char opt[6 /* magicnum */ + 4 /* mru */ + 5 /* chap */]; + int i = 0; + u_short authproto; + + if (sp->lcp.opts & (1 << LCP_OPT_MAGIC)) { + if (! sp->lcp.magic) + sp->lcp.magic = random(); + opt[i++] = LCP_OPT_MAGIC; + opt[i++] = 6; + opt[i++] = sp->lcp.magic >> 24; + opt[i++] = sp->lcp.magic >> 16; + opt[i++] = sp->lcp.magic >> 8; + opt[i++] = sp->lcp.magic; + } + + if (sp->lcp.opts & (1 << LCP_OPT_MRU)) { + opt[i++] = LCP_OPT_MRU; + opt[i++] = 4; + opt[i++] = sp->lcp.mru >> 8; + opt[i++] = sp->lcp.mru; + } + + if (sp->lcp.opts & (1 << LCP_OPT_AUTH_PROTO)) { + authproto = sp->hisauth.proto; + opt[i++] = LCP_OPT_AUTH_PROTO; + opt[i++] = authproto == PPP_CHAP? 5: 4; + opt[i++] = authproto >> 8; + opt[i++] = authproto; + if (authproto == PPP_CHAP) + opt[i++] = CHAP_MD5; + } + + sp->confid[IDX_LCP] = ++sp->pp_seq[IDX_LCP]; + sppp_cp_send (sp, PPP_LCP, CONF_REQ, sp->confid[IDX_LCP], i, &opt); +} + +/* + * Check the open NCPs, return true if at least one NCP is open. + */ +static int +sppp_ncp_check(struct sppp *sp) +{ + int i, mask; + + for (i = 0, mask = 1; i < IDX_COUNT; i++, mask <<= 1) + if ((sp->lcp.protos & mask) && (cps[i])->flags & CP_NCP) + return 1; + return 0; +} + +/* + * Re-check the open NCPs and see if we should terminate the link. + * Called by the NCPs during their tlf action handling. + */ +static void +sppp_lcp_check_and_close(struct sppp *sp) +{ + + if (sp->pp_phase < PHASE_NETWORK) + /* don't bother, we are already going down */ + return; + + if (sppp_ncp_check(sp)) + return; + + lcp.Close(sp); +} + +/* + *--------------------------------------------------------------------------* + * * + * The IPCP implementation. 
* + * * + *--------------------------------------------------------------------------* + */ + +#ifdef INET +static void +sppp_ipcp_init(struct sppp *sp) +{ + sp->ipcp.opts = 0; + sp->ipcp.flags = 0; + sp->state[IDX_IPCP] = STATE_INITIAL; + sp->fail_counter[IDX_IPCP] = 0; + sp->pp_seq[IDX_IPCP] = 0; + sp->pp_rseq[IDX_IPCP] = 0; + callout_init(&sp->ch[IDX_IPCP], CALLOUT_MPSAFE); +} + +static void +sppp_ipcp_up(struct sppp *sp) +{ + sppp_up_event(&ipcp, sp); +} + +static void +sppp_ipcp_down(struct sppp *sp) +{ + sppp_down_event(&ipcp, sp); +} + +static void +sppp_ipcp_open(struct sppp *sp) +{ + STDDCL; + u_long myaddr, hisaddr; + + sp->ipcp.flags &= ~(IPCP_HISADDR_SEEN | IPCP_MYADDR_SEEN | + IPCP_MYADDR_DYN | IPCP_VJ); + sp->ipcp.opts = 0; + + sppp_get_ip_addrs(sp, &myaddr, &hisaddr, 0); + /* + * If we don't have his address, this probably means our + * interface doesn't want to talk IP at all. (This could + * be the case if somebody wants to speak only IPX, for + * example.) Don't open IPCP in this case. + */ + if (hisaddr == 0L) { + /* XXX this message should go away */ + if (debug) + log(LOG_DEBUG, SPP_FMT "ipcp_open(): no IP interface\n", + SPP_ARGS(ifp)); + return; + } + if (myaddr == 0L) { + /* + * I don't have an assigned address, so i need to + * negotiate my address. + */ + sp->ipcp.flags |= IPCP_MYADDR_DYN; + sp->ipcp.opts |= (1 << IPCP_OPT_ADDRESS); + } else + sp->ipcp.flags |= IPCP_MYADDR_SEEN; + if (sp->confflags & CONF_ENABLE_VJ) { + sp->ipcp.opts |= (1 << IPCP_OPT_COMPRESSION); + sp->ipcp.max_state = MAX_STATES - 1; + sp->ipcp.compress_cid = 1; + } + sppp_open_event(&ipcp, sp); +} + +static void +sppp_ipcp_close(struct sppp *sp) +{ + sppp_close_event(&ipcp, sp); + if (sp->ipcp.flags & IPCP_MYADDR_DYN) + /* + * My address was dynamic, clear it again. + */ + sppp_set_ip_addr(sp, 0L); +} + +static void +sppp_ipcp_TO(void *cookie) +{ + sppp_to_event(&ipcp, (struct sppp *)cookie); +} + +/* + * Analyze a configure request. Return true if it was agreeable, and + * caused action sca, false if it has been rejected or nak'ed, and + * caused action scn. (The return value is used to make the state + * transition decision in the state automaton.) + */ +static int +sppp_ipcp_RCR(struct sppp *sp, struct lcp_header *h, int len) +{ + u_char *buf, *r, *p; + struct ifnet *ifp = SP2IFP(sp); + int rlen, origlen, debug = ifp->if_flags & IFF_DEBUG; + u_long hisaddr, desiredaddr; + int gotmyaddr = 0; + int desiredcomp; + + len -= 4; + origlen = len; + /* + * Make sure to allocate a buf that can at least hold a + * conf-nak with an `address' option. We might need it below. + */ + buf = r = malloc ((len < 6? 6: len), M_TEMP, M_NOWAIT); + if (! buf) + return (0); + + /* pass 1: see if we can recognize them */ + if (debug) + log(LOG_DEBUG, SPP_FMT "ipcp parse opts: ", + SPP_ARGS(ifp)); + p = (void*) (h+1); + for (rlen=0; len >= 2 && p[1] >= 2 && len >= p[1]; + len-=p[1], p+=p[1]) { + if (debug) + log(-1, " %s ", sppp_ipcp_opt_name(*p)); + switch (*p) { + case IPCP_OPT_COMPRESSION: + if (!(sp->confflags & CONF_ENABLE_VJ)) { + /* VJ compression administratively disabled */ + if (debug) + log(-1, "[locally disabled] "); + break; + } + /* + * In theory, we should only conf-rej an + * option that is shorter than RFC 1618 + * requires (i.e. < 4), and should conf-nak + * anything else that is not VJ. However, + * since our algorithm always uses the + * original option to NAK it with new values, + * things would become more complicated. 
In + * pratice, the only commonly implemented IP + * compression option is VJ anyway, so the + * difference is negligible. + */ + if (len >= 6 && p[1] == 6) { + /* + * correctly formed compression option + * that could be VJ compression + */ + continue; + } + if (debug) + log(-1, + "optlen %d [invalid/unsupported] ", + p[1]); + break; + case IPCP_OPT_ADDRESS: + if (len >= 6 && p[1] == 6) { + /* correctly formed address option */ + continue; + } + if (debug) + log(-1, "[invalid] "); + break; + default: + /* Others not supported. */ + if (debug) + log(-1, "[rej] "); + break; + } + /* Add the option to rejected list. */ + bcopy (p, r, p[1]); + r += p[1]; + rlen += p[1]; + } + if (rlen) { + if (debug) + log(-1, " send conf-rej\n"); + sppp_cp_send (sp, PPP_IPCP, CONF_REJ, h->ident, rlen, buf); + return 0; + } else if (debug) + log(-1, "\n"); + + /* pass 2: parse option values */ + sppp_get_ip_addrs(sp, 0, &hisaddr, 0); + if (debug) + log(LOG_DEBUG, SPP_FMT "ipcp parse opt values: ", + SPP_ARGS(ifp)); + p = (void*) (h+1); + len = origlen; + for (rlen=0; len >= 2 && p[1] >= 2 && len >= p[1]; + len-=p[1], p+=p[1]) { + if (debug) + log(-1, " %s ", sppp_ipcp_opt_name(*p)); + switch (*p) { + case IPCP_OPT_COMPRESSION: + desiredcomp = p[2] << 8 | p[3]; + /* We only support VJ */ + if (desiredcomp == IPCP_COMP_VJ) { + if (debug) + log(-1, "VJ [ack] "); + sp->ipcp.flags |= IPCP_VJ; + sl_compress_init(sp->pp_comp, p[4]); + sp->ipcp.max_state = p[4]; + sp->ipcp.compress_cid = p[5]; + continue; + } + if (debug) + log(-1, + "compproto %#04x [not supported] ", + desiredcomp); + p[2] = IPCP_COMP_VJ >> 8; + p[3] = IPCP_COMP_VJ; + p[4] = sp->ipcp.max_state; + p[5] = sp->ipcp.compress_cid; + break; + case IPCP_OPT_ADDRESS: + /* This is the address he wants in his end */ + desiredaddr = p[2] << 24 | p[3] << 16 | + p[4] << 8 | p[5]; + if (desiredaddr == hisaddr || + (hisaddr >= 1 && hisaddr <= 254 && desiredaddr != 0)) { + /* + * Peer's address is same as our value, + * or we have set it to 0.0.0.* to + * indicate that we do not really care, + * this is agreeable. Gonna conf-ack + * it. + */ + if (debug) + log(-1, "%s [ack] ", + sppp_dotted_quad(hisaddr)); + /* record that we've seen it already */ + sp->ipcp.flags |= IPCP_HISADDR_SEEN; + continue; + } + /* + * The address wasn't agreeable. This is either + * he sent us 0.0.0.0, asking to assign him an + * address, or he send us another address not + * matching our value. Either case, we gonna + * conf-nak it with our value. + * XXX: we should "rej" if hisaddr == 0 + */ + if (debug) { + if (desiredaddr == 0) + log(-1, "[addr requested] "); + else + log(-1, "%s [not agreed] ", + sppp_dotted_quad(desiredaddr)); + + } + p[2] = hisaddr >> 24; + p[3] = hisaddr >> 16; + p[4] = hisaddr >> 8; + p[5] = hisaddr; + break; + } + /* Add the option to nak'ed list. */ + bcopy (p, r, p[1]); + r += p[1]; + rlen += p[1]; + } + + /* + * If we are about to conf-ack the request, but haven't seen + * his address so far, gonna conf-nak it instead, with the + * `address' option present and our idea of his address being + * filled in there, to request negotiation of both addresses. + * + * XXX This can result in an endless req - nak loop if peer + * doesn't want to send us his address. Q: What should we do + * about it? XXX A: implement the max-failure counter. 
+ */ + if (rlen == 0 && !(sp->ipcp.flags & IPCP_HISADDR_SEEN) && !gotmyaddr) { + buf[0] = IPCP_OPT_ADDRESS; + buf[1] = 6; + buf[2] = hisaddr >> 24; + buf[3] = hisaddr >> 16; + buf[4] = hisaddr >> 8; + buf[5] = hisaddr; + rlen = 6; + if (debug) + log(-1, "still need hisaddr "); + } + + if (rlen) { + if (debug) + log(-1, " send conf-nak\n"); + sppp_cp_send (sp, PPP_IPCP, CONF_NAK, h->ident, rlen, buf); + } else { + if (debug) + log(-1, " send conf-ack\n"); + sppp_cp_send (sp, PPP_IPCP, CONF_ACK, + h->ident, origlen, h+1); + } + + free (buf, M_TEMP); + return (rlen == 0); +} + +/* + * Analyze the IPCP Configure-Reject option list, and adjust our + * negotiation. + */ +static void +sppp_ipcp_RCN_rej(struct sppp *sp, struct lcp_header *h, int len) +{ + u_char *buf, *p; + struct ifnet *ifp = SP2IFP(sp); + int debug = ifp->if_flags & IFF_DEBUG; + + len -= 4; + buf = malloc (len, M_TEMP, M_NOWAIT); + if (!buf) + return; + + if (debug) + log(LOG_DEBUG, SPP_FMT "ipcp rej opts: ", + SPP_ARGS(ifp)); + + p = (void*) (h+1); + for (; len >= 2 && p[1] >= 2 && len >= p[1]; + len -= p[1], p += p[1]) { + if (debug) + log(-1, " %s ", sppp_ipcp_opt_name(*p)); + switch (*p) { + case IPCP_OPT_COMPRESSION: + sp->ipcp.opts &= ~(1 << IPCP_OPT_COMPRESSION); + break; + case IPCP_OPT_ADDRESS: + /* + * Peer doesn't grok address option. This is + * bad. XXX Should we better give up here? + * XXX We could try old "addresses" option... + */ + sp->ipcp.opts &= ~(1 << IPCP_OPT_ADDRESS); + break; + } + } + if (debug) + log(-1, "\n"); + free (buf, M_TEMP); + return; +} + +/* + * Analyze the IPCP Configure-NAK option list, and adjust our + * negotiation. + */ +static void +sppp_ipcp_RCN_nak(struct sppp *sp, struct lcp_header *h, int len) +{ + u_char *buf, *p; + struct ifnet *ifp = SP2IFP(sp); + int debug = ifp->if_flags & IFF_DEBUG; + int desiredcomp; + u_long wantaddr; + + len -= 4; + buf = malloc (len, M_TEMP, M_NOWAIT); + if (!buf) + return; + + if (debug) + log(LOG_DEBUG, SPP_FMT "ipcp nak opts: ", + SPP_ARGS(ifp)); + + p = (void*) (h+1); + for (; len >= 2 && p[1] >= 2 && len >= p[1]; + len -= p[1], p += p[1]) { + if (debug) + log(-1, " %s ", sppp_ipcp_opt_name(*p)); + switch (*p) { + case IPCP_OPT_COMPRESSION: + if (len >= 6 && p[1] == 6) { + desiredcomp = p[2] << 8 | p[3]; + if (debug) + log(-1, "[wantcomp %#04x] ", + desiredcomp); + if (desiredcomp == IPCP_COMP_VJ) { + sl_compress_init(sp->pp_comp, p[4]); + sp->ipcp.max_state = p[4]; + sp->ipcp.compress_cid = p[5]; + if (debug) + log(-1, "[agree] "); + } else + sp->ipcp.opts &= + ~(1 << IPCP_OPT_COMPRESSION); + } + break; + case IPCP_OPT_ADDRESS: + /* + * Peer doesn't like our local IP address. See + * if we can do something for him. We'll drop + * him our address then. + */ + if (len >= 6 && p[1] == 6) { + wantaddr = p[2] << 24 | p[3] << 16 | + p[4] << 8 | p[5]; + sp->ipcp.opts |= (1 << IPCP_OPT_ADDRESS); + if (debug) + log(-1, "[wantaddr %s] ", + sppp_dotted_quad(wantaddr)); + /* + * When doing dynamic address assignment, + * we accept his offer. Otherwise, we + * ignore it and thus continue to negotiate + * our already existing value. + * XXX: Bogus, if he said no once, he'll + * just say no again, might as well die. 
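+ *
+ * Concretely: if we offered 0.0.0.0 (IPCP_MYADDR_DYN) and the
+ * peer conf-naks it with, say, 192.0.2.2, the option payload is
+ * 03 06 c0 00 02 02, wantaddr comes out as 0xc0000202, and the
+ * code below installs it via sppp_set_ip_addr().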
+ */ + if (sp->ipcp.flags & IPCP_MYADDR_DYN) { + sppp_set_ip_addr(sp, wantaddr); + if (debug) + log(-1, "[agree] "); + sp->ipcp.flags |= IPCP_MYADDR_SEEN; + } + } + break; + } + } + if (debug) + log(-1, "\n"); + free (buf, M_TEMP); + return; +} + +static void +sppp_ipcp_tlu(struct sppp *sp) +{ + /* we are up - notify isdn daemon */ + if (sp->pp_con) + sp->pp_con(sp); +} + +static void +sppp_ipcp_tld(struct sppp *sp) +{ +} + +static void +sppp_ipcp_tls(struct sppp *sp) +{ + /* indicate to LCP that it must stay alive */ + sp->lcp.protos |= (1 << IDX_IPCP); +} + +static void +sppp_ipcp_tlf(struct sppp *sp) +{ + /* we no longer need LCP */ + sp->lcp.protos &= ~(1 << IDX_IPCP); + sppp_lcp_check_and_close(sp); +} + +static void +sppp_ipcp_scr(struct sppp *sp) +{ + char opt[6 /* compression */ + 6 /* address */]; + u_long ouraddr; + int i = 0; + + if (sp->ipcp.opts & (1 << IPCP_OPT_COMPRESSION)) { + opt[i++] = IPCP_OPT_COMPRESSION; + opt[i++] = 6; + opt[i++] = IPCP_COMP_VJ >> 8; + opt[i++] = IPCP_COMP_VJ; + opt[i++] = sp->ipcp.max_state; + opt[i++] = sp->ipcp.compress_cid; + } + if (sp->ipcp.opts & (1 << IPCP_OPT_ADDRESS)) { + sppp_get_ip_addrs(sp, &ouraddr, 0, 0); + opt[i++] = IPCP_OPT_ADDRESS; + opt[i++] = 6; + opt[i++] = ouraddr >> 24; + opt[i++] = ouraddr >> 16; + opt[i++] = ouraddr >> 8; + opt[i++] = ouraddr; + } + + sp->confid[IDX_IPCP] = ++sp->pp_seq[IDX_IPCP]; + sppp_cp_send(sp, PPP_IPCP, CONF_REQ, sp->confid[IDX_IPCP], i, &opt); +} +#else /* !INET */ +static void +sppp_ipcp_init(struct sppp *sp) +{ +} + +static void +sppp_ipcp_up(struct sppp *sp) +{ +} + +static void +sppp_ipcp_down(struct sppp *sp) +{ +} + +static void +sppp_ipcp_open(struct sppp *sp) +{ +} + +static void +sppp_ipcp_close(struct sppp *sp) +{ +} + +static void +sppp_ipcp_TO(void *cookie) +{ +} + +static int +sppp_ipcp_RCR(struct sppp *sp, struct lcp_header *h, int len) +{ + return (0); +} + +static void +sppp_ipcp_RCN_rej(struct sppp *sp, struct lcp_header *h, int len) +{ +} + +static void +sppp_ipcp_RCN_nak(struct sppp *sp, struct lcp_header *h, int len) +{ +} + +static void +sppp_ipcp_tlu(struct sppp *sp) +{ +} + +static void +sppp_ipcp_tld(struct sppp *sp) +{ +} + +static void +sppp_ipcp_tls(struct sppp *sp) +{ +} + +static void +sppp_ipcp_tlf(struct sppp *sp) +{ +} + +static void +sppp_ipcp_scr(struct sppp *sp) +{ +} +#endif + +/* + *--------------------------------------------------------------------------* + * * + * The IPv6CP implementation. * + * * + *--------------------------------------------------------------------------* + */ + +#ifdef INET6 +static void +sppp_ipv6cp_init(struct sppp *sp) +{ + sp->ipv6cp.opts = 0; + sp->ipv6cp.flags = 0; + sp->state[IDX_IPV6CP] = STATE_INITIAL; + sp->fail_counter[IDX_IPV6CP] = 0; + sp->pp_seq[IDX_IPV6CP] = 0; + sp->pp_rseq[IDX_IPV6CP] = 0; + callout_init(&sp->ch[IDX_IPV6CP], CALLOUT_MPSAFE); +} + +static void +sppp_ipv6cp_up(struct sppp *sp) +{ + sppp_up_event(&ipv6cp, sp); +} + +static void +sppp_ipv6cp_down(struct sppp *sp) +{ + sppp_down_event(&ipv6cp, sp); +} + +static void +sppp_ipv6cp_open(struct sppp *sp) +{ + STDDCL; + struct in6_addr myaddr, hisaddr; + +#ifdef IPV6CP_MYIFID_DYN + sp->ipv6cp.flags &= ~(IPV6CP_MYIFID_SEEN|IPV6CP_MYIFID_DYN); +#else + sp->ipv6cp.flags &= ~IPV6CP_MYIFID_SEEN; +#endif + + sppp_get_ip6_addrs(sp, &myaddr, &hisaddr, 0); + /* + * If we don't have our address, this probably means our + * interface doesn't want to talk IPv6 at all. (This could + * be the case if somebody wants to speak only IPX, for + * example.) 
Don't open IPv6CP in this case. + */ + if (IN6_IS_ADDR_UNSPECIFIED(&myaddr)) { + /* XXX this message should go away */ + if (debug) + log(LOG_DEBUG, SPP_FMT "ipv6cp_open(): no IPv6 interface\n", + SPP_ARGS(ifp)); + return; + } + + sp->ipv6cp.flags |= IPV6CP_MYIFID_SEEN; + sp->ipv6cp.opts |= (1 << IPV6CP_OPT_IFID); + sppp_open_event(&ipv6cp, sp); +} + +static void +sppp_ipv6cp_close(struct sppp *sp) +{ + sppp_close_event(&ipv6cp, sp); +} + +static void +sppp_ipv6cp_TO(void *cookie) +{ + sppp_to_event(&ipv6cp, (struct sppp *)cookie); +} + +/* + * Analyze a configure request. Return true if it was agreeable, and + * caused action sca, false if it has been rejected or nak'ed, and + * caused action scn. (The return value is used to make the state + * transition decision in the state automaton.) + */ +static int +sppp_ipv6cp_RCR(struct sppp *sp, struct lcp_header *h, int len) +{ + u_char *buf, *r, *p; + struct ifnet *ifp = SP2IFP(sp); + int rlen, origlen, debug = ifp->if_flags & IFF_DEBUG; + struct in6_addr myaddr, desiredaddr, suggestaddr; + int ifidcount; + int type; + int collision, nohisaddr; + char ip6buf[INET6_ADDRSTRLEN]; + + len -= 4; + origlen = len; + /* + * Make sure to allocate a buf that can at least hold a + * conf-nak with an `address' option. We might need it below. + */ + buf = r = malloc ((len < 6? 6: len), M_TEMP, M_NOWAIT); + if (! buf) + return (0); + + /* pass 1: see if we can recognize them */ + if (debug) + log(LOG_DEBUG, SPP_FMT "ipv6cp parse opts:", + SPP_ARGS(ifp)); + p = (void*) (h+1); + ifidcount = 0; + for (rlen=0; len >= 2 && p[1] >= 2 && len >= p[1]; + len-=p[1], p+=p[1]) { + if (debug) + log(-1, " %s", sppp_ipv6cp_opt_name(*p)); + switch (*p) { + case IPV6CP_OPT_IFID: + if (len >= 10 && p[1] == 10 && ifidcount == 0) { + /* correctly formed address option */ + ifidcount++; + continue; + } + if (debug) + log(-1, " [invalid]"); + break; +#ifdef notyet + case IPV6CP_OPT_COMPRESSION: + if (len >= 4 && p[1] >= 4) { + /* correctly formed compress option */ + continue; + } + if (debug) + log(-1, " [invalid]"); + break; +#endif + default: + /* Others not supported. */ + if (debug) + log(-1, " [rej]"); + break; + } + /* Add the option to rejected list. 
*/ + bcopy (p, r, p[1]); + r += p[1]; + rlen += p[1]; + } + if (rlen) { + if (debug) + log(-1, " send conf-rej\n"); + sppp_cp_send (sp, PPP_IPV6CP, CONF_REJ, h->ident, rlen, buf); + goto end; + } else if (debug) + log(-1, "\n"); + + /* pass 2: parse option values */ + sppp_get_ip6_addrs(sp, &myaddr, 0, 0); + if (debug) + log(LOG_DEBUG, SPP_FMT "ipv6cp parse opt values: ", + SPP_ARGS(ifp)); + p = (void*) (h+1); + len = origlen; + type = CONF_ACK; + for (rlen=0; len >= 2 && p[1] >= 2 && len >= p[1]; + len-=p[1], p+=p[1]) { + if (debug) + log(-1, " %s", sppp_ipv6cp_opt_name(*p)); + switch (*p) { +#ifdef notyet + case IPV6CP_OPT_COMPRESSION: + continue; +#endif + case IPV6CP_OPT_IFID: + bzero(&desiredaddr, sizeof(desiredaddr)); + bcopy(&p[2], &desiredaddr.s6_addr[8], 8); + collision = (bcmp(&desiredaddr.s6_addr[8], + &myaddr.s6_addr[8], 8) == 0); + nohisaddr = IN6_IS_ADDR_UNSPECIFIED(&desiredaddr); + + desiredaddr.s6_addr16[0] = htons(0xfe80); + (void)in6_setscope(&desiredaddr, SP2IFP(sp), NULL); + + if (!collision && !nohisaddr) { + /* no collision, hisaddr known - Conf-Ack */ + type = CONF_ACK; + + if (debug) { + log(-1, " %s [%s]", + ip6_sprintf(ip6buf, &desiredaddr), + sppp_cp_type_name(type)); + } + continue; + } + + bzero(&suggestaddr, sizeof(suggestaddr)); + if (collision && nohisaddr) { + /* collision, hisaddr unknown - Conf-Rej */ + type = CONF_REJ; + bzero(&p[2], 8); + } else { + /* + * - no collision, hisaddr unknown, or + * - collision, hisaddr known + * Conf-Nak, suggest hisaddr + */ + type = CONF_NAK; + sppp_suggest_ip6_addr(sp, &suggestaddr); + bcopy(&suggestaddr.s6_addr[8], &p[2], 8); + } + if (debug) + log(-1, " %s [%s]", + ip6_sprintf(ip6buf, &desiredaddr), + sppp_cp_type_name(type)); + break; + } + /* Add the option to nak'ed list. */ + bcopy (p, r, p[1]); + r += p[1]; + rlen += p[1]; + } + + if (rlen == 0 && type == CONF_ACK) { + if (debug) + log(-1, " send %s\n", sppp_cp_type_name(type)); + sppp_cp_send (sp, PPP_IPV6CP, type, h->ident, origlen, h+1); + } else { +#ifdef DIAGNOSTIC + if (type == CONF_ACK) + panic("IPv6CP RCR: CONF_ACK with non-zero rlen"); +#endif + + if (debug) { + log(-1, " send %s suggest %s\n", + sppp_cp_type_name(type), + ip6_sprintf(ip6buf, &suggestaddr)); + } + sppp_cp_send (sp, PPP_IPV6CP, type, h->ident, rlen, buf); + } + + end: + free (buf, M_TEMP); + return (rlen == 0); +} + +/* + * Analyze the IPv6CP Configure-Reject option list, and adjust our + * negotiation. + */ +static void +sppp_ipv6cp_RCN_rej(struct sppp *sp, struct lcp_header *h, int len) +{ + u_char *buf, *p; + struct ifnet *ifp = SP2IFP(sp); + int debug = ifp->if_flags & IFF_DEBUG; + + len -= 4; + buf = malloc (len, M_TEMP, M_NOWAIT); + if (!buf) + return; + + if (debug) + log(LOG_DEBUG, SPP_FMT "ipv6cp rej opts:", + SPP_ARGS(ifp)); + + p = (void*) (h+1); + for (; len >= 2 && p[1] >= 2 && len >= p[1]; + len -= p[1], p += p[1]) { + if (debug) + log(-1, " %s", sppp_ipv6cp_opt_name(*p)); + switch (*p) { + case IPV6CP_OPT_IFID: + /* + * Peer doesn't grok address option. This is + * bad. XXX Should we better give up here? + */ + sp->ipv6cp.opts &= ~(1 << IPV6CP_OPT_IFID); + break; +#ifdef notyet + case IPV6CP_OPT_COMPRESS: + sp->ipv6cp.opts &= ~(1 << IPV6CP_OPT_COMPRESS); + break; +#endif + } + } + if (debug) + log(-1, "\n"); + free (buf, M_TEMP); + return; +} + +/* + * Analyze the IPv6CP Configure-NAK option list, and adjust our + * negotiation.
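+ *
+ * An IPv6CP interface-identifier option is always 10 octets: type
+ * 1, length 10, then the 64 bits that form the lower half of the
+ * fe80::/64 link-local address -- which is why the code insists on
+ * p[1] == 10 and copies p[2..9] into s6_addr[8..15].  As an
+ * illustrative example, the ifid 02:11:22:ff:fe:33:44:55 travels
+ * as the octets 01 0a 02 11 22 ff fe 33 44 55.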
+ */ +static void +sppp_ipv6cp_RCN_nak(struct sppp *sp, struct lcp_header *h, int len) +{ + u_char *buf, *p; + struct ifnet *ifp = SP2IFP(sp); + int debug = ifp->if_flags & IFF_DEBUG; + struct in6_addr suggestaddr; + char ip6buf[INET6_ADDRSTRLEN]; + + len -= 4; + buf = malloc (len, M_TEMP, M_NOWAIT); + if (!buf) + return; + + if (debug) + log(LOG_DEBUG, SPP_FMT "ipv6cp nak opts:", + SPP_ARGS(ifp)); + + p = (void*) (h+1); + for (; len >= 2 && p[1] >= 2 && len >= p[1]; + len -= p[1], p += p[1]) { + if (debug) + log(-1, " %s", sppp_ipv6cp_opt_name(*p)); + switch (*p) { + case IPV6CP_OPT_IFID: + /* + * Peer doesn't like our local ifid. See + * if we can do something for him. We'll drop + * him our address then. + */ + if (len < 10 || p[1] != 10) + break; + bzero(&suggestaddr, sizeof(suggestaddr)); + suggestaddr.s6_addr16[0] = htons(0xfe80); + (void)in6_setscope(&suggestaddr, SP2IFP(sp), NULL); + bcopy(&p[2], &suggestaddr.s6_addr[8], 8); + + sp->ipv6cp.opts |= (1 << IPV6CP_OPT_IFID); + if (debug) + log(-1, " [suggestaddr %s]", + ip6_sprintf(ip6buf, &suggestaddr)); +#ifdef IPV6CP_MYIFID_DYN + /* + * When doing dynamic address assignment, + * we accept his offer. + */ + if (sp->ipv6cp.flags & IPV6CP_MYIFID_DYN) { + struct in6_addr lastsuggest; + /* + * If the suggested myaddr equals the + * hisaddr we would suggest to the peer, + * we have a collision; generate a new + * random ifid. + */ + sppp_suggest_ip6_addr(sp, &lastsuggest); + if (IN6_ARE_ADDR_EQUAL(&suggestaddr, + &lastsuggest)) { + if (debug) + log(-1, " [random]"); + sppp_gen_ip6_addr(sp, &suggestaddr); + } + sppp_set_ip6_addr(sp, &suggestaddr, 0); + if (debug) + log(-1, " [agree]"); + sp->ipv6cp.flags |= IPV6CP_MYIFID_SEEN; + } +#else + /* + * Since we do not do dynamic address assignment, + * we ignore it and thus continue to negotiate + * our already existing value. This can possibly + * go into an infinite request-reject loop. + * + * This is not likely because we normally use + * an ifid based on the MAC address. + * If you have no ethernet card on the node, too bad. + * XXX should we use fail_counter? + */ +#endif + break; +#ifdef notyet + case IPV6CP_OPT_COMPRESS: + /* + * Peer wants different compression parameters.
+ */ + break; +#endif + } + } + if (debug) + log(-1, "\n"); + free (buf, M_TEMP); + return; +} +static void +sppp_ipv6cp_tlu(struct sppp *sp) +{ + /* we are up - notify isdn daemon */ + if (sp->pp_con) + sp->pp_con(sp); +} + +static void +sppp_ipv6cp_tld(struct sppp *sp) +{ +} + +static void +sppp_ipv6cp_tls(struct sppp *sp) +{ + /* indicate to LCP that it must stay alive */ + sp->lcp.protos |= (1 << IDX_IPV6CP); +} + +static void +sppp_ipv6cp_tlf(struct sppp *sp) +{ + +#if 0 /* need #if 0 to close IPv6CP properly */ + /* we no longer need LCP */ + sp->lcp.protos &= ~(1 << IDX_IPV6CP); + sppp_lcp_check_and_close(sp); +#endif +} + +static void +sppp_ipv6cp_scr(struct sppp *sp) +{ + char opt[10 /* ifid */ + 4 /* compression, minimum */]; + struct in6_addr ouraddr; + int i = 0; + + if (sp->ipv6cp.opts & (1 << IPV6CP_OPT_IFID)) { + sppp_get_ip6_addrs(sp, &ouraddr, 0, 0); + opt[i++] = IPV6CP_OPT_IFID; + opt[i++] = 10; + bcopy(&ouraddr.s6_addr[8], &opt[i], 8); + i += 8; + } + +#ifdef notyet + if (sp->ipv6cp.opts & (1 << IPV6CP_OPT_COMPRESSION)) { + opt[i++] = IPV6CP_OPT_COMPRESSION; + opt[i++] = 4; + opt[i++] = 0; /* TBD */ + opt[i++] = 0; /* TBD */ + /* variable length data may follow */ + } +#endif + + sp->confid[IDX_IPV6CP] = ++sp->pp_seq[IDX_IPV6CP]; + sppp_cp_send(sp, PPP_IPV6CP, CONF_REQ, sp->confid[IDX_IPV6CP], i, &opt); +} +#else /*INET6*/ +static void sppp_ipv6cp_init(struct sppp *sp) +{ +} + +static void sppp_ipv6cp_up(struct sppp *sp) +{ +} + +static void sppp_ipv6cp_down(struct sppp *sp) +{ +} + + +static void sppp_ipv6cp_open(struct sppp *sp) +{ +} + +static void sppp_ipv6cp_close(struct sppp *sp) +{ +} + +static void sppp_ipv6cp_TO(void *sp) +{ +} + +static int sppp_ipv6cp_RCR(struct sppp *sp, struct lcp_header *h, int len) +{ + return 0; +} + +static void sppp_ipv6cp_RCN_rej(struct sppp *sp, struct lcp_header *h, int len) +{ +} + +static void sppp_ipv6cp_RCN_nak(struct sppp *sp, struct lcp_header *h, int len) +{ +} + +static void sppp_ipv6cp_tlu(struct sppp *sp) +{ +} + +static void sppp_ipv6cp_tld(struct sppp *sp) +{ +} + +static void sppp_ipv6cp_tls(struct sppp *sp) +{ +} + +static void sppp_ipv6cp_tlf(struct sppp *sp) +{ +} + +static void sppp_ipv6cp_scr(struct sppp *sp) +{ +} +#endif /*INET6*/ + +/* + *--------------------------------------------------------------------------* + * * + * The CHAP implementation. * + * * + *--------------------------------------------------------------------------* + */ + +/* + * The authentication protocols don't employ a full-fledged state machine as + * the control protocols do, since they do have Open and Close events, but + * not Up and Down, nor are they explicitly terminated. Also, use of the + * authentication protocols may be different in both directions (this makes + * sense, think of a machine that never accepts incoming calls but only + * calls out, it doesn't require the called party to authenticate itself). 
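+ *
+ * The two directions are configured independently: sp->hisauth
+ * names the protocol and secret the peer must present to us (we
+ * act as authenticator), while sp->myauth holds what we present
+ * when the peer demands authentication of us.  For example,
+ *
+ *	sp->hisauth.proto = PPP_CHAP;	-- we challenge the peer
+ *	sp->myauth.proto = 0;		-- we never authenticate ourselves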
+ * + * Our state machine for the local authentication protocol (we are requesting + * the peer to authenticate) looks like: + * + * RCA- + * +--------------------------------------------+ + * V scn,tld| + * +--------+ Close +---------+ RCA+ + * | |<----------------------------------| |------+ + * +--->| Closed | TO* | Opened | sca | + * | | |-----+ +-------| |<-----+ + * | +--------+ irc | | +---------+ + * | ^ | | ^ + * | | | | | + * | | | | | + * | TO-| | | | + * | |tld TO+ V | | + * | | +------->+ | | + * | | | | | | + * | +--------+ V | | + * | | |<----+<--------------------+ | + * | | Req- | scr | + * | | Sent | | + * | | | | + * | +--------+ | + * | RCA- | | RCA+ | + * +------+ +------------------------------------------+ + * scn,tld sca,irc,ict,tlu + * + * + * with: + * + * Open: LCP reached authentication phase + * Close: LCP reached terminate phase + * + * RCA+: received reply (pap-req, chap-response), acceptable + * RCN: received reply (pap-req, chap-response), not acceptable + * TO+: timeout with restart counter >= 0 + * TO-: timeout with restart counter < 0 + * TO*: reschedule timeout for CHAP + * + * scr: send request packet (none for PAP, chap-challenge) + * sca: send ack packet (pap-ack, chap-success) + * scn: send nak packet (pap-nak, chap-failure) + * ict: initialize re-challenge timer (CHAP only) + * + * tlu: this-layer-up, LCP reaches network phase + * tld: this-layer-down, LCP enters terminate phase + * + * Note that in CHAP mode, after sending a new challenge, while the state + * automaton falls back into Req-Sent state, it doesn't signal a tld + * event to LCP, so LCP remains in network phase. Only after not getting + * any response (or after getting an unacceptable response), CHAP closes, + * causing LCP to enter terminate phase. + * + * With PAP, there is no initial request that can be sent. The peer is + * expected to send one based on the successful negotiation of PAP as + * the authentication protocol during the LCP option negotiation. + * + * Incoming authentication protocol requests (remote requests + * authentication, we are peer) don't employ a state machine at all, + * they are simply answered. Some peers [Ascend P50 firmware rev + * 4.50] react allergically when sending IPCP requests while they are + * still in authentication phase (thereby violating the standard that + * demands that these NCP packets are to be discarded), so we keep + * track of the peer demanding us to authenticate, and only proceed to + * phase network once we've seen a positive acknowledge for the + * authentication. + */ + +/* + * Handle incoming CHAP packets. 
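+ *
+ * For reference, the response value computed for an incoming
+ * CHAP_CHALLENGE below is the one from RFC 1994: the MD5 digest
+ * of the one-octet identifier, the shared secret and the
+ * challenge value, in that order.  A minimal sketch, with ident,
+ * secret/secret_len and challenge/challenge_len standing for the
+ * obvious inputs:
+ *
+ *	MD5_CTX ctx;
+ *	u_char digest[16];
+ *
+ *	MD5Init(&ctx);
+ *	MD5Update(&ctx, &ident, 1);
+ *	MD5Update(&ctx, secret, secret_len);
+ *	MD5Update(&ctx, challenge, challenge_len);
+ *	MD5Final(digest, &ctx);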
+ */ +static void +sppp_chap_input(struct sppp *sp, struct mbuf *m) +{ + STDDCL; + struct lcp_header *h; + int len, x; + u_char *value, *name, digest[AUTHKEYLEN], dsize; + int value_len, name_len; + MD5_CTX ctx; + + len = m->m_pkthdr.len; + if (len < 4) { + if (debug) + log(LOG_DEBUG, + SPP_FMT "chap invalid packet length: %d bytes\n", + SPP_ARGS(ifp), len); + return; + } + h = mtod (m, struct lcp_header*); + if (len > ntohs (h->len)) + len = ntohs (h->len); + + switch (h->type) { + /* challenge, failure and success are his authproto */ + case CHAP_CHALLENGE: + value = 1 + (u_char*)(h+1); + value_len = value[-1]; + name = value + value_len; + name_len = len - value_len - 5; + if (name_len < 0) { + if (debug) { + log(LOG_DEBUG, + SPP_FMT "chap corrupted challenge " + "<%s id=0x%x len=%d", + SPP_ARGS(ifp), + sppp_auth_type_name(PPP_CHAP, h->type), + h->ident, ntohs(h->len)); + sppp_print_bytes((u_char*) (h+1), len-4); + log(-1, ">\n"); + } + break; + } + + if (debug) { + log(LOG_DEBUG, + SPP_FMT "chap input <%s id=0x%x len=%d name=", + SPP_ARGS(ifp), + sppp_auth_type_name(PPP_CHAP, h->type), h->ident, + ntohs(h->len)); + sppp_print_string((char*) name, name_len); + log(-1, " value-size=%d value=", value_len); + sppp_print_bytes(value, value_len); + log(-1, ">\n"); + } + + /* Compute reply value. */ + MD5Init(&ctx); + MD5Update(&ctx, &h->ident, 1); + MD5Update(&ctx, sp->myauth.secret, + sppp_strnlen(sp->myauth.secret, AUTHKEYLEN)); + MD5Update(&ctx, value, value_len); + MD5Final(digest, &ctx); + dsize = sizeof digest; + + sppp_auth_send(&chap, sp, CHAP_RESPONSE, h->ident, + sizeof dsize, (const char *)&dsize, + sizeof digest, digest, + (size_t)sppp_strnlen(sp->myauth.name, AUTHNAMELEN), + sp->myauth.name, + 0); + break; + + case CHAP_SUCCESS: + if (debug) { + log(LOG_DEBUG, SPP_FMT "chap success", + SPP_ARGS(ifp)); + if (len > 4) { + log(-1, ": "); + sppp_print_string((char*)(h + 1), len - 4); + } + log(-1, "\n"); + } + x = splimp(); + SPPP_LOCK(sp); + sp->pp_flags &= ~PP_NEEDAUTH; + if (sp->myauth.proto == PPP_CHAP && + (sp->lcp.opts & (1 << LCP_OPT_AUTH_PROTO)) && + (sp->lcp.protos & (1 << IDX_CHAP)) == 0) { + /* + * We are authenticator for CHAP but didn't + * complete yet. Leave it to tlu to proceed + * to network phase. 
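+ * (sppp_chap_tlu() handles that below: once our own challenge has
+ * been answered, it proceeds to sppp_phase_network() unless
+ * PP_NEEDAUTH is still set.)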
+ */ + SPPP_UNLOCK(sp); + splx(x); + break; + } + SPPP_UNLOCK(sp); + splx(x); + sppp_phase_network(sp); + break; + + case CHAP_FAILURE: + if (debug) { + log(LOG_INFO, SPP_FMT "chap failure", + SPP_ARGS(ifp)); + if (len > 4) { + log(-1, ": "); + sppp_print_string((char*)(h + 1), len - 4); + } + log(-1, "\n"); + } else + log(LOG_INFO, SPP_FMT "chap failure\n", + SPP_ARGS(ifp)); + /* await LCP shutdown by authenticator */ + break; + + /* response is my authproto */ + case CHAP_RESPONSE: + value = 1 + (u_char*)(h+1); + value_len = value[-1]; + name = value + value_len; + name_len = len - value_len - 5; + if (name_len < 0) { + if (debug) { + log(LOG_DEBUG, + SPP_FMT "chap corrupted response " + "<%s id=0x%x len=%d", + SPP_ARGS(ifp), + sppp_auth_type_name(PPP_CHAP, h->type), + h->ident, ntohs(h->len)); + sppp_print_bytes((u_char*)(h+1), len-4); + log(-1, ">\n"); + } + break; + } + if (h->ident != sp->confid[IDX_CHAP]) { + if (debug) + log(LOG_DEBUG, + SPP_FMT "chap dropping response for old ID " + "(got %d, expected %d)\n", + SPP_ARGS(ifp), + h->ident, sp->confid[IDX_CHAP]); + break; + } + if (name_len != sppp_strnlen(sp->hisauth.name, AUTHNAMELEN) + || bcmp(name, sp->hisauth.name, name_len) != 0) { + log(LOG_INFO, SPP_FMT "chap response, his name ", + SPP_ARGS(ifp)); + sppp_print_string(name, name_len); + log(-1, " != expected "); + sppp_print_string(sp->hisauth.name, + sppp_strnlen(sp->hisauth.name, AUTHNAMELEN)); + log(-1, "\n"); + } + if (debug) { + log(LOG_DEBUG, SPP_FMT "chap input(%s) " + "<%s id=0x%x len=%d name=", + SPP_ARGS(ifp), + sppp_state_name(sp->state[IDX_CHAP]), + sppp_auth_type_name(PPP_CHAP, h->type), + h->ident, ntohs (h->len)); + sppp_print_string((char*)name, name_len); + log(-1, " value-size=%d value=", value_len); + sppp_print_bytes(value, value_len); + log(-1, ">\n"); + } + if (value_len != AUTHKEYLEN) { + if (debug) + log(LOG_DEBUG, + SPP_FMT "chap bad hash value length: " + "%d bytes, should be %d\n", + SPP_ARGS(ifp), value_len, + AUTHKEYLEN); + break; + } + + MD5Init(&ctx); + MD5Update(&ctx, &h->ident, 1); + MD5Update(&ctx, sp->hisauth.secret, + sppp_strnlen(sp->hisauth.secret, AUTHKEYLEN)); + MD5Update(&ctx, sp->myauth.challenge, AUTHKEYLEN); + MD5Final(digest, &ctx); + +#define FAILMSG "Failed..." +#define SUCCMSG "Welcome!" + + if (value_len != sizeof digest || + bcmp(digest, value, value_len) != 0) { + /* action scn, tld */ + sppp_auth_send(&chap, sp, CHAP_FAILURE, h->ident, + sizeof(FAILMSG) - 1, (u_char *)FAILMSG, + 0); + chap.tld(sp); + break; + } + /* action sca, perhaps tlu */ + if (sp->state[IDX_CHAP] == STATE_REQ_SENT || + sp->state[IDX_CHAP] == STATE_OPENED) + sppp_auth_send(&chap, sp, CHAP_SUCCESS, h->ident, + sizeof(SUCCMSG) - 1, (u_char *)SUCCMSG, + 0); + if (sp->state[IDX_CHAP] == STATE_REQ_SENT) { + sppp_cp_change_state(&chap, sp, STATE_OPENED); + chap.tlu(sp); + } + break; + + default: + /* Unknown CHAP packet type -- ignore. */ + if (debug) { + log(LOG_DEBUG, SPP_FMT "chap unknown input(%s) " + "<0x%x id=0x%xh len=%d", + SPP_ARGS(ifp), + sppp_state_name(sp->state[IDX_CHAP]), + h->type, h->ident, ntohs(h->len)); + sppp_print_bytes((u_char*)(h+1), len-4); + log(-1, ">\n"); + } + break; + + } +} + +static void +sppp_chap_init(struct sppp *sp) +{ + /* Chap doesn't have STATE_INITIAL at all. 
*/
+ sp->state[IDX_CHAP] = STATE_CLOSED;
+ sp->fail_counter[IDX_CHAP] = 0;
+ sp->pp_seq[IDX_CHAP] = 0;
+ sp->pp_rseq[IDX_CHAP] = 0;
+ callout_init(&sp->ch[IDX_CHAP], CALLOUT_MPSAFE);
+}
+
+static void
+sppp_chap_open(struct sppp *sp)
+{
+ if (sp->myauth.proto == PPP_CHAP &&
+ (sp->lcp.opts & (1 << LCP_OPT_AUTH_PROTO)) != 0) {
+ /* we are authenticator for CHAP, start it */
+ chap.scr(sp);
+ sp->rst_counter[IDX_CHAP] = sp->lcp.max_configure;
+ sppp_cp_change_state(&chap, sp, STATE_REQ_SENT);
+ }
+ /* nothing to be done if we are peer, await a challenge */
+}
+
+static void
+sppp_chap_close(struct sppp *sp)
+{
+ if (sp->state[IDX_CHAP] != STATE_CLOSED)
+ sppp_cp_change_state(&chap, sp, STATE_CLOSED);
+}
+
+static void
+sppp_chap_TO(void *cookie)
+{
+ struct sppp *sp = (struct sppp *)cookie;
+ STDDCL;
+ int s;
+
+ s = splimp();
+ SPPP_LOCK(sp);
+ if (debug)
+ log(LOG_DEBUG, SPP_FMT "chap TO(%s) rst_counter = %d\n",
+ SPP_ARGS(ifp),
+ sppp_state_name(sp->state[IDX_CHAP]),
+ sp->rst_counter[IDX_CHAP]);
+
+ if (--sp->rst_counter[IDX_CHAP] < 0)
+ /* TO- event */
+ switch (sp->state[IDX_CHAP]) {
+ case STATE_REQ_SENT:
+ chap.tld(sp);
+ sppp_cp_change_state(&chap, sp, STATE_CLOSED);
+ break;
+ }
+ else
+ /* TO+ (or TO*) event */
+ switch (sp->state[IDX_CHAP]) {
+ case STATE_OPENED:
+ /* TO* event */
+ sp->rst_counter[IDX_CHAP] = sp->lcp.max_configure;
+ /* FALLTHROUGH */
+ case STATE_REQ_SENT:
+ chap.scr(sp);
+ /* sppp_cp_change_state() will restart the timer */
+ sppp_cp_change_state(&chap, sp, STATE_REQ_SENT);
+ break;
+ }
+
+ SPPP_UNLOCK(sp);
+ splx(s);
+}
+
+static void
+sppp_chap_tlu(struct sppp *sp)
+{
+ STDDCL;
+ int i, x;
+
+ i = 0;
+ sp->rst_counter[IDX_CHAP] = sp->lcp.max_configure;
+
+ /*
+ * Some broken CHAP implementations (Conware CoNet, firmware
+ * 4.0.?) don't want to re-authenticate their CHAP once the
+ * initial challenge-response exchange has taken place.
+ * Provide for an option to avoid rechallenges.
+ */
+ if ((sp->hisauth.flags & AUTHFLAG_NORECHALLENGE) == 0) {
+ /*
+ * Compute the re-challenge timeout. This will yield
+ * a number between 300 and 810 seconds.
+ */
+ i = 300 + ((unsigned)(random() & 0xff00) >> 7);
+ callout_reset(&sp->ch[IDX_CHAP], i * hz, chap.TO, (void *)sp);
+ }
+
+ if (debug) {
+ log(LOG_DEBUG,
+ SPP_FMT "chap %s, ",
+ SPP_ARGS(ifp),
+ sp->pp_phase == PHASE_NETWORK? "reconfirmed": "tlu");
+ if ((sp->hisauth.flags & AUTHFLAG_NORECHALLENGE) == 0)
+ log(-1, "next re-challenge in %d seconds\n", i);
+ else
+ log(-1, "re-challenging suppressed\n");
+ }
+
+ x = splimp();
+ SPPP_LOCK(sp);
+ /* indicate to LCP that we need to be closed down */
+ sp->lcp.protos |= (1 << IDX_CHAP);
+
+ if (sp->pp_flags & PP_NEEDAUTH) {
+ /*
+ * Remote is authenticator, but his auth proto didn't
+ * complete yet. Defer the transition to network
+ * phase.
+ */
+ SPPP_UNLOCK(sp);
+ splx(x);
+ return;
+ }
+ SPPP_UNLOCK(sp);
+ splx(x);
+
+ /*
+ * If we are already in phase network, we are done here. This
+ * is the case if this is a dummy tlu event after a re-challenge.
+ */
+ if (sp->pp_phase != PHASE_NETWORK)
+ sppp_phase_network(sp);
+}
+
+static void
+sppp_chap_tld(struct sppp *sp)
+{
+ STDDCL;
+
+ if (debug)
+ log(LOG_DEBUG, SPP_FMT "chap tld\n", SPP_ARGS(ifp));
+ callout_stop(&sp->ch[IDX_CHAP]);
+ sp->lcp.protos &= ~(1 << IDX_CHAP);
+
+ lcp.Close(sp);
+}
+
+static void
+sppp_chap_scr(struct sppp *sp)
+{
+ u_long *ch, seed;
+ u_char clen;
+
+ /* Compute random challenge.
*/
+ ch = (u_long *)sp->myauth.challenge;
+ read_random(&seed, sizeof seed);
+ ch[0] = seed ^ random();
+ ch[1] = seed ^ random();
+ ch[2] = seed ^ random();
+ ch[3] = seed ^ random();
+ clen = AUTHKEYLEN;
+
+ sp->confid[IDX_CHAP] = ++sp->pp_seq[IDX_CHAP];
+
+ sppp_auth_send(&chap, sp, CHAP_CHALLENGE, sp->confid[IDX_CHAP],
+ sizeof clen, (const char *)&clen,
+ (size_t)AUTHKEYLEN, sp->myauth.challenge,
+ (size_t)sppp_strnlen(sp->myauth.name, AUTHNAMELEN),
+ sp->myauth.name,
+ 0);
+}
+
+/*
+ *--------------------------------------------------------------------------*
+ * *
+ * The PAP implementation. *
+ * *
+ *--------------------------------------------------------------------------*
+ */
+/*
+ * For PAP, we also need to keep a little state if we are the peer, not
+ * the authenticator. This is because we don't get a request to
+ * authenticate; instead, we have to repeatedly authenticate ourselves
+ * until we get a response (or the retry counter expires).
+ */
+
+/*
+ * Handle incoming PAP packets.
+ */
+static void
+sppp_pap_input(struct sppp *sp, struct mbuf *m)
+{
+ STDDCL;
+ struct lcp_header *h;
+ int len, x;
+ u_char *name, *passwd, mlen;
+ int name_len, passwd_len;
+
+ len = m->m_pkthdr.len;
+ if (len < 5) {
+ if (debug)
+ log(LOG_DEBUG,
+ SPP_FMT "pap invalid packet length: %d bytes\n",
+ SPP_ARGS(ifp), len);
+ return;
+ }
+ h = mtod (m, struct lcp_header*);
+ if (len > ntohs (h->len))
+ len = ntohs (h->len);
+ switch (h->type) {
+ /* PAP request is my authproto */
+ case PAP_REQ:
+ name = 1 + (u_char*)(h+1);
+ name_len = name[-1];
+ passwd = name + name_len + 1;
+ if (name_len > len - 6 ||
+ (passwd_len = passwd[-1]) > len - 6 - name_len) {
+ if (debug) {
+ log(LOG_DEBUG, SPP_FMT "pap corrupted input "
+ "<%s id=0x%x len=%d",
+ SPP_ARGS(ifp),
+ sppp_auth_type_name(PPP_PAP, h->type),
+ h->ident, ntohs(h->len));
+ sppp_print_bytes((u_char*)(h+1), len-4);
+ log(-1, ">\n");
+ }
+ break;
+ }
+ if (debug) {
+ log(LOG_DEBUG, SPP_FMT "pap input(%s) "
+ "<%s id=0x%x len=%d name=",
+ SPP_ARGS(ifp),
+ sppp_state_name(sp->state[IDX_PAP]),
+ sppp_auth_type_name(PPP_PAP, h->type),
+ h->ident, ntohs(h->len));
+ sppp_print_string((char*)name, name_len);
+ log(-1, " passwd=");
+ sppp_print_string((char*)passwd, passwd_len);
+ log(-1, ">\n");
+ }
+ if (name_len != sppp_strnlen(sp->hisauth.name, AUTHNAMELEN) ||
+ passwd_len != sppp_strnlen(sp->hisauth.secret, AUTHKEYLEN) ||
+ bcmp(name, sp->hisauth.name, name_len) != 0 ||
+ bcmp(passwd, sp->hisauth.secret, passwd_len) != 0) {
+ /* action scn, tld */
+ mlen = sizeof(FAILMSG) - 1;
+ sppp_auth_send(&pap, sp, PAP_NAK, h->ident,
+ sizeof mlen, (const char *)&mlen,
+ sizeof(FAILMSG) - 1, (u_char *)FAILMSG,
+ 0);
+ pap.tld(sp);
+ break;
+ }
+ /* action sca, perhaps tlu */
+ if (sp->state[IDX_PAP] == STATE_REQ_SENT ||
+ sp->state[IDX_PAP] == STATE_OPENED) {
+ mlen = sizeof(SUCCMSG) - 1;
+ sppp_auth_send(&pap, sp, PAP_ACK, h->ident,
+ sizeof mlen, (const char *)&mlen,
+ sizeof(SUCCMSG) - 1, (u_char *)SUCCMSG,
+ 0);
+ }
+ if (sp->state[IDX_PAP] == STATE_REQ_SENT) {
+ sppp_cp_change_state(&pap, sp, STATE_OPENED);
+ pap.tlu(sp);
+ }
+ break;
+
+ /* ack and nak are his authproto */
+ case PAP_ACK:
+ callout_stop(&sp->pap_my_to_ch);
+ if (debug) {
+ log(LOG_DEBUG, SPP_FMT "pap success",
+ SPP_ARGS(ifp));
+ name_len = *((char *)h);
+ if (len > 5 && name_len) {
+ log(-1, ": ");
+ sppp_print_string((char*)(h+1), name_len);
+ }
+ log(-1, "\n");
+ }
+ x = splimp();
+ SPPP_LOCK(sp);
+ sp->pp_flags &= ~PP_NEEDAUTH;
+ if (sp->myauth.proto == PPP_PAP &&
+ (sp->lcp.opts & (1 <<
LCP_OPT_AUTH_PROTO)) && + (sp->lcp.protos & (1 << IDX_PAP)) == 0) { + /* + * We are authenticator for PAP but didn't + * complete yet. Leave it to tlu to proceed + * to network phase. + */ + SPPP_UNLOCK(sp); + splx(x); + break; + } + SPPP_UNLOCK(sp); + splx(x); + sppp_phase_network(sp); + break; + + case PAP_NAK: + callout_stop (&sp->pap_my_to_ch); + if (debug) { + log(LOG_INFO, SPP_FMT "pap failure", + SPP_ARGS(ifp)); + name_len = *((char *)h); + if (len > 5 && name_len) { + log(-1, ": "); + sppp_print_string((char*)(h+1), name_len); + } + log(-1, "\n"); + } else + log(LOG_INFO, SPP_FMT "pap failure\n", + SPP_ARGS(ifp)); + /* await LCP shutdown by authenticator */ + break; + + default: + /* Unknown PAP packet type -- ignore. */ + if (debug) { + log(LOG_DEBUG, SPP_FMT "pap corrupted input " + "<0x%x id=0x%x len=%d", + SPP_ARGS(ifp), + h->type, h->ident, ntohs(h->len)); + sppp_print_bytes((u_char*)(h+1), len-4); + log(-1, ">\n"); + } + break; + + } +} + +static void +sppp_pap_init(struct sppp *sp) +{ + /* PAP doesn't have STATE_INITIAL at all. */ + sp->state[IDX_PAP] = STATE_CLOSED; + sp->fail_counter[IDX_PAP] = 0; + sp->pp_seq[IDX_PAP] = 0; + sp->pp_rseq[IDX_PAP] = 0; + callout_init(&sp->ch[IDX_PAP], CALLOUT_MPSAFE); + callout_init(&sp->pap_my_to_ch, CALLOUT_MPSAFE); +} + +static void +sppp_pap_open(struct sppp *sp) +{ + if (sp->hisauth.proto == PPP_PAP && + (sp->lcp.opts & (1 << LCP_OPT_AUTH_PROTO)) != 0) { + /* we are authenticator for PAP, start our timer */ + sp->rst_counter[IDX_PAP] = sp->lcp.max_configure; + sppp_cp_change_state(&pap, sp, STATE_REQ_SENT); + } + if (sp->myauth.proto == PPP_PAP) { + /* we are peer, send a request, and start a timer */ + pap.scr(sp); + callout_reset(&sp->pap_my_to_ch, sp->lcp.timeout, + sppp_pap_my_TO, (void *)sp); + } +} + +static void +sppp_pap_close(struct sppp *sp) +{ + if (sp->state[IDX_PAP] != STATE_CLOSED) + sppp_cp_change_state(&pap, sp, STATE_CLOSED); +} + +/* + * That's the timeout routine if we are authenticator. Since the + * authenticator is basically passive in PAP, we can't do much here. + */ +static void +sppp_pap_TO(void *cookie) +{ + struct sppp *sp = (struct sppp *)cookie; + STDDCL; + int s; + + s = splimp(); + SPPP_LOCK(sp); + if (debug) + log(LOG_DEBUG, SPP_FMT "pap TO(%s) rst_counter = %d\n", + SPP_ARGS(ifp), + sppp_state_name(sp->state[IDX_PAP]), + sp->rst_counter[IDX_PAP]); + + if (--sp->rst_counter[IDX_PAP] < 0) + /* TO- event */ + switch (sp->state[IDX_PAP]) { + case STATE_REQ_SENT: + pap.tld(sp); + sppp_cp_change_state(&pap, sp, STATE_CLOSED); + break; + } + else + /* TO+ event, not very much we could do */ + switch (sp->state[IDX_PAP]) { + case STATE_REQ_SENT: + /* sppp_cp_change_state() will restart the timer */ + sppp_cp_change_state(&pap, sp, STATE_REQ_SENT); + break; + } + + SPPP_UNLOCK(sp); + splx(s); +} + +/* + * That's the timeout handler if we are peer. Since the peer is active, + * we need to retransmit our PAP request since it is apparently lost. + * XXX We should impose a max counter. 
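+ *
+ * A minimal sketch of such a cap (hypothetical, not implemented
+ * here) could reuse the otherwise idle PAP fail counter:
+ *
+ * if (++sp->fail_counter[IDX_PAP] >= sp->lcp.max_configure)
+ * return;
+ * pap.scr(sp);
+ *
+ * i.e. give up after max_configure unanswered requests instead of
+ * re-sending unconditionally.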
+ */
+static void
+sppp_pap_my_TO(void *cookie)
+{
+ struct sppp *sp = (struct sppp *)cookie;
+ STDDCL;
+
+ if (debug)
+ log(LOG_DEBUG, SPP_FMT "pap peer TO\n",
+ SPP_ARGS(ifp));
+
+ SPPP_LOCK(sp);
+ pap.scr(sp);
+ SPPP_UNLOCK(sp);
+}
+
+static void
+sppp_pap_tlu(struct sppp *sp)
+{
+ STDDCL;
+ int x;
+
+ sp->rst_counter[IDX_PAP] = sp->lcp.max_configure;
+
+ if (debug)
+ log(LOG_DEBUG, SPP_FMT "%s tlu\n",
+ SPP_ARGS(ifp), pap.name);
+
+ x = splimp();
+ SPPP_LOCK(sp);
+ /* indicate to LCP that we need to be closed down */
+ sp->lcp.protos |= (1 << IDX_PAP);
+
+ if (sp->pp_flags & PP_NEEDAUTH) {
+ /*
+ * Remote is authenticator, but his auth proto didn't
+ * complete yet. Defer the transition to network
+ * phase.
+ */
+ SPPP_UNLOCK(sp);
+ splx(x);
+ return;
+ }
+ SPPP_UNLOCK(sp);
+ splx(x);
+ sppp_phase_network(sp);
+}
+
+static void
+sppp_pap_tld(struct sppp *sp)
+{
+ STDDCL;
+
+ if (debug)
+ log(LOG_DEBUG, SPP_FMT "pap tld\n", SPP_ARGS(ifp));
+ callout_stop (&sp->ch[IDX_PAP]);
+ callout_stop (&sp->pap_my_to_ch);
+ sp->lcp.protos &= ~(1 << IDX_PAP);
+
+ lcp.Close(sp);
+}
+
+static void
+sppp_pap_scr(struct sppp *sp)
+{
+ u_char idlen, pwdlen;
+
+ sp->confid[IDX_PAP] = ++sp->pp_seq[IDX_PAP];
+ pwdlen = sppp_strnlen(sp->myauth.secret, AUTHKEYLEN);
+ idlen = sppp_strnlen(sp->myauth.name, AUTHNAMELEN);
+
+ sppp_auth_send(&pap, sp, PAP_REQ, sp->confid[IDX_PAP],
+ sizeof idlen, (const char *)&idlen,
+ (size_t)idlen, sp->myauth.name,
+ sizeof pwdlen, (const char *)&pwdlen,
+ (size_t)pwdlen, sp->myauth.secret,
+ 0);
+}
+
+/*
+ * Random miscellaneous functions.
+ */
+
+/*
+ * Send a PAP or CHAP proto packet.
+ *
+ * Variadic function, each of the elements for the ellipsis is of type
+ * ``size_t mlen, const u_char *msg''. Processing will stop iff
+ * mlen == 0.
+ * NOTE: never declare variadic functions with types subject to type
+ * promotion (e.g. u_char). This is asking for big trouble depending
+ * on the architecture you are on...
+ */
+
+static void
+sppp_auth_send(const struct cp *cp, struct sppp *sp,
+ unsigned int type, unsigned int id,
+ ...)
+{
+ STDDCL;
+ struct ppp_header *h;
+ struct lcp_header *lh;
+ struct mbuf *m;
+ u_char *p;
+ int len;
+ unsigned int mlen;
+ const char *msg;
+ va_list ap;
+
+ MGETHDR (m, M_DONTWAIT, MT_DATA);
+ if (! m)
+ return;
+ m->m_pkthdr.rcvif = 0;
+
+ h = mtod (m, struct ppp_header*);
+ h->address = PPP_ALLSTATIONS; /* broadcast address */
+ h->control = PPP_UI; /* Unnumbered Info */
+ h->protocol = htons(cp->proto);
+
+ lh = (struct lcp_header*)(h + 1);
+ lh->type = type;
+ lh->ident = id;
+ p = (u_char*) (lh+1);
+
+ va_start(ap, id);
+ len = 0;
+
+ while ((mlen = (unsigned int)va_arg(ap, size_t)) != 0) {
+ msg = va_arg(ap, const char *);
+ len += mlen;
+ if (len > MHLEN - PPP_HEADER_LEN - LCP_HEADER_LEN) {
+ va_end(ap);
+ m_freem(m);
+ return;
+ }
+
+ bcopy(msg, p, mlen);
+ p += mlen;
+ }
+ va_end(ap);
+
+ m->m_pkthdr.len = m->m_len = PPP_HEADER_LEN + LCP_HEADER_LEN + len;
+ lh->len = htons (LCP_HEADER_LEN + len);
+
+ if (debug) {
+ log(LOG_DEBUG, SPP_FMT "%s output <%s id=0x%x len=%d",
+ SPP_ARGS(ifp), cp->name,
+ sppp_auth_type_name(cp->proto, lh->type),
+ lh->ident, ntohs(lh->len));
+ sppp_print_bytes((u_char*) (lh+1), len);
+ log(-1, ">\n");
+ }
+ if (! IF_HANDOFF_ADJ(&sp->pp_cpq, m, ifp, 3))
+ ifp->if_oerrors++;
+}
+
+/*
+ * Flush interface queue.
+ */ +static void +sppp_qflush(struct ifqueue *ifq) +{ + struct mbuf *m, *n; + + n = ifq->ifq_head; + while ((m = n)) { + n = m->m_act; + m_freem (m); + } + ifq->ifq_head = 0; + ifq->ifq_tail = 0; + ifq->ifq_len = 0; +} + +/* + * Send keepalive packets, every 10 seconds. + */ +static void +sppp_keepalive(void *dummy) +{ + struct sppp *sp = (struct sppp*)dummy; + struct ifnet *ifp = SP2IFP(sp); + int s; + + s = splimp(); + SPPP_LOCK(sp); + /* Keepalive mode disabled or channel down? */ + if (! (sp->pp_flags & PP_KEEPALIVE) || + ! (ifp->if_drv_flags & IFF_DRV_RUNNING)) + goto out; + + if (sp->pp_mode == PP_FR) { + sppp_fr_keepalive (sp); + goto out; + } + + /* No keepalive in PPP mode if LCP not opened yet. */ + if (sp->pp_mode != IFF_CISCO && + sp->pp_phase < PHASE_AUTHENTICATE) + goto out; + + if (sp->pp_alivecnt == MAXALIVECNT) { + /* No keepalive packets got. Stop the interface. */ + printf (SPP_FMT "down\n", SPP_ARGS(ifp)); + if_down (ifp); + sppp_qflush (&sp->pp_cpq); + if (sp->pp_mode != IFF_CISCO) { + /* XXX */ + /* Shut down the PPP link. */ + lcp.Down(sp); + /* Initiate negotiation. XXX */ + lcp.Up(sp); + } + } + if (sp->pp_alivecnt <= MAXALIVECNT) + ++sp->pp_alivecnt; + if (sp->pp_mode == IFF_CISCO) + sppp_cisco_send (sp, CISCO_KEEPALIVE_REQ, + ++sp->pp_seq[IDX_LCP], sp->pp_rseq[IDX_LCP]); + else if (sp->pp_phase >= PHASE_AUTHENTICATE) { + long nmagic = htonl (sp->lcp.magic); + sp->lcp.echoid = ++sp->pp_seq[IDX_LCP]; + sppp_cp_send (sp, PPP_LCP, ECHO_REQ, + sp->lcp.echoid, 4, &nmagic); + } +out: + SPPP_UNLOCK(sp); + splx(s); + callout_reset(&sp->keepalive_callout, hz * 10, sppp_keepalive, + (void *)sp); +} + +/* + * Get both IP addresses. + */ +void +sppp_get_ip_addrs(struct sppp *sp, u_long *src, u_long *dst, u_long *srcmask) +{ + struct ifnet *ifp = SP2IFP(sp); + struct ifaddr *ifa; + struct sockaddr_in *si, *sm; + u_long ssrc, ddst; + + sm = NULL; + ssrc = ddst = 0L; + /* + * Pick the first AF_INET address from the list, + * aliases don't make any sense on a p2p link anyway. + */ + si = 0; + if_addr_rlock(ifp); + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) + if (ifa->ifa_addr->sa_family == AF_INET) { + si = (struct sockaddr_in *)ifa->ifa_addr; + sm = (struct sockaddr_in *)ifa->ifa_netmask; + if (si) + break; + } + if (ifa) { + if (si && si->sin_addr.s_addr) { + ssrc = si->sin_addr.s_addr; + if (srcmask) + *srcmask = ntohl(sm->sin_addr.s_addr); + } + + si = (struct sockaddr_in *)ifa->ifa_dstaddr; + if (si && si->sin_addr.s_addr) + ddst = si->sin_addr.s_addr; + } + if_addr_runlock(ifp); + + if (dst) *dst = ntohl(ddst); + if (src) *src = ntohl(ssrc); +} + +#ifdef INET +/* + * Set my IP address. Must be called at splimp. + */ +static void +sppp_set_ip_addr(struct sppp *sp, u_long src) +{ + STDDCL; + struct ifaddr *ifa; + struct sockaddr_in *si; + struct in_ifaddr *ia; + + /* + * Pick the first AF_INET address from the list, + * aliases don't make any sense on a p2p link anyway. 
+ */ + si = 0; + if_addr_rlock(ifp); + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + if (ifa->ifa_addr->sa_family == AF_INET) { + si = (struct sockaddr_in *)ifa->ifa_addr; + if (si != NULL) { + ifa_ref(ifa); + break; + } + } + } + if_addr_runlock(ifp); + + if (ifa != NULL) { + int error; + + /* delete old route */ + error = rtinit(ifa, (int)RTM_DELETE, RTF_HOST); + if (debug && error) { + log(LOG_DEBUG, SPP_FMT "sppp_set_ip_addr: rtinit DEL failed, error=%d\n", + SPP_ARGS(ifp), error); + } + + /* set new address */ + si->sin_addr.s_addr = htonl(src); + ia = ifatoia(ifa); + IN_IFADDR_WLOCK(); + LIST_REMOVE(ia, ia_hash); + LIST_INSERT_HEAD(INADDR_HASH(si->sin_addr.s_addr), ia, ia_hash); + IN_IFADDR_WUNLOCK(); + + /* add new route */ + error = rtinit(ifa, (int)RTM_ADD, RTF_HOST); + if (debug && error) { + log(LOG_DEBUG, SPP_FMT "sppp_set_ip_addr: rtinit ADD failed, error=%d", + SPP_ARGS(ifp), error); + } + ifa_free(ifa); + } +} +#endif + +#ifdef INET6 +/* + * Get both IPv6 addresses. + */ +static void +sppp_get_ip6_addrs(struct sppp *sp, struct in6_addr *src, struct in6_addr *dst, + struct in6_addr *srcmask) +{ + struct ifnet *ifp = SP2IFP(sp); + struct ifaddr *ifa; + struct sockaddr_in6 *si, *sm; + struct in6_addr ssrc, ddst; + + sm = NULL; + bzero(&ssrc, sizeof(ssrc)); + bzero(&ddst, sizeof(ddst)); + /* + * Pick the first link-local AF_INET6 address from the list, + * aliases don't make any sense on a p2p link anyway. + */ + si = NULL; + if_addr_rlock(ifp); + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) + if (ifa->ifa_addr->sa_family == AF_INET6) { + si = (struct sockaddr_in6 *)ifa->ifa_addr; + sm = (struct sockaddr_in6 *)ifa->ifa_netmask; + if (si && IN6_IS_ADDR_LINKLOCAL(&si->sin6_addr)) + break; + } + if (ifa) { + if (si && !IN6_IS_ADDR_UNSPECIFIED(&si->sin6_addr)) { + bcopy(&si->sin6_addr, &ssrc, sizeof(ssrc)); + if (srcmask) { + bcopy(&sm->sin6_addr, srcmask, + sizeof(*srcmask)); + } + } + + si = (struct sockaddr_in6 *)ifa->ifa_dstaddr; + if (si && !IN6_IS_ADDR_UNSPECIFIED(&si->sin6_addr)) + bcopy(&si->sin6_addr, &ddst, sizeof(ddst)); + } + + if (dst) + bcopy(&ddst, dst, sizeof(*dst)); + if (src) + bcopy(&ssrc, src, sizeof(*src)); + if_addr_runlock(ifp); +} + +#ifdef IPV6CP_MYIFID_DYN +/* + * Generate random ifid. + */ +static void +sppp_gen_ip6_addr(struct sppp *sp, struct in6_addr *addr) +{ + /* TBD */ +} + +/* + * Set my IPv6 address. Must be called at splimp. + */ +static void +sppp_set_ip6_addr(struct sppp *sp, const struct in6_addr *src) +{ + STDDCL; + struct ifaddr *ifa; + struct sockaddr_in6 *sin6; + + /* + * Pick the first link-local AF_INET6 address from the list, + * aliases don't make any sense on a p2p link anyway. + */ + + sin6 = NULL; + if_addr_rlock(ifp); + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + if (ifa->ifa_addr->sa_family == AF_INET6) { + sin6 = (struct sockaddr_in6 *)ifa->ifa_addr; + if (sin6 && IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) { + ifa_ref(ifa); + break; + } + } + } + if_addr_runlock(ifp); + + if (ifa != NULL) { + int error; + struct sockaddr_in6 new_sin6 = *sin6; + + bcopy(src, &new_sin6.sin6_addr, sizeof(new_sin6.sin6_addr)); + error = in6_ifinit(ifp, ifatoia6(ifa), &new_sin6, 1); + if (debug && error) { + log(LOG_DEBUG, SPP_FMT "sppp_set_ip6_addr: in6_ifinit " + " failed, error=%d\n", SPP_ARGS(ifp), error); + } + ifa_free(ifa); + } +} +#endif + +/* + * Suggest a candidate address to be used by peer. 
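+ * (Sketch of the effect, not a protocol requirement: start from our
+ * own interface identifier, force the universal/local bit to "local"
+ * and XOR the low two bytes with bits of the current time, so the
+ * candidate is guaranteed to differ from our own ifid.)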
+ */ +static void +sppp_suggest_ip6_addr(struct sppp *sp, struct in6_addr *suggest) +{ + struct in6_addr myaddr; + struct timeval tv; + + sppp_get_ip6_addrs(sp, &myaddr, 0, 0); + + myaddr.s6_addr[8] &= ~0x02; /* u bit to "local" */ + microtime(&tv); + if ((tv.tv_usec & 0xff) == 0 && (tv.tv_sec & 0xff) == 0) { + myaddr.s6_addr[14] ^= 0xff; + myaddr.s6_addr[15] ^= 0xff; + } else { + myaddr.s6_addr[14] ^= (tv.tv_usec & 0xff); + myaddr.s6_addr[15] ^= (tv.tv_sec & 0xff); + } + if (suggest) + bcopy(&myaddr, suggest, sizeof(myaddr)); +} +#endif /*INET6*/ + +static int +sppp_params(struct sppp *sp, u_long cmd, void *data) +{ + u_long subcmd; + struct ifreq *ifr = (struct ifreq *)data; + struct spppreq *spr; + int rv = 0; + + if ((spr = malloc(sizeof(struct spppreq), M_TEMP, M_NOWAIT)) == 0) + return (EAGAIN); + /* + * ifr->ifr_data is supposed to point to a struct spppreq. + * Check the cmd word first before attempting to fetch all the + * data. + */ + if ((subcmd = fuword(ifr->ifr_data)) == -1) { + rv = EFAULT; + goto quit; + } + + if (copyin((caddr_t)ifr->ifr_data, spr, sizeof(struct spppreq)) != 0) { + rv = EFAULT; + goto quit; + } + + switch (subcmd) { + case (u_long)SPPPIOGDEFS: + if (cmd != SIOCGIFGENERIC) { + rv = EINVAL; + break; + } + /* + * We copy over the entire current state, but clean + * out some of the stuff we don't wanna pass up. + * Remember, SIOCGIFGENERIC is unprotected, and can be + * called by any user. No need to ever get PAP or + * CHAP secrets back to userland anyway. + */ + spr->defs.pp_phase = sp->pp_phase; + spr->defs.enable_vj = (sp->confflags & CONF_ENABLE_VJ) != 0; + spr->defs.enable_ipv6 = (sp->confflags & CONF_ENABLE_IPV6) != 0; + spr->defs.lcp = sp->lcp; + spr->defs.ipcp = sp->ipcp; + spr->defs.ipv6cp = sp->ipv6cp; + spr->defs.myauth = sp->myauth; + spr->defs.hisauth = sp->hisauth; + bzero(spr->defs.myauth.secret, AUTHKEYLEN); + bzero(spr->defs.myauth.challenge, AUTHKEYLEN); + bzero(spr->defs.hisauth.secret, AUTHKEYLEN); + bzero(spr->defs.hisauth.challenge, AUTHKEYLEN); + /* + * Fixup the LCP timeout value to milliseconds so + * spppcontrol doesn't need to bother about the value + * of "hz". We do the reverse calculation below when + * setting it. + */ + spr->defs.lcp.timeout = sp->lcp.timeout * 1000 / hz; + rv = copyout(spr, (caddr_t)ifr->ifr_data, + sizeof(struct spppreq)); + break; + + case (u_long)SPPPIOSDEFS: + if (cmd != SIOCSIFGENERIC) { + rv = EINVAL; + break; + } + /* + * We have a very specific idea of which fields we + * allow being passed back from userland, so to not + * clobber our current state. For one, we only allow + * setting anything if LCP is in dead or establish + * phase. Once the authentication negotiations + * started, the authentication settings must not be + * changed again. (The administrator can force an + * ifconfig down in order to get LCP back into dead + * phase.) + * + * Also, we only allow for authentication parameters to be + * specified. + * + * XXX Should allow to set or clear pp_flags. + * + * Finally, if the respective authentication protocol to + * be used is set differently than 0, but the secret is + * passed as all zeros, we don't trash the existing secret. + * This allows an administrator to change the system name + * only without clobbering the secret (which he didn't get + * back in a previous SPPPIOGDEFS call). However, the + * secrets are cleared if the authentication protocol is + * reset to 0. 
*/ + if (sp->pp_phase != PHASE_DEAD && + sp->pp_phase != PHASE_ESTABLISH) { + rv = EBUSY; + break; + } + + if ((spr->defs.myauth.proto != 0 && spr->defs.myauth.proto != PPP_PAP && + spr->defs.myauth.proto != PPP_CHAP) || + (spr->defs.hisauth.proto != 0 && spr->defs.hisauth.proto != PPP_PAP && + spr->defs.hisauth.proto != PPP_CHAP)) { + rv = EINVAL; + break; + } + + if (spr->defs.myauth.proto == 0) + /* resetting myauth */ + bzero(&sp->myauth, sizeof sp->myauth); + else { + /* setting/changing myauth */ + sp->myauth.proto = spr->defs.myauth.proto; + bcopy(spr->defs.myauth.name, sp->myauth.name, AUTHNAMELEN); + if (spr->defs.myauth.secret[0] != '\0') + bcopy(spr->defs.myauth.secret, sp->myauth.secret, + AUTHKEYLEN); + } + if (spr->defs.hisauth.proto == 0) + /* resetting hisauth */ + bzero(&sp->hisauth, sizeof sp->hisauth); + else { + /* setting/changing hisauth */ + sp->hisauth.proto = spr->defs.hisauth.proto; + sp->hisauth.flags = spr->defs.hisauth.flags; + bcopy(spr->defs.hisauth.name, sp->hisauth.name, AUTHNAMELEN); + if (spr->defs.hisauth.secret[0] != '\0') + bcopy(spr->defs.hisauth.secret, sp->hisauth.secret, + AUTHKEYLEN); + } + /* set LCP restart timer timeout */ + if (spr->defs.lcp.timeout != 0) + sp->lcp.timeout = spr->defs.lcp.timeout * hz / 1000; + /* set VJ enable and IPv6 disable flags */ +#ifdef INET + if (spr->defs.enable_vj) + sp->confflags |= CONF_ENABLE_VJ; + else + sp->confflags &= ~CONF_ENABLE_VJ; +#endif +#ifdef INET6 + if (spr->defs.enable_ipv6) + sp->confflags |= CONF_ENABLE_IPV6; + else + sp->confflags &= ~CONF_ENABLE_IPV6; +#endif + break; + + default: + rv = EINVAL; + } + + quit: + free(spr, M_TEMP); + + return (rv); +} + +static void +sppp_phase_network(struct sppp *sp) +{ + STDDCL; + int i; + u_long mask; + + sp->pp_phase = PHASE_NETWORK; + + if (debug) + log(LOG_DEBUG, SPP_FMT "phase %s\n", SPP_ARGS(ifp), + sppp_phase_name(sp->pp_phase)); + + /* Notify NCPs now. */ + for (i = 0; i < IDX_COUNT; i++) + if ((cps[i])->flags & CP_NCP) + (cps[i])->Open(sp); + + /* Send Up events to all NCPs. 
*/ + for (i = 0, mask = 1; i < IDX_COUNT; i++, mask <<= 1) + if ((sp->lcp.protos & mask) && ((cps[i])->flags & CP_NCP)) + (cps[i])->Up(sp); + + /* if no NCP is starting, all this was in vain, close down */ + sppp_lcp_check_and_close(sp); +} + + +static const char * +sppp_cp_type_name(u_char type) +{ + static char buf[12]; + switch (type) { + case CONF_REQ: return "conf-req"; + case CONF_ACK: return "conf-ack"; + case CONF_NAK: return "conf-nak"; + case CONF_REJ: return "conf-rej"; + case TERM_REQ: return "term-req"; + case TERM_ACK: return "term-ack"; + case CODE_REJ: return "code-rej"; + case PROTO_REJ: return "proto-rej"; + case ECHO_REQ: return "echo-req"; + case ECHO_REPLY: return "echo-reply"; + case DISC_REQ: return "discard-req"; + } + snprintf (buf, sizeof(buf), "cp/0x%x", type); + return buf; +} + +static const char * +sppp_auth_type_name(u_short proto, u_char type) +{ + static char buf[12]; + switch (proto) { + case PPP_CHAP: + switch (type) { + case CHAP_CHALLENGE: return "challenge"; + case CHAP_RESPONSE: return "response"; + case CHAP_SUCCESS: return "success"; + case CHAP_FAILURE: return "failure"; + } + case PPP_PAP: + switch (type) { + case PAP_REQ: return "req"; + case PAP_ACK: return "ack"; + case PAP_NAK: return "nak"; + } + } + snprintf (buf, sizeof(buf), "auth/0x%x", type); + return buf; +} + +static const char * +sppp_lcp_opt_name(u_char opt) +{ + static char buf[12]; + switch (opt) { + case LCP_OPT_MRU: return "mru"; + case LCP_OPT_ASYNC_MAP: return "async-map"; + case LCP_OPT_AUTH_PROTO: return "auth-proto"; + case LCP_OPT_QUAL_PROTO: return "qual-proto"; + case LCP_OPT_MAGIC: return "magic"; + case LCP_OPT_PROTO_COMP: return "proto-comp"; + case LCP_OPT_ADDR_COMP: return "addr-comp"; + } + snprintf (buf, sizeof(buf), "lcp/0x%x", opt); + return buf; +} + +#ifdef INET +static const char * +sppp_ipcp_opt_name(u_char opt) +{ + static char buf[12]; + switch (opt) { + case IPCP_OPT_ADDRESSES: return "addresses"; + case IPCP_OPT_COMPRESSION: return "compression"; + case IPCP_OPT_ADDRESS: return "address"; + } + snprintf (buf, sizeof(buf), "ipcp/0x%x", opt); + return buf; +} +#endif + +#ifdef INET6 +static const char * +sppp_ipv6cp_opt_name(u_char opt) +{ + static char buf[12]; + switch (opt) { + case IPV6CP_OPT_IFID: return "ifid"; + case IPV6CP_OPT_COMPRESSION: return "compression"; + } + sprintf (buf, "0x%x", opt); + return buf; +} +#endif + +static const char * +sppp_state_name(int state) +{ + switch (state) { + case STATE_INITIAL: return "initial"; + case STATE_STARTING: return "starting"; + case STATE_CLOSED: return "closed"; + case STATE_STOPPED: return "stopped"; + case STATE_CLOSING: return "closing"; + case STATE_STOPPING: return "stopping"; + case STATE_REQ_SENT: return "req-sent"; + case STATE_ACK_RCVD: return "ack-rcvd"; + case STATE_ACK_SENT: return "ack-sent"; + case STATE_OPENED: return "opened"; + } + return "illegal"; +} + +static const char * +sppp_phase_name(enum ppp_phase phase) +{ + switch (phase) { + case PHASE_DEAD: return "dead"; + case PHASE_ESTABLISH: return "establish"; + case PHASE_TERMINATE: return "terminate"; + case PHASE_AUTHENTICATE: return "authenticate"; + case PHASE_NETWORK: return "network"; + } + return "illegal"; +} + +static const char * +sppp_proto_name(u_short proto) +{ + static char buf[12]; + switch (proto) { + case PPP_LCP: return "lcp"; + case PPP_IPCP: return "ipcp"; + case PPP_PAP: return "pap"; + case PPP_CHAP: return "chap"; + case PPP_IPV6CP: return "ipv6cp"; + } + snprintf(buf, sizeof(buf), "proto/0x%x", 
(unsigned)proto); + return buf; +} + +static void +sppp_print_bytes(const u_char *p, u_short len) +{ + if (len) + log(-1, " %*D", len, p, "-"); +} + +static void +sppp_print_string(const char *p, u_short len) +{ + u_char c; + + while (len-- > 0) { + c = *p++; + /* + * Print only ASCII chars directly. RFC 1994 recommends + * using only them, but we don't rely on it. */ + if (c < ' ' || c > '~') + log(-1, "\\x%x", c); + else + log(-1, "%c", c); + } +} + +#ifdef INET +static const char * +sppp_dotted_quad(u_long addr) +{ + static char s[16]; + sprintf(s, "%d.%d.%d.%d", + (int)((addr >> 24) & 0xff), + (int)((addr >> 16) & 0xff), + (int)((addr >> 8) & 0xff), + (int)(addr & 0xff)); + return s; +} +#endif + +static int +sppp_strnlen(u_char *p, int max) +{ + int len; + + for (len = 0; len < max && *p; ++p) + ++len; + return len; +} + +/* a dummy, used to drop uninteresting events */ +static void +sppp_null(struct sppp *unused) +{ + /* do just nothing */ +} diff --git a/freebsd/sys/net/if_stf.c b/freebsd/sys/net/if_stf.c new file mode 100644 index 00000000..1cf5c408 --- /dev/null +++ b/freebsd/sys/net/if_stf.c @@ -0,0 +1,850 @@ +#include + +/* $FreeBSD$ */ +/* $KAME: if_stf.c,v 1.73 2001/12/03 11:08:30 keiichi Exp $ */ + +/*- + * Copyright (C) 2000 WIDE Project. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the project nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * 6to4 interface, based on RFC3056. + * + * 6to4 interface is NOT capable of link-layer (I mean, IPv4) multicasting. + * There is no address mapping defined from IPv6 multicast address to IPv4 + * address. Therefore, we do not have IFF_MULTICAST on the interface. + * + * Due to the lack of address mapping for link-local addresses, we cannot + * throw packets toward link-local addresses (fe80::x). Also, we cannot throw + * packets to link-local multicast addresses (ff02::x). + * + * Here are interesting symptoms due to the lack of link-local address: + * + * Unicast routing exchange: + * - RIPng: Impossible. Uses link-local multicast packet toward ff02::9, + * and link-local addresses as nexthop. + * - OSPFv6: Impossible. 
OSPFv6 assumes that there's link-local address + * assigned to the link, and makes use of them. Also, HELLO packets use + * link-local multicast addresses (ff02::5 and ff02::6). + * - BGP4+: Maybe. You can only use global address as nexthop, and global + * address as TCP endpoint address. + * + * Multicast routing protocols: + * - PIM: Hello packet cannot be used to discover adjacent PIM routers. + * Adjacent PIM routers must be configured manually (is it really spec-wise + * correct thing to do?). + * + * ICMPv6: + * - Redirects cannot be used due to the lack of link-local address. + * + * stf interface does not have, and will not need, a link-local address. + * It seems to have no real benefit and does not help the above symptoms much. + * Even if we assign link-locals to interface, we cannot really + * use link-local unicast/multicast on top of 6to4 cloud (since there's no + * encapsulation defined for link-local address), and the above analysis does + * not change. RFC3056 does not mandate the assignment of link-local address + * either. + * + * 6to4 interface has security issues. Refer to + * http://playground.iijlab.net/i-d/draft-itojun-ipv6-transition-abuse-00.txt + * for details. The code tries to filter out some of malicious packets. + * Note that there is no way to be 100% secure. + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include + +#include + +#include + +SYSCTL_DECL(_net_link); +SYSCTL_NODE(_net_link, IFT_STF, stf, CTLFLAG_RW, 0, "6to4 Interface"); + +static int stf_route_cache = 1; +SYSCTL_INT(_net_link_stf, OID_AUTO, route_cache, CTLFLAG_RW, + &stf_route_cache, 0, "Caching of IPv4 routes for 6to4 Output"); + +#define STFNAME "stf" +#define STFUNIT 0 + +#define IN6_IS_ADDR_6TO4(x) (ntohs((x)->s6_addr16[0]) == 0x2002) + +/* + * XXX: Return a pointer with 16-bit aligned. Don't cast it to + * struct in_addr *; use bcopy() instead. + */ +#define GET_V4(x) ((caddr_t)(&(x)->s6_addr16[1])) + +struct stf_softc { + struct ifnet *sc_ifp; + union { + struct route __sc_ro4; + struct route_in6 __sc_ro6; /* just for safety */ + } __sc_ro46; +#define sc_ro __sc_ro46.__sc_ro4 + struct mtx sc_ro_mtx; + u_int sc_fibnum; + const struct encaptab *encap_cookie; +}; +#define STF2IFP(sc) ((sc)->sc_ifp) + +/* + * Note that mutable fields in the softc are not currently locked. + * We do lock sc_ro in stf_output though. 
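+ *
+ * In outline, stf_output() below follows this pattern when the
+ * route cache is enabled:
+ *
+ * mtx_lock(&sc->sc_ro_mtx);
+ * ... re-validate sc->sc_ro against the outer IPv4 destination,
+ * calling rtalloc_fib() on a cache miss ...
+ * error = ip_output(m, NULL, &sc->sc_ro, 0, NULL, NULL);
+ * mtx_unlock(&sc->sc_ro_mtx);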
+ */
+static MALLOC_DEFINE(M_STF, STFNAME, "6to4 Tunnel Interface");
+static const int ip_stf_ttl = 40;
+
+extern struct domain inetdomain;
+struct protosw in_stf_protosw = {
+ .pr_type = SOCK_RAW,
+ .pr_domain = &inetdomain,
+ .pr_protocol = IPPROTO_IPV6,
+ .pr_flags = PR_ATOMIC|PR_ADDR,
+ .pr_input = in_stf_input,
+ .pr_output = (pr_output_t *)rip_output,
+ .pr_ctloutput = rip_ctloutput,
+ .pr_usrreqs = &rip_usrreqs
+};
+
+static char *stfnames[] = {"stf0", "stf", "6to4", NULL};
+
+static int stfmodevent(module_t, int, void *);
+static int stf_encapcheck(const struct mbuf *, int, int, void *);
+static struct in6_ifaddr *stf_getsrcifa6(struct ifnet *);
+static int stf_output(struct ifnet *, struct mbuf *, struct sockaddr *,
+ struct route *);
+static int isrfc1918addr(struct in_addr *);
+static int stf_checkaddr4(struct stf_softc *, struct in_addr *,
+ struct ifnet *);
+static int stf_checkaddr6(struct stf_softc *, struct in6_addr *,
+ struct ifnet *);
+static void stf_rtrequest(int, struct rtentry *, struct rt_addrinfo *);
+static int stf_ioctl(struct ifnet *, u_long, caddr_t);
+
+static int stf_clone_match(struct if_clone *, const char *);
+static int stf_clone_create(struct if_clone *, char *, size_t, caddr_t);
+static int stf_clone_destroy(struct if_clone *, struct ifnet *);
+struct if_clone stf_cloner = IFC_CLONE_INITIALIZER(STFNAME, NULL, 0,
+ NULL, stf_clone_match, stf_clone_create, stf_clone_destroy);
+
+static int
+stf_clone_match(struct if_clone *ifc, const char *name)
+{
+ int i;
+
+ for(i = 0; stfnames[i] != NULL; i++) {
+ if (strcmp(stfnames[i], name) == 0)
+ return (1);
+ }
+
+ return (0);
+}
+
+static int
+stf_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params)
+{
+ int err, unit;
+ struct stf_softc *sc;
+ struct ifnet *ifp;
+
+ /*
+ * We can only have one unit, but since unit allocation is
+ * already locked, we use it to keep from allocating extra
+ * interfaces.
+ */
+ unit = STFUNIT;
+ err = ifc_alloc_unit(ifc, &unit);
+ if (err != 0)
+ return (err);
+
+ sc = malloc(sizeof(struct stf_softc), M_STF, M_WAITOK | M_ZERO);
+ ifp = STF2IFP(sc) = if_alloc(IFT_STF);
+ if (ifp == NULL) {
+ free(sc, M_STF);
+ ifc_free_unit(ifc, unit);
+ return (ENOSPC);
+ }
+ ifp->if_softc = sc;
+ sc->sc_fibnum = curthread->td_proc->p_fibnum;
+
+ /*
+ * Set the name manually rather than using if_initname because
+ * we don't conform to the default naming convention for interfaces.
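+ * (stf_clone_match() above accepts "stf0", "stf" and "6to4" alike,
+ * so the chosen name cannot be derived from a unit number here.)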
+ */ + strlcpy(ifp->if_xname, name, IFNAMSIZ); + ifp->if_dname = ifc->ifc_name; + ifp->if_dunit = IF_DUNIT_NONE; + + mtx_init(&(sc)->sc_ro_mtx, "stf ro", NULL, MTX_DEF); + sc->encap_cookie = encap_attach_func(AF_INET, IPPROTO_IPV6, + stf_encapcheck, &in_stf_protosw, sc); + if (sc->encap_cookie == NULL) { + if_printf(ifp, "attach failed\n"); + free(sc, M_STF); + ifc_free_unit(ifc, unit); + return (ENOMEM); + } + + ifp->if_mtu = IPV6_MMTU; + ifp->if_ioctl = stf_ioctl; + ifp->if_output = stf_output; + ifp->if_snd.ifq_maxlen = ifqmaxlen; + if_attach(ifp); + bpfattach(ifp, DLT_NULL, sizeof(u_int32_t)); + return (0); +} + +static int +stf_clone_destroy(struct if_clone *ifc, struct ifnet *ifp) +{ + struct stf_softc *sc = ifp->if_softc; + int err; + + err = encap_detach(sc->encap_cookie); + KASSERT(err == 0, ("Unexpected error detaching encap_cookie")); + mtx_destroy(&(sc)->sc_ro_mtx); + bpfdetach(ifp); + if_detach(ifp); + if_free(ifp); + + free(sc, M_STF); + ifc_free_unit(ifc, STFUNIT); + + return (0); +} + +static int +stfmodevent(mod, type, data) + module_t mod; + int type; + void *data; +{ + + switch (type) { + case MOD_LOAD: + if_clone_attach(&stf_cloner); + break; + case MOD_UNLOAD: + if_clone_detach(&stf_cloner); + break; + default: + return (EOPNOTSUPP); + } + + return (0); +} + +static moduledata_t stf_mod = { + "if_stf", + stfmodevent, + 0 +}; + +DECLARE_MODULE(if_stf, stf_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); + +static int +stf_encapcheck(m, off, proto, arg) + const struct mbuf *m; + int off; + int proto; + void *arg; +{ + struct ip ip; + struct in6_ifaddr *ia6; + struct stf_softc *sc; + struct in_addr a, b, mask; + + sc = (struct stf_softc *)arg; + if (sc == NULL) + return 0; + + if ((STF2IFP(sc)->if_flags & IFF_UP) == 0) + return 0; + + /* IFF_LINK0 means "no decapsulation" */ + if ((STF2IFP(sc)->if_flags & IFF_LINK0) != 0) + return 0; + + if (proto != IPPROTO_IPV6) + return 0; + + /* LINTED const cast */ + m_copydata((struct mbuf *)(uintptr_t)m, 0, sizeof(ip), (caddr_t)&ip); + + if (ip.ip_v != 4) + return 0; + + ia6 = stf_getsrcifa6(STF2IFP(sc)); + if (ia6 == NULL) + return 0; + + /* + * check if IPv4 dst matches the IPv4 address derived from the + * local 6to4 address. + * success on: dst = 10.1.1.1, ia6->ia_addr = 2002:0a01:0101:... + */ + if (bcmp(GET_V4(&ia6->ia_addr.sin6_addr), &ip.ip_dst, + sizeof(ip.ip_dst)) != 0) { + ifa_free(&ia6->ia_ifa); + return 0; + } + + /* + * check if IPv4 src matches the IPv4 address derived from the + * local 6to4 address masked by prefixmask. 
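+ * (Recall the 6to4 layout: a 2002:aabb:ccdd::/48 prefix embeds the
+ * IPv4 address aa.bb.cc.dd in bits 16..47, which is what GET_V4()
+ * points at.)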
+ * success on: src = 10.1.1.1, ia6->ia_addr = 2002:0a00:.../24
+ * fail on: src = 10.1.1.1, ia6->ia_addr = 2002:0b00:.../24
+ */
+ bzero(&a, sizeof(a));
+ bcopy(GET_V4(&ia6->ia_addr.sin6_addr), &a, sizeof(a));
+ bcopy(GET_V4(&ia6->ia_prefixmask.sin6_addr), &mask, sizeof(mask));
+ ifa_free(&ia6->ia_ifa);
+ a.s_addr &= mask.s_addr;
+ b = ip.ip_src;
+ b.s_addr &= mask.s_addr;
+ if (a.s_addr != b.s_addr)
+ return 0;
+
+ /* stf interface makes single side match only */
+ return 32;
+}
+
+static struct in6_ifaddr *
+stf_getsrcifa6(ifp)
+ struct ifnet *ifp;
+{
+ struct ifaddr *ia;
+ struct in_ifaddr *ia4;
+ struct sockaddr_in6 *sin6;
+ struct in_addr in;
+
+ if_addr_rlock(ifp);
+ TAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) {
+ if (ia->ifa_addr->sa_family != AF_INET6)
+ continue;
+ sin6 = (struct sockaddr_in6 *)ia->ifa_addr;
+ if (!IN6_IS_ADDR_6TO4(&sin6->sin6_addr))
+ continue;
+
+ bcopy(GET_V4(&sin6->sin6_addr), &in, sizeof(in));
+ LIST_FOREACH(ia4, INADDR_HASH(in.s_addr), ia_hash)
+ if (ia4->ia_addr.sin_addr.s_addr == in.s_addr)
+ break;
+ if (ia4 == NULL)
+ continue;
+
+ ifa_ref(ia);
+ if_addr_runlock(ifp);
+ return (struct in6_ifaddr *)ia;
+ }
+ if_addr_runlock(ifp);
+
+ return NULL;
+}
+
+static int
+stf_output(ifp, m, dst, ro)
+ struct ifnet *ifp;
+ struct mbuf *m;
+ struct sockaddr *dst;
+ struct route *ro;
+{
+ struct stf_softc *sc;
+ struct sockaddr_in6 *dst6;
+ struct route *cached_route;
+ struct in_addr in4;
+ caddr_t ptr;
+ struct sockaddr_in *dst4;
+ u_int8_t tos;
+ struct ip *ip;
+ struct ip6_hdr *ip6;
+ struct in6_ifaddr *ia6;
+ u_int32_t af;
+ int error;
+
+#ifdef MAC
+ error = mac_ifnet_check_transmit(ifp, m);
+ if (error) {
+ m_freem(m);
+ return (error);
+ }
+#endif
+
+ sc = ifp->if_softc;
+ dst6 = (struct sockaddr_in6 *)dst;
+
+ /* just in case */
+ if ((ifp->if_flags & IFF_UP) == 0) {
+ m_freem(m);
+ ifp->if_oerrors++;
+ return ENETDOWN;
+ }
+
+ /*
+ * If we don't have an ip4 address that matches my inner ip6 address,
+ * we shouldn't generate output. Without this check, we'll end up
+ * using wrong IPv4 source.
+ */
+ ia6 = stf_getsrcifa6(ifp);
+ if (ia6 == NULL) {
+ m_freem(m);
+ ifp->if_oerrors++;
+ return ENETDOWN;
+ }
+
+ if (m->m_len < sizeof(*ip6)) {
+ m = m_pullup(m, sizeof(*ip6));
+ if (!m) {
+ ifa_free(&ia6->ia_ifa);
+ ifp->if_oerrors++;
+ return ENOBUFS;
+ }
+ }
+ ip6 = mtod(m, struct ip6_hdr *);
+ tos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
+
+ /*
+ * BPF writes need to be handled specially.
+ * This is a null operation, nothing here checks dst->sa_family.
+ */
+ if (dst->sa_family == AF_UNSPEC) {
+ bcopy(dst->sa_data, &af, sizeof(af));
+ dst->sa_family = af;
+ }
+
+ /*
+ * Pick up the right outer dst addr from the list of candidates.
+ * ip6_dst has priority as it may be able to give us shorter IPv4 hops.
+ */
+ ptr = NULL;
+ if (IN6_IS_ADDR_6TO4(&ip6->ip6_dst))
+ ptr = GET_V4(&ip6->ip6_dst);
+ else if (IN6_IS_ADDR_6TO4(&dst6->sin6_addr))
+ ptr = GET_V4(&dst6->sin6_addr);
+ else {
+ ifa_free(&ia6->ia_ifa);
+ m_freem(m);
+ ifp->if_oerrors++;
+ return ENETUNREACH;
+ }
+ bcopy(ptr, &in4, sizeof(in4));
+
+ if (bpf_peers_present(ifp->if_bpf)) {
+ /*
+ * We need to prepend the address family as
+ * a four byte field. Cons up a dummy header
+ * to pacify bpf. This is safe because bpf
+ * will only read from the mbuf (i.e., it won't
+ * try to free it or keep a pointer to it).
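+ * bpf_mtap2() takes that four-byte AF value as a separate prefix
+ * buffer, so nothing needs to be prepended to the mbuf chain itself.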
+ */ + af = AF_INET6; + bpf_mtap2(ifp->if_bpf, &af, sizeof(af), m); + } + + M_PREPEND(m, sizeof(struct ip), M_DONTWAIT); + if (m && m->m_len < sizeof(struct ip)) + m = m_pullup(m, sizeof(struct ip)); + if (m == NULL) { + ifa_free(&ia6->ia_ifa); + ifp->if_oerrors++; + return ENOBUFS; + } + ip = mtod(m, struct ip *); + + bzero(ip, sizeof(*ip)); + + bcopy(GET_V4(&((struct sockaddr_in6 *)&ia6->ia_addr)->sin6_addr), + &ip->ip_src, sizeof(ip->ip_src)); + ifa_free(&ia6->ia_ifa); + bcopy(&in4, &ip->ip_dst, sizeof(ip->ip_dst)); + ip->ip_p = IPPROTO_IPV6; + ip->ip_ttl = ip_stf_ttl; + ip->ip_len = m->m_pkthdr.len; /*host order*/ + if (ifp->if_flags & IFF_LINK1) + ip_ecn_ingress(ECN_ALLOWED, &ip->ip_tos, &tos); + else + ip_ecn_ingress(ECN_NOCARE, &ip->ip_tos, &tos); + + if (!stf_route_cache) { + cached_route = NULL; + goto sendit; + } + + /* + * Do we have a cached route? + */ + mtx_lock(&(sc)->sc_ro_mtx); + dst4 = (struct sockaddr_in *)&sc->sc_ro.ro_dst; + if (dst4->sin_family != AF_INET || + bcmp(&dst4->sin_addr, &ip->ip_dst, sizeof(ip->ip_dst)) != 0) { + /* cache route doesn't match */ + dst4->sin_family = AF_INET; + dst4->sin_len = sizeof(struct sockaddr_in); + bcopy(&ip->ip_dst, &dst4->sin_addr, sizeof(dst4->sin_addr)); + if (sc->sc_ro.ro_rt) { + RTFREE(sc->sc_ro.ro_rt); + sc->sc_ro.ro_rt = NULL; + } + } + + if (sc->sc_ro.ro_rt == NULL) { + rtalloc_fib(&sc->sc_ro, sc->sc_fibnum); + if (sc->sc_ro.ro_rt == NULL) { + m_freem(m); + mtx_unlock(&(sc)->sc_ro_mtx); + ifp->if_oerrors++; + return ENETUNREACH; + } + } + cached_route = &sc->sc_ro; + +sendit: + M_SETFIB(m, sc->sc_fibnum); + ifp->if_opackets++; + error = ip_output(m, NULL, cached_route, 0, NULL, NULL); + + if (cached_route != NULL) + mtx_unlock(&(sc)->sc_ro_mtx); + return error; +} + +static int +isrfc1918addr(in) + struct in_addr *in; +{ + /* + * returns 1 if private address range: + * 10.0.0.0/8 172.16.0.0/12 192.168.0.0/16 + */ + if ((ntohl(in->s_addr) & 0xff000000) >> 24 == 10 || + (ntohl(in->s_addr) & 0xfff00000) >> 16 == 172 * 256 + 16 || + (ntohl(in->s_addr) & 0xffff0000) >> 16 == 192 * 256 + 168) + return 1; + + return 0; +} + +static int +stf_checkaddr4(sc, in, inifp) + struct stf_softc *sc; + struct in_addr *in; + struct ifnet *inifp; /* incoming interface */ +{ + struct in_ifaddr *ia4; + + /* + * reject packets with the following address: + * 224.0.0.0/4 0.0.0.0/8 127.0.0.0/8 255.0.0.0/8 + */ + if (IN_MULTICAST(ntohl(in->s_addr))) + return -1; + switch ((ntohl(in->s_addr) & 0xff000000) >> 24) { + case 0: case 127: case 255: + return -1; + } + + /* + * reject packets with private address range. 
+ * (requirement from RFC3056 section 2 1st paragraph)
+ */
+ if (isrfc1918addr(in))
+ return -1;
+
+ /*
+ * reject packets with broadcast
+ */
+ IN_IFADDR_RLOCK();
+ for (ia4 = TAILQ_FIRST(&V_in_ifaddrhead);
+ ia4;
+ ia4 = TAILQ_NEXT(ia4, ia_link))
+ {
+ if ((ia4->ia_ifa.ifa_ifp->if_flags & IFF_BROADCAST) == 0)
+ continue;
+ if (in->s_addr == ia4->ia_broadaddr.sin_addr.s_addr) {
+ IN_IFADDR_RUNLOCK();
+ return -1;
+ }
+ }
+ IN_IFADDR_RUNLOCK();
+
+ /*
+ * perform ingress filter
+ */
+ if (sc && (STF2IFP(sc)->if_flags & IFF_LINK2) == 0 && inifp) {
+ struct sockaddr_in sin;
+ struct rtentry *rt;
+
+ bzero(&sin, sizeof(sin));
+ sin.sin_family = AF_INET;
+ sin.sin_len = sizeof(struct sockaddr_in);
+ sin.sin_addr = *in;
+ rt = rtalloc1_fib((struct sockaddr *)&sin, 0,
+ 0UL, sc->sc_fibnum);
+ if (!rt || rt->rt_ifp != inifp) {
+#if 0
+ log(LOG_WARNING, "%s: packet from 0x%x dropped "
+ "due to ingress filter\n", if_name(STF2IFP(sc)),
+ (u_int32_t)ntohl(sin.sin_addr.s_addr));
+#endif
+ if (rt)
+ RTFREE_LOCKED(rt);
+ return -1;
+ }
+ RTFREE_LOCKED(rt);
+ }
+
+ return 0;
+}
+
+static int
+stf_checkaddr6(sc, in6, inifp)
+ struct stf_softc *sc;
+ struct in6_addr *in6;
+ struct ifnet *inifp; /* incoming interface */
+{
+ /*
+ * check 6to4 addresses
+ */
+ if (IN6_IS_ADDR_6TO4(in6)) {
+ struct in_addr in4;
+ bcopy(GET_V4(in6), &in4, sizeof(in4));
+ return stf_checkaddr4(sc, &in4, inifp);
+ }
+
+ /*
+ * reject anything that looks suspicious. The test is implemented
+ * in ip6_input too, but we check here as well to
+ * (1) reject bad packets earlier, and
+ * (2) be safe against future ip6_input changes.
+ */
+ if (IN6_IS_ADDR_V4COMPAT(in6) || IN6_IS_ADDR_V4MAPPED(in6))
+ return -1;
+
+ return 0;
+}
+
+void
+in_stf_input(m, off)
+ struct mbuf *m;
+ int off;
+{
+ int proto;
+ struct stf_softc *sc;
+ struct ip *ip;
+ struct ip6_hdr *ip6;
+ u_int8_t otos, itos;
+ struct ifnet *ifp;
+
+ proto = mtod(m, struct ip *)->ip_p;
+
+ if (proto != IPPROTO_IPV6) {
+ m_freem(m);
+ return;
+ }
+
+ ip = mtod(m, struct ip *);
+
+ sc = (struct stf_softc *)encap_getarg(m);
+
+ if (sc == NULL || (STF2IFP(sc)->if_flags & IFF_UP) == 0) {
+ m_freem(m);
+ return;
+ }
+
+ ifp = STF2IFP(sc);
+
+#ifdef MAC
+ mac_ifnet_create_mbuf(ifp, m);
+#endif
+
+ /*
+ * perform sanity check against outer src/dst.
+ * for source, perform ingress filter as well.
+ */
+ if (stf_checkaddr4(sc, &ip->ip_dst, NULL) < 0 ||
+ stf_checkaddr4(sc, &ip->ip_src, m->m_pkthdr.rcvif) < 0) {
+ m_freem(m);
+ return;
+ }
+
+ otos = ip->ip_tos;
+ m_adj(m, off);
+
+ if (m->m_len < sizeof(*ip6)) {
+ m = m_pullup(m, sizeof(*ip6));
+ if (!m)
+ return;
+ }
+ ip6 = mtod(m, struct ip6_hdr *);
+
+ /*
+ * perform sanity check against inner src/dst.
+ * for source, perform ingress filter as well.
+ */
+ if (stf_checkaddr6(sc, &ip6->ip6_dst, NULL) < 0 ||
+ stf_checkaddr6(sc, &ip6->ip6_src, m->m_pkthdr.rcvif) < 0) {
+ m_freem(m);
+ return;
+ }
+
+ itos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
+ if ((ifp->if_flags & IFF_LINK1) != 0)
+ ip_ecn_egress(ECN_ALLOWED, &otos, &itos);
+ else
+ ip_ecn_egress(ECN_NOCARE, &otos, &itos);
+ ip6->ip6_flow &= ~htonl(0xff << 20);
+ ip6->ip6_flow |= htonl((u_int32_t)itos << 20);
+
+ m->m_pkthdr.rcvif = ifp;
+
+ if (bpf_peers_present(ifp->if_bpf)) {
+ /*
+ * We need to prepend the address family as
+ * a four byte field. Cons up a dummy header
+ * to pacify bpf. This is safe because bpf
+ * will only read from the mbuf (i.e., it won't
+ * try to free it or keep a pointer to it).
+ */ + u_int32_t af = AF_INET6; + bpf_mtap2(ifp->if_bpf, &af, sizeof(af), m); + } + + /* + * Put the packet to the network layer input queue according to the + * specified address family. + * See net/if_gif.c for possible issues with packet processing + * reorder due to extra queueing. + */ + ifp->if_ipackets++; + ifp->if_ibytes += m->m_pkthdr.len; + netisr_dispatch(NETISR_IPV6, m); +} + +/* ARGSUSED */ +static void +stf_rtrequest(cmd, rt, info) + int cmd; + struct rtentry *rt; + struct rt_addrinfo *info; +{ + RT_LOCK_ASSERT(rt); + rt->rt_rmx.rmx_mtu = IPV6_MMTU; +} + +static int +stf_ioctl(ifp, cmd, data) + struct ifnet *ifp; + u_long cmd; + caddr_t data; +{ + struct ifaddr *ifa; + struct ifreq *ifr; + struct sockaddr_in6 *sin6; + struct in_addr addr; + int error; + + error = 0; + switch (cmd) { + case SIOCSIFADDR: + ifa = (struct ifaddr *)data; + if (ifa == NULL || ifa->ifa_addr->sa_family != AF_INET6) { + error = EAFNOSUPPORT; + break; + } + sin6 = (struct sockaddr_in6 *)ifa->ifa_addr; + if (!IN6_IS_ADDR_6TO4(&sin6->sin6_addr)) { + error = EINVAL; + break; + } + bcopy(GET_V4(&sin6->sin6_addr), &addr, sizeof(addr)); + if (isrfc1918addr(&addr)) { + error = EINVAL; + break; + } + + ifa->ifa_rtrequest = stf_rtrequest; + ifp->if_flags |= IFF_UP; + break; + + case SIOCADDMULTI: + case SIOCDELMULTI: + ifr = (struct ifreq *)data; + if (ifr && ifr->ifr_addr.sa_family == AF_INET6) + ; + else + error = EAFNOSUPPORT; + break; + + default: + error = EINVAL; + break; + } + + return error; +} diff --git a/freebsd/sys/net/if_stf.h b/freebsd/sys/net/if_stf.h new file mode 100644 index 00000000..64fd30ee --- /dev/null +++ b/freebsd/sys/net/if_stf.h @@ -0,0 +1,38 @@ +/* $FreeBSD$ */ +/* $KAME: if_stf.h,v 1.5 2001/10/12 10:09:17 keiichi Exp $ */ + +/*- + * Copyright (C) 2000 WIDE Project. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the project nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _NET_IF_STF_HH_ +#define _NET_IF_STF_HH_ + +void in_stf_input(struct mbuf *, int); + +#endif /* _NET_IF_STF_HH_ */ diff --git a/freebsd/sys/net/if_tap.c b/freebsd/sys/net/if_tap.c new file mode 100644 index 00000000..206302bb --- /dev/null +++ b/freebsd/sys/net/if_tap.c @@ -0,0 +1,1086 @@ +#include + +/*- + * Copyright (C) 1999-2000 by Maksim Yevmenkin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * BASED ON: + * ------------------------------------------------------------------------- + * + * Copyright (c) 1988, Julian Onions + * Nottingham University 1987. 
+ */ + +/* + * $FreeBSD$ + * $Id: if_tap.c,v 0.21 2000/07/23 21:46:02 max Exp $ + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + + +#define CDEV_NAME "tap" +#define TAPDEBUG if (tapdebug) printf + +#define TAP "tap" +#define VMNET "vmnet" +#define TAPMAXUNIT 0x7fff +#define VMNET_DEV_MASK CLONE_FLAG0 + +/* module */ +static int tapmodevent(module_t, int, void *); + +/* device */ +static void tapclone(void *, struct ucred *, char *, int, + struct cdev **); +static void tapcreate(struct cdev *); + +/* network interface */ +static void tapifstart(struct ifnet *); +static int tapifioctl(struct ifnet *, u_long, caddr_t); +static void tapifinit(void *); + +static int tap_clone_create(struct if_clone *, int, caddr_t); +static void tap_clone_destroy(struct ifnet *); +static int vmnet_clone_create(struct if_clone *, int, caddr_t); +static void vmnet_clone_destroy(struct ifnet *); + +IFC_SIMPLE_DECLARE(tap, 0); +IFC_SIMPLE_DECLARE(vmnet, 0); + +/* character device */ +static d_open_t tapopen; +static d_close_t tapclose; +static d_read_t tapread; +static d_write_t tapwrite; +static d_ioctl_t tapioctl; +static d_poll_t tappoll; +static d_kqfilter_t tapkqfilter; + +/* kqueue(2) */ +static int tapkqread(struct knote *, long); +static int tapkqwrite(struct knote *, long); +static void tapkqdetach(struct knote *); + +static struct filterops tap_read_filterops = { + .f_isfd = 1, + .f_attach = NULL, + .f_detach = tapkqdetach, + .f_event = tapkqread, +}; + +static struct filterops tap_write_filterops = { + .f_isfd = 1, + .f_attach = NULL, + .f_detach = tapkqdetach, + .f_event = tapkqwrite, +}; + +static struct cdevsw tap_cdevsw = { + .d_version = D_VERSION, + .d_flags = D_PSEUDO | D_NEEDMINOR, + .d_open = tapopen, + .d_close = tapclose, + .d_read = tapread, + .d_write = tapwrite, + .d_ioctl = tapioctl, + .d_poll = tappoll, + .d_name = CDEV_NAME, + .d_kqfilter = tapkqfilter, +}; + +/* + * All global variables in if_tap.c are locked with tapmtx, with the + * exception of tapdebug, which is accessed unlocked; tapclones is + * static at runtime. 
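Note that TAPDEBUG is defined above as a bare `if (tapdebug) printf`, so every call site expands into an unguarded if statement. The sketch below shows the expansion and the do/while(0) form that avoids the dangling-else hazard; TAPDEBUG_SAFE is a hypothetical name, not part of the driver:

```c
#include <stdio.h>

static int tapdebug = 1;

/*
 * The driver's macro:  #define TAPDEBUG if (tapdebug) printf
 * makes TAPDEBUG("x")  expand to  if (tapdebug) printf("x"),
 * which can pair with a following "else" in surprising ways.
 * The do/while(0) wrapper nests safely inside if/else chains.
 */
#define TAPDEBUG_SAFE(...) do {			\
	if (tapdebug)				\
		printf(__VA_ARGS__);		\
} while (0)

int
main(void)
{
	TAPDEBUG_SAFE("debug is %s\n", tapdebug ? "on" : "off");
	return (0);
}
```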
+ */ +static struct mtx tapmtx; +static int tapdebug = 0; /* debug flag */ +static int tapuopen = 0; /* allow user open() */ +static int tapuponopen = 0; /* IFF_UP on open() */ +static int tapdclone = 1; /* enable devfs cloning */ +static SLIST_HEAD(, tap_softc) taphead; /* first device */ +static struct clonedevs *tapclones; + +MALLOC_DECLARE(M_TAP); +MALLOC_DEFINE(M_TAP, CDEV_NAME, "Ethernet tunnel interface"); +SYSCTL_INT(_debug, OID_AUTO, if_tap_debug, CTLFLAG_RW, &tapdebug, 0, ""); + +SYSCTL_DECL(_net_link); +SYSCTL_NODE(_net_link, OID_AUTO, tap, CTLFLAG_RW, 0, + "Ethernet tunnel software network interface"); +SYSCTL_INT(_net_link_tap, OID_AUTO, user_open, CTLFLAG_RW, &tapuopen, 0, + "Allow user to open /dev/tap (based on node permissions)"); +SYSCTL_INT(_net_link_tap, OID_AUTO, up_on_open, CTLFLAG_RW, &tapuponopen, 0, + "Bring interface up when /dev/tap is opened"); +SYSCTL_INT(_net_link_tap, OID_AUTO, devfs_cloning, CTLFLAG_RW, &tapdclone, 0, + "Enable legacy devfs interface creation"); +SYSCTL_INT(_net_link_tap, OID_AUTO, debug, CTLFLAG_RW, &tapdebug, 0, ""); + +TUNABLE_INT("net.link.tap.devfs_cloning", &tapdclone); + +DEV_MODULE(if_tap, tapmodevent, NULL); + +static int +tap_clone_create(struct if_clone *ifc, int unit, caddr_t params) +{ + struct cdev *dev; + int i; + int extra; + + if (strcmp(ifc->ifc_name, VMNET) == 0) + extra = VMNET_DEV_MASK; + else + extra = 0; + + /* find any existing device, or allocate new unit number */ + i = clone_create(&tapclones, &tap_cdevsw, &unit, &dev, extra); + if (i) { + dev = make_dev(&tap_cdevsw, unit | extra, + UID_ROOT, GID_WHEEL, 0600, "%s%d", ifc->ifc_name, unit); + } + + tapcreate(dev); + return (0); +} + +/* vmnet devices are tap devices in disguise */ +static int +vmnet_clone_create(struct if_clone *ifc, int unit, caddr_t params) +{ + return tap_clone_create(ifc, unit, params); +} + +static void +tap_destroy(struct tap_softc *tp) +{ + struct ifnet *ifp = tp->tap_ifp; + + /* Unlocked read. */ + KASSERT(!(tp->tap_flags & TAP_OPEN), + ("%s flags is out of sync", ifp->if_xname)); + + knlist_destroy(&tp->tap_rsel.si_note); + destroy_dev(tp->tap_dev); + ether_ifdetach(ifp); + if_free_type(ifp, IFT_ETHER); + + mtx_destroy(&tp->tap_mtx); + free(tp, M_TAP); +} + +static void +tap_clone_destroy(struct ifnet *ifp) +{ + struct tap_softc *tp = ifp->if_softc; + + mtx_lock(&tapmtx); + SLIST_REMOVE(&taphead, tp, tap_softc, tap_next); + mtx_unlock(&tapmtx); + tap_destroy(tp); +} + +/* vmnet devices are tap devices in disguise */ +static void +vmnet_clone_destroy(struct ifnet *ifp) +{ + tap_clone_destroy(ifp); +} + +/* + * tapmodevent + * + * module event handler + */ +static int +tapmodevent(module_t mod, int type, void *data) +{ + static eventhandler_tag eh_tag = NULL; + struct tap_softc *tp = NULL; + struct ifnet *ifp = NULL; + + switch (type) { + case MOD_LOAD: + + /* initialize device */ + + mtx_init(&tapmtx, "tapmtx", NULL, MTX_DEF); + SLIST_INIT(&taphead); + + clone_setup(&tapclones); + eh_tag = EVENTHANDLER_REGISTER(dev_clone, tapclone, 0, 1000); + if (eh_tag == NULL) { + clone_cleanup(&tapclones); + mtx_destroy(&tapmtx); + return (ENOMEM); + } + if_clone_attach(&tap_cloner); + if_clone_attach(&vmnet_cloner); + return (0); + + case MOD_UNLOAD: + /* + * The EBUSY algorithm here can't quite atomically + * guarantee that this is race-free since we have to + * release the tap mtx to deregister the clone handler.
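tap and vmnet interfaces share one cdevsw; tap_clone_create() distinguishes them only by OR-ing VMNET_DEV_MASK (an alias for CLONE_FLAG0) into the unit number, and tapcreate() later masks the unit with TAPMAXUNIT. A self-contained sketch of that encoding; the CLONE_FLAG0 value is an assumption taken from sys/conf.h, where it is the first flag bit above CLONE_UNITMASK:

```c
#include <stdio.h>

#define TAPMAXUNIT	0x7fff
#define CLONE_UNITMASK	0xfffff			/* assumed, per sys/conf.h */
#define VMNET_DEV_MASK	(CLONE_UNITMASK + 1)	/* stand-in for CLONE_FLAG0 */

static void
decode_unit(unsigned int unit)
{
	/* same test tapcreate() performs on dev2unit(dev) */
	printf("%s%u\n", (unit & VMNET_DEV_MASK) ? "vmnet" : "tap",
	    unit & TAPMAXUNIT);
}

int
main(void)
{
	decode_unit(3);				/* prints "tap3"   */
	decode_unit(VMNET_DEV_MASK | 1);	/* prints "vmnet1" */
	return (0);
}
```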
+ */ + mtx_lock(&tapmtx); + SLIST_FOREACH(tp, &taphead, tap_next) { + mtx_lock(&tp->tap_mtx); + if (tp->tap_flags & TAP_OPEN) { + mtx_unlock(&tp->tap_mtx); + mtx_unlock(&tapmtx); + return (EBUSY); + } + mtx_unlock(&tp->tap_mtx); + } + mtx_unlock(&tapmtx); + + EVENTHANDLER_DEREGISTER(dev_clone, eh_tag); + if_clone_detach(&tap_cloner); + if_clone_detach(&vmnet_cloner); + drain_dev_clone_events(); + + mtx_lock(&tapmtx); + while ((tp = SLIST_FIRST(&taphead)) != NULL) { + SLIST_REMOVE_HEAD(&taphead, tap_next); + mtx_unlock(&tapmtx); + + ifp = tp->tap_ifp; + + TAPDEBUG("detaching %s\n", ifp->if_xname); + + tap_destroy(tp); + mtx_lock(&tapmtx); + } + mtx_unlock(&tapmtx); + clone_cleanup(&tapclones); + + mtx_destroy(&tapmtx); + + break; + + default: + return (EOPNOTSUPP); + } + + return (0); +} /* tapmodevent */ + + +/* + * DEVFS handler + * + * We need to support two kind of devices - tap and vmnet + */ +static void +tapclone(void *arg, struct ucred *cred, char *name, int namelen, struct cdev **dev) +{ + char devname[SPECNAMELEN + 1]; + int i, unit, append_unit; + int extra; + + if (*dev != NULL) + return; + + if (!tapdclone || + (!tapuopen && priv_check_cred(cred, PRIV_NET_IFCREATE, 0) != 0)) + return; + + unit = 0; + append_unit = 0; + extra = 0; + + /* We're interested in only tap/vmnet devices. */ + if (strcmp(name, TAP) == 0) { + unit = -1; + } else if (strcmp(name, VMNET) == 0) { + unit = -1; + extra = VMNET_DEV_MASK; + } else if (dev_stdclone(name, NULL, TAP, &unit) != 1) { + if (dev_stdclone(name, NULL, VMNET, &unit) != 1) { + return; + } else { + extra = VMNET_DEV_MASK; + } + } + + if (unit == -1) + append_unit = 1; + + /* find any existing device, or allocate new unit number */ + i = clone_create(&tapclones, &tap_cdevsw, &unit, dev, extra); + if (i) { + if (append_unit) { + /* + * We were passed 'tun' or 'tap', with no unit specified + * so we'll need to append it now. + */ + namelen = snprintf(devname, sizeof(devname), "%s%d", name, + unit); + name = devname; + } + + *dev = make_dev_credf(MAKEDEV_REF, &tap_cdevsw, unit | extra, + cred, UID_ROOT, GID_WHEEL, 0600, "%s", name); + } + + if_clone_create(name, namelen, NULL); +} /* tapclone */ + + +/* + * tapcreate + * + * to create interface + */ +static void +tapcreate(struct cdev *dev) +{ + struct ifnet *ifp = NULL; + struct tap_softc *tp = NULL; + unsigned short macaddr_hi; + uint32_t macaddr_mid; + int unit; + char *name = NULL; + u_char eaddr[6]; + + dev->si_flags &= ~SI_CHEAPCLONE; + + /* allocate driver storage and create device */ + tp = malloc(sizeof(*tp), M_TAP, M_WAITOK | M_ZERO); + mtx_init(&tp->tap_mtx, "tap_mtx", NULL, MTX_DEF); + mtx_lock(&tapmtx); + SLIST_INSERT_HEAD(&taphead, tp, tap_next); + mtx_unlock(&tapmtx); + + unit = dev2unit(dev); + + /* select device: tap or vmnet */ + if (unit & VMNET_DEV_MASK) { + name = VMNET; + tp->tap_flags |= TAP_VMNET; + } else + name = TAP; + + unit &= TAPMAXUNIT; + + TAPDEBUG("tapcreate(%s%d). 
minor = %#x\n", name, unit, dev2unit(dev)); + + /* generate fake MAC address: 00 bd xx xx xx unit_no */ + macaddr_hi = htons(0x00bd); + macaddr_mid = (uint32_t) ticks; + bcopy(&macaddr_hi, eaddr, sizeof(short)); + bcopy(&macaddr_mid, &eaddr[2], sizeof(uint32_t)); + eaddr[5] = (u_char)unit; + + /* fill the rest and attach interface */ + ifp = tp->tap_ifp = if_alloc(IFT_ETHER); + if (ifp == NULL) + panic("%s%d: can not if_alloc()", name, unit); + ifp->if_softc = tp; + if_initname(ifp, name, unit); + ifp->if_init = tapifinit; + ifp->if_start = tapifstart; + ifp->if_ioctl = tapifioctl; + ifp->if_mtu = ETHERMTU; + ifp->if_flags = (IFF_BROADCAST|IFF_SIMPLEX|IFF_MULTICAST); + IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen); + ifp->if_capabilities |= IFCAP_LINKSTATE; + ifp->if_capenable |= IFCAP_LINKSTATE; + + dev->si_drv1 = tp; + tp->tap_dev = dev; + + ether_ifattach(ifp, eaddr); + + mtx_lock(&tp->tap_mtx); + tp->tap_flags |= TAP_INITED; + mtx_unlock(&tp->tap_mtx); + + knlist_init_mtx(&tp->tap_rsel.si_note, &tp->tap_mtx); + + TAPDEBUG("interface %s is created. minor = %#x\n", + ifp->if_xname, dev2unit(dev)); +} /* tapcreate */ + + +/* + * tapopen + * + * to open tunnel. must be superuser + */ +static int +tapopen(struct cdev *dev, int flag, int mode, struct thread *td) +{ + struct tap_softc *tp = NULL; + struct ifnet *ifp = NULL; + int error; + + if (tapuopen == 0) { + error = priv_check(td, PRIV_NET_TAP); + if (error) + return (error); + } + + if ((dev2unit(dev) & CLONE_UNITMASK) > TAPMAXUNIT) + return (ENXIO); + + tp = dev->si_drv1; + + mtx_lock(&tp->tap_mtx); + if (tp->tap_flags & TAP_OPEN) { + mtx_unlock(&tp->tap_mtx); + return (EBUSY); + } + + bcopy(IF_LLADDR(tp->tap_ifp), tp->ether_addr, sizeof(tp->ether_addr)); + tp->tap_pid = td->td_proc->p_pid; + tp->tap_flags |= TAP_OPEN; + ifp = tp->tap_ifp; + + ifp->if_drv_flags |= IFF_DRV_RUNNING; + ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; + if (tapuponopen) + ifp->if_flags |= IFF_UP; + if_link_state_change(ifp, LINK_STATE_UP); + mtx_unlock(&tp->tap_mtx); + + TAPDEBUG("%s is open. minor = %#x\n", ifp->if_xname, dev2unit(dev)); + + return (0); +} /* tapopen */ + + +/* + * tapclose + * + * close the device - mark i/f down & delete routing info + */ +static int +tapclose(struct cdev *dev, int foo, int bar, struct thread *td) +{ + struct ifaddr *ifa; + struct tap_softc *tp = dev->si_drv1; + struct ifnet *ifp = tp->tap_ifp; + + /* junk all pending output */ + mtx_lock(&tp->tap_mtx); + IF_DRAIN(&ifp->if_snd); + + /* + * do not bring the interface down, and do not anything with + * interface, if we are in VMnet mode. just close the device. + */ + + if (((tp->tap_flags & TAP_VMNET) == 0) && (ifp->if_flags & IFF_UP)) { + mtx_unlock(&tp->tap_mtx); + if_down(ifp); + mtx_lock(&tp->tap_mtx); + if (ifp->if_drv_flags & IFF_DRV_RUNNING) { + ifp->if_drv_flags &= ~IFF_DRV_RUNNING; + mtx_unlock(&tp->tap_mtx); + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + rtinit(ifa, (int)RTM_DELETE, 0); + } + if_purgeaddrs(ifp); + mtx_lock(&tp->tap_mtx); + } + } + + if_link_state_change(ifp, LINK_STATE_DOWN); + funsetown(&tp->tap_sigio); + selwakeuppri(&tp->tap_rsel, PZERO+1); + KNOTE_LOCKED(&tp->tap_rsel.si_note, 0); + + tp->tap_flags &= ~TAP_OPEN; + tp->tap_pid = 0; + mtx_unlock(&tp->tap_mtx); + + TAPDEBUG("%s is closed. 
minor = %#x\n", + ifp->if_xname, dev2unit(dev)); + + return (0); +} /* tapclose */ + + +/* + * tapifinit + * + * network interface initialization function + */ +static void +tapifinit(void *xtp) +{ + struct tap_softc *tp = (struct tap_softc *)xtp; + struct ifnet *ifp = tp->tap_ifp; + + TAPDEBUG("initializing %s\n", ifp->if_xname); + + mtx_lock(&tp->tap_mtx); + ifp->if_drv_flags |= IFF_DRV_RUNNING; + ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; + mtx_unlock(&tp->tap_mtx); + + /* attempt to start output */ + tapifstart(ifp); +} /* tapifinit */ + + +/* + * tapifioctl + * + * Process an ioctl request on network interface + */ +static int +tapifioctl(struct ifnet *ifp, u_long cmd, caddr_t data) +{ + struct tap_softc *tp = ifp->if_softc; + struct ifreq *ifr = (struct ifreq *)data; + struct ifstat *ifs = NULL; + int dummy; + + switch (cmd) { + case SIOCSIFFLAGS: /* XXX -- just like vmnet does */ + case SIOCADDMULTI: + case SIOCDELMULTI: + break; + + case SIOCSIFMTU: + ifp->if_mtu = ifr->ifr_mtu; + break; + + case SIOCGIFSTATUS: + ifs = (struct ifstat *)data; + dummy = strlen(ifs->ascii); + mtx_lock(&tp->tap_mtx); + if (tp->tap_pid != 0 && dummy < sizeof(ifs->ascii)) + snprintf(ifs->ascii + dummy, + sizeof(ifs->ascii) - dummy, + "\tOpened by PID %d\n", tp->tap_pid); + mtx_unlock(&tp->tap_mtx); + break; + + default: + return (ether_ioctl(ifp, cmd, data)); + /* NOT REACHED */ + } + + return (0); +} /* tapifioctl */ + + +/* + * tapifstart + * + * queue packets from higher level ready to put out + */ +static void +tapifstart(struct ifnet *ifp) +{ + struct tap_softc *tp = ifp->if_softc; + + TAPDEBUG("%s starting\n", ifp->if_xname); + + /* + * do not junk pending output if we are in VMnet mode. + * XXX: can this do any harm because of queue overflow? + */ + + mtx_lock(&tp->tap_mtx); + if (((tp->tap_flags & TAP_VMNET) == 0) && + ((tp->tap_flags & TAP_READY) != TAP_READY)) { + struct mbuf *m; + + /* Unlocked read. 
*/ + TAPDEBUG("%s not ready, tap_flags = 0x%x\n", ifp->if_xname, + tp->tap_flags); + + for (;;) { + IF_DEQUEUE(&ifp->if_snd, m); + if (m != NULL) { + m_freem(m); + ifp->if_oerrors++; + } else + break; + } + mtx_unlock(&tp->tap_mtx); + + return; + } + + ifp->if_drv_flags |= IFF_DRV_OACTIVE; + + if (!IFQ_IS_EMPTY(&ifp->if_snd)) { + if (tp->tap_flags & TAP_RWAIT) { + tp->tap_flags &= ~TAP_RWAIT; + wakeup(tp); + } + + if ((tp->tap_flags & TAP_ASYNC) && (tp->tap_sigio != NULL)) { + mtx_unlock(&tp->tap_mtx); + pgsigio(&tp->tap_sigio, SIGIO, 0); + mtx_lock(&tp->tap_mtx); + } + + selwakeuppri(&tp->tap_rsel, PZERO+1); + KNOTE_LOCKED(&tp->tap_rsel.si_note, 0); + ifp->if_opackets ++; /* obytes are counted in ether_output */ + } + + ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; + mtx_unlock(&tp->tap_mtx); +} /* tapifstart */ + + +/* + * tapioctl + * + * the cdevsw interface is now pretty minimal + */ +static int +tapioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, struct thread *td) +{ + struct tap_softc *tp = dev->si_drv1; + struct ifnet *ifp = tp->tap_ifp; + struct tapinfo *tapp = NULL; + int f; +#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \ + defined(COMPAT_FREEBSD4) + int ival; +#endif + + switch (cmd) { + case TAPSIFINFO: + tapp = (struct tapinfo *)data; + mtx_lock(&tp->tap_mtx); + ifp->if_mtu = tapp->mtu; + ifp->if_type = tapp->type; + ifp->if_baudrate = tapp->baudrate; + mtx_unlock(&tp->tap_mtx); + break; + + case TAPGIFINFO: + tapp = (struct tapinfo *)data; + mtx_lock(&tp->tap_mtx); + tapp->mtu = ifp->if_mtu; + tapp->type = ifp->if_type; + tapp->baudrate = ifp->if_baudrate; + mtx_unlock(&tp->tap_mtx); + break; + + case TAPSDEBUG: + tapdebug = *(int *)data; + break; + + case TAPGDEBUG: + *(int *)data = tapdebug; + break; + + case TAPGIFNAME: { + struct ifreq *ifr = (struct ifreq *) data; + + strlcpy(ifr->ifr_name, ifp->if_xname, IFNAMSIZ); + } break; + + case FIONBIO: + break; + + case FIOASYNC: + mtx_lock(&tp->tap_mtx); + if (*(int *)data) + tp->tap_flags |= TAP_ASYNC; + else + tp->tap_flags &= ~TAP_ASYNC; + mtx_unlock(&tp->tap_mtx); + break; + + case FIONREAD: + if (!IFQ_IS_EMPTY(&ifp->if_snd)) { + struct mbuf *mb; + + IFQ_LOCK(&ifp->if_snd); + IFQ_POLL_NOLOCK(&ifp->if_snd, mb); + for (*(int *)data = 0; mb != NULL; + mb = mb->m_next) + *(int *)data += mb->m_len; + IFQ_UNLOCK(&ifp->if_snd); + } else + *(int *)data = 0; + break; + + case FIOSETOWN: + return (fsetown(*(int *)data, &tp->tap_sigio)); + + case FIOGETOWN: + *(int *)data = fgetown(&tp->tap_sigio); + return (0); + + /* this is deprecated, FIOSETOWN should be used instead */ + case TIOCSPGRP: + return (fsetown(-(*(int *)data), &tp->tap_sigio)); + + /* this is deprecated, FIOGETOWN should be used instead */ + case TIOCGPGRP: + *(int *)data = -fgetown(&tp->tap_sigio); + return (0); + + /* VMware/VMnet port ioctl's */ + +#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \ + defined(COMPAT_FREEBSD4) + case _IO('V', 0): + ival = IOCPARM_IVAL(data); + data = (caddr_t)&ival; + /* FALLTHROUGH */ +#endif + case VMIO_SIOCSIFFLAGS: /* VMware/VMnet SIOCSIFFLAGS */ + f = *(int *)data; + f &= 0x0fff; + f &= ~IFF_CANTCHANGE; + f |= IFF_UP; + + mtx_lock(&tp->tap_mtx); + ifp->if_flags = f | (ifp->if_flags & IFF_CANTCHANGE); + mtx_unlock(&tp->tap_mtx); + break; + + case OSIOCGIFADDR: /* get MAC address of the remote side */ + case SIOCGIFADDR: + mtx_lock(&tp->tap_mtx); + bcopy(tp->ether_addr, data, sizeof(tp->ether_addr)); + mtx_unlock(&tp->tap_mtx); + break; + + case SIOCSIFADDR: /* set MAC address of the remote side */ + 
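The FIONREAD case above reports how many bytes the next read would return by summing m_len across the mbuf chain of the packet at the head of the send queue. The same walk in isolation, with a minimal stand-in for the two struct mbuf fields it touches:

```c
#include <stddef.h>

struct mbuf {				/* stand-in; see sys/mbuf.h */
	struct mbuf	*m_next;	/* next buffer in this packet */
	int		 m_len;		/* valid data bytes in buffer */
};

static int
packet_bytes(const struct mbuf *m)
{
	int total = 0;

	for (; m != NULL; m = m->m_next)
		total += m->m_len;
	return (total);
}
```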
mtx_lock(&tp->tap_mtx); + bcopy(data, tp->ether_addr, sizeof(tp->ether_addr)); + mtx_unlock(&tp->tap_mtx); + break; + + default: + return (ENOTTY); + } + return (0); +} /* tapioctl */ + + +/* + * tapread + * + * the cdevsw read interface - reads a packet at a time, or at + * least as much of a packet as can be read + */ +static int +tapread(struct cdev *dev, struct uio *uio, int flag) +{ + struct tap_softc *tp = dev->si_drv1; + struct ifnet *ifp = tp->tap_ifp; + struct mbuf *m = NULL; + int error = 0, len; + + TAPDEBUG("%s reading, minor = %#x\n", ifp->if_xname, dev2unit(dev)); + + mtx_lock(&tp->tap_mtx); + if ((tp->tap_flags & TAP_READY) != TAP_READY) { + mtx_unlock(&tp->tap_mtx); + + /* Unlocked read. */ + TAPDEBUG("%s not ready. minor = %#x, tap_flags = 0x%x\n", + ifp->if_xname, dev2unit(dev), tp->tap_flags); + + return (EHOSTDOWN); + } + + tp->tap_flags &= ~TAP_RWAIT; + + /* sleep until we get a packet */ + do { + IF_DEQUEUE(&ifp->if_snd, m); + + if (m == NULL) { + if (flag & O_NONBLOCK) { + mtx_unlock(&tp->tap_mtx); + return (EWOULDBLOCK); + } + + tp->tap_flags |= TAP_RWAIT; + error = mtx_sleep(tp, &tp->tap_mtx, PCATCH | (PZERO + 1), + "taprd", 0); + if (error) { + mtx_unlock(&tp->tap_mtx); + return (error); + } + } + } while (m == NULL); + mtx_unlock(&tp->tap_mtx); + + /* feed packet to bpf */ + BPF_MTAP(ifp, m); + + /* xfer packet to user space */ + while ((m != NULL) && (uio->uio_resid > 0) && (error == 0)) { + len = min(uio->uio_resid, m->m_len); + if (len == 0) + break; + + error = uiomove(mtod(m, void *), len, uio); + m = m_free(m); + } + + if (m != NULL) { + TAPDEBUG("%s dropping mbuf, minor = %#x\n", ifp->if_xname, + dev2unit(dev)); + m_freem(m); + } + + return (error); +} /* tapread */ + + +/* + * tapwrite + * + * the cdevsw write interface - an atomic write is a packet - or else! + */ +static int +tapwrite(struct cdev *dev, struct uio *uio, int flag) +{ + struct ether_header *eh; + struct tap_softc *tp = dev->si_drv1; + struct ifnet *ifp = tp->tap_ifp; + struct mbuf *m; + + TAPDEBUG("%s writing, minor = %#x\n", + ifp->if_xname, dev2unit(dev)); + + if (uio->uio_resid == 0) + return (0); + + if ((uio->uio_resid < 0) || (uio->uio_resid > TAPMRU)) { + TAPDEBUG("%s invalid packet len = %zd, minor = %#x\n", + ifp->if_xname, uio->uio_resid, dev2unit(dev)); + + return (EIO); + } + + if ((m = m_uiotombuf(uio, M_DONTWAIT, 0, ETHER_ALIGN, + M_PKTHDR)) == NULL) { + ifp->if_ierrors ++; + return (ENOBUFS); + } + + m->m_pkthdr.rcvif = ifp; + + /* + * Only pass a unicast frame to ether_input() if it would actually + * have been received by non-virtual hardware. + */ + if (m->m_len < sizeof(struct ether_header)) { + m_freem(m); + return (0); + } + eh = mtod(m, struct ether_header *); + + if (eh && (ifp->if_flags & IFF_PROMISC) == 0 && + !ETHER_IS_MULTICAST(eh->ether_dhost) && + bcmp(eh->ether_dhost, IF_LLADDR(ifp), ETHER_ADDR_LEN) != 0) { + m_freem(m); + return (0); + } + + /* Pass packet up to parent. */ + (*ifp->if_input)(ifp, m); + ifp->if_ipackets ++; /* ibytes are counted in parent */ + + return (0); +} /* tapwrite */ + + +/* + * tappoll + * + * the poll interface, this is only useful on reads + * really.
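tapwrite() above emulates what real hardware would deliver: a frame written from userspace is handed to ether_input() only if the interface is promiscuous, the destination is multicast or broadcast, or the destination MAC matches the interface. That predicate in isolation, with stand-in types (the kernel versions live in net/ethernet.h):

```c
#include <string.h>

#define ETHER_ADDR_LEN	6

struct ether_header {			/* stand-in; see net/ethernet.h */
	unsigned char	ether_dhost[ETHER_ADDR_LEN];
	unsigned char	ether_shost[ETHER_ADDR_LEN];
	unsigned short	ether_type;
};

/* group bit: low-order bit of the first destination octet */
#define ETHER_IS_MULTICAST(a)	((a)[0] & 0x01)

static int
accept_frame(const struct ether_header *eh,
    const unsigned char *my_lladdr, int promisc)
{
	if (promisc || ETHER_IS_MULTICAST(eh->ether_dhost))
		return (1);
	return (memcmp(eh->ether_dhost, my_lladdr, ETHER_ADDR_LEN) == 0);
}
```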
the write detect always returns true, write never blocks + * anyway, it either accepts the packet or drops it + */ +static int +tappoll(struct cdev *dev, int events, struct thread *td) +{ + struct tap_softc *tp = dev->si_drv1; + struct ifnet *ifp = tp->tap_ifp; + int revents = 0; + + TAPDEBUG("%s polling, minor = %#x\n", + ifp->if_xname, dev2unit(dev)); + + if (events & (POLLIN | POLLRDNORM)) { + IFQ_LOCK(&ifp->if_snd); + if (!IFQ_IS_EMPTY(&ifp->if_snd)) { + TAPDEBUG("%s have data in queue. len = %d, " \ + "minor = %#x\n", ifp->if_xname, + ifp->if_snd.ifq_len, dev2unit(dev)); + + revents |= (events & (POLLIN | POLLRDNORM)); + } else { + TAPDEBUG("%s waiting for data, minor = %#x\n", + ifp->if_xname, dev2unit(dev)); + + selrecord(td, &tp->tap_rsel); + } + IFQ_UNLOCK(&ifp->if_snd); + } + + if (events & (POLLOUT | POLLWRNORM)) + revents |= (events & (POLLOUT | POLLWRNORM)); + + return (revents); +} /* tappoll */ + + +/* + * tap_kqfilter + * + * support for kevent() system call + */ +static int +tapkqfilter(struct cdev *dev, struct knote *kn) +{ + struct tap_softc *tp = dev->si_drv1; + struct ifnet *ifp = tp->tap_ifp; + + switch (kn->kn_filter) { + case EVFILT_READ: + TAPDEBUG("%s kqfilter: EVFILT_READ, minor = %#x\n", + ifp->if_xname, dev2unit(dev)); + kn->kn_fop = &tap_read_filterops; + break; + + case EVFILT_WRITE: + TAPDEBUG("%s kqfilter: EVFILT_WRITE, minor = %#x\n", + ifp->if_xname, dev2unit(dev)); + kn->kn_fop = &tap_write_filterops; + break; + + default: + TAPDEBUG("%s kqfilter: invalid filter, minor = %#x\n", + ifp->if_xname, dev2unit(dev)); + return (EINVAL); + /* NOT REACHED */ + } + + kn->kn_hook = tp; + knlist_add(&tp->tap_rsel.si_note, kn, 0); + + return (0); +} /* tapkqfilter */ + + +/* + * tap_kqread + * + * Return true if there is data in the interface queue + */ +static int +tapkqread(struct knote *kn, long hint) +{ + int ret; + struct tap_softc *tp = kn->kn_hook; + struct cdev *dev = tp->tap_dev; + struct ifnet *ifp = tp->tap_ifp; + + if ((kn->kn_data = ifp->if_snd.ifq_len) > 0) { + TAPDEBUG("%s have data in queue. len = %d, minor = %#x\n", + ifp->if_xname, ifp->if_snd.ifq_len, dev2unit(dev)); + ret = 1; + } else { + TAPDEBUG("%s waiting for data, minor = %#x\n", + ifp->if_xname, dev2unit(dev)); + ret = 0; + } + + return (ret); +} /* tapkqread */ + + +/* + * tap_kqwrite + * + * Always can write. Return the MTU in kn->data + */ +static int +tapkqwrite(struct knote *kn, long hint) +{ + struct tap_softc *tp = kn->kn_hook; + struct ifnet *ifp = tp->tap_ifp; + + kn->kn_data = ifp->if_mtu; + + return (1); +} /* tapkqwrite */ + + +static void +tapkqdetach(struct knote *kn) +{ + struct tap_softc *tp = kn->kn_hook; + + knlist_remove(&tp->tap_rsel.si_note, kn, 0); +} /* tapkqdetach */ + diff --git a/freebsd/sys/net/if_tap.h b/freebsd/sys/net/if_tap.h new file mode 100644 index 00000000..e611884b --- /dev/null +++ b/freebsd/sys/net/if_tap.h @@ -0,0 +1,74 @@ +/*- + * Copyright (C) 1999-2000 by Maksim Yevmenkin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
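From userspace, the kqueue plumbing above means a tap descriptor can be waited on like a socket. A hedged sketch (hypothetical tapfd, error handling trimmed): register an EVFILT_READ filter, block until the driver signals data, and read its kn_data back out of ev.data:

```c
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>

static long
wait_for_packet(int tapfd)
{
	struct kevent kev, ev;
	int kq = kqueue();

	EV_SET(&kev, tapfd, EVFILT_READ, EV_ADD, 0, 0, NULL);
	(void)kevent(kq, &kev, 1, NULL, 0, NULL);	/* register filter */
	(void)kevent(kq, NULL, 0, &ev, 1, NULL);	/* block for data */
	/* for this driver, ev.data mirrors kn_data: packets queued */
	return ((long)ev.data);
}
```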
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * BASED ON: + * ------------------------------------------------------------------------- + * + * Copyright (c) 1988, Julian Onions + * Nottingham University 1987. + */ + +/* + * $FreeBSD$ + * $Id: if_tap.h,v 0.7 2000/07/12 04:12:51 max Exp $ + */ + +#ifndef _NET_IF_TAP_HH_ +#define _NET_IF_TAP_HH_ + +/* refer to if_tapvar.h for the softc stuff */ + +/* maximum receive packet size (hard limit) */ +#define TAPMRU 16384 + +struct tapinfo { + int baudrate; /* linespeed */ + short mtu; /* maximum transmission unit */ + u_char type; /* ethernet, tokenring, etc. */ + u_char dummy; /* place holder */ +}; + +/* ioctl's for get/set debug */ +#define TAPSDEBUG _IOW('t', 90, int) +#define TAPGDEBUG _IOR('t', 89, int) +#define TAPSIFINFO _IOW('t', 91, struct tapinfo) +#define TAPGIFINFO _IOR('t', 92, struct tapinfo) +#define TAPGIFNAME _IOR('t', 93, struct ifreq) + +/* VMware ioctl's */ +#define VMIO_SIOCSIFFLAGS _IOWINT('V', 0) +#define VMIO_SIOCSKEEP _IO('V', 1) +#define VMIO_SIOCSIFBR _IO('V', 2) +#define VMIO_SIOCSLADRF _IO('V', 3) + +/* XXX -- unimplemented */ +#define VMIO_SIOCSETMACADDR _IO('V', 4) + +/* XXX -- not used? */ +#define VMIO_SIOCPORT _IO('V', 5) +#define VMIO_SIOCBRIDGE _IO('V', 6) +#define VMIO_SIOCNETIF _IO('V', 7) + +#endif /* !_NET_IF_TAP_HH_ */ diff --git a/freebsd/sys/net/if_tapvar.h b/freebsd/sys/net/if_tapvar.h new file mode 100644 index 00000000..4a26fd87 --- /dev/null +++ b/freebsd/sys/net/if_tapvar.h @@ -0,0 +1,69 @@ +/*- + * Copyright (C) 1999-2000 by Maksim Yevmenkin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
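The request codes above are built with the BSD _IO* macros from sys/ioccom.h, which pack direction, argument size, a group character, and a command number into one word; e.g. _IOW('t', 90, int) evaluates to IOC_IN | (sizeof(int) << 16) | ('t' << 8) | 90. A tiny program to inspect the resulting value on a BSD system:

```c
#include <stdio.h>
#include <sys/ioccom.h>

int
main(void)
{
	/* same encoding as the TAPSDEBUG define above */
	printf("_IOW('t', 90, int) = %#lx\n",
	    (unsigned long)_IOW('t', 90, int));
	return (0);
}
```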
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * BASED ON: + * ------------------------------------------------------------------------- + * + * Copyright (c) 1998 Brian Somers + * All rights reserved. + * + * Copyright (c) 1988, Julian Onions + * Nottingham University 1987. + */ + +/* + * $FreeBSD$ + * $Id: if_tapvar.h,v 0.6 2000/07/11 02:16:08 max Exp $ + */ + +#ifndef _NET_IF_TAPVAR_HH_ +#define _NET_IF_TAPVAR_HH_ + +/* + * tap_mtx locks tap_flags, tap_pid. tap_next locked with global tapmtx. + * Other fields locked by owning subsystems. + */ +struct tap_softc { + struct ifnet *tap_ifp; + u_short tap_flags; /* misc flags */ +#define TAP_OPEN (1 << 0) +#define TAP_INITED (1 << 1) +#define TAP_RWAIT (1 << 2) +#define TAP_ASYNC (1 << 3) +#define TAP_READY (TAP_OPEN|TAP_INITED) +#define TAP_VMNET (1 << 4) + + u_int8_t ether_addr[ETHER_ADDR_LEN]; /* ether addr of the remote side */ + + pid_t tap_pid; /* PID of process to open */ + struct sigio *tap_sigio; /* information for async I/O */ + struct selinfo tap_rsel; /* read select */ + + SLIST_ENTRY(tap_softc) tap_next; /* next device in chain */ + struct cdev *tap_dev; + struct mtx tap_mtx; /* per-softc mutex */ +}; + +#endif /* !_NET_IF_TAPVAR_HH_ */ diff --git a/freebsd/sys/net/if_tun.c b/freebsd/sys/net/if_tun.c new file mode 100644 index 00000000..7f90fa51 --- /dev/null +++ b/freebsd/sys/net/if_tun.c @@ -0,0 +1,1059 @@ +#include + +/* $NetBSD: if_tun.c,v 1.14 1994/06/29 06:36:25 cgd Exp $ */ + +/*- + * Copyright (c) 1988, Julian Onions + * Nottingham University 1987. + * + * This source may be freely distributed, however I would be interested + * in any changes that are made. + * + * This driver takes packets off the IP i/f and hands them up to a + * user process to have its wicked way with. This driver has it's + * roots in a similar driver written by Phil Cockcroft (formerly) at + * UCL. This driver is based much more on read/write/poll mode of + * operation though. + * + * $FreeBSD$ + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#ifdef INET +#include +#endif +#include +#include + +#include +#include + +#include + +/* + * tun_list is protected by global tunmtx. Other mutable fields are + * protected by tun->tun_mtx, or by their owning subsystem. tun_dev is + * static for the duration of a tunnel interface. + */ +struct tun_softc { + TAILQ_ENTRY(tun_softc) tun_list; + struct cdev *tun_dev; + u_short tun_flags; /* misc flags */ +#define TUN_OPEN 0x0001 +#define TUN_INITED 0x0002 +#define TUN_RCOLL 0x0004 +#define TUN_IASET 0x0008 +#define TUN_DSTADDR 0x0010 +#define TUN_LMODE 0x0020 +#define TUN_RWAIT 0x0040 +#define TUN_ASYNC 0x0080 +#define TUN_IFHEAD 0x0100 + +#define TUN_READY (TUN_OPEN | TUN_INITED) + + /* + * XXXRW: tun_pid is used to exclusively lock /dev/tun. 
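TUN_READY above (like TAP_READY earlier) is a composite of two flag bits, which is why the driver tests `(flags & TUN_READY) != TUN_READY` rather than a simple truthiness check. The demo below shows how the naive test goes wrong when only one bit is set:

```c
#include <stdio.h>

#define TUN_OPEN	0x0001
#define TUN_INITED	0x0002
#define TUN_READY	(TUN_OPEN | TUN_INITED)

int
main(void)
{
	unsigned short flags = TUN_OPEN;	/* open, not yet inited */

	printf("correct test: %d\n",
	    (flags & TUN_READY) == TUN_READY);	/* prints 0 */
	printf("naive test:   %d\n",
	    (flags & TUN_READY) != 0);		/* prints 1, wrongly */
	return (0);
}
```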
Is this + * actually needed? Can we just return EBUSY if already open? + * Problem is that this involved inherent races when a tun device + * is handed off from one process to another, as opposed to just + * being slightly stale informationally. + */ + pid_t tun_pid; /* owning pid */ + struct ifnet *tun_ifp; /* the interface */ + struct sigio *tun_sigio; /* information for async I/O */ + struct selinfo tun_rsel; /* read select */ + struct mtx tun_mtx; /* protect mutable softc fields */ + struct cv tun_cv; /* protect against ref'd dev destroy */ +}; +#define TUN2IFP(sc) ((sc)->tun_ifp) + +#define TUNDEBUG if (tundebug) if_printf +#define TUNNAME "tun" + +/* + * All mutable global variables in if_tun are locked using tunmtx, with + * the exception of tundebug, which is used unlocked, and tunclones, + * which is static after setup. + */ +static struct mtx tunmtx; +static MALLOC_DEFINE(M_TUN, TUNNAME, "Tunnel Interface"); +static int tundebug = 0; +static int tundclone = 1; +static struct clonedevs *tunclones; +static TAILQ_HEAD(,tun_softc) tunhead = TAILQ_HEAD_INITIALIZER(tunhead); +SYSCTL_INT(_debug, OID_AUTO, if_tun_debug, CTLFLAG_RW, &tundebug, 0, ""); + +SYSCTL_DECL(_net_link); +SYSCTL_NODE(_net_link, OID_AUTO, tun, CTLFLAG_RW, 0, + "IP tunnel software network interface."); +SYSCTL_INT(_net_link_tun, OID_AUTO, devfs_cloning, CTLFLAG_RW, &tundclone, 0, + "Enable legacy devfs interface creation."); + +TUNABLE_INT("net.link.tun.devfs_cloning", &tundclone); + +static void tunclone(void *arg, struct ucred *cred, char *name, + int namelen, struct cdev **dev); +static void tuncreate(const char *name, struct cdev *dev); +static int tunifioctl(struct ifnet *, u_long, caddr_t); +static int tuninit(struct ifnet *); +static int tunmodevent(module_t, int, void *); +static int tunoutput(struct ifnet *, struct mbuf *, struct sockaddr *, + struct route *ro); +static void tunstart(struct ifnet *); + +static int tun_clone_create(struct if_clone *, int, caddr_t); +static void tun_clone_destroy(struct ifnet *); + +IFC_SIMPLE_DECLARE(tun, 0); + +static d_open_t tunopen; +static d_close_t tunclose; +static d_read_t tunread; +static d_write_t tunwrite; +static d_ioctl_t tunioctl; +static d_poll_t tunpoll; +static d_kqfilter_t tunkqfilter; + +static int tunkqread(struct knote *, long); +static int tunkqwrite(struct knote *, long); +static void tunkqdetach(struct knote *); + +static struct filterops tun_read_filterops = { + .f_isfd = 1, + .f_attach = NULL, + .f_detach = tunkqdetach, + .f_event = tunkqread, +}; + +static struct filterops tun_write_filterops = { + .f_isfd = 1, + .f_attach = NULL, + .f_detach = tunkqdetach, + .f_event = tunkqwrite, +}; + +static struct cdevsw tun_cdevsw = { + .d_version = D_VERSION, + .d_flags = D_PSEUDO | D_NEEDMINOR, + .d_open = tunopen, + .d_close = tunclose, + .d_read = tunread, + .d_write = tunwrite, + .d_ioctl = tunioctl, + .d_poll = tunpoll, + .d_kqfilter = tunkqfilter, + .d_name = TUNNAME, +}; + +static int +tun_clone_create(struct if_clone *ifc, int unit, caddr_t params) +{ + struct cdev *dev; + int i; + + /* find any existing device, or allocate new unit number */ + i = clone_create(&tunclones, &tun_cdevsw, &unit, &dev, 0); + if (i) { + /* No preexisting struct cdev *, create one */ + dev = make_dev(&tun_cdevsw, unit, + UID_UUCP, GID_DIALER, 0600, "%s%d", ifc->ifc_name, unit); + } + tuncreate(ifc->ifc_name, dev); + + return (0); +} + +static void +tunclone(void *arg, struct ucred *cred, char *name, int namelen, + struct cdev **dev) +{ + char devname[SPECNAMELEN + 1]; + int 
u, i, append_unit; + + if (*dev != NULL) + return; + + /* + * If tun cloning is enabled, only the superuser can create an + * interface. + */ + if (!tundclone || priv_check_cred(cred, PRIV_NET_IFCREATE, 0) != 0) + return; + + if (strcmp(name, TUNNAME) == 0) { + u = -1; + } else if (dev_stdclone(name, NULL, TUNNAME, &u) != 1) + return; /* Don't recognise the name */ + if (u != -1 && u > IF_MAXUNIT) + return; /* Unit number too high */ + + if (u == -1) + append_unit = 1; + else + append_unit = 0; + + CURVNET_SET(CRED_TO_VNET(cred)); + /* find any existing device, or allocate new unit number */ + i = clone_create(&tunclones, &tun_cdevsw, &u, dev, 0); + if (i) { + if (append_unit) { + namelen = snprintf(devname, sizeof(devname), "%s%d", name, + u); + name = devname; + } + /* No preexisting struct cdev *, create one */ + *dev = make_dev_credf(MAKEDEV_REF, &tun_cdevsw, u, cred, + UID_UUCP, GID_DIALER, 0600, "%s", name); + } + + if_clone_create(name, namelen, NULL); + CURVNET_RESTORE(); +} + +static void +tun_destroy(struct tun_softc *tp) +{ + struct cdev *dev; + + /* Unlocked read. */ + mtx_lock(&tp->tun_mtx); + if ((tp->tun_flags & TUN_OPEN) != 0) + cv_wait_unlock(&tp->tun_cv, &tp->tun_mtx); + else + mtx_unlock(&tp->tun_mtx); + + CURVNET_SET(TUN2IFP(tp)->if_vnet); + dev = tp->tun_dev; + bpfdetach(TUN2IFP(tp)); + if_detach(TUN2IFP(tp)); + if_free(TUN2IFP(tp)); + destroy_dev(dev); + knlist_destroy(&tp->tun_rsel.si_note); + mtx_destroy(&tp->tun_mtx); + cv_destroy(&tp->tun_cv); + free(tp, M_TUN); + CURVNET_RESTORE(); +} + +static void +tun_clone_destroy(struct ifnet *ifp) +{ + struct tun_softc *tp = ifp->if_softc; + + mtx_lock(&tunmtx); + TAILQ_REMOVE(&tunhead, tp, tun_list); + mtx_unlock(&tunmtx); + tun_destroy(tp); +} + +static int +tunmodevent(module_t mod, int type, void *data) +{ + static eventhandler_tag tag; + struct tun_softc *tp; + + switch (type) { + case MOD_LOAD: + mtx_init(&tunmtx, "tunmtx", NULL, MTX_DEF); + clone_setup(&tunclones); + tag = EVENTHANDLER_REGISTER(dev_clone, tunclone, 0, 1000); + if (tag == NULL) + return (ENOMEM); + if_clone_attach(&tun_cloner); + break; + case MOD_UNLOAD: + if_clone_detach(&tun_cloner); + EVENTHANDLER_DEREGISTER(dev_clone, tag); + drain_dev_clone_events(); + + mtx_lock(&tunmtx); + while ((tp = TAILQ_FIRST(&tunhead)) != NULL) { + TAILQ_REMOVE(&tunhead, tp, tun_list); + mtx_unlock(&tunmtx); + tun_destroy(tp); + mtx_lock(&tunmtx); + } + mtx_unlock(&tunmtx); + clone_cleanup(&tunclones); + mtx_destroy(&tunmtx); + break; + default: + return EOPNOTSUPP; + } + return 0; +} + +static moduledata_t tun_mod = { + "if_tun", + tunmodevent, + 0 +}; + +DECLARE_MODULE(if_tun, tun_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); + +static void +tunstart(struct ifnet *ifp) +{ + struct tun_softc *tp = ifp->if_softc; + struct mbuf *m; + + TUNDEBUG(ifp,"%s starting\n", ifp->if_xname); + if (ALTQ_IS_ENABLED(&ifp->if_snd)) { + IFQ_LOCK(&ifp->if_snd); + IFQ_POLL_NOLOCK(&ifp->if_snd, m); + if (m == NULL) { + IFQ_UNLOCK(&ifp->if_snd); + return; + } + IFQ_UNLOCK(&ifp->if_snd); + } + + mtx_lock(&tp->tun_mtx); + if (tp->tun_flags & TUN_RWAIT) { + tp->tun_flags &= ~TUN_RWAIT; + wakeup(tp); + } + selwakeuppri(&tp->tun_rsel, PZERO + 1); + KNOTE_LOCKED(&tp->tun_rsel.si_note, 0); + if (tp->tun_flags & TUN_ASYNC && tp->tun_sigio) { + mtx_unlock(&tp->tun_mtx); + pgsigio(&tp->tun_sigio, SIGIO, 0); + } else + mtx_unlock(&tp->tun_mtx); +} + +/* XXX: should return an error code so it can fail. 
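tunclone() accepts either a bare "tun" (allocate the next free unit and append it to the name) or an explicit "tunN", the latter parsed by dev_stdclone(). A standalone sketch of that name parse under the same rules; parse_tun_name is an illustrative helper, not the kernel routine:

```c
#include <ctype.h>
#include <stdlib.h>
#include <string.h>

/* returns 1 if recognized; *unit is -1 for bare "tun", else N from "tunN" */
static int
parse_tun_name(const char *name, int *unit)
{
	if (strcmp(name, "tun") == 0) {
		*unit = -1;		/* caller allocates and appends */
		return (1);
	}
	if (strncmp(name, "tun", 3) != 0 || !isdigit((unsigned char)name[3]))
		return (0);
	*unit = atoi(name + 3);
	return (1);
}
```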
*/ +static void +tuncreate(const char *name, struct cdev *dev) +{ + struct tun_softc *sc; + struct ifnet *ifp; + + dev->si_flags &= ~SI_CHEAPCLONE; + + sc = malloc(sizeof(*sc), M_TUN, M_WAITOK | M_ZERO); + mtx_init(&sc->tun_mtx, "tun_mtx", NULL, MTX_DEF); + cv_init(&sc->tun_cv, "tun_condvar"); + sc->tun_flags = TUN_INITED; + sc->tun_dev = dev; + mtx_lock(&tunmtx); + TAILQ_INSERT_TAIL(&tunhead, sc, tun_list); + mtx_unlock(&tunmtx); + + ifp = sc->tun_ifp = if_alloc(IFT_PPP); + if (ifp == NULL) + panic("%s%d: failed to if_alloc() interface.\n", + name, dev2unit(dev)); + if_initname(ifp, name, dev2unit(dev)); + ifp->if_mtu = TUNMTU; + ifp->if_ioctl = tunifioctl; + ifp->if_output = tunoutput; + ifp->if_start = tunstart; + ifp->if_flags = IFF_POINTOPOINT | IFF_MULTICAST; + ifp->if_softc = sc; + IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen); + ifp->if_snd.ifq_drv_maxlen = 0; + IFQ_SET_READY(&ifp->if_snd); + knlist_init_mtx(&sc->tun_rsel.si_note, &sc->tun_mtx); + ifp->if_capabilities |= IFCAP_LINKSTATE; + ifp->if_capenable |= IFCAP_LINKSTATE; + + if_attach(ifp); + bpfattach(ifp, DLT_NULL, sizeof(u_int32_t)); + dev->si_drv1 = sc; + TUNDEBUG(ifp, "interface %s is created, minor = %#x\n", + ifp->if_xname, dev2unit(dev)); +} + +static int +tunopen(struct cdev *dev, int flag, int mode, struct thread *td) +{ + struct ifnet *ifp; + struct tun_softc *tp; + + /* + * XXXRW: Non-atomic test and set of dev->si_drv1 requires + * synchronization. + */ + tp = dev->si_drv1; + if (!tp) { + tuncreate(TUNNAME, dev); + tp = dev->si_drv1; + } + + /* + * XXXRW: This use of tun_pid is subject to error due to the + * fact that a reference to the tunnel can live beyond the + * death of the process that created it. Can we replace this + * with a simple busy flag? + */ + mtx_lock(&tp->tun_mtx); + if (tp->tun_pid != 0 && tp->tun_pid != td->td_proc->p_pid) { + mtx_unlock(&tp->tun_mtx); + return (EBUSY); + } + tp->tun_pid = td->td_proc->p_pid; + + tp->tun_flags |= TUN_OPEN; + ifp = TUN2IFP(tp); + if_link_state_change(ifp, LINK_STATE_UP); + TUNDEBUG(ifp, "open\n"); + mtx_unlock(&tp->tun_mtx); + + return (0); +} + +/* + * tunclose - close the device - mark i/f down & delete + * routing info + */ +static int +tunclose(struct cdev *dev, int foo, int bar, struct thread *td) +{ + struct tun_softc *tp; + struct ifnet *ifp; + + tp = dev->si_drv1; + ifp = TUN2IFP(tp); + + mtx_lock(&tp->tun_mtx); + tp->tun_flags &= ~TUN_OPEN; + tp->tun_pid = 0; + + /* + * junk all pending output + */ + CURVNET_SET(ifp->if_vnet); + IFQ_PURGE(&ifp->if_snd); + + if (ifp->if_flags & IFF_UP) { + mtx_unlock(&tp->tun_mtx); + if_down(ifp); + mtx_lock(&tp->tun_mtx); + } + + /* Delete all addresses and routes which reference this interface. */ + if (ifp->if_drv_flags & IFF_DRV_RUNNING) { + struct ifaddr *ifa; + + ifp->if_drv_flags &= ~IFF_DRV_RUNNING; + mtx_unlock(&tp->tun_mtx); + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + /* deal w/IPv4 PtP destination; unlocked read */ + if (ifa->ifa_addr->sa_family == AF_INET) { + rtinit(ifa, (int)RTM_DELETE, + tp->tun_flags & TUN_DSTADDR ? 
RTF_HOST : 0); + } else { + rtinit(ifa, (int)RTM_DELETE, 0); + } + } + if_purgeaddrs(ifp); + mtx_lock(&tp->tun_mtx); + } + if_link_state_change(ifp, LINK_STATE_DOWN); + CURVNET_RESTORE(); + + funsetown(&tp->tun_sigio); + selwakeuppri(&tp->tun_rsel, PZERO + 1); + KNOTE_LOCKED(&tp->tun_rsel.si_note, 0); + TUNDEBUG (ifp, "closed\n"); + + cv_broadcast(&tp->tun_cv); + mtx_unlock(&tp->tun_mtx); + return (0); +} + +static int +tuninit(struct ifnet *ifp) +{ + struct tun_softc *tp = ifp->if_softc; +#ifdef INET + struct ifaddr *ifa; +#endif + int error = 0; + + TUNDEBUG(ifp, "tuninit\n"); + + mtx_lock(&tp->tun_mtx); + ifp->if_flags |= IFF_UP; + ifp->if_drv_flags |= IFF_DRV_RUNNING; + getmicrotime(&ifp->if_lastchange); + +#ifdef INET + if_addr_rlock(ifp); + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + if (ifa->ifa_addr->sa_family == AF_INET) { + struct sockaddr_in *si; + + si = (struct sockaddr_in *)ifa->ifa_addr; + if (si->sin_addr.s_addr) + tp->tun_flags |= TUN_IASET; + + si = (struct sockaddr_in *)ifa->ifa_dstaddr; + if (si && si->sin_addr.s_addr) + tp->tun_flags |= TUN_DSTADDR; + } + } + if_addr_runlock(ifp); +#endif + mtx_unlock(&tp->tun_mtx); + return (error); +} + +/* + * Process an ioctl request. + */ +static int +tunifioctl(struct ifnet *ifp, u_long cmd, caddr_t data) +{ + struct ifreq *ifr = (struct ifreq *)data; + struct tun_softc *tp = ifp->if_softc; + struct ifstat *ifs; + int error = 0; + + switch(cmd) { + case SIOCGIFSTATUS: + ifs = (struct ifstat *)data; + mtx_lock(&tp->tun_mtx); + if (tp->tun_pid) + sprintf(ifs->ascii + strlen(ifs->ascii), + "\tOpened by PID %d\n", tp->tun_pid); + mtx_unlock(&tp->tun_mtx); + break; + case SIOCSIFADDR: + error = tuninit(ifp); + TUNDEBUG(ifp, "address set, error=%d\n", error); + break; + case SIOCSIFDSTADDR: + error = tuninit(ifp); + TUNDEBUG(ifp, "destination address set, error=%d\n", error); + break; + case SIOCSIFMTU: + ifp->if_mtu = ifr->ifr_mtu; + TUNDEBUG(ifp, "mtu set\n"); + break; + case SIOCSIFFLAGS: + case SIOCADDMULTI: + case SIOCDELMULTI: + break; + default: + error = EINVAL; + } + return (error); +} + +/* + * tunoutput - queue packets from higher level ready to put out. + */ +static int +tunoutput( + struct ifnet *ifp, + struct mbuf *m0, + struct sockaddr *dst, + struct route *ro) +{ + struct tun_softc *tp = ifp->if_softc; + u_short cached_tun_flags; + int error; + u_int32_t af; + + TUNDEBUG (ifp, "tunoutput\n"); + +#ifdef MAC + error = mac_ifnet_check_transmit(ifp, m0); + if (error) { + m_freem(m0); + return (error); + } +#endif + + /* Could be unlocked read? */ + mtx_lock(&tp->tun_mtx); + cached_tun_flags = tp->tun_flags; + mtx_unlock(&tp->tun_mtx); + if ((cached_tun_flags & TUN_READY) != TUN_READY) { + TUNDEBUG (ifp, "not ready 0%o\n", tp->tun_flags); + m_freem (m0); + return (EHOSTDOWN); + } + + if ((ifp->if_flags & IFF_UP) != IFF_UP) { + m_freem (m0); + return (EHOSTDOWN); + } + + /* BPF writes need to be handled specially. */ + if (dst->sa_family == AF_UNSPEC) { + bcopy(dst->sa_data, &af, sizeof(af)); + dst->sa_family = af; + } + + if (bpf_peers_present(ifp->if_bpf)) { + af = dst->sa_family; + bpf_mtap2(ifp->if_bpf, &af, sizeof(af), m0); + } + + /* prepend sockaddr? 
this may abort if the mbuf allocation fails */ + if (cached_tun_flags & TUN_LMODE) { + /* allocate space for sockaddr */ + M_PREPEND(m0, dst->sa_len, M_DONTWAIT); + + /* if allocation failed drop packet */ + if (m0 == NULL) { + ifp->if_iqdrops++; + ifp->if_oerrors++; + return (ENOBUFS); + } else { + bcopy(dst, m0->m_data, dst->sa_len); + } + } + + if (cached_tun_flags & TUN_IFHEAD) { + /* Prepend the address family */ + M_PREPEND(m0, 4, M_DONTWAIT); + + /* if allocation failed drop packet */ + if (m0 == NULL) { + ifp->if_iqdrops++; + ifp->if_oerrors++; + return (ENOBUFS); + } else + *(u_int32_t *)m0->m_data = htonl(dst->sa_family); + } else { +#ifdef INET + if (dst->sa_family != AF_INET) +#endif + { + m_freem(m0); + return (EAFNOSUPPORT); + } + } + + error = (ifp->if_transmit)(ifp, m0); + if (error) { + ifp->if_collisions++; + return (ENOBUFS); + } + ifp->if_opackets++; + return (0); +} + +/* + * the cdevsw interface is now pretty minimal. + */ +static int +tunioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, struct thread *td) +{ + int error; + struct tun_softc *tp = dev->si_drv1; + struct tuninfo *tunp; + + switch (cmd) { + case TUNSIFINFO: + tunp = (struct tuninfo *)data; + if (tunp->mtu < IF_MINMTU) + return (EINVAL); + if (TUN2IFP(tp)->if_mtu != tunp->mtu) { + error = priv_check(td, PRIV_NET_SETIFMTU); + if (error) + return (error); + } + mtx_lock(&tp->tun_mtx); + TUN2IFP(tp)->if_mtu = tunp->mtu; + TUN2IFP(tp)->if_type = tunp->type; + TUN2IFP(tp)->if_baudrate = tunp->baudrate; + mtx_unlock(&tp->tun_mtx); + break; + case TUNGIFINFO: + tunp = (struct tuninfo *)data; + mtx_lock(&tp->tun_mtx); + tunp->mtu = TUN2IFP(tp)->if_mtu; + tunp->type = TUN2IFP(tp)->if_type; + tunp->baudrate = TUN2IFP(tp)->if_baudrate; + mtx_unlock(&tp->tun_mtx); + break; + case TUNSDEBUG: + tundebug = *(int *)data; + break; + case TUNGDEBUG: + *(int *)data = tundebug; + break; + case TUNSLMODE: + mtx_lock(&tp->tun_mtx); + if (*(int *)data) { + tp->tun_flags |= TUN_LMODE; + tp->tun_flags &= ~TUN_IFHEAD; + } else + tp->tun_flags &= ~TUN_LMODE; + mtx_unlock(&tp->tun_mtx); + break; + case TUNSIFHEAD: + mtx_lock(&tp->tun_mtx); + if (*(int *)data) { + tp->tun_flags |= TUN_IFHEAD; + tp->tun_flags &= ~TUN_LMODE; + } else + tp->tun_flags &= ~TUN_IFHEAD; + mtx_unlock(&tp->tun_mtx); + break; + case TUNGIFHEAD: + mtx_lock(&tp->tun_mtx); + *(int *)data = (tp->tun_flags & TUN_IFHEAD) ? 
1 : 0; + mtx_unlock(&tp->tun_mtx); + break; + case TUNSIFMODE: + /* deny this if UP */ + if (TUN2IFP(tp)->if_flags & IFF_UP) + return(EBUSY); + + switch (*(int *)data & ~IFF_MULTICAST) { + case IFF_POINTOPOINT: + case IFF_BROADCAST: + mtx_lock(&tp->tun_mtx); + TUN2IFP(tp)->if_flags &= + ~(IFF_BROADCAST|IFF_POINTOPOINT|IFF_MULTICAST); + TUN2IFP(tp)->if_flags |= *(int *)data; + mtx_unlock(&tp->tun_mtx); + break; + default: + return(EINVAL); + } + break; + case TUNSIFPID: + mtx_lock(&tp->tun_mtx); + tp->tun_pid = curthread->td_proc->p_pid; + mtx_unlock(&tp->tun_mtx); + break; + case FIONBIO: + break; + case FIOASYNC: + mtx_lock(&tp->tun_mtx); + if (*(int *)data) + tp->tun_flags |= TUN_ASYNC; + else + tp->tun_flags &= ~TUN_ASYNC; + mtx_unlock(&tp->tun_mtx); + break; + case FIONREAD: + if (!IFQ_IS_EMPTY(&TUN2IFP(tp)->if_snd)) { + struct mbuf *mb; + IFQ_LOCK(&TUN2IFP(tp)->if_snd); + IFQ_POLL_NOLOCK(&TUN2IFP(tp)->if_snd, mb); + for (*(int *)data = 0; mb != NULL; mb = mb->m_next) + *(int *)data += mb->m_len; + IFQ_UNLOCK(&TUN2IFP(tp)->if_snd); + } else + *(int *)data = 0; + break; + case FIOSETOWN: + return (fsetown(*(int *)data, &tp->tun_sigio)); + + case FIOGETOWN: + *(int *)data = fgetown(&tp->tun_sigio); + return (0); + + /* This is deprecated, FIOSETOWN should be used instead. */ + case TIOCSPGRP: + return (fsetown(-(*(int *)data), &tp->tun_sigio)); + + /* This is deprecated, FIOGETOWN should be used instead. */ + case TIOCGPGRP: + *(int *)data = -fgetown(&tp->tun_sigio); + return (0); + + default: + return (ENOTTY); + } + return (0); +} + +/* + * The cdevsw read interface - reads a packet at a time, or at + * least as much of a packet as can be read. + */ +static int +tunread(struct cdev *dev, struct uio *uio, int flag) +{ + struct tun_softc *tp = dev->si_drv1; + struct ifnet *ifp = TUN2IFP(tp); + struct mbuf *m; + int error=0, len; + + TUNDEBUG (ifp, "read\n"); + mtx_lock(&tp->tun_mtx); + if ((tp->tun_flags & TUN_READY) != TUN_READY) { + mtx_unlock(&tp->tun_mtx); + TUNDEBUG (ifp, "not ready 0%o\n", tp->tun_flags); + return (EHOSTDOWN); + } + + tp->tun_flags &= ~TUN_RWAIT; + + do { + IFQ_DEQUEUE(&ifp->if_snd, m); + if (m == NULL) { + if (flag & O_NONBLOCK) { + mtx_unlock(&tp->tun_mtx); + return (EWOULDBLOCK); + } + tp->tun_flags |= TUN_RWAIT; + error = mtx_sleep(tp, &tp->tun_mtx, PCATCH | (PZERO + 1), + "tunread", 0); + if (error != 0) { + mtx_unlock(&tp->tun_mtx); + return (error); + } + } + } while (m == NULL); + mtx_unlock(&tp->tun_mtx); + + while (m && uio->uio_resid > 0 && error == 0) { + len = min(uio->uio_resid, m->m_len); + if (len != 0) + error = uiomove(mtod(m, void *), len, uio); + m = m_free(m); + } + + if (m) { + TUNDEBUG(ifp, "Dropping mbuf\n"); + m_freem(m); + } + return (error); +} + +/* + * the cdevsw write interface - an atomic write is a packet - or else! + */ +static int +tunwrite(struct cdev *dev, struct uio *uio, int flag) +{ + struct tun_softc *tp = dev->si_drv1; + struct ifnet *ifp = TUN2IFP(tp); + struct mbuf *m; + int error = 0; + uint32_t family; + int isr; + + TUNDEBUG(ifp, "tunwrite\n"); + + if ((ifp->if_flags & IFF_UP) != IFF_UP) + /* ignore silently */ + return (0); + + if (uio->uio_resid == 0) + return (0); + + if (uio->uio_resid < 0 || uio->uio_resid > TUNMRU) { + TUNDEBUG(ifp, "len=%zd!\n", uio->uio_resid); + return (EIO); + } + + if ((m = m_uiotombuf(uio, M_DONTWAIT, 0, 0, M_PKTHDR)) == NULL) { + ifp->if_ierrors++; + return (error); + } + + m->m_pkthdr.rcvif = ifp; +#ifdef MAC + mac_ifnet_create_mbuf(ifp, m); +#endif + + /* Could be unlocked read? 
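With TUNSIFHEAD set, every packet crossing /dev/tun carries a four-byte address family in network byte order, which tunwrite() below strips before dispatch. A userspace sketch of the writer side (hypothetical fd; writev() keeps header and payload together as one atomic packet write):

```c
#include <stdint.h>
#include <sys/uio.h>
#include <arpa/inet.h>

static ssize_t
tun_write_ifhead(int fd, const void *pkt, size_t len, uint32_t family)
{
	uint32_t af = htonl(family);		/* e.g. AF_INET6 */
	struct iovec iov[2];

	iov[0].iov_base = &af;
	iov[0].iov_len = sizeof(af);
	iov[1].iov_base = (void *)(uintptr_t)pkt; /* writev wants void * */
	iov[1].iov_len = len;
	return (writev(fd, iov, 2));
}
```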
*/ + mtx_lock(&tp->tun_mtx); + if (tp->tun_flags & TUN_IFHEAD) { + mtx_unlock(&tp->tun_mtx); + if (m->m_len < sizeof(family) && + (m = m_pullup(m, sizeof(family))) == NULL) + return (ENOBUFS); + family = ntohl(*mtod(m, u_int32_t *)); + m_adj(m, sizeof(family)); + } else { + mtx_unlock(&tp->tun_mtx); + family = AF_INET; + } + + BPF_MTAP2(ifp, &family, sizeof(family), m); + + switch (family) { +#ifdef INET + case AF_INET: + isr = NETISR_IP; + break; +#endif +#ifdef INET6 + case AF_INET6: + isr = NETISR_IPV6; + break; +#endif +#ifdef IPX + case AF_IPX: + isr = NETISR_IPX; + break; +#endif +#ifdef NETATALK + case AF_APPLETALK: + isr = NETISR_ATALK2; + break; +#endif + default: + m_freem(m); + return (EAFNOSUPPORT); + } + /* First chunk of an mbuf contains good junk */ + if (harvest.point_to_point) + random_harvest(m, 16, 3, 0, RANDOM_NET); + ifp->if_ibytes += m->m_pkthdr.len; + ifp->if_ipackets++; + CURVNET_SET(ifp->if_vnet); + netisr_dispatch(isr, m); + CURVNET_RESTORE(); + return (0); +} + +/* + * tunpoll - the poll interface, this is only useful on reads + * really. The write detect always returns true, write never blocks + * anyway, it either accepts the packet or drops it. + */ +static int +tunpoll(struct cdev *dev, int events, struct thread *td) +{ + struct tun_softc *tp = dev->si_drv1; + struct ifnet *ifp = TUN2IFP(tp); + int revents = 0; + struct mbuf *m; + + TUNDEBUG(ifp, "tunpoll\n"); + + if (events & (POLLIN | POLLRDNORM)) { + IFQ_LOCK(&ifp->if_snd); + IFQ_POLL_NOLOCK(&ifp->if_snd, m); + if (m != NULL) { + TUNDEBUG(ifp, "tunpoll q=%d\n", ifp->if_snd.ifq_len); + revents |= events & (POLLIN | POLLRDNORM); + } else { + TUNDEBUG(ifp, "tunpoll waiting\n"); + selrecord(td, &tp->tun_rsel); + } + IFQ_UNLOCK(&ifp->if_snd); + } + if (events & (POLLOUT | POLLWRNORM)) + revents |= events & (POLLOUT | POLLWRNORM); + + return (revents); +} + +/* + * tunkqfilter - support for the kevent() system call. + */ +static int +tunkqfilter(struct cdev *dev, struct knote *kn) +{ + struct tun_softc *tp = dev->si_drv1; + struct ifnet *ifp = TUN2IFP(tp); + + switch(kn->kn_filter) { + case EVFILT_READ: + TUNDEBUG(ifp, "%s kqfilter: EVFILT_READ, minor = %#x\n", + ifp->if_xname, dev2unit(dev)); + kn->kn_fop = &tun_read_filterops; + break; + + case EVFILT_WRITE: + TUNDEBUG(ifp, "%s kqfilter: EVFILT_WRITE, minor = %#x\n", + ifp->if_xname, dev2unit(dev)); + kn->kn_fop = &tun_write_filterops; + break; + + default: + TUNDEBUG(ifp, "%s kqfilter: invalid filter, minor = %#x\n", + ifp->if_xname, dev2unit(dev)); + return(EINVAL); + } + + kn->kn_hook = tp; + knlist_add(&tp->tun_rsel.si_note, kn, 0); + + return (0); +} + +/* + * Return true of there is data in the interface queue. + */ +static int +tunkqread(struct knote *kn, long hint) +{ + int ret; + struct tun_softc *tp = kn->kn_hook; + struct cdev *dev = tp->tun_dev; + struct ifnet *ifp = TUN2IFP(tp); + + if ((kn->kn_data = ifp->if_snd.ifq_len) > 0) { + TUNDEBUG(ifp, + "%s have data in the queue. Len = %d, minor = %#x\n", + ifp->if_xname, ifp->if_snd.ifq_len, dev2unit(dev)); + ret = 1; + } else { + TUNDEBUG(ifp, + "%s waiting for data, minor = %#x\n", ifp->if_xname, + dev2unit(dev)); + ret = 0; + } + + return (ret); +} + +/* + * Always can write, always return MTU in kn->data. 
+ */ +static int +tunkqwrite(struct knote *kn, long hint) +{ + struct tun_softc *tp = kn->kn_hook; + struct ifnet *ifp = TUN2IFP(tp); + + kn->kn_data = ifp->if_mtu; + + return (1); +} + +static void +tunkqdetach(struct knote *kn) +{ + struct tun_softc *tp = kn->kn_hook; + + knlist_remove(&tp->tun_rsel.si_note, kn, 0); +} diff --git a/freebsd/sys/net/if_tun.h b/freebsd/sys/net/if_tun.h new file mode 100644 index 00000000..29718cda --- /dev/null +++ b/freebsd/sys/net/if_tun.h @@ -0,0 +1,48 @@ +/* $NetBSD: if_tun.h,v 1.5 1994/06/29 06:36:27 cgd Exp $ */ + +/*- + * Copyright (c) 1988, Julian Onions + * Nottingham University 1987. + * + * This source may be freely distributed, however I would be interested + * in any changes that are made. + * + * This driver takes packets off the IP i/f and hands them up to a + * user process to have its wicked way with. This driver has it's + * roots in a similar driver written by Phil Cockcroft (formerly) at + * UCL. This driver is based much more on read/write/select mode of + * operation though. + * + * $FreeBSD$ + */ + +#ifndef _NET_IF_TUN_HH_ +#define _NET_IF_TUN_HH_ + +/* Refer to if_tunvar.h for the softc stuff */ + +/* Maximum transmit packet size (default) */ +#define TUNMTU 1500 + +/* Maximum receive packet size (hard limit) */ +#define TUNMRU 16384 + +struct tuninfo { + int baudrate; /* linespeed */ + short mtu; /* maximum transmission unit */ + u_char type; /* ethernet, tokenring, etc. */ + u_char dummy; /* place holder */ +}; + +/* ioctl's for get/set debug */ +#define TUNSDEBUG _IOW('t', 90, int) +#define TUNGDEBUG _IOR('t', 89, int) +#define TUNSIFINFO _IOW('t', 91, struct tuninfo) +#define TUNGIFINFO _IOR('t', 92, struct tuninfo) +#define TUNSLMODE _IOW('t', 93, int) +#define TUNSIFMODE _IOW('t', 94, int) +#define TUNSIFPID _IO('t', 95) +#define TUNSIFHEAD _IOW('t', 96, int) +#define TUNGIFHEAD _IOR('t', 97, int) + +#endif /* !_NET_IF_TUN_HH_ */ diff --git a/freebsd/sys/net/if_types.h b/freebsd/sys/net/if_types.h new file mode 100644 index 00000000..1d4f2b2a --- /dev/null +++ b/freebsd/sys/net/if_types.h @@ -0,0 +1,2 @@ +#include +#include diff --git a/freebsd/sys/net/if_var.h b/freebsd/sys/net/if_var.h new file mode 100644 index 00000000..913d62a9 --- /dev/null +++ b/freebsd/sys/net/if_var.h @@ -0,0 +1,904 @@ +/*- + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * From: @(#)if.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#ifndef _NET_IF_VAR_HH_ +#define _NET_IF_VAR_HH_ + +/* + * Structures defining a network interface, providing a packet + * transport mechanism (ala level 0 of the PUP protocols). + * + * Each interface accepts output datagrams of a specified maximum + * length, and provides higher level routines with input datagrams + * received from its medium. + * + * Output occurs when the routine if_output is called, with three parameters: + * (*ifp->if_output)(ifp, m, dst, rt) + * Here m is the mbuf chain to be sent and dst is the destination address. + * The output routine encapsulates the supplied datagram if necessary, + * and then transmits it on its medium. + * + * On input, each interface unwraps the data received by it, and either + * places it on the input queue of an internetwork datagram routine + * and posts the associated software interrupt, or passes the datagram to a raw + * packet input routine. + * + * Routines exist for locating interfaces by their addresses + * or for locating an interface on a certain network, as well as more general + * routing and gateway routines maintaining information used to locate + * interfaces. These routines live in the files if.c and route.c + */ + +#ifdef __STDC__ +/* + * Forward structure declarations for function prototypes [sic]. + */ +struct mbuf; +struct thread; +struct rtentry; +struct rt_addrinfo; +struct socket; +struct ether_header; +struct carp_if; +struct ifvlantrunk; +struct route; +struct vnet; +#endif + +#include /* get TAILQ macros */ + +#ifdef _KERNEL +#include +#include +#include +#include +#endif /* _KERNEL */ +#include /* XXX */ +#include /* XXX */ +#include /* XXX */ +#include /* XXX */ +#include /* XXX */ +#include + +#define IF_DUNIT_NONE -1 + +#include + +TAILQ_HEAD(ifnethead, ifnet); /* we use TAILQs so that the order of */ +TAILQ_HEAD(ifaddrhead, ifaddr); /* instantiation is preserved in the list */ +TAILQ_HEAD(ifprefixhead, ifprefix); +TAILQ_HEAD(ifmultihead, ifmultiaddr); +TAILQ_HEAD(ifgrouphead, ifg_group); + +/* + * Structure defining a queue for a network interface. + */ +struct ifqueue { + struct mbuf *ifq_head; + struct mbuf *ifq_tail; + int ifq_len; + int ifq_maxlen; + int ifq_drops; + struct mtx ifq_mtx; +}; + +/* + * Structure defining a network interface. + * + * (Would like to call this struct ``if'', but C isn't PL/1.) + */ + +struct ifnet { + void *if_softc; /* pointer to driver state */ + void *if_l2com; /* pointer to protocol bits */ + struct vnet *if_vnet; /* pointer to network stack instance */ + TAILQ_ENTRY(ifnet) if_link; /* all struct ifnets are chained */ + char if_xname[IFNAMSIZ]; /* external name (name + unit) */ + const char *if_dname; /* driver name */ + int if_dunit; /* unit or IF_DUNIT_NONE */ + u_int if_refcount; /* reference count */ + struct ifaddrhead if_addrhead; /* linked list of addresses per if */ + /* + * if_addrhead is the list of all addresses associated to + * an interface. 
+ * Some code in the kernel assumes that first element + * of the list has type AF_LINK, and contains sockaddr_dl + * addresses which store the link-level address and the name + * of the interface. + * However, access to the AF_LINK address through this + * field is deprecated. Use if_addr or ifaddr_byindex() instead. + */ + int if_pcount; /* number of promiscuous listeners */ + struct carp_if *if_carp; /* carp interface structure */ + struct bpf_if *if_bpf; /* packet filter structure */ + u_short if_index; /* numeric abbreviation for this if */ + short if_timer; /* time 'til if_watchdog called */ + struct ifvlantrunk *if_vlantrunk; /* pointer to 802.1q data */ + int if_flags; /* up/down, broadcast, etc. */ + int if_capabilities; /* interface features & capabilities */ + int if_capenable; /* enabled features & capabilities */ + void *if_linkmib; /* link-type-specific MIB data */ + size_t if_linkmiblen; /* length of above data */ + struct if_data if_data; + struct ifmultihead if_multiaddrs; /* multicast addresses configured */ + int if_amcount; /* number of all-multicast requests */ +/* procedure handles */ + int (*if_output) /* output routine (enqueue) */ + (struct ifnet *, struct mbuf *, struct sockaddr *, + struct route *); + void (*if_input) /* input routine (from h/w driver) */ + (struct ifnet *, struct mbuf *); + void (*if_start) /* initiate output routine */ + (struct ifnet *); + int (*if_ioctl) /* ioctl routine */ + (struct ifnet *, u_long, caddr_t); + void (*if_watchdog) /* timer routine */ + (struct ifnet *); + void (*if_init) /* Init routine */ + (void *); + int (*if_resolvemulti) /* validate/resolve multicast */ + (struct ifnet *, struct sockaddr **, struct sockaddr *); + void (*if_qflush) /* flush any queues */ + (struct ifnet *); + int (*if_transmit) /* initiate output routine */ + (struct ifnet *, struct mbuf *); + void (*if_reassign) /* reassign to vnet routine */ + (struct ifnet *, struct vnet *, char *); + struct vnet *if_home_vnet; /* where this ifnet originates from */ + struct ifaddr *if_addr; /* pointer to link-level address */ + void *if_llsoftc; /* link layer softc */ + int if_drv_flags; /* driver-managed status flags */ + struct ifaltq if_snd; /* output queue (includes altq) */ + const u_int8_t *if_broadcastaddr; /* linklevel broadcast bytestring */ + + void *if_bridge; /* bridge glue */ + + struct label *if_label; /* interface MAC label */ + + /* these are only used by IPv6 */ + struct ifprefixhead if_prefixhead; /* list of prefixes per if */ + void *if_afdata[AF_MAX]; + int if_afdata_initialized; + struct rwlock if_afdata_lock; + struct task if_linktask; /* task for link change events */ + struct mtx if_addr_mtx; /* mutex to protect address lists */ + + LIST_ENTRY(ifnet) if_clones; /* interfaces of a cloner */ + TAILQ_HEAD(, ifg_list) if_groups; /* linked list of groups per if */ + /* protected by if_addr_mtx */ + void *if_pf_kif; + void *if_lagg; /* lagg glue */ + u_char if_alloctype; /* if_type at time of allocation */ + + /* + * Spare fields are added so that we can modify sensitive data + * structures without changing the kernel binary interface, and must + * be used with care where binary compatibility is required. + */ + char if_cspare[3]; + char *if_description; /* interface description */ + void *if_pspare[7]; + int if_ispare[4]; +}; + +typedef void if_init_f_t(void *); + +/* + * XXX These aliases are terribly dangerous because they could apply + * to anything. 
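+ * Because they are plain object-like macros, any member named
+ * if_mtu, if_baudrate, etc. in any structure compiled after this
+ * header is silently rewritten to the if_data form; nothing limits
+ * the substitution to struct ifnet.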
+ */ +#define if_mtu if_data.ifi_mtu +#define if_type if_data.ifi_type +#define if_physical if_data.ifi_physical +#define if_addrlen if_data.ifi_addrlen +#define if_hdrlen if_data.ifi_hdrlen +#define if_metric if_data.ifi_metric +#define if_link_state if_data.ifi_link_state +#define if_baudrate if_data.ifi_baudrate +#define if_hwassist if_data.ifi_hwassist +#define if_ipackets if_data.ifi_ipackets +#define if_ierrors if_data.ifi_ierrors +#define if_opackets if_data.ifi_opackets +#define if_oerrors if_data.ifi_oerrors +#define if_collisions if_data.ifi_collisions +#define if_ibytes if_data.ifi_ibytes +#define if_obytes if_data.ifi_obytes +#define if_imcasts if_data.ifi_imcasts +#define if_omcasts if_data.ifi_omcasts +#define if_iqdrops if_data.ifi_iqdrops +#define if_noproto if_data.ifi_noproto +#define if_lastchange if_data.ifi_lastchange + +/* for compatibility with other BSDs */ +#define if_addrlist if_addrhead +#define if_list if_link +#define if_name(ifp) ((ifp)->if_xname) + +/* + * Locks for address lists on the network interface. + */ +#define IF_ADDR_LOCK_INIT(if) mtx_init(&(if)->if_addr_mtx, \ + "if_addr_mtx", NULL, MTX_DEF) +#define IF_ADDR_LOCK_DESTROY(if) mtx_destroy(&(if)->if_addr_mtx) +#define IF_ADDR_LOCK(if) mtx_lock(&(if)->if_addr_mtx) +#define IF_ADDR_UNLOCK(if) mtx_unlock(&(if)->if_addr_mtx) +#define IF_ADDR_LOCK_ASSERT(if) mtx_assert(&(if)->if_addr_mtx, MA_OWNED) + +/* + * Function variations on locking macros intended to be used by loadable + * kernel modules in order to divorce them from the internals of address list + * locking. + */ +void if_addr_rlock(struct ifnet *ifp); /* if_addrhead */ +void if_addr_runlock(struct ifnet *ifp); /* if_addrhead */ +void if_maddr_rlock(struct ifnet *ifp); /* if_multiaddrs */ +void if_maddr_runlock(struct ifnet *ifp); /* if_multiaddrs */ + +/* + * Output queues (ifp->if_snd) and slow device input queues (*ifp->if_slowq) + * are queues of messages stored on ifqueue structures + * (defined above). Entries are added to and deleted from these structures + * by these macros, which should be called with ipl raised to splimp(). 
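+ * (The spl reference is historical; in this code the protection is
+ * the ifq_mtx mutex taken by the IF_LOCK()/IF_UNLOCK() wrappers
+ * below.  A minimal usage sketch:
+ *
+ *     IF_ENQUEUE(&ifp->if_snd, m);    locked enqueue at the tail
+ *     IF_DEQUEUE(&ifp->if_snd, m);    locked dequeue; m == NULL
+ *                                     when the queue is empty
+ *
+ * The leading-underscore variants do the same work but assume the
+ * caller already holds IF_LOCK() on the queue.)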
+ */ +#define IF_LOCK(ifq) mtx_lock(&(ifq)->ifq_mtx) +#define IF_UNLOCK(ifq) mtx_unlock(&(ifq)->ifq_mtx) +#define IF_LOCK_ASSERT(ifq) mtx_assert(&(ifq)->ifq_mtx, MA_OWNED) +#define _IF_QFULL(ifq) ((ifq)->ifq_len >= (ifq)->ifq_maxlen) +#define _IF_DROP(ifq) ((ifq)->ifq_drops++) +#define _IF_QLEN(ifq) ((ifq)->ifq_len) + +#define _IF_ENQUEUE(ifq, m) do { \ + (m)->m_nextpkt = NULL; \ + if ((ifq)->ifq_tail == NULL) \ + (ifq)->ifq_head = m; \ + else \ + (ifq)->ifq_tail->m_nextpkt = m; \ + (ifq)->ifq_tail = m; \ + (ifq)->ifq_len++; \ +} while (0) + +#define IF_ENQUEUE(ifq, m) do { \ + IF_LOCK(ifq); \ + _IF_ENQUEUE(ifq, m); \ + IF_UNLOCK(ifq); \ +} while (0) + +#define _IF_PREPEND(ifq, m) do { \ + (m)->m_nextpkt = (ifq)->ifq_head; \ + if ((ifq)->ifq_tail == NULL) \ + (ifq)->ifq_tail = (m); \ + (ifq)->ifq_head = (m); \ + (ifq)->ifq_len++; \ +} while (0) + +#define IF_PREPEND(ifq, m) do { \ + IF_LOCK(ifq); \ + _IF_PREPEND(ifq, m); \ + IF_UNLOCK(ifq); \ +} while (0) + +#define _IF_DEQUEUE(ifq, m) do { \ + (m) = (ifq)->ifq_head; \ + if (m) { \ + if (((ifq)->ifq_head = (m)->m_nextpkt) == NULL) \ + (ifq)->ifq_tail = NULL; \ + (m)->m_nextpkt = NULL; \ + (ifq)->ifq_len--; \ + } \ +} while (0) + +#define IF_DEQUEUE(ifq, m) do { \ + IF_LOCK(ifq); \ + _IF_DEQUEUE(ifq, m); \ + IF_UNLOCK(ifq); \ +} while (0) + +#define _IF_POLL(ifq, m) ((m) = (ifq)->ifq_head) +#define IF_POLL(ifq, m) _IF_POLL(ifq, m) + +#define _IF_DRAIN(ifq) do { \ + struct mbuf *m; \ + for (;;) { \ + _IF_DEQUEUE(ifq, m); \ + if (m == NULL) \ + break; \ + m_freem(m); \ + } \ +} while (0) + +#define IF_DRAIN(ifq) do { \ + IF_LOCK(ifq); \ + _IF_DRAIN(ifq); \ + IF_UNLOCK(ifq); \ +} while(0) + +#ifdef _KERNEL +/* interface link layer address change event */ +typedef void (*iflladdr_event_handler_t)(void *, struct ifnet *); +EVENTHANDLER_DECLARE(iflladdr_event, iflladdr_event_handler_t); +/* interface address change event */ +typedef void (*ifaddr_event_handler_t)(void *, struct ifnet *); +EVENTHANDLER_DECLARE(ifaddr_event, ifaddr_event_handler_t); +/* new interface arrival event */ +typedef void (*ifnet_arrival_event_handler_t)(void *, struct ifnet *); +EVENTHANDLER_DECLARE(ifnet_arrival_event, ifnet_arrival_event_handler_t); +/* interface departure event */ +typedef void (*ifnet_departure_event_handler_t)(void *, struct ifnet *); +EVENTHANDLER_DECLARE(ifnet_departure_event, ifnet_departure_event_handler_t); + +/* + * interface groups + */ +struct ifg_group { + char ifg_group[IFNAMSIZ]; + u_int ifg_refcnt; + void *ifg_pf_kif; + TAILQ_HEAD(, ifg_member) ifg_members; + TAILQ_ENTRY(ifg_group) ifg_next; +}; + +struct ifg_member { + TAILQ_ENTRY(ifg_member) ifgm_next; + struct ifnet *ifgm_ifp; +}; + +struct ifg_list { + struct ifg_group *ifgl_group; + TAILQ_ENTRY(ifg_list) ifgl_next; +}; + +/* group attach event */ +typedef void (*group_attach_event_handler_t)(void *, struct ifg_group *); +EVENTHANDLER_DECLARE(group_attach_event, group_attach_event_handler_t); +/* group detach event */ +typedef void (*group_detach_event_handler_t)(void *, struct ifg_group *); +EVENTHANDLER_DECLARE(group_detach_event, group_detach_event_handler_t); +/* group change event */ +typedef void (*group_change_event_handler_t)(void *, const char *); +EVENTHANDLER_DECLARE(group_change_event, group_change_event_handler_t); + +#define IF_AFDATA_LOCK_INIT(ifp) \ + rw_init(&(ifp)->if_afdata_lock, "if_afdata") + +#define IF_AFDATA_WLOCK(ifp) rw_wlock(&(ifp)->if_afdata_lock) +#define IF_AFDATA_RLOCK(ifp) rw_rlock(&(ifp)->if_afdata_lock) +#define IF_AFDATA_WUNLOCK(ifp) 
rw_wunlock(&(ifp)->if_afdata_lock) +#define IF_AFDATA_RUNLOCK(ifp) rw_runlock(&(ifp)->if_afdata_lock) +#define IF_AFDATA_LOCK(ifp) IF_AFDATA_WLOCK(ifp) +#define IF_AFDATA_UNLOCK(ifp) IF_AFDATA_WUNLOCK(ifp) +#define IF_AFDATA_TRYLOCK(ifp) rw_try_wlock(&(ifp)->if_afdata_lock) +#define IF_AFDATA_DESTROY(ifp) rw_destroy(&(ifp)->if_afdata_lock) + +#define IF_AFDATA_LOCK_ASSERT(ifp) rw_assert(&(ifp)->if_afdata_lock, RA_LOCKED) +#define IF_AFDATA_UNLOCK_ASSERT(ifp) rw_assert(&(ifp)->if_afdata_lock, RA_UNLOCKED) + +int if_handoff(struct ifqueue *ifq, struct mbuf *m, struct ifnet *ifp, + int adjust); +#define IF_HANDOFF(ifq, m, ifp) \ + if_handoff((struct ifqueue *)ifq, m, ifp, 0) +#define IF_HANDOFF_ADJ(ifq, m, ifp, adj) \ + if_handoff((struct ifqueue *)ifq, m, ifp, adj) + +void if_start(struct ifnet *); + +#define IFQ_ENQUEUE(ifq, m, err) \ +do { \ + IF_LOCK(ifq); \ + if (ALTQ_IS_ENABLED(ifq)) \ + ALTQ_ENQUEUE(ifq, m, NULL, err); \ + else { \ + if (_IF_QFULL(ifq)) { \ + m_freem(m); \ + (err) = ENOBUFS; \ + } else { \ + _IF_ENQUEUE(ifq, m); \ + (err) = 0; \ + } \ + } \ + if (err) \ + (ifq)->ifq_drops++; \ + IF_UNLOCK(ifq); \ +} while (0) + +#define IFQ_DEQUEUE_NOLOCK(ifq, m) \ +do { \ + if (TBR_IS_ENABLED(ifq)) \ + (m) = tbr_dequeue_ptr(ifq, ALTDQ_REMOVE); \ + else if (ALTQ_IS_ENABLED(ifq)) \ + ALTQ_DEQUEUE(ifq, m); \ + else \ + _IF_DEQUEUE(ifq, m); \ +} while (0) + +#define IFQ_DEQUEUE(ifq, m) \ +do { \ + IF_LOCK(ifq); \ + IFQ_DEQUEUE_NOLOCK(ifq, m); \ + IF_UNLOCK(ifq); \ +} while (0) + +#define IFQ_POLL_NOLOCK(ifq, m) \ +do { \ + if (TBR_IS_ENABLED(ifq)) \ + (m) = tbr_dequeue_ptr(ifq, ALTDQ_POLL); \ + else if (ALTQ_IS_ENABLED(ifq)) \ + ALTQ_POLL(ifq, m); \ + else \ + _IF_POLL(ifq, m); \ +} while (0) + +#define IFQ_POLL(ifq, m) \ +do { \ + IF_LOCK(ifq); \ + IFQ_POLL_NOLOCK(ifq, m); \ + IF_UNLOCK(ifq); \ +} while (0) + +#define IFQ_PURGE_NOLOCK(ifq) \ +do { \ + if (ALTQ_IS_ENABLED(ifq)) { \ + ALTQ_PURGE(ifq); \ + } else \ + _IF_DRAIN(ifq); \ +} while (0) + +#define IFQ_PURGE(ifq) \ +do { \ + IF_LOCK(ifq); \ + IFQ_PURGE_NOLOCK(ifq); \ + IF_UNLOCK(ifq); \ +} while (0) + +#define IFQ_SET_READY(ifq) \ + do { ((ifq)->altq_flags |= ALTQF_READY); } while (0) + +#define IFQ_LOCK(ifq) IF_LOCK(ifq) +#define IFQ_UNLOCK(ifq) IF_UNLOCK(ifq) +#define IFQ_LOCK_ASSERT(ifq) IF_LOCK_ASSERT(ifq) +#define IFQ_IS_EMPTY(ifq) ((ifq)->ifq_len == 0) +#define IFQ_INC_LEN(ifq) ((ifq)->ifq_len++) +#define IFQ_DEC_LEN(ifq) (--(ifq)->ifq_len) +#define IFQ_INC_DROPS(ifq) ((ifq)->ifq_drops++) +#define IFQ_SET_MAXLEN(ifq, len) ((ifq)->ifq_maxlen = (len)) + +/* + * The IFF_DRV_OACTIVE test should really occur in the device driver, not in + * the handoff logic, as that flag is locked by the device driver. 
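+ *
+ * Typical use from a link-layer output path, as a sketch:
+ *
+ *     int error;
+ *     IFQ_HANDOFF(ifp, m, error);
+ *
+ * On success this enqueues m on ifp->if_snd, charges if_obytes and
+ * if_omcasts, and kicks if_start(ifp) unless IFF_DRV_OACTIVE is
+ * already set; on failure IFQ_ENQUEUE() has already freed the mbuf.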
+ */ +#define IFQ_HANDOFF_ADJ(ifp, m, adj, err) \ +do { \ + int len; \ + short mflags; \ + \ + len = (m)->m_pkthdr.len; \ + mflags = (m)->m_flags; \ + IFQ_ENQUEUE(&(ifp)->if_snd, m, err); \ + if ((err) == 0) { \ + (ifp)->if_obytes += len + (adj); \ + if (mflags & M_MCAST) \ + (ifp)->if_omcasts++; \ + if (((ifp)->if_drv_flags & IFF_DRV_OACTIVE) == 0) \ + if_start(ifp); \ + } \ +} while (0) + +#define IFQ_HANDOFF(ifp, m, err) \ + IFQ_HANDOFF_ADJ(ifp, m, 0, err) + +#define IFQ_DRV_DEQUEUE(ifq, m) \ +do { \ + (m) = (ifq)->ifq_drv_head; \ + if (m) { \ + if (((ifq)->ifq_drv_head = (m)->m_nextpkt) == NULL) \ + (ifq)->ifq_drv_tail = NULL; \ + (m)->m_nextpkt = NULL; \ + (ifq)->ifq_drv_len--; \ + } else { \ + IFQ_LOCK(ifq); \ + IFQ_DEQUEUE_NOLOCK(ifq, m); \ + while ((ifq)->ifq_drv_len < (ifq)->ifq_drv_maxlen) { \ + struct mbuf *m0; \ + IFQ_DEQUEUE_NOLOCK(ifq, m0); \ + if (m0 == NULL) \ + break; \ + m0->m_nextpkt = NULL; \ + if ((ifq)->ifq_drv_tail == NULL) \ + (ifq)->ifq_drv_head = m0; \ + else \ + (ifq)->ifq_drv_tail->m_nextpkt = m0; \ + (ifq)->ifq_drv_tail = m0; \ + (ifq)->ifq_drv_len++; \ + } \ + IFQ_UNLOCK(ifq); \ + } \ +} while (0) + +#define IFQ_DRV_PREPEND(ifq, m) \ +do { \ + (m)->m_nextpkt = (ifq)->ifq_drv_head; \ + if ((ifq)->ifq_drv_tail == NULL) \ + (ifq)->ifq_drv_tail = (m); \ + (ifq)->ifq_drv_head = (m); \ + (ifq)->ifq_drv_len++; \ +} while (0) + +#define IFQ_DRV_IS_EMPTY(ifq) \ + (((ifq)->ifq_drv_len == 0) && ((ifq)->ifq_len == 0)) + +#define IFQ_DRV_PURGE(ifq) \ +do { \ + struct mbuf *m, *n = (ifq)->ifq_drv_head; \ + while((m = n) != NULL) { \ + n = m->m_nextpkt; \ + m_freem(m); \ + } \ + (ifq)->ifq_drv_head = (ifq)->ifq_drv_tail = NULL; \ + (ifq)->ifq_drv_len = 0; \ + IFQ_PURGE(ifq); \ +} while (0) + +#ifdef _KERNEL +static __inline void +drbr_stats_update(struct ifnet *ifp, int len, int mflags) +{ +#ifndef NO_SLOW_STATS + ifp->if_obytes += len; + if (mflags & M_MCAST) + ifp->if_omcasts++; +#endif +} + +static __inline int +drbr_enqueue(struct ifnet *ifp, struct buf_ring *br, struct mbuf *m) +{ + int error = 0; + int len = m->m_pkthdr.len; + int mflags = m->m_flags; + +#ifdef ALTQ + if (ALTQ_IS_ENABLED(&ifp->if_snd)) { + IFQ_ENQUEUE(&ifp->if_snd, m, error); + return (error); + } +#endif + if ((error = buf_ring_enqueue_bytes(br, m, len)) == ENOBUFS) { + br->br_drops++; + m_freem(m); + } else + drbr_stats_update(ifp, len, mflags); + + return (error); +} + +static __inline void +drbr_flush(struct ifnet *ifp, struct buf_ring *br) +{ + struct mbuf *m; + +#ifdef ALTQ + if (ifp != NULL && ALTQ_IS_ENABLED(&ifp->if_snd)) + IFQ_PURGE(&ifp->if_snd); +#endif + while ((m = buf_ring_dequeue_sc(br)) != NULL) + m_freem(m); +} + +static __inline void +drbr_free(struct buf_ring *br, struct malloc_type *type) +{ + + drbr_flush(NULL, br); + buf_ring_free(br, type); +} + +static __inline struct mbuf * +drbr_dequeue(struct ifnet *ifp, struct buf_ring *br) +{ +#ifdef ALTQ + struct mbuf *m; + + if (ALTQ_IS_ENABLED(&ifp->if_snd)) { + IFQ_DEQUEUE(&ifp->if_snd, m); + return (m); + } +#endif + return (buf_ring_dequeue_sc(br)); +} + +static __inline struct mbuf * +drbr_dequeue_cond(struct ifnet *ifp, struct buf_ring *br, + int (*func) (struct mbuf *, void *), void *arg) +{ + struct mbuf *m; +#ifdef ALTQ + if (ALTQ_IS_ENABLED(&ifp->if_snd)) { + IFQ_LOCK(&ifp->if_snd); + IFQ_POLL_NOLOCK(&ifp->if_snd, m); + if (m != NULL && func(m, arg) == 0) { + IFQ_UNLOCK(&ifp->if_snd); + return (NULL); + } + IFQ_DEQUEUE_NOLOCK(&ifp->if_snd, m); + IFQ_UNLOCK(&ifp->if_snd); + return (m); + } +#endif + m = buf_ring_peek(br); + if 
(m == NULL || func(m, arg) == 0) + return (NULL); + + return (buf_ring_dequeue_sc(br)); +} + +static __inline int +drbr_empty(struct ifnet *ifp, struct buf_ring *br) +{ +#ifdef ALTQ + if (ALTQ_IS_ENABLED(&ifp->if_snd)) + return (IFQ_IS_EMPTY(&ifp->if_snd)); +#endif + return (buf_ring_empty(br)); +} + +static __inline int +drbr_needs_enqueue(struct ifnet *ifp, struct buf_ring *br) +{ +#ifdef ALTQ + if (ALTQ_IS_ENABLED(&ifp->if_snd)) + return (1); +#endif + return (!buf_ring_empty(br)); +} + +static __inline int +drbr_inuse(struct ifnet *ifp, struct buf_ring *br) +{ +#ifdef ALTQ + if (ALTQ_IS_ENABLED(&ifp->if_snd)) + return (ifp->if_snd.ifq_len); +#endif + return (buf_ring_count(br)); +} +#endif +/* + * 72 was chosen below because it is the size of a TCP/IP + * header (40) + the minimum mss (32). + */ +#define IF_MINMTU 72 +#define IF_MAXMTU 65535 + +#endif /* _KERNEL */ + +/* + * The ifaddr structure contains information about one address + * of an interface. They are maintained by the different address families, + * are allocated and attached when an address is set, and are linked + * together so all addresses for an interface can be located. + * + * NOTE: a 'struct ifaddr' is always at the beginning of a larger + * chunk of malloc'ed memory, where we store the three addresses + * (ifa_addr, ifa_dstaddr and ifa_netmask) referenced here. + */ +struct ifaddr { + struct sockaddr *ifa_addr; /* address of interface */ + struct sockaddr *ifa_dstaddr; /* other end of p-to-p link */ +#define ifa_broadaddr ifa_dstaddr /* broadcast address interface */ + struct sockaddr *ifa_netmask; /* used to determine subnet */ + struct if_data if_data; /* not all members are meaningful */ + struct ifnet *ifa_ifp; /* back-pointer to interface */ + TAILQ_ENTRY(ifaddr) ifa_link; /* queue macro glue */ + void (*ifa_rtrequest) /* check or clean routes (+ or -)'d */ + (int, struct rtentry *, struct rt_addrinfo *); + u_short ifa_flags; /* mostly rt_flags for cloning */ + u_int ifa_refcnt; /* references to this structure */ + int ifa_metric; /* cost of going out this interface */ + int (*ifa_claim_addr) /* check if an addr goes to this if */ + (struct ifaddr *, struct sockaddr *); + struct mtx ifa_mtx; +}; +#define IFA_ROUTE RTF_UP /* route installed */ +#define IFA_RTSELF RTF_HOST /* loopback route to self installed */ + +/* for compatibility with other BSDs */ +#define ifa_list ifa_link + +#ifdef _KERNEL +#define IFA_LOCK(ifa) mtx_lock(&(ifa)->ifa_mtx) +#define IFA_UNLOCK(ifa) mtx_unlock(&(ifa)->ifa_mtx) + +void ifa_free(struct ifaddr *ifa); +void ifa_init(struct ifaddr *ifa); +void ifa_ref(struct ifaddr *ifa); +#endif + +/* + * The prefix structure contains information about one prefix + * of an interface. They are maintained by the different address families, + * are allocated and attached when a prefix or an address is set, + * and are linked together so all prefixes for an interface can be located. + */ +struct ifprefix { + struct sockaddr *ifpr_prefix; /* prefix of interface */ + struct ifnet *ifpr_ifp; /* back-pointer to interface */ + TAILQ_ENTRY(ifprefix) ifpr_list; /* queue macro glue */ + u_char ifpr_plen; /* prefix length in bits */ + u_char ifpr_type; /* protocol dependent prefix type */ +}; + +/* + * Multicast address structure. This is analogous to the ifaddr + * structure except that it keeps track of multicast addresses. 
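+ * Entries are reference counted and are created and released with
+ * if_addmulti()/if_delmulti() rather than by hand; a sketch of a
+ * protocol joining a link-layer group:
+ *
+ *     struct ifmultiaddr *ifma;
+ *     error = if_addmulti(ifp, (struct sockaddr *)&sdl, &ifma);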
+ */ +struct ifmultiaddr { + TAILQ_ENTRY(ifmultiaddr) ifma_link; /* queue macro glue */ + struct sockaddr *ifma_addr; /* address this membership is for */ + struct sockaddr *ifma_lladdr; /* link-layer translation, if any */ + struct ifnet *ifma_ifp; /* back-pointer to interface */ + u_int ifma_refcount; /* reference count */ + void *ifma_protospec; /* protocol-specific state, if any */ + struct ifmultiaddr *ifma_llifma; /* pointer to ifma for ifma_lladdr */ +}; + +#ifdef _KERNEL + +extern struct rwlock ifnet_rwlock; +extern struct sx ifnet_sxlock; + +#define IFNET_LOCK_INIT() do { \ + rw_init_flags(&ifnet_rwlock, "ifnet_rw", RW_RECURSE); \ + sx_init_flags(&ifnet_sxlock, "ifnet_sx", SX_RECURSE); \ +} while(0) + +#define IFNET_WLOCK() do { \ + sx_xlock(&ifnet_sxlock); \ + rw_wlock(&ifnet_rwlock); \ +} while (0) + +#define IFNET_WUNLOCK() do { \ + rw_wunlock(&ifnet_rwlock); \ + sx_xunlock(&ifnet_sxlock); \ +} while (0) + +/* + * To assert the ifnet lock, you must know not only whether it's for read or + * write, but also whether it was acquired with sleep support or not. + */ +#define IFNET_RLOCK_ASSERT() sx_assert(&ifnet_sxlock, SA_SLOCKED) +#define IFNET_RLOCK_NOSLEEP_ASSERT() rw_assert(&ifnet_rwlock, RA_RLOCKED) +#define IFNET_WLOCK_ASSERT() do { \ + sx_assert(&ifnet_sxlock, SA_XLOCKED); \ + rw_assert(&ifnet_rwlock, RA_WLOCKED); \ +} while (0) + +#define IFNET_RLOCK() sx_slock(&ifnet_sxlock) +#define IFNET_RLOCK_NOSLEEP() rw_rlock(&ifnet_rwlock) +#define IFNET_RUNLOCK() sx_sunlock(&ifnet_sxlock) +#define IFNET_RUNLOCK_NOSLEEP() rw_runlock(&ifnet_rwlock) + +/* + * Look up an ifnet given its index; the _ref variant also acquires a + * reference that must be freed using if_rele(). It is almost always a bug + * to call ifnet_byindex() instead if ifnet_byindex_ref(). + */ +struct ifnet *ifnet_byindex(u_short idx); +struct ifnet *ifnet_byindex_locked(u_short idx); +struct ifnet *ifnet_byindex_ref(u_short idx); + +/* + * Given the index, ifaddr_byindex() returns the one and only + * link-level ifaddr for the interface. You are not supposed to use + * it to traverse the list of addresses associated to the interface. + */ +struct ifaddr *ifaddr_byindex(u_short idx); + +VNET_DECLARE(struct ifnethead, ifnet); +VNET_DECLARE(struct ifgrouphead, ifg_head); +VNET_DECLARE(int, if_index); +VNET_DECLARE(struct ifnet *, loif); /* first loopback interface */ +VNET_DECLARE(int, useloopback); + +#define V_ifnet VNET(ifnet) +#define V_ifg_head VNET(ifg_head) +#define V_if_index VNET(if_index) +#define V_loif VNET(loif) +#define V_useloopback VNET(useloopback) + +extern int ifqmaxlen; + +int if_addgroup(struct ifnet *, const char *); +int if_delgroup(struct ifnet *, const char *); +int if_addmulti(struct ifnet *, struct sockaddr *, struct ifmultiaddr **); +int if_allmulti(struct ifnet *, int); +struct ifnet* if_alloc(u_char); +void if_attach(struct ifnet *); +void if_dead(struct ifnet *); +int if_delmulti(struct ifnet *, struct sockaddr *); +void if_delmulti_ifma(struct ifmultiaddr *); +void if_detach(struct ifnet *); +void if_vmove(struct ifnet *, struct vnet *); +void if_purgeaddrs(struct ifnet *); +void if_delallmulti(struct ifnet *); +void if_down(struct ifnet *); +struct ifmultiaddr * + if_findmulti(struct ifnet *, struct sockaddr *); +void if_free(struct ifnet *); +void if_free_type(struct ifnet *, u_char); +void if_initname(struct ifnet *, const char *, int); +void if_link_state_change(struct ifnet *, int); +int if_printf(struct ifnet *, const char *, ...) 
__printflike(2, 3); +void if_qflush(struct ifnet *); +void if_ref(struct ifnet *); +void if_rele(struct ifnet *); +int if_setlladdr(struct ifnet *, const u_char *, int); +void if_up(struct ifnet *); +int ifioctl(struct socket *, u_long, caddr_t, struct thread *); +int ifpromisc(struct ifnet *, int); +struct ifnet *ifunit(const char *); +struct ifnet *ifunit_ref(const char *); + +void ifq_init(struct ifaltq *, struct ifnet *ifp); +void ifq_delete(struct ifaltq *); + +int ifa_add_loopback_route(struct ifaddr *, struct sockaddr *); +int ifa_del_loopback_route(struct ifaddr *, struct sockaddr *); + +struct ifaddr *ifa_ifwithaddr(struct sockaddr *); +int ifa_ifwithaddr_check(struct sockaddr *); +struct ifaddr *ifa_ifwithbroadaddr(struct sockaddr *); +struct ifaddr *ifa_ifwithdstaddr(struct sockaddr *); +struct ifaddr *ifa_ifwithnet(struct sockaddr *, int); +struct ifaddr *ifa_ifwithroute(int, struct sockaddr *, struct sockaddr *); +struct ifaddr *ifa_ifwithroute_fib(int, struct sockaddr *, struct sockaddr *, u_int); + +struct ifaddr *ifaof_ifpforaddr(struct sockaddr *, struct ifnet *); + +int if_simloop(struct ifnet *ifp, struct mbuf *m, int af, int hlen); + +typedef void *if_com_alloc_t(u_char type, struct ifnet *ifp); +typedef void if_com_free_t(void *com, u_char type); +void if_register_com_alloc(u_char type, if_com_alloc_t *a, if_com_free_t *f); +void if_deregister_com_alloc(u_char type); + +#define IF_LLADDR(ifp) \ + LLADDR((struct sockaddr_dl *)((ifp)->if_addr->ifa_addr)) + +#ifdef DEVICE_POLLING +enum poll_cmd { POLL_ONLY, POLL_AND_CHECK_STATUS }; + +typedef int poll_handler_t(struct ifnet *ifp, enum poll_cmd cmd, int count); +int ether_poll_register(poll_handler_t *h, struct ifnet *ifp); +int ether_poll_deregister(struct ifnet *ifp); +#endif /* DEVICE_POLLING */ + +#endif /* _KERNEL */ + +#endif /* !_NET_IF_VAR_HH_ */ diff --git a/freebsd/sys/net/if_vlan.c b/freebsd/sys/net/if_vlan.c new file mode 100644 index 00000000..5ae5efd4 --- /dev/null +++ b/freebsd/sys/net/if_vlan.c @@ -0,0 +1,1538 @@ +#include + +/*- + * Copyright 1998 Massachusetts Institute of Technology + * + * Permission to use, copy, modify, and distribute this software and + * its documentation for any purpose and without fee is hereby + * granted, provided that both the above copyright notice and this + * permission notice appear in all copies, that both the above + * copyright notice and this permission notice appear in all + * supporting documentation, and that the name of M.I.T. not be used + * in advertising or publicity pertaining to distribution of the + * software without specific, written prior permission. M.I.T. makes + * no representations about the suitability of this software for any + * purpose. It is provided "as is" without express or implied + * warranty. + * + * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS + * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT + * SHALL M.I.T. 
BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * if_vlan.c - pseudo-device driver for IEEE 802.1Q virtual LANs. + * Might be extended some day to also handle IEEE 802.1p priority + * tagging. This is sort of sneaky in the implementation, since + * we need to pretend to be enough of an Ethernet implementation + * to make arp work. The way we do this is by telling everyone + * that we are an Ethernet, and then catch the packets that + * ether_output() left on our output queue when it calls + * if_start(), rewrite them for use by the real outgoing interface, + * and ask it to send them. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#define VLANNAME "vlan" +#define VLAN_DEF_HWIDTH 4 +#define VLAN_IFFLAGS (IFF_BROADCAST | IFF_MULTICAST) + +#define UP_AND_RUNNING(ifp) \ + ((ifp)->if_flags & IFF_UP && (ifp)->if_drv_flags & IFF_DRV_RUNNING) + +LIST_HEAD(ifvlanhead, ifvlan); + +struct ifvlantrunk { + struct ifnet *parent; /* parent interface of this trunk */ + struct rwlock rw; +#ifdef VLAN_ARRAY +#define VLAN_ARRAY_SIZE (EVL_VLID_MASK + 1) + struct ifvlan *vlans[VLAN_ARRAY_SIZE]; /* static table */ +#else + struct ifvlanhead *hash; /* dynamic hash-list table */ + uint16_t hmask; + uint16_t hwidth; +#endif + int refcnt; +}; + +struct vlan_mc_entry { + struct ether_addr mc_addr; + SLIST_ENTRY(vlan_mc_entry) mc_entries; +}; + +struct ifvlan { + struct ifvlantrunk *ifv_trunk; + struct ifnet *ifv_ifp; +#define TRUNK(ifv) ((ifv)->ifv_trunk) +#define PARENT(ifv) ((ifv)->ifv_trunk->parent) + int ifv_pflags; /* special flags we have set on parent */ + struct ifv_linkmib { + int ifvm_encaplen; /* encapsulation length */ + int ifvm_mtufudge; /* MTU fudged by this much */ + int ifvm_mintu; /* min transmission unit */ + uint16_t ifvm_proto; /* encapsulation ethertype */ + uint16_t ifvm_tag; /* tag to apply on packets leaving if */ + } ifv_mib; + SLIST_HEAD(, vlan_mc_entry) vlan_mc_listhead; +#ifndef VLAN_ARRAY + LIST_ENTRY(ifvlan) ifv_list; +#endif +}; +#define ifv_proto ifv_mib.ifvm_proto +#define ifv_tag ifv_mib.ifvm_tag +#define ifv_encaplen ifv_mib.ifvm_encaplen +#define ifv_mtufudge ifv_mib.ifvm_mtufudge +#define ifv_mintu ifv_mib.ifvm_mintu + +/* Special flags we should propagate to parent. 
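+ * Each entry pairs a flag with the function used to take or release
+ * a reference on the parent, e.g. IFF_PROMISC via ifpromisc(); the
+ * table is walked by vlan_setflag()/vlan_setflags() below.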
*/ +static struct { + int flag; + int (*func)(struct ifnet *, int); +} vlan_pflags[] = { + {IFF_PROMISC, ifpromisc}, + {IFF_ALLMULTI, if_allmulti}, + {0, NULL} +}; + +SYSCTL_DECL(_net_link); +SYSCTL_NODE(_net_link, IFT_L2VLAN, vlan, CTLFLAG_RW, 0, "IEEE 802.1Q VLAN"); +SYSCTL_NODE(_net_link_vlan, PF_LINK, link, CTLFLAG_RW, 0, "for consistency"); + +static int soft_pad = 0; +SYSCTL_INT(_net_link_vlan, OID_AUTO, soft_pad, CTLFLAG_RW, &soft_pad, 0, + "pad short frames before tagging"); + +static MALLOC_DEFINE(M_VLAN, VLANNAME, "802.1Q Virtual LAN Interface"); + +static eventhandler_tag ifdetach_tag; +static eventhandler_tag iflladdr_tag; + +/* + * We have a global mutex, that is used to serialize configuration + * changes and isn't used in normal packet delivery. + * + * We also have a per-trunk rwlock, that is locked shared on packet + * processing and exclusive when configuration is changed. + * + * The VLAN_ARRAY substitutes the dynamic hash with a static array + * with 4096 entries. In theory this can give a boost in processing, + * however on practice it does not. Probably this is because array + * is too big to fit into CPU cache. + */ +static struct mtx ifv_mtx; +#define VLAN_LOCK_INIT() mtx_init(&ifv_mtx, "vlan_global", NULL, MTX_DEF) +#define VLAN_LOCK_DESTROY() mtx_destroy(&ifv_mtx) +#define VLAN_LOCK_ASSERT() mtx_assert(&ifv_mtx, MA_OWNED) +#define VLAN_LOCK() mtx_lock(&ifv_mtx) +#define VLAN_UNLOCK() mtx_unlock(&ifv_mtx) +#define TRUNK_LOCK_INIT(trunk) rw_init(&(trunk)->rw, VLANNAME) +#define TRUNK_LOCK_DESTROY(trunk) rw_destroy(&(trunk)->rw) +#define TRUNK_LOCK(trunk) rw_wlock(&(trunk)->rw) +#define TRUNK_UNLOCK(trunk) rw_wunlock(&(trunk)->rw) +#define TRUNK_LOCK_ASSERT(trunk) rw_assert(&(trunk)->rw, RA_WLOCKED) +#define TRUNK_RLOCK(trunk) rw_rlock(&(trunk)->rw) +#define TRUNK_RUNLOCK(trunk) rw_runlock(&(trunk)->rw) +#define TRUNK_LOCK_RASSERT(trunk) rw_assert(&(trunk)->rw, RA_RLOCKED) + +#ifndef VLAN_ARRAY +static void vlan_inithash(struct ifvlantrunk *trunk); +static void vlan_freehash(struct ifvlantrunk *trunk); +static int vlan_inshash(struct ifvlantrunk *trunk, struct ifvlan *ifv); +static int vlan_remhash(struct ifvlantrunk *trunk, struct ifvlan *ifv); +static void vlan_growhash(struct ifvlantrunk *trunk, int howmuch); +static __inline struct ifvlan * vlan_gethash(struct ifvlantrunk *trunk, + uint16_t tag); +#endif +static void trunk_destroy(struct ifvlantrunk *trunk); + +static void vlan_start(struct ifnet *ifp); +static void vlan_init(void *foo); +static void vlan_input(struct ifnet *ifp, struct mbuf *m); +static int vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t addr); +static int vlan_setflag(struct ifnet *ifp, int flag, int status, + int (*func)(struct ifnet *, int)); +static int vlan_setflags(struct ifnet *ifp, int status); +static int vlan_setmulti(struct ifnet *ifp); +static void vlan_unconfig(struct ifnet *ifp); +static void vlan_unconfig_locked(struct ifnet *ifp); +static int vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t tag); +static void vlan_link_state(struct ifnet *ifp, int link); +static void vlan_capabilities(struct ifvlan *ifv); +static void vlan_trunk_capabilities(struct ifnet *ifp); + +static struct ifnet *vlan_clone_match_ethertag(struct if_clone *, + const char *, int *); +static int vlan_clone_match(struct if_clone *, const char *); +static int vlan_clone_create(struct if_clone *, char *, size_t, caddr_t); +static int vlan_clone_destroy(struct if_clone *, struct ifnet *); + +static void vlan_ifdetach(void *arg, struct ifnet *ifp); 
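+/*
+ * Lock ordering note (as implied by the code below): the global
+ * VLAN_LOCK() is always taken before any per-trunk TRUNK_LOCK(),
+ * while the packet path takes only the shared TRUNK_RLOCK() and
+ * never the global mutex.
+ */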
+static void vlan_iflladdr(void *arg, struct ifnet *ifp); + +static struct if_clone vlan_cloner = IFC_CLONE_INITIALIZER(VLANNAME, NULL, + IF_MAXUNIT, NULL, vlan_clone_match, vlan_clone_create, vlan_clone_destroy); + +#ifdef VIMAGE +static VNET_DEFINE(struct if_clone, vlan_cloner); +#define V_vlan_cloner VNET(vlan_cloner) +#endif + +#ifndef VLAN_ARRAY +#define HASH(n, m) ((((n) >> 8) ^ ((n) >> 4) ^ (n)) & (m)) + +static void +vlan_inithash(struct ifvlantrunk *trunk) +{ + int i, n; + + /* + * The trunk must not be locked here since we call malloc(M_WAITOK). + * It is OK in case this function is called before the trunk struct + * gets hooked up and becomes visible from other threads. + */ + + KASSERT(trunk->hwidth == 0 && trunk->hash == NULL, + ("%s: hash already initialized", __func__)); + + trunk->hwidth = VLAN_DEF_HWIDTH; + n = 1 << trunk->hwidth; + trunk->hmask = n - 1; + trunk->hash = malloc(sizeof(struct ifvlanhead) * n, M_VLAN, M_WAITOK); + for (i = 0; i < n; i++) + LIST_INIT(&trunk->hash[i]); +} + +static void +vlan_freehash(struct ifvlantrunk *trunk) +{ +#ifdef INVARIANTS + int i; + + KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__)); + for (i = 0; i < (1 << trunk->hwidth); i++) + KASSERT(LIST_EMPTY(&trunk->hash[i]), + ("%s: hash table not empty", __func__)); +#endif + free(trunk->hash, M_VLAN); + trunk->hash = NULL; + trunk->hwidth = trunk->hmask = 0; +} + +static int +vlan_inshash(struct ifvlantrunk *trunk, struct ifvlan *ifv) +{ + int i, b; + struct ifvlan *ifv2; + + TRUNK_LOCK_ASSERT(trunk); + KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__)); + + b = 1 << trunk->hwidth; + i = HASH(ifv->ifv_tag, trunk->hmask); + LIST_FOREACH(ifv2, &trunk->hash[i], ifv_list) + if (ifv->ifv_tag == ifv2->ifv_tag) + return (EEXIST); + + /* + * Grow the hash when the number of vlans exceeds half of the number of + * hash buckets squared. This will make the average linked-list length + * buckets/2. + */ + if (trunk->refcnt > (b * b) / 2) { + vlan_growhash(trunk, 1); + i = HASH(ifv->ifv_tag, trunk->hmask); + } + LIST_INSERT_HEAD(&trunk->hash[i], ifv, ifv_list); + trunk->refcnt++; + + return (0); +} + +static int +vlan_remhash(struct ifvlantrunk *trunk, struct ifvlan *ifv) +{ + int i, b; + struct ifvlan *ifv2; + + TRUNK_LOCK_ASSERT(trunk); + KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__)); + + b = 1 << trunk->hwidth; + i = HASH(ifv->ifv_tag, trunk->hmask); + LIST_FOREACH(ifv2, &trunk->hash[i], ifv_list) + if (ifv2 == ifv) { + trunk->refcnt--; + LIST_REMOVE(ifv2, ifv_list); + if (trunk->refcnt < (b * b) / 2) + vlan_growhash(trunk, -1); + return (0); + } + + panic("%s: vlan not found\n", __func__); + return (ENOENT); /*NOTREACHED*/ +} + +/* + * Grow the hash larger or smaller if memory permits. 
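+ * Worked example: at the default hwidth of 4 there are 16 buckets,
+ * so the table first grows once more than 16 * 16 / 2 = 128 vlans
+ * are configured, keeping the average chain length at or below 8.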
+ */ +static void +vlan_growhash(struct ifvlantrunk *trunk, int howmuch) +{ + struct ifvlan *ifv; + struct ifvlanhead *hash2; + int hwidth2, i, j, n, n2; + + TRUNK_LOCK_ASSERT(trunk); + KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__)); + + if (howmuch == 0) { + /* Harmless yet obvious coding error */ + printf("%s: howmuch is 0\n", __func__); + return; + } + + hwidth2 = trunk->hwidth + howmuch; + n = 1 << trunk->hwidth; + n2 = 1 << hwidth2; + /* Do not shrink the table below the default */ + if (hwidth2 < VLAN_DEF_HWIDTH) + return; + + /* M_NOWAIT because we're called with trunk mutex held */ + hash2 = malloc(sizeof(struct ifvlanhead) * n2, M_VLAN, M_NOWAIT); + if (hash2 == NULL) { + printf("%s: out of memory -- hash size not changed\n", + __func__); + return; /* We can live with the old hash table */ + } + for (j = 0; j < n2; j++) + LIST_INIT(&hash2[j]); + for (i = 0; i < n; i++) + while ((ifv = LIST_FIRST(&trunk->hash[i])) != NULL) { + LIST_REMOVE(ifv, ifv_list); + j = HASH(ifv->ifv_tag, n2 - 1); + LIST_INSERT_HEAD(&hash2[j], ifv, ifv_list); + } + free(trunk->hash, M_VLAN); + trunk->hash = hash2; + trunk->hwidth = hwidth2; + trunk->hmask = n2 - 1; + + if (bootverbose) + if_printf(trunk->parent, + "VLAN hash table resized from %d to %d buckets\n", n, n2); +} + +static __inline struct ifvlan * +vlan_gethash(struct ifvlantrunk *trunk, uint16_t tag) +{ + struct ifvlan *ifv; + + TRUNK_LOCK_RASSERT(trunk); + + LIST_FOREACH(ifv, &trunk->hash[HASH(tag, trunk->hmask)], ifv_list) + if (ifv->ifv_tag == tag) + return (ifv); + return (NULL); +} + +#if 0 +/* Debugging code to view the hashtables. */ +static void +vlan_dumphash(struct ifvlantrunk *trunk) +{ + int i; + struct ifvlan *ifv; + + for (i = 0; i < (1 << trunk->hwidth); i++) { + printf("%d: ", i); + LIST_FOREACH(ifv, &trunk->hash[i], ifv_list) + printf("%s ", ifv->ifv_ifp->if_xname); + printf("\n"); + } +} +#endif /* 0 */ +#endif /* !VLAN_ARRAY */ + +static void +trunk_destroy(struct ifvlantrunk *trunk) +{ + VLAN_LOCK_ASSERT(); + + TRUNK_LOCK(trunk); +#ifndef VLAN_ARRAY + vlan_freehash(trunk); +#endif + trunk->parent->if_vlantrunk = NULL; + TRUNK_UNLOCK(trunk); + TRUNK_LOCK_DESTROY(trunk); + free(trunk, M_VLAN); +} + +/* + * Program our multicast filter. What we're actually doing is + * programming the multicast filter of the parent. This has the + * side effect of causing the parent interface to receive multicast + * traffic that it doesn't really want, which ends up being discarded + * later by the upper protocol layers. Unfortunately, there's no way + * to avoid this: there really is only one physical interface. + * + * XXX: There is a possible race here if more than one thread is + * modifying the multicast state of the vlan interface at the same time. + */ +static int +vlan_setmulti(struct ifnet *ifp) +{ + struct ifnet *ifp_p; + struct ifmultiaddr *ifma, *rifma = NULL; + struct ifvlan *sc; + struct vlan_mc_entry *mc; + struct sockaddr_dl sdl; + int error; + + /*VLAN_LOCK_ASSERT();*/ + + /* Find the parent. */ + sc = ifp->if_softc; + ifp_p = PARENT(sc); + + CURVNET_SET_QUIET(ifp_p->if_vnet); + + bzero((char *)&sdl, sizeof(sdl)); + sdl.sdl_len = sizeof(sdl); + sdl.sdl_family = AF_LINK; + sdl.sdl_index = ifp_p->if_index; + sdl.sdl_type = IFT_ETHER; + sdl.sdl_alen = ETHER_ADDR_LEN; + + /* First, remove any existing filter entries. 
*/ + while ((mc = SLIST_FIRST(&sc->vlan_mc_listhead)) != NULL) { + bcopy((char *)&mc->mc_addr, LLADDR(&sdl), ETHER_ADDR_LEN); + error = if_delmulti(ifp_p, (struct sockaddr *)&sdl); + if (error) + return (error); + SLIST_REMOVE_HEAD(&sc->vlan_mc_listhead, mc_entries); + free(mc, M_VLAN); + } + + /* Now program new ones. */ + TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { + if (ifma->ifma_addr->sa_family != AF_LINK) + continue; + mc = malloc(sizeof(struct vlan_mc_entry), M_VLAN, M_NOWAIT); + if (mc == NULL) + return (ENOMEM); + bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr), + (char *)&mc->mc_addr, ETHER_ADDR_LEN); + SLIST_INSERT_HEAD(&sc->vlan_mc_listhead, mc, mc_entries); + bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr), + LLADDR(&sdl), ETHER_ADDR_LEN); + error = if_addmulti(ifp_p, (struct sockaddr *)&sdl, &rifma); + if (error) + return (error); + } + + CURVNET_RESTORE(); + return (0); +} + +/* + * A handler for parent interface link layer address changes. + * If the parent interface link layer address is changed we + * should also change it on all children vlans. + */ +static void +vlan_iflladdr(void *arg __unused, struct ifnet *ifp) +{ + struct ifvlan *ifv; +#ifndef VLAN_ARRAY + struct ifvlan *next; +#endif + int i; + + /* + * Check if it's a trunk interface first of all + * to avoid needless locking. + */ + if (ifp->if_vlantrunk == NULL) + return; + + VLAN_LOCK(); + /* + * OK, it's a trunk. Loop over and change all vlan's lladdrs on it. + */ +#ifdef VLAN_ARRAY + for (i = 0; i < VLAN_ARRAY_SIZE; i++) + if ((ifv = ifp->if_vlantrunk->vlans[i])) { +#else /* VLAN_ARRAY */ + for (i = 0; i < (1 << ifp->if_vlantrunk->hwidth); i++) + LIST_FOREACH_SAFE(ifv, &ifp->if_vlantrunk->hash[i], ifv_list, next) { +#endif /* VLAN_ARRAY */ + VLAN_UNLOCK(); + if_setlladdr(ifv->ifv_ifp, IF_LLADDR(ifp), ETHER_ADDR_LEN); + VLAN_LOCK(); + } + VLAN_UNLOCK(); + +} + +/* + * A handler for network interface departure events. + * Track departure of trunks here so that we don't access invalid + * pointers or whatever if a trunk is ripped from under us, e.g., + * by ejecting its hot-plug card. However, if an ifnet is simply + * being renamed, then there's no need to tear down the state. + */ +static void +vlan_ifdetach(void *arg __unused, struct ifnet *ifp) +{ + struct ifvlan *ifv; + int i; + + /* + * Check if it's a trunk interface first of all + * to avoid needless locking. + */ + if (ifp->if_vlantrunk == NULL) + return; + + /* If the ifnet is just being renamed, don't do anything. */ + if (ifp->if_flags & IFF_RENAMING) + return; + + VLAN_LOCK(); + /* + * OK, it's a trunk. Loop over and detach all vlan's on it. + * Check trunk pointer after each vlan_unconfig() as it will + * free it and set to NULL after the last vlan was detached. + */ +#ifdef VLAN_ARRAY + for (i = 0; i < VLAN_ARRAY_SIZE; i++) + if ((ifv = ifp->if_vlantrunk->vlans[i])) { + vlan_unconfig_locked(ifv->ifv_ifp); + if (ifp->if_vlantrunk == NULL) + break; + } +#else /* VLAN_ARRAY */ +restart: + for (i = 0; i < (1 << ifp->if_vlantrunk->hwidth); i++) + if ((ifv = LIST_FIRST(&ifp->if_vlantrunk->hash[i]))) { + vlan_unconfig_locked(ifv->ifv_ifp); + if (ifp->if_vlantrunk) + goto restart; /* trunk->hwidth can change */ + else + break; + } +#endif /* VLAN_ARRAY */ + /* Trunk should have been destroyed in vlan_unconfig(). */ + KASSERT(ifp->if_vlantrunk == NULL, ("%s: purge failed", __func__)); + VLAN_UNLOCK(); +} + +/* + * VLAN support can be loaded as a module. The only place in the + * system that's intimately aware of this is ether_input. 
We hook + * into this code through vlan_input_p which is defined there and + * set here. Noone else in the system should be aware of this so + * we use an explicit reference here. + */ +extern void (*vlan_input_p)(struct ifnet *, struct mbuf *); + +/* For if_link_state_change() eyes only... */ +extern void (*vlan_link_state_p)(struct ifnet *, int); + +static int +vlan_modevent(module_t mod, int type, void *data) +{ + + switch (type) { + case MOD_LOAD: + ifdetach_tag = EVENTHANDLER_REGISTER(ifnet_departure_event, + vlan_ifdetach, NULL, EVENTHANDLER_PRI_ANY); + if (ifdetach_tag == NULL) + return (ENOMEM); + iflladdr_tag = EVENTHANDLER_REGISTER(iflladdr_event, + vlan_iflladdr, NULL, EVENTHANDLER_PRI_ANY); + if (iflladdr_tag == NULL) + return (ENOMEM); + VLAN_LOCK_INIT(); + vlan_input_p = vlan_input; + vlan_link_state_p = vlan_link_state; + vlan_trunk_cap_p = vlan_trunk_capabilities; +#ifndef VIMAGE + if_clone_attach(&vlan_cloner); +#endif + if (bootverbose) + printf("vlan: initialized, using " +#ifdef VLAN_ARRAY + "full-size arrays" +#else + "hash tables with chaining" +#endif + + "\n"); + break; + case MOD_UNLOAD: +#ifndef VIMAGE + if_clone_detach(&vlan_cloner); +#endif + EVENTHANDLER_DEREGISTER(ifnet_departure_event, ifdetach_tag); + EVENTHANDLER_DEREGISTER(iflladdr_event, iflladdr_tag); + vlan_input_p = NULL; + vlan_link_state_p = NULL; + vlan_trunk_cap_p = NULL; + VLAN_LOCK_DESTROY(); + if (bootverbose) + printf("vlan: unloaded\n"); + break; + default: + return (EOPNOTSUPP); + } + return (0); +} + +static moduledata_t vlan_mod = { + "if_vlan", + vlan_modevent, + 0 +}; + +DECLARE_MODULE(if_vlan, vlan_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); +MODULE_VERSION(if_vlan, 3); + +#ifdef VIMAGE +static void +vnet_vlan_init(const void *unused __unused) +{ + + V_vlan_cloner = vlan_cloner; + if_clone_attach(&V_vlan_cloner); +} +VNET_SYSINIT(vnet_vlan_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, + vnet_vlan_init, NULL); + +static void +vnet_vlan_uninit(const void *unused __unused) +{ + + if_clone_detach(&V_vlan_cloner); +} +VNET_SYSUNINIT(vnet_vlan_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST, + vnet_vlan_uninit, NULL); +#endif + +static struct ifnet * +vlan_clone_match_ethertag(struct if_clone *ifc, const char *name, int *tag) +{ + const char *cp; + struct ifnet *ifp; + int t; + + /* Check for . style interface names. 
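+ * (That is, a parent name, a dot and a decimal tag, e.g. "em0.5"
+ * for vlan tag 5 stacked on em0; the digit loop below parses
+ * exactly that form.)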
*/ + IFNET_RLOCK_NOSLEEP(); + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { + if (ifp->if_type != IFT_ETHER) + continue; + if (strncmp(ifp->if_xname, name, strlen(ifp->if_xname)) != 0) + continue; + cp = name + strlen(ifp->if_xname); + if (*cp++ != '.') + continue; + if (*cp == '\0') + continue; + t = 0; + for(; *cp >= '0' && *cp <= '9'; cp++) + t = (t * 10) + (*cp - '0'); + if (*cp != '\0') + continue; + if (tag != NULL) + *tag = t; + break; + } + IFNET_RUNLOCK_NOSLEEP(); + + return (ifp); +} + +static int +vlan_clone_match(struct if_clone *ifc, const char *name) +{ + const char *cp; + + if (vlan_clone_match_ethertag(ifc, name, NULL) != NULL) + return (1); + + if (strncmp(VLANNAME, name, strlen(VLANNAME)) != 0) + return (0); + for (cp = name + 4; *cp != '\0'; cp++) { + if (*cp < '0' || *cp > '9') + return (0); + } + + return (1); +} + +static int +vlan_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) +{ + char *dp; + int wildcard; + int unit; + int error; + int tag; + int ethertag; + struct ifvlan *ifv; + struct ifnet *ifp; + struct ifnet *p; + struct vlanreq vlr; + static const u_char eaddr[ETHER_ADDR_LEN]; /* 00:00:00:00:00:00 */ + + /* + * There are 3 (ugh) ways to specify the cloned device: + * o pass a parameter block with the clone request. + * o specify parameters in the text of the clone device name + * o specify no parameters and get an unattached device that + * must be configured separately. + * The first technique is preferred; the latter two are + * supported for backwards compatibilty. + */ + if (params) { + error = copyin(params, &vlr, sizeof(vlr)); + if (error) + return error; + p = ifunit(vlr.vlr_parent); + if (p == NULL) + return ENXIO; + /* + * Don't let the caller set up a VLAN tag with + * anything except VLID bits. + */ + if (vlr.vlr_tag & ~EVL_VLID_MASK) + return (EINVAL); + error = ifc_name2unit(name, &unit); + if (error != 0) + return (error); + + ethertag = 1; + tag = vlr.vlr_tag; + wildcard = (unit < 0); + } else if ((p = vlan_clone_match_ethertag(ifc, name, &tag)) != NULL) { + ethertag = 1; + unit = -1; + wildcard = 0; + + /* + * Don't let the caller set up a VLAN tag with + * anything except VLID bits. + */ + if (tag & ~EVL_VLID_MASK) + return (EINVAL); + } else { + ethertag = 0; + + error = ifc_name2unit(name, &unit); + if (error != 0) + return (error); + + wildcard = (unit < 0); + } + + error = ifc_alloc_unit(ifc, &unit); + if (error != 0) + return (error); + + /* In the wildcard case, we need to update the name. */ + if (wildcard) { + for (dp = name; *dp != '\0'; dp++); + if (snprintf(dp, len - (dp-name), "%d", unit) > + len - (dp-name) - 1) { + panic("%s: interface name too long", __func__); + } + } + + ifv = malloc(sizeof(struct ifvlan), M_VLAN, M_WAITOK | M_ZERO); + ifp = ifv->ifv_ifp = if_alloc(IFT_ETHER); + if (ifp == NULL) { + ifc_free_unit(ifc, unit); + free(ifv, M_VLAN); + return (ENOSPC); + } + SLIST_INIT(&ifv->vlan_mc_listhead); + + ifp->if_softc = ifv; + /* + * Set the name manually rather than using if_initname because + * we don't conform to the default naming convention for interfaces. 
+ */ + strlcpy(ifp->if_xname, name, IFNAMSIZ); + ifp->if_dname = ifc->ifc_name; + ifp->if_dunit = unit; + /* NB: flags are not set here */ + ifp->if_linkmib = &ifv->ifv_mib; + ifp->if_linkmiblen = sizeof(ifv->ifv_mib); + /* NB: mtu is not set here */ + + ifp->if_init = vlan_init; + ifp->if_start = vlan_start; + ifp->if_ioctl = vlan_ioctl; + ifp->if_snd.ifq_maxlen = ifqmaxlen; + ifp->if_flags = VLAN_IFFLAGS; + ether_ifattach(ifp, eaddr); + /* Now undo some of the damage... */ + ifp->if_baudrate = 0; + ifp->if_type = IFT_L2VLAN; + ifp->if_hdrlen = ETHER_VLAN_ENCAP_LEN; + + if (ethertag) { + error = vlan_config(ifv, p, tag); + if (error != 0) { + /* + * Since we've partialy failed, we need to back + * out all the way, otherwise userland could get + * confused. Thus, we destroy the interface. + */ + ether_ifdetach(ifp); + vlan_unconfig(ifp); + if_free_type(ifp, IFT_ETHER); + ifc_free_unit(ifc, unit); + free(ifv, M_VLAN); + + return (error); + } + + /* Update flags on the parent, if necessary. */ + vlan_setflags(ifp, 1); + } + + return (0); +} + +static int +vlan_clone_destroy(struct if_clone *ifc, struct ifnet *ifp) +{ + struct ifvlan *ifv = ifp->if_softc; + int unit = ifp->if_dunit; + + ether_ifdetach(ifp); /* first, remove it from system-wide lists */ + vlan_unconfig(ifp); /* now it can be unconfigured and freed */ + if_free_type(ifp, IFT_ETHER); + free(ifv, M_VLAN); + ifc_free_unit(ifc, unit); + + return (0); +} + +/* + * The ifp->if_init entry point for vlan(4) is a no-op. + */ +static void +vlan_init(void *foo __unused) +{ +} + +/* + * The if_start method for vlan(4) interface. It doesn't + * raises the IFF_DRV_OACTIVE flag, since it is called + * only from IFQ_HANDOFF() macro in ether_output_frame(). + * If the interface queue is full, and vlan_start() is + * not called, the queue would never get emptied and + * interface would stall forever. + */ +static void +vlan_start(struct ifnet *ifp) +{ + struct ifvlan *ifv; + struct ifnet *p; + struct mbuf *m; + int error; + + ifv = ifp->if_softc; + p = PARENT(ifv); + + for (;;) { + IF_DEQUEUE(&ifp->if_snd, m); + if (m == NULL) + break; + BPF_MTAP(ifp, m); + + /* + * Do not run parent's if_start() if the parent is not up, + * or parent's driver will cause a system crash. + */ + if (!UP_AND_RUNNING(p)) { + m_freem(m); + ifp->if_collisions++; + continue; + } + + /* + * Pad the frame to the minimum size allowed if told to. + * This option is in accord with IEEE Std 802.1Q, 2003 Ed., + * paragraph C.4.4.3.b. It can help to work around buggy + * bridges that violate paragraph C.4.4.3.a from the same + * document, i.e., fail to pad short frames after untagging. + * E.g., a tagged frame 66 bytes long (incl. FCS) is OK, but + * untagging it will produce a 62-byte frame, which is a runt + * and requires padding. There are VLAN-enabled network + * devices that just discard such runts instead or mishandle + * them somehow. + */ + if (soft_pad) { + static char pad[8]; /* just zeros */ + int n; + + for (n = ETHERMIN + ETHER_HDR_LEN - m->m_pkthdr.len; + n > 0; n -= sizeof(pad)) + if (!m_append(m, min(n, sizeof(pad)), pad)) + break; + + if (n > 0) { + if_printf(ifp, "cannot pad short frame\n"); + ifp->if_oerrors++; + m_freem(m); + continue; + } + } + + /* + * If underlying interface can do VLAN tag insertion itself, + * just pass the packet along. However, we need some way to + * tell the interface where the packet came from so that it + * knows how to find the VLAN tag to use, so we attach a + * packet tag that holds it. 
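+ * In mbuf terms the tag travels out-of-band: it is placed in
+ * m->m_pkthdr.ether_vtag with M_VLANTAG set instead of being
+ * inserted as four extra bytes in the frame, and vlan_input()
+ * performs the inverse mapping on receive.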
+ */ + if (p->if_capenable & IFCAP_VLAN_HWTAGGING) { + m->m_pkthdr.ether_vtag = ifv->ifv_tag; + m->m_flags |= M_VLANTAG; + } else { + m = ether_vlanencap(m, ifv->ifv_tag); + if (m == NULL) { + if_printf(ifp, + "unable to prepend VLAN header\n"); + ifp->if_oerrors++; + continue; + } + } + + /* + * Send it, precisely as ether_output() would have. + * We are already running at splimp. + */ + error = (p->if_transmit)(p, m); + if (!error) + ifp->if_opackets++; + else + ifp->if_oerrors++; + } +} + +static void +vlan_input(struct ifnet *ifp, struct mbuf *m) +{ + struct ifvlantrunk *trunk = ifp->if_vlantrunk; + struct ifvlan *ifv; + uint16_t tag; + + KASSERT(trunk != NULL, ("%s: no trunk", __func__)); + + if (m->m_flags & M_VLANTAG) { + /* + * Packet is tagged, but m contains a normal + * Ethernet frame; the tag is stored out-of-band. + */ + tag = EVL_VLANOFTAG(m->m_pkthdr.ether_vtag); + m->m_flags &= ~M_VLANTAG; + } else { + struct ether_vlan_header *evl; + + /* + * Packet is tagged in-band as specified by 802.1q. + */ + switch (ifp->if_type) { + case IFT_ETHER: + if (m->m_len < sizeof(*evl) && + (m = m_pullup(m, sizeof(*evl))) == NULL) { + if_printf(ifp, "cannot pullup VLAN header\n"); + return; + } + evl = mtod(m, struct ether_vlan_header *); + tag = EVL_VLANOFTAG(ntohs(evl->evl_tag)); + + /* + * Remove the 802.1q header by copying the Ethernet + * addresses over it and adjusting the beginning of + * the data in the mbuf. The encapsulated Ethernet + * type field is already in place. + */ + bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN, + ETHER_HDR_LEN - ETHER_TYPE_LEN); + m_adj(m, ETHER_VLAN_ENCAP_LEN); + break; + + default: +#ifdef INVARIANTS + panic("%s: %s has unsupported if_type %u", + __func__, ifp->if_xname, ifp->if_type); +#endif + m_freem(m); + ifp->if_noproto++; + return; + } + } + + TRUNK_RLOCK(trunk); +#ifdef VLAN_ARRAY + ifv = trunk->vlans[tag]; +#else + ifv = vlan_gethash(trunk, tag); +#endif + if (ifv == NULL || !UP_AND_RUNNING(ifv->ifv_ifp)) { + TRUNK_RUNLOCK(trunk); + m_freem(m); + ifp->if_noproto++; + return; + } + TRUNK_RUNLOCK(trunk); + + m->m_pkthdr.rcvif = ifv->ifv_ifp; + ifv->ifv_ifp->if_ipackets++; + + /* Pass it back through the parent's input routine. */ + (*ifp->if_input)(ifv->ifv_ifp, m); +} + +static int +vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t tag) +{ + struct ifvlantrunk *trunk; + struct ifnet *ifp; + int error = 0; + + /* VID numbers 0x0 and 0xFFF are reserved */ + if (tag == 0 || tag == 0xFFF) + return (EINVAL); + if (p->if_type != IFT_ETHER) + return (EPROTONOSUPPORT); + if ((p->if_flags & VLAN_IFFLAGS) != VLAN_IFFLAGS) + return (EPROTONOSUPPORT); + if (ifv->ifv_trunk) + return (EBUSY); + + if (p->if_vlantrunk == NULL) { + trunk = malloc(sizeof(struct ifvlantrunk), + M_VLAN, M_WAITOK | M_ZERO); +#ifndef VLAN_ARRAY + vlan_inithash(trunk); +#endif + VLAN_LOCK(); + if (p->if_vlantrunk != NULL) { + /* A race that that is very unlikely to be hit. 
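+ * (Two threads may both have seen if_vlantrunk == NULL above and
+ * allocated a trunk each; the loser of the VLAN_LOCK() race frees
+ * its copy here and falls through to use the winner's.)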
*/ +#ifndef VLAN_ARRAY + vlan_freehash(trunk); +#endif + free(trunk, M_VLAN); + goto exists; + } + TRUNK_LOCK_INIT(trunk); + TRUNK_LOCK(trunk); + p->if_vlantrunk = trunk; + trunk->parent = p; + } else { + VLAN_LOCK(); +exists: + trunk = p->if_vlantrunk; + TRUNK_LOCK(trunk); + } + + ifv->ifv_tag = tag; /* must set this before vlan_inshash() */ +#ifdef VLAN_ARRAY + if (trunk->vlans[tag] != NULL) { + error = EEXIST; + goto done; + } + trunk->vlans[tag] = ifv; + trunk->refcnt++; +#else + error = vlan_inshash(trunk, ifv); + if (error) + goto done; +#endif + ifv->ifv_proto = ETHERTYPE_VLAN; + ifv->ifv_encaplen = ETHER_VLAN_ENCAP_LEN; + ifv->ifv_mintu = ETHERMIN; + ifv->ifv_pflags = 0; + + /* + * If the parent supports the VLAN_MTU capability, + * i.e. can Tx/Rx larger than ETHER_MAX_LEN frames, + * use it. + */ + if (p->if_capenable & IFCAP_VLAN_MTU) { + /* + * No need to fudge the MTU since the parent can + * handle extended frames. + */ + ifv->ifv_mtufudge = 0; + } else { + /* + * Fudge the MTU by the encapsulation size. This + * makes us incompatible with strictly compliant + * 802.1Q implementations, but allows us to use + * the feature with other NetBSD implementations, + * which might still be useful. + */ + ifv->ifv_mtufudge = ifv->ifv_encaplen; + } + + ifv->ifv_trunk = trunk; + ifp = ifv->ifv_ifp; + ifp->if_mtu = p->if_mtu - ifv->ifv_mtufudge; + ifp->if_baudrate = p->if_baudrate; + /* + * Copy only a selected subset of flags from the parent. + * Other flags are none of our business. + */ +#define VLAN_COPY_FLAGS (IFF_SIMPLEX) + ifp->if_flags &= ~VLAN_COPY_FLAGS; + ifp->if_flags |= p->if_flags & VLAN_COPY_FLAGS; +#undef VLAN_COPY_FLAGS + + ifp->if_link_state = p->if_link_state; + + vlan_capabilities(ifv); + + /* + * Set up our ``Ethernet address'' to reflect the underlying + * physical interface's. + */ + bcopy(IF_LLADDR(p), IF_LLADDR(ifp), ETHER_ADDR_LEN); + + /* + * Configure multicast addresses that may already be + * joined on the vlan device. + */ + (void)vlan_setmulti(ifp); /* XXX: VLAN lock held */ + + /* We are ready for operation now. */ + ifp->if_drv_flags |= IFF_DRV_RUNNING; +done: + TRUNK_UNLOCK(trunk); + if (error == 0) + EVENTHANDLER_INVOKE(vlan_config, p, ifv->ifv_tag); + VLAN_UNLOCK(); + + return (error); +} + +static void +vlan_unconfig(struct ifnet *ifp) +{ + + VLAN_LOCK(); + vlan_unconfig_locked(ifp); + VLAN_UNLOCK(); +} + +static void +vlan_unconfig_locked(struct ifnet *ifp) +{ + struct ifvlantrunk *trunk; + struct vlan_mc_entry *mc; + struct ifvlan *ifv; + struct ifnet *parent; + + VLAN_LOCK_ASSERT(); + + ifv = ifp->if_softc; + trunk = ifv->ifv_trunk; + parent = NULL; + + if (trunk != NULL) { + struct sockaddr_dl sdl; + + TRUNK_LOCK(trunk); + parent = trunk->parent; + + /* + * Since the interface is being unconfigured, we need to + * empty the list of multicast groups that we may have joined + * while we were alive from the parent's list. + */ + bzero((char *)&sdl, sizeof(sdl)); + sdl.sdl_len = sizeof(sdl); + sdl.sdl_family = AF_LINK; + sdl.sdl_index = parent->if_index; + sdl.sdl_type = IFT_ETHER; + sdl.sdl_alen = ETHER_ADDR_LEN; + + while ((mc = SLIST_FIRST(&ifv->vlan_mc_listhead)) != NULL) { + bcopy((char *)&mc->mc_addr, LLADDR(&sdl), + ETHER_ADDR_LEN); + + /* + * This may fail if the parent interface is + * being detached. Regardless, we should do a + * best effort to free this interface as much + * as possible as all callers expect vlan + * destruction to succeed. 
+ */ + (void)if_delmulti(parent, (struct sockaddr *)&sdl); + SLIST_REMOVE_HEAD(&ifv->vlan_mc_listhead, mc_entries); + free(mc, M_VLAN); + } + + vlan_setflags(ifp, 0); /* clear special flags on parent */ +#ifdef VLAN_ARRAY + trunk->vlans[ifv->ifv_tag] = NULL; + trunk->refcnt--; +#else + vlan_remhash(trunk, ifv); +#endif + ifv->ifv_trunk = NULL; + + /* + * Check if we were the last. + */ + if (trunk->refcnt == 0) { + trunk->parent->if_vlantrunk = NULL; + /* + * XXXGL: If some ithread has already entered + * vlan_input() and is now blocked on the trunk + * lock, then it should preempt us right after + * unlock and finish its work. Then we will acquire + * lock again in trunk_destroy(). + */ + TRUNK_UNLOCK(trunk); + trunk_destroy(trunk); + } else + TRUNK_UNLOCK(trunk); + } + + /* Disconnect from parent. */ + if (ifv->ifv_pflags) + if_printf(ifp, "%s: ifv_pflags unclean\n", __func__); + ifp->if_mtu = ETHERMTU; + ifp->if_link_state = LINK_STATE_UNKNOWN; + ifp->if_drv_flags &= ~IFF_DRV_RUNNING; + + /* + * Only dispatch an event if vlan was + * attached, otherwise there is nothing + * to cleanup anyway. + */ + if (parent != NULL) + EVENTHANDLER_INVOKE(vlan_unconfig, parent, ifv->ifv_tag); +} + +/* Handle a reference counted flag that should be set on the parent as well */ +static int +vlan_setflag(struct ifnet *ifp, int flag, int status, + int (*func)(struct ifnet *, int)) +{ + struct ifvlan *ifv; + int error; + + /* XXX VLAN_LOCK_ASSERT(); */ + + ifv = ifp->if_softc; + status = status ? (ifp->if_flags & flag) : 0; + /* Now "status" contains the flag value or 0 */ + + /* + * See if recorded parent's status is different from what + * we want it to be. If it is, flip it. We record parent's + * status in ifv_pflags so that we won't clear parent's flag + * we haven't set. In fact, we don't clear or set parent's + * flags directly, but get or release references to them. + * That's why we can be sure that recorded flags still are + * in accord with actual parent's flags. + */ + if (status != (ifv->ifv_pflags & flag)) { + error = (*func)(PARENT(ifv), status); + if (error) + return (error); + ifv->ifv_pflags &= ~flag; + ifv->ifv_pflags |= status; + } + return (0); +} + +/* + * Handle IFF_* flags that require certain changes on the parent: + * if "status" is true, update parent's flags respective to our if_flags; + * if "status" is false, forcedly clear the flags set on parent. 
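+ *
+ * For instance (an illustrative sketch, not an exhaustive list), the
+ * vlan_pflags table pairs a flag such as IFF_PROMISC with a routine like
+ * ifpromisc(), so enabling promiscuous mode on the vlan interface takes a
+ * reference on the parent's promiscuity count and disabling it releases
+ * that reference.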
+ */ +static int +vlan_setflags(struct ifnet *ifp, int status) +{ + int error, i; + + for (i = 0; vlan_pflags[i].flag; i++) { + error = vlan_setflag(ifp, vlan_pflags[i].flag, + status, vlan_pflags[i].func); + if (error) + return (error); + } + return (0); +} + +/* Inform all vlans that their parent has changed link state */ +static void +vlan_link_state(struct ifnet *ifp, int link) +{ + struct ifvlantrunk *trunk = ifp->if_vlantrunk; + struct ifvlan *ifv; + int i; + + TRUNK_LOCK(trunk); +#ifdef VLAN_ARRAY + for (i = 0; i < VLAN_ARRAY_SIZE; i++) + if (trunk->vlans[i] != NULL) { + ifv = trunk->vlans[i]; +#else + for (i = 0; i < (1 << trunk->hwidth); i++) + LIST_FOREACH(ifv, &trunk->hash[i], ifv_list) { +#endif + ifv->ifv_ifp->if_baudrate = trunk->parent->if_baudrate; + if_link_state_change(ifv->ifv_ifp, + trunk->parent->if_link_state); + } + TRUNK_UNLOCK(trunk); +} + +static void +vlan_capabilities(struct ifvlan *ifv) +{ + struct ifnet *p = PARENT(ifv); + struct ifnet *ifp = ifv->ifv_ifp; + + TRUNK_LOCK_ASSERT(TRUNK(ifv)); + + /* + * If the parent interface can do checksum offloading + * on VLANs, then propagate its hardware-assisted + * checksumming flags. Also assert that checksum + * offloading requires hardware VLAN tagging. + */ + if (p->if_capabilities & IFCAP_VLAN_HWCSUM) + ifp->if_capabilities = p->if_capabilities & IFCAP_HWCSUM; + + if (p->if_capenable & IFCAP_VLAN_HWCSUM && + p->if_capenable & IFCAP_VLAN_HWTAGGING) { + ifp->if_capenable = p->if_capenable & IFCAP_HWCSUM; + ifp->if_hwassist = p->if_hwassist & (CSUM_IP | CSUM_TCP | + CSUM_UDP | CSUM_SCTP | CSUM_IP_FRAGS | CSUM_FRAGMENT); + } else { + ifp->if_capenable = 0; + ifp->if_hwassist = 0; + } + /* + * If the parent interface can do TSO on VLANs then + * propagate the hardware-assisted flag. TSO on VLANs + * does not necessarily require hardware VLAN tagging. + */ + if (p->if_capabilities & IFCAP_VLAN_HWTSO) + ifp->if_capabilities |= p->if_capabilities & IFCAP_TSO; + if (p->if_capenable & IFCAP_VLAN_HWTSO) { + ifp->if_capenable |= p->if_capenable & IFCAP_TSO; + ifp->if_hwassist |= p->if_hwassist & CSUM_TSO; + } else { + ifp->if_capenable &= ~(p->if_capenable & IFCAP_TSO); + ifp->if_hwassist &= ~(p->if_hwassist & CSUM_TSO); + } +} + +static void +vlan_trunk_capabilities(struct ifnet *ifp) +{ + struct ifvlantrunk *trunk = ifp->if_vlantrunk; + struct ifvlan *ifv; + int i; + + TRUNK_LOCK(trunk); +#ifdef VLAN_ARRAY + for (i = 0; i < VLAN_ARRAY_SIZE; i++) + if (trunk->vlans[i] != NULL) { + ifv = trunk->vlans[i]; +#else + for (i = 0; i < (1 << trunk->hwidth); i++) { + LIST_FOREACH(ifv, &trunk->hash[i], ifv_list) +#endif + vlan_capabilities(ifv); + } + TRUNK_UNLOCK(trunk); +} + +static int +vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) +{ + struct ifnet *p; + struct ifreq *ifr; + struct ifvlan *ifv; + struct vlanreq vlr; + int error = 0; + + ifr = (struct ifreq *)data; + ifv = ifp->if_softc; + + switch (cmd) { + case SIOCGIFMEDIA: + VLAN_LOCK(); + if (TRUNK(ifv) != NULL) { + p = PARENT(ifv); + VLAN_UNLOCK(); + error = (*p->if_ioctl)(p, SIOCGIFMEDIA, data); + /* Limit the result to the parent's current config. */ + if (error == 0) { + struct ifmediareq *ifmr; + + ifmr = (struct ifmediareq *)data; + if (ifmr->ifm_count >= 1 && ifmr->ifm_ulist) { + ifmr->ifm_count = 1; + error = copyout(&ifmr->ifm_current, + ifmr->ifm_ulist, + sizeof(int)); + } + } + } else { + VLAN_UNLOCK(); + error = EINVAL; + } + break; + + case SIOCSIFMEDIA: + error = EINVAL; + break; + + case SIOCSIFMTU: + /* + * Set the interface MTU. 
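+		 * The request only succeeds if the new value fits under
+		 * the parent, i.e. roughly ifv_mintu - ifv_mtufudge <=
+		 * mtu <= parent mtu - ifv_mtufudge; e.g. (illustrative)
+		 * "ifconfig vlan0 mtu 1496" on a 1500-byte parent that
+		 * lacks IFCAP_VLAN_MTU.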
+ */ + VLAN_LOCK(); + if (TRUNK(ifv) != NULL) { + if (ifr->ifr_mtu > + (PARENT(ifv)->if_mtu - ifv->ifv_mtufudge) || + ifr->ifr_mtu < + (ifv->ifv_mintu - ifv->ifv_mtufudge)) + error = EINVAL; + else + ifp->if_mtu = ifr->ifr_mtu; + } else + error = EINVAL; + VLAN_UNLOCK(); + break; + + case SIOCSETVLAN: +#ifdef VIMAGE + if (ifp->if_vnet != ifp->if_home_vnet) { + error = EPERM; + break; + } +#endif + error = copyin(ifr->ifr_data, &vlr, sizeof(vlr)); + if (error) + break; + if (vlr.vlr_parent[0] == '\0') { + vlan_unconfig(ifp); + break; + } + p = ifunit(vlr.vlr_parent); + if (p == 0) { + error = ENOENT; + break; + } + /* + * Don't let the caller set up a VLAN tag with + * anything except VLID bits. + */ + if (vlr.vlr_tag & ~EVL_VLID_MASK) { + error = EINVAL; + break; + } + error = vlan_config(ifv, p, vlr.vlr_tag); + if (error) + break; + + /* Update flags on the parent, if necessary. */ + vlan_setflags(ifp, 1); + break; + + case SIOCGETVLAN: +#ifdef VIMAGE + if (ifp->if_vnet != ifp->if_home_vnet) { + error = EPERM; + break; + } +#endif + bzero(&vlr, sizeof(vlr)); + VLAN_LOCK(); + if (TRUNK(ifv) != NULL) { + strlcpy(vlr.vlr_parent, PARENT(ifv)->if_xname, + sizeof(vlr.vlr_parent)); + vlr.vlr_tag = ifv->ifv_tag; + } + VLAN_UNLOCK(); + error = copyout(&vlr, ifr->ifr_data, sizeof(vlr)); + break; + + case SIOCSIFFLAGS: + /* + * We should propagate selected flags to the parent, + * e.g., promiscuous mode. + */ + if (TRUNK(ifv) != NULL) + error = vlan_setflags(ifp, 1); + break; + + case SIOCADDMULTI: + case SIOCDELMULTI: + /* + * If we don't have a parent, just remember the membership for + * when we do. + */ + if (TRUNK(ifv) != NULL) + error = vlan_setmulti(ifp); + break; + + default: + error = ether_ioctl(ifp, cmd, data); + } + + return (error); +} diff --git a/freebsd/sys/net/if_vlan_var.h b/freebsd/sys/net/if_vlan_var.h new file mode 100644 index 00000000..045e2fa1 --- /dev/null +++ b/freebsd/sys/net/if_vlan_var.h @@ -0,0 +1,137 @@ +/*- + * Copyright 1998 Massachusetts Institute of Technology + * + * Permission to use, copy, modify, and distribute this software and + * its documentation for any purpose and without fee is hereby + * granted, provided that both the above copyright notice and this + * permission notice appear in all copies, that both the above + * copyright notice and this permission notice appear in all + * supporting documentation, and that the name of M.I.T. not be used + * in advertising or publicity pertaining to distribution of the + * software without specific, written prior permission. M.I.T. makes + * no representations about the suitability of this software for any + * purpose. It is provided "as is" without express or implied + * warranty. + * + * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS + * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT + * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _NET_IF_VLAN_VAR_HH_ +#define _NET_IF_VLAN_VAR_HH_ 1 + +struct ether_vlan_header { + u_char evl_dhost[ETHER_ADDR_LEN]; + u_char evl_shost[ETHER_ADDR_LEN]; + u_int16_t evl_encap_proto; + u_int16_t evl_tag; + u_int16_t evl_proto; +}; + +#define EVL_VLID_MASK 0x0FFF +#define EVL_PRI_MASK 0xE000 +#define EVL_VLANOFTAG(tag) ((tag) & EVL_VLID_MASK) +#define EVL_PRIOFTAG(tag) (((tag) >> 13) & 7) +#define EVL_CFIOFTAG(tag) (((tag) >> 12) & 1) +#define EVL_MAKETAG(vlid, pri, cfi) \ + ((((((pri) & 7) << 1) | ((cfi) & 1)) << 12) | ((vlid) & EVL_VLID_MASK)) + +/* Set the VLAN ID in an mbuf packet header non-destructively. */ +#define EVL_APPLY_VLID(m, vlid) \ + do { \ + if ((m)->m_flags & M_VLANTAG) { \ + (m)->m_pkthdr.ether_vtag &= EVL_VLID_MASK; \ + (m)->m_pkthdr.ether_vtag |= (vlid); \ + } else { \ + (m)->m_pkthdr.ether_vtag = (vlid); \ + (m)->m_flags |= M_VLANTAG; \ + } \ + } while (0) + +/* Set the priority ID in an mbuf packet header non-destructively. */ +#define EVL_APPLY_PRI(m, pri) \ + do { \ + if ((m)->m_flags & M_VLANTAG) { \ + uint16_t __vlantag = (m)->m_pkthdr.ether_vtag; \ + (m)->m_pkthdr.ether_vtag |= EVL_MAKETAG( \ + EVL_VLANOFTAG(__vlantag), (pri), \ + EVL_CFIOFTAG(__vlantag)); \ + } else { \ + (m)->m_pkthdr.ether_vtag = \ + EVL_MAKETAG(0, (pri), 0); \ + (m)->m_flags |= M_VLANTAG; \ + } \ + } while (0) + +/* sysctl(3) tags, for compatibility purposes */ +#define VLANCTL_PROTO 1 +#define VLANCTL_MAX 2 + +/* + * Configuration structure for SIOCSETVLAN and SIOCGETVLAN ioctls. + */ +struct vlanreq { + char vlr_parent[IFNAMSIZ]; + u_short vlr_tag; +}; +#define SIOCSETVLAN SIOCSIFGENERIC +#define SIOCGETVLAN SIOCGIFGENERIC + +#ifdef _KERNEL +/* + * Drivers that are capable of adding and removing the VLAN header + * in hardware indicate they support this by marking IFCAP_VLAN_HWTAGGING + * in if_capabilities. Drivers for hardware that is capable + * of handling larger MTU's that may include a software-appended + * VLAN header w/o lowering the normal MTU should mark IFCAP_VLAN_MTU + * in if_capabilities; this notifies the VLAN code it can leave the + * MTU on the vlan interface at the normal setting. + */ + +/* + * VLAN tags are stored in host byte order. Byte swapping may be + * necessary. + * + * Drivers that support hardware VLAN tag stripping fill in the + * received VLAN tag (containing both vlan and priority information) + * into the ether_vtag mbuf packet header field: + * + * m->m_pkthdr.ether_vtag = vlan_id; // ntohs()? + * m->m_flags |= M_VLANTAG; + * + * to mark the packet m with the specified VLAN tag. + * + * On output the driver should check the mbuf for the M_VLANTAG + * flag to see if a VLAN tag is present and valid: + * + * if (m->m_flags & M_VLANTAG) { + * ... = m->m_pkthdr.ether_vtag; // htons()? + * ... pass tag to hardware ... + * } + * + * Note that a driver must indicate it supports hardware VLAN + * stripping/insertion by marking IFCAP_VLAN_HWTAGGING in + * if_capabilities. + */ + +#define VLAN_CAPABILITIES(_ifp) do { \ + if ((_ifp)->if_vlantrunk != NULL) \ + (*vlan_trunk_cap_p)(_ifp); \ +} while (0) + +extern void (*vlan_trunk_cap_p)(struct ifnet *); +#endif /* _KERNEL */ + +#endif /* _NET_IF_VLAN_VAR_HH_ */ diff --git a/freebsd/sys/net/iso88025.h b/freebsd/sys/net/iso88025.h new file mode 100644 index 00000000..26e3ada6 --- /dev/null +++ b/freebsd/sys/net/iso88025.h @@ -0,0 +1,172 @@ +/*- + * Copyright (c) 1998, Larry Lile + * All rights reserved. 
+ * + * For latest sources and information on this driver, please + * go to http://anarchy.stdio.com. + * + * Questions, comments or suggestions should be directed to + * Larry Lile . + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + * + * Information gathered from tokenring@freebsd, /sys/net/ethernet.h and + * the Mach token ring driver. + */ + +/* + * Fundamental constants relating to iso 802.5 + */ + +#ifndef _NET_ISO88025_HH_ +#define _NET_ISO88025_HH_ + +/* + * General ISO 802.5 definitions + */ +#define ISO88025_ADDR_LEN 6 +#define ISO88025_CF_LEN 2 +#define ISO88025_HDR_LEN (ISO88025_CF_LEN + (ISO88025_ADDR_LEN * 2)) +#define RCF_LEN 2 +#define RIF_MAX_RD 14 +#define RIF_MAX_LEN 16 + +#define TR_AC 0x10 +#define TR_LLC_FRAME 0x40 + +#define TR_4MBPS 4000000 +#define TR_16MBPS 16000000 +#define TR_100MBPS 100000000 + +/* + * Source routing + */ +#define TR_RII 0x80 +#define TR_RCF_BCST_MASK 0xe000 +#define TR_RCF_LEN_MASK 0x1f00 +#define TR_RCF_DIR 0x0080 +#define TR_RCF_LF_MASK 0x0070 + +#define TR_RCF_RIFLEN(x) ((ntohs(x) & TR_RCF_LEN_MASK) >> 8) + +/* + * Minimum and maximum packet payload lengths. + */ +#define ISO88025_MIN_LEN 0 +#define ISO88025_MAX_LEN_4 4464 +#define ISO88025_MAX_LEN_16 17960 +#define ISO88025_MAX_LEN ISO88025_MAX_LEN_16 + +/* + * A macro to validate a length with + */ +#define ISO88025_IS_VALID_LEN(foo) \ + ((foo) >= ISO88025_MIN_LEN && (foo) <= ISO88025_MAX_LEN) + +/* Access Control field */ +#define AC_PRI_MASK 0xe0 /* Priority bits */ +#define AC_TOKEN 0x10 /* Token bit: 0=Token, 1=Frame */ +#define AC_MONITOR 0x08 /* Monitor */ +#define AC_RESV_MASK 0x07 /* Reservation bits */ + +/* Frame Control field */ +#define FC_FT_MASK 0xc0 /* Frame Type */ +#define FC_FT_MAC 0x00 /* MAC frame */ +#define FC_FT_LLC 0x40 /* LLC frame */ +#define FC_ATTN_MASK 0x0f /* Attention bits */ +#define FC_ATTN_EB 0x01 /* Express buffer */ +#define FC_ATTN_BE 0x02 /* Beacon */ +#define FC_ATTN_CT 0x03 /* Claim token */ +#define FC_ATTN_RP 0x04 /* Ring purge */ +#define FC_ATTN_AMP 0x05 /* Active monitor present */ +#define FC_ATTN_SMP 0x06 /* Standby monitor present */ + +/* Token Ring destination address */ +#define DA_IG 0x80 /* Individual/group address. 
*/ + /* 0=Individual, 1=Group */ +#define DA_UL 0x40 /* Universal/local address. */ + /* 0=Universal, 1=Local */ +/* Token Ring source address */ +#define SA_RII 0x80 /* Routing information indicator */ +#define SA_IG 0x40 /* Individual/group address */ + /* 0=Group, 1=Individual */ + +/* + * ISO 802.5 physical header + */ +struct iso88025_header { + u_int8_t ac; /* access control field */ + u_int8_t fc; /* frame control field */ + u_int8_t iso88025_dhost[ISO88025_ADDR_LEN]; /* destination address */ + u_int8_t iso88025_shost[ISO88025_ADDR_LEN]; /* source address */ + u_int16_t rcf; /* route control field */ + u_int16_t rd[RIF_MAX_RD]; /* routing designators */ +} __packed; + +struct iso88025_rif { + u_int16_t rcf; /* route control field */ + u_int16_t rd[RIF_MAX_RD]; /* routing designators */ +} __packed; + +struct iso88025_sockaddr_data { + u_char ether_dhost[ISO88025_ADDR_LEN]; + u_char ether_shost[ISO88025_ADDR_LEN]; + u_char ac; + u_char fc; +}; + +struct iso88025_sockaddr_dl_data { + u_short trld_rcf; + u_short *trld_route[RIF_MAX_LEN]; +}; + +#define ISO88025_MAX(a, b) (((a)>(b))?(a):(b)) +#define SDL_ISO88025(s) ((struct iso88025_sockaddr_dl_data *) \ + ((s)->sdl_data + \ + ISO88025_MAX((s)->sdl_nlen + (s)->sdl_alen + \ + (s)->sdl_slen, 12))) + +/* + * Structure of a 48-bit iso 802.5 address. + * ( We could also add the 16 bit addresses as a union) + */ +struct iso88025_addr { + u_char octet[ISO88025_ADDR_LEN]; +}; + +#define ISO88025_MAX_MTU 18000 +#define ISO88025_DEFAULT_MTU 1500 + +#define ISO88025_BPF_UNSUPPORTED 0 +#define ISO88025_BPF_SUPPORTED 1 + +void iso88025_ifattach (struct ifnet *, const u_int8_t *, int); +void iso88025_ifdetach (struct ifnet *, int); +int iso88025_ioctl (struct ifnet *, u_long, caddr_t ); +int iso88025_output (struct ifnet *, struct mbuf *, struct sockaddr *, + struct route *); +void iso88025_input (struct ifnet *, struct mbuf *); + +#endif diff --git a/freebsd/sys/net/netisr.c b/freebsd/sys/net/netisr.c new file mode 100644 index 00000000..4d34953a --- /dev/null +++ b/freebsd/sys/net/netisr.c @@ -0,0 +1,1172 @@ +#include + +/*- + * Copyright (c) 2007-2009 Robert N. M. Watson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +/* + * netisr is a packet dispatch service, allowing synchronous (directly + * dispatched) and asynchronous (deferred dispatch) processing of packets by + * registered protocol handlers. Callers pass a protocol identifier and + * packet to netisr, along with a direct dispatch hint, and work will either + * be immediately processed with the registered handler, or passed to a + * kernel software interrupt (SWI) thread for deferred dispatch. Callers + * will generally select one or the other based on: + * + * - Might directly dispatching a netisr handler lead to code reentrance or + * lock recursion, such as entering the socket code from the socket code. + * - Might directly dispatching a netisr handler lead to recursive + * processing, such as when decapsulating several wrapped layers of tunnel + * information (IPSEC within IPSEC within ...). + * + * Maintaining ordering for protocol streams is a critical design concern. + * Enforcing ordering limits the opportunity for concurrency, but maintains + * the strong ordering requirements found in some protocols, such as TCP. Of + * related concern is CPU affinity--it is desirable to process all data + * associated with a particular stream on the same CPU over time in order to + * avoid acquiring locks associated with the connection on different CPUs, + * keep connection data in one cache, and to generally encourage associated + * user threads to live on the same CPU as the stream. It's also desirable + * to avoid lock migration and contention where locks are associated with + * more than one flow. + * + * netisr supports several policy variations, represented by the + * NETISR_POLICY_* constants, allowing protocols to play a varying role in + * identifying flows, assigning work to CPUs, etc. These are described in + * detail in netisr.h. + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef DDB +#include +#endif + +#include +#include +#include +#include + +/*- + * Synchronize use and modification of the registered netisr data structures; + * acquire a read lock while modifying the set of registered protocols to + * prevent partially registered or unregistered protocols from being run. + * + * The following data structures and fields are protected by this lock: + * + * - The np array, including all fields of struct netisr_proto. + * - The nws array, including all fields of struct netisr_worker. + * - The nws_array array. + * + * Note: the NETISR_LOCKING define controls whether read locks are acquired + * in packet processing paths requiring netisr registration stability. This + * is disabled by default as it can lead to a measurable performance + * degradation even with rmlocks (3%-6% for loopback ping-pong traffic), and + * because netisr registration and unregistration is extremely rare at + * runtime. If it becomes more common, this decision should be revisited. + * + * XXXRW: rmlocks don't support assertions. 
+ */ +static struct rmlock netisr_rmlock; +#define NETISR_LOCK_INIT() rm_init_flags(&netisr_rmlock, "netisr", \ + RM_NOWITNESS) +#define NETISR_LOCK_ASSERT() +#define NETISR_RLOCK(tracker) rm_rlock(&netisr_rmlock, (tracker)) +#define NETISR_RUNLOCK(tracker) rm_runlock(&netisr_rmlock, (tracker)) +#define NETISR_WLOCK() rm_wlock(&netisr_rmlock) +#define NETISR_WUNLOCK() rm_wunlock(&netisr_rmlock) +/* #define NETISR_LOCKING */ + +SYSCTL_NODE(_net, OID_AUTO, isr, CTLFLAG_RW, 0, "netisr"); + +/*- + * Three direct dispatch policies are supported: + * + * - Always defer: all work is scheduled for a netisr, regardless of context. + * (!direct) + * + * - Hybrid: if the executing context allows direct dispatch, and we're + * running on the CPU the work would be done on, then direct dispatch if it + * wouldn't violate ordering constraints on the workstream. + * (direct && !direct_force) + * + * - Always direct: if the executing context allows direct dispatch, always + * direct dispatch. (direct && direct_force) + * + * Notice that changing the global policy could lead to short periods of + * misordered processing, but this is considered acceptable as compared to + * the complexity of enforcing ordering during policy changes. + */ +static int netisr_direct_force = 1; /* Always direct dispatch. */ +TUNABLE_INT("net.isr.direct_force", &netisr_direct_force); +SYSCTL_INT(_net_isr, OID_AUTO, direct_force, CTLFLAG_RW, + &netisr_direct_force, 0, "Force direct dispatch"); + +static int netisr_direct = 1; /* Enable direct dispatch. */ +TUNABLE_INT("net.isr.direct", &netisr_direct); +SYSCTL_INT(_net_isr, OID_AUTO, direct, CTLFLAG_RW, + &netisr_direct, 0, "Enable direct dispatch"); + +/* + * Allow the administrator to limit the number of threads (CPUs) to use for + * netisr. We don't check netisr_maxthreads before creating the thread for + * CPU 0, so in practice we ignore values <= 1. This must be set at boot. + * We will create at most one thread per CPU. + */ +static int netisr_maxthreads = -1; /* Max number of threads. */ +TUNABLE_INT("net.isr.maxthreads", &netisr_maxthreads); +SYSCTL_INT(_net_isr, OID_AUTO, maxthreads, CTLFLAG_RD, + &netisr_maxthreads, 0, + "Use at most this many CPUs for netisr processing"); + +static int netisr_bindthreads = 0; /* Bind threads to CPUs. */ +TUNABLE_INT("net.isr.bindthreads", &netisr_bindthreads); +SYSCTL_INT(_net_isr, OID_AUTO, bindthreads, CTLFLAG_RD, + &netisr_bindthreads, 0, "Bind netisr threads to CPUs."); + +/* + * Limit per-workstream queues to at most net.isr.maxqlimit, both for initial + * configuration and later modification using netisr_setqlimit(). + */ +#define NETISR_DEFAULT_MAXQLIMIT 10240 +static u_int netisr_maxqlimit = NETISR_DEFAULT_MAXQLIMIT; +TUNABLE_INT("net.isr.maxqlimit", &netisr_maxqlimit); +SYSCTL_INT(_net_isr, OID_AUTO, maxqlimit, CTLFLAG_RD, + &netisr_maxqlimit, 0, + "Maximum netisr per-protocol, per-CPU queue depth."); + +/* + * The default per-workstream queue limit for protocols that don't initialize + * the nh_qlimit field of their struct netisr_handler. If this is set above + * netisr_maxqlimit, we truncate it to the maximum during boot. 
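+ *
+ * Both limits are boot-time tunables; e.g. (illustrative values, not a
+ * recommendation) a loader.conf(5) entry of net.isr.maxqlimit="20480"
+ * together with net.isr.defaultqlimit="1024" raises both the ceiling and
+ * the default.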
+ */
+#define	NETISR_DEFAULT_DEFAULTQLIMIT	256
+static u_int	netisr_defaultqlimit = NETISR_DEFAULT_DEFAULTQLIMIT;
+TUNABLE_INT("net.isr.defaultqlimit", &netisr_defaultqlimit);
+SYSCTL_INT(_net_isr, OID_AUTO, defaultqlimit, CTLFLAG_RD,
+    &netisr_defaultqlimit, 0,
+    "Default netisr per-protocol, per-CPU queue limit if not set by protocol");
+
+/*
+ * Each protocol is described by a struct netisr_proto, which holds all
+ * global per-protocol information.  This data structure is set up by
+ * netisr_register(), and derived from the public struct netisr_handler.
+ */
+struct netisr_proto {
+	const char	*np_name;	/* Character string protocol name. */
+	netisr_handler_t *np_handler;	/* Protocol handler. */
+	netisr_m2flow_t	*np_m2flow;	/* Query flow for untagged packet. */
+	netisr_m2cpuid_t *np_m2cpuid;	/* Query CPU to process packet on. */
+	netisr_drainedcpu_t *np_drainedcpu; /* Callback when drained a queue. */
+	u_int		 np_qlimit;	/* Maximum per-CPU queue depth. */
+	u_int		 np_policy;	/* Work placement policy. */
+};
+
+#define	NETISR_MAXPROT	16		/* Compile-time limit. */
+
+/*
+ * The np array describes all registered protocols, indexed by protocol
+ * number.
+ */
+static struct netisr_proto	np[NETISR_MAXPROT];
+
+/*
+ * Protocol-specific work for each workstream is described by struct
+ * netisr_work.  Each work descriptor consists of an mbuf queue and
+ * statistics.
+ */
+struct netisr_work {
+	/*
+	 * Packet queue, linked by m_nextpkt.
+	 */
+	struct mbuf	*nw_head;
+	struct mbuf	*nw_tail;
+	u_int		 nw_len;
+	u_int		 nw_qlimit;
+	u_int		 nw_watermark;
+
+	/*
+	 * Statistics -- written unlocked, but mostly from curcpu.
+	 */
+	u_int64_t	 nw_dispatched;	/* Number of direct dispatches. */
+	u_int64_t	 nw_hybrid_dispatched; /* "" hybrid dispatches. */
+	u_int64_t	 nw_qdrops;	/* "" drops. */
+	u_int64_t	 nw_queued;	/* "" enqueues. */
+	u_int64_t	 nw_handled;	/* "" handled in worker. */
+};
+
+/*
+ * Workstreams hold a set of ordered work across each protocol, and are
+ * described by netisr_workstream.  Each workstream is associated with a
+ * worker thread, which in turn is pinned to a CPU.  Work associated with a
+ * workstream can be processed in other threads during direct dispatch;
+ * concurrent processing is prevented by the NWS_RUNNING flag, which
+ * indicates that a thread is already processing the work queue.
+ */
+struct netisr_workstream {
+	struct intr_event *nws_intr_event;	/* Handler for stream. */
+	void		*nws_swi_cookie;	/* swi(9) cookie for stream. */
+	struct mtx	 nws_mtx;		/* Synchronize work. */
+	u_int		 nws_cpu;		/* CPU pinning. */
+	u_int		 nws_flags;		/* Wakeup flags. */
+	u_int		 nws_pendingbits;	/* Scheduled protocols. */
+
+	/*
+	 * Each protocol has per-workstream data.
+	 */
+	struct netisr_work	nws_work[NETISR_MAXPROT];
+} __aligned(CACHE_LINE_SIZE);
+
+/*
+ * Per-CPU workstream data.
+ */
+DPCPU_DEFINE(struct netisr_workstream, nws);
+
+/*
+ * Map contiguous values between 0 and nws_count into CPU IDs appropriate for
+ * accessing workstreams.  This allows constructions of the form
+ * DPCPU_ID_GET(nws_array[arbitraryvalue % nws_count], nws).
+ */
+static u_int				 nws_array[MAXCPU];
+
+/*
+ * Number of registered workstreams.  Will be at most the number of running
+ * CPUs once fully started.
+ */
+static u_int				 nws_count;
+SYSCTL_INT(_net_isr, OID_AUTO, numthreads, CTLFLAG_RD,
+    &nws_count, 0, "Number of extant netisr threads.");
+
+/*
+ * Per-workstream flags.
+ */
+#define	NWS_RUNNING	0x00000001	/* Currently running in a thread. */
+#define	NWS_DISPATCHING	0x00000002	/* Currently being direct-dispatched. */
+#define	NWS_SCHEDULED	0x00000004	/* Signal issued. */
+
+/*
+ * Synchronization for each workstream: a mutex protects all mutable fields
+ * in each stream, including per-protocol state (mbuf queues).  The SWI is
+ * woken up if asynchronous dispatch is required.
+ */
+#define	NWS_LOCK(s)		mtx_lock(&(s)->nws_mtx)
+#define	NWS_LOCK_ASSERT(s)	mtx_assert(&(s)->nws_mtx, MA_OWNED)
+#define	NWS_UNLOCK(s)		mtx_unlock(&(s)->nws_mtx)
+#define	NWS_SIGNAL(s)		swi_sched((s)->nws_swi_cookie, 0)
+
+#ifndef __rtems__
+/*
+ * Utility routines for protocols that implement their own mapping of flows
+ * to CPUs.
+ */
+u_int
+netisr_get_cpucount(void)
+{
+
+	return (nws_count);
+}
+
+u_int
+netisr_get_cpuid(u_int cpunumber)
+{
+
+	KASSERT(cpunumber < nws_count, ("%s: %u > %u", __func__, cpunumber,
+	    nws_count));
+
+	return (nws_array[cpunumber]);
+}
+#endif /* __rtems__ */
+
+/*
+ * The default implementation of flow -> CPU ID mapping.
+ *
+ * Non-static so that protocols can use it to map their own work to specific
+ * CPUs in a manner consistent with netisr for affinity purposes.
+ */
+u_int
+netisr_default_flow2cpu(u_int flowid)
+{
+
+	return (nws_array[flowid % nws_count]);
+}
+
+/*
+ * Register a new netisr handler, which requires initializing per-protocol
+ * fields for each workstream.  All netisr work is briefly suspended while
+ * the protocol is installed.
+ */
+void
+netisr_register(const struct netisr_handler *nhp)
+{
+	struct netisr_work *npwp;
+	const char *name;
+	u_int i, proto;
+
+	proto = nhp->nh_proto;
+	name = nhp->nh_name;
+
+	/*
+	 * Test that the requested registration is valid.
+	 */
+	KASSERT(nhp->nh_name != NULL,
+	    ("%s: nh_name NULL for %u", __func__, proto));
+	KASSERT(nhp->nh_handler != NULL,
+	    ("%s: nh_handler NULL for %s", __func__, name));
+	KASSERT(nhp->nh_policy == NETISR_POLICY_SOURCE ||
+	    nhp->nh_policy == NETISR_POLICY_FLOW ||
+	    nhp->nh_policy == NETISR_POLICY_CPU,
+	    ("%s: unsupported nh_policy %u for %s", __func__,
+	    nhp->nh_policy, name));
+	KASSERT(nhp->nh_policy == NETISR_POLICY_FLOW ||
+	    nhp->nh_m2flow == NULL,
+	    ("%s: nh_policy != FLOW but m2flow defined for %s", __func__,
+	    name));
+	KASSERT(nhp->nh_policy == NETISR_POLICY_CPU || nhp->nh_m2cpuid == NULL,
+	    ("%s: nh_policy != CPU but m2cpuid defined for %s", __func__,
+	    name));
+	KASSERT(nhp->nh_policy != NETISR_POLICY_CPU || nhp->nh_m2cpuid != NULL,
+	    ("%s: nh_policy == CPU but m2cpuid not defined for %s", __func__,
+	    name));
+	KASSERT(proto < NETISR_MAXPROT,
+	    ("%s(%u, %s): protocol too big", __func__, proto, name));
+
+	/*
+	 * Test that no existing registration exists for this protocol.
+ */ + NETISR_WLOCK(); + KASSERT(np[proto].np_name == NULL, + ("%s(%u, %s): name present", __func__, proto, name)); + KASSERT(np[proto].np_handler == NULL, + ("%s(%u, %s): handler present", __func__, proto, name)); + + np[proto].np_name = name; + np[proto].np_handler = nhp->nh_handler; + np[proto].np_m2flow = nhp->nh_m2flow; + np[proto].np_m2cpuid = nhp->nh_m2cpuid; + np[proto].np_drainedcpu = nhp->nh_drainedcpu; + if (nhp->nh_qlimit == 0) + np[proto].np_qlimit = netisr_defaultqlimit; + else if (nhp->nh_qlimit > netisr_maxqlimit) { + printf("%s: %s requested queue limit %u capped to " + "net.isr.maxqlimit %u\n", __func__, name, nhp->nh_qlimit, + netisr_maxqlimit); + np[proto].np_qlimit = netisr_maxqlimit; + } else + np[proto].np_qlimit = nhp->nh_qlimit; + np[proto].np_policy = nhp->nh_policy; + for (i = 0; i <= mp_maxid; i++) { + if (CPU_ABSENT(i)) + continue; + npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto]; + bzero(npwp, sizeof(*npwp)); + npwp->nw_qlimit = np[proto].np_qlimit; + } + NETISR_WUNLOCK(); +} + +/* + * Clear drop counters across all workstreams for a protocol. + */ +void +netisr_clearqdrops(const struct netisr_handler *nhp) +{ + struct netisr_work *npwp; +#ifdef INVARIANTS + const char *name; +#endif + u_int i, proto; + + proto = nhp->nh_proto; +#ifdef INVARIANTS + name = nhp->nh_name; +#endif + KASSERT(proto < NETISR_MAXPROT, + ("%s(%u): protocol too big for %s", __func__, proto, name)); + + NETISR_WLOCK(); + KASSERT(np[proto].np_handler != NULL, + ("%s(%u): protocol not registered for %s", __func__, proto, + name)); + + for (i = 0; i <= mp_maxid; i++) { + if (CPU_ABSENT(i)) + continue; + npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto]; + npwp->nw_qdrops = 0; + } + NETISR_WUNLOCK(); +} + +/* + * Query the current drop counters across all workstreams for a protocol. + */ +void +netisr_getqdrops(const struct netisr_handler *nhp, u_int64_t *qdropp) +{ + struct netisr_work *npwp; + struct rm_priotracker tracker; +#ifdef INVARIANTS + const char *name; +#endif + u_int i, proto; + + *qdropp = 0; + proto = nhp->nh_proto; +#ifdef INVARIANTS + name = nhp->nh_name; +#endif + KASSERT(proto < NETISR_MAXPROT, + ("%s(%u): protocol too big for %s", __func__, proto, name)); + + NETISR_RLOCK(&tracker); + KASSERT(np[proto].np_handler != NULL, + ("%s(%u): protocol not registered for %s", __func__, proto, + name)); + + for (i = 0; i <= mp_maxid; i++) { + if (CPU_ABSENT(i)) + continue; + npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto]; + *qdropp += npwp->nw_qdrops; + } + NETISR_RUNLOCK(&tracker); +} + +/* + * Query the current queue limit for per-workstream queues for a protocol. + */ +void +netisr_getqlimit(const struct netisr_handler *nhp, u_int *qlimitp) +{ + struct rm_priotracker tracker; +#ifdef INVARIANTS + const char *name; +#endif + u_int proto; + + proto = nhp->nh_proto; +#ifdef INVARIANTS + name = nhp->nh_name; +#endif + KASSERT(proto < NETISR_MAXPROT, + ("%s(%u): protocol too big for %s", __func__, proto, name)); + + NETISR_RLOCK(&tracker); + KASSERT(np[proto].np_handler != NULL, + ("%s(%u): protocol not registered for %s", __func__, proto, + name)); + *qlimitp = np[proto].np_qlimit; + NETISR_RUNLOCK(&tracker); +} + +/* + * Update the queue limit across per-workstream queues for a protocol. We + * simply change the limits, and don't drain overflowed packets as they will + * (hopefully) take care of themselves shortly. 
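+ *
+ * A protocol might use this at runtime, e.g. (sketch; "foo_nh" is a
+ * hypothetical handler structure registered earlier):
+ *
+ *	error = netisr_setqlimit(&foo_nh, 512);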
+ */ +int +netisr_setqlimit(const struct netisr_handler *nhp, u_int qlimit) +{ + struct netisr_work *npwp; +#ifdef INVARIANTS + const char *name; +#endif + u_int i, proto; + + if (qlimit > netisr_maxqlimit) + return (EINVAL); + + proto = nhp->nh_proto; +#ifdef INVARIANTS + name = nhp->nh_name; +#endif + KASSERT(proto < NETISR_MAXPROT, + ("%s(%u): protocol too big for %s", __func__, proto, name)); + + NETISR_WLOCK(); + KASSERT(np[proto].np_handler != NULL, + ("%s(%u): protocol not registered for %s", __func__, proto, + name)); + + np[proto].np_qlimit = qlimit; + for (i = 0; i <= mp_maxid; i++) { + if (CPU_ABSENT(i)) + continue; + npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto]; + npwp->nw_qlimit = qlimit; + } + NETISR_WUNLOCK(); + return (0); +} + +/* + * Drain all packets currently held in a particular protocol work queue. + */ +static void +netisr_drain_proto(struct netisr_work *npwp) +{ + struct mbuf *m; + + /* + * We would assert the lock on the workstream but it's not passed in. + */ + while ((m = npwp->nw_head) != NULL) { + npwp->nw_head = m->m_nextpkt; + m->m_nextpkt = NULL; + if (npwp->nw_head == NULL) + npwp->nw_tail = NULL; + npwp->nw_len--; + m_freem(m); + } + KASSERT(npwp->nw_tail == NULL, ("%s: tail", __func__)); + KASSERT(npwp->nw_len == 0, ("%s: len", __func__)); +} + +/* + * Remove the registration of a network protocol, which requires clearing + * per-protocol fields across all workstreams, including freeing all mbufs in + * the queues at time of unregister. All work in netisr is briefly suspended + * while this takes place. + */ +void +netisr_unregister(const struct netisr_handler *nhp) +{ + struct netisr_work *npwp; +#ifdef INVARIANTS + const char *name; +#endif + u_int i, proto; + + proto = nhp->nh_proto; +#ifdef INVARIANTS + name = nhp->nh_name; +#endif + KASSERT(proto < NETISR_MAXPROT, + ("%s(%u): protocol too big for %s", __func__, proto, name)); + + NETISR_WLOCK(); + KASSERT(np[proto].np_handler != NULL, + ("%s(%u): protocol not registered for %s", __func__, proto, + name)); + + np[proto].np_name = NULL; + np[proto].np_handler = NULL; + np[proto].np_m2flow = NULL; + np[proto].np_m2cpuid = NULL; + np[proto].np_qlimit = 0; + np[proto].np_policy = 0; + for (i = 0; i <= mp_maxid; i++) { + if (CPU_ABSENT(i)) + continue; + npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto]; + netisr_drain_proto(npwp); + bzero(npwp, sizeof(*npwp)); + } + NETISR_WUNLOCK(); +} + +/* + * Look up the workstream given a packet and source identifier. Do this by + * checking the protocol's policy, and optionally call out to the protocol + * for assistance if required. + */ +static struct mbuf * +netisr_select_cpuid(struct netisr_proto *npp, uintptr_t source, + struct mbuf *m, u_int *cpuidp) +{ + struct ifnet *ifp; + + NETISR_LOCK_ASSERT(); + + /* + * In the event we have only one worker, shortcut and deliver to it + * without further ado. + */ + if (nws_count == 1) { + *cpuidp = nws_array[0]; + return (m); + } + + /* + * What happens next depends on the policy selected by the protocol. + * If we want to support per-interface policies, we should do that + * here first. 
+	 */
+	switch (npp->np_policy) {
+	case NETISR_POLICY_CPU:
+		return (npp->np_m2cpuid(m, source, cpuidp));
+
+	case NETISR_POLICY_FLOW:
+		if (!(m->m_flags & M_FLOWID) && npp->np_m2flow != NULL) {
+			m = npp->np_m2flow(m, source);
+			if (m == NULL)
+				return (NULL);
+		}
+		if (m->m_flags & M_FLOWID) {
+			*cpuidp =
+			    netisr_default_flow2cpu(m->m_pkthdr.flowid);
+			return (m);
+		}
+		/* FALLTHROUGH */
+
+	case NETISR_POLICY_SOURCE:
+		ifp = m->m_pkthdr.rcvif;
+		if (ifp != NULL)
+			*cpuidp = nws_array[(ifp->if_index + source) %
+			    nws_count];
+		else
+			*cpuidp = nws_array[source % nws_count];
+		return (m);
+
+	default:
+		panic("%s: invalid policy %u for %s", __func__,
+		    npp->np_policy, npp->np_name);
+	}
+}
+
+/*
+ * Process packets associated with a workstream and protocol.  For reasons of
+ * fairness, we process up to one complete netisr queue at a time, moving the
+ * queue to a stack-local queue for processing, but do not loop refreshing
+ * from the global queue.  The caller is responsible for deciding whether to
+ * loop, and for setting the NWS_RUNNING flag.  The passed workstream will be
+ * locked on entry and relocked before return, but will be released while
+ * processing.  The number of packets processed is returned.
+ */
+static u_int
+netisr_process_workstream_proto(struct netisr_workstream *nwsp, u_int proto)
+{
+	struct netisr_work local_npw, *npwp;
+	u_int handled;
+	struct mbuf *m;
+
+	NETISR_LOCK_ASSERT();
+	NWS_LOCK_ASSERT(nwsp);
+
+	KASSERT(nwsp->nws_flags & NWS_RUNNING,
+	    ("%s(%u): not running", __func__, proto));
+	KASSERT(proto >= 0 && proto < NETISR_MAXPROT,
+	    ("%s(%u): invalid proto\n", __func__, proto));
+
+	npwp = &nwsp->nws_work[proto];
+	if (npwp->nw_len == 0)
+		return (0);
+
+	/*
+	 * Move the global work queue to a thread-local work queue.
+	 *
+	 * Notice that this means the effective maximum length of the queue
+	 * is actually twice that of the maximum queue length specified in
+	 * the protocol registration call.
+	 */
+	handled = npwp->nw_len;
+	local_npw = *npwp;
+	npwp->nw_head = NULL;
+	npwp->nw_tail = NULL;
+	npwp->nw_len = 0;
+	nwsp->nws_pendingbits &= ~(1 << proto);
+	NWS_UNLOCK(nwsp);
+	while ((m = local_npw.nw_head) != NULL) {
+		local_npw.nw_head = m->m_nextpkt;
+		m->m_nextpkt = NULL;
+		if (local_npw.nw_head == NULL)
+			local_npw.nw_tail = NULL;
+		local_npw.nw_len--;
+		VNET_ASSERT(m->m_pkthdr.rcvif != NULL);
+		CURVNET_SET(m->m_pkthdr.rcvif->if_vnet);
+		np[proto].np_handler(m);
+		CURVNET_RESTORE();
+	}
+	KASSERT(local_npw.nw_len == 0,
+	    ("%s(%u): len %u", __func__, proto, local_npw.nw_len));
+	if (np[proto].np_drainedcpu)
+		np[proto].np_drainedcpu(nwsp->nws_cpu);
+	NWS_LOCK(nwsp);
+	npwp->nw_handled += handled;
+	return (handled);
+}
+
+/*
+ * SWI handler for netisr -- processes packets in a set of workstreams that
+ * it owns, woken up by calls to NWS_SIGNAL().  If this workstream is already
+ * being direct dispatched, go back to sleep and wait for the dispatching
+ * thread to wake us up again.
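+ * (Each wakeup drains every protocol that has a bit set in
+ * nws_pendingbits, lowest protocol number first via ffs().)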
+ */ +static void +swi_net(void *arg) +{ +#ifdef NETISR_LOCKING + struct rm_priotracker tracker; +#endif + struct netisr_workstream *nwsp; + u_int bits, prot; + + nwsp = arg; + +#ifdef DEVICE_POLLING + KASSERT(nws_count == 1, + ("%s: device_polling but nws_count != 1", __func__)); + netisr_poll(); +#endif +#ifdef NETISR_LOCKING + NETISR_RLOCK(&tracker); +#endif + NWS_LOCK(nwsp); + KASSERT(!(nwsp->nws_flags & NWS_RUNNING), ("swi_net: running")); + if (nwsp->nws_flags & NWS_DISPATCHING) + goto out; + nwsp->nws_flags |= NWS_RUNNING; + nwsp->nws_flags &= ~NWS_SCHEDULED; + while ((bits = nwsp->nws_pendingbits) != 0) { + while ((prot = ffs(bits)) != 0) { + prot--; + bits &= ~(1 << prot); + (void)netisr_process_workstream_proto(nwsp, prot); + } + } + nwsp->nws_flags &= ~NWS_RUNNING; +out: + NWS_UNLOCK(nwsp); +#ifdef NETISR_LOCKING + NETISR_RUNLOCK(&tracker); +#endif +#ifdef DEVICE_POLLING + netisr_pollmore(); +#endif +} + +static int +netisr_queue_workstream(struct netisr_workstream *nwsp, u_int proto, + struct netisr_work *npwp, struct mbuf *m, int *dosignalp) +{ + + NWS_LOCK_ASSERT(nwsp); + + *dosignalp = 0; + if (npwp->nw_len < npwp->nw_qlimit) { + m->m_nextpkt = NULL; + if (npwp->nw_head == NULL) { + npwp->nw_head = m; + npwp->nw_tail = m; + } else { + npwp->nw_tail->m_nextpkt = m; + npwp->nw_tail = m; + } + npwp->nw_len++; + if (npwp->nw_len > npwp->nw_watermark) + npwp->nw_watermark = npwp->nw_len; + nwsp->nws_pendingbits |= (1 << proto); + if (!(nwsp->nws_flags & + (NWS_RUNNING | NWS_DISPATCHING | NWS_SCHEDULED))) { + nwsp->nws_flags |= NWS_SCHEDULED; + *dosignalp = 1; /* Defer until unlocked. */ + } + npwp->nw_queued++; + return (0); + } else { + m_freem(m); + npwp->nw_qdrops++; + return (ENOBUFS); + } +} + +static int +netisr_queue_internal(u_int proto, struct mbuf *m, u_int cpuid) +{ + struct netisr_workstream *nwsp; + struct netisr_work *npwp; + int dosignal, error; + +#ifdef NETISR_LOCKING + NETISR_LOCK_ASSERT(); +#endif + KASSERT(cpuid <= mp_maxid, ("%s: cpuid too big (%u, %u)", __func__, + cpuid, mp_maxid)); + KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid)); + + dosignal = 0; + error = 0; + nwsp = DPCPU_ID_PTR(cpuid, nws); + npwp = &nwsp->nws_work[proto]; + NWS_LOCK(nwsp); + error = netisr_queue_workstream(nwsp, proto, npwp, m, &dosignal); + NWS_UNLOCK(nwsp); + if (dosignal) + NWS_SIGNAL(nwsp); + return (error); +} + +int +netisr_queue_src(u_int proto, uintptr_t source, struct mbuf *m) +{ +#ifdef NETISR_LOCKING + struct rm_priotracker tracker; +#endif + u_int cpuid; + int error; + + KASSERT(proto < NETISR_MAXPROT, + ("%s: invalid proto %u", __func__, proto)); + +#ifdef NETISR_LOCKING + NETISR_RLOCK(&tracker); +#endif + KASSERT(np[proto].np_handler != NULL, + ("%s: invalid proto %u", __func__, proto)); + + m = netisr_select_cpuid(&np[proto], source, m, &cpuid); + if (m != NULL) { + KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, + cpuid)); + error = netisr_queue_internal(proto, m, cpuid); + } else + error = ENOBUFS; +#ifdef NETISR_LOCKING + NETISR_RUNLOCK(&tracker); +#endif + return (error); +} + +int +netisr_queue(u_int proto, struct mbuf *m) +{ + + return (netisr_queue_src(proto, 0, m)); +} + +/* + * Dispatch a packet for netisr processing, direct dispatch permitted by + * calling context. 
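+ *
+ * A typical caller is a link-layer input path, e.g. (sketch):
+ *
+ *	m->m_pkthdr.rcvif = ifp;
+ *	netisr_dispatch(NETISR_ETHER, m);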
+ */ +int +netisr_dispatch_src(u_int proto, uintptr_t source, struct mbuf *m) +{ +#ifdef NETISR_LOCKING + struct rm_priotracker tracker; +#endif + struct netisr_workstream *nwsp; + struct netisr_work *npwp; + int dosignal, error; + u_int cpuid; + + /* + * If direct dispatch is entirely disabled, fall back on queueing. + */ + if (!netisr_direct) + return (netisr_queue_src(proto, source, m)); + + KASSERT(proto < NETISR_MAXPROT, + ("%s: invalid proto %u", __func__, proto)); +#ifdef NETISR_LOCKING + NETISR_RLOCK(&tracker); +#endif + KASSERT(np[proto].np_handler != NULL, + ("%s: invalid proto %u", __func__, proto)); + + /* + * If direct dispatch is forced, then unconditionally dispatch + * without a formal CPU selection. Borrow the current CPU's stats, + * even if there's no worker on it. In this case we don't update + * nws_flags because all netisr processing will be source ordered due + * to always being forced to directly dispatch. + */ + if (netisr_direct_force) { + nwsp = DPCPU_PTR(nws); + npwp = &nwsp->nws_work[proto]; + npwp->nw_dispatched++; + npwp->nw_handled++; + np[proto].np_handler(m); + error = 0; + goto out_unlock; + } + + /* + * Otherwise, we execute in a hybrid mode where we will try to direct + * dispatch if we're on the right CPU and the netisr worker isn't + * already running. + */ + m = netisr_select_cpuid(&np[proto], source, m, &cpuid); + if (m == NULL) { + error = ENOBUFS; + goto out_unlock; + } + KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid)); +#ifndef __rtems__ + sched_pin(); +#endif /* __rtems__ */ + if (cpuid != curcpu) + goto queue_fallback; + nwsp = DPCPU_PTR(nws); + npwp = &nwsp->nws_work[proto]; + + /*- + * We are willing to direct dispatch only if three conditions hold: + * + * (1) The netisr worker isn't already running, + * (2) Another thread isn't already directly dispatching, and + * (3) The netisr hasn't already been woken up. + */ + NWS_LOCK(nwsp); + if (nwsp->nws_flags & (NWS_RUNNING | NWS_DISPATCHING | NWS_SCHEDULED)) { + error = netisr_queue_workstream(nwsp, proto, npwp, m, + &dosignal); + NWS_UNLOCK(nwsp); + if (dosignal) + NWS_SIGNAL(nwsp); + goto out_unpin; + } + + /* + * The current thread is now effectively the netisr worker, so set + * the dispatching flag to prevent concurrent processing of the + * stream from another thread (even the netisr worker), which could + * otherwise lead to effective misordering of the stream. + */ + nwsp->nws_flags |= NWS_DISPATCHING; + NWS_UNLOCK(nwsp); + np[proto].np_handler(m); + NWS_LOCK(nwsp); + nwsp->nws_flags &= ~NWS_DISPATCHING; + npwp->nw_handled++; + npwp->nw_hybrid_dispatched++; + + /* + * If other work was enqueued by another thread while we were direct + * dispatching, we need to signal the netisr worker to do that work. + * In the future, we might want to do some of that work in the + * current thread, rather than trigger further context switches. If + * so, we'll want to establish a reasonable bound on the work done in + * the "borrowed" context. 
+ */ + if (nwsp->nws_pendingbits != 0) { + nwsp->nws_flags |= NWS_SCHEDULED; + dosignal = 1; + } else + dosignal = 0; + NWS_UNLOCK(nwsp); + if (dosignal) + NWS_SIGNAL(nwsp); + error = 0; + goto out_unpin; + +queue_fallback: + error = netisr_queue_internal(proto, m, cpuid); +out_unpin: +#ifndef __rtems__ + sched_unpin(); +#endif /* __rtems__ */ +out_unlock: +#ifdef NETISR_LOCKING + NETISR_RUNLOCK(&tracker); +#endif + return (error); +} + +int +netisr_dispatch(u_int proto, struct mbuf *m) +{ + + return (netisr_dispatch_src(proto, 0, m)); +} + +#ifdef DEVICE_POLLING +/* + * Kernel polling borrows a netisr thread to run interface polling in; this + * function allows kernel polling to request that the netisr thread be + * scheduled even if no packets are pending for protocols. + */ +void +netisr_sched_poll(void) +{ + struct netisr_workstream *nwsp; + + nwsp = DPCPU_ID_PTR(nws_array[0], nws); + NWS_SIGNAL(nwsp); +} +#endif + +static void +netisr_start_swi(u_int cpuid, struct pcpu *pc) +{ + char swiname[12]; + struct netisr_workstream *nwsp; + int error; + + KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid)); + + nwsp = DPCPU_ID_PTR(cpuid, nws); + mtx_init(&nwsp->nws_mtx, "netisr_mtx", NULL, MTX_DEF); + nwsp->nws_cpu = cpuid; + snprintf(swiname, sizeof(swiname), "netisr %u", cpuid); + error = swi_add(&nwsp->nws_intr_event, swiname, swi_net, nwsp, + SWI_NET, INTR_MPSAFE, &nwsp->nws_swi_cookie); + if (error) + panic("%s: swi_add %d", __func__, error); +#ifndef __rtems__ + pc->pc_netisr = nwsp->nws_intr_event; + if (netisr_bindthreads) { + error = intr_event_bind(nwsp->nws_intr_event, cpuid); + if (error != 0) + printf("%s: cpu %u: intr_event_bind: %d", __func__, + cpuid, error); + } +#endif + NETISR_WLOCK(); + nws_array[nws_count] = nwsp->nws_cpu; + nws_count++; + NETISR_WUNLOCK(); +} + +/* + * Initialize the netisr subsystem. We rely on BSS and static initialization + * of most fields in global data structures. + * + * Start a worker thread for the boot CPU so that we can support network + * traffic immediately in case the network stack is used before additional + * CPUs are started (for example, diskless boot). + */ +static void +netisr_init(void *arg) +{ + + KASSERT(curcpu == 0, ("%s: not on CPU 0", __func__)); + + NETISR_LOCK_INIT(); + if (netisr_maxthreads < 1) + netisr_maxthreads = 1; + if (netisr_maxthreads > mp_ncpus) { + printf("netisr_init: forcing maxthreads from %d to %d\n", + netisr_maxthreads, mp_ncpus); + netisr_maxthreads = mp_ncpus; + } + if (netisr_defaultqlimit > netisr_maxqlimit) { + printf("netisr_init: forcing defaultqlimit from %d to %d\n", + netisr_defaultqlimit, netisr_maxqlimit); + netisr_defaultqlimit = netisr_maxqlimit; + } +#ifdef DEVICE_POLLING + /* + * The device polling code is not yet aware of how to deal with + * multiple netisr threads, so for the time being compiling in device + * polling disables parallel netisr workers. + */ + if (netisr_maxthreads != 1 || netisr_bindthreads != 0) { + printf("netisr_init: forcing maxthreads to 1 and " + "bindthreads to 0 for device polling\n"); + netisr_maxthreads = 1; + netisr_bindthreads = 0; + } +#endif + +#ifndef __rtems__ + netisr_start_swi(curcpu, pcpu_find(curcpu)); +#else /* __rtems__ */ + netisr_start_swi(0, NULL); +#endif /* __rtems__ */ +} +SYSINIT(netisr_init, SI_SUB_SOFTINTR, SI_ORDER_FIRST, netisr_init, NULL); + +#ifndef __rtems__ +/* + * Start worker threads for additional CPUs. No attempt to gracefully handle + * work reassignment, we don't yet support dynamic reconfiguration. 
+ */
+static void
+netisr_start(void *arg)
+{
+	struct pcpu *pc;
+
+	SLIST_FOREACH(pc, &cpuhead, pc_allcpu) {
+		if (nws_count >= netisr_maxthreads)
+			break;
+		/* XXXRW: Is skipping absent CPUs still required here? */
+		if (CPU_ABSENT(pc->pc_cpuid))
+			continue;
+		/* Worker will already be present for boot CPU. */
+		if (pc->pc_netisr != NULL)
+			continue;
+		netisr_start_swi(pc->pc_cpuid, pc);
+	}
+}
+SYSINIT(netisr_start, SI_SUB_SMP, SI_ORDER_MIDDLE, netisr_start, NULL);
+#endif /* __rtems__ */
+
+#ifdef DDB
+DB_SHOW_COMMAND(netisr, db_show_netisr)
+{
+	struct netisr_workstream *nwsp;
+	struct netisr_work *nwp;
+	int first, proto;
+	u_int cpuid;
+
+	db_printf("%3s %6s %5s %5s %5s %8s %8s %8s %8s\n", "CPU", "Proto",
+	    "Len", "WMark", "Max", "Disp", "HDisp", "Drop", "Queue");
+	for (cpuid = 0; cpuid <= mp_maxid; cpuid++) {
+		if (CPU_ABSENT(cpuid))
+			continue;
+		nwsp = DPCPU_ID_PTR(cpuid, nws);
+		if (nwsp->nws_intr_event == NULL)
+			continue;
+		first = 1;
+		for (proto = 0; proto < NETISR_MAXPROT; proto++) {
+			if (np[proto].np_handler == NULL)
+				continue;
+			nwp = &nwsp->nws_work[proto];
+			if (first) {
+				db_printf("%3d ", cpuid);
+				first = 0;
+			} else
+				db_printf("%3s ", "");
+			db_printf(
+			    "%6s %5d %5d %5d %8ju %8ju %8ju %8ju\n",
+			    np[proto].np_name, nwp->nw_len,
+			    nwp->nw_watermark, nwp->nw_qlimit,
+			    nwp->nw_dispatched, nwp->nw_hybrid_dispatched,
+			    nwp->nw_qdrops, nwp->nw_queued);
+		}
+	}
+}
+#endif
diff --git a/freebsd/sys/net/netisr.h b/freebsd/sys/net/netisr.h
new file mode 100644
index 00000000..b755332a
--- /dev/null
+++ b/freebsd/sys/net/netisr.h
@@ -0,0 +1,156 @@
+/*-
+ * Copyright (c) 2007-2009 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NET_NETISR_HH_
+#define _NET_NETISR_HH_
+#ifdef _KERNEL
+
+/*
+ * The netisr (network interrupt service routine) provides a deferred
+ * execution environment in which (generally inbound) network processing can
+ * take place.  Protocols register handlers which will be executed directly,
+ * or via deferred dispatch, depending on the circumstances.
+ *
+ * Historically, this was implemented by the BSD software ISR facility; it is
+ * now implemented via a software ithread (SWI).
+ */ +#define NETISR_IP 1 +#define NETISR_IGMP 2 /* IGMPv3 output queue */ +#define NETISR_ROUTE 3 /* routing socket */ +#define NETISR_AARP 4 /* Appletalk ARP */ +#define NETISR_ATALK2 5 /* Appletalk phase 2 */ +#define NETISR_ATALK1 6 /* Appletalk phase 1 */ +#define NETISR_ARP 7 /* same as AF_LINK */ +#define NETISR_IPX 8 /* same as AF_IPX */ +#define NETISR_ETHER 9 /* ethernet input */ +#define NETISR_IPV6 10 +#define NETISR_NATM 11 +#define NETISR_EPAIR 12 /* if_epair(4) */ + +/*- + * Protocols express ordering constraints and affinity preferences by + * implementing one or neither of nh_m2flow and nh_m2cpuid, which are used by + * netisr to determine which per-CPU workstream to assign mbufs to. + * + * The following policies may be used by protocols: + * + * NETISR_POLICY_SOURCE - netisr should maintain source ordering without + * advice from the protocol. netisr will ignore any + * flow IDs present on the mbuf for the purposes of + * work placement. + * + * NETISR_POLICY_FLOW - netisr should maintain flow ordering as defined by + * the mbuf header flow ID field. If the protocol + * implements nh_m2flow, then netisr will query the + * protocol in the event that the mbuf doesn't have a + * flow ID, falling back on source ordering. + * + * NETISR_POLICY_CPU - netisr will delegate all work placement decisions to + * the protocol, querying nh_m2cpuid for each packet. + * + * Protocols might make decisions about work placement based on an existing + * calculated flow ID on the mbuf, such as one provided in hardware, the + * receive interface pointed to by the mbuf (if any), the optional source + * identifier passed at some dispatch points, or even parse packet headers to + * calculate a flow. Both protocol handlers may return a new mbuf pointer + * for the chain, or NULL if the packet proves invalid or m_pullup() fails. + * + * XXXRW: If we eventually support dynamic reconfiguration, there should be + * protocol handlers to notify them of CPU configuration changes so that they + * can rebalance work. + */ +struct mbuf; +typedef void netisr_handler_t(struct mbuf *m); +typedef struct mbuf *netisr_m2cpuid_t(struct mbuf *m, uintptr_t source, + u_int *cpuid); +typedef struct mbuf *netisr_m2flow_t(struct mbuf *m, uintptr_t source); +typedef void netisr_drainedcpu_t(u_int cpuid); + +#define NETISR_POLICY_SOURCE 1 /* Maintain source ordering. */ +#define NETISR_POLICY_FLOW 2 /* Maintain flow ordering. */ +#define NETISR_POLICY_CPU 3 /* Protocol determines CPU placement. */ + +/* + * Data structure describing a protocol handler. + */ +struct netisr_handler { + const char *nh_name; /* Character string protocol name. */ + netisr_handler_t *nh_handler; /* Protocol handler. */ + netisr_m2flow_t *nh_m2flow; /* Query flow for untagged packet. */ + netisr_m2cpuid_t *nh_m2cpuid; /* Query CPU to process mbuf on. */ + netisr_drainedcpu_t *nh_drainedcpu; /* Callback when drained a queue. */ + u_int nh_proto; /* Integer protocol ID. */ + u_int nh_qlimit; /* Maximum per-CPU queue depth. */ + u_int nh_policy; /* Work placement policy. */ + u_int nh_ispare[5]; /* For future use. */ + void *nh_pspare[4]; /* For future use. */ +}; + +/* + * Register, unregister, and other netisr handler management functions. 
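+ *
+ * For example (an illustrative sketch, not part of the original header),
+ * a protocol typically registers itself once at initialization time:
+ *
+ *	static struct netisr_handler foo_nh = {
+ *		.nh_name = "foo",
+ *		.nh_handler = foo_input,
+ *		.nh_proto = NETISR_FOO,
+ *		.nh_qlimit = 256,
+ *		.nh_policy = NETISR_POLICY_FLOW,
+ *	};
+ *
+ *	netisr_register(&foo_nh);
+ *
+ * Here foo_input and NETISR_FOO are hypothetical names; fields left
+ * zeroed (such as nh_qlimit) fall back to the netisr defaults.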
+ */ +void netisr_clearqdrops(const struct netisr_handler *nhp); +void netisr_getqdrops(const struct netisr_handler *nhp, + u_int64_t *qdropsp); +void netisr_getqlimit(const struct netisr_handler *nhp, u_int *qlimitp); +void netisr_register(const struct netisr_handler *nhp); +int netisr_setqlimit(const struct netisr_handler *nhp, u_int qlimit); +void netisr_unregister(const struct netisr_handler *nhp); + +/* + * Process a packet destined for a protocol, and attempt direct dispatch. + * Supplemental source ordering information can be passed using the _src + * variant. + */ +int netisr_dispatch(u_int proto, struct mbuf *m); +int netisr_dispatch_src(u_int proto, uintptr_t source, struct mbuf *m); +int netisr_queue(u_int proto, struct mbuf *m); +int netisr_queue_src(u_int proto, uintptr_t source, struct mbuf *m); + +/* + * Provide a default implementation of "map an ID to a CPU ID". + */ +u_int netisr_default_flow2cpu(u_int flowid); + +/* + * Utility routines to return the number of CPUs participting in netisr, and + * to return a mapping from a number to a CPU ID that can be used with the + * scheduler. + */ +u_int netisr_get_cpucount(void); +u_int netisr_get_cpuid(u_int cpunumber); + +/* + * Interfaces between DEVICE_POLLING and netisr. + */ +void netisr_sched_poll(void); +void netisr_poll(void); +void netisr_pollmore(void); + +#endif /* !_KERNEL */ +#endif /* !_NET_NETISR_HH_ */ diff --git a/freebsd/sys/net/pfil.c b/freebsd/sys/net/pfil.c new file mode 100644 index 00000000..3a382bc5 --- /dev/null +++ b/freebsd/sys/net/pfil.c @@ -0,0 +1,331 @@ +#include + +/* $FreeBSD$ */ +/* $NetBSD: pfil.c,v 1.20 2001/11/12 23:49:46 lukem Exp $ */ + +/*- + * Copyright (c) 1996 Matthew R. Green + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +static struct mtx pfil_global_lock; + +MTX_SYSINIT(pfil_heads_lock, &pfil_global_lock, "pfil_head_list lock", + MTX_DEF); + +static int pfil_list_add(pfil_list_t *, struct packet_filter_hook *, int); + +static int pfil_list_remove(pfil_list_t *, + int (*)(void *, struct mbuf **, struct ifnet *, int, struct inpcb *), + void *); + +LIST_HEAD(pfilheadhead, pfil_head); +VNET_DEFINE(struct pfilheadhead, pfil_head_list); +#define V_pfil_head_list VNET(pfil_head_list) + +/* + * pfil_run_hooks() runs the specified packet filter hooks. + */ +int +pfil_run_hooks(struct pfil_head *ph, struct mbuf **mp, struct ifnet *ifp, + int dir, struct inpcb *inp) +{ + struct rm_priotracker rmpt; + struct packet_filter_hook *pfh; + struct mbuf *m = *mp; + int rv = 0; + + PFIL_RLOCK(ph, &rmpt); + KASSERT(ph->ph_nhooks >= 0, ("Pfil hook count dropped < 0")); + for (pfh = pfil_hook_get(dir, ph); pfh != NULL; + pfh = TAILQ_NEXT(pfh, pfil_link)) { + if (pfh->pfil_func != NULL) { + rv = (*pfh->pfil_func)(pfh->pfil_arg, &m, ifp, dir, + inp); + if (rv != 0 || m == NULL) + break; + } + } + PFIL_RUNLOCK(ph, &rmpt); + *mp = m; + return (rv); +} + +/* + * pfil_head_register() registers a pfil_head with the packet filter hook + * mechanism. + */ +int +pfil_head_register(struct pfil_head *ph) +{ + struct pfil_head *lph; + + PFIL_LIST_LOCK(); + LIST_FOREACH(lph, &V_pfil_head_list, ph_list) { + if (ph->ph_type == lph->ph_type && + ph->ph_un.phu_val == lph->ph_un.phu_val) { + PFIL_LIST_UNLOCK(); + return (EEXIST); + } + } + PFIL_LOCK_INIT(ph); + ph->ph_nhooks = 0; + TAILQ_INIT(&ph->ph_in); + TAILQ_INIT(&ph->ph_out); + LIST_INSERT_HEAD(&V_pfil_head_list, ph, ph_list); + PFIL_LIST_UNLOCK(); + return (0); +} + +/* + * pfil_head_unregister() removes a pfil_head from the packet filter hook + * mechanism. The producer of the hook promises that all outstanding + * invocations of the hook have completed before it unregisters the hook. + */ +int +pfil_head_unregister(struct pfil_head *ph) +{ + struct packet_filter_hook *pfh, *pfnext; + + PFIL_LIST_LOCK(); + LIST_REMOVE(ph, ph_list); + PFIL_LIST_UNLOCK(); + TAILQ_FOREACH_SAFE(pfh, &ph->ph_in, pfil_link, pfnext) + free(pfh, M_IFADDR); + TAILQ_FOREACH_SAFE(pfh, &ph->ph_out, pfil_link, pfnext) + free(pfh, M_IFADDR); + PFIL_LOCK_DESTROY(ph); + return (0); +} + +/* + * pfil_head_get() returns the pfil_head for a given key/dlt. + */ +struct pfil_head * +pfil_head_get(int type, u_long val) +{ + struct pfil_head *ph; + + PFIL_LIST_LOCK(); + LIST_FOREACH(ph, &V_pfil_head_list, ph_list) + if (ph->ph_type == type && ph->ph_un.phu_val == val) + break; + PFIL_LIST_UNLOCK(); + return (ph); +} + +/* + * pfil_add_hook() adds a function to the packet filter hook. the + * flags are: + * PFIL_IN call me on incoming packets + * PFIL_OUT call me on outgoing packets + * PFIL_ALL call me on all of the above + * PFIL_WAITOK OK to call malloc with M_WAITOK. + */ +int +pfil_add_hook(int (*func)(void *, struct mbuf **, struct ifnet *, int, + struct inpcb *), void *arg, int flags, struct pfil_head *ph) +{ + struct packet_filter_hook *pfh1 = NULL; + struct packet_filter_hook *pfh2 = NULL; + int err; + + if (flags & PFIL_IN) { + pfh1 = (struct packet_filter_hook *)malloc(sizeof(*pfh1), + M_IFADDR, (flags & PFIL_WAITOK) ? 
M_WAITOK : M_NOWAIT); + if (pfh1 == NULL) { + err = ENOMEM; + goto error; + } + } + if (flags & PFIL_OUT) { + pfh2 = (struct packet_filter_hook *)malloc(sizeof(*pfh1), + M_IFADDR, (flags & PFIL_WAITOK) ? M_WAITOK : M_NOWAIT); + if (pfh2 == NULL) { + err = ENOMEM; + goto error; + } + } + PFIL_WLOCK(ph); + if (flags & PFIL_IN) { + pfh1->pfil_func = func; + pfh1->pfil_arg = arg; + err = pfil_list_add(&ph->ph_in, pfh1, flags & ~PFIL_OUT); + if (err) + goto locked_error; + ph->ph_nhooks++; + } + if (flags & PFIL_OUT) { + pfh2->pfil_func = func; + pfh2->pfil_arg = arg; + err = pfil_list_add(&ph->ph_out, pfh2, flags & ~PFIL_IN); + if (err) { + if (flags & PFIL_IN) + pfil_list_remove(&ph->ph_in, func, arg); + goto locked_error; + } + ph->ph_nhooks++; + } + PFIL_WUNLOCK(ph); + return (0); +locked_error: + PFIL_WUNLOCK(ph); +error: + if (pfh1 != NULL) + free(pfh1, M_IFADDR); + if (pfh2 != NULL) + free(pfh2, M_IFADDR); + return (err); +} + +/* + * pfil_remove_hook removes a specific function from the packet filter hook + * list. + */ +int +pfil_remove_hook(int (*func)(void *, struct mbuf **, struct ifnet *, int, + struct inpcb *), void *arg, int flags, struct pfil_head *ph) +{ + int err = 0; + + PFIL_WLOCK(ph); + if (flags & PFIL_IN) { + err = pfil_list_remove(&ph->ph_in, func, arg); + if (err == 0) + ph->ph_nhooks--; + } + if ((err == 0) && (flags & PFIL_OUT)) { + err = pfil_list_remove(&ph->ph_out, func, arg); + if (err == 0) + ph->ph_nhooks--; + } + PFIL_WUNLOCK(ph); + return (err); +} + +static int +pfil_list_add(pfil_list_t *list, struct packet_filter_hook *pfh1, int flags) +{ + struct packet_filter_hook *pfh; + + /* + * First make sure the hook is not already there. + */ + TAILQ_FOREACH(pfh, list, pfil_link) + if (pfh->pfil_func == pfh1->pfil_func && + pfh->pfil_arg == pfh1->pfil_arg) + return (EEXIST); + + /* + * Insert the input list in reverse order of the output list so that + * the same path is followed in or out of the kernel. + */ + if (flags & PFIL_IN) + TAILQ_INSERT_HEAD(list, pfh1, pfil_link); + else + TAILQ_INSERT_TAIL(list, pfh1, pfil_link); + return (0); +} + +/* + * pfil_list_remove is an internal function that takes a function off the + * specified list. + */ +static int +pfil_list_remove(pfil_list_t *list, + int (*func)(void *, struct mbuf **, struct ifnet *, int, struct inpcb *), + void *arg) +{ + struct packet_filter_hook *pfh; + + TAILQ_FOREACH(pfh, list, pfil_link) + if (pfh->pfil_func == func && pfh->pfil_arg == arg) { + TAILQ_REMOVE(list, pfh, pfil_link); + free(pfh, M_IFADDR); + return (0); + } + return (ENOENT); +} + +/**************** + * Stuff that must be initialized for every instance + * (including the first of course). + */ +static int +vnet_pfil_init(const void *unused) +{ + LIST_INIT(&V_pfil_head_list); + return (0); +} + +/*********************** + * Called for the removal of each instance. + */ +static int +vnet_pfil_uninit(const void *unused) +{ + /* XXX should panic if list is not empty */ + return 0; +} + +/* Define startup order. */ +#define PFIL_SYSINIT_ORDER SI_SUB_PROTO_BEGIN +#define PFIL_MODEVENT_ORDER (SI_ORDER_FIRST) /* On boot slot in here. */ +#define PFIL_VNET_ORDER (PFIL_MODEVENT_ORDER + 2) /* Later still. */ + +/* + * Starting up. + * VNET_SYSINIT is called for each existing vnet and each new vnet. + */ +VNET_SYSINIT(vnet_pfil_init, PFIL_SYSINIT_ORDER, PFIL_VNET_ORDER, + vnet_pfil_init, NULL); + +/* + * Closing up shop. These are done in REVERSE ORDER, + * Not called on reboot. + * VNET_SYSUNINIT is called for each exiting vnet as it exits. 
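+ *
+ * (Illustrative note, not part of the original file: vnet_pfil_init()
+ * above runs for every vnet, so each one starts out with an empty
+ * V_pfil_head_list before any protocol calls pfil_head_register().)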
+ */
+VNET_SYSUNINIT(vnet_pfil_uninit, PFIL_SYSINIT_ORDER, PFIL_VNET_ORDER,
+    vnet_pfil_uninit, NULL);
+
diff --git a/freebsd/sys/net/pfil.h b/freebsd/sys/net/pfil.h
new file mode 100644
index 00000000..78ab0518
--- /dev/null
+++ b/freebsd/sys/net/pfil.h
@@ -0,0 +1,117 @@
+/* $FreeBSD$ */
+/* $NetBSD: pfil.h,v 1.22 2003/06/23 12:57:08 martin Exp $ */
+
+/*-
+ * Copyright (c) 1996 Matthew R. Green
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _NET_PFIL_HH_
+#define _NET_PFIL_HH_
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+struct mbuf;
+struct ifnet;
+struct inpcb;
+
+/*
+ * The packet filter hooks are designed for anything to call them to
+ * possibly intercept the packet.
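+ *
+ * A usage sketch (illustrative, not part of the original header): a
+ * filter that wants to see all IPv4 traffic looks up the AF_INET head
+ * and attaches itself for both directions:
+ *
+ *	struct pfil_head *ph = pfil_head_get(PFIL_TYPE_AF, AF_INET);
+ *
+ *	if (ph != NULL)
+ *		(void)pfil_add_hook(my_filter, NULL,
+ *		    PFIL_IN | PFIL_OUT | PFIL_WAITOK, ph);
+ *
+ * Here my_filter is a hypothetical function with the pfil_func signature
+ * below; returning nonzero, or setting *mp to NULL, stops further
+ * processing of the packet by the hook chain.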
+ */ +struct packet_filter_hook { + TAILQ_ENTRY(packet_filter_hook) pfil_link; + int (*pfil_func)(void *, struct mbuf **, struct ifnet *, int, + struct inpcb *); + void *pfil_arg; +}; + +#define PFIL_IN 0x00000001 +#define PFIL_OUT 0x00000002 +#define PFIL_WAITOK 0x00000004 +#define PFIL_ALL (PFIL_IN|PFIL_OUT) + +typedef TAILQ_HEAD(pfil_list, packet_filter_hook) pfil_list_t; + +#define PFIL_TYPE_AF 1 /* key is AF_* type */ +#define PFIL_TYPE_IFNET 2 /* key is ifnet pointer */ + +struct pfil_head { + pfil_list_t ph_in; + pfil_list_t ph_out; + int ph_type; + int ph_nhooks; + struct rmlock ph_lock; + union { + u_long phu_val; + void *phu_ptr; + } ph_un; +#define ph_af ph_un.phu_val +#define ph_ifnet ph_un.phu_ptr + LIST_ENTRY(pfil_head) ph_list; +}; + +int pfil_add_hook(int (*func)(void *, struct mbuf **, struct ifnet *, + int, struct inpcb *), void *, int, struct pfil_head *); +int pfil_remove_hook(int (*func)(void *, struct mbuf **, struct ifnet *, + int, struct inpcb *), void *, int, struct pfil_head *); +int pfil_run_hooks(struct pfil_head *, struct mbuf **, struct ifnet *, + int, struct inpcb *inp); + +int pfil_head_register(struct pfil_head *); +int pfil_head_unregister(struct pfil_head *); + +struct pfil_head *pfil_head_get(int, u_long); + +#define PFIL_HOOKED(p) ((p)->ph_nhooks > 0) +#define PFIL_LOCK_INIT(p) \ + rm_init_flags(&(p)->ph_lock, "PFil hook read/write mutex", RM_RECURSE) +#define PFIL_LOCK_DESTROY(p) rm_destroy(&(p)->ph_lock) +#define PFIL_RLOCK(p, t) rm_rlock(&(p)->ph_lock, (t)) +#define PFIL_WLOCK(p) rm_wlock(&(p)->ph_lock) +#define PFIL_RUNLOCK(p, t) rm_runlock(&(p)->ph_lock, (t)) +#define PFIL_WUNLOCK(p) rm_wunlock(&(p)->ph_lock) +#define PFIL_LIST_LOCK() mtx_lock(&pfil_global_lock) +#define PFIL_LIST_UNLOCK() mtx_unlock(&pfil_global_lock) + +static __inline struct packet_filter_hook * +pfil_hook_get(int dir, struct pfil_head *ph) +{ + + if (dir == PFIL_IN) + return (TAILQ_FIRST(&ph->ph_in)); + else if (dir == PFIL_OUT) + return (TAILQ_FIRST(&ph->ph_out)); + else + return (NULL); +} + +#endif /* _NET_PFIL_HH_ */ diff --git a/freebsd/sys/net/pfkeyv2.h b/freebsd/sys/net/pfkeyv2.h new file mode 100644 index 00000000..f8e088e1 --- /dev/null +++ b/freebsd/sys/net/pfkeyv2.h @@ -0,0 +1,432 @@ +/* $FreeBSD$ */ +/* $KAME: pfkeyv2.h,v 1.37 2003/09/06 05:15:43 itojun Exp $ */ + +/*- + * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the project nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * This file has been derived rfc 2367, + * And added some flags of SADB_KEY_FLAGS_ as SADB_X_EXT_. + * sakane@ydc.co.jp + */ + +#ifndef _NET_PFKEYV2_HH_ +#define _NET_PFKEYV2_HH_ + +/* +This file defines structures and symbols for the PF_KEY Version 2 +key management interface. It was written at the U.S. Naval Research +Laboratory. This file is in the public domain. The authors ask that +you leave this credit intact on any copies of this file. +*/ +#ifndef __PFKEY_V2_H +#define __PFKEY_V2_H 1 + +#define PF_KEY_V2 2 +#define PFKEYV2_REVISION 199806L + +#define SADB_RESERVED 0 +#define SADB_GETSPI 1 +#define SADB_UPDATE 2 +#define SADB_ADD 3 +#define SADB_DELETE 4 +#define SADB_GET 5 +#define SADB_ACQUIRE 6 +#define SADB_REGISTER 7 +#define SADB_EXPIRE 8 +#define SADB_FLUSH 9 +#define SADB_DUMP 10 +#define SADB_X_PROMISC 11 +#define SADB_X_PCHANGE 12 + +#define SADB_X_SPDUPDATE 13 +#define SADB_X_SPDADD 14 +#define SADB_X_SPDDELETE 15 /* by policy index */ +#define SADB_X_SPDGET 16 +#define SADB_X_SPDACQUIRE 17 +#define SADB_X_SPDDUMP 18 +#define SADB_X_SPDFLUSH 19 +#define SADB_X_SPDSETIDX 20 +#define SADB_X_SPDEXPIRE 21 +#define SADB_X_SPDDELETE2 22 /* by policy id */ +#define SADB_MAX 22 + +struct sadb_msg { + u_int8_t sadb_msg_version; + u_int8_t sadb_msg_type; + u_int8_t sadb_msg_errno; + u_int8_t sadb_msg_satype; + u_int16_t sadb_msg_len; + u_int16_t sadb_msg_reserved; + u_int32_t sadb_msg_seq; + u_int32_t sadb_msg_pid; +}; + +struct sadb_ext { + u_int16_t sadb_ext_len; + u_int16_t sadb_ext_type; +}; + +struct sadb_sa { + u_int16_t sadb_sa_len; + u_int16_t sadb_sa_exttype; + u_int32_t sadb_sa_spi; + u_int8_t sadb_sa_replay; + u_int8_t sadb_sa_state; + u_int8_t sadb_sa_auth; + u_int8_t sadb_sa_encrypt; + u_int32_t sadb_sa_flags; +}; + +struct sadb_lifetime { + u_int16_t sadb_lifetime_len; + u_int16_t sadb_lifetime_exttype; + u_int32_t sadb_lifetime_allocations; + u_int64_t sadb_lifetime_bytes; + u_int64_t sadb_lifetime_addtime; + u_int64_t sadb_lifetime_usetime; +}; + +struct sadb_address { + u_int16_t sadb_address_len; + u_int16_t sadb_address_exttype; + u_int8_t sadb_address_proto; + u_int8_t sadb_address_prefixlen; + u_int16_t sadb_address_reserved; +}; + +struct sadb_key { + u_int16_t sadb_key_len; + u_int16_t sadb_key_exttype; + u_int16_t sadb_key_bits; + u_int16_t sadb_key_reserved; +}; + +struct sadb_ident { + u_int16_t sadb_ident_len; + u_int16_t sadb_ident_exttype; + u_int16_t sadb_ident_type; + u_int16_t sadb_ident_reserved; + u_int64_t sadb_ident_id; +}; + +struct sadb_sens { + u_int16_t sadb_sens_len; + u_int16_t sadb_sens_exttype; + u_int32_t sadb_sens_dpd; + u_int8_t sadb_sens_sens_level; + u_int8_t sadb_sens_sens_len; + u_int8_t sadb_sens_integ_level; + u_int8_t sadb_sens_integ_len; + u_int32_t sadb_sens_reserved; +}; + +struct sadb_prop { + u_int16_t sadb_prop_len; + u_int16_t sadb_prop_exttype; + u_int8_t sadb_prop_replay; + u_int8_t sadb_prop_reserved[3]; +}; + +struct sadb_comb { + u_int8_t sadb_comb_auth; + u_int8_t 
sadb_comb_encrypt; + u_int16_t sadb_comb_flags; + u_int16_t sadb_comb_auth_minbits; + u_int16_t sadb_comb_auth_maxbits; + u_int16_t sadb_comb_encrypt_minbits; + u_int16_t sadb_comb_encrypt_maxbits; + u_int32_t sadb_comb_reserved; + u_int32_t sadb_comb_soft_allocations; + u_int32_t sadb_comb_hard_allocations; + u_int64_t sadb_comb_soft_bytes; + u_int64_t sadb_comb_hard_bytes; + u_int64_t sadb_comb_soft_addtime; + u_int64_t sadb_comb_hard_addtime; + u_int64_t sadb_comb_soft_usetime; + u_int64_t sadb_comb_hard_usetime; +}; + +struct sadb_supported { + u_int16_t sadb_supported_len; + u_int16_t sadb_supported_exttype; + u_int32_t sadb_supported_reserved; +}; + +struct sadb_alg { + u_int8_t sadb_alg_id; + u_int8_t sadb_alg_ivlen; + u_int16_t sadb_alg_minbits; + u_int16_t sadb_alg_maxbits; + u_int16_t sadb_alg_reserved; +}; + +struct sadb_spirange { + u_int16_t sadb_spirange_len; + u_int16_t sadb_spirange_exttype; + u_int32_t sadb_spirange_min; + u_int32_t sadb_spirange_max; + u_int32_t sadb_spirange_reserved; +}; + +struct sadb_x_kmprivate { + u_int16_t sadb_x_kmprivate_len; + u_int16_t sadb_x_kmprivate_exttype; + u_int32_t sadb_x_kmprivate_reserved; +}; + +/* + * XXX Additional SA Extension. + * mode: tunnel or transport + * reqid: to make SA unique nevertheless the address pair of SA are same. + * Mainly it's for VPN. + */ +struct sadb_x_sa2 { + u_int16_t sadb_x_sa2_len; + u_int16_t sadb_x_sa2_exttype; + u_int8_t sadb_x_sa2_mode; + u_int8_t sadb_x_sa2_reserved1; + u_int16_t sadb_x_sa2_reserved2; + u_int32_t sadb_x_sa2_sequence; /* lowermost 32bit of sequence number */ + u_int32_t sadb_x_sa2_reqid; +}; + +/* XXX Policy Extension */ +/* sizeof(struct sadb_x_policy) == 16 */ +struct sadb_x_policy { + u_int16_t sadb_x_policy_len; + u_int16_t sadb_x_policy_exttype; + u_int16_t sadb_x_policy_type; /* See policy type of ipsec.h */ + u_int8_t sadb_x_policy_dir; /* direction, see ipsec.h */ + u_int8_t sadb_x_policy_reserved; + u_int32_t sadb_x_policy_id; + u_int32_t sadb_x_policy_reserved2; +}; +/* + * When policy_type == IPSEC, it is followed by some of + * the ipsec policy request. + * [total length of ipsec policy requests] + * = (sadb_x_policy_len * sizeof(uint64_t) - sizeof(struct sadb_x_policy)) + */ + +/* XXX IPsec Policy Request Extension */ +/* + * This structure is aligned 8 bytes. + */ +struct sadb_x_ipsecrequest { + u_int16_t sadb_x_ipsecrequest_len; /* structure length in 64 bits. */ + u_int16_t sadb_x_ipsecrequest_proto; /* See ipsec.h */ + u_int8_t sadb_x_ipsecrequest_mode; /* See IPSEC_MODE_XX in ipsec.h. */ + u_int8_t sadb_x_ipsecrequest_level; /* See IPSEC_LEVEL_XX in ipsec.h */ + u_int16_t sadb_x_ipsecrequest_reqid; /* See ipsec.h */ + + /* + * followed by source IP address of SA, and immediately followed by + * destination IP address of SA. These encoded into two of sockaddr + * structure without any padding. Must set each sa_len exactly. + * Each of length of the sockaddr structure are not aligned to 64bits, + * but sum of x_request and addresses is aligned to 64bits. + */ +}; + +/* NAT-Traversal type, see RFC 3948 (and drafts). */ +/* sizeof(struct sadb_x_nat_t_type) == 8 */ +struct sadb_x_nat_t_type { + u_int16_t sadb_x_nat_t_type_len; + u_int16_t sadb_x_nat_t_type_exttype; + u_int8_t sadb_x_nat_t_type_type; + u_int8_t sadb_x_nat_t_type_reserved[3]; +}; + +/* NAT-Traversal source or destination port. 
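+   (Illustrative note, not part of the original header: with
+   UDP-encapsulated ESP per RFC 3948 this is normally UDP port 4500.)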
*/ +/* sizeof(struct sadb_x_nat_t_port) == 8 */ +struct sadb_x_nat_t_port { + u_int16_t sadb_x_nat_t_port_len; + u_int16_t sadb_x_nat_t_port_exttype; + u_int16_t sadb_x_nat_t_port_port; + u_int16_t sadb_x_nat_t_port_reserved; +}; + +/* ESP fragmentation size. */ +/* sizeof(struct sadb_x_nat_t_frag) == 8 */ +struct sadb_x_nat_t_frag { + u_int16_t sadb_x_nat_t_frag_len; + u_int16_t sadb_x_nat_t_frag_exttype; + u_int16_t sadb_x_nat_t_frag_fraglen; + u_int16_t sadb_x_nat_t_frag_reserved; +}; + + +#define SADB_EXT_RESERVED 0 +#define SADB_EXT_SA 1 +#define SADB_EXT_LIFETIME_CURRENT 2 +#define SADB_EXT_LIFETIME_HARD 3 +#define SADB_EXT_LIFETIME_SOFT 4 +#define SADB_EXT_ADDRESS_SRC 5 +#define SADB_EXT_ADDRESS_DST 6 +#define SADB_EXT_ADDRESS_PROXY 7 +#define SADB_EXT_KEY_AUTH 8 +#define SADB_EXT_KEY_ENCRYPT 9 +#define SADB_EXT_IDENTITY_SRC 10 +#define SADB_EXT_IDENTITY_DST 11 +#define SADB_EXT_SENSITIVITY 12 +#define SADB_EXT_PROPOSAL 13 +#define SADB_EXT_SUPPORTED_AUTH 14 +#define SADB_EXT_SUPPORTED_ENCRYPT 15 +#define SADB_EXT_SPIRANGE 16 +#define SADB_X_EXT_KMPRIVATE 17 +#define SADB_X_EXT_POLICY 18 +#define SADB_X_EXT_SA2 19 +#define SADB_X_EXT_NAT_T_TYPE 20 +#define SADB_X_EXT_NAT_T_SPORT 21 +#define SADB_X_EXT_NAT_T_DPORT 22 +#define SADB_X_EXT_NAT_T_OA 23 /* Deprecated. */ +#define SADB_X_EXT_NAT_T_OAI 23 /* Peer's NAT_OA for src of SA. */ +#define SADB_X_EXT_NAT_T_OAR 24 /* Peer's NAT_OA for dst of SA. */ +#define SADB_X_EXT_NAT_T_FRAG 25 /* Manual MTU override. */ +#define SADB_EXT_MAX 25 + +#define SADB_SATYPE_UNSPEC 0 +#define SADB_SATYPE_AH 2 +#define SADB_SATYPE_ESP 3 +#define SADB_SATYPE_RSVP 5 +#define SADB_SATYPE_OSPFV2 6 +#define SADB_SATYPE_RIPV2 7 +#define SADB_SATYPE_MIP 8 +#define SADB_X_SATYPE_IPCOMP 9 +/*#define SADB_X_SATYPE_POLICY 10 obsolete, do not reuse */ +#define SADB_X_SATYPE_TCPSIGNATURE 11 +#define SADB_SATYPE_MAX 12 + +#define SADB_SASTATE_LARVAL 0 +#define SADB_SASTATE_MATURE 1 +#define SADB_SASTATE_DYING 2 +#define SADB_SASTATE_DEAD 3 +#define SADB_SASTATE_MAX 3 + +#define SADB_SAFLAGS_PFS 1 + +/* RFC2367 numbers - meets RFC2407 */ +#define SADB_AALG_NONE 0 +#define SADB_AALG_MD5HMAC 2 +#define SADB_AALG_SHA1HMAC 3 +#define SADB_AALG_MAX 252 +/* private allocations - based on RFC2407/IANA assignment */ +#define SADB_X_AALG_SHA2_256 5 +#define SADB_X_AALG_SHA2_384 6 +#define SADB_X_AALG_SHA2_512 7 +#define SADB_X_AALG_RIPEMD160HMAC 8 +#define SADB_X_AALG_AES_XCBC_MAC 9 /* draft-ietf-ipsec-ciph-aes-xcbc-mac-04 */ +/* private allocations should use 249-255 (RFC2407) */ +#define SADB_X_AALG_MD5 249 /* Keyed MD5 */ +#define SADB_X_AALG_SHA 250 /* Keyed SHA */ +#define SADB_X_AALG_NULL 251 /* null authentication */ +#define SADB_X_AALG_TCP_MD5 252 /* Keyed TCP-MD5 (RFC2385) */ + +/* RFC2367 numbers - meets RFC2407 */ +#define SADB_EALG_NONE 0 +#define SADB_EALG_DESCBC 2 +#define SADB_EALG_3DESCBC 3 +#define SADB_EALG_NULL 11 +#define SADB_EALG_MAX 250 +/* private allocations - based on RFC2407/IANA assignment */ +#define SADB_X_EALG_CAST128CBC 6 +#define SADB_X_EALG_BLOWFISHCBC 7 +#define SADB_X_EALG_RIJNDAELCBC 12 +#define SADB_X_EALG_AES 12 +/* private allocations - based on RFC4312/IANA assignment */ +#define SADB_X_EALG_CAMELLIACBC 22 +/* private allocations should use 249-255 (RFC2407) */ +#define SADB_X_EALG_SKIPJACK 249 /*250*/ /* for IPSEC */ +#define SADB_X_EALG_AESCTR 250 /*249*/ /* draft-ietf-ipsec-ciph-aes-ctr-03 */ + +/* private allocations - based on RFC2407/IANA assignment */ +#define SADB_X_CALG_NONE 0 +#define SADB_X_CALG_OUI 1 +#define 
SADB_X_CALG_DEFLATE 2 +#define SADB_X_CALG_LZS 3 +#define SADB_X_CALG_MAX 4 + +#define SADB_IDENTTYPE_RESERVED 0 +#define SADB_IDENTTYPE_PREFIX 1 +#define SADB_IDENTTYPE_FQDN 2 +#define SADB_IDENTTYPE_USERFQDN 3 +#define SADB_X_IDENTTYPE_ADDR 4 +#define SADB_IDENTTYPE_MAX 4 + +/* `flags' in sadb_sa structure holds followings */ +#define SADB_X_EXT_NONE 0x0000 /* i.e. new format. */ +#define SADB_X_EXT_OLD 0x0001 /* old format. */ + +#define SADB_X_EXT_IV4B 0x0010 /* IV length of 4 bytes in use */ +#define SADB_X_EXT_DERIV 0x0020 /* DES derived */ +#define SADB_X_EXT_CYCSEQ 0x0040 /* allowing to cyclic sequence. */ + + /* three of followings are exclusive flags each them */ +#define SADB_X_EXT_PSEQ 0x0000 /* sequencial padding for ESP */ +#define SADB_X_EXT_PRAND 0x0100 /* random padding for ESP */ +#define SADB_X_EXT_PZERO 0x0200 /* zero padding for ESP */ +#define SADB_X_EXT_PMASK 0x0300 /* mask for padding flag */ + +#if 1 +#define SADB_X_EXT_RAWCPI 0x0080 /* use well known CPI (IPComp) */ +#endif + +#define SADB_KEY_FLAGS_MAX 0x0fff + +/* SPI size for PF_KEYv2 */ +#define PFKEY_SPI_SIZE sizeof(u_int32_t) + +/* Identifier for menber of lifetime structure */ +#define SADB_X_LIFETIME_ALLOCATIONS 0 +#define SADB_X_LIFETIME_BYTES 1 +#define SADB_X_LIFETIME_ADDTIME 2 +#define SADB_X_LIFETIME_USETIME 3 + +/* The rate for SOFT lifetime against HARD one. */ +#define PFKEY_SOFT_LIFETIME_RATE 80 + +/* Utilities */ +#define PFKEY_ALIGN8(a) (1 + (((a) - 1) | (8 - 1))) +#define PFKEY_EXTLEN(msg) \ + PFKEY_UNUNIT64(((struct sadb_ext *)(msg))->sadb_ext_len) +#define PFKEY_ADDR_PREFIX(ext) \ + (((struct sadb_address *)(ext))->sadb_address_prefixlen) +#define PFKEY_ADDR_PROTO(ext) \ + (((struct sadb_address *)(ext))->sadb_address_proto) +#define PFKEY_ADDR_SADDR(ext) \ + ((struct sockaddr *)((caddr_t)(ext) + sizeof(struct sadb_address))) + +/* in 64bits */ +#define PFKEY_UNUNIT64(a) ((a) << 3) +#define PFKEY_UNIT64(a) ((a) >> 3) + +#endif /* __PFKEY_V2_H */ + +#endif /* _NET_PFKEYV2_HH_ */ diff --git a/freebsd/sys/net/ppp_defs.h b/freebsd/sys/net/ppp_defs.h new file mode 100644 index 00000000..e0690e94 --- /dev/null +++ b/freebsd/sys/net/ppp_defs.h @@ -0,0 +1,158 @@ +/* + * ppp_defs.h - PPP definitions. + */ +/*- + * Copyright (c) 1994 The Australian National University. + * All rights reserved. + * + * Permission to use, copy, modify, and distribute this software and its + * documentation is hereby granted, provided that the above copyright + * notice appears in all copies. This software is provided without any + * warranty, express or implied. The Australian National University + * makes no representations about the suitability of this software for + * any purpose. + * + * IN NO EVENT SHALL THE AUSTRALIAN NATIONAL UNIVERSITY BE LIABLE TO ANY + * PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES + * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF + * THE AUSTRALIAN NATIONAL UNIVERSITY HAVE BEEN ADVISED OF THE POSSIBILITY + * OF SUCH DAMAGE. + * + * THE AUSTRALIAN NATIONAL UNIVERSITY SPECIFICALLY DISCLAIMS ANY WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS + * ON AN "AS IS" BASIS, AND THE AUSTRALIAN NATIONAL UNIVERSITY HAS NO + * OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, + * OR MODIFICATIONS. + * + * $FreeBSD$ + */ + +#ifndef _PPP_DEFS_HH_ +#define _PPP_DEFS_HH_ + +/* + * The basic PPP frame. 
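+ *
+ * Illustrative example (not part of the original header): a standard LCP
+ * frame begins ff 03 c0 21, so for such a buffer p the macros below give
+ * PPP_ADDRESS(p) == PPP_ALLSTATIONS, PPP_CONTROL(p) == PPP_UI and
+ * PPP_PROTOCOL(p) == PPP_LCP (0xc021).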
+ */ +#define PPP_HDRLEN 4 /* octets for standard ppp header */ +#define PPP_FCSLEN 2 /* octets for FCS */ +#define PPP_MRU 1500 /* default MRU = max length of info field */ + +#define PPP_ADDRESS(p) (((u_char *)(p))[0]) +#define PPP_CONTROL(p) (((u_char *)(p))[1]) +#define PPP_PROTOCOL(p) ((((u_char *)(p))[2] << 8) + ((u_char *)(p))[3]) + +/* + * Significant octet values. + */ +#define PPP_ALLSTATIONS 0xff /* All-Stations broadcast address */ +#define PPP_UI 0x03 /* Unnumbered Information */ +#define PPP_FLAG 0x7e /* Flag Sequence */ +#define PPP_ESCAPE 0x7d /* Asynchronous Control Escape */ +#define PPP_TRANS 0x20 /* Asynchronous transparency modifier */ + +/* + * Protocol field values. + */ +#define PPP_IP 0x21 /* Internet Protocol */ +#define PPP_XNS 0x25 /* Xerox NS */ +#define PPP_AT 0x29 /* AppleTalk Protocol */ +#define PPP_IPX 0x2b /* IPX Datagram (RFC1552) */ +#define PPP_VJC_COMP 0x2d /* VJ compressed TCP */ +#define PPP_VJC_UNCOMP 0x2f /* VJ uncompressed TCP */ +#define PPP_COMP 0xfd /* compressed packet */ +#define PPP_IPCP 0x8021 /* IP Control Protocol */ +#define PPP_ATCP 0x8029 /* AppleTalk Control Protocol */ +#define PPP_IPXCP 0x802b /* IPX Control Protocol (RFC1552) */ +#define PPP_CCP 0x80fd /* Compression Control Protocol */ +#define PPP_LCP 0xc021 /* Link Control Protocol */ +#define PPP_PAP 0xc023 /* Password Authentication Protocol */ +#define PPP_LQR 0xc025 /* Link Quality Report protocol */ +#define PPP_CHAP 0xc223 /* Cryptographic Handshake Auth. Protocol */ +#define PPP_CBCP 0xc029 /* Callback Control Protocol */ +#define PPP_IPV6 0x57 /* Internet Protocol version 6*/ +#define PPP_IPV6CP 0x8057 /* IPv6 Control Protocol */ + +/* + * Values for FCS calculations. + */ +#define PPP_INITFCS 0xffff /* Initial FCS value */ +#define PPP_GOODFCS 0xf0b8 /* Good final FCS value */ +#define PPP_FCS(fcs, c) (((fcs) >> 8) ^ fcstab[((fcs) ^ (c)) & 0xff]) + +/* + * Extended asyncmap - allows any character to be escaped. + */ +typedef u_int32_t ext_accm[8]; + +/* + * What to do with network protocol (NP) packets. + */ +enum NPmode { + NPMODE_PASS, /* pass the packet through */ + NPMODE_DROP, /* silently drop the packet */ + NPMODE_ERROR, /* return an error */ + NPMODE_QUEUE /* save it up for later. */ +}; + +/* + * Statistics. + */ +struct pppstat { + unsigned int ppp_ibytes; /* bytes received */ + unsigned int ppp_ipackets; /* packets received */ + unsigned int ppp_ierrors; /* receive errors */ + unsigned int ppp_obytes; /* bytes sent */ + unsigned int ppp_opackets; /* packets sent */ + unsigned int ppp_oerrors; /* transmit errors */ +}; + +struct vjstat { + unsigned int vjs_packets; /* outbound packets */ + unsigned int vjs_compressed; /* outbound compressed packets */ + unsigned int vjs_searches; /* searches for connection state */ + unsigned int vjs_misses; /* times couldn't find conn. 
state */ + unsigned int vjs_uncompressedin; /* inbound uncompressed packets */ + unsigned int vjs_compressedin; /* inbound compressed packets */ + unsigned int vjs_errorin; /* inbound unknown type packets */ + unsigned int vjs_tossed; /* inbound packets tossed because of error */ +}; + +struct ppp_stats { + struct pppstat p; /* basic PPP statistics */ + struct vjstat vj; /* VJ header compression statistics */ +}; + +struct compstat { + unsigned int unc_bytes; /* total uncompressed bytes */ + unsigned int unc_packets; /* total uncompressed packets */ + unsigned int comp_bytes; /* compressed bytes */ + unsigned int comp_packets; /* compressed packets */ + unsigned int inc_bytes; /* incompressible bytes */ + unsigned int inc_packets; /* incompressible packets */ + unsigned int ratio; /* recent compression ratio << 8 */ +}; + +struct ppp_comp_stats { + struct compstat c; /* packet compression statistics */ + struct compstat d; /* packet decompression statistics */ +}; + +/* + * The following structure records the time in seconds since + * the last NP packet was sent or received. + */ +struct ppp_idle { + time_t xmit_idle; /* time since last NP packet sent */ + time_t recv_idle; /* time since last NP packet received */ +}; + +#ifndef __P +#ifdef __STDC__ +#define __P(x) x +#else +#define __P(x) () +#endif +#endif + +#endif /* _PPP_DEFS_HH_ */ diff --git a/freebsd/sys/net/radix.c b/freebsd/sys/net/radix.c new file mode 100644 index 00000000..c1881acb --- /dev/null +++ b/freebsd/sys/net/radix.c @@ -0,0 +1,1205 @@ +#include + +/*- + * Copyright (c) 1988, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)radix.c 8.5 (Berkeley) 5/19/95 + * $FreeBSD$ + */ + +/* + * Routines to build and maintain radix trees for routing lookups. + */ +#include +#ifdef _KERNEL +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef RADIX_MPATH +#include +#endif +#else /* !_KERNEL */ +#include +#include +#include +#define log(x, arg...) 
fprintf(stderr, ## arg)
+#define panic(x) fprintf(stderr, "PANIC: %s", x), exit(1)
+#define min(a, b) ((a) < (b) ? (a) : (b) )
+#include
+#endif /* !_KERNEL */
+
+static int rn_walktree_from(struct radix_node_head *h, void *a, void *m,
+    walktree_f_t *f, void *w);
+static int rn_walktree(struct radix_node_head *, walktree_f_t *, void *);
+static struct radix_node
+	 *rn_insert(void *, struct radix_node_head *, int *,
+	     struct radix_node [2]),
+	 *rn_newpair(void *, int, struct radix_node[2]),
+	 *rn_search(void *, struct radix_node *),
+	 *rn_search_m(void *, struct radix_node *, void *);
+
+static int max_keylen;
+static struct radix_mask *rn_mkfreelist;
+static struct radix_node_head *mask_rnhead;
+/*
+ * Work area -- the following point to 3 buffers of size max_keylen,
+ * allocated in this order in a block of memory malloc'ed by rn_init.
+ * rn_zeros and rn_ones are set in rn_init and used read-only afterwards.
+ * addmask_key is used in rn_addmask in rw mode and is not thread-safe.
+ */
+static char *rn_zeros, *rn_ones, *addmask_key;
+
+#define MKGet(m) { \
+	if (rn_mkfreelist) { \
+		m = rn_mkfreelist; \
+		rn_mkfreelist = (m)->rm_mklist; \
+	} else \
+		R_Malloc(m, struct radix_mask *, sizeof (struct radix_mask)); }
+
+#define MKFree(m) { (m)->rm_mklist = rn_mkfreelist; rn_mkfreelist = (m);}
+
+#define rn_masktop (mask_rnhead->rnh_treetop)
+
+static int rn_lexobetter(void *m_arg, void *n_arg);
+static struct radix_mask *
+	rn_new_radix_mask(struct radix_node *tt,
+	    struct radix_mask *next);
+static int rn_satisfies_leaf(char *trial, struct radix_node *leaf,
+    int skip);
+
+/*
+ * The data structure for the keys is a radix tree with one-way
+ * branching removed.  The index rn_bit at an internal node n represents a bit
+ * position to be tested.  The tree is arranged so that all descendants
+ * of a node n have keys whose bits all agree up to position rn_bit - 1.
+ * (We say the index of n is rn_bit.)
+ *
+ * There is at least one descendant which has a one bit at position rn_bit,
+ * and at least one with a zero there.
+ *
+ * A route is determined by a pair of key and mask.  We require that the
+ * bit-wise logical AND of the key and mask be the key.
+ * We define the index of the route associated with a mask to be
+ * the first bit number in the mask where a 0 occurs (with bit number 0
+ * representing the highest order bit).
+ *
+ * We say a mask is normal if every bit is 0 past the index of the mask.
+ * If a node n has a descendant (k, m) with index(m) == index(n) == rn_bit,
+ * and m is a normal mask, then the route applies to every descendant of n.
+ * If index(m) < rn_bit, this implies that the last few bits of k
+ * before bit b are all 0 (and hence the same is true of every descendant
+ * of n), so the route applies to all descendants of the node as well.
+ *
+ * Similar logic shows that a non-normal mask m such that
+ * index(m) <= index(n) could potentially apply to many children of n.
+ * Thus, for each non-host route, we attach its mask to a list at an internal
+ * node as high in the tree as we can go.
+ *
+ * The present version of the code makes use of normal routes in short-
+ * circuiting an explicit mask and compare operation when testing whether
+ * a key satisfies a normal route, and also in remembering the unique leaf
+ * that governs a subtree.
+ */
+
+/*
+ * Most of the functions in this code assume that the key/mask arguments
+ * are sockaddr-like structures, where the first byte is a u_char
+ * indicating the size of the entire structure.
+ * + * To make the assumption more explicit, we use the LEN() macro to access + * this field. It is safe to pass an expression with side effects + * to LEN() as the argument is evaluated only once. + * We cast the result to int as this is the dominant usage. + */ +#define LEN(x) ( (int) (*(const u_char *)(x)) ) + +/* + * XXX THIS NEEDS TO BE FIXED + * In the code, pointers to keys and masks are passed as either + * 'void *' (because callers use to pass pointers of various kinds), or + * 'caddr_t' (which is fine for pointer arithmetics, but not very + * clean when you dereference it to access data). Furthermore, caddr_t + * is really 'char *', while the natural type to operate on keys and + * masks would be 'u_char'. This mismatch require a lot of casts and + * intermediate variables to adapt types that clutter the code. + */ + +/* + * Search a node in the tree matching the key. + */ +static struct radix_node * +rn_search(v_arg, head) + void *v_arg; + struct radix_node *head; +{ + register struct radix_node *x; + register caddr_t v; + + for (x = head, v = v_arg; x->rn_bit >= 0;) { + if (x->rn_bmask & v[x->rn_offset]) + x = x->rn_right; + else + x = x->rn_left; + } + return (x); +} + +/* + * Same as above, but with an additional mask. + * XXX note this function is used only once. + */ +static struct radix_node * +rn_search_m(v_arg, head, m_arg) + struct radix_node *head; + void *v_arg, *m_arg; +{ + register struct radix_node *x; + register caddr_t v = v_arg, m = m_arg; + + for (x = head; x->rn_bit >= 0;) { + if ((x->rn_bmask & m[x->rn_offset]) && + (x->rn_bmask & v[x->rn_offset])) + x = x->rn_right; + else + x = x->rn_left; + } + return x; +} + +int +rn_refines(m_arg, n_arg) + void *m_arg, *n_arg; +{ + register caddr_t m = m_arg, n = n_arg; + register caddr_t lim, lim2 = lim = n + LEN(n); + int longer = LEN(n++) - LEN(m++); + int masks_are_equal = 1; + + if (longer > 0) + lim -= longer; + while (n < lim) { + if (*n & ~(*m)) + return 0; + if (*n++ != *m++) + masks_are_equal = 0; + } + while (n < lim2) + if (*n++) + return 0; + if (masks_are_equal && (longer < 0)) + for (lim2 = m - longer; m < lim2; ) + if (*m++) + return 1; + return (!masks_are_equal); +} + +struct radix_node * +rn_lookup(v_arg, m_arg, head) + void *v_arg, *m_arg; + struct radix_node_head *head; +{ + register struct radix_node *x; + caddr_t netmask = 0; + + if (m_arg) { + x = rn_addmask(m_arg, 1, head->rnh_treetop->rn_offset); + if (x == 0) + return (0); + netmask = x->rn_key; + } + x = rn_match(v_arg, head); + if (x && netmask) { + while (x && x->rn_mask != netmask) + x = x->rn_dupedkey; + } + return x; +} + +static int +rn_satisfies_leaf(trial, leaf, skip) + char *trial; + register struct radix_node *leaf; + int skip; +{ + register char *cp = trial, *cp2 = leaf->rn_key, *cp3 = leaf->rn_mask; + char *cplim; + int length = min(LEN(cp), LEN(cp2)); + + if (cp3 == NULL) + cp3 = rn_ones; + else + length = min(length, LEN(cp3)); + cplim = cp + length; cp3 += skip; cp2 += skip; + for (cp += skip; cp < cplim; cp++, cp2++, cp3++) + if ((*cp ^ *cp2) & *cp3) + return 0; + return 1; +} + +struct radix_node * +rn_match(v_arg, head) + void *v_arg; + struct radix_node_head *head; +{ + caddr_t v = v_arg; + register struct radix_node *t = head->rnh_treetop, *x; + register caddr_t cp = v, cp2; + caddr_t cplim; + struct radix_node *saved_t, *top = t; + int off = t->rn_offset, vlen = LEN(cp), matched_off; + register int test, b, rn_bit; + + /* + * Open code rn_search(v, top) to avoid overhead of extra + * subroutine call. 
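+ *
+ * (Illustrative arithmetic, not in the original file: an internal node
+ * with bit index b tests key byte b >> 3 under mask 0x80 >> (b & 7), as
+ * set up in rn_newpair(); e.g. b == 10 tests bit 0x20 of key byte 1.)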
+ */ + for (; t->rn_bit >= 0; ) { + if (t->rn_bmask & cp[t->rn_offset]) + t = t->rn_right; + else + t = t->rn_left; + } + /* + * See if we match exactly as a host destination + * or at least learn how many bits match, for normal mask finesse. + * + * It doesn't hurt us to limit how many bytes to check + * to the length of the mask, since if it matches we had a genuine + * match and the leaf we have is the most specific one anyway; + * if it didn't match with a shorter length it would fail + * with a long one. This wins big for class B&C netmasks which + * are probably the most common case... + */ + if (t->rn_mask) + vlen = *(u_char *)t->rn_mask; + cp += off; cp2 = t->rn_key + off; cplim = v + vlen; + for (; cp < cplim; cp++, cp2++) + if (*cp != *cp2) + goto on1; + /* + * This extra grot is in case we are explicitly asked + * to look up the default. Ugh! + * + * Never return the root node itself, it seems to cause a + * lot of confusion. + */ + if (t->rn_flags & RNF_ROOT) + t = t->rn_dupedkey; + return t; +on1: + test = (*cp ^ *cp2) & 0xff; /* find first bit that differs */ + for (b = 7; (test >>= 1) > 0;) + b--; + matched_off = cp - v; + b += matched_off << 3; + rn_bit = -1 - b; + /* + * If there is a host route in a duped-key chain, it will be first. + */ + if ((saved_t = t)->rn_mask == 0) + t = t->rn_dupedkey; + for (; t; t = t->rn_dupedkey) + /* + * Even if we don't match exactly as a host, + * we may match if the leaf we wound up at is + * a route to a net. + */ + if (t->rn_flags & RNF_NORMAL) { + if (rn_bit <= t->rn_bit) + return t; + } else if (rn_satisfies_leaf(v, t, matched_off)) + return t; + t = saved_t; + /* start searching up the tree */ + do { + register struct radix_mask *m; + t = t->rn_parent; + m = t->rn_mklist; + /* + * If non-contiguous masks ever become important + * we can restore the masking and open coding of + * the search and satisfaction test and put the + * calculation of "off" back before the "do". + */ + while (m) { + if (m->rm_flags & RNF_NORMAL) { + if (rn_bit <= m->rm_bit) + return (m->rm_leaf); + } else { + off = min(t->rn_offset, matched_off); + x = rn_search_m(v, t, m->rm_mask); + while (x && x->rn_mask != m->rm_mask) + x = x->rn_dupedkey; + if (x && rn_satisfies_leaf(v, x, off)) + return x; + } + m = m->rm_mklist; + } + } while (t != top); + return 0; +} + +#ifdef RN_DEBUG +int rn_nodenum; +struct radix_node *rn_clist; +int rn_saveinfo; +int rn_debug = 1; +#endif + +/* + * Whenever we add a new leaf to the tree, we also add a parent node, + * so we allocate them as an array of two elements: the first one must be + * the leaf (see RNTORT() in route.c), the second one is the parent. + * This routine initializes the relevant fields of the nodes, so that + * the leaf is the left child of the parent node, and both nodes have + * (almost) all all fields filled as appropriate. + * (XXX some fields are left unset, see the '#if 0' section). + * The function returns a pointer to the parent node. + */ + +static struct radix_node * +rn_newpair(v, b, nodes) + void *v; + int b; + struct radix_node nodes[2]; +{ + register struct radix_node *tt = nodes, *t = tt + 1; + t->rn_bit = b; + t->rn_bmask = 0x80 >> (b & 7); + t->rn_left = tt; + t->rn_offset = b >> 3; + +#if 0 /* XXX perhaps we should fill these fields as well. 
*/ + t->rn_parent = t->rn_right = NULL; + + tt->rn_mask = NULL; + tt->rn_dupedkey = NULL; + tt->rn_bmask = 0; +#endif + tt->rn_bit = -1; + tt->rn_key = (caddr_t)v; + tt->rn_parent = t; + tt->rn_flags = t->rn_flags = RNF_ACTIVE; + tt->rn_mklist = t->rn_mklist = 0; +#ifdef RN_DEBUG + tt->rn_info = rn_nodenum++; t->rn_info = rn_nodenum++; + tt->rn_twin = t; + tt->rn_ybro = rn_clist; + rn_clist = tt; +#endif + return t; +} + +static struct radix_node * +rn_insert(v_arg, head, dupentry, nodes) + void *v_arg; + struct radix_node_head *head; + int *dupentry; + struct radix_node nodes[2]; +{ + caddr_t v = v_arg; + struct radix_node *top = head->rnh_treetop; + int head_off = top->rn_offset, vlen = LEN(v); + register struct radix_node *t = rn_search(v_arg, top); + register caddr_t cp = v + head_off; + register int b; + struct radix_node *tt; + /* + * Find first bit at which v and t->rn_key differ + */ + { + register caddr_t cp2 = t->rn_key + head_off; + register int cmp_res; + caddr_t cplim = v + vlen; + + while (cp < cplim) + if (*cp2++ != *cp++) + goto on1; + *dupentry = 1; + return t; +on1: + *dupentry = 0; + cmp_res = (cp[-1] ^ cp2[-1]) & 0xff; + for (b = (cp - v) << 3; cmp_res; b--) + cmp_res >>= 1; + } + { + register struct radix_node *p, *x = top; + cp = v; + do { + p = x; + if (cp[x->rn_offset] & x->rn_bmask) + x = x->rn_right; + else + x = x->rn_left; + } while (b > (unsigned) x->rn_bit); + /* x->rn_bit < b && x->rn_bit >= 0 */ +#ifdef RN_DEBUG + if (rn_debug) + log(LOG_DEBUG, "rn_insert: Going In:\n"), traverse(p); +#endif + t = rn_newpair(v_arg, b, nodes); + tt = t->rn_left; + if ((cp[p->rn_offset] & p->rn_bmask) == 0) + p->rn_left = t; + else + p->rn_right = t; + x->rn_parent = t; + t->rn_parent = p; /* frees x, p as temp vars below */ + if ((cp[t->rn_offset] & t->rn_bmask) == 0) { + t->rn_right = x; + } else { + t->rn_right = tt; + t->rn_left = x; + } +#ifdef RN_DEBUG + if (rn_debug) + log(LOG_DEBUG, "rn_insert: Coming Out:\n"), traverse(p); +#endif + } + return (tt); +} + +struct radix_node * +rn_addmask(n_arg, search, skip) + int search, skip; + void *n_arg; +{ + caddr_t netmask = (caddr_t)n_arg; + register struct radix_node *x; + register caddr_t cp, cplim; + register int b = 0, mlen, j; + int maskduplicated, m0, isnormal; + struct radix_node *saved_x; + static int last_zeroed = 0; + + if ((mlen = LEN(netmask)) > max_keylen) + mlen = max_keylen; + if (skip == 0) + skip = 1; + if (mlen <= skip) + return (mask_rnhead->rnh_nodes); + if (skip > 1) + bcopy(rn_ones + 1, addmask_key + 1, skip - 1); + if ((m0 = mlen) > skip) + bcopy(netmask + skip, addmask_key + skip, mlen - skip); + /* + * Trim trailing zeroes. + */ + for (cp = addmask_key + mlen; (cp > addmask_key) && cp[-1] == 0;) + cp--; + mlen = cp - addmask_key; + if (mlen <= skip) { + if (m0 >= last_zeroed) + last_zeroed = mlen; + return (mask_rnhead->rnh_nodes); + } + if (m0 < last_zeroed) + bzero(addmask_key + m0, last_zeroed - m0); + *addmask_key = last_zeroed = mlen; + x = rn_search(addmask_key, rn_masktop); + if (bcmp(addmask_key, x->rn_key, mlen) != 0) + x = 0; + if (x || search) + return (x); + R_Zalloc(x, struct radix_node *, max_keylen + 2 * sizeof (*x)); + if ((saved_x = x) == 0) + return (0); + netmask = cp = (caddr_t)(x + 2); + bcopy(addmask_key, cp, mlen); + x = rn_insert(cp, mask_rnhead, &maskduplicated, x); + if (maskduplicated) { + log(LOG_ERR, "rn_addmask: mask impossibly already in tree"); + Free(saved_x); + return (x); + } + /* + * Calculate index of mask, and check for normalcy. 
+ * First find the first byte with a 0 bit, then if there are + * more bits left (remember we already trimmed the trailing 0's), + * the pattern must be one of those in normal_chars[], or we have + * a non-contiguous mask. + */ + cplim = netmask + mlen; + isnormal = 1; + for (cp = netmask + skip; (cp < cplim) && *(u_char *)cp == 0xff;) + cp++; + if (cp != cplim) { + static char normal_chars[] = { + 0, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff}; + + for (j = 0x80; (j & *cp) != 0; j >>= 1) + b++; + if (*cp != normal_chars[b] || cp != (cplim - 1)) + isnormal = 0; + } + b += (cp - netmask) << 3; + x->rn_bit = -1 - b; + if (isnormal) + x->rn_flags |= RNF_NORMAL; + return (x); +} + +static int /* XXX: arbitrary ordering for non-contiguous masks */ +rn_lexobetter(m_arg, n_arg) + void *m_arg, *n_arg; +{ + register u_char *mp = m_arg, *np = n_arg, *lim; + + if (LEN(mp) > LEN(np)) + return 1; /* not really, but need to check longer one first */ + if (LEN(mp) == LEN(np)) + for (lim = mp + LEN(mp); mp < lim;) + if (*mp++ > *np++) + return 1; + return 0; +} + +static struct radix_mask * +rn_new_radix_mask(tt, next) + register struct radix_node *tt; + register struct radix_mask *next; +{ + register struct radix_mask *m; + + MKGet(m); + if (m == 0) { + log(LOG_ERR, "Mask for route not entered\n"); + return (0); + } + bzero(m, sizeof *m); + m->rm_bit = tt->rn_bit; + m->rm_flags = tt->rn_flags; + if (tt->rn_flags & RNF_NORMAL) + m->rm_leaf = tt; + else + m->rm_mask = tt->rn_mask; + m->rm_mklist = next; + tt->rn_mklist = m; + return m; +} + +struct radix_node * +rn_addroute(v_arg, n_arg, head, treenodes) + void *v_arg, *n_arg; + struct radix_node_head *head; + struct radix_node treenodes[2]; +{ + caddr_t v = (caddr_t)v_arg, netmask = (caddr_t)n_arg; + register struct radix_node *t, *x = 0, *tt; + struct radix_node *saved_tt, *top = head->rnh_treetop; + short b = 0, b_leaf = 0; + int keyduplicated; + caddr_t mmask; + struct radix_mask *m, **mp; + + /* + * In dealing with non-contiguous masks, there may be + * many different routes which have the same mask. + * We will find it useful to have a unique pointer to + * the mask to speed avoiding duplicate references at + * nodes and possibly save time in calculating indices. + */ + if (netmask) { + if ((x = rn_addmask(netmask, 0, top->rn_offset)) == 0) + return (0); + b_leaf = x->rn_bit; + b = -1 - x->rn_bit; + netmask = x->rn_key; + } + /* + * Deal with duplicated keys: attach node to previous instance + */ + saved_tt = tt = rn_insert(v, head, &keyduplicated, treenodes); + if (keyduplicated) { + for (t = tt; tt; t = tt, tt = tt->rn_dupedkey) { +#ifdef RADIX_MPATH + /* permit multipath, if enabled for the family */ + if (rn_mpath_capable(head) && netmask == tt->rn_mask) { + /* + * go down to the end of multipaths, so that + * new entry goes into the end of rn_dupedkey + * chain. + */ + do { + t = tt; + tt = tt->rn_dupedkey; + } while (tt && t->rn_mask == tt->rn_mask); + break; + } +#endif + if (tt->rn_mask == netmask) + return (0); + if (netmask == 0 || + (tt->rn_mask && + ((b_leaf < tt->rn_bit) /* index(netmask) > node */ + || rn_refines(netmask, tt->rn_mask) + || rn_lexobetter(netmask, tt->rn_mask)))) + break; + } + /* + * If the mask is not duplicated, we wouldn't + * find it among possible duplicate key entries + * anyway, so the above test doesn't hurt. + * + * We sort the masks for a duplicated key the same way as + * in a masklist -- most specific to least specific. + * This may require the unfortunate nuisance of relocating + * the head of the list. 
+ * + * We also reverse, or doubly link the list through the + * parent pointer. + */ + if (tt == saved_tt) { + struct radix_node *xx = x; + /* link in at head of list */ + (tt = treenodes)->rn_dupedkey = t; + tt->rn_flags = t->rn_flags; + tt->rn_parent = x = t->rn_parent; + t->rn_parent = tt; /* parent */ + if (x->rn_left == t) + x->rn_left = tt; + else + x->rn_right = tt; + saved_tt = tt; x = xx; + } else { + (tt = treenodes)->rn_dupedkey = t->rn_dupedkey; + t->rn_dupedkey = tt; + tt->rn_parent = t; /* parent */ + if (tt->rn_dupedkey) /* parent */ + tt->rn_dupedkey->rn_parent = tt; /* parent */ + } +#ifdef RN_DEBUG + t=tt+1; tt->rn_info = rn_nodenum++; t->rn_info = rn_nodenum++; + tt->rn_twin = t; tt->rn_ybro = rn_clist; rn_clist = tt; +#endif + tt->rn_key = (caddr_t) v; + tt->rn_bit = -1; + tt->rn_flags = RNF_ACTIVE; + } + /* + * Put mask in tree. + */ + if (netmask) { + tt->rn_mask = netmask; + tt->rn_bit = x->rn_bit; + tt->rn_flags |= x->rn_flags & RNF_NORMAL; + } + t = saved_tt->rn_parent; + if (keyduplicated) + goto on2; + b_leaf = -1 - t->rn_bit; + if (t->rn_right == saved_tt) + x = t->rn_left; + else + x = t->rn_right; + /* Promote general routes from below */ + if (x->rn_bit < 0) { + for (mp = &t->rn_mklist; x; x = x->rn_dupedkey) + if (x->rn_mask && (x->rn_bit >= b_leaf) && x->rn_mklist == 0) { + *mp = m = rn_new_radix_mask(x, 0); + if (m) + mp = &m->rm_mklist; + } + } else if (x->rn_mklist) { + /* + * Skip over masks whose index is > that of new node + */ + for (mp = &x->rn_mklist; (m = *mp); mp = &m->rm_mklist) + if (m->rm_bit >= b_leaf) + break; + t->rn_mklist = m; *mp = 0; + } +on2: + /* Add new route to highest possible ancestor's list */ + if ((netmask == 0) || (b > t->rn_bit )) + return tt; /* can't lift at all */ + b_leaf = tt->rn_bit; + do { + x = t; + t = t->rn_parent; + } while (b <= t->rn_bit && x != top); + /* + * Search through routes associated with node to + * insert new route according to index. + * Need same criteria as when sorting dupedkeys to avoid + * double loop on deletion. + */ + for (mp = &x->rn_mklist; (m = *mp); mp = &m->rm_mklist) { + if (m->rm_bit < b_leaf) + continue; + if (m->rm_bit > b_leaf) + break; + if (m->rm_flags & RNF_NORMAL) { + mmask = m->rm_leaf->rn_mask; + if (tt->rn_flags & RNF_NORMAL) { +#if !defined(RADIX_MPATH) + log(LOG_ERR, + "Non-unique normal route, mask not entered\n"); +#endif + return tt; + } + } else + mmask = m->rm_mask; + if (mmask == netmask) { + m->rm_refs++; + tt->rn_mklist = m; + return tt; + } + if (rn_refines(netmask, mmask) + || rn_lexobetter(netmask, mmask)) + break; + } + *mp = rn_new_radix_mask(tt, *mp); + return tt; +} + +struct radix_node * +rn_delete(v_arg, netmask_arg, head) + void *v_arg, *netmask_arg; + struct radix_node_head *head; +{ + register struct radix_node *t, *p, *x, *tt; + struct radix_mask *m, *saved_m, **mp; + struct radix_node *dupedkey, *saved_tt, *top; + caddr_t v, netmask; + int b, head_off, vlen; + + v = v_arg; + netmask = netmask_arg; + x = head->rnh_treetop; + tt = rn_search(v, x); + head_off = x->rn_offset; + vlen = LEN(v); + saved_tt = tt; + top = x; + if (tt == 0 || + bcmp(v + head_off, tt->rn_key + head_off, vlen - head_off)) + return (0); + /* + * Delete our route from mask lists. 
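+ *
+ * (Clarifying note, not in the original file: radix_mask annotations are
+ * shared between routes, with rm_refs counting the additional holders, so
+ * the code below unlinks and MKFree()s an annotation only once its
+ * reference count drops below zero.)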
+ */ + if (netmask) { + if ((x = rn_addmask(netmask, 1, head_off)) == 0) + return (0); + netmask = x->rn_key; + while (tt->rn_mask != netmask) + if ((tt = tt->rn_dupedkey) == 0) + return (0); + } + if (tt->rn_mask == 0 || (saved_m = m = tt->rn_mklist) == 0) + goto on1; + if (tt->rn_flags & RNF_NORMAL) { + if (m->rm_leaf != tt || m->rm_refs > 0) { + log(LOG_ERR, "rn_delete: inconsistent annotation\n"); + return 0; /* dangling ref could cause disaster */ + } + } else { + if (m->rm_mask != tt->rn_mask) { + log(LOG_ERR, "rn_delete: inconsistent annotation\n"); + goto on1; + } + if (--m->rm_refs >= 0) + goto on1; + } + b = -1 - tt->rn_bit; + t = saved_tt->rn_parent; + if (b > t->rn_bit) + goto on1; /* Wasn't lifted at all */ + do { + x = t; + t = t->rn_parent; + } while (b <= t->rn_bit && x != top); + for (mp = &x->rn_mklist; (m = *mp); mp = &m->rm_mklist) + if (m == saved_m) { + *mp = m->rm_mklist; + MKFree(m); + break; + } + if (m == 0) { + log(LOG_ERR, "rn_delete: couldn't find our annotation\n"); + if (tt->rn_flags & RNF_NORMAL) + return (0); /* Dangling ref to us */ + } +on1: + /* + * Eliminate us from tree + */ + if (tt->rn_flags & RNF_ROOT) + return (0); +#ifdef RN_DEBUG + /* Get us out of the creation list */ + for (t = rn_clist; t && t->rn_ybro != tt; t = t->rn_ybro) {} + if (t) t->rn_ybro = tt->rn_ybro; +#endif + t = tt->rn_parent; + dupedkey = saved_tt->rn_dupedkey; + if (dupedkey) { + /* + * Here, tt is the deletion target and + * saved_tt is the head of the dupekey chain. + */ + if (tt == saved_tt) { + /* remove from head of chain */ + x = dupedkey; x->rn_parent = t; + if (t->rn_left == tt) + t->rn_left = x; + else + t->rn_right = x; + } else { + /* find node in front of tt on the chain */ + for (x = p = saved_tt; p && p->rn_dupedkey != tt;) + p = p->rn_dupedkey; + if (p) { + p->rn_dupedkey = tt->rn_dupedkey; + if (tt->rn_dupedkey) /* parent */ + tt->rn_dupedkey->rn_parent = p; + /* parent */ + } else log(LOG_ERR, "rn_delete: couldn't find us\n"); + } + t = tt + 1; + if (t->rn_flags & RNF_ACTIVE) { +#ifndef RN_DEBUG + *++x = *t; + p = t->rn_parent; +#else + b = t->rn_info; + *++x = *t; + t->rn_info = b; + p = t->rn_parent; +#endif + if (p->rn_left == t) + p->rn_left = x; + else + p->rn_right = x; + x->rn_left->rn_parent = x; + x->rn_right->rn_parent = x; + } + goto out; + } + if (t->rn_left == tt) + x = t->rn_right; + else + x = t->rn_left; + p = t->rn_parent; + if (p->rn_right == t) + p->rn_right = x; + else + p->rn_left = x; + x->rn_parent = p; + /* + * Demote routes attached to us. + */ + if (t->rn_mklist) { + if (x->rn_bit >= 0) { + for (mp = &x->rn_mklist; (m = *mp);) + mp = &m->rm_mklist; + *mp = t->rn_mklist; + } else { + /* If there are any key,mask pairs in a sibling + duped-key chain, some subset will appear sorted + in the same order attached to our mklist */ + for (m = t->rn_mklist; m && x; x = x->rn_dupedkey) + if (m == x->rn_mklist) { + struct radix_mask *mm = m->rm_mklist; + x->rn_mklist = 0; + if (--(m->rm_refs) < 0) + MKFree(m); + m = mm; + } + if (m) + log(LOG_ERR, + "rn_delete: Orphaned Mask %p at %p\n", + m, x); + } + } + /* + * We may be holding an active internal node in the tree. 
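+	 *
+	 * (Illustrative note, not part of the imported FreeBSD source:
+	 * every leaf is allocated as a pair, the leaf tt plus its twin
+	 * internal node tt+1.  If the twin is still wired into the tree,
+	 * the code below copies it into the internal node t that was
+	 * just unlinked, so the contiguous tt/tt+1 pair can be handed
+	 * back to the caller and freed as one unit.)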
+ */ + x = tt + 1; + if (t != x) { +#ifndef RN_DEBUG + *t = *x; +#else + b = t->rn_info; + *t = *x; + t->rn_info = b; +#endif + t->rn_left->rn_parent = t; + t->rn_right->rn_parent = t; + p = x->rn_parent; + if (p->rn_left == x) + p->rn_left = t; + else + p->rn_right = t; + } +out: + tt->rn_flags &= ~RNF_ACTIVE; + tt[1].rn_flags &= ~RNF_ACTIVE; + return (tt); +} + +/* + * This is the same as rn_walktree() except for the parameters and the + * exit. + */ +static int +rn_walktree_from(h, a, m, f, w) + struct radix_node_head *h; + void *a, *m; + walktree_f_t *f; + void *w; +{ + int error; + struct radix_node *base, *next; + u_char *xa = (u_char *)a; + u_char *xm = (u_char *)m; + register struct radix_node *rn, *last = 0 /* shut up gcc */; + int stopping = 0; + int lastb; + + /* + * rn_search_m is sort-of-open-coded here. We cannot use the + * function because we need to keep track of the last node seen. + */ + /* printf("about to search\n"); */ + for (rn = h->rnh_treetop; rn->rn_bit >= 0; ) { + last = rn; + /* printf("rn_bit %d, rn_bmask %x, xm[rn_offset] %x\n", + rn->rn_bit, rn->rn_bmask, xm[rn->rn_offset]); */ + if (!(rn->rn_bmask & xm[rn->rn_offset])) { + break; + } + if (rn->rn_bmask & xa[rn->rn_offset]) { + rn = rn->rn_right; + } else { + rn = rn->rn_left; + } + } + /* printf("done searching\n"); */ + + /* + * Two cases: either we stepped off the end of our mask, + * in which case last == rn, or we reached a leaf, in which + * case we want to start from the last node we looked at. + * Either way, last is the node we want to start from. + */ + rn = last; + lastb = rn->rn_bit; + + /* printf("rn %p, lastb %d\n", rn, lastb);*/ + + /* + * This gets complicated because we may delete the node + * while applying the function f to it, so we need to calculate + * the successor node in advance. + */ + while (rn->rn_bit >= 0) + rn = rn->rn_left; + + while (!stopping) { + /* printf("node %p (%d)\n", rn, rn->rn_bit); */ + base = rn; + /* If at right child go back up, otherwise, go right */ + while (rn->rn_parent->rn_right == rn + && !(rn->rn_flags & RNF_ROOT)) { + rn = rn->rn_parent; + + /* if went up beyond last, stop */ + if (rn->rn_bit <= lastb) { + stopping = 1; + /* printf("up too far\n"); */ + /* + * XXX we should jump to the 'Process leaves' + * part, because the values of 'rn' and 'next' + * we compute will not be used. Not a big deal + * because this loop will terminate, but it is + * inefficient and hard to understand! + */ + } + } + + /* + * At the top of the tree, no need to traverse the right + * half, prevent the traversal of the entire tree in the + * case of default route. + */ + if (rn->rn_parent->rn_flags & RNF_ROOT) + stopping = 1; + + /* Find the next *leaf* since next node might vanish, too */ + for (rn = rn->rn_parent->rn_right; rn->rn_bit >= 0;) + rn = rn->rn_left; + next = rn; + /* Process leaves */ + while ((rn = base) != 0) { + base = rn->rn_dupedkey; + /* printf("leaf %p\n", rn); */ + if (!(rn->rn_flags & RNF_ROOT) + && (error = (*f)(rn, w))) + return (error); + } + rn = next; + + if (rn->rn_flags & RNF_ROOT) { + /* printf("root, stopping"); */ + stopping = 1; + } + + } + return 0; +} + +static int +rn_walktree(h, f, w) + struct radix_node_head *h; + walktree_f_t *f; + void *w; +{ + int error; + struct radix_node *base, *next; + register struct radix_node *rn = h->rnh_treetop; + /* + * This gets complicated because we may delete the node + * while applying the function f to it, so we need to calculate + * the successor node in advance. 
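+	 *
+	 * (Illustrative sketch, not part of the imported FreeBSD source,
+	 * of a typical caller following the walktree_f_t contract from
+	 * radix.h; count_rt and its counter are hypothetical names, and
+	 * a non-zero return from the callback aborts the walk:
+	 *
+	 *	static int
+	 *	count_rt(struct radix_node *rn, void *w)
+	 *	{
+	 *		(*(int *)w)++;
+	 *		return (0);
+	 *	}
+	 *
+	 *	int n = 0;
+	 *	rnh->rnh_walktree(rnh, count_rt, &n);
+	 * )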
+ */
+
+	/* First time through node, go left */
+	while (rn->rn_bit >= 0)
+		rn = rn->rn_left;
+	for (;;) {
+		base = rn;
+		/* If at right child go back up, otherwise, go right */
+		while (rn->rn_parent->rn_right == rn
+		    && (rn->rn_flags & RNF_ROOT) == 0)
+			rn = rn->rn_parent;
+		/* Find the next *leaf* since next node might vanish, too */
+		for (rn = rn->rn_parent->rn_right; rn->rn_bit >= 0;)
+			rn = rn->rn_left;
+		next = rn;
+		/* Process leaves */
+		while ((rn = base)) {
+			base = rn->rn_dupedkey;
+			if (!(rn->rn_flags & RNF_ROOT)
+			    && (error = (*f)(rn, w)))
+				return (error);
+		}
+		rn = next;
+		if (rn->rn_flags & RNF_ROOT)
+			return (0);
+	}
+	/* NOTREACHED */
+}
+
+/*
+ * Allocate and initialize an empty tree. This has 3 nodes, which are
+ * part of the radix_node_head (in the order <left,root,right>) and are
+ * marked RNF_ROOT so they cannot be freed.
+ * The leaves have all-zero and all-one keys, with significant
+ * bits starting at 'off'.
+ * Return 1 on success, 0 on error.
+ */
+int
+rn_inithead(head, off)
+	void **head;
+	int off;
+{
+	register struct radix_node_head *rnh;
+	register struct radix_node *t, *tt, *ttt;
+	if (*head)
+		return (1);
+	R_Zalloc(rnh, struct radix_node_head *, sizeof (*rnh));
+	if (rnh == 0)
+		return (0);
+#ifdef _KERNEL
+	RADIX_NODE_HEAD_LOCK_INIT(rnh);
+#endif
+	*head = rnh;
+	t = rn_newpair(rn_zeros, off, rnh->rnh_nodes);
+	ttt = rnh->rnh_nodes + 2;
+	t->rn_right = ttt;
+	t->rn_parent = t;
+	tt = t->rn_left;	/* ... which in turn is rnh->rnh_nodes */
+	tt->rn_flags = t->rn_flags = RNF_ROOT | RNF_ACTIVE;
+	tt->rn_bit = -1 - off;
+	*ttt = *tt;
+	ttt->rn_key = rn_ones;
+	rnh->rnh_addaddr = rn_addroute;
+	rnh->rnh_deladdr = rn_delete;
+	rnh->rnh_matchaddr = rn_match;
+	rnh->rnh_lookup = rn_lookup;
+	rnh->rnh_walktree = rn_walktree;
+	rnh->rnh_walktree_from = rn_walktree_from;
+	rnh->rnh_treetop = t;
+	return (1);
+}
+
+int
+rn_detachhead(void **head)
+{
+	struct radix_node_head *rnh;
+
+	KASSERT((head != NULL && *head != NULL),
+	    ("%s: head already freed", __func__));
+	rnh = *head;
+
+	/* Free nodes. */
+	Free(rnh);
+
+	*head = NULL;
+	return (1);
+}
+
+void
+rn_init(int maxk)
+{
+	char *cp, *cplim;
+
+	max_keylen = maxk;
+	if (max_keylen == 0) {
+		log(LOG_ERR,
+		    "rn_init: radix functions require max_keylen be set\n");
+		return;
+	}
+	R_Malloc(rn_zeros, char *, 3 * max_keylen);
+	if (rn_zeros == NULL)
+		panic("rn_init");
+	bzero(rn_zeros, 3 * max_keylen);
+	rn_ones = cp = rn_zeros + max_keylen;
+	addmask_key = cplim = rn_ones + max_keylen;
+	while (cp < cplim)
+		*cp++ = -1;
+	if (rn_inithead((void **)(void *)&mask_rnhead, 0) == 0)
+		panic("rn_init 2");
+}
diff --git a/freebsd/sys/net/radix.h b/freebsd/sys/net/radix.h
new file mode 100644
index 00000000..ccd5f491
--- /dev/null
+++ b/freebsd/sys/net/radix.h
@@ -0,0 +1,176 @@
+/*-
+ * Copyright (c) 1988, 1989, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)radix.h	8.2 (Berkeley) 10/31/94
+ * $FreeBSD$
+ */
+
+#ifndef _RADIX_HH_
+#define	_RADIX_HH_
+
+#ifdef	_KERNEL
+#include <sys/_lock.h>
+#include <sys/_mutex.h>
+#include <sys/_rwlock.h>
+#endif
+
+#ifdef MALLOC_DECLARE
+MALLOC_DECLARE(M_RTABLE);
+#endif
+
+/*
+ * Radix search tree node layout.
+ */
+
+struct radix_node {
+	struct	radix_mask *rn_mklist;	/* list of masks contained in subtree */
+	struct	radix_node *rn_parent;	/* parent */
+	short	rn_bit;			/* bit offset; -1-index(netmask) */
+	char	rn_bmask;		/* node: mask for bit test*/
+	u_char	rn_flags;		/* enumerated next */
+#define RNF_NORMAL	1		/* leaf contains normal route */
+#define RNF_ROOT	2		/* leaf is root leaf for tree */
+#define RNF_ACTIVE	4		/* This node is alive (for rtfree) */
+	union {
+		struct {			/* leaf only data: */
+			caddr_t	rn_Key;		/* object of search */
+			caddr_t	rn_Mask;	/* netmask, if present */
+			struct	radix_node *rn_Dupedkey;
+		} rn_leaf;
+		struct {			/* node only data: */
+			int	rn_Off;		/* where to start compare */
+			struct	radix_node *rn_L;/* progeny */
+			struct	radix_node *rn_R;/* progeny */
+		} rn_node;
+	} rn_u;
+#ifdef RN_DEBUG
+	int rn_info;
+	struct radix_node *rn_twin;
+	struct radix_node *rn_ybro;
+#endif
+};
+
+#define	rn_dupedkey	rn_u.rn_leaf.rn_Dupedkey
+#define	rn_key		rn_u.rn_leaf.rn_Key
+#define	rn_mask		rn_u.rn_leaf.rn_Mask
+#define	rn_offset	rn_u.rn_node.rn_Off
+#define	rn_left		rn_u.rn_node.rn_L
+#define	rn_right	rn_u.rn_node.rn_R
+
+/*
+ * Annotations to tree concerning potential routes applying to subtrees.
+ */
+
+struct radix_mask {
+	short	rm_bit;			/* bit offset; -1-index(netmask) */
+	char	rm_unused;		/* cf. rn_bmask */
+	u_char	rm_flags;		/* cf.
rn_flags */ + struct radix_mask *rm_mklist; /* more masks to try */ + union { + caddr_t rmu_mask; /* the mask */ + struct radix_node *rmu_leaf; /* for normal routes */ + } rm_rmu; + int rm_refs; /* # of references to this struct */ +}; + +#define rm_mask rm_rmu.rmu_mask +#define rm_leaf rm_rmu.rmu_leaf /* extra field would make 32 bytes */ + +typedef int walktree_f_t(struct radix_node *, void *); + +struct radix_node_head { + struct radix_node *rnh_treetop; + int rnh_addrsize; /* permit, but not require fixed keys */ + int rnh_pktsize; /* permit, but not require fixed keys */ + struct radix_node *(*rnh_addaddr) /* add based on sockaddr */ + (void *v, void *mask, + struct radix_node_head *head, struct radix_node nodes[]); + struct radix_node *(*rnh_addpkt) /* add based on packet hdr */ + (void *v, void *mask, + struct radix_node_head *head, struct radix_node nodes[]); + struct radix_node *(*rnh_deladdr) /* remove based on sockaddr */ + (void *v, void *mask, struct radix_node_head *head); + struct radix_node *(*rnh_delpkt) /* remove based on packet hdr */ + (void *v, void *mask, struct radix_node_head *head); + struct radix_node *(*rnh_matchaddr) /* locate based on sockaddr */ + (void *v, struct radix_node_head *head); + struct radix_node *(*rnh_lookup) /* locate based on sockaddr */ + (void *v, void *mask, struct radix_node_head *head); + struct radix_node *(*rnh_matchpkt) /* locate based on packet hdr */ + (void *v, struct radix_node_head *head); + int (*rnh_walktree) /* traverse tree */ + (struct radix_node_head *head, walktree_f_t *f, void *w); + int (*rnh_walktree_from) /* traverse tree below a */ + (struct radix_node_head *head, void *a, void *m, + walktree_f_t *f, void *w); + void (*rnh_close) /* do something when the last ref drops */ + (struct radix_node *rn, struct radix_node_head *head); + struct radix_node rnh_nodes[3]; /* empty tree for common case */ + int rnh_multipath; /* multipath capable ? 
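+					 * (illustrative note, not in the
+					 * imported file: set to 1 by
+					 * rn4_mpath_inithead() and
+					 * rn6_mpath_inithead(), queried
+					 * through rn_mpath_capable())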
*/ +#ifdef _KERNEL + struct rwlock rnh_lock; /* locks entire radix tree */ +#endif +}; + +#ifndef _KERNEL +#define R_Malloc(p, t, n) (p = (t) malloc((unsigned int)(n))) +#define R_Zalloc(p, t, n) (p = (t) calloc(1,(unsigned int)(n))) +#define Free(p) free((char *)p); +#else +#define R_Malloc(p, t, n) (p = (t) malloc((unsigned long)(n), M_RTABLE, M_NOWAIT)) +#define R_Zalloc(p, t, n) (p = (t) malloc((unsigned long)(n), M_RTABLE, M_NOWAIT | M_ZERO)) +#define Free(p) free((caddr_t)p, M_RTABLE); + +#define RADIX_NODE_HEAD_LOCK_INIT(rnh) \ + rw_init_flags(&(rnh)->rnh_lock, "radix node head", 0) +#define RADIX_NODE_HEAD_LOCK(rnh) rw_wlock(&(rnh)->rnh_lock) +#define RADIX_NODE_HEAD_UNLOCK(rnh) rw_wunlock(&(rnh)->rnh_lock) +#define RADIX_NODE_HEAD_RLOCK(rnh) rw_rlock(&(rnh)->rnh_lock) +#define RADIX_NODE_HEAD_RUNLOCK(rnh) rw_runlock(&(rnh)->rnh_lock) +#define RADIX_NODE_HEAD_LOCK_TRY_UPGRADE(rnh) rw_try_upgrade(&(rnh)->rnh_lock) + + +#define RADIX_NODE_HEAD_DESTROY(rnh) rw_destroy(&(rnh)->rnh_lock) +#define RADIX_NODE_HEAD_LOCK_ASSERT(rnh) rw_assert(&(rnh)->rnh_lock, RA_LOCKED) +#define RADIX_NODE_HEAD_WLOCK_ASSERT(rnh) rw_assert(&(rnh)->rnh_lock, RA_WLOCKED) +#endif /* _KERNEL */ + +void rn_init(int); +int rn_inithead(void **, int); +int rn_detachhead(void **); +int rn_refines(void *, void *); +struct radix_node + *rn_addmask(void *, int, int), + *rn_addroute (void *, void *, struct radix_node_head *, + struct radix_node [2]), + *rn_delete(void *, void *, struct radix_node_head *), + *rn_lookup (void *v_arg, void *m_arg, + struct radix_node_head *head), + *rn_match(void *, struct radix_node_head *); + +#endif /* _RADIX_HH_ */ diff --git a/freebsd/sys/net/radix_mpath.c b/freebsd/sys/net/radix_mpath.c new file mode 100644 index 00000000..3c348249 --- /dev/null +++ b/freebsd/sys/net/radix_mpath.c @@ -0,0 +1,365 @@ +#include + +/* $KAME: radix_mpath.c,v 1.17 2004/11/08 10:29:39 itojun Exp $ */ + +/* + * Copyright (C) 2001 WIDE Project. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the project nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * THE AUTHORS DO NOT GUARANTEE THAT THIS SOFTWARE DOES NOT INFRINGE + * ANY OTHERS' INTELLECTUAL PROPERTIES. IN NO EVENT SHALL THE AUTHORS + * BE LIABLE FOR ANY INFRINGEMENT OF ANY OTHERS' INTELLECTUAL + * PROPERTIES. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * give some jitter to hash, to avoid synchronization between routers + */ +static uint32_t hashjitter; + +int +rn_mpath_capable(struct radix_node_head *rnh) +{ + + return rnh->rnh_multipath; +} + +struct radix_node * +rn_mpath_next(struct radix_node *rn) +{ + struct radix_node *next; + + if (!rn->rn_dupedkey) + return NULL; + next = rn->rn_dupedkey; + if (rn->rn_mask == next->rn_mask) + return next; + else + return NULL; +} + +#ifdef __rtems__ +u_int32_t +#else +uint32_t +#endif +rn_mpath_count(struct radix_node *rn) +{ + uint32_t i = 0; + struct rtentry *rt; + + while (rn != NULL) { + rt = (struct rtentry *)rn; + i += rt->rt_rmx.rmx_weight; + rn = rn_mpath_next(rn); + } + return (i); +} + +struct rtentry * +rt_mpath_matchgate(struct rtentry *rt, struct sockaddr *gate) +{ + struct radix_node *rn; + + if (!rn_mpath_next((struct radix_node *)rt)) + return rt; + + if (!gate) + return NULL; + + /* beyond here, we use rn as the master copy */ + rn = (struct radix_node *)rt; + do { + rt = (struct rtentry *)rn; + /* + * we are removing an address alias that has + * the same prefix as another address + * we need to compare the interface address because + * rt_gateway is a special sockadd_dl structure + */ + if (rt->rt_gateway->sa_family == AF_LINK) { + if (!memcmp(rt->rt_ifa->ifa_addr, gate, gate->sa_len)) + break; + } else { + if (rt->rt_gateway->sa_len == gate->sa_len && + !memcmp(rt->rt_gateway, gate, gate->sa_len)) + break; + } + } while ((rn = rn_mpath_next(rn)) != NULL); + + return (struct rtentry *)rn; +} + +/* + * go through the chain and unlink "rt" from the list + * the caller will free "rt" + */ +int +rt_mpath_deldup(struct rtentry *headrt, struct rtentry *rt) +{ + struct radix_node *t, *tt; + + if (!headrt || !rt) + return (0); + t = (struct radix_node *)headrt; + tt = rn_mpath_next(t); + while (tt) { + if (tt == (struct radix_node *)rt) { + t->rn_dupedkey = tt->rn_dupedkey; + tt->rn_dupedkey = NULL; + tt->rn_flags &= ~RNF_ACTIVE; + tt[1].rn_flags &= ~RNF_ACTIVE; + return (1); + } + t = tt; + tt = rn_mpath_next((struct radix_node *)t); + } + return (0); +} + +/* + * check if we have the same key/mask/gateway on the table already. + */ +int +rt_mpath_conflict(struct radix_node_head *rnh, struct rtentry *rt, + struct sockaddr *netmask) +{ + struct radix_node *rn, *rn1; + struct rtentry *rt1; + char *p, *q, *eq; + int same, l, skip; + + rn = (struct radix_node *)rt; + rn1 = rnh->rnh_lookup(rt_key(rt), netmask, rnh); + if (!rn1 || rn1->rn_flags & RNF_ROOT) + return 0; + + /* + * unlike other functions we have in this file, we have to check + * all key/mask/gateway as rnh_lookup can match less specific entry. + */ + rt1 = (struct rtentry *)rn1; + + /* compare key. */ + if (rt_key(rt1)->sa_len != rt_key(rt)->sa_len || + bcmp(rt_key(rt1), rt_key(rt), rt_key(rt1)->sa_len)) + goto different; + + /* key was the same. compare netmask. hairy... */ + if (rt_mask(rt1) && netmask) { + skip = rnh->rnh_treetop->rn_offset; + if (rt_mask(rt1)->sa_len > netmask->sa_len) { + /* + * as rt_mask(rt1) is made optimal by radix.c, + * there must be some 1-bits on rt_mask(rt1) + * after netmask->sa_len. 
therefore, in + * this case, the entries are different. + */ + if (rt_mask(rt1)->sa_len > skip) + goto different; + else { + /* no bits to compare, i.e. same*/ + goto maskmatched; + } + } + + l = rt_mask(rt1)->sa_len; + if (skip > l) { + /* no bits to compare, i.e. same */ + goto maskmatched; + } + p = (char *)rt_mask(rt1); + q = (char *)netmask; + if (bcmp(p + skip, q + skip, l - skip)) + goto different; + /* + * need to go through all the bit, as netmask is not + * optimal and can contain trailing 0s + */ + eq = (char *)netmask + netmask->sa_len; + q += l; + same = 1; + while (eq > q) + if (*q++) { + same = 0; + break; + } + if (!same) + goto different; + } else if (!rt_mask(rt1) && !netmask) + ; /* no mask to compare, i.e. same */ + else { + /* one has mask and the other does not, different */ + goto different; + } + +maskmatched: + + /* key/mask were the same. compare gateway for all multipaths */ + do { + rt1 = (struct rtentry *)rn1; + + /* sanity: no use in comparing the same thing */ + if (rn1 == rn) + continue; + + if (rt1->rt_gateway->sa_family == AF_LINK) { + if (rt1->rt_ifa->ifa_addr->sa_len != rt->rt_ifa->ifa_addr->sa_len || + bcmp(rt1->rt_ifa->ifa_addr, rt->rt_ifa->ifa_addr, + rt1->rt_ifa->ifa_addr->sa_len)) + continue; + } else { + if (rt1->rt_gateway->sa_len != rt->rt_gateway->sa_len || + bcmp(rt1->rt_gateway, rt->rt_gateway, + rt1->rt_gateway->sa_len)) + continue; + } + + /* all key/mask/gateway are the same. conflicting entry. */ + return EEXIST; + } while ((rn1 = rn_mpath_next(rn1)) != NULL); + +different: + return 0; +} + +void +#ifdef __rtems__ +rtalloc_mpath_fib(struct route *ro, u_int32_t hash, u_int fibnum) +#else +rtalloc_mpath_fib(struct route *ro, uint32_t hash, u_int fibnum) +#endif +{ + struct radix_node *rn0, *rn; + u_int32_t n; + struct rtentry *rt; + int64_t weight; + + /* + * XXX we don't attempt to lookup cached route again; what should + * be done for sendto(3) case? + */ + if (ro->ro_rt && ro->ro_rt->rt_ifp && (ro->ro_rt->rt_flags & RTF_UP) + && RT_LINK_IS_UP(ro->ro_rt->rt_ifp)) + return; + ro->ro_rt = rtalloc1_fib(&ro->ro_dst, 1, 0, fibnum); + + /* if the route does not exist or it is not multipath, don't care */ + if (ro->ro_rt == NULL) + return; + if (rn_mpath_next((struct radix_node *)ro->ro_rt) == NULL) { + RT_UNLOCK(ro->ro_rt); + return; + } + + /* beyond here, we use rn as the master copy */ + rn0 = rn = (struct radix_node *)ro->ro_rt; + n = rn_mpath_count(rn0); + + /* gw selection by Modulo-N Hash (RFC2991) XXX need improvement? 
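+	 * (Illustrative note, not part of the imported FreeBSD source:
+	 * the flow hash, jittered per boot, is reduced modulo the summed
+	 * rmx_weight of the chain, and the loop below walks the dupedkey
+	 * list subtracting weights.  With weights 2, 1 and 1, a value of
+	 * hash % 4 in {0,1} selects the first gateway, 2 the second and
+	 * 3 the third, so a given flow sticks to one next hop.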
*/ + hash += hashjitter; + hash %= n; + for (weight = abs((int32_t)hash), rt = ro->ro_rt; + weight >= rt->rt_rmx.rmx_weight && rn; + weight -= rt->rt_rmx.rmx_weight) { + + /* stay within the multipath routes */ + if (rn->rn_dupedkey && rn->rn_mask != rn->rn_dupedkey->rn_mask) + break; + rn = rn->rn_dupedkey; + rt = (struct rtentry *)rn; + } + /* XXX try filling rt_gwroute and avoid unreachable gw */ + + /* gw selection has failed - there must be only zero weight routes */ + if (!rn) { + RT_UNLOCK(ro->ro_rt); + ro->ro_rt = NULL; + return; + } + if (ro->ro_rt != rt) { + RTFREE_LOCKED(ro->ro_rt); + ro->ro_rt = (struct rtentry *)rn; + RT_LOCK(ro->ro_rt); + RT_ADDREF(ro->ro_rt); + + } + RT_UNLOCK(ro->ro_rt); +} + +extern int in6_inithead(void **head, int off); +extern int in_inithead(void **head, int off); + +#ifdef INET +int +rn4_mpath_inithead(void **head, int off) +{ + struct radix_node_head *rnh; + + hashjitter = arc4random(); + if (in_inithead(head, off) == 1) { + rnh = (struct radix_node_head *)*head; + rnh->rnh_multipath = 1; + return 1; + } else + return 0; +} +#endif + +#ifdef INET6 +int +rn6_mpath_inithead(void **head, int off) +{ + struct radix_node_head *rnh; + + hashjitter = arc4random(); + if (in6_inithead(head, off) == 1) { + rnh = (struct radix_node_head *)*head; + rnh->rnh_multipath = 1; + return 1; + } else + return 0; +} + +#endif diff --git a/freebsd/sys/net/radix_mpath.h b/freebsd/sys/net/radix_mpath.h new file mode 100644 index 00000000..b6d8c16a --- /dev/null +++ b/freebsd/sys/net/radix_mpath.h @@ -0,0 +1,63 @@ +/* $KAME: radix_mpath.h,v 1.10 2004/11/06 15:44:28 itojun Exp $ */ + +/* + * Copyright (C) 2001 WIDE Project. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the project nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * THE AUTHORS DO NOT GUARANTEE THAT THIS SOFTWARE DOES NOT INFRINGE + * ANY OTHERS' INTELLECTUAL PROPERTIES. IN NO EVENT SHALL THE AUTHORS + * BE LIABLE FOR ANY INFRINGEMENT OF ANY OTHERS' INTELLECTUAL + * PROPERTIES. 
+ */ +/* $FreeBSD$ */ + +#ifndef _NET_RADIX_MPATH_HH_ +#define _NET_RADIX_MPATH_HH_ + +#ifdef _KERNEL +/* + * Radix tree API with multipath support + */ +struct route; +struct rtentry; +struct sockaddr; +int rn_mpath_capable(struct radix_node_head *); +struct radix_node *rn_mpath_next(struct radix_node *); +u_int32_t rn_mpath_count(struct radix_node *); +struct rtentry *rt_mpath_matchgate(struct rtentry *, struct sockaddr *); +int rt_mpath_conflict(struct radix_node_head *, struct rtentry *, + struct sockaddr *); +void rtalloc_mpath_fib(struct route *, u_int32_t, u_int); +#define rtalloc_mpath(_route, _hash) rtalloc_mpath_fib((_route), (_hash), 0) +struct radix_node *rn_mpath_lookup(void *, void *, + struct radix_node_head *); +int rt_mpath_deldup(struct rtentry *, struct rtentry *); +int rn4_mpath_inithead(void **, int); +int rn6_mpath_inithead(void **, int); + +#endif + +#endif /* _NET_RADIX_MPATH_HH_ */ diff --git a/freebsd/sys/net/raw_cb.c b/freebsd/sys/net/raw_cb.c new file mode 100644 index 00000000..2fd73dac --- /dev/null +++ b/freebsd/sys/net/raw_cb.c @@ -0,0 +1,119 @@ +#include + +/*- + * Copyright (c) 1980, 1986, 1993 + * The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)raw_cb.c 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +/* + * Routines to manage the raw protocol control blocks. + * + * TODO: + * hash lookups by protocol family/protocol + address family + * take care of unique address problems per AF? 
+ * redo address binding to allow wildcards + */ + +struct mtx rawcb_mtx; +VNET_DEFINE(struct rawcb_list_head, rawcb_list); + +SYSCTL_NODE(_net, OID_AUTO, raw, CTLFLAG_RW, 0, "Raw socket infrastructure"); + +static u_long raw_sendspace = RAWSNDQ; +SYSCTL_ULONG(_net_raw, OID_AUTO, sendspace, CTLFLAG_RW, &raw_sendspace, 0, + "Default raw socket send space"); + +static u_long raw_recvspace = RAWRCVQ; +SYSCTL_ULONG(_net_raw, OID_AUTO, recvspace, CTLFLAG_RW, &raw_recvspace, 0, + "Default raw socket receive space"); + +/* + * Allocate a control block and a nominal amount of buffer space for the + * socket. + */ +int +raw_attach(struct socket *so, int proto) +{ + struct rawcb *rp = sotorawcb(so); + int error; + + /* + * It is assumed that raw_attach is called after space has been + * allocated for the rawcb; consumer protocols may simply allocate + * type struct rawcb, or a wrapper data structure that begins with a + * struct rawcb. + */ + KASSERT(rp != NULL, ("raw_attach: rp == NULL")); + + error = soreserve(so, raw_sendspace, raw_recvspace); + if (error) + return (error); + rp->rcb_socket = so; + rp->rcb_proto.sp_family = so->so_proto->pr_domain->dom_family; + rp->rcb_proto.sp_protocol = proto; + mtx_lock(&rawcb_mtx); + LIST_INSERT_HEAD(&V_rawcb_list, rp, list); + mtx_unlock(&rawcb_mtx); + return (0); +} + +/* + * Detach the raw connection block and discard socket resources. + */ +void +raw_detach(struct rawcb *rp) +{ + struct socket *so = rp->rcb_socket; + + KASSERT(so->so_pcb == rp, ("raw_detach: so_pcb != rp")); + + so->so_pcb = NULL; + mtx_lock(&rawcb_mtx); + LIST_REMOVE(rp, list); + mtx_unlock(&rawcb_mtx); + free((caddr_t)(rp), M_PCB); +} diff --git a/freebsd/sys/net/raw_cb.h b/freebsd/sys/net/raw_cb.h new file mode 100644 index 00000000..278b4235 --- /dev/null +++ b/freebsd/sys/net/raw_cb.h @@ -0,0 +1,84 @@ +/*- + * Copyright (c) 1980, 1986, 1993 + * The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)raw_cb.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#ifndef _NET_RAW_CB_HH_ +#define _NET_RAW_CB_HH_ + +#include + +/* + * Raw protocol interface control block. Used to tie a socket to the generic + * raw interface. + */ +struct rawcb { + LIST_ENTRY(rawcb) list; + struct socket *rcb_socket; /* back pointer to socket */ + struct sockproto rcb_proto; /* protocol family, protocol */ +}; + +#define sotorawcb(so) ((struct rawcb *)(so)->so_pcb) + +/* + * Nominal space allocated to a raw socket. + */ +#define RAWSNDQ 8192 +#define RAWRCVQ 8192 + +#ifdef _KERNEL +VNET_DECLARE(LIST_HEAD(rawcb_list_head, rawcb), rawcb_list); +#define V_rawcb_list VNET(rawcb_list) + +extern struct mtx rawcb_mtx; + +/* + * Generic protosw entries for raw socket protocols. + */ +pr_ctlinput_t raw_ctlinput; +pr_init_t raw_init; + +/* + * Library routines for raw socket usrreq functions; will always be wrapped + * so that protocol-specific functions can be handled. + */ +int raw_attach(struct socket *, int); +void raw_detach(struct rawcb *); +void raw_input(struct mbuf *, struct sockproto *, struct sockaddr *); + +/* + * Generic pr_usrreqs entries for raw socket protocols, usually wrapped so + * that protocol-specific functions can be handled. + */ +extern struct pr_usrreqs raw_usrreqs; +#endif + +#endif diff --git a/freebsd/sys/net/raw_usrreq.c b/freebsd/sys/net/raw_usrreq.c new file mode 100644 index 00000000..bdf3369e --- /dev/null +++ b/freebsd/sys/net/raw_usrreq.c @@ -0,0 +1,266 @@ +#include + +/*- + * Copyright (c) 1980, 1986, 1993 + * The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)raw_usrreq.c 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +MTX_SYSINIT(rawcb_mtx, &rawcb_mtx, "rawcb", MTX_DEF); + +/* + * Initialize raw connection block q. + */ +void +raw_init(void) +{ + + LIST_INIT(&V_rawcb_list); +} + +/* + * Raw protocol input routine. 
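+ * (Illustrative note, not part of the imported FreeBSD source: every
+ * matching control block except the last receives its own m_copy() of
+ * the chain; the original mbufs are appended to the final match, which
+ * saves one copy on the common single-listener path.)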
Find the socket associated with the packet(s) + * and move them over. If nothing exists for this packet, drop it. + */ +/* + * Raw protocol interface. + */ +void +raw_input(struct mbuf *m0, struct sockproto *proto, struct sockaddr *src) +{ + struct rawcb *rp; + struct mbuf *m = m0; + struct socket *last; + + last = 0; + mtx_lock(&rawcb_mtx); + LIST_FOREACH(rp, &V_rawcb_list, list) { + if (rp->rcb_proto.sp_family != proto->sp_family) + continue; + if (rp->rcb_proto.sp_protocol && + rp->rcb_proto.sp_protocol != proto->sp_protocol) + continue; + if (last) { + struct mbuf *n; + n = m_copy(m, 0, (int)M_COPYALL); + if (n) { + if (sbappendaddr(&last->so_rcv, src, + n, (struct mbuf *)0) == 0) + /* should notify about lost packet */ + m_freem(n); + else + sorwakeup(last); + } + } + last = rp->rcb_socket; + } + if (last) { + if (sbappendaddr(&last->so_rcv, src, + m, (struct mbuf *)0) == 0) + m_freem(m); + else + sorwakeup(last); + } else + m_freem(m); + mtx_unlock(&rawcb_mtx); +} + +/*ARGSUSED*/ +void +raw_ctlinput(int cmd, struct sockaddr *arg, void *dummy) +{ + + if (cmd < 0 || cmd >= PRC_NCMDS) + return; + /* INCOMPLETE */ +} + +static void +raw_uabort(struct socket *so) +{ + + KASSERT(sotorawcb(so) != NULL, ("raw_uabort: rp == NULL")); + + soisdisconnected(so); +} + +static void +raw_uclose(struct socket *so) +{ + + KASSERT(sotorawcb(so) != NULL, ("raw_uabort: rp == NULL")); + + soisdisconnected(so); +} + +/* pru_accept is EOPNOTSUPP */ + +static int +raw_uattach(struct socket *so, int proto, struct thread *td) +{ + int error; + + /* + * Implementors of raw sockets will already have allocated the PCB, + * so it must be non-NULL here. + */ + KASSERT(sotorawcb(so) != NULL, ("raw_uattach: so_pcb == NULL")); + + if (td != NULL) { + error = priv_check(td, PRIV_NET_RAW); + if (error) + return (error); + } + return (raw_attach(so, proto)); +} + +static int +raw_ubind(struct socket *so, struct sockaddr *nam, struct thread *td) +{ + + return (EINVAL); +} + +static int +raw_uconnect(struct socket *so, struct sockaddr *nam, struct thread *td) +{ + + return (EINVAL); +} + +/* pru_connect2 is EOPNOTSUPP */ +/* pru_control is EOPNOTSUPP */ + +static void +raw_udetach(struct socket *so) +{ + struct rawcb *rp = sotorawcb(so); + + KASSERT(rp != NULL, ("raw_udetach: rp == NULL")); + + raw_detach(rp); +} + +static int +raw_udisconnect(struct socket *so) +{ + + KASSERT(sotorawcb(so) != NULL, ("raw_udisconnect: rp == NULL")); + + return (ENOTCONN); +} + +/* pru_listen is EOPNOTSUPP */ + +static int +raw_upeeraddr(struct socket *so, struct sockaddr **nam) +{ + + KASSERT(sotorawcb(so) != NULL, ("raw_upeeraddr: rp == NULL")); + + return (ENOTCONN); +} + +/* pru_rcvd is EOPNOTSUPP */ +/* pru_rcvoob is EOPNOTSUPP */ + +static int +raw_usend(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, + struct mbuf *control, struct thread *td) +{ + + KASSERT(sotorawcb(so) != NULL, ("raw_usend: rp == NULL")); + + if ((flags & PRUS_OOB) || (control && control->m_len)) { + /* XXXRW: Should control also be freed here? */ + if (m != NULL) + m_freem(m); + return (EOPNOTSUPP); + } + + /* + * For historical (bad?) reasons, we effectively ignore the address + * argument to sendto(2). Perhaps we should return an error instead? 
+ */ + return ((*so->so_proto->pr_output)(m, so)); +} + +/* pru_sense is null */ + +static int +raw_ushutdown(struct socket *so) +{ + + KASSERT(sotorawcb(so) != NULL, ("raw_ushutdown: rp == NULL")); + + socantsendmore(so); + return (0); +} + +static int +raw_usockaddr(struct socket *so, struct sockaddr **nam) +{ + + KASSERT(sotorawcb(so) != NULL, ("raw_usockaddr: rp == NULL")); + + return (EINVAL); +} + +struct pr_usrreqs raw_usrreqs = { + .pru_abort = raw_uabort, + .pru_attach = raw_uattach, + .pru_bind = raw_ubind, + .pru_connect = raw_uconnect, + .pru_detach = raw_udetach, + .pru_disconnect = raw_udisconnect, + .pru_peeraddr = raw_upeeraddr, + .pru_send = raw_usend, + .pru_shutdown = raw_ushutdown, + .pru_sockaddr = raw_usockaddr, + .pru_close = raw_uclose, +}; diff --git a/freebsd/sys/net/route.c b/freebsd/sys/net/route.c new file mode 100644 index 00000000..ee43c843 --- /dev/null +++ b/freebsd/sys/net/route.c @@ -0,0 +1,1601 @@ +#include + +/*- + * Copyright (c) 1980, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)route.c 8.3.1.1 (Berkeley) 2/23/95 + * $FreeBSD$ + */ +/************************************************************************ + * Note: In this file a 'fib' is a "forwarding information base" * + * Which is the new name for an in kernel routing (next hop) table. * + ***********************************************************************/ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#ifdef RADIX_MPATH +#include +#endif + +#include +#include + +#include + +u_int rt_numfibs = RT_NUMFIBS; +SYSCTL_INT(_net, OID_AUTO, fibs, CTLFLAG_RD, &rt_numfibs, 0, ""); +/* + * Allow the boot code to allow LESS than RT_MAXFIBS to be used. + * We can't do more because storage is statically allocated for now. + * (for compatibility reasons.. this will change). 
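+ *
+ * (Illustrative usage note, not part of the imported FreeBSD source:
+ * being declared with TUNABLE_INT below, the fib count can only be set
+ * from the loader on a stock FreeBSD system, e.g. a net.fibs="4" line
+ * in /boot/loader.conf; a process then selects one of the tables with
+ * setfib(2), implemented further down in this file.)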
+ */ +TUNABLE_INT("net.fibs", &rt_numfibs); + +/* + * By default add routes to all fibs for new interfaces. + * Once this is set to 0 then only allocate routes on interface + * changes for the FIB of the caller when adding a new set of addresses + * to an interface. XXX this is a shotgun aproach to a problem that needs + * a more fine grained solution.. that will come. + */ +u_int rt_add_addr_allfibs = 1; +SYSCTL_INT(_net, OID_AUTO, add_addr_allfibs, CTLFLAG_RW, + &rt_add_addr_allfibs, 0, ""); +TUNABLE_INT("net.add_addr_allfibs", &rt_add_addr_allfibs); + +VNET_DEFINE(struct rtstat, rtstat); +#define V_rtstat VNET(rtstat) + +VNET_DEFINE(struct radix_node_head *, rt_tables); +#define V_rt_tables VNET(rt_tables) + +VNET_DEFINE(int, rttrash); /* routes not in table but not freed */ +#define V_rttrash VNET(rttrash) + + +/* compare two sockaddr structures */ +#define sa_equal(a1, a2) (bcmp((a1), (a2), (a1)->sa_len) == 0) + +/* + * Convert a 'struct radix_node *' to a 'struct rtentry *'. + * The operation can be done safely (in this code) because a + * 'struct rtentry' starts with two 'struct radix_node''s, the first + * one representing leaf nodes in the routing tree, which is + * what the code in radix.c passes us as a 'struct radix_node'. + * + * But because there are a lot of assumptions in this conversion, + * do not cast explicitly, but always use the macro below. + */ +#define RNTORT(p) ((struct rtentry *)(p)) + +static VNET_DEFINE(uma_zone_t, rtzone); /* Routing table UMA zone. */ +#define V_rtzone VNET(rtzone) + +#if 0 +/* default fib for tunnels to use */ +u_int tunnel_fib = 0; +SYSCTL_INT(_net, OID_AUTO, tunnelfib, CTLFLAG_RD, &tunnel_fib, 0, ""); +#endif + +/* + * handler for net.my_fibnum + */ +static int +sysctl_my_fibnum(SYSCTL_HANDLER_ARGS) +{ + int fibnum; + int error; + + fibnum = curthread->td_proc->p_fibnum; + error = sysctl_handle_int(oidp, &fibnum, 0, req); + return (error); +} + +SYSCTL_PROC(_net, OID_AUTO, my_fibnum, CTLTYPE_INT|CTLFLAG_RD, + NULL, 0, &sysctl_my_fibnum, "I", "default FIB of caller"); + +static __inline struct radix_node_head ** +rt_tables_get_rnh_ptr(int table, int fam) +{ + struct radix_node_head **rnh; + + KASSERT(table >= 0 && table < rt_numfibs, ("%s: table out of bounds.", + __func__)); + KASSERT(fam >= 0 && fam < (AF_MAX+1), ("%s: fam out of bounds.", + __func__)); + + /* rnh is [fib=0][af=0]. */ + rnh = (struct radix_node_head **)V_rt_tables; + /* Get the offset to the requested table and fam. */ + rnh += table * (AF_MAX+1) + fam; + + return (rnh); +} + +struct radix_node_head * +rt_tables_get_rnh(int table, int fam) +{ + + return (*rt_tables_get_rnh_ptr(table, fam)); +} + +/* + * route initialization must occur before ip6_init2(), which happenas at + * SI_ORDER_MIDDLE. + */ +static void +route_init(void) +{ + struct domain *dom; + int max_keylen = 0; + + /* whack the tunable ints into line. 
*/ + if (rt_numfibs > RT_MAXFIBS) + rt_numfibs = RT_MAXFIBS; + if (rt_numfibs == 0) + rt_numfibs = 1; + + for (dom = domains; dom; dom = dom->dom_next) + if (dom->dom_maxrtkey > max_keylen) + max_keylen = dom->dom_maxrtkey; + + rn_init(max_keylen); /* init all zeroes, all ones, mask table */ +} +SYSINIT(route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, route_init, 0); + +static void +vnet_route_init(const void *unused __unused) +{ + struct domain *dom; + struct radix_node_head **rnh; + int table; + int fam; + + V_rt_tables = malloc(rt_numfibs * (AF_MAX+1) * + sizeof(struct radix_node_head *), M_RTABLE, M_WAITOK|M_ZERO); + + V_rtzone = uma_zcreate("rtentry", sizeof(struct rtentry), NULL, NULL, + NULL, NULL, UMA_ALIGN_PTR, 0); + for (dom = domains; dom; dom = dom->dom_next) { + if (dom->dom_rtattach) { + for (table = 0; table < rt_numfibs; table++) { + if ( (fam = dom->dom_family) == AF_INET || + table == 0) { + /* for now only AF_INET has > 1 table */ + /* XXX MRT + * rtattach will be also called + * from vfs_export.c but the + * offset will be 0 + * (only for AF_INET and AF_INET6 + * which don't need it anyhow) + */ + rnh = rt_tables_get_rnh_ptr(table, fam); + if (rnh == NULL) + panic("%s: rnh NULL", __func__); + dom->dom_rtattach((void **)rnh, + dom->dom_rtoffset); + } else { + break; + } + } + } + } +} +VNET_SYSINIT(vnet_route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, + vnet_route_init, 0); + +#ifdef VIMAGE +static void +vnet_route_uninit(const void *unused __unused) +{ + int table; + int fam; + struct domain *dom; + struct radix_node_head **rnh; + + for (dom = domains; dom; dom = dom->dom_next) { + if (dom->dom_rtdetach) { + for (table = 0; table < rt_numfibs; table++) { + if ( (fam = dom->dom_family) == AF_INET || + table == 0) { + /* For now only AF_INET has > 1 tbl. */ + rnh = rt_tables_get_rnh_ptr(table, fam); + if (rnh == NULL) + panic("%s: rnh NULL", __func__); + dom->dom_rtdetach((void **)rnh, + dom->dom_rtoffset); + } else { + break; + } + } + } + } +} +VNET_SYSUNINIT(vnet_route_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, + vnet_route_uninit, 0); +#endif + +#ifndef _SYS_SYSPROTO_HH_ +struct setfib_args { + int fibnum; +}; +#endif +int +setfib(struct thread *td, struct setfib_args *uap) +{ + if (uap->fibnum < 0 || uap->fibnum >= rt_numfibs) + return EINVAL; + td->td_proc->p_fibnum = uap->fibnum; + return (0); +} + +/* + * Packet routing routines. + */ +void +rtalloc(struct route *ro) +{ + rtalloc_ign_fib(ro, 0UL, 0); +} + +void +rtalloc_fib(struct route *ro, u_int fibnum) +{ + rtalloc_ign_fib(ro, 0UL, fibnum); +} + +void +rtalloc_ign(struct route *ro, u_long ignore) +{ + struct rtentry *rt; + + if ((rt = ro->ro_rt) != NULL) { + if (rt->rt_ifp != NULL && rt->rt_flags & RTF_UP) + return; + RTFREE(rt); + ro->ro_rt = NULL; + } + ro->ro_rt = rtalloc1_fib(&ro->ro_dst, 1, ignore, 0); + if (ro->ro_rt) + RT_UNLOCK(ro->ro_rt); +} + +void +rtalloc_ign_fib(struct route *ro, u_long ignore, u_int fibnum) +{ + struct rtentry *rt; + + if ((rt = ro->ro_rt) != NULL) { + if (rt->rt_ifp != NULL && rt->rt_flags & RTF_UP) + return; + RTFREE(rt); + ro->ro_rt = NULL; + } + ro->ro_rt = rtalloc1_fib(&ro->ro_dst, 1, ignore, fibnum); + if (ro->ro_rt) + RT_UNLOCK(ro->ro_rt); +} + +/* + * Look up the route that matches the address given + * Or, at least try.. Create a cloned route if needed. + * + * The returned route, if any, is locked. 
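+ *
+ * (Illustrative sketch of the calling convention, not part of the
+ * imported FreeBSD source: the entry comes back locked and referenced,
+ * so a typical caller looks like
+ *
+ *	struct rtentry *rt = rtalloc1(dst, 1, 0UL);
+ *	if (rt != NULL) {
+ *		... inspect rt ...
+ *		RTFREE_LOCKED(rt);
+ *	}
+ *
+ * where RTFREE_LOCKED() drops both the reference and the lock.)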
+ */ +struct rtentry * +rtalloc1(struct sockaddr *dst, int report, u_long ignflags) +{ + return (rtalloc1_fib(dst, report, ignflags, 0)); +} + +struct rtentry * +rtalloc1_fib(struct sockaddr *dst, int report, u_long ignflags, + u_int fibnum) +{ + struct radix_node_head *rnh; + struct rtentry *rt; + struct radix_node *rn; + struct rtentry *newrt; + struct rt_addrinfo info; + int err = 0, msgtype = RTM_MISS; + int needlock; + + KASSERT((fibnum < rt_numfibs), ("rtalloc1_fib: bad fibnum")); + if (dst->sa_family != AF_INET) /* Only INET supports > 1 fib now */ + fibnum = 0; + rnh = rt_tables_get_rnh(fibnum, dst->sa_family); + newrt = NULL; + /* + * Look up the address in the table for that Address Family + */ + if (rnh == NULL) { + V_rtstat.rts_unreach++; + goto miss; + } + needlock = !(ignflags & RTF_RNH_LOCKED); + if (needlock) + RADIX_NODE_HEAD_RLOCK(rnh); +#ifdef INVARIANTS + else + RADIX_NODE_HEAD_LOCK_ASSERT(rnh); +#endif + rn = rnh->rnh_matchaddr(dst, rnh); + if (rn && ((rn->rn_flags & RNF_ROOT) == 0)) { + newrt = rt = RNTORT(rn); + RT_LOCK(newrt); + RT_ADDREF(newrt); + if (needlock) + RADIX_NODE_HEAD_RUNLOCK(rnh); + goto done; + + } else if (needlock) + RADIX_NODE_HEAD_RUNLOCK(rnh); + + /* + * Either we hit the root or couldn't find any match, + * Which basically means + * "caint get there frm here" + */ + V_rtstat.rts_unreach++; +miss: + if (report) { + /* + * If required, report the failure to the supervising + * Authorities. + * For a delete, this is not an error. (report == 0) + */ + bzero(&info, sizeof(info)); + info.rti_info[RTAX_DST] = dst; + rt_missmsg(msgtype, &info, 0, err); + } +done: + if (newrt) + RT_LOCK_ASSERT(newrt); + return (newrt); +} + +/* + * Remove a reference count from an rtentry. + * If the count gets low enough, take it out of the routing table + */ +void +rtfree(struct rtentry *rt) +{ + struct radix_node_head *rnh; + + KASSERT(rt != NULL,("%s: NULL rt", __func__)); + rnh = rt_tables_get_rnh(rt->rt_fibnum, rt_key(rt)->sa_family); + KASSERT(rnh != NULL,("%s: NULL rnh", __func__)); + + RT_LOCK_ASSERT(rt); + + /* + * The callers should use RTFREE_LOCKED() or RTFREE(), so + * we should come here exactly with the last reference. + */ + RT_REMREF(rt); + if (rt->rt_refcnt > 0) { + log(LOG_DEBUG, "%s: %p has %d refs\n", __func__, rt, rt->rt_refcnt); + goto done; + } + + /* + * On last reference give the "close method" a chance + * to cleanup private state. This also permits (for + * IPv4 and IPv6) a chance to decide if the routing table + * entry should be purged immediately or at a later time. + * When an immediate purge is to happen the close routine + * typically calls rtexpunge which clears the RTF_UP flag + * on the entry so that the code below reclaims the storage. + */ + if (rt->rt_refcnt == 0 && rnh->rnh_close) + rnh->rnh_close((struct radix_node *)rt, rnh); + + /* + * If we are no longer "up" (and ref == 0) + * then we can free the resources associated + * with the route. + */ + if ((rt->rt_flags & RTF_UP) == 0) { + if (rt->rt_nodes->rn_flags & (RNF_ACTIVE | RNF_ROOT)) + panic("rtfree 2"); + /* + * the rtentry must have been removed from the routing table + * so it is represented in rttrash.. remove that now. + */ + V_rttrash--; +#ifdef DIAGNOSTIC + if (rt->rt_refcnt < 0) { + printf("rtfree: %p not freed (neg refs)\n", rt); + goto done; + } +#endif + /* + * release references on items we hold them on.. + * e.g other routes and ifaddrs. + */ + if (rt->rt_ifa) + ifa_free(rt->rt_ifa); + /* + * The key is separatly alloc'd so free it (see rt_setgate()). 
+ * This also frees the gateway, as they are always malloc'd + * together. + */ + Free(rt_key(rt)); + + /* + * and the rtentry itself of course + */ + RT_LOCK_DESTROY(rt); + uma_zfree(V_rtzone, rt); + return; + } +done: + RT_UNLOCK(rt); +} + + +/* + * Force a routing table entry to the specified + * destination to go through the given gateway. + * Normally called as a result of a routing redirect + * message from the network layer. + */ +void +rtredirect(struct sockaddr *dst, + struct sockaddr *gateway, + struct sockaddr *netmask, + int flags, + struct sockaddr *src) +{ + rtredirect_fib(dst, gateway, netmask, flags, src, 0); +} + +void +rtredirect_fib(struct sockaddr *dst, + struct sockaddr *gateway, + struct sockaddr *netmask, + int flags, + struct sockaddr *src, + u_int fibnum) +{ + struct rtentry *rt, *rt0 = NULL; + int error = 0; + short *stat = NULL; + struct rt_addrinfo info; + struct ifaddr *ifa; + struct radix_node_head *rnh; + + ifa = NULL; + rnh = rt_tables_get_rnh(fibnum, dst->sa_family); + if (rnh == NULL) { + error = EAFNOSUPPORT; + goto out; + } + + /* verify the gateway is directly reachable */ + if ((ifa = ifa_ifwithnet(gateway, 0)) == NULL) { + error = ENETUNREACH; + goto out; + } + rt = rtalloc1_fib(dst, 0, 0UL, fibnum); /* NB: rt is locked */ + /* + * If the redirect isn't from our current router for this dst, + * it's either old or wrong. If it redirects us to ourselves, + * we have a routing loop, perhaps as a result of an interface + * going down recently. + */ + if (!(flags & RTF_DONE) && rt && + (!sa_equal(src, rt->rt_gateway) || rt->rt_ifa != ifa)) + error = EINVAL; + else if (ifa_ifwithaddr_check(gateway)) + error = EHOSTUNREACH; + if (error) + goto done; + /* + * Create a new entry if we just got back a wildcard entry + * or the the lookup failed. This is necessary for hosts + * which use routing redirects generated by smart gateways + * to dynamically build the routing tables. + */ + if (rt == NULL || (rt_mask(rt) && rt_mask(rt)->sa_len < 2)) + goto create; + /* + * Don't listen to the redirect if it's + * for a route to an interface. + */ + if (rt->rt_flags & RTF_GATEWAY) { + if (((rt->rt_flags & RTF_HOST) == 0) && (flags & RTF_HOST)) { + /* + * Changing from route to net => route to host. + * Create new route, rather than smashing route to net. + */ + create: + rt0 = rt; + rt = NULL; + + flags |= RTF_GATEWAY | RTF_DYNAMIC; + bzero((caddr_t)&info, sizeof(info)); + info.rti_info[RTAX_DST] = dst; + info.rti_info[RTAX_GATEWAY] = gateway; + info.rti_info[RTAX_NETMASK] = netmask; + info.rti_ifa = ifa; + info.rti_flags = flags; + if (rt0 != NULL) + RT_UNLOCK(rt0); /* drop lock to avoid LOR with RNH */ + error = rtrequest1_fib(RTM_ADD, &info, &rt, fibnum); + if (rt != NULL) { + RT_LOCK(rt); + if (rt0 != NULL) + EVENTHANDLER_INVOKE(route_redirect_event, rt0, rt, dst); + flags = rt->rt_flags; + } + if (rt0 != NULL) + RTFREE(rt0); + + stat = &V_rtstat.rts_dynamic; + } else { + struct rtentry *gwrt; + + /* + * Smash the current notion of the gateway to + * this destination. Should check about netmask!!! + */ + rt->rt_flags |= RTF_MODIFIED; + flags |= RTF_MODIFIED; + stat = &V_rtstat.rts_newgateway; + /* + * add the key and gateway (in one malloc'd chunk). 
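+			 * (illustrative note, not in the imported file:
+			 * rt_setgate() places rt_key and rt_gateway back
+			 * to back in one buffer, which is why rtfree()
+			 * above can release both through a single
+			 * Free(rt_key(rt)) call.)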
+ */ + RT_UNLOCK(rt); + RADIX_NODE_HEAD_LOCK(rnh); + RT_LOCK(rt); + rt_setgate(rt, rt_key(rt), gateway); + gwrt = rtalloc1(gateway, 1, RTF_RNH_LOCKED); + RADIX_NODE_HEAD_UNLOCK(rnh); + EVENTHANDLER_INVOKE(route_redirect_event, rt, gwrt, dst); + RTFREE_LOCKED(gwrt); + } + } else + error = EHOSTUNREACH; +done: + if (rt) + RTFREE_LOCKED(rt); +out: + if (error) + V_rtstat.rts_badredirect++; + else if (stat != NULL) + (*stat)++; + bzero((caddr_t)&info, sizeof(info)); + info.rti_info[RTAX_DST] = dst; + info.rti_info[RTAX_GATEWAY] = gateway; + info.rti_info[RTAX_NETMASK] = netmask; + info.rti_info[RTAX_AUTHOR] = src; + rt_missmsg(RTM_REDIRECT, &info, flags, error); + if (ifa != NULL) + ifa_free(ifa); +} + +int +rtioctl(u_long req, caddr_t data) +{ + return (rtioctl_fib(req, data, 0)); +} + +/* + * Routing table ioctl interface. + */ +int +rtioctl_fib(u_long req, caddr_t data, u_int fibnum) +{ + + /* + * If more ioctl commands are added here, make sure the proper + * super-user checks are being performed because it is possible for + * prison-root to make it this far if raw sockets have been enabled + * in jails. + */ +#ifdef INET + /* Multicast goop, grrr... */ + return mrt_ioctl ? mrt_ioctl(req, data, fibnum) : EOPNOTSUPP; +#else /* INET */ + return ENXIO; +#endif /* INET */ +} + +/* + * For both ifa_ifwithroute() routines, 'ifa' is returned referenced. + */ +struct ifaddr * +ifa_ifwithroute(int flags, struct sockaddr *dst, struct sockaddr *gateway) +{ + return (ifa_ifwithroute_fib(flags, dst, gateway, 0)); +} + +struct ifaddr * +ifa_ifwithroute_fib(int flags, struct sockaddr *dst, struct sockaddr *gateway, + u_int fibnum) +{ + register struct ifaddr *ifa; + int not_found = 0; + + if ((flags & RTF_GATEWAY) == 0) { + /* + * If we are adding a route to an interface, + * and the interface is a pt to pt link + * we should search for the destination + * as our clue to the interface. Otherwise + * we can use the local address. + */ + ifa = NULL; + if (flags & RTF_HOST) + ifa = ifa_ifwithdstaddr(dst); + if (ifa == NULL) + ifa = ifa_ifwithaddr(gateway); + } else { + /* + * If we are adding a route to a remote net + * or host, the gateway may still be on the + * other end of a pt to pt link. 
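+ * Try the gateway as a peer (destination) address first;
+ * ifa_ifwithnet() below is the fallback.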
+ */ + ifa = ifa_ifwithdstaddr(gateway); + } + if (ifa == NULL) + ifa = ifa_ifwithnet(gateway, 0); + if (ifa == NULL) { + struct rtentry *rt = rtalloc1_fib(gateway, 0, RTF_RNH_LOCKED, fibnum); + if (rt == NULL) + return (NULL); + /* + * dismiss a gateway that is reachable only + * through the default router + */ + switch (gateway->sa_family) { + case AF_INET: + if (satosin(rt_key(rt))->sin_addr.s_addr == INADDR_ANY) + not_found = 1; + break; + case AF_INET6: + if (IN6_IS_ADDR_UNSPECIFIED(&satosin6(rt_key(rt))->sin6_addr)) + not_found = 1; + break; + default: + break; + } + if (!not_found && rt->rt_ifa != NULL) { + ifa = rt->rt_ifa; + ifa_ref(ifa); + } + RT_REMREF(rt); + RT_UNLOCK(rt); + if (not_found || ifa == NULL) + return (NULL); + } + if (ifa->ifa_addr->sa_family != dst->sa_family) { + struct ifaddr *oifa = ifa; + ifa = ifaof_ifpforaddr(dst, ifa->ifa_ifp); + if (ifa == NULL) + ifa = oifa; + else + ifa_free(oifa); + } + return (ifa); +} + +/* + * Do appropriate manipulations of a routing tree given + * all the bits of info needed + */ +int +rtrequest(int req, + struct sockaddr *dst, + struct sockaddr *gateway, + struct sockaddr *netmask, + int flags, + struct rtentry **ret_nrt) +{ + return (rtrequest_fib(req, dst, gateway, netmask, flags, ret_nrt, 0)); +} + +int +rtrequest_fib(int req, + struct sockaddr *dst, + struct sockaddr *gateway, + struct sockaddr *netmask, + int flags, + struct rtentry **ret_nrt, + u_int fibnum) +{ + struct rt_addrinfo info; + + if (dst->sa_len == 0) + return(EINVAL); + + bzero((caddr_t)&info, sizeof(info)); + info.rti_flags = flags; + info.rti_info[RTAX_DST] = dst; + info.rti_info[RTAX_GATEWAY] = gateway; + info.rti_info[RTAX_NETMASK] = netmask; + return rtrequest1_fib(req, &info, ret_nrt, fibnum); +} + +/* + * These (questionable) definitions of apparent local variables apply + * to the next two functions. XXXXXX!!! + */ +#define dst info->rti_info[RTAX_DST] +#define gateway info->rti_info[RTAX_GATEWAY] +#define netmask info->rti_info[RTAX_NETMASK] +#define ifaaddr info->rti_info[RTAX_IFA] +#define ifpaddr info->rti_info[RTAX_IFP] +#define flags info->rti_flags + +int +rt_getifa(struct rt_addrinfo *info) +{ + return (rt_getifa_fib(info, 0)); +} + +/* + * Look up rt_addrinfo for a specific fib. Note that if rti_ifa is defined, + * it will be referenced so the caller must free it. + */ +int +rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum) +{ + struct ifaddr *ifa; + int error = 0; + + /* + * ifp may be specified by sockaddr_dl + * when protocol address is ambiguous. + */ + if (info->rti_ifp == NULL && ifpaddr != NULL && + ifpaddr->sa_family == AF_LINK && + (ifa = ifa_ifwithnet(ifpaddr, 0)) != NULL) { + info->rti_ifp = ifa->ifa_ifp; + ifa_free(ifa); + } + if (info->rti_ifa == NULL && ifaaddr != NULL) + info->rti_ifa = ifa_ifwithaddr(ifaaddr); + if (info->rti_ifa == NULL) { + struct sockaddr *sa; + + sa = ifaaddr != NULL ? ifaaddr : + (gateway != NULL ? gateway : dst); + if (sa != NULL && info->rti_ifp != NULL) + info->rti_ifa = ifaof_ifpforaddr(sa, info->rti_ifp); + else if (dst != NULL && gateway != NULL) + info->rti_ifa = ifa_ifwithroute_fib(flags, dst, gateway, + fibnum); + else if (sa != NULL) + info->rti_ifa = ifa_ifwithroute_fib(flags, sa, sa, + fibnum); + } + if ((ifa = info->rti_ifa) != NULL) { + if (info->rti_ifp == NULL) + info->rti_ifp = ifa->ifa_ifp; + } else + error = ENETUNREACH; + return (error); +} + +/* + * Expunges references to a route that's about to be reclaimed. + * The route must be locked. 
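+ * Only the tree linkage and RTF_UP are torn down here; the
+ * storage itself is reclaimed later by rtfree().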
+ */ +int +rtexpunge(struct rtentry *rt) +{ +#if !defined(RADIX_MPATH) + struct radix_node *rn; +#else + struct rt_addrinfo info; + int fib; + struct rtentry *rt0; +#endif + struct radix_node_head *rnh; + struct ifaddr *ifa; + int error = 0; + + /* + * Find the correct routing tree to use for this Address Family + */ + rnh = rt_tables_get_rnh(rt->rt_fibnum, rt_key(rt)->sa_family); + RT_LOCK_ASSERT(rt); + if (rnh == NULL) + return (EAFNOSUPPORT); + RADIX_NODE_HEAD_LOCK_ASSERT(rnh); + +#ifdef RADIX_MPATH + fib = rt->rt_fibnum; + bzero(&info, sizeof(info)); + info.rti_ifp = rt->rt_ifp; + info.rti_flags = RTF_RNH_LOCKED; + info.rti_info[RTAX_DST] = rt_key(rt); + info.rti_info[RTAX_GATEWAY] = rt->rt_ifa->ifa_addr; + + RT_UNLOCK(rt); + error = rtrequest1_fib(RTM_DELETE, &info, &rt0, fib); + + if (error == 0 && rt0 != NULL) { + rt = rt0; + RT_LOCK(rt); + } else if (error != 0) { + RT_LOCK(rt); + return (error); + } +#else + /* + * Remove the item from the tree; it should be there, + * but when callers invoke us blindly it may not (sigh). + */ + rn = rnh->rnh_deladdr(rt_key(rt), rt_mask(rt), rnh); + if (rn == NULL) { + error = ESRCH; + goto bad; + } + KASSERT((rn->rn_flags & (RNF_ACTIVE | RNF_ROOT)) == 0, + ("unexpected flags 0x%x", rn->rn_flags)); + KASSERT(rt == RNTORT(rn), + ("lookup mismatch, rt %p rn %p", rt, rn)); +#endif /* RADIX_MPATH */ + + rt->rt_flags &= ~RTF_UP; + + /* + * Give the protocol a chance to keep things in sync. + */ + if ((ifa = rt->rt_ifa) && ifa->ifa_rtrequest) { + struct rt_addrinfo info; + + bzero((caddr_t)&info, sizeof(info)); + info.rti_flags = rt->rt_flags; + info.rti_info[RTAX_DST] = rt_key(rt); + info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; + info.rti_info[RTAX_NETMASK] = rt_mask(rt); + ifa->ifa_rtrequest(RTM_DELETE, rt, &info); + } + + /* + * one more rtentry floating around that is not + * linked to the routing table. + */ + V_rttrash++; +#if !defined(RADIX_MPATH) +bad: +#endif + return (error); +} + +#ifdef RADIX_MPATH +static int +rn_mpath_update(int req, struct rt_addrinfo *info, + struct radix_node_head *rnh, struct rtentry **ret_nrt) +{ + /* + * if we got multipath routes, we require users to specify + * a matching RTAX_GATEWAY. + */ + struct rtentry *rt, *rto = NULL; + register struct radix_node *rn; + int error = 0; + + rn = rnh->rnh_matchaddr(dst, rnh); + if (rn == NULL) + return (ESRCH); + rto = rt = RNTORT(rn); + rt = rt_mpath_matchgate(rt, gateway); + if (rt == NULL) + return (ESRCH); + /* + * this is the first entry in the chain + */ + if (rto == rt) { + rn = rn_mpath_next((struct radix_node *)rt); + /* + * there is another entry, now it's active + */ + if (rn) { + rto = RNTORT(rn); + RT_LOCK(rto); + rto->rt_flags |= RTF_UP; + RT_UNLOCK(rto); + } else if (rt->rt_flags & RTF_GATEWAY) { + /* + * For gateway routes, we need to + * make sure that we we are deleting + * the correct gateway. + * rt_mpath_matchgate() does not + * check the case when there is only + * one route in the chain. 
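+ * Compare sa_len and the gateway bytes explicitly for that case.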
+ */ + if (gateway && + (rt->rt_gateway->sa_len != gateway->sa_len || + memcmp(rt->rt_gateway, gateway, gateway->sa_len))) + error = ESRCH; + else { + /* + * remove from tree before returning it + * to the caller + */ + rn = rnh->rnh_deladdr(dst, netmask, rnh); + KASSERT(rt == RNTORT(rn), ("radix node disappeared")); + goto gwdelete; + } + + } + /* + * use the normal delete code to remove + * the first entry + */ + if (req != RTM_DELETE) + goto nondelete; + + error = ENOENT; + goto done; + } + + /* + * if the entry is 2nd and on up + */ + if ((req == RTM_DELETE) && !rt_mpath_deldup(rto, rt)) + panic ("rtrequest1: rt_mpath_deldup"); +gwdelete: + RT_LOCK(rt); + RT_ADDREF(rt); + if (req == RTM_DELETE) { + rt->rt_flags &= ~RTF_UP; + /* + * One more rtentry floating around that is not + * linked to the routing table. rttrash will be decremented + * when RTFREE(rt) is eventually called. + */ + V_rttrash++; + } + +nondelete: + if (req != RTM_DELETE) + panic("unrecognized request %d", req); + + + /* + * If the caller wants it, then it can have it, + * but it's up to it to free the rtentry as we won't be + * doing it. + */ + if (ret_nrt) { + *ret_nrt = rt; + RT_UNLOCK(rt); + } else + RTFREE_LOCKED(rt); +done: + return (error); +} +#endif + +int +rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt, + u_int fibnum) +{ + int error = 0, needlock = 0; + register struct rtentry *rt; +#ifdef FLOWTABLE + register struct rtentry *rt0; +#endif + register struct radix_node *rn; + register struct radix_node_head *rnh; + struct ifaddr *ifa; + struct sockaddr *ndst; +#define senderr(x) { error = x ; goto bad; } + + KASSERT((fibnum < rt_numfibs), ("rtrequest1_fib: bad fibnum")); + if (dst->sa_family != AF_INET) /* Only INET supports > 1 fib now */ + fibnum = 0; + /* + * Find the correct routing tree to use for this Address Family + */ + rnh = rt_tables_get_rnh(fibnum, dst->sa_family); + if (rnh == NULL) + return (EAFNOSUPPORT); + needlock = ((flags & RTF_RNH_LOCKED) == 0); + flags &= ~RTF_RNH_LOCKED; + if (needlock) + RADIX_NODE_HEAD_LOCK(rnh); + else + RADIX_NODE_HEAD_LOCK_ASSERT(rnh); + /* + * If we are adding a host route then we don't want to put + * a netmask in the tree, nor do we want to clone it. + */ + if (flags & RTF_HOST) + netmask = NULL; + + switch (req) { + case RTM_DELETE: +#ifdef RADIX_MPATH + if (rn_mpath_capable(rnh)) { + error = rn_mpath_update(req, info, rnh, ret_nrt); + /* + * "bad" holds true for the success case + * as well + */ + if (error != ENOENT) + goto bad; + error = 0; + } +#endif + /* + * Remove the item from the tree and return it. + * Complain if it is not there and do no more processing. + */ + rn = rnh->rnh_deladdr(dst, netmask, rnh); + if (rn == NULL) + senderr(ESRCH); + if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT)) + panic ("rtrequest delete"); + rt = RNTORT(rn); + RT_LOCK(rt); + RT_ADDREF(rt); + rt->rt_flags &= ~RTF_UP; + + /* + * give the protocol a chance to keep things in sync. + */ + if ((ifa = rt->rt_ifa) && ifa->ifa_rtrequest) + ifa->ifa_rtrequest(RTM_DELETE, rt, info); + + /* + * One more rtentry floating around that is not + * linked to the routing table. rttrash will be decremented + * when RTFREE(rt) is eventually called. + */ + V_rttrash++; + + /* + * If the caller wants it, then it can have it, + * but it's up to it to free the rtentry as we won't be + * doing it. 
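+ * (The reference taken with RT_ADDREF() above travels with it.)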
+ */ + if (ret_nrt) { + *ret_nrt = rt; + RT_UNLOCK(rt); + } else + RTFREE_LOCKED(rt); + break; + case RTM_RESOLVE: + /* + * resolve was only used for route cloning + * here for compat + */ + break; + case RTM_ADD: + if ((flags & RTF_GATEWAY) && !gateway) + senderr(EINVAL); + if (dst && gateway && (dst->sa_family != gateway->sa_family) && + (gateway->sa_family != AF_UNSPEC) && (gateway->sa_family != AF_LINK)) + senderr(EINVAL); + + if (info->rti_ifa == NULL) { + error = rt_getifa_fib(info, fibnum); + if (error) + senderr(error); + } else + ifa_ref(info->rti_ifa); + ifa = info->rti_ifa; + rt = uma_zalloc(V_rtzone, M_NOWAIT | M_ZERO); + if (rt == NULL) { + if (ifa != NULL) + ifa_free(ifa); + senderr(ENOBUFS); + } + RT_LOCK_INIT(rt); + rt->rt_flags = RTF_UP | flags; + rt->rt_fibnum = fibnum; + /* + * Add the gateway. Possibly re-malloc-ing the storage for it + * + */ + RT_LOCK(rt); + if ((error = rt_setgate(rt, dst, gateway)) != 0) { + RT_LOCK_DESTROY(rt); + if (ifa != NULL) + ifa_free(ifa); + uma_zfree(V_rtzone, rt); + senderr(error); + } + + /* + * point to the (possibly newly malloc'd) dest address. + */ + ndst = (struct sockaddr *)rt_key(rt); + + /* + * make sure it contains the value we want (masked if needed). + */ + if (netmask) { + rt_maskedcopy(dst, ndst, netmask); + } else + bcopy(dst, ndst, dst->sa_len); + + /* + * We use the ifa reference returned by rt_getifa_fib(). + * This moved from below so that rnh->rnh_addaddr() can + * examine the ifa and ifa->ifa_ifp if it so desires. + */ + rt->rt_ifa = ifa; + rt->rt_ifp = ifa->ifa_ifp; + rt->rt_rmx.rmx_weight = 1; + +#ifdef RADIX_MPATH + /* do not permit exactly the same dst/mask/gw pair */ + if (rn_mpath_capable(rnh) && + rt_mpath_conflict(rnh, rt, netmask)) { + if (rt->rt_ifa) { + ifa_free(rt->rt_ifa); + } + Free(rt_key(rt)); + RT_LOCK_DESTROY(rt); + uma_zfree(V_rtzone, rt); + senderr(EEXIST); + } +#endif + +#ifdef FLOWTABLE + rt0 = NULL; + /* XXX + * "flow-table" only support IPv4 at the moment. + */ +#ifdef INET + if (dst->sa_family == AF_INET) { + rn = rnh->rnh_matchaddr(dst, rnh); + if (rn && ((rn->rn_flags & RNF_ROOT) == 0)) { + struct sockaddr *mask; + u_char *m, *n; + int len; + + /* + * compare mask to see if the new route is + * more specific than the existing one + */ + rt0 = RNTORT(rn); + RT_LOCK(rt0); + RT_ADDREF(rt0); + RT_UNLOCK(rt0); + /* + * A host route is already present, so + * leave the flow-table entries as is. + */ + if (rt0->rt_flags & RTF_HOST) { + RTFREE(rt0); + rt0 = NULL; + } else if (!(flags & RTF_HOST) && netmask) { + mask = rt_mask(rt0); + len = mask->sa_len; + m = (u_char *)mask; + n = (u_char *)netmask; + while (len-- > 0) { + if (*n != *m) + break; + n++; + m++; + } + if (len == 0 || (*n < *m)) { + RTFREE(rt0); + rt0 = NULL; + } + } + } + } +#endif +#endif + + /* XXX mtu manipulation will be done in rnh_addaddr -- itojun */ + rn = rnh->rnh_addaddr(ndst, netmask, rnh, rt->rt_nodes); + /* + * If it still failed to go into the tree, + * then un-make it (this should be a function) + */ + if (rn == NULL) { + if (rt->rt_ifa) + ifa_free(rt->rt_ifa); + Free(rt_key(rt)); + RT_LOCK_DESTROY(rt); + uma_zfree(V_rtzone, rt); +#ifdef FLOWTABLE + if (rt0 != NULL) + RTFREE(rt0); +#endif + senderr(EEXIST); + } +#ifdef FLOWTABLE + else if (rt0 != NULL) { +#ifdef INET + flowtable_route_flush(V_ip_ft, rt0); +#endif + RTFREE(rt0); + } +#endif + + /* + * If this protocol has something to add to this then + * allow it to do that as well. 
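+ * (Per-family hooks, e.g. nd6_rtrequest() for IPv6, hang off
+ * ifa_rtrequest.)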
+ */ + if (ifa->ifa_rtrequest) + ifa->ifa_rtrequest(req, rt, info); + + /* + * actually return a resultant rtentry and + * give the caller a single reference. + */ + if (ret_nrt) { + *ret_nrt = rt; + RT_ADDREF(rt); + } + RT_UNLOCK(rt); + break; + default: + error = EOPNOTSUPP; + } +bad: + if (needlock) + RADIX_NODE_HEAD_UNLOCK(rnh); + return (error); +#undef senderr +} + +#undef dst +#undef gateway +#undef netmask +#undef ifaaddr +#undef ifpaddr +#undef flags + +int +rt_setgate(struct rtentry *rt, struct sockaddr *dst, struct sockaddr *gate) +{ + /* XXX dst may be overwritten, can we move this to below */ + int dlen = SA_SIZE(dst), glen = SA_SIZE(gate); +#ifdef INVARIANTS + struct radix_node_head *rnh; + + rnh = rt_tables_get_rnh(rt->rt_fibnum, dst->sa_family); +#endif + + RT_LOCK_ASSERT(rt); + RADIX_NODE_HEAD_LOCK_ASSERT(rnh); + + /* + * Prepare to store the gateway in rt->rt_gateway. + * Both dst and gateway are stored one after the other in the same + * malloc'd chunk. If we have room, we can reuse the old buffer, + * rt_gateway already points to the right place. + * Otherwise, malloc a new block and update the 'dst' address. + */ + if (rt->rt_gateway == NULL || glen > SA_SIZE(rt->rt_gateway)) { + caddr_t new; + + R_Malloc(new, caddr_t, dlen + glen); + if (new == NULL) + return ENOBUFS; + /* + * XXX note, we copy from *dst and not *rt_key(rt) because + * rt_setgate() can be called to initialize a newly + * allocated route entry, in which case rt_key(rt) == NULL + * (and also rt->rt_gateway == NULL). + * Free()/free() handle a NULL argument just fine. + */ + bcopy(dst, new, dlen); + Free(rt_key(rt)); /* free old block, if any */ + rt_key(rt) = (struct sockaddr *)new; + rt->rt_gateway = (struct sockaddr *)(new + dlen); + } + + /* + * Copy the new gateway value into the memory chunk. + */ + bcopy(gate, rt->rt_gateway, glen); + + return (0); +} + +void +rt_maskedcopy(struct sockaddr *src, struct sockaddr *dst, struct sockaddr *netmask) +{ + register u_char *cp1 = (u_char *)src; + register u_char *cp2 = (u_char *)dst; + register u_char *cp3 = (u_char *)netmask; + u_char *cplim = cp2 + *cp3; + u_char *cplim2 = cp2 + *cp1; + + *cp2++ = *cp1++; *cp2++ = *cp1++; /* copies sa_len & sa_family */ + cp3 += 2; + if (cplim > cplim2) + cplim = cplim2; + while (cp2 < cplim) + *cp2++ = *cp1++ & *cp3++; + if (cp2 < cplim2) + bzero((caddr_t)cp2, (unsigned)(cplim2 - cp2)); +} + +/* + * Set up a routing table entry, normally + * for an interface. + */ +#define _SOCKADDR_TMPSIZE 128 /* Not too big.. 
kernel stack size is limited */ +static inline int +rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum) +{ + struct sockaddr *dst; + struct sockaddr *netmask; + struct rtentry *rt = NULL; + struct rt_addrinfo info; + int error = 0; + int startfib, endfib; + char tempbuf[_SOCKADDR_TMPSIZE]; + int didwork = 0; + int a_failure = 0; + static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK}; + + if (flags & RTF_HOST) { + dst = ifa->ifa_dstaddr; + netmask = NULL; + } else { + dst = ifa->ifa_addr; + netmask = ifa->ifa_netmask; + } + if ( dst->sa_family != AF_INET) + fibnum = 0; + if (fibnum == -1) { + if (rt_add_addr_allfibs == 0 && cmd == (int)RTM_ADD) { + startfib = endfib = curthread->td_proc->p_fibnum; + } else { + startfib = 0; + endfib = rt_numfibs - 1; + } + } else { + KASSERT((fibnum < rt_numfibs), ("rtinit1: bad fibnum")); + startfib = fibnum; + endfib = fibnum; + } + if (dst->sa_len == 0) + return(EINVAL); + + /* + * If it's a delete, check that if it exists, + * it's on the correct interface or we might scrub + * a route to another ifa which would + * be confusing at best and possibly worse. + */ + if (cmd == RTM_DELETE) { + /* + * It's a delete, so it should already exist.. + * If it's a net, mask off the host bits + * (Assuming we have a mask) + * XXX this is kinda inet specific.. + */ + if (netmask != NULL) { + rt_maskedcopy(dst, (struct sockaddr *)tempbuf, netmask); + dst = (struct sockaddr *)tempbuf; + } + } + /* + * Now go through all the requested tables (fibs) and do the + * requested action. Realistically, this will either be fib 0 + * for protocols that don't do multiple tables or all the + * tables for those that do. XXX For this version only AF_INET. + * When that changes code should be refactored to protocol + * independent parts and protocol dependent parts. + */ + for ( fibnum = startfib; fibnum <= endfib; fibnum++) { + if (cmd == RTM_DELETE) { + struct radix_node_head *rnh; + struct radix_node *rn; + /* + * Look up an rtentry that is in the routing tree and + * contains the correct info. 
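+ * A delete must match the ifa as well as the key, so peek into
+ * the tree here rather than calling rtrequest1_fib() blindly.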
+ */ + rnh = rt_tables_get_rnh(fibnum, dst->sa_family); + if (rnh == NULL) + /* this table doesn't exist but others might */ + continue; + RADIX_NODE_HEAD_LOCK(rnh); +#ifdef RADIX_MPATH + if (rn_mpath_capable(rnh)) { + + rn = rnh->rnh_matchaddr(dst, rnh); + if (rn == NULL) + error = ESRCH; + else { + rt = RNTORT(rn); + /* + * for interface route the + * rt->rt_gateway is sockaddr_intf + * for cloning ARP entries, so + * rt_mpath_matchgate must use the + * interface address + */ + rt = rt_mpath_matchgate(rt, + ifa->ifa_addr); + if (!rt) + error = ESRCH; + } + } + else +#endif + rn = rnh->rnh_lookup(dst, netmask, rnh); + error = (rn == NULL || + (rn->rn_flags & RNF_ROOT) || + RNTORT(rn)->rt_ifa != ifa || + !sa_equal((struct sockaddr *)rn->rn_key, dst)); + RADIX_NODE_HEAD_UNLOCK(rnh); + if (error) { + /* this is only an error if bad on ALL tables */ + continue; + } + } + /* + * Do the actual request + */ + bzero((caddr_t)&info, sizeof(info)); + info.rti_ifa = ifa; + info.rti_flags = flags | ifa->ifa_flags; + info.rti_info[RTAX_DST] = dst; + /* + * doing this for compatibility reasons + */ + if (cmd == RTM_ADD) + info.rti_info[RTAX_GATEWAY] = + (struct sockaddr *)&null_sdl; + else + info.rti_info[RTAX_GATEWAY] = ifa->ifa_addr; + info.rti_info[RTAX_NETMASK] = netmask; + error = rtrequest1_fib(cmd, &info, &rt, fibnum); + if (error == 0 && rt != NULL) { + /* + * notify any listening routing agents of the change + */ + RT_LOCK(rt); +#ifdef RADIX_MPATH + /* + * in case address alias finds the first address + * e.g. ifconfig bge0 192.103.54.246/24 + * e.g. ifconfig bge0 192.103.54.247/24 + * the address set in the route is 192.103.54.246 + * so we need to replace it with 192.103.54.247 + */ + if (memcmp(rt->rt_ifa->ifa_addr, + ifa->ifa_addr, ifa->ifa_addr->sa_len)) { + ifa_free(rt->rt_ifa); + ifa_ref(ifa); + rt->rt_ifp = ifa->ifa_ifp; + rt->rt_ifa = ifa; + } +#endif + /* + * doing this for compatibility reasons + */ + if (cmd == RTM_ADD) { + ((struct sockaddr_dl *)rt->rt_gateway)->sdl_type = + rt->rt_ifp->if_type; + ((struct sockaddr_dl *)rt->rt_gateway)->sdl_index = + rt->rt_ifp->if_index; + } + RT_ADDREF(rt); + RT_UNLOCK(rt); + rt_newaddrmsg(cmd, ifa, error, rt); + RT_LOCK(rt); + RT_REMREF(rt); + if (cmd == RTM_DELETE) { + /* + * If we are deleting, and we found an entry, + * then it's been removed from the tree.. + * now throw it away. + */ + RTFREE_LOCKED(rt); + } else { + if (cmd == RTM_ADD) { + /* + * We just wanted to add it.. + * we don't actually need a reference. + */ + RT_REMREF(rt); + } + RT_UNLOCK(rt); + } + didwork = 1; + } + if (error) + a_failure = error; + } + if (cmd == RTM_DELETE) { + if (didwork) { + error = 0; + } else { + /* we only give an error if it wasn't in any table */ + error = ((flags & RTF_HOST) ? + EHOSTUNREACH : ENETUNREACH); + } + } else { + if (a_failure) { + /* return an error if any of them failed */ + error = a_failure; + } + } + return (error); +} + +/* special one for inet internal use. may not use. */ +int +rtinit_fib(struct ifaddr *ifa, int cmd, int flags) +{ + return (rtinit1(ifa, cmd, flags, -1)); +} + +/* + * Set up a routing table entry, normally + * for an interface. 
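+ * For AF_INET, fib -1 is passed down so that rtinit1() repeats
+ * the request for every fib (subject to rt_add_addr_allfibs).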
+ */ +int +rtinit(struct ifaddr *ifa, int cmd, int flags) +{ + struct sockaddr *dst; + int fib = 0; + + if (flags & RTF_HOST) { + dst = ifa->ifa_dstaddr; + } else { + dst = ifa->ifa_addr; + } + + if (dst->sa_family == AF_INET) + fib = -1; + return (rtinit1(ifa, cmd, flags, fib)); +} diff --git a/freebsd/sys/net/route.h b/freebsd/sys/net/route.h new file mode 100644 index 00000000..4375020f --- /dev/null +++ b/freebsd/sys/net/route.h @@ -0,0 +1,2 @@ +#include +#include diff --git a/freebsd/sys/net/rtsock.c b/freebsd/sys/net/rtsock.c new file mode 100644 index 00000000..287dd74d --- /dev/null +++ b/freebsd/sys/net/rtsock.c @@ -0,0 +1,1702 @@ +#include + +/*- + * Copyright (c) 1988, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)rtsock.c 8.7 (Berkeley) 10/12/95 + * $FreeBSD$ + */ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#ifdef INET6 +#include +#endif + +#if defined(INET) || defined(INET6) +#ifdef SCTP +extern void sctp_addr_change(struct ifaddr *ifa, int cmd); +#endif /* SCTP */ +#endif + +#ifdef COMPAT_FREEBSD32 +#include +#include + +struct if_data32 { + uint8_t ifi_type; + uint8_t ifi_physical; + uint8_t ifi_addrlen; + uint8_t ifi_hdrlen; + uint8_t ifi_link_state; + uint8_t ifi_spare_char1; + uint8_t ifi_spare_char2; + uint8_t ifi_datalen; + uint32_t ifi_mtu; + uint32_t ifi_metric; + uint32_t ifi_baudrate; + uint32_t ifi_ipackets; + uint32_t ifi_ierrors; + uint32_t ifi_opackets; + uint32_t ifi_oerrors; + uint32_t ifi_collisions; + uint32_t ifi_ibytes; + uint32_t ifi_obytes; + uint32_t ifi_imcasts; + uint32_t ifi_omcasts; + uint32_t ifi_iqdrops; + uint32_t ifi_noproto; + uint32_t ifi_hwassist; + int32_t ifi_epoch; + struct timeval32 ifi_lastchange; +}; + +struct if_msghdr32 { + uint16_t ifm_msglen; + uint8_t ifm_version; + uint8_t ifm_type; + int32_t ifm_addrs; + int32_t ifm_flags; + uint16_t ifm_index; + struct if_data32 ifm_data; +}; +#endif + +MALLOC_DEFINE(M_RTABLE, "routetbl", "routing tables"); + +/* NB: these are not modified */ +static struct sockaddr route_src = { 2, PF_ROUTE, }; +static struct sockaddr sa_zero = { sizeof(sa_zero), AF_INET, }; + +static struct { + int ip_count; /* attached w/ AF_INET */ + int ip6_count; /* attached w/ AF_INET6 */ + int ipx_count; /* attached w/ AF_IPX */ + int any_count; /* total attached */ +} route_cb; + +struct mtx rtsock_mtx; +MTX_SYSINIT(rtsock, &rtsock_mtx, "rtsock route_cb lock", MTX_DEF); + +#define RTSOCK_LOCK() mtx_lock(&rtsock_mtx) +#define RTSOCK_UNLOCK() mtx_unlock(&rtsock_mtx) +#define RTSOCK_LOCK_ASSERT() mtx_assert(&rtsock_mtx, MA_OWNED) + +SYSCTL_NODE(_net, OID_AUTO, route, CTLFLAG_RD, 0, ""); + +struct walkarg { + int w_tmemsize; + int w_op, w_arg; + caddr_t w_tmem; + struct sysctl_req *w_req; +}; + +static void rts_input(struct mbuf *m); +static struct mbuf *rt_msg1(int type, struct rt_addrinfo *rtinfo); +static int rt_msg2(int type, struct rt_addrinfo *rtinfo, + caddr_t cp, struct walkarg *w); +static int rt_xaddrs(caddr_t cp, caddr_t cplim, + struct rt_addrinfo *rtinfo); +static int sysctl_dumpentry(struct radix_node *rn, void *vw); +static int sysctl_iflist(int af, struct walkarg *w); +static int sysctl_ifmalist(int af, struct walkarg *w); +static int route_output(struct mbuf *m, struct socket *so); +static void rt_setmetrics(u_long which, const struct rt_metrics *in, + struct rt_metrics_lite *out); +static void rt_getmetrics(const struct rt_metrics_lite *in, + struct rt_metrics *out); +static void rt_dispatch(struct mbuf *, const struct sockaddr *); + +static struct netisr_handler rtsock_nh = { + .nh_name = "rtsock", + .nh_handler = rts_input, + .nh_proto = NETISR_ROUTE, + .nh_policy = NETISR_POLICY_SOURCE, +}; + +static int +sysctl_route_netisr_maxqlen(SYSCTL_HANDLER_ARGS) +{ + int error, qlimit; + + netisr_getqlimit(&rtsock_nh, &qlimit); + error = sysctl_handle_int(oidp, &qlimit, 0, req); + if (error || !req->newptr) + return (error); + if (qlimit < 1) + return (EINVAL); + return (netisr_setqlimit(&rtsock_nh, qlimit)); +} +SYSCTL_PROC(_net_route, OID_AUTO, 
netisr_maxqlen, CTLTYPE_INT|CTLFLAG_RW, + 0, 0, sysctl_route_netisr_maxqlen, "I", + "maximum routing socket dispatch queue length"); + +static void +rts_init(void) +{ + int tmp; + +#ifndef __rtems__ + if (TUNABLE_INT_FETCH("net.route.netisr_maxqlen", &tmp)) + rtsock_nh.nh_qlimit = tmp; +#endif + netisr_register(&rtsock_nh); +} +SYSINIT(rtsock, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, rts_init, 0); + +static void +rts_input(struct mbuf *m) +{ + struct sockproto route_proto; + unsigned short *family; + struct m_tag *tag; + + route_proto.sp_family = PF_ROUTE; + tag = m_tag_find(m, PACKET_TAG_RTSOCKFAM, NULL); + if (tag != NULL) { + family = (unsigned short *)(tag + 1); + route_proto.sp_protocol = *family; + m_tag_delete(m, tag); + } else + route_proto.sp_protocol = 0; + + raw_input(m, &route_proto, &route_src); +} + +/* + * It really doesn't make any sense at all for this code to share much + * with raw_usrreq.c, since its functionality is so restricted. XXX + */ +static void +rts_abort(struct socket *so) +{ + + raw_usrreqs.pru_abort(so); +} + +static void +rts_close(struct socket *so) +{ + + raw_usrreqs.pru_close(so); +} + +/* pru_accept is EOPNOTSUPP */ + +static int +rts_attach(struct socket *so, int proto, struct thread *td) +{ + struct rawcb *rp; + int s, error; + + KASSERT(so->so_pcb == NULL, ("rts_attach: so_pcb != NULL")); + + /* XXX */ + rp = malloc(sizeof *rp, M_PCB, M_WAITOK | M_ZERO); + if (rp == NULL) + return ENOBUFS; + + /* + * The splnet() is necessary to block protocols from sending + * error notifications (like RTM_REDIRECT or RTM_LOSING) while + * this PCB is extant but incompletely initialized. + * Probably we should try to do more of this work beforehand and + * eliminate the spl. + */ + s = splnet(); + so->so_pcb = (caddr_t)rp; + so->so_fibnum = td->td_proc->p_fibnum; + error = raw_attach(so, proto); + rp = sotorawcb(so); + if (error) { + splx(s); + so->so_pcb = NULL; + free(rp, M_PCB); + return error; + } + RTSOCK_LOCK(); + switch(rp->rcb_proto.sp_protocol) { + case AF_INET: + route_cb.ip_count++; + break; + case AF_INET6: + route_cb.ip6_count++; + break; + case AF_IPX: + route_cb.ipx_count++; + break; + } + route_cb.any_count++; + RTSOCK_UNLOCK(); + soisconnected(so); + so->so_options |= SO_USELOOPBACK; + splx(s); + return 0; +} + +static int +rts_bind(struct socket *so, struct sockaddr *nam, struct thread *td) +{ + + return (raw_usrreqs.pru_bind(so, nam, td)); /* xxx just EINVAL */ +} + +static int +rts_connect(struct socket *so, struct sockaddr *nam, struct thread *td) +{ + + return (raw_usrreqs.pru_connect(so, nam, td)); /* XXX just EINVAL */ +} + +/* pru_connect2 is EOPNOTSUPP */ +/* pru_control is EOPNOTSUPP */ + +static void +rts_detach(struct socket *so) +{ + struct rawcb *rp = sotorawcb(so); + + KASSERT(rp != NULL, ("rts_detach: rp == NULL")); + + RTSOCK_LOCK(); + switch(rp->rcb_proto.sp_protocol) { + case AF_INET: + route_cb.ip_count--; + break; + case AF_INET6: + route_cb.ip6_count--; + break; + case AF_IPX: + route_cb.ipx_count--; + break; + } + route_cb.any_count--; + RTSOCK_UNLOCK(); + raw_usrreqs.pru_detach(so); +} + +static int +rts_disconnect(struct socket *so) +{ + + return (raw_usrreqs.pru_disconnect(so)); +} + +/* pru_listen is EOPNOTSUPP */ + +static int +rts_peeraddr(struct socket *so, struct sockaddr **nam) +{ + + return (raw_usrreqs.pru_peeraddr(so, nam)); +} + +/* pru_rcvd is EOPNOTSUPP */ +/* pru_rcvoob is EOPNOTSUPP */ + +static int +rts_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, + struct mbuf *control, struct thread 
*td) +{ + + return (raw_usrreqs.pru_send(so, flags, m, nam, control, td)); +} + +/* pru_sense is null */ + +static int +rts_shutdown(struct socket *so) +{ + + return (raw_usrreqs.pru_shutdown(so)); +} + +static int +rts_sockaddr(struct socket *so, struct sockaddr **nam) +{ + + return (raw_usrreqs.pru_sockaddr(so, nam)); +} + +static struct pr_usrreqs route_usrreqs = { + .pru_abort = rts_abort, + .pru_attach = rts_attach, + .pru_bind = rts_bind, + .pru_connect = rts_connect, + .pru_detach = rts_detach, + .pru_disconnect = rts_disconnect, + .pru_peeraddr = rts_peeraddr, + .pru_send = rts_send, + .pru_shutdown = rts_shutdown, + .pru_sockaddr = rts_sockaddr, + .pru_close = rts_close, +}; + +#ifndef _SOCKADDR_UNION_DEFINED +#define _SOCKADDR_UNION_DEFINED +/* + * The union of all possible address formats we handle. + */ +union sockaddr_union { + struct sockaddr sa; + struct sockaddr_in sin; + struct sockaddr_in6 sin6; +}; +#endif /* _SOCKADDR_UNION_DEFINED */ + +static int +rtm_get_jailed(struct rt_addrinfo *info, struct ifnet *ifp, + struct rtentry *rt, union sockaddr_union *saun, struct ucred *cred) +{ + + /* First, see if the returned address is part of the jail. */ + if (prison_if(cred, rt->rt_ifa->ifa_addr) == 0) { + info->rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr; + return (0); + } + + switch (info->rti_info[RTAX_DST]->sa_family) { +#ifdef INET + case AF_INET: + { + struct in_addr ia; + struct ifaddr *ifa; + int found; + + found = 0; + /* + * Try to find an address on the given outgoing interface + * that belongs to the jail. + */ + IF_ADDR_LOCK(ifp); + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + struct sockaddr *sa; + sa = ifa->ifa_addr; + if (sa->sa_family != AF_INET) + continue; + ia = ((struct sockaddr_in *)sa)->sin_addr; + if (prison_check_ip4(cred, &ia) == 0) { + found = 1; + break; + } + } + IF_ADDR_UNLOCK(ifp); + if (!found) { + /* + * As a last resort return the 'default' jail address. + */ + ia = ((struct sockaddr_in *)rt->rt_ifa->ifa_addr)-> + sin_addr; + if (prison_get_ip4(cred, &ia) != 0) + return (ESRCH); + } + bzero(&saun->sin, sizeof(struct sockaddr_in)); + saun->sin.sin_len = sizeof(struct sockaddr_in); + saun->sin.sin_family = AF_INET; + saun->sin.sin_addr.s_addr = ia.s_addr; + info->rti_info[RTAX_IFA] = (struct sockaddr *)&saun->sin; + break; + } +#endif +#ifdef INET6 + case AF_INET6: + { + struct in6_addr ia6; + struct ifaddr *ifa; + int found; + + found = 0; + /* + * Try to find an address on the given outgoing interface + * that belongs to the jail. + */ + IF_ADDR_LOCK(ifp); + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + struct sockaddr *sa; + sa = ifa->ifa_addr; + if (sa->sa_family != AF_INET6) + continue; + bcopy(&((struct sockaddr_in6 *)sa)->sin6_addr, + &ia6, sizeof(struct in6_addr)); + if (prison_check_ip6(cred, &ia6) == 0) { + found = 1; + break; + } + } + IF_ADDR_UNLOCK(ifp); + if (!found) { + /* + * As a last resort return the 'default' jail address. 
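+ * prison_get_ip6() below fills ia6 with the jail's primary
+ * IPv6 address.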
+ */ + ia6 = ((struct sockaddr_in6 *)rt->rt_ifa->ifa_addr)-> + sin6_addr; + if (prison_get_ip6(cred, &ia6) != 0) + return (ESRCH); + } + bzero(&saun->sin6, sizeof(struct sockaddr_in6)); + saun->sin6.sin6_len = sizeof(struct sockaddr_in6); + saun->sin6.sin6_family = AF_INET6; + bcopy(&ia6, &saun->sin6.sin6_addr, sizeof(struct in6_addr)); + if (sa6_recoverscope(&saun->sin6) != 0) + return (ESRCH); + info->rti_info[RTAX_IFA] = (struct sockaddr *)&saun->sin6; + break; + } +#endif + default: + return (ESRCH); + } + return (0); +} + +/*ARGSUSED*/ +static int +route_output(struct mbuf *m, struct socket *so) +{ +#define sa_equal(a1, a2) (bcmp((a1), (a2), (a1)->sa_len) == 0) + struct rt_msghdr *rtm = NULL; + struct rtentry *rt = NULL; + struct radix_node_head *rnh; + struct rt_addrinfo info; + int len, error = 0; + struct ifnet *ifp = NULL; + union sockaddr_union saun; + +#define senderr(e) { error = e; goto flush;} + if (m == NULL || ((m->m_len < sizeof(long)) && + (m = m_pullup(m, sizeof(long))) == NULL)) + return (ENOBUFS); + if ((m->m_flags & M_PKTHDR) == 0) + panic("route_output"); + len = m->m_pkthdr.len; + if (len < sizeof(*rtm) || + len != mtod(m, struct rt_msghdr *)->rtm_msglen) { + info.rti_info[RTAX_DST] = NULL; + senderr(EINVAL); + } + R_Malloc(rtm, struct rt_msghdr *, len); + if (rtm == NULL) { + info.rti_info[RTAX_DST] = NULL; + senderr(ENOBUFS); + } + m_copydata(m, 0, len, (caddr_t)rtm); + if (rtm->rtm_version != RTM_VERSION) { + info.rti_info[RTAX_DST] = NULL; + senderr(EPROTONOSUPPORT); + } + rtm->rtm_pid = curproc->p_pid; + bzero(&info, sizeof(info)); + info.rti_addrs = rtm->rtm_addrs; + if (rt_xaddrs((caddr_t)(rtm + 1), len + (caddr_t)rtm, &info)) { + info.rti_info[RTAX_DST] = NULL; + senderr(EINVAL); + } + info.rti_flags = rtm->rtm_flags; + if (info.rti_info[RTAX_DST] == NULL || + info.rti_info[RTAX_DST]->sa_family >= AF_MAX || + (info.rti_info[RTAX_GATEWAY] != NULL && + info.rti_info[RTAX_GATEWAY]->sa_family >= AF_MAX)) + senderr(EINVAL); + /* + * Verify that the caller has the appropriate privilege; RTM_GET + * is the only operation the non-superuser is allowed. + */ + if (rtm->rtm_type != RTM_GET) { + error = priv_check(curthread, PRIV_NET_ROUTE); + if (error) + senderr(error); + } + + /* + * The given gateway address may be an interface address. + * For example, issuing a "route change" command on a route + * entry that was created from a tunnel, and the gateway + * address given is the local end point. In this case the + * RTF_GATEWAY flag must be cleared or the destination will + * not be reachable even though there is no error message. + */ + if (info.rti_info[RTAX_GATEWAY] != NULL && + info.rti_info[RTAX_GATEWAY]->sa_family != AF_LINK) { + struct route gw_ro; + + bzero(&gw_ro, sizeof(gw_ro)); + gw_ro.ro_dst = *info.rti_info[RTAX_GATEWAY]; + rtalloc_ign_fib(&gw_ro, 0, so->so_fibnum); + /* + * A host route through the loopback interface is + * installed for each interface adddress. In pre 8.0 + * releases the interface address of a PPP link type + * is not reachable locally. This behavior is fixed as + * part of the new L2/L3 redesign and rewrite work. The + * signature of this interface address route is the + * AF_LINK sa_family type of the rt_gateway, and the + * rt_ifp has the IFF_LOOPBACK flag set. 
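+ * Both signature checks are applied below before RTF_GATEWAY
+ * is cleared.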
+ */ + if (gw_ro.ro_rt != NULL && + gw_ro.ro_rt->rt_gateway->sa_family == AF_LINK && + gw_ro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) + info.rti_flags &= ~RTF_GATEWAY; + if (gw_ro.ro_rt != NULL) + RTFREE(gw_ro.ro_rt); + } + + switch (rtm->rtm_type) { + struct rtentry *saved_nrt; + + case RTM_ADD: + if (info.rti_info[RTAX_GATEWAY] == NULL) + senderr(EINVAL); + saved_nrt = NULL; + + /* support for new ARP code */ + if (info.rti_info[RTAX_GATEWAY]->sa_family == AF_LINK && + (rtm->rtm_flags & RTF_LLDATA) != 0) { + error = lla_rt_output(rtm, &info); + break; + } + error = rtrequest1_fib(RTM_ADD, &info, &saved_nrt, + so->so_fibnum); + if (error == 0 && saved_nrt) { + RT_LOCK(saved_nrt); + rt_setmetrics(rtm->rtm_inits, + &rtm->rtm_rmx, &saved_nrt->rt_rmx); + rtm->rtm_index = saved_nrt->rt_ifp->if_index; + RT_REMREF(saved_nrt); + RT_UNLOCK(saved_nrt); + } + break; + + case RTM_DELETE: + saved_nrt = NULL; + /* support for new ARP code */ + if (info.rti_info[RTAX_GATEWAY] && + (info.rti_info[RTAX_GATEWAY]->sa_family == AF_LINK) && + (rtm->rtm_flags & RTF_LLDATA) != 0) { + error = lla_rt_output(rtm, &info); + break; + } + error = rtrequest1_fib(RTM_DELETE, &info, &saved_nrt, + so->so_fibnum); + if (error == 0) { + RT_LOCK(saved_nrt); + rt = saved_nrt; + goto report; + } + break; + + case RTM_GET: + case RTM_CHANGE: + case RTM_LOCK: + rnh = rt_tables_get_rnh(so->so_fibnum, + info.rti_info[RTAX_DST]->sa_family); + if (rnh == NULL) + senderr(EAFNOSUPPORT); + RADIX_NODE_HEAD_RLOCK(rnh); + rt = (struct rtentry *) rnh->rnh_lookup(info.rti_info[RTAX_DST], + info.rti_info[RTAX_NETMASK], rnh); + if (rt == NULL) { /* XXX looks bogus */ + RADIX_NODE_HEAD_RUNLOCK(rnh); + senderr(ESRCH); + } +#ifdef RADIX_MPATH + /* + * for RTM_CHANGE/LOCK, if we got multipath routes, + * we require users to specify a matching RTAX_GATEWAY. + * + * for RTM_GET, gate is optional even with multipath. + * if gate == NULL the first match is returned. + * (no need to call rt_mpath_matchgate if gate == NULL) + */ + if (rn_mpath_capable(rnh) && + (rtm->rtm_type != RTM_GET || info.rti_info[RTAX_GATEWAY])) { + rt = rt_mpath_matchgate(rt, info.rti_info[RTAX_GATEWAY]); + if (!rt) { + RADIX_NODE_HEAD_RUNLOCK(rnh); + senderr(ESRCH); + } + } +#endif + /* + * If performing proxied L2 entry insertion, and + * the actual PPP host entry is found, perform + * another search to retrieve the prefix route of + * the local end point of the PPP link. + */ + if (rtm->rtm_flags & RTF_ANNOUNCE) { + struct sockaddr laddr; + + if (rt->rt_ifp != NULL && + rt->rt_ifp->if_type == IFT_PROPVIRTUAL) { + struct ifaddr *ifa; + + ifa = ifa_ifwithnet(info.rti_info[RTAX_DST], 1); + if (ifa != NULL) + rt_maskedcopy(ifa->ifa_addr, + &laddr, + ifa->ifa_netmask); + } else + rt_maskedcopy(rt->rt_ifa->ifa_addr, + &laddr, + rt->rt_ifa->ifa_netmask); + /* + * refactor rt and no lock operation necessary + */ + rt = (struct rtentry *)rnh->rnh_matchaddr(&laddr, rnh); + if (rt == NULL) { + RADIX_NODE_HEAD_RUNLOCK(rnh); + senderr(ESRCH); + } + } + RT_LOCK(rt); + RT_ADDREF(rt); + RADIX_NODE_HEAD_RUNLOCK(rnh); + + /* + * Fix for PR: 82974 + * + * RTM_CHANGE/LOCK need a perfect match, rn_lookup() + * returns a perfect match in case a netmask is + * specified. For host routes only a longest prefix + * match is returned so it is necessary to compare the + * existence of the netmask. If both have a netmask + * rnh_lookup() did a perfect match and if none of them + * have a netmask both are host routes which is also a + * perfect match. 
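+ * The "!a != !b" test below encodes exactly that: fail unless
+ * both or neither carry a netmask.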
+ */ + + if (rtm->rtm_type != RTM_GET && + (!rt_mask(rt) != !info.rti_info[RTAX_NETMASK])) { + RT_UNLOCK(rt); + senderr(ESRCH); + } + + switch(rtm->rtm_type) { + + case RTM_GET: + report: + RT_LOCK_ASSERT(rt); + if ((rt->rt_flags & RTF_HOST) == 0 + ? jailed_without_vnet(curthread->td_ucred) + : prison_if(curthread->td_ucred, + rt_key(rt)) != 0) { + RT_UNLOCK(rt); + senderr(ESRCH); + } + info.rti_info[RTAX_DST] = rt_key(rt); + info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; + info.rti_info[RTAX_NETMASK] = rt_mask(rt); + info.rti_info[RTAX_GENMASK] = 0; + if (rtm->rtm_addrs & (RTA_IFP | RTA_IFA)) { + ifp = rt->rt_ifp; + if (ifp) { + info.rti_info[RTAX_IFP] = + ifp->if_addr->ifa_addr; + error = rtm_get_jailed(&info, ifp, rt, + &saun, curthread->td_ucred); + if (error != 0) { + RT_UNLOCK(rt); + senderr(error); + } + if (ifp->if_flags & IFF_POINTOPOINT) + info.rti_info[RTAX_BRD] = + rt->rt_ifa->ifa_dstaddr; + rtm->rtm_index = ifp->if_index; + } else { + info.rti_info[RTAX_IFP] = NULL; + info.rti_info[RTAX_IFA] = NULL; + } + } else if ((ifp = rt->rt_ifp) != NULL) { + rtm->rtm_index = ifp->if_index; + } + len = rt_msg2(rtm->rtm_type, &info, NULL, NULL); + if (len > rtm->rtm_msglen) { + struct rt_msghdr *new_rtm; + R_Malloc(new_rtm, struct rt_msghdr *, len); + if (new_rtm == NULL) { + RT_UNLOCK(rt); + senderr(ENOBUFS); + } + bcopy(rtm, new_rtm, rtm->rtm_msglen); + Free(rtm); rtm = new_rtm; + } + (void)rt_msg2(rtm->rtm_type, &info, (caddr_t)rtm, NULL); + rtm->rtm_flags = rt->rt_flags; + rt_getmetrics(&rt->rt_rmx, &rtm->rtm_rmx); + rtm->rtm_addrs = info.rti_addrs; + break; + + case RTM_CHANGE: + /* + * New gateway could require new ifaddr, ifp; + * flags may also be different; ifp may be specified + * by ll sockaddr when protocol address is ambiguous + */ + if (((rt->rt_flags & RTF_GATEWAY) && + info.rti_info[RTAX_GATEWAY] != NULL) || + info.rti_info[RTAX_IFP] != NULL || + (info.rti_info[RTAX_IFA] != NULL && + !sa_equal(info.rti_info[RTAX_IFA], + rt->rt_ifa->ifa_addr))) { + RT_UNLOCK(rt); + RADIX_NODE_HEAD_LOCK(rnh); + error = rt_getifa_fib(&info, rt->rt_fibnum); + /* + * XXXRW: Really we should release this + * reference later, but this maintains + * historical behavior. + */ + if (info.rti_ifa != NULL) + ifa_free(info.rti_ifa); + RADIX_NODE_HEAD_UNLOCK(rnh); + if (error != 0) + senderr(error); + RT_LOCK(rt); + } + if (info.rti_ifa != NULL && + info.rti_ifa != rt->rt_ifa && + rt->rt_ifa != NULL && + rt->rt_ifa->ifa_rtrequest != NULL) { + rt->rt_ifa->ifa_rtrequest(RTM_DELETE, rt, + &info); + ifa_free(rt->rt_ifa); + } + if (info.rti_info[RTAX_GATEWAY] != NULL) { + RT_UNLOCK(rt); + RADIX_NODE_HEAD_LOCK(rnh); + RT_LOCK(rt); + + error = rt_setgate(rt, rt_key(rt), + info.rti_info[RTAX_GATEWAY]); + RADIX_NODE_HEAD_UNLOCK(rnh); + if (error != 0) { + RT_UNLOCK(rt); + senderr(error); + } + rt->rt_flags |= (RTF_GATEWAY & info.rti_flags); + } + if (info.rti_ifa != NULL && + info.rti_ifa != rt->rt_ifa) { + ifa_ref(info.rti_ifa); + rt->rt_ifa = info.rti_ifa; + rt->rt_ifp = info.rti_ifp; + } + /* Allow some flags to be toggled on change. 
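+ * Only the bits in RTF_FMASK may be toggled.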
*/ + rt->rt_flags = (rt->rt_flags & ~RTF_FMASK) | + (rtm->rtm_flags & RTF_FMASK); + rt_setmetrics(rtm->rtm_inits, &rtm->rtm_rmx, + &rt->rt_rmx); + rtm->rtm_index = rt->rt_ifp->if_index; + if (rt->rt_ifa && rt->rt_ifa->ifa_rtrequest) + rt->rt_ifa->ifa_rtrequest(RTM_ADD, rt, &info); + /* FALLTHROUGH */ + case RTM_LOCK: + /* We don't support locks anymore */ + break; + } + RT_UNLOCK(rt); + break; + + default: + senderr(EOPNOTSUPP); + } + +flush: + if (rtm) { + if (error) + rtm->rtm_errno = error; + else + rtm->rtm_flags |= RTF_DONE; + } + if (rt) /* XXX can this be true? */ + RTFREE(rt); + { + struct rawcb *rp = NULL; + /* + * Check to see if we don't want our own messages. + */ + if ((so->so_options & SO_USELOOPBACK) == 0) { + if (route_cb.any_count <= 1) { + if (rtm) + Free(rtm); + m_freem(m); + return (error); + } + /* There is another listener, so construct message */ + rp = sotorawcb(so); + } + if (rtm) { + m_copyback(m, 0, rtm->rtm_msglen, (caddr_t)rtm); + if (m->m_pkthdr.len < rtm->rtm_msglen) { + m_freem(m); + m = NULL; + } else if (m->m_pkthdr.len > rtm->rtm_msglen) + m_adj(m, rtm->rtm_msglen - m->m_pkthdr.len); + Free(rtm); + } + if (m) { + if (rp) { + /* + * XXX insure we don't get a copy by + * invalidating our protocol + */ + unsigned short family = rp->rcb_proto.sp_family; + rp->rcb_proto.sp_family = 0; + rt_dispatch(m, info.rti_info[RTAX_DST]); + rp->rcb_proto.sp_family = family; + } else + rt_dispatch(m, info.rti_info[RTAX_DST]); + } + } + return (error); +#undef sa_equal +} + +static void +rt_setmetrics(u_long which, const struct rt_metrics *in, + struct rt_metrics_lite *out) +{ +#define metric(f, e) if (which & (f)) out->e = in->e; + /* + * Only these are stored in the routing entry since introduction + * of tcp hostcache. The rest is ignored. + */ + metric(RTV_MTU, rmx_mtu); + metric(RTV_WEIGHT, rmx_weight); + /* Userland -> kernel timebase conversion. */ + if (which & RTV_EXPIRE) + out->rmx_expire = in->rmx_expire ? + in->rmx_expire - time_second + time_uptime : 0; +#undef metric +} + +static void +rt_getmetrics(const struct rt_metrics_lite *in, struct rt_metrics *out) +{ +#define metric(e) out->e = in->e; + bzero(out, sizeof(*out)); + metric(rmx_mtu); + metric(rmx_weight); + /* Kernel -> userland timebase conversion. */ + out->rmx_expire = in->rmx_expire ? + in->rmx_expire - time_uptime + time_second : 0; +#undef metric +} + +/* + * Extract the addresses of the passed sockaddrs. + * Do a little sanity checking so as to avoid bad memory references. + * This data is derived straight from userland. + */ +static int +rt_xaddrs(caddr_t cp, caddr_t cplim, struct rt_addrinfo *rtinfo) +{ + struct sockaddr *sa; + int i; + + for (i = 0; i < RTAX_MAX && cp < cplim; i++) { + if ((rtinfo->rti_addrs & (1 << i)) == 0) + continue; + sa = (struct sockaddr *)cp; + /* + * It won't fit. + */ + if (cp + sa->sa_len > cplim) + return (EINVAL); + /* + * there are no more.. quit now + * If there are more bits, they are in error. + * I've seen this. route(1) can evidently generate these. + * This causes kernel to core dump. + * for compatibility, If we see this, point to a safe address. 
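+ * (sa_zero, the static AF_INET sockaddr defined above, is that
+ * safe address.)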
+ */ + if (sa->sa_len == 0) { + rtinfo->rti_info[i] = &sa_zero; + return (0); /* should be EINVAL but for compat */ + } + /* accept it */ + rtinfo->rti_info[i] = sa; + cp += SA_SIZE(sa); + } + return (0); +} + +static struct mbuf * +rt_msg1(int type, struct rt_addrinfo *rtinfo) +{ + struct rt_msghdr *rtm; + struct mbuf *m; + int i; + struct sockaddr *sa; + int len, dlen; + + switch (type) { + + case RTM_DELADDR: + case RTM_NEWADDR: + len = sizeof(struct ifa_msghdr); + break; + + case RTM_DELMADDR: + case RTM_NEWMADDR: + len = sizeof(struct ifma_msghdr); + break; + + case RTM_IFINFO: + len = sizeof(struct if_msghdr); + break; + + case RTM_IFANNOUNCE: + case RTM_IEEE80211: + len = sizeof(struct if_announcemsghdr); + break; + + default: + len = sizeof(struct rt_msghdr); + } + if (len > MCLBYTES) + panic("rt_msg1"); + m = m_gethdr(M_DONTWAIT, MT_DATA); + if (m && len > MHLEN) { + MCLGET(m, M_DONTWAIT); + if ((m->m_flags & M_EXT) == 0) { + m_free(m); + m = NULL; + } + } + if (m == NULL) + return (m); + m->m_pkthdr.len = m->m_len = len; + m->m_pkthdr.rcvif = NULL; + rtm = mtod(m, struct rt_msghdr *); + bzero((caddr_t)rtm, len); + for (i = 0; i < RTAX_MAX; i++) { + if ((sa = rtinfo->rti_info[i]) == NULL) + continue; + rtinfo->rti_addrs |= (1 << i); + dlen = SA_SIZE(sa); + m_copyback(m, len, dlen, (caddr_t)sa); + len += dlen; + } + if (m->m_pkthdr.len != len) { + m_freem(m); + return (NULL); + } + rtm->rtm_msglen = len; + rtm->rtm_version = RTM_VERSION; + rtm->rtm_type = type; + return (m); +} + +static int +rt_msg2(int type, struct rt_addrinfo *rtinfo, caddr_t cp, struct walkarg *w) +{ + int i; + int len, dlen, second_time = 0; + caddr_t cp0; + + rtinfo->rti_addrs = 0; +again: + switch (type) { + + case RTM_DELADDR: + case RTM_NEWADDR: + len = sizeof(struct ifa_msghdr); + break; + + case RTM_IFINFO: +#ifdef COMPAT_FREEBSD32 + if (w != NULL && w->w_req->flags & SCTL_MASK32) { + len = sizeof(struct if_msghdr32); + break; + } +#endif + len = sizeof(struct if_msghdr); + break; + + case RTM_NEWMADDR: + len = sizeof(struct ifma_msghdr); + break; + + default: + len = sizeof(struct rt_msghdr); + } + cp0 = cp; + if (cp0) + cp += len; + for (i = 0; i < RTAX_MAX; i++) { + struct sockaddr *sa; + + if ((sa = rtinfo->rti_info[i]) == NULL) + continue; + rtinfo->rti_addrs |= (1 << i); + dlen = SA_SIZE(sa); + if (cp) { + bcopy((caddr_t)sa, cp, (unsigned)dlen); + cp += dlen; + } + len += dlen; + } + len = ALIGN(len); + if (cp == NULL && w != NULL && !second_time) { + struct walkarg *rw = w; + + if (rw->w_req) { + if (rw->w_tmemsize < len) { + if (rw->w_tmem) + free(rw->w_tmem, M_RTABLE); + rw->w_tmem = (caddr_t) + malloc(len, M_RTABLE, M_NOWAIT); + if (rw->w_tmem) + rw->w_tmemsize = len; + } + if (rw->w_tmem) { + cp = rw->w_tmem; + second_time = 1; + goto again; + } + } + } + if (cp) { + struct rt_msghdr *rtm = (struct rt_msghdr *)cp0; + + rtm->rtm_version = RTM_VERSION; + rtm->rtm_type = type; + rtm->rtm_msglen = len; + } + return (len); +} + +/* + * This routine is called to generate a message from the routing + * socket indicating that a redirect has occured, a routing lookup + * has failed, or that a protocol has detected timeouts to a particular + * destination. 
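+ * Nothing is built or dispatched unless at least one routing
+ * socket listener exists (route_cb.any_count != 0).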
+ */
+void
+rt_missmsg(int type, struct rt_addrinfo *rtinfo, int flags, int error)
+{
+	struct rt_msghdr *rtm;
+	struct mbuf *m;
+	struct sockaddr *sa = rtinfo->rti_info[RTAX_DST];
+
+	if (route_cb.any_count == 0)
+		return;
+	m = rt_msg1(type, rtinfo);
+	if (m == NULL)
+		return;
+	rtm = mtod(m, struct rt_msghdr *);
+	rtm->rtm_flags = RTF_DONE | flags;
+	rtm->rtm_errno = error;
+	rtm->rtm_addrs = rtinfo->rti_addrs;
+	rt_dispatch(m, sa);
+}
+
+/*
+ * This routine is called to generate a message from the routing
+ * socket indicating that the status of a network interface has changed.
+ */
+void
+rt_ifmsg(struct ifnet *ifp)
+{
+	struct if_msghdr *ifm;
+	struct mbuf *m;
+	struct rt_addrinfo info;
+
+	if (route_cb.any_count == 0)
+		return;
+	bzero((caddr_t)&info, sizeof(info));
+	m = rt_msg1(RTM_IFINFO, &info);
+	if (m == NULL)
+		return;
+	ifm = mtod(m, struct if_msghdr *);
+	ifm->ifm_index = ifp->if_index;
+	ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags;
+	ifm->ifm_data = ifp->if_data;
+	ifm->ifm_addrs = 0;
+	rt_dispatch(m, NULL);
+}
+
+/*
+ * This is called to generate messages from the routing socket
+ * indicating a network interface has had addresses associated with it.
+ * If we ever reverse the logic, and messages TO the routing socket
+ * indicate a request to configure interfaces, then it will be
+ * unnecessary as the routing socket will automatically generate
+ * copies of it.
+ */
+void
+rt_newaddrmsg(int cmd, struct ifaddr *ifa, int error, struct rtentry *rt)
+{
+	struct rt_addrinfo info;
+	struct sockaddr *sa = NULL;
+	int pass;
+	struct mbuf *m = NULL;
+	struct ifnet *ifp = ifa->ifa_ifp;
+
+	KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE,
+		("unexpected cmd %u", cmd));
+#if defined(INET) || defined(INET6)
+#ifdef SCTP
+	/*
+	 * Notify the SCTP stack; this will only get called when an
+	 * address is added or deleted.
+	 * XXX pass the ifaddr struct instead of ifa->ifa_addr...
+	 */
+	sctp_addr_change(ifa, cmd);
+#endif /* SCTP */
+#endif
+	if (route_cb.any_count == 0)
+		return;
+	for (pass = 1; pass < 3; pass++) {
+		bzero((caddr_t)&info, sizeof(info));
+		if ((cmd == RTM_ADD && pass == 1) ||
+		    (cmd == RTM_DELETE && pass == 2)) {
+			struct ifa_msghdr *ifam;
+			int ncmd = cmd == RTM_ADD ? RTM_NEWADDR : RTM_DELADDR;
+
+			info.rti_info[RTAX_IFA] = sa = ifa->ifa_addr;
+			info.rti_info[RTAX_IFP] = ifp->if_addr->ifa_addr;
+			info.rti_info[RTAX_NETMASK] = ifa->ifa_netmask;
+			info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr;
+			if ((m = rt_msg1(ncmd, &info)) == NULL)
+				continue;
+			ifam = mtod(m, struct ifa_msghdr *);
+			ifam->ifam_index = ifp->if_index;
+			ifam->ifam_metric = ifa->ifa_metric;
+			ifam->ifam_flags = ifa->ifa_flags;
+			ifam->ifam_addrs = info.rti_addrs;
+		}
+		if ((cmd == RTM_ADD && pass == 2) ||
+		    (cmd == RTM_DELETE && pass == 1)) {
+			struct rt_msghdr *rtm;
+
+			if (rt == NULL)
+				continue;
+			info.rti_info[RTAX_NETMASK] = rt_mask(rt);
+			info.rti_info[RTAX_DST] = sa = rt_key(rt);
+			info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
+			if ((m = rt_msg1(cmd, &info)) == NULL)
+				continue;
+			rtm = mtod(m, struct rt_msghdr *);
+			rtm->rtm_index = ifp->if_index;
+			rtm->rtm_flags |= rt->rt_flags;
+			rtm->rtm_errno = error;
+			rtm->rtm_addrs = info.rti_addrs;
+		}
+		rt_dispatch(m, sa);
+	}
+}
+
+/*
+ * This is the analogue of rt_newaddrmsg() which performs the same
+ * function but for multicast group memberships. This is easier since
+ * there is no route state to worry about.
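+ * A single RTM_NEWMADDR or RTM_DELMADDR message is emitted per
+ * membership change.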
+ */ +void +rt_newmaddrmsg(int cmd, struct ifmultiaddr *ifma) +{ + struct rt_addrinfo info; + struct mbuf *m = NULL; + struct ifnet *ifp = ifma->ifma_ifp; + struct ifma_msghdr *ifmam; + + if (route_cb.any_count == 0) + return; + + bzero((caddr_t)&info, sizeof(info)); + info.rti_info[RTAX_IFA] = ifma->ifma_addr; + info.rti_info[RTAX_IFP] = ifp ? ifp->if_addr->ifa_addr : NULL; + /* + * If a link-layer address is present, present it as a ``gateway'' + * (similarly to how ARP entries, e.g., are presented). + */ + info.rti_info[RTAX_GATEWAY] = ifma->ifma_lladdr; + m = rt_msg1(cmd, &info); + if (m == NULL) + return; + ifmam = mtod(m, struct ifma_msghdr *); + KASSERT(ifp != NULL, ("%s: link-layer multicast address w/o ifp\n", + __func__)); + ifmam->ifmam_index = ifp->if_index; + ifmam->ifmam_addrs = info.rti_addrs; + rt_dispatch(m, ifma->ifma_addr); +} + +static struct mbuf * +rt_makeifannouncemsg(struct ifnet *ifp, int type, int what, + struct rt_addrinfo *info) +{ + struct if_announcemsghdr *ifan; + struct mbuf *m; + + if (route_cb.any_count == 0) + return NULL; + bzero((caddr_t)info, sizeof(*info)); + m = rt_msg1(type, info); + if (m != NULL) { + ifan = mtod(m, struct if_announcemsghdr *); + ifan->ifan_index = ifp->if_index; + strlcpy(ifan->ifan_name, ifp->if_xname, + sizeof(ifan->ifan_name)); + ifan->ifan_what = what; + } + return m; +} + +/* + * This is called to generate routing socket messages indicating + * IEEE80211 wireless events. + * XXX we piggyback on the RTM_IFANNOUNCE msg format in a clumsy way. + */ +void +rt_ieee80211msg(struct ifnet *ifp, int what, void *data, size_t data_len) +{ + struct mbuf *m; + struct rt_addrinfo info; + + m = rt_makeifannouncemsg(ifp, RTM_IEEE80211, what, &info); + if (m != NULL) { + /* + * Append the ieee80211 data. Try to stick it in the + * mbuf containing the ifannounce msg; otherwise allocate + * a new mbuf and append. + * + * NB: we assume m is a single mbuf. + */ + if (data_len > M_TRAILINGSPACE(m)) { + struct mbuf *n = m_get(M_NOWAIT, MT_DATA); + if (n == NULL) { + m_freem(m); + return; + } + bcopy(data, mtod(n, void *), data_len); + n->m_len = data_len; + m->m_next = n; + } else if (data_len > 0) { + bcopy(data, mtod(m, u_int8_t *) + m->m_len, data_len); + m->m_len += data_len; + } + if (m->m_flags & M_PKTHDR) + m->m_pkthdr.len += data_len; + mtod(m, struct if_announcemsghdr *)->ifan_msglen += data_len; + rt_dispatch(m, NULL); + } +} + +/* + * This is called to generate routing socket messages indicating + * network interface arrival and departure. + */ +void +rt_ifannouncemsg(struct ifnet *ifp, int what) +{ + struct mbuf *m; + struct rt_addrinfo info; + + m = rt_makeifannouncemsg(ifp, RTM_IFANNOUNCE, what, &info); + if (m != NULL) + rt_dispatch(m, NULL); +} + +static void +rt_dispatch(struct mbuf *m, const struct sockaddr *sa) +{ + struct m_tag *tag; + + /* + * Preserve the family from the sockaddr, if any, in an m_tag for + * use when injecting the mbuf into the routing socket buffer from + * the netisr. + */ + if (sa != NULL) { + tag = m_tag_get(PACKET_TAG_RTSOCKFAM, sizeof(unsigned short), + M_NOWAIT); + if (tag == NULL) { + m_freem(m); + return; + } + *(unsigned short *)(tag + 1) = sa->sa_family; + m_tag_prepend(m, tag); + } +#ifdef VIMAGE + if (V_loif) + m->m_pkthdr.rcvif = V_loif; + else { + m_freem(m); + return; + } +#endif + netisr_queue(NETISR_ROUTE, m); /* mbuf is free'd on failure. */ +} + +/* + * This is used in dumping the kernel table via sysctl(). 
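+ * sysctl_dumpentry() is the per-entry callback invoked while the
+ * radix tree is walked; struct walkarg carries the request state.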
+ */ +static int +sysctl_dumpentry(struct radix_node *rn, void *vw) +{ + struct walkarg *w = vw; + struct rtentry *rt = (struct rtentry *)rn; + int error = 0, size; + struct rt_addrinfo info; + + if (w->w_op == NET_RT_FLAGS && !(rt->rt_flags & w->w_arg)) + return 0; + if ((rt->rt_flags & RTF_HOST) == 0 + ? jailed_without_vnet(w->w_req->td->td_ucred) + : prison_if(w->w_req->td->td_ucred, rt_key(rt)) != 0) + return (0); + bzero((caddr_t)&info, sizeof(info)); + info.rti_info[RTAX_DST] = rt_key(rt); + info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; + info.rti_info[RTAX_NETMASK] = rt_mask(rt); + info.rti_info[RTAX_GENMASK] = 0; + if (rt->rt_ifp) { + info.rti_info[RTAX_IFP] = rt->rt_ifp->if_addr->ifa_addr; + info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr; + if (rt->rt_ifp->if_flags & IFF_POINTOPOINT) + info.rti_info[RTAX_BRD] = rt->rt_ifa->ifa_dstaddr; + } + size = rt_msg2(RTM_GET, &info, NULL, w); + if (w->w_req && w->w_tmem) { + struct rt_msghdr *rtm = (struct rt_msghdr *)w->w_tmem; + + rtm->rtm_flags = rt->rt_flags; + /* + * let's be honest about this being a retarded hack + */ + rtm->rtm_fmask = rt->rt_rmx.rmx_pksent; + rt_getmetrics(&rt->rt_rmx, &rtm->rtm_rmx); + rtm->rtm_index = rt->rt_ifp->if_index; + rtm->rtm_errno = rtm->rtm_pid = rtm->rtm_seq = 0; + rtm->rtm_addrs = info.rti_addrs; + error = SYSCTL_OUT(w->w_req, (caddr_t)rtm, size); + return (error); + } + return (error); +} + +#ifdef COMPAT_FREEBSD32 +static void +copy_ifdata32(struct if_data *src, struct if_data32 *dst) +{ + + bzero(dst, sizeof(*dst)); + CP(*src, *dst, ifi_type); + CP(*src, *dst, ifi_physical); + CP(*src, *dst, ifi_addrlen); + CP(*src, *dst, ifi_hdrlen); + CP(*src, *dst, ifi_link_state); + dst->ifi_datalen = sizeof(struct if_data32); + CP(*src, *dst, ifi_mtu); + CP(*src, *dst, ifi_metric); + CP(*src, *dst, ifi_baudrate); + CP(*src, *dst, ifi_ipackets); + CP(*src, *dst, ifi_ierrors); + CP(*src, *dst, ifi_opackets); + CP(*src, *dst, ifi_oerrors); + CP(*src, *dst, ifi_collisions); + CP(*src, *dst, ifi_ibytes); + CP(*src, *dst, ifi_obytes); + CP(*src, *dst, ifi_imcasts); + CP(*src, *dst, ifi_omcasts); + CP(*src, *dst, ifi_iqdrops); + CP(*src, *dst, ifi_noproto); + CP(*src, *dst, ifi_hwassist); + CP(*src, *dst, ifi_epoch); + TV_CP(*src, *dst, ifi_lastchange); +} +#endif + +static int +sysctl_iflist(int af, struct walkarg *w) +{ + struct ifnet *ifp; + struct ifaddr *ifa; + struct rt_addrinfo info; + int len, error = 0; + + bzero((caddr_t)&info, sizeof(info)); + IFNET_RLOCK(); + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { + if (w->w_arg && w->w_arg != ifp->if_index) + continue; + IF_ADDR_LOCK(ifp); + ifa = ifp->if_addr; + info.rti_info[RTAX_IFP] = ifa->ifa_addr; + len = rt_msg2(RTM_IFINFO, &info, NULL, w); + info.rti_info[RTAX_IFP] = NULL; + if (w->w_req && w->w_tmem) { + struct if_msghdr *ifm; + +#ifdef COMPAT_FREEBSD32 + if (w->w_req->flags & SCTL_MASK32) { + struct if_msghdr32 *ifm32; + + ifm32 = (struct if_msghdr32 *)w->w_tmem; + ifm32->ifm_index = ifp->if_index; + ifm32->ifm_flags = ifp->if_flags | + ifp->if_drv_flags; + copy_ifdata32(&ifp->if_data, &ifm32->ifm_data); + ifm32->ifm_addrs = info.rti_addrs; + error = SYSCTL_OUT(w->w_req, (caddr_t)ifm32, + len); + goto sysctl_out; + } +#endif + ifm = (struct if_msghdr *)w->w_tmem; + ifm->ifm_index = ifp->if_index; + ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags; + ifm->ifm_data = ifp->if_data; + ifm->ifm_addrs = info.rti_addrs; + error = SYSCTL_OUT(w->w_req, (caddr_t)ifm, len); +#ifdef COMPAT_FREEBSD32 + sysctl_out: +#endif + if (error) + goto done; + } + while ((ifa = 
TAILQ_NEXT(ifa, ifa_link)) != NULL) { + if (af && af != ifa->ifa_addr->sa_family) + continue; + if (prison_if(w->w_req->td->td_ucred, + ifa->ifa_addr) != 0) + continue; + info.rti_info[RTAX_IFA] = ifa->ifa_addr; + info.rti_info[RTAX_NETMASK] = ifa->ifa_netmask; + info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr; + len = rt_msg2(RTM_NEWADDR, &info, NULL, w); + if (w->w_req && w->w_tmem) { + struct ifa_msghdr *ifam; + + ifam = (struct ifa_msghdr *)w->w_tmem; + ifam->ifam_index = ifa->ifa_ifp->if_index; + ifam->ifam_flags = ifa->ifa_flags; + ifam->ifam_metric = ifa->ifa_metric; + ifam->ifam_addrs = info.rti_addrs; + error = SYSCTL_OUT(w->w_req, w->w_tmem, len); + if (error) + goto done; + } + } + IF_ADDR_UNLOCK(ifp); + info.rti_info[RTAX_IFA] = info.rti_info[RTAX_NETMASK] = + info.rti_info[RTAX_BRD] = NULL; + } +done: + if (ifp != NULL) + IF_ADDR_UNLOCK(ifp); + IFNET_RUNLOCK(); + return (error); +} + +static int +sysctl_ifmalist(int af, struct walkarg *w) +{ + struct ifnet *ifp; + struct ifmultiaddr *ifma; + struct rt_addrinfo info; + int len, error = 0; + struct ifaddr *ifa; + + bzero((caddr_t)&info, sizeof(info)); + IFNET_RLOCK(); + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { + if (w->w_arg && w->w_arg != ifp->if_index) + continue; + ifa = ifp->if_addr; + info.rti_info[RTAX_IFP] = ifa ? ifa->ifa_addr : NULL; + IF_ADDR_LOCK(ifp); + TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { + if (af && af != ifma->ifma_addr->sa_family) + continue; + if (prison_if(w->w_req->td->td_ucred, + ifma->ifma_addr) != 0) + continue; + info.rti_info[RTAX_IFA] = ifma->ifma_addr; + info.rti_info[RTAX_GATEWAY] = + (ifma->ifma_addr->sa_family != AF_LINK) ? + ifma->ifma_lladdr : NULL; + len = rt_msg2(RTM_NEWMADDR, &info, NULL, w); + if (w->w_req && w->w_tmem) { + struct ifma_msghdr *ifmam; + + ifmam = (struct ifma_msghdr *)w->w_tmem; + ifmam->ifmam_index = ifma->ifma_ifp->if_index; + ifmam->ifmam_flags = 0; + ifmam->ifmam_addrs = info.rti_addrs; + error = SYSCTL_OUT(w->w_req, w->w_tmem, len); + if (error) { + IF_ADDR_UNLOCK(ifp); + goto done; + } + } + } + IF_ADDR_UNLOCK(ifp); + } +done: + IFNET_RUNLOCK(); + return (error); +} + +static int +sysctl_rtsock(SYSCTL_HANDLER_ARGS) +{ + int *name = (int *)arg1; + u_int namelen = arg2; + struct radix_node_head *rnh = NULL; /* silence compiler. */ + int i, lim, error = EINVAL; + u_char af; + struct walkarg w; + + name ++; + namelen--; + if (req->newptr) + return (EPERM); + if (namelen != 3) + return ((namelen < 3) ? 
EISDIR : ENOTDIR); + af = name[0]; + if (af > AF_MAX) + return (EINVAL); + bzero(&w, sizeof(w)); + w.w_op = name[1]; + w.w_arg = name[2]; + w.w_req = req; + + error = sysctl_wire_old_buffer(req, 0); + if (error) + return (error); + switch (w.w_op) { + + case NET_RT_DUMP: + case NET_RT_FLAGS: + if (af == 0) { /* dump all tables */ + i = 1; + lim = AF_MAX; + } else /* dump only one table */ + i = lim = af; + + /* + * take care of llinfo entries, the caller must + * specify an AF + */ + if (w.w_op == NET_RT_FLAGS && + (w.w_arg == 0 || w.w_arg & RTF_LLINFO)) { + if (af != 0) + error = lltable_sysctl_dumparp(af, w.w_req); + else + error = EINVAL; + break; + } + /* + * take care of routing entries + */ + for (error = 0; error == 0 && i <= lim; i++) { + rnh = rt_tables_get_rnh(req->td->td_proc->p_fibnum, i); + if (rnh != NULL) { + RADIX_NODE_HEAD_LOCK(rnh); + error = rnh->rnh_walktree(rnh, + sysctl_dumpentry, &w); + RADIX_NODE_HEAD_UNLOCK(rnh); + } else if (af != 0) + error = EAFNOSUPPORT; + } + break; + + case NET_RT_IFLIST: + error = sysctl_iflist(af, &w); + break; + + case NET_RT_IFMALIST: + error = sysctl_ifmalist(af, &w); + break; + } + if (w.w_tmem) + free(w.w_tmem, M_RTABLE); + return (error); +} + +SYSCTL_NODE(_net, PF_ROUTE, routetable, CTLFLAG_RD, sysctl_rtsock, ""); + +/* + * Definitions of protocols supported in the ROUTE domain. + */ + +static struct domain routedomain; /* or at least forward */ + +static struct protosw routesw[] = { +{ + .pr_type = SOCK_RAW, + .pr_domain = &routedomain, + .pr_flags = PR_ATOMIC|PR_ADDR, + .pr_output = route_output, + .pr_ctlinput = raw_ctlinput, + .pr_init = raw_init, + .pr_usrreqs = &route_usrreqs +} +}; + +static struct domain routedomain = { + .dom_family = PF_ROUTE, + .dom_name = "route", + .dom_protosw = routesw, + .dom_protoswNPROTOSW = &routesw[sizeof(routesw)/sizeof(routesw[0])] +}; + +VNET_DOMAIN_SET(route); diff --git a/freebsd/sys/net/slcompress.c b/freebsd/sys/net/slcompress.c new file mode 100644 index 00000000..be337c1f --- /dev/null +++ b/freebsd/sys/net/slcompress.c @@ -0,0 +1,609 @@ +#include + +/*- + * Copyright (c) 1989, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)slcompress.c 8.2 (Berkeley) 4/16/94 + * $FreeBSD$ + */ + +/* + * Routines to compress and uncompess tcp packets (for transmission + * over low speed serial lines. + * + * Van Jacobson (van@helios.ee.lbl.gov), Dec 31, 1989: + * - Initial distribution. + * + */ + +#include +#include +#include + +#include +#include +#include +#include + +#include + +#ifndef SL_NO_STATS +#define INCR(counter) ++comp->counter; +#else +#define INCR(counter) +#endif + +#define BCMP(p1, p2, n) bcmp((void *)(p1), (void *)(p2), (int)(n)) +#define BCOPY(p1, p2, n) bcopy((void *)(p1), (void *)(p2), (int)(n)) + +void +sl_compress_init(comp, max_state) + struct slcompress *comp; + int max_state; +{ + register u_int i; + register struct cstate *tstate = comp->tstate; + + if (max_state == -1) { + max_state = MAX_STATES - 1; + bzero((char *)comp, sizeof(*comp)); + } else { + /* Don't reset statistics */ + bzero((char *)comp->tstate, sizeof(comp->tstate)); + bzero((char *)comp->rstate, sizeof(comp->rstate)); + } + for (i = max_state; i > 0; --i) { + tstate[i].cs_id = i; + tstate[i].cs_next = &tstate[i - 1]; + } + tstate[0].cs_next = &tstate[max_state]; + tstate[0].cs_id = 0; + comp->last_cs = &tstate[0]; + comp->last_recv = 255; + comp->last_xmit = 255; + comp->flags = SLF_TOSS; +} + + +/* ENCODE encodes a number that is known to be non-zero. ENCODEZ + * checks for zero (since zero has to be encoded in the long, 3 byte + * form). + */ +#define ENCODE(n) { \ + if ((u_int16_t)(n) >= 256) { \ + *cp++ = 0; \ + cp[1] = (n); \ + cp[0] = (n) >> 8; \ + cp += 2; \ + } else { \ + *cp++ = (n); \ + } \ +} +#define ENCODEZ(n) { \ + if ((u_int16_t)(n) >= 256 || (u_int16_t)(n) == 0) { \ + *cp++ = 0; \ + cp[1] = (n); \ + cp[0] = (n) >> 8; \ + cp += 2; \ + } else { \ + *cp++ = (n); \ + } \ +} + +#define DECODEL(f) { \ + if (*cp == 0) {\ + (f) = htonl(ntohl(f) + ((cp[1] << 8) | cp[2])); \ + cp += 3; \ + } else { \ + (f) = htonl(ntohl(f) + (u_int32_t)*cp++); \ + } \ +} + +#define DECODES(f) { \ + if (*cp == 0) {\ + (f) = htons(ntohs(f) + ((cp[1] << 8) | cp[2])); \ + cp += 3; \ + } else { \ + (f) = htons(ntohs(f) + (u_int32_t)*cp++); \ + } \ +} + +#define DECODEU(f) { \ + if (*cp == 0) {\ + (f) = htons((cp[1] << 8) | cp[2]); \ + cp += 3; \ + } else { \ + (f) = htons((u_int32_t)*cp++); \ + } \ +} + +/* + * Attempt to compress an outgoing TCP packet and return the type of + * the result. The caller must have already verified that the protocol + * is TCP. The first mbuf must contain the complete IP and TCP headers, + * and "ip" must be == mtod(m, struct ip *). "comp" supplies the + * compression state, and "compress_cid" tells us whether it is OK + * to leave out the CID field when feasible. + * + * The caller is responsible for adjusting m->m_pkthdr.len upon return, + * if m is an M_PKTHDR mbuf. 
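The ENCODE/ENCODEZ and DECODE* macros above implement the variable-length delta coding from RFC 1144: a delta of 1..255 takes one byte, while 0 or 256..65535 is sent as a zero byte followed by the 16-bit value (MSB first). A standalone restatement as functions returning the advanced pointer (names invented, for illustration only):

    #include <stdint.h>

    static uint8_t *
    vj_encodez(uint8_t *cp, uint16_t n)
    {
            if (n == 0 || n >= 256) {
                    *cp++ = 0;              /* escape: long form follows */
                    *cp++ = n >> 8;
                    *cp++ = n & 0xff;
            } else
                    *cp++ = (uint8_t)n;     /* short form: one byte */
            return (cp);
    }

    static const uint8_t *
    vj_decode(const uint8_t *cp, uint16_t *n)
    {
            if (*cp == 0) {
                    *n = (uint16_t)((cp[1] << 8) | cp[2]);
                    return (cp + 3);
            }
            *n = *cp;
            return (cp + 1);
    }

The two encode variants exist because the urgent pointer and IP ID deltas can legitimately be zero; those must take the long form so that a leading zero byte is never ambiguous.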
+ */ +u_int +sl_compress_tcp(m, ip, comp, compress_cid) + struct mbuf *m; + register struct ip *ip; + struct slcompress *comp; + int compress_cid; +{ + register struct cstate *cs = comp->last_cs->cs_next; + register u_int hlen = ip->ip_hl; + register struct tcphdr *oth; + register struct tcphdr *th; + register u_int deltaS, deltaA; + register u_int changes = 0; + u_char new_seq[16]; + register u_char *cp = new_seq; + + /* + * Bail if this is an IP fragment or if the TCP packet isn't + * `compressible' (i.e., ACK isn't set or some other control bit is + * set). (We assume that the caller has already made sure the + * packet is IP proto TCP). + */ + if ((ip->ip_off & htons(0x3fff)) || m->m_len < 40) + return (TYPE_IP); + + th = (struct tcphdr *)&((int32_t *)ip)[hlen]; + if ((th->th_flags & (TH_SYN|TH_FIN|TH_RST|TH_ACK)) != TH_ACK) + return (TYPE_IP); + /* + * Packet is compressible -- we're going to send either a + * COMPRESSED_TCP or UNCOMPRESSED_TCP packet. Either way we need + * to locate (or create) the connection state. Special case the + * most recently used connection since it's most likely to be used + * again & we don't have to do any reordering if it's used. + */ + INCR(sls_packets) + if (ip->ip_src.s_addr != cs->cs_ip.ip_src.s_addr || + ip->ip_dst.s_addr != cs->cs_ip.ip_dst.s_addr || + *(int32_t *)th != ((int32_t *)&cs->cs_ip)[cs->cs_ip.ip_hl]) { + /* + * Wasn't the first -- search for it. + * + * States are kept in a circularly linked list with + * last_cs pointing to the end of the list. The + * list is kept in lru order by moving a state to the + * head of the list whenever it is referenced. Since + * the list is short and, empirically, the connection + * we want is almost always near the front, we locate + * states via linear search. If we don't find a state + * for the datagram, the oldest state is (re-)used. + */ + register struct cstate *lcs; + register struct cstate *lastcs = comp->last_cs; + + do { + lcs = cs; cs = cs->cs_next; + INCR(sls_searches) + if (ip->ip_src.s_addr == cs->cs_ip.ip_src.s_addr + && ip->ip_dst.s_addr == cs->cs_ip.ip_dst.s_addr + && *(int32_t *)th == + ((int32_t *)&cs->cs_ip)[cs->cs_ip.ip_hl]) + goto found; + } while (cs != lastcs); + + /* + * Didn't find it -- re-use oldest cstate. Send an + * uncompressed packet that tells the other side what + * connection number we're using for this conversation. + * Note that since the state list is circular, the oldest + * state points to the newest and we only need to set + * last_cs to update the lru linkage. + */ + INCR(sls_misses) + comp->last_cs = lcs; + hlen += th->th_off; + hlen <<= 2; + if (hlen > m->m_len) + return TYPE_IP; + goto uncompressed; + + found: + /* + * Found it -- move to the front on the connection list. + */ + if (cs == lastcs) + comp->last_cs = lcs; + else { + lcs->cs_next = cs->cs_next; + cs->cs_next = lastcs->cs_next; + lastcs->cs_next = cs; + } + } + + /* + * Make sure that only what we expect to change changed. The first + * line of the `if' checks the IP protocol version, header length & + * type of service. The 2nd line checks the "Don't fragment" bit. + * The 3rd line checks the time-to-live and protocol (the protocol + * check is unnecessary but costless). The 4th line checks the TCP + * header length. The 5th line checks IP options, if any. The 6th + * line checks TCP options, if any. If any of these things are + * different between the previous & current datagram, we send the + * current datagram `uncompressed'. 
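The state search just described keeps the per-connection cstates on a circular singly-linked list in LRU order, with comp->last_cs naming the tail. Isolated from the surrounding code, the move-to-front step amounts to the sketch below (invented names; in sl_compress_tcp() lcs always trails cs by one node):

    struct node { struct node *next; };

    /* Returns the new list tail (what last_cs should become). */
    static struct node *
    move_to_front(struct node *lcs, struct node *cs, struct node *lastcs)
    {
            if (cs == lastcs)
                    return (lcs);           /* cs was oldest: tail moves back */
            lcs->next = cs->next;           /* unlink cs ... */
            cs->next = lastcs->next;        /* ... splice it in at the head */
            lastcs->next = cs;
            return (lastcs);                /* tail unchanged */
    }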
+ */ + oth = (struct tcphdr *)&((int32_t *)&cs->cs_ip)[hlen]; + deltaS = hlen; + hlen += th->th_off; + hlen <<= 2; + if (hlen > m->m_len) + return TYPE_IP; + + if (((u_int16_t *)ip)[0] != ((u_int16_t *)&cs->cs_ip)[0] || + ((u_int16_t *)ip)[3] != ((u_int16_t *)&cs->cs_ip)[3] || + ((u_int16_t *)ip)[4] != ((u_int16_t *)&cs->cs_ip)[4] || + th->th_off != oth->th_off || + (deltaS > 5 && + BCMP(ip + 1, &cs->cs_ip + 1, (deltaS - 5) << 2)) || + (th->th_off > 5 && + BCMP(th + 1, oth + 1, (th->th_off - 5) << 2))) + goto uncompressed; + + /* + * Figure out which of the changing fields changed. The + * receiver expects changes in the order: urgent, window, + * ack, seq (the order minimizes the number of temporaries + * needed in this section of code). + */ + if (th->th_flags & TH_URG) { + deltaS = ntohs(th->th_urp); + ENCODEZ(deltaS); + changes |= NEW_U; + } else if (th->th_urp != oth->th_urp) + /* argh! URG not set but urp changed -- a sensible + * implementation should never do this but RFC793 + * doesn't prohibit the change so we have to deal + * with it. */ + goto uncompressed; + + deltaS = (u_int16_t)(ntohs(th->th_win) - ntohs(oth->th_win)); + if (deltaS) { + ENCODE(deltaS); + changes |= NEW_W; + } + + deltaA = ntohl(th->th_ack) - ntohl(oth->th_ack); + if (deltaA) { + if (deltaA > 0xffff) + goto uncompressed; + ENCODE(deltaA); + changes |= NEW_A; + } + + deltaS = ntohl(th->th_seq) - ntohl(oth->th_seq); + if (deltaS) { + if (deltaS > 0xffff) + goto uncompressed; + ENCODE(deltaS); + changes |= NEW_S; + } + + switch(changes) { + + case 0: + /* + * Nothing changed. If this packet contains data and the + * last one didn't, this is probably a data packet following + * an ack (normal on an interactive connection) and we send + * it compressed. Otherwise it's probably a retransmit, + * retransmitted ack or window probe. Send it uncompressed + * in case the other side missed the compressed version. + */ + if (ip->ip_len != cs->cs_ip.ip_len && + ntohs(cs->cs_ip.ip_len) == hlen) + break; + + /* FALLTHROUGH */ + + case SPECIAL_I: + case SPECIAL_D: + /* + * actual changes match one of our special case encodings -- + * send packet uncompressed. + */ + goto uncompressed; + + case NEW_S|NEW_A: + if (deltaS == deltaA && + deltaS == ntohs(cs->cs_ip.ip_len) - hlen) { + /* special case for echoed terminal traffic */ + changes = SPECIAL_I; + cp = new_seq; + } + break; + + case NEW_S: + if (deltaS == ntohs(cs->cs_ip.ip_len) - hlen) { + /* special case for data xfer */ + changes = SPECIAL_D; + cp = new_seq; + } + break; + } + + deltaS = ntohs(ip->ip_id) - ntohs(cs->cs_ip.ip_id); + if (deltaS != 1) { + ENCODEZ(deltaS); + changes |= NEW_I; + } + if (th->th_flags & TH_PUSH) + changes |= TCP_PUSH_BIT; + /* + * Grab the cksum before we overwrite it below. Then update our + * state with this packet's header. + */ + deltaA = ntohs(th->th_sum); + BCOPY(ip, &cs->cs_ip, hlen); + + /* + * We want to use the original packet as our compressed packet. + * (cp - new_seq) is the number of bytes we need for compressed + * sequence numbers. In addition we need one byte for the change + * mask, one for the connection id and two for the tcp checksum. + * So, (cp - new_seq) + 4 bytes of header are needed. hlen is how + * many bytes of the original packet to toss so subtract the two to + * get the new packet size. 
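Concretely, the size computation described above means a compressed header costs one change-mask byte, two checksum bytes, the encoded deltas, and one optional connection-id byte. For the common case of a pure ACK whose ack number advanced by less than 256, that is 1 + 2 + 1 + 1 = 5 bytes in place of the 40-byte IP/TCP header. A one-line restatement (invented helper, illustration only):

    /* Bytes of VJ header replacing the original IP/TCP header. */
    static unsigned
    vj_hdr_size(unsigned delta_bytes, int explicit_cid)
    {
            return (1 /* changes */ + 2 /* cksum */ + delta_bytes +
                (explicit_cid ? 1 : 0));
    }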
+ */ + deltaS = cp - new_seq; + cp = (u_char *)ip; + if (compress_cid == 0 || comp->last_xmit != cs->cs_id) { + comp->last_xmit = cs->cs_id; + hlen -= deltaS + 4; + cp += hlen; + *cp++ = changes | NEW_C; + *cp++ = cs->cs_id; + } else { + hlen -= deltaS + 3; + cp += hlen; + *cp++ = changes; + } + m->m_len -= hlen; + m->m_data += hlen; + *cp++ = deltaA >> 8; + *cp++ = deltaA; + BCOPY(new_seq, cp, deltaS); + INCR(sls_compressed) + return (TYPE_COMPRESSED_TCP); + + /* + * Update connection state cs & send uncompressed packet ('uncompressed' + * means a regular ip/tcp packet but with the 'conversation id' we hope + * to use on future compressed packets in the protocol field). + */ +uncompressed: + BCOPY(ip, &cs->cs_ip, hlen); + ip->ip_p = cs->cs_id; + comp->last_xmit = cs->cs_id; + return (TYPE_UNCOMPRESSED_TCP); +} + + +int +sl_uncompress_tcp(bufp, len, type, comp) + u_char **bufp; + int len; + u_int type; + struct slcompress *comp; +{ + u_char *hdr, *cp; + int hlen, vjlen; + + cp = bufp? *bufp: NULL; + vjlen = sl_uncompress_tcp_core(cp, len, len, type, comp, &hdr, &hlen); + if (vjlen < 0) + return (0); /* error */ + if (vjlen == 0) + return (len); /* was uncompressed already */ + + cp += vjlen; + len -= vjlen; + + /* + * At this point, cp points to the first byte of data in the + * packet. If we're not aligned on a 4-byte boundary, copy the + * data down so the ip & tcp headers will be aligned. Then back up + * cp by the tcp/ip header length to make room for the reconstructed + * header (we assume the packet we were handed has enough space to + * prepend 128 bytes of header). + */ + if ((intptr_t)cp & 3) { + if (len > 0) + BCOPY(cp, ((intptr_t)cp &~ 3), len); + cp = (u_char *)((intptr_t)cp &~ 3); + } + cp -= hlen; + len += hlen; + BCOPY(hdr, cp, hlen); + + *bufp = cp; + return (len); +} + +/* + * Uncompress a packet of total length total_len. The first buflen + * bytes are at buf; this must include the entire (compressed or + * uncompressed) TCP/IP header. This procedure returns the length + * of the VJ header, with a pointer to the uncompressed IP header + * in *hdrp and its length in *hlenp. + */ +int +sl_uncompress_tcp_core(buf, buflen, total_len, type, comp, hdrp, hlenp) + u_char *buf; + int buflen, total_len; + u_int type; + struct slcompress *comp; + u_char **hdrp; + u_int *hlenp; +{ + register u_char *cp; + register u_int hlen, changes; + register struct tcphdr *th; + register struct cstate *cs; + register struct ip *ip; + register u_int16_t *bp; + register u_int vjlen; + + switch (type) { + + case TYPE_UNCOMPRESSED_TCP: + ip = (struct ip *) buf; + if (ip->ip_p >= MAX_STATES) + goto bad; + cs = &comp->rstate[comp->last_recv = ip->ip_p]; + comp->flags &=~ SLF_TOSS; + ip->ip_p = IPPROTO_TCP; + /* + * Calculate the size of the TCP/IP header and make sure that + * we don't overflow the space we have available for it. + */ + hlen = ip->ip_hl << 2; + if (hlen + sizeof(struct tcphdr) > buflen) + goto bad; + hlen += ((struct tcphdr *)&((char *)ip)[hlen])->th_off << 2; + if (hlen > MAX_HDR || hlen > buflen) + goto bad; + BCOPY(ip, &cs->cs_ip, hlen); + cs->cs_hlen = hlen; + INCR(sls_uncompressedin) + *hdrp = (u_char *) &cs->cs_ip; + *hlenp = hlen; + return (0); + + default: + goto bad; + + case TYPE_COMPRESSED_TCP: + break; + } + /* We've got a compressed packet. */ + INCR(sls_compressedin) + cp = buf; + changes = *cp++; + if (changes & NEW_C) { + /* Make sure the state index is in range, then grab the state. + * If we have a good state index, clear the 'discard' flag. 
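Further down, sl_uncompress_tcp_core() rebuilds the IP header checksum with the usual one's-complement sum over 16-bit words, folding the carries back in twice before complementing. A standalone sketch of that fold, under the assumption (true for IP headers) that the length is a multiple of two:

    #include <stdint.h>
    #include <stddef.h>

    static uint16_t
    ip_cksum(const uint16_t *bp, size_t hlen)
    {
            uint32_t sum = 0;

            for (; hlen > 1; hlen -= 2)
                    sum += *bp++;
            sum = (sum & 0xffff) + (sum >> 16);     /* fold carries ... */
            sum = (sum & 0xffff) + (sum >> 16);     /* ... twice */
            return ((uint16_t)~sum);
    }

Summing raw 16-bit words keeps the computation byte-order neutral, which is why the kernel code needs no ntohs()/htons() here.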
*/ + if (*cp >= MAX_STATES) + goto bad; + + comp->flags &=~ SLF_TOSS; + comp->last_recv = *cp++; + } else { + /* this packet has an implicit state index. If we've + * had a line error since the last time we got an + * explicit state index, we have to toss the packet. */ + if (comp->flags & SLF_TOSS) { + INCR(sls_tossed) + return (-1); + } + } + cs = &comp->rstate[comp->last_recv]; + hlen = cs->cs_ip.ip_hl << 2; + th = (struct tcphdr *)&((u_char *)&cs->cs_ip)[hlen]; + th->th_sum = htons((*cp << 8) | cp[1]); + cp += 2; + if (changes & TCP_PUSH_BIT) + th->th_flags |= TH_PUSH; + else + th->th_flags &=~ TH_PUSH; + + switch (changes & SPECIALS_MASK) { + case SPECIAL_I: + { + register u_int i = ntohs(cs->cs_ip.ip_len) - cs->cs_hlen; + th->th_ack = htonl(ntohl(th->th_ack) + i); + th->th_seq = htonl(ntohl(th->th_seq) + i); + } + break; + + case SPECIAL_D: + th->th_seq = htonl(ntohl(th->th_seq) + ntohs(cs->cs_ip.ip_len) + - cs->cs_hlen); + break; + + default: + if (changes & NEW_U) { + th->th_flags |= TH_URG; + DECODEU(th->th_urp) + } else + th->th_flags &=~ TH_URG; + if (changes & NEW_W) + DECODES(th->th_win) + if (changes & NEW_A) + DECODEL(th->th_ack) + if (changes & NEW_S) + DECODEL(th->th_seq) + break; + } + if (changes & NEW_I) { + DECODES(cs->cs_ip.ip_id) + } else + cs->cs_ip.ip_id = htons(ntohs(cs->cs_ip.ip_id) + 1); + + /* + * At this point, cp points to the first byte of data in the + * packet. Fill in the IP total length and update the IP + * header checksum. + */ + vjlen = cp - buf; + buflen -= vjlen; + if (buflen < 0) + /* we must have dropped some characters (crc should detect + * this but the old slip framing won't) */ + goto bad; + + total_len += cs->cs_hlen - vjlen; + cs->cs_ip.ip_len = htons(total_len); + + /* recompute the ip header checksum */ + bp = (u_int16_t *) &cs->cs_ip; + cs->cs_ip.ip_sum = 0; + for (changes = 0; hlen > 0; hlen -= 2) + changes += *bp++; + changes = (changes & 0xffff) + (changes >> 16); + changes = (changes & 0xffff) + (changes >> 16); + cs->cs_ip.ip_sum = ~ changes; + + *hdrp = (u_char *) &cs->cs_ip; + *hlenp = cs->cs_hlen; + return vjlen; + +bad: + comp->flags |= SLF_TOSS; + INCR(sls_errorin) + return (-1); +} diff --git a/freebsd/sys/net/slcompress.h b/freebsd/sys/net/slcompress.h new file mode 100644 index 00000000..08c9042e --- /dev/null +++ b/freebsd/sys/net/slcompress.h @@ -0,0 +1,158 @@ +/* + * Definitions for tcp compression routines. + */ +/*- + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Van Jacobson (van@helios.ee.lbl.gov), Dec 31, 1989: + * - Initial distribution. + * $FreeBSD$ + */ + +#ifndef _NET_SLCOMPRESS_HH_ +#define _NET_SLCOMPRESS_HH_ + +#define MAX_STATES 16 /* must be > 2 and < 256 */ +#define MAX_HDR 128 + +/* + * Compressed packet format: + * + * The first octet contains the packet type (top 3 bits), TCP + * 'push' bit, and flags that indicate which of the 4 TCP sequence + * numbers have changed (bottom 5 bits). The next octet is a + * conversation number that associates a saved IP/TCP header with + * the compressed packet. The next two octets are the TCP checksum + * from the original datagram. The next 0 to 15 octets are + * sequence number changes, one change per bit set in the header + * (there may be no changes and there are two special cases where + * the receiver implicitly knows what changed -- see below). + * + * There are 5 numbers which can change (they are always inserted + * in the following order): TCP urgent pointer, window, + * acknowledgement, sequence number and IP ID. (The urgent pointer + * is different from the others in that its value is sent, not the + * change in value.) Since typical use of SLIP links is biased + * toward small packets (see comments on MTU/MSS below), changes + * use a variable length coding with one octet for numbers in the + * range 1 - 255 and 3 octets (0, MSB, LSB) for numbers in the + * range 256 - 65535 or 0. (If the change in sequence number or + * ack is more than 65535, an uncompressed packet is sent.) + */ + +/* + * Packet types (must not conflict with IP protocol version) + * + * The top nibble of the first octet is the packet type. There are + * three possible types: IP (not proto TCP or tcp with one of the + * control flags set); uncompressed TCP (a normal IP/TCP packet but + * with the 8-bit protocol field replaced by an 8-bit connection id -- + * this type of packet syncs the sender & receiver); and compressed + * TCP (described above). + * + * LSB of 4-bit field is TCP "PUSH" bit (a worthless anachronism) and + * is logically part of the 4-bit "changes" field that follows. Top + * three bits are actual packet type. For backward compatibility + * and in the interest of conserving bits, numbers are chosen so the + * IP protocol version number (4) which normally appears in this nibble + * means "IP packet". 
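Because the type codes are chosen to coexist with the IP version nibble, a receiver can classify a frame from its first octet alone. A sketch of that dispatch, modeled on what the SLIP driver does on input (not part of this header):

    static u_int
    vj_frame_type(u_char c)
    {
            if (c & 0x80)                   /* top bit set: compressed TCP */
                    return (TYPE_COMPRESSED_TCP);
            if ((c & 0xf0) == TYPE_UNCOMPRESSED_TCP)
                    return (TYPE_UNCOMPRESSED_TCP);
            if ((c >> 4) == 4)              /* untouched IPv4 version nibble */
                    return (TYPE_IP);
            return (TYPE_ERROR);
    }

TYPE_IP is 0x40 precisely so that an unmodified IPv4 datagram, whose first octet carries version 4 in the top nibble, classifies as a plain IP packet with no rewriting at all.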
+ */ + +/* packet types */ +#define TYPE_IP 0x40 +#define TYPE_UNCOMPRESSED_TCP 0x70 +#define TYPE_COMPRESSED_TCP 0x80 +#define TYPE_ERROR 0x00 + +/* Bits in first octet of compressed packet */ +#define NEW_C 0x40 /* flag bits for what changed in a packet */ +#define NEW_I 0x20 +#define NEW_S 0x08 +#define NEW_A 0x04 +#define NEW_W 0x02 +#define NEW_U 0x01 + +/* reserved, special-case values of above */ +#define SPECIAL_I (NEW_S|NEW_W|NEW_U) /* echoed interactive traffic */ +#define SPECIAL_D (NEW_S|NEW_A|NEW_W|NEW_U) /* unidirectional data */ +#define SPECIALS_MASK (NEW_S|NEW_A|NEW_W|NEW_U) + +#define TCP_PUSH_BIT 0x10 + + +/* + * "state" data for each active tcp conversation on the wire. This is + * basically a copy of the entire IP/TCP header from the last packet + * we saw from the conversation together with a small identifier + * the transmit & receive ends of the line use to locate saved header. + */ +struct cstate { + struct cstate *cs_next; /* next most recently used cstate (xmit only) */ + u_int16_t cs_hlen; /* size of hdr (receive only) */ + u_char cs_id; /* connection # associated with this state */ + u_char cs_filler; + union { + char csu_hdr[MAX_HDR]; + struct ip csu_ip; /* ip/tcp hdr from most recent packet */ + } slcs_u; +}; +#define cs_ip slcs_u.csu_ip +#define cs_hdr slcs_u.csu_hdr + +/* + * all the state data for one serial line (we need one of these + * per line). + */ +struct slcompress { + struct cstate *last_cs; /* most recently used tstate */ + u_char last_recv; /* last rcvd conn. id */ + u_char last_xmit; /* last sent conn. id */ + u_int16_t flags; +#ifndef SL_NO_STATS + int sls_packets; /* outbound packets */ + int sls_compressed; /* outbound compressed packets */ + int sls_searches; /* searches for connection state */ + int sls_misses; /* times couldn't find conn. state */ + int sls_uncompressedin; /* inbound uncompressed packets */ + int sls_compressedin; /* inbound compressed packets */ + int sls_errorin; /* inbound unknown type packets */ + int sls_tossed; /* inbound packets tossed because of error */ +#endif + struct cstate tstate[MAX_STATES]; /* xmit connection states */ + struct cstate rstate[MAX_STATES]; /* receive connection states */ +}; +/* flag values */ +#define SLF_TOSS 1 /* tossing rcvd frames because of input err */ + +void sl_compress_init(struct slcompress *, int); +u_int sl_compress_tcp(struct mbuf *, struct ip *, struct slcompress *, int); +int sl_uncompress_tcp(u_char **, int, u_int, struct slcompress *); +int sl_uncompress_tcp_core(u_char *, int, int, u_int, + struct slcompress *, u_char **, u_int *); + +#endif /* !_NET_SLCOMPRESS_HH_ */ diff --git a/freebsd/sys/net/vnet.h b/freebsd/sys/net/vnet.h new file mode 100644 index 00000000..7f6326fe --- /dev/null +++ b/freebsd/sys/net/vnet.h @@ -0,0 +1,437 @@ +/*- + * Copyright (c) 2006-2009 University of Zagreb + * Copyright (c) 2006-2009 FreeBSD Foundation + * All rights reserved. + * + * This software was developed by the University of Zagreb and the + * FreeBSD Foundation under sponsorship by the Stichting NLnet and the + * FreeBSD Foundation. + * + * Copyright (c) 2009 Jeffrey Roberson + * Copyright (c) 2009 Robert N. M. Watson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/*- + * This header file defines several sets of interfaces supporting virtualized + * network stacks: + * + * - Definition of 'struct vnet' and functions and macros to allocate/free/ + * manipulate it. + * + * - A virtual network stack memory allocator, which provides support for + * virtualized global variables via a special linker set, set_vnet. + * + * - Virtualized sysinits/sysuninits, which allow constructors and + * destructors to be run for each network stack subsystem as virtual + * instances are created and destroyed. + * + * If VIMAGE isn't compiled into the kernel, virtualized global variables + * compile to normal global variables, and virtualized sysinits to regular + * sysinits. + */ + +#ifndef _NET_VNET_HH_ +#define _NET_VNET_HH_ + +/* + * struct vnet describes a virtualized network stack, and is primarily a + * pointer to storage for virtualized global variables. Expose to userspace + * as required for libkvm. + */ +#if defined(_KERNEL) || defined(_WANT_VNET) +#include + +struct vnet { + LIST_ENTRY(vnet) vnet_le; /* all vnets list */ + u_int vnet_magic_n; + u_int vnet_ifcnt; + u_int vnet_sockcnt; + void *vnet_data_mem; + uintptr_t vnet_data_base; +}; +#define VNET_MAGIC_N 0x3e0d8f29 + +/* + * These two virtual network stack allocator definitions are also required + * for libkvm so that it can evaluate virtualized global variables. + */ +#define VNET_SETNAME "set_vnet" +#define VNET_SYMPREFIX "vnet_entry_" +#endif + +#ifdef _KERNEL + +#ifdef VIMAGE +#include +#include /* for struct thread */ +#include +#include + +/* + * Location of the kernel's 'set_vnet' linker set. + */ +extern uintptr_t *__start_set_vnet; +extern uintptr_t *__stop_set_vnet; + +#define VNET_START (uintptr_t)&__start_set_vnet +#define VNET_STOP (uintptr_t)&__stop_set_vnet + +/* + * Functions to allocate and destroy virtual network stacks. + */ +struct vnet *vnet_alloc(void); +void vnet_destroy(struct vnet *vnet); + +/* + * The current virtual network stack -- we may wish to move this to struct + * pcpu in the future. + */ +#define curvnet curthread->td_vnet + +/* + * Various macros -- get and set the current network stack, but also + * assertions. 
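Taken together, the macros defined just below support the following idiom, which is how virtualized globals are used throughout the stack. A minimal sketch: the counter and its V_-prefixed alias are invented here, though the V_ naming convention itself is the real one used by the network stack.

    VNET_DEFINE(int, example_count);        /* one copy per vnet instance */
    #define V_example_count VNET(example_count)

    static void
    example_bump(struct vnet *vnet)
    {
            CURVNET_SET(vnet);      /* select the stack instance ... */
            V_example_count++;      /* ... so VNET() resolves against it */
            CURVNET_RESTORE();
    }

Without VIMAGE the same source compiles unchanged: VNET_DEFINE collapses to a plain global and the CURVNET macros to no-ops, as the fallback definitions later in this header show.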
+ */ +#ifdef VNET_DEBUG +void vnet_log_recursion(struct vnet *, const char *, int); + +#define VNET_ASSERT(condition) \ + if (!(condition)) { \ + printf("VNET_ASSERT @ %s:%d %s():\n", \ + __FILE__, __LINE__, __FUNCTION__); \ + panic(#condition); \ + } + +#define CURVNET_SET_QUIET(arg) \ + VNET_ASSERT((arg)->vnet_magic_n == VNET_MAGIC_N); \ + struct vnet *saved_vnet = curvnet; \ + const char *saved_vnet_lpush = curthread->td_vnet_lpush; \ + curvnet = arg; \ + curthread->td_vnet_lpush = __FUNCTION__; + +#define CURVNET_SET_VERBOSE(arg) \ + CURVNET_SET_QUIET(arg) \ + if (saved_vnet) \ + vnet_log_recursion(saved_vnet, saved_vnet_lpush, __LINE__); + +#define CURVNET_SET(arg) CURVNET_SET_VERBOSE(arg) + +#define CURVNET_RESTORE() \ + VNET_ASSERT(saved_vnet == NULL || \ + saved_vnet->vnet_magic_n == VNET_MAGIC_N); \ + curvnet = saved_vnet; \ + curthread->td_vnet_lpush = saved_vnet_lpush; +#else /* !VNET_DEBUG */ +#define VNET_ASSERT(condition) + +#define CURVNET_SET(arg) \ + struct vnet *saved_vnet = curvnet; \ + curvnet = arg; + +#define CURVNET_SET_VERBOSE(arg) CURVNET_SET(arg) +#define CURVNET_SET_QUIET(arg) CURVNET_SET(arg) + +#define CURVNET_RESTORE() \ + curvnet = saved_vnet; +#endif /* VNET_DEBUG */ + +extern struct vnet *vnet0; +#define IS_DEFAULT_VNET(arg) ((arg) == vnet0) + +#define CRED_TO_VNET(cr) (cr)->cr_prison->pr_vnet +#define TD_TO_VNET(td) CRED_TO_VNET((td)->td_ucred) +#define P_TO_VNET(p) CRED_TO_VNET((p)->p_ucred) + +/* + * Global linked list of all virtual network stacks, along with read locks to + * access it. If a caller may sleep while accessing the list, it must use + * the sleepable lock macros. + */ +LIST_HEAD(vnet_list_head, vnet); +extern struct vnet_list_head vnet_head; +extern struct rwlock vnet_rwlock; +extern struct sx vnet_sxlock; + +#define VNET_LIST_RLOCK() sx_slock(&vnet_sxlock) +#define VNET_LIST_RLOCK_NOSLEEP() rw_rlock(&vnet_rwlock) +#define VNET_LIST_RUNLOCK() sx_sunlock(&vnet_sxlock) +#define VNET_LIST_RUNLOCK_NOSLEEP() rw_runlock(&vnet_rwlock) + +/* + * Iteration macros to walk the global list of virtual network stacks. + */ +#define VNET_ITERATOR_DECL(arg) struct vnet *arg +#define VNET_FOREACH(arg) LIST_FOREACH((arg), &vnet_head, vnet_le) + +/* + * Virtual network stack memory allocator, which allows global variables to + * be automatically instantiated for each network stack instance. + */ +__asm__( +#if defined(__arm__) + ".section " VNET_SETNAME ", \"aw\", %progbits\n" +#else + ".section " VNET_SETNAME ", \"aw\", @progbits\n" +#endif + "\t.p2align " __XSTRING(CACHE_LINE_SHIFT) "\n" + "\t.previous"); + +#define VNET_NAME(n) vnet_entry_##n +#define VNET_DECLARE(t, n) extern t VNET_NAME(n) +#define VNET_DEFINE(t, n) t VNET_NAME(n) __section(VNET_SETNAME) __used +#define _VNET_PTR(b, n) (__typeof(VNET_NAME(n))*) \ + ((b) + (uintptr_t)&VNET_NAME(n)) + +#define _VNET(b, n) (*_VNET_PTR(b, n)) + +/* + * Virtualized global variable accessor macros. + */ +#define VNET_VNET_PTR(vnet, n) _VNET_PTR((vnet)->vnet_data_base, n) +#define VNET_VNET(vnet, n) (*VNET_VNET_PTR((vnet), n)) + +#define VNET_PTR(n) VNET_VNET_PTR(curvnet, n) +#define VNET(n) VNET_VNET(curvnet, n) + +/* + * Virtual network stack allocator interfaces from the kernel linker. + */ +void *vnet_data_alloc(int size); +void vnet_data_copy(void *start, int size); +void vnet_data_free(void *start_arg, int size); + +/* + * Sysctl variants for vnet-virtualized global variables. Include + * to expose these definitions. 
+ * + * Note: SYSCTL_PROC() handler functions will need to resolve pointer + * arguments themselves, if required. + */ +#ifdef SYSCTL_OID +int vnet_sysctl_handle_int(SYSCTL_HANDLER_ARGS); +int vnet_sysctl_handle_opaque(SYSCTL_HANDLER_ARGS); +int vnet_sysctl_handle_string(SYSCTL_HANDLER_ARGS); +int vnet_sysctl_handle_uint(SYSCTL_HANDLER_ARGS); + +#define SYSCTL_VNET_INT(parent, nbr, name, access, ptr, val, descr) \ + SYSCTL_OID(parent, nbr, name, \ + CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_VNET|(access), \ + ptr, val, vnet_sysctl_handle_int, "I", descr) +#define SYSCTL_VNET_PROC(parent, nbr, name, access, ptr, arg, handler, \ + fmt, descr) \ + SYSCTL_OID(parent, nbr, name, CTLFLAG_VNET|(access), ptr, arg, \ + handler, fmt, descr) +#define SYSCTL_VNET_OPAQUE(parent, nbr, name, access, ptr, len, fmt, \ + descr) \ + SYSCTL_OID(parent, nbr, name, \ + CTLTYPE_OPAQUE|CTLFLAG_VNET|(access), ptr, len, \ + vnet_sysctl_handle_opaque, fmt, descr) +#define SYSCTL_VNET_STRING(parent, nbr, name, access, arg, len, descr) \ + SYSCTL_OID(parent, nbr, name, \ + CTLTYPE_STRING|CTLFLAG_VNET|(access), \ + arg, len, vnet_sysctl_handle_string, "A", descr) +#define SYSCTL_VNET_STRUCT(parent, nbr, name, access, ptr, type, descr) \ + SYSCTL_OID(parent, nbr, name, \ + CTLTYPE_OPAQUE|CTLFLAG_VNET|(access), ptr, \ + sizeof(struct type), vnet_sysctl_handle_opaque, "S," #type, \ + descr) +#define SYSCTL_VNET_UINT(parent, nbr, name, access, ptr, val, descr) \ + SYSCTL_OID(parent, nbr, name, \ + CTLTYPE_UINT|CTLFLAG_MPSAFE|CTLFLAG_VNET|(access), \ + ptr, val, vnet_sysctl_handle_uint, "IU", descr) +#define VNET_SYSCTL_ARG(req, arg1) do { \ + if (arg1 != NULL) \ + arg1 = (void *)(TD_TO_VNET((req)->td)->vnet_data_base + \ + (uintptr_t)(arg1)); \ +} while (0) +#endif /* SYSCTL_OID */ + +/* + * Virtual sysinit mechanism, allowing network stack components to declare + * startup and shutdown methods to be run when virtual network stack + * instances are created and destroyed. + */ +#include + +/* + * SYSINIT/SYSUNINIT variants that provide per-vnet constructors and + * destructors. + */ +struct vnet_sysinit { + enum sysinit_sub_id subsystem; + enum sysinit_elem_order order; + sysinit_cfunc_t func; + const void *arg; + TAILQ_ENTRY(vnet_sysinit) link; +}; + +#define VNET_SYSINIT(ident, subsystem, order, func, arg) \ + static struct vnet_sysinit ident ## _vnet_init = { \ + subsystem, \ + order, \ + (sysinit_cfunc_t)(sysinit_nfunc_t)func, \ + (arg) \ + }; \ + SYSINIT(vnet_init_ ## ident, subsystem, order, \ + vnet_register_sysinit, &ident ## _vnet_init); \ + SYSUNINIT(vnet_init_ ## ident, subsystem, order, \ + vnet_deregister_sysinit, &ident ## _vnet_init) + +#define VNET_SYSUNINIT(ident, subsystem, order, func, arg) \ + static struct vnet_sysinit ident ## _vnet_uninit = { \ + subsystem, \ + order, \ + (sysinit_cfunc_t)(sysinit_nfunc_t)func, \ + (arg) \ + }; \ + SYSINIT(vnet_uninit_ ## ident, subsystem, order, \ + vnet_register_sysuninit, &ident ## _vnet_uninit); \ + SYSUNINIT(vnet_uninit_ ## ident, subsystem, order, \ + vnet_deregister_sysuninit, &ident ## _vnet_uninit) + +/* + * Run per-vnet sysinits or sysuninits during vnet creation/destruction. + */ +void vnet_sysinit(void); +void vnet_sysuninit(void); + +/* + * Interfaces for managing per-vnet constructors and destructors. + */ +void vnet_register_sysinit(void *arg); +void vnet_register_sysuninit(void *arg); +void vnet_deregister_sysinit(void *arg); +void vnet_deregister_sysuninit(void *arg); + +/* + * EVENTHANDLER(9) extensions. 
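A typical client of the sysinit machinery above registers a constructor that runs once in each vnet as it is created, mirrored by a destructor at teardown. A sketch follows; the ident and function names are invented, and the subsystem/order constants are the usual SYSINIT(9) ones, chosen here only for illustration.

    static void
    vnet_example_init(const void *unused)
    {
            /* Per-vnet initialization: runs in every vnet at creation. */
    }
    VNET_SYSINIT(example_init, SI_SUB_PSEUDO, SI_ORDER_ANY,
        vnet_example_init, NULL);

    static void
    vnet_example_uninit(const void *unused)
    {
            /* Per-vnet teardown: runs in every vnet at destruction. */
    }
    VNET_SYSUNINIT(example_uninit, SI_SUB_PSEUDO, SI_ORDER_ANY,
        vnet_example_uninit, NULL);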
+ */ +#include + +void vnet_global_eventhandler_iterator_func(void *, ...); +#define VNET_GLOBAL_EVENTHANDLER_REGISTER_TAG(tag, name, func, arg, priority) \ +do { \ + if (IS_DEFAULT_VNET(curvnet)) { \ + (tag) = vimage_eventhandler_register(NULL, #name, func, \ + arg, priority, \ + vnet_global_eventhandler_iterator_func); \ + } \ +} while(0) +#define VNET_GLOBAL_EVENTHANDLER_REGISTER(name, func, arg, priority) \ +do { \ + if (IS_DEFAULT_VNET(curvnet)) { \ + vimage_eventhandler_register(NULL, #name, func, \ + arg, priority, \ + vnet_global_eventhandler_iterator_func); \ + } \ +} while(0) + +#else /* !VIMAGE */ + +/* + * Various virtual network stack macros compile to no-ops without VIMAGE. + */ +#define curvnet NULL + +#define VNET_ASSERT(condition) +#define CURVNET_SET(arg) +#define CURVNET_SET_QUIET(arg) +#define CURVNET_RESTORE() + +#define VNET_LIST_RLOCK() +#define VNET_LIST_RLOCK_NOSLEEP() +#define VNET_LIST_RUNLOCK() +#define VNET_LIST_RUNLOCK_NOSLEEP() +#define VNET_ITERATOR_DECL(arg) +#define VNET_FOREACH(arg) + +#define IS_DEFAULT_VNET(arg) 1 +#define CRED_TO_VNET(cr) NULL +#define TD_TO_VNET(td) NULL +#define P_TO_VNET(p) NULL + +/* + * Versions of the VNET macros that compile to normal global variables and + * standard sysctl definitions. + */ +#define VNET_NAME(n) n +#define VNET_DECLARE(t, n) extern t n +#define VNET_DEFINE(t, n) t n +#define _VNET_PTR(b, n) &VNET_NAME(n) + +/* + * Virtualized global variable accessor macros. + */ +#define VNET_VNET_PTR(vnet, n) (&(n)) +#define VNET_VNET(vnet, n) (n) + +#define VNET_PTR(n) (&(n)) +#define VNET(n) (n) + +/* + * When VIMAGE isn't compiled into the kernel, virtaulized SYSCTLs simply + * become normal SYSCTLs. + */ +#ifdef SYSCTL_OID +#define SYSCTL_VNET_INT(parent, nbr, name, access, ptr, val, descr) \ + SYSCTL_INT(parent, nbr, name, access, ptr, val, descr) +#define SYSCTL_VNET_PROC(parent, nbr, name, access, ptr, arg, handler, \ + fmt, descr) \ + SYSCTL_PROC(parent, nbr, name, access, ptr, arg, handler, fmt, \ + descr) +#define SYSCTL_VNET_OPAQUE(parent, nbr, name, access, ptr, len, fmt, \ + descr) \ + SYSCTL_OPAQUE(parent, nbr, name, access, ptr, len, fmt, descr) +#define SYSCTL_VNET_STRING(parent, nbr, name, access, arg, len, descr) \ + SYSCTL_STRING(parent, nbr, name, access, arg, len, descr) +#define SYSCTL_VNET_STRUCT(parent, nbr, name, access, ptr, type, descr) \ + SYSCTL_STRUCT(parent, nbr, name, access, ptr, type, descr) +#define SYSCTL_VNET_UINT(parent, nbr, name, access, ptr, val, descr) \ + SYSCTL_UINT(parent, nbr, name, access, ptr, val, descr) +#define VNET_SYSCTL_ARG(req, arg1) +#endif /* SYSCTL_OID */ + +/* + * When VIMAGE isn't compiled into the kernel, VNET_SYSINIT/VNET_SYSUNINIT + * map into normal sysinits, which have the same ordering properties. + */ +#define VNET_SYSINIT(ident, subsystem, order, func, arg) \ + SYSINIT(ident, subsystem, order, func, arg) +#define VNET_SYSUNINIT(ident, subsystem, order, func, arg) \ + SYSUNINIT(ident, subsystem, order, func, arg) + +/* + * Without VIMAGE revert to the default implementation. 
+ */ +#define VNET_GLOBAL_EVENTHANDLER_REGISTER_TAG(tag, name, func, arg, priority) \ + (tag) = eventhandler_register(NULL, #name, func, arg, priority) +#define VNET_GLOBAL_EVENTHANDLER_REGISTER(name, func, arg, priority) \ + eventhandler_register(NULL, #name, func, arg, priority) +#endif /* VIMAGE */ +#endif /* _KERNEL */ + +#endif /* !_NET_VNET_HH_ */ diff --git a/freebsd/sys/net/zlib.c b/freebsd/sys/net/zlib.c new file mode 100644 index 00000000..a7a54740 --- /dev/null +++ b/freebsd/sys/net/zlib.c @@ -0,0 +1,5409 @@ +#include + +/* + * This file is derived from various .h and .c files from the zlib-1.0.4 + * distribution by Jean-loup Gailly and Mark Adler, with some additions + * by Paul Mackerras to aid in implementing Deflate compression and + * decompression for PPP packets. See zlib.h for conditions of + * distribution and use. + * + * Changes that have been made include: + * - added Z_PACKET_FLUSH (see zlib.h for details) + * - added inflateIncomp and deflateOutputPending + * - allow strm->next_out to be NULL, meaning discard the output + * + * $FreeBSD$ + */ + +/* + * ==FILEVERSION 971210== + * + * This marker is used by the Linux installation script to determine + * whether an up-to-date version of this file is already installed. + */ + +#define NO_DUMMY_DECL +#define NO_ZCFUNCS +#define MY_ZCALLOC + +#if defined(__FreeBSD__) && defined(_KERNEL) +#define inflate inflate_ppp /* FreeBSD already has an inflate :-( */ +#endif + + +/* +++ zutil.h */ +/*- + * zutil.h -- internal interface and configuration of the compression library + * Copyright (C) 1995-1996 Jean-loup Gailly. + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +/* WARNING: this file should *not* be used by applications. It is + part of the implementation of the compression library and is + subject to change. Applications should only use zlib.h. + */ + +/* From: zutil.h,v 1.16 1996/07/24 13:41:13 me Exp $ */ + +#ifndef _Z_UTIL_H +#define _Z_UTIL_H + +#ifdef _KERNEL +#include +#else +#include +#endif + +#ifdef _KERNEL +/* Assume this is a *BSD or SVR4 kernel */ +#include +#include +#include +#include +#include +#include +# define HAVE_MEMCPY + +#else +#if defined(__KERNEL__) +/* Assume this is a Linux kernel */ +#include +#define HAVE_MEMCPY + +#else /* not kernel */ + +#if defined(MSDOS)||defined(VMS)||defined(CRAY)||defined(WIN32)||defined(RISCOS) +# include +# include +#else + extern int errno; +#endif +#ifdef STDC +# include +# include +#endif +#endif /* __KERNEL__ */ +#endif /* _KERNEL */ + +#ifndef local +# define local static +#endif +/* compile with -Dlocal if your debugger can't find static symbols */ + +typedef unsigned char uch; +typedef uch FAR uchf; +typedef unsigned short ush; +typedef ush FAR ushf; +typedef unsigned long ulg; + +static const char *z_errmsg[10]; /* indexed by 2-zlib_error */ +/* (size given to avoid silly warnings with Visual C++) */ + +#define ERR_MSG(err) z_errmsg[Z_NEED_DICT-(err)] + +#define ERR_RETURN(strm,err) \ + return (strm->msg = (const char*)ERR_MSG(err), (err)) +/* To be used only when the state is known to be valid */ + + /* common constants */ + +#ifndef DEF_WBITS +# define DEF_WBITS MAX_WBITS +#endif +/* default windowBits for decompression. 
MAX_WBITS is for compression only */ + +#if MAX_MEM_LEVEL >= 8 +# define DEF_MEM_LEVEL 8 +#else +# define DEF_MEM_LEVEL MAX_MEM_LEVEL +#endif +/* default memLevel */ + +#define STORED_BLOCK 0 +#define STATIC_TREES 1 +#define DYN_TREES 2 +/* The three kinds of block type */ + +#define MIN_MATCH 3 +#define MAX_MATCH 258 +/* The minimum and maximum match lengths */ + +#define PRESET_DICT 0x20 /* preset dictionary flag in zlib header */ + + /* target dependencies */ + +#ifdef MSDOS +# define OS_CODE 0x00 +# ifdef __TURBOC__ +# include +# else /* MSC or DJGPP */ +# include +# endif +#endif + +#ifdef OS2 +# define OS_CODE 0x06 +#endif + +#ifdef WIN32 /* Window 95 & Windows NT */ +# define OS_CODE 0x0b +#endif + +#if defined(VAXC) || defined(VMS) +# define OS_CODE 0x02 +# define FOPEN(name, mode) \ + fopen((name), (mode), "mbc=60", "ctx=stm", "rfm=fix", "mrs=512") +#endif + +#ifdef AMIGA +# define OS_CODE 0x01 +#endif + +#if defined(ATARI) || defined(atarist) +# define OS_CODE 0x05 +#endif + +#ifdef MACOS +# define OS_CODE 0x07 +#endif + +#ifdef __50SERIES /* Prime/PRIMOS */ +# define OS_CODE 0x0F +#endif + +#ifdef TOPS20 +# define OS_CODE 0x0a +#endif + +#if defined(_BEOS_) || defined(RISCOS) +# define fdopen(fd,mode) NULL /* No fdopen() */ +#endif + + /* Common defaults */ + +#ifndef OS_CODE +# define OS_CODE 0x03 /* assume Unix */ +#endif + +#ifndef FOPEN +# define FOPEN(name, mode) fopen((name), (mode)) +#endif + + /* functions */ + +#ifdef HAVE_STRERROR + extern char *strerror OF((int)); +# define zstrerror(errnum) strerror(errnum) +#else +# define zstrerror(errnum) "" +#endif + +#if defined(pyr) +# define NO_MEMCPY +#endif +#if (defined(M_I86SM) || defined(M_I86MM)) && !defined(_MSC_VER) + /* Use our own functions for small and medium model with MSC <= 5.0. + * You may have to use the same strategy for Borland C (untested). 
+ */ +# define NO_MEMCPY +#endif +#if defined(STDC) && !defined(HAVE_MEMCPY) && !defined(NO_MEMCPY) +# define HAVE_MEMCPY +#endif +#ifdef HAVE_MEMCPY +# ifdef SMALL_MEDIUM /* MSDOS small or medium model */ +# define zmemcpy _fmemcpy +# define zmemcmp _fmemcmp +# define zmemzero(dest, len) _fmemset(dest, 0, len) +# else +# define zmemcpy memcpy +# define zmemcmp memcmp +# define zmemzero(dest, len) memset(dest, 0, len) +# endif +#else + extern void zmemcpy OF((Bytef* dest, Bytef* source, uInt len)); + extern int zmemcmp OF((Bytef* s1, Bytef* s2, uInt len)); + extern void zmemzero OF((Bytef* dest, uInt len)); +#endif + +/* Diagnostic functions */ +#ifdef DEBUG_ZLIB +# include +# ifndef verbose +# define verbose 0 +# endif + extern void z_error OF((char *m)); +# define Assert(cond,msg) {if(!(cond)) z_error(msg);} +# define Trace(x) fprintf x +# define Tracev(x) {if (verbose) fprintf x ;} +# define Tracevv(x) {if (verbose>1) fprintf x ;} +# define Tracec(c,x) {if (verbose && (c)) fprintf x ;} +# define Tracecv(c,x) {if (verbose>1 && (c)) fprintf x ;} +#else +# define Assert(cond,msg) +# define Trace(x) +# define Tracev(x) +# define Tracevv(x) +# define Tracec(c,x) +# define Tracecv(c,x) +#endif + + +typedef uLong (*check_func) OF((uLong check, const Bytef *buf, uInt len)); + +voidpf zcalloc OF((voidpf opaque, unsigned items, unsigned size)); +void zcfree OF((voidpf opaque, voidpf ptr)); + +#define ZALLOC(strm, items, size) \ + (*((strm)->zalloc))((strm)->opaque, (items), (size)) +#define ZFREE(strm, addr) (*((strm)->zfree))((strm)->opaque, (voidpf)(addr)) +#define TRY_FREE(s, p) {if (p) ZFREE(s, p);} + +#endif /* _Z_UTIL_H */ +/* --- zutil.h */ + +/* +++ deflate.h */ +/* deflate.h -- internal compression state + * Copyright (C) 1995-1996 Jean-loup Gailly + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +/* WARNING: this file should *not* be used by applications. It is + part of the implementation of the compression library and is + subject to change. Applications should only use zlib.h. + */ + +/* From: deflate.h,v 1.10 1996/07/02 12:41:00 me Exp $ */ + +#ifndef _DEFLATE_H +#define _DEFLATE_H + +/* #include */ + +/* =========================================================================== + * Internal compression state. + */ + +#define LENGTH_CODES 29 +/* number of length codes, not counting the special END_BLOCK code */ + +#define LITERALS 256 +/* number of literal bytes 0..255 */ + +#define L_CODES (LITERALS+1+LENGTH_CODES) +/* number of Literal or Length codes, including the END_BLOCK code */ + +#define D_CODES 30 +/* number of distance codes */ + +#define BL_CODES 19 +/* number of codes used to transfer the bit lengths */ + +#define HEAP_SIZE (2*L_CODES+1) +/* maximum heap size */ + +#define MAX_BITS 15 +/* All codes must not exceed MAX_BITS bits */ + +#define INIT_STATE 42 +#define BUSY_STATE 113 +#define FINISH_STATE 666 +/* Stream status */ + + +/* Data structure describing a single value and its code string. 
*/ +typedef struct ct_data_s { + union { + ush freq; /* frequency count */ + ush code; /* bit string */ + } fc; + union { + ush dad; /* father node in Huffman tree */ + ush len; /* length of bit string */ + } dl; +} FAR ct_data; + +#define Freq fc.freq +#define Code fc.code +#define Dad dl.dad +#define Len dl.len + +typedef struct static_tree_desc_s static_tree_desc; + +typedef struct tree_desc_s { + ct_data *dyn_tree; /* the dynamic tree */ + int max_code; /* largest code with non zero frequency */ + static_tree_desc *stat_desc; /* the corresponding static tree */ +} FAR tree_desc; + +typedef ush Pos; +typedef Pos FAR Posf; +typedef unsigned IPos; + +/* A Pos is an index in the character window. We use short instead of int to + * save space in the various tables. IPos is used only for parameter passing. + */ + +typedef struct deflate_state { + z_streamp strm; /* pointer back to this zlib stream */ + int status; /* as the name implies */ + Bytef *pending_buf; /* output still pending */ + ulg pending_buf_size; /* size of pending_buf */ + Bytef *pending_out; /* next pending byte to output to the stream */ + int pending; /* nb of bytes in the pending buffer */ + int noheader; /* suppress zlib header and adler32 */ + Byte data_type; /* UNKNOWN, BINARY or ASCII */ + Byte method; /* STORED (for zip only) or DEFLATED */ + int last_flush; /* value of flush param for previous deflate call */ + + /* used by deflate.c: */ + + uInt w_size; /* LZ77 window size (32K by default) */ + uInt w_bits; /* log2(w_size) (8..16) */ + uInt w_mask; /* w_size - 1 */ + + Bytef *window; + /* Sliding window. Input bytes are read into the second half of the window, + * and move to the first half later to keep a dictionary of at least wSize + * bytes. With this organization, matches are limited to a distance of + * wSize-MAX_MATCH bytes, but this ensures that IO is always + * performed with a length multiple of the block size. Also, it limits + * the window size to 64K, which is quite useful on MSDOS. + * To do: use the user input buffer as sliding window. + */ + + ulg window_size; + /* Actual size of window: 2*wSize, except when the user input buffer + * is directly used as sliding window. + */ + + Posf *prev; + /* Link to older string with same hash index. To limit the size of this + * array to 64K, this link is maintained only for the last 32K strings. + * An index in this array is thus a window index modulo 32K. + */ + + Posf *head; /* Heads of the hash chains or NIL. */ + + uInt ins_h; /* hash index of string to be inserted */ + uInt hash_size; /* number of elements in hash table */ + uInt hash_bits; /* log2(hash_size) */ + uInt hash_mask; /* hash_size-1 */ + + uInt hash_shift; + /* Number of bits by which ins_h must be shifted at each input + * step. It must be such that after MIN_MATCH steps, the oldest + * byte no longer takes part in the hash key, that is: + * hash_shift * MIN_MATCH >= hash_bits + */ + + long block_start; + /* Window position at the beginning of the current output block. Gets + * negative when the window is moved backwards. + */ + + uInt match_length; /* length of best match */ + IPos prev_match; /* previous match */ + int match_available; /* set if previous match exists */ + uInt strstart; /* start of string to insert */ + uInt match_start; /* start of matching string */ + uInt lookahead; /* number of valid bytes ahead in window */ + + uInt prev_length; + /* Length of the best match at previous step. Matches not greater than this + * are discarded. This is used in the lazy match evaluation. 
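The constraint on hash_shift stated above (hash_shift * MIN_MATCH >= hash_bits) exists so that the rolling hash forgets the oldest byte after exactly MIN_MATCH updates. Restated outside the struct, this mirrors deflate's UPDATE_HASH macro; the concrete bit widths below are illustrative defaults, not taken from this file.

    #define MIN_MATCH       3
    #define HASH_BITS       15      /* hash_size = 1 << 15 */
    #define HASH_SHIFT      ((HASH_BITS + MIN_MATCH - 1) / MIN_MATCH)
    #define HASH_MASK       ((1U << HASH_BITS) - 1)

    /* After MIN_MATCH calls, the oldest byte has shifted out of h. */
    static unsigned
    update_hash(unsigned h, unsigned char c)
    {
            return (((h << HASH_SHIFT) ^ c) & HASH_MASK);
    }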
+ */
+
+ uInt max_chain_length;
+ /* To speed up deflation, hash chains are never searched beyond this
+ * length. A higher limit improves compression ratio but degrades the
+ * speed.
+ */
+
+ uInt max_lazy_match;
+ /* Attempt to find a better match only when the current match is strictly
+ * smaller than this value. This mechanism is used only for compression
+ * levels >= 4.
+ */
+# define max_insert_length max_lazy_match
+ /* Insert new strings in the hash table only if the match length is not
+ * greater than this length. This saves time but degrades compression.
+ * max_insert_length is used only for compression levels <= 3.
+ */
+
+ int level; /* compression level (1..9) */
+ int strategy; /* favor or force Huffman coding */
+
+ uInt good_match;
+ /* Use a faster search when the previous match is longer than this */
+
+ int nice_match; /* Stop searching when current match exceeds this */
+
+ /* used by trees.c: */
+ /* Didn't use ct_data typedef below to suppress compiler warning */
+ struct ct_data_s dyn_ltree[HEAP_SIZE]; /* literal and length tree */
+ struct ct_data_s dyn_dtree[2*D_CODES+1]; /* distance tree */
+ struct ct_data_s bl_tree[2*BL_CODES+1]; /* Huffman tree for bit lengths */
+
+ struct tree_desc_s l_desc; /* desc. for literal tree */
+ struct tree_desc_s d_desc; /* desc. for distance tree */
+ struct tree_desc_s bl_desc; /* desc. for bit length tree */
+
+ ush bl_count[MAX_BITS+1];
+ /* number of codes at each bit length for an optimal tree */
+
+ int heap[2*L_CODES+1]; /* heap used to build the Huffman trees */
+ int heap_len; /* number of elements in the heap */
+ int heap_max; /* element of largest frequency */
+ /* The sons of heap[n] are heap[2*n] and heap[2*n+1]. heap[0] is not used.
+ * The same heap array is used to build all trees.
+ */
+
+ uch depth[2*L_CODES+1];
+ /* Depth of each subtree used as tie breaker for trees of equal frequency
+ */
+
+ uchf *l_buf; /* buffer for literals or lengths */
+
+ uInt lit_bufsize;
+ /* Size of match buffer for literals/lengths. There are 4 reasons for
+ * limiting lit_bufsize to 64K:
+ * - frequencies can be kept in 16 bit counters
+ * - if compression is not successful for the first block, all input
+ * data is still in the window so we can still emit a stored block even
+ * when input comes from standard input. (This can also be done for
+ * all blocks if lit_bufsize is not greater than 32K.)
+ * - if compression is not successful for a file smaller than 64K, we can
+ * even emit a stored file instead of a stored block (saving 5 bytes).
+ * This is applicable only for zip (not gzip or zlib).
+ * - creating new Huffman trees less frequently may not provide fast
+ * adaptation to changes in the input data statistics. (Take for
+ * example a binary file with poorly compressible code followed by
+ * a highly compressible string table.) Smaller buffer sizes give
+ * fast adaptation but have of course the overhead of transmitting
+ * trees more frequently.
+ * - I can't count above 4
+ */
+
+ uInt last_lit; /* running index in l_buf */
+
+ ushf *d_buf;
+ /* Buffer for distances. To simplify the code, d_buf and l_buf have
+ * the same number of elements. To use different lengths, an extra flag
+ * array would be necessary.
+ */ + + ulg opt_len; /* bit length of current block with optimal trees */ + ulg static_len; /* bit length of current block with static trees */ + ulg compressed_len; /* total bit length of compressed file */ + uInt matches; /* number of string matches in current block */ + int last_eob_len; /* bit length of EOB code for last block */ + +#ifdef DEBUG_ZLIB + ulg bits_sent; /* bit length of the compressed data */ +#endif + + ush bi_buf; + /* Output buffer. bits are inserted starting at the bottom (least + * significant bits). + */ + int bi_valid; + /* Number of valid bits in bi_buf. All bits above the last valid bit + * are always zero. + */ + +} FAR deflate_state; + +/* Output a byte on the stream. + * IN assertion: there is enough room in pending_buf. + */ +#define put_byte(s, c) {s->pending_buf[s->pending++] = (c);} + + +#define MIN_LOOKAHEAD (MAX_MATCH+MIN_MATCH+1) +/* Minimum amount of lookahead, except at the end of the input file. + * See deflate.c for comments about the MIN_MATCH+1. + */ + +#define MAX_DIST(s) ((s)->w_size-MIN_LOOKAHEAD) +/* In order to simplify the code, particularly on 16 bit machines, match + * distances are limited to MAX_DIST instead of WSIZE. + */ + + /* in trees.c */ +void _tr_init OF((deflate_state *s)); +int _tr_tally OF((deflate_state *s, unsigned dist, unsigned lc)); +ulg _tr_flush_block OF((deflate_state *s, charf *buf, ulg stored_len, + int eof)); +void _tr_align OF((deflate_state *s)); +void _tr_stored_block OF((deflate_state *s, charf *buf, ulg stored_len, + int eof)); +void _tr_stored_type_only OF((deflate_state *)); + +#endif +/* --- deflate.h */ + +/* +++ deflate.c */ +/* deflate.c -- compress data using the deflation algorithm + * Copyright (C) 1995-1996 Jean-loup Gailly. + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +/* + * ALGORITHM + * + * The "deflation" process depends on being able to identify portions + * of the input text which are identical to earlier input (within a + * sliding window trailing behind the input currently being processed). + * + * The most straightforward technique turns out to be the fastest for + * most input files: try all possible matches and select the longest. + * The key feature of this algorithm is that insertions into the string + * dictionary are very simple and thus fast, and deletions are avoided + * completely. Insertions are performed at each input character, whereas + * string matches are performed only when the previous match ends. So it + * is preferable to spend more time in matches to allow very fast string + * insertions and avoid deletions. The matching algorithm for small + * strings is inspired from that of Rabin & Karp. A brute force approach + * is used to find longer strings when a small match has been found. + * A similar algorithm is used in comic (by Jan-Mark Wams) and freeze + * (by Leonid Broukhis). + * A previous version of this file used a more sophisticated algorithm + * (by Fiala and Greene) which is guaranteed to run in linear amortized + * time, but has a larger average cost, uses more memory and is patented. + * However the F&G algorithm may be faster for some highly redundant + * files if the parameter max_chain_length (described below) is too large. + * + * ACKNOWLEDGEMENTS + * + * The idea of lazy evaluation of matches is due to Jan-Mark Wams, and + * I found it in 'freeze' written by Leonid Broukhis. + * Thanks to many people for bug reports and testing. + * + * REFERENCES + * + * Deutsch, L.P.,"DEFLATE Compressed Data Format Specification". 
+ * Available in ftp://ds.internic.net/rfc/rfc1951.txt
+ *
+ * A description of the Rabin and Karp algorithm is given in the book
+ * "Algorithms" by R. Sedgewick, Addison-Wesley, p252.
+ *
+ * Fiala,E.R., and Greene,D.H.
+ * Data Compression with Finite Windows, Comm.ACM, 32,4 (1989) 490-505
+ *
+ */
+
+/* From: deflate.c,v 1.15 1996/07/24 13:40:58 me Exp $ */
+
+/* #include "deflate.h" */
+
+char deflate_copyright[] = " deflate 1.0.4 Copyright 1995-1996 Jean-loup Gailly ";
+/*
+ If you use the zlib library in a product, an acknowledgment is welcome
+ in the documentation of your product. If for some reason you cannot
+ include such an acknowledgment, I would appreciate that you keep this
+ copyright string in the executable of your product.
+ */
+
+/* ===========================================================================
+ * Function prototypes.
+ */
+typedef enum {
+ need_more, /* block not completed, need more input or more output */
+ block_done, /* block flush performed */
+ finish_started, /* finish started, need only more output at next deflate */
+ finish_done /* finish done, accept no more input or output */
+} block_state;
+
+typedef block_state (*compress_func) OF((deflate_state *s, int flush));
+/* Compression function. Returns the block state after the call. */
+
+local void fill_window OF((deflate_state *s));
+local block_state deflate_stored OF((deflate_state *s, int flush));
+local block_state deflate_fast OF((deflate_state *s, int flush));
+local block_state deflate_slow OF((deflate_state *s, int flush));
+local void lm_init OF((deflate_state *s));
+local void putShortMSB OF((deflate_state *s, uInt b));
+local void flush_pending OF((z_streamp strm));
+local int read_buf OF((z_streamp strm, charf *buf, unsigned size));
+#ifdef ASMV
+ void match_init OF((void)); /* asm code initialization */
+ uInt longest_match OF((deflate_state *s, IPos cur_match));
+#else
+local uInt longest_match OF((deflate_state *s, IPos cur_match));
+#endif
+
+#ifdef DEBUG_ZLIB
+local void check_match OF((deflate_state *s, IPos start, IPos match,
+ int length));
+#endif
+
+/* ===========================================================================
+ * Local data
+ */
+
+#define NIL 0
+/* Tail of hash chains */
+
+#ifndef TOO_FAR
+# define TOO_FAR 4096
+#endif
+/* Matches of length 3 are discarded if their distance exceeds TOO_FAR */
+
+#define MIN_LOOKAHEAD (MAX_MATCH+MIN_MATCH+1)
+/* Minimum amount of lookahead, except at the end of the input file.
+ * See deflate.c for comments about the MIN_MATCH+1.
+ */
+
+/* Values for max_lazy_match, good_match and max_chain_length, depending on
+ * the desired pack level (0..9). The values given below have been tuned to
+ * exclude worst case performance for pathological files. Better values may be
+ * found for specific files.
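+ *
+ * [Editorial aside, not part of the original zlib source: how the four
+ * numbers drive the search, reading the code below. good_length: once the
+ * previous match reaches this length, longest_match() quarters its chain
+ * budget (chain_length >>= 2). max_lazy: deflate_slow() skips the search
+ * for a better match once the previous match reaches this length.
+ * nice_length: longest_match() stops scanning as soon as a match this
+ * long is found. max_chain: upper bound on hash-chain links followed per
+ * search. So at level 6 ({8, 16, 128, 128}) a search walks at most 128
+ * chain entries, is not repeated once a 16-byte match is in hand, and
+ * stops early for any match of 128 bytes or more.]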
+ */
+typedef struct config_s {
+ ush good_length; /* reduce lazy search above this match length */
+ ush max_lazy; /* do not perform lazy search above this match length */
+ ush nice_length; /* quit search above this match length */
+ ush max_chain;
+ compress_func func;
+} config;
+
+local config configuration_table[10] = {
+/* good lazy nice chain */
+/* 0 */ {0, 0, 0, 0, deflate_stored}, /* store only */
+/* 1 */ {4, 4, 8, 4, deflate_fast}, /* maximum speed, no lazy matches */
+/* 2 */ {4, 5, 16, 8, deflate_fast},
+/* 3 */ {4, 6, 32, 32, deflate_fast},
+
+/* 4 */ {4, 4, 16, 16, deflate_slow}, /* lazy matches */
+/* 5 */ {8, 16, 32, 32, deflate_slow},
+/* 6 */ {8, 16, 128, 128, deflate_slow},
+/* 7 */ {8, 32, 128, 256, deflate_slow},
+/* 8 */ {32, 128, 258, 1024, deflate_slow},
+/* 9 */ {32, 258, 258, 4096, deflate_slow}}; /* maximum compression */
+
+/* Note: the deflate() code requires max_lazy >= MIN_MATCH and max_chain >= 4
+ * For deflate_fast() (levels <= 3) good is ignored and lazy has a different
+ * meaning.
+ */
+
+#define EQUAL 0
+/* result of memcmp for equal strings */
+
+#ifndef NO_DUMMY_DECL
+struct static_tree_desc_s {int dummy;}; /* for buggy compilers */
+#endif
+
+/* ===========================================================================
+ * Update a hash value with the given input byte
+ * IN assertion: all calls to UPDATE_HASH are made with consecutive
+ * input characters, so that a running hash key can be computed from the
+ * previous key instead of complete recalculation each time.
+ */
+#define UPDATE_HASH(s,h,c) (h = (((h)<<s->hash_shift) ^ (c)) & s->hash_mask)
+
+
+/* ===========================================================================
+ * Insert string str in the dictionary and set match_head to the previous head
+ * of the hash chain (the most recent string with same hash key). Return
+ * the previous length of the hash chain.
+ * IN assertion: all calls to INSERT_STRING are made with consecutive
+ * input characters and the first MIN_MATCH bytes of str are valid
+ * (except for the last MIN_MATCH-1 bytes of the input file).
+ */
+#define INSERT_STRING(s, str, match_head) \
+ (UPDATE_HASH(s, s->ins_h, s->window[(str) + (MIN_MATCH-1)]), \
+ s->prev[(str) & s->w_mask] = match_head = s->head[s->ins_h], \
+ s->head[s->ins_h] = (Pos)(str))
+
+/* ===========================================================================
+ * Initialize the hash table (avoiding 64K overflow for 16 bit systems).
+ * prev[] will be initialized on the fly.
+ */
+#define CLEAR_HASH(s) \
+ s->head[s->hash_size-1] = NIL; \
+ zmemzero((charf *)s->head, (unsigned)(s->hash_size-1)*sizeof(*s->head));
+
+/* ========================================================================= */
+int deflateInit_(strm, level, version, stream_size)
+ z_streamp strm;
+ int level;
+ const char *version;
+ int stream_size;
+{
+ return deflateInit2_(strm, level, Z_DEFLATED, MAX_WBITS, DEF_MEM_LEVEL,
+ Z_DEFAULT_STRATEGY, version, stream_size);
+ /* To do: ignore strm->next_in if we use it as window */
+}
+
+/* ========================================================================= */
+int deflateInit2_(strm, level, method, windowBits, memLevel, strategy,
+ version, stream_size)
+ z_streamp strm;
+ int level;
+ int method;
+ int windowBits;
+ int memLevel;
+ int strategy;
+ const char *version;
+ int stream_size;
+{
+ deflate_state *s;
+ int noheader = 0;
+ static char* my_version = ZLIB_VERSION;
+
+ ushf *overlay;
+ /* We overlay pending_buf and d_buf+l_buf.
This works since the average + * output size for (length,distance) codes is <= 24 bits. + */ + + if (version == Z_NULL || version[0] != my_version[0] || + stream_size != sizeof(z_stream)) { + return Z_VERSION_ERROR; + } + if (strm == Z_NULL) return Z_STREAM_ERROR; + + strm->msg = Z_NULL; +#ifndef NO_ZCFUNCS + if (strm->zalloc == Z_NULL) { + strm->zalloc = zcalloc; + strm->opaque = (voidpf)0; + } + if (strm->zfree == Z_NULL) strm->zfree = zcfree; +#endif + + if (level == Z_DEFAULT_COMPRESSION) level = 6; + + if (windowBits < 0) { /* undocumented feature: suppress zlib header */ + noheader = 1; + windowBits = -windowBits; + } + if (memLevel < 1 || memLevel > MAX_MEM_LEVEL || method != Z_DEFLATED || + windowBits < 9 || windowBits > 15 || level < 0 || level > 9 || + strategy < 0 || strategy > Z_HUFFMAN_ONLY) { + return Z_STREAM_ERROR; + } + s = (deflate_state *) ZALLOC(strm, 1, sizeof(deflate_state)); + if (s == Z_NULL) return Z_MEM_ERROR; + strm->state = (struct internal_state FAR *)s; + s->strm = strm; + + s->noheader = noheader; + s->w_bits = windowBits; + s->w_size = 1 << s->w_bits; + s->w_mask = s->w_size - 1; + + s->hash_bits = memLevel + 7; + s->hash_size = 1 << s->hash_bits; + s->hash_mask = s->hash_size - 1; + s->hash_shift = ((s->hash_bits+MIN_MATCH-1)/MIN_MATCH); + + s->window = (Bytef *) ZALLOC(strm, s->w_size, 2*sizeof(Byte)); + s->prev = (Posf *) ZALLOC(strm, s->w_size, sizeof(Pos)); + s->head = (Posf *) ZALLOC(strm, s->hash_size, sizeof(Pos)); + + s->lit_bufsize = 1 << (memLevel + 6); /* 16K elements by default */ + + overlay = (ushf *) ZALLOC(strm, s->lit_bufsize, sizeof(ush)+2); + s->pending_buf = (uchf *) overlay; + s->pending_buf_size = (ulg)s->lit_bufsize * (sizeof(ush)+2L); + + if (s->window == Z_NULL || s->prev == Z_NULL || s->head == Z_NULL || + s->pending_buf == Z_NULL) { + strm->msg = (const char*)ERR_MSG(Z_MEM_ERROR); + deflateEnd (strm); + return Z_MEM_ERROR; + } + s->d_buf = overlay + s->lit_bufsize/sizeof(ush); + s->l_buf = s->pending_buf + (1+sizeof(ush))*s->lit_bufsize; + + s->level = level; + s->strategy = strategy; + s->method = (Byte)method; + + return deflateReset(strm); +} + +/* ========================================================================= */ +int deflateSetDictionary (strm, dictionary, dictLength) + z_streamp strm; + const Bytef *dictionary; + uInt dictLength; +{ + deflate_state *s; + uInt length = dictLength; + uInt n; + IPos hash_head = 0; + + if (strm == Z_NULL || strm->state == Z_NULL || dictionary == Z_NULL) + return Z_STREAM_ERROR; + + s = (deflate_state *) strm->state; + if (s->status != INIT_STATE) return Z_STREAM_ERROR; + + strm->adler = adler32(strm->adler, dictionary, dictLength); + + if (length < MIN_MATCH) return Z_OK; + if (length > MAX_DIST(s)) { + length = MAX_DIST(s); +#ifndef USE_DICT_HEAD + dictionary += dictLength - length; /* use the tail of the dictionary */ +#endif + } + zmemcpy((charf *)s->window, dictionary, length); + s->strstart = length; + s->block_start = (long)length; + + /* Insert all strings in the hash table (except for the last two bytes). + * s->lookahead stays null, so s->ins_h will be recomputed at the next + * call of fill_window. 
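+ *
+ * [Editorial aside, not part of the original zlib source: a minimal,
+ * hypothetical caller. The dictionary must be set after deflateInit()
+ * and before the first deflate() call, since the code above requires
+ * status == INIT_STATE:
+ *
+ * z_stream zs; memset(&zs, 0, sizeof(zs));
+ * deflateInit(&zs, Z_DEFAULT_COMPRESSION);
+ * deflateSetDictionary(&zs, dict, dict_len);
+ * ... deflate(&zs, ...) ...
+ *
+ * where dict/dict_len name an application-supplied buffer. Only the
+ * last MAX_DIST(s) bytes of a long dictionary are kept, as above.]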
+ */ + s->ins_h = s->window[0]; + UPDATE_HASH(s, s->ins_h, s->window[1]); + for (n = 0; n <= length - MIN_MATCH; n++) { + INSERT_STRING(s, n, hash_head); + } + if (hash_head) hash_head = 0; /* to make compiler happy */ + return Z_OK; +} + +/* ========================================================================= */ +int deflateReset (strm) + z_streamp strm; +{ + deflate_state *s; + + if (strm == Z_NULL || strm->state == Z_NULL || + strm->zalloc == Z_NULL || strm->zfree == Z_NULL) return Z_STREAM_ERROR; + + strm->total_in = strm->total_out = 0; + strm->msg = Z_NULL; /* use zfree if we ever allocate msg dynamically */ + strm->data_type = Z_UNKNOWN; + + s = (deflate_state *)strm->state; + s->pending = 0; + s->pending_out = s->pending_buf; + + if (s->noheader < 0) { + s->noheader = 0; /* was set to -1 by deflate(..., Z_FINISH); */ + } + s->status = s->noheader ? BUSY_STATE : INIT_STATE; + strm->adler = 1; + s->last_flush = Z_NO_FLUSH; + + _tr_init(s); + lm_init(s); + + return Z_OK; +} + +/* ========================================================================= */ +int deflateParams(strm, level, strategy) + z_streamp strm; + int level; + int strategy; +{ + deflate_state *s; + compress_func func; + int err = Z_OK; + + if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR; + s = (deflate_state *) strm->state; + + if (level == Z_DEFAULT_COMPRESSION) { + level = 6; + } + if (level < 0 || level > 9 || strategy < 0 || strategy > Z_HUFFMAN_ONLY) { + return Z_STREAM_ERROR; + } + func = configuration_table[s->level].func; + + if (func != configuration_table[level].func && strm->total_in != 0) { + /* Flush the last buffer: */ + err = deflate(strm, Z_PARTIAL_FLUSH); + } + if (s->level != level) { + s->level = level; + s->max_lazy_match = configuration_table[level].max_lazy; + s->good_match = configuration_table[level].good_length; + s->nice_match = configuration_table[level].nice_length; + s->max_chain_length = configuration_table[level].max_chain; + } + s->strategy = strategy; + return err; +} + +/* ========================================================================= + * Put a short in the pending buffer. The 16-bit value is put in MSB order. + * IN assertion: the stream state is correct and there is enough room in + * pending_buf. + */ +local void putShortMSB (s, b) + deflate_state *s; + uInt b; +{ + put_byte(s, (Byte)(b >> 8)); + put_byte(s, (Byte)(b & 0xff)); +} + +/* ========================================================================= + * Flush as much pending output as possible. All deflate() output goes + * through this function so some applications may wish to modify it + * to avoid allocating a large strm->next_out buffer and copying into it. + * (See also read_buf()). 
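+ *
+ * [Editorial aside, not part of the original zlib source: flush_pending()
+ * is the drain side of the usual application loop, sketched here assuming
+ * an out[] buffer and a write_out() routine supplied by the application:
+ *
+ * do {
+ * zs.next_out = out; zs.avail_out = sizeof(out);
+ * err = deflate(&zs, done ? Z_FINISH : Z_NO_FLUSH);
+ * write_out(out, sizeof(out) - zs.avail_out);
+ * } while (zs.avail_out == 0);
+ *
+ * Every byte handed to next_out passes through the zmemcpy() below.]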
+ */
+local void flush_pending(strm)
+ z_streamp strm;
+{
+ deflate_state *s = (deflate_state *) strm->state;
+ unsigned len = s->pending;
+
+ if (len > strm->avail_out) len = strm->avail_out;
+ if (len == 0) return;
+
+ if (strm->next_out != Z_NULL) {
+ zmemcpy(strm->next_out, s->pending_out, len);
+ strm->next_out += len;
+ }
+ s->pending_out += len;
+ strm->total_out += len;
+ strm->avail_out -= len;
+ s->pending -= len;
+ if (s->pending == 0) {
+ s->pending_out = s->pending_buf;
+ }
+}
+
+/* ========================================================================= */
+int deflate (strm, flush)
+ z_streamp strm;
+ int flush;
+{
+ int old_flush; /* value of flush param for previous deflate call */
+ deflate_state *s;
+
+ if (strm == Z_NULL || strm->state == Z_NULL ||
+ flush > Z_FINISH || flush < 0) {
+ return Z_STREAM_ERROR;
+ }
+ s = (deflate_state *) strm->state;
+
+ if ((strm->next_in == Z_NULL && strm->avail_in != 0) ||
+ (s->status == FINISH_STATE && flush != Z_FINISH)) {
+ ERR_RETURN(strm, Z_STREAM_ERROR);
+ }
+ if (strm->avail_out == 0) ERR_RETURN(strm, Z_BUF_ERROR);
+
+ s->strm = strm; /* just in case */
+ old_flush = s->last_flush;
+ s->last_flush = flush;
+
+ /* Write the zlib header */
+ if (s->status == INIT_STATE) {
+
+ uInt header = (Z_DEFLATED + ((s->w_bits-8)<<4)) << 8;
+ uInt level_flags = (s->level-1) >> 1;
+
+ if (level_flags > 3) level_flags = 3;
+ header |= (level_flags << 6);
+ if (s->strstart != 0) header |= PRESET_DICT;
+ header += 31 - (header % 31);
+
+ s->status = BUSY_STATE;
+ putShortMSB(s, header);
+
+ /* Save the adler32 of the preset dictionary: */
+ if (s->strstart != 0) {
+ putShortMSB(s, (uInt)(strm->adler >> 16));
+ putShortMSB(s, (uInt)(strm->adler & 0xffff));
+ }
+ strm->adler = 1L;
+ }
+
+ /* Flush as much pending output as possible */
+ if (s->pending != 0) {
+ flush_pending(strm);
+ if (strm->avail_out == 0) {
+ /* Since avail_out is 0, deflate will be called again with
+ * more output space, but possibly with both pending and
+ * avail_in equal to zero. There won't be anything to do,
+ * but this is not an error situation so make sure we
+ * return OK instead of BUF_ERROR at next call of deflate:
+ */
+ s->last_flush = -1;
+ return Z_OK;
+ }
+
+ /* Make sure there is something to do and avoid duplicate consecutive
+ * flushes. For repeated and useless calls with Z_FINISH, we keep
+ * returning Z_STREAM_END instead of Z_BUF_ERROR.
+ */
+ } else if (strm->avail_in == 0 && flush <= old_flush &&
+ flush != Z_FINISH) {
+ ERR_RETURN(strm, Z_BUF_ERROR);
+ }
+
+ /* User must not provide more input after the first FINISH: */
+ if (s->status == FINISH_STATE && strm->avail_in != 0) {
+ ERR_RETURN(strm, Z_BUF_ERROR);
+ }
+
+ /* Start a new block or continue the current one.
+ */
+ if (strm->avail_in != 0 || s->lookahead != 0 ||
+ (flush != Z_NO_FLUSH && s->status != FINISH_STATE)) {
+ block_state bstate;
+
+ bstate = (*(configuration_table[s->level].func))(s, flush);
+
+ if (bstate == finish_started || bstate == finish_done) {
+ s->status = FINISH_STATE;
+ }
+ if (bstate == need_more || bstate == finish_started) {
+ if (strm->avail_out == 0) {
+ s->last_flush = -1; /* avoid BUF_ERROR next call, see above */
+ }
+ return Z_OK;
+ /* If flush != Z_NO_FLUSH && avail_out == 0, the next call
+ * of deflate should use the same flush parameter to make sure
+ * that the flush is complete. So we don't have to output an
+ * empty block here, this will be done at next call. This also
+ * ensures that for a very small output buffer, we emit at most
+ * one empty block.
+ */ + } + if (bstate == block_done) { + if (flush == Z_PARTIAL_FLUSH) { + _tr_align(s); + } else if (flush == Z_PACKET_FLUSH) { + /* Output just the 3-bit `stored' block type value, + but not a zero length. */ + _tr_stored_type_only(s); + } else { /* FULL_FLUSH or SYNC_FLUSH */ + _tr_stored_block(s, (char*)0, 0L, 0); + /* For a full flush, this empty block will be recognized + * as a special marker by inflate_sync(). + */ + if (flush == Z_FULL_FLUSH) { + CLEAR_HASH(s); /* forget history */ + } + } + flush_pending(strm); + if (strm->avail_out == 0) { + s->last_flush = -1; /* avoid BUF_ERROR at next call, see above */ + return Z_OK; + } + } + } + Assert(strm->avail_out > 0, "bug2"); + + if (flush != Z_FINISH) return Z_OK; + if (s->noheader) return Z_STREAM_END; + + /* Write the zlib trailer (adler32) */ + putShortMSB(s, (uInt)(strm->adler >> 16)); + putShortMSB(s, (uInt)(strm->adler & 0xffff)); + flush_pending(strm); + /* If avail_out is zero, the application will call deflate again + * to flush the rest. + */ + s->noheader = -1; /* write the trailer only once! */ + return s->pending != 0 ? Z_OK : Z_STREAM_END; +} + +/* ========================================================================= */ +int deflateEnd (strm) + z_streamp strm; +{ + int status; + deflate_state *s; + + if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR; + s = (deflate_state *) strm->state; + + status = s->status; + if (status != INIT_STATE && status != BUSY_STATE && + status != FINISH_STATE) { + return Z_STREAM_ERROR; + } + + /* Deallocate in reverse order of allocations: */ + TRY_FREE(strm, s->pending_buf); + TRY_FREE(strm, s->head); + TRY_FREE(strm, s->prev); + TRY_FREE(strm, s->window); + + ZFREE(strm, s); + strm->state = Z_NULL; + + return status == BUSY_STATE ? Z_DATA_ERROR : Z_OK; +} + +/* ========================================================================= + * Copy the source state to the destination state. + */ +int deflateCopy (dest, source) + z_streamp dest; + z_streamp source; +{ + deflate_state *ds; + deflate_state *ss; + ushf *overlay; + + if (source == Z_NULL || dest == Z_NULL || source->state == Z_NULL) + return Z_STREAM_ERROR; + ss = (deflate_state *) source->state; + + zmemcpy(dest, source, sizeof(*dest)); + + ds = (deflate_state *) ZALLOC(dest, 1, sizeof(deflate_state)); + if (ds == Z_NULL) return Z_MEM_ERROR; + dest->state = (struct internal_state FAR *) ds; + zmemcpy(ds, ss, sizeof(*ds)); + ds->strm = dest; + + ds->window = (Bytef *) ZALLOC(dest, ds->w_size, 2*sizeof(Byte)); + ds->prev = (Posf *) ZALLOC(dest, ds->w_size, sizeof(Pos)); + ds->head = (Posf *) ZALLOC(dest, ds->hash_size, sizeof(Pos)); + overlay = (ushf *) ZALLOC(dest, ds->lit_bufsize, sizeof(ush)+2); + ds->pending_buf = (uchf *) overlay; + + if (ds->window == Z_NULL || ds->prev == Z_NULL || ds->head == Z_NULL || + ds->pending_buf == Z_NULL) { + deflateEnd (dest); + return Z_MEM_ERROR; + } + /* ??? 
following zmemcpy doesn't work for 16-bit MSDOS */
+ zmemcpy(ds->window, ss->window, ds->w_size * 2 * sizeof(Byte));
+ zmemcpy(ds->prev, ss->prev, ds->w_size * sizeof(Pos));
+ zmemcpy(ds->head, ss->head, ds->hash_size * sizeof(Pos));
+ zmemcpy(ds->pending_buf, ss->pending_buf, (uInt)ds->pending_buf_size);
+
+ ds->pending_out = ds->pending_buf + (ss->pending_out - ss->pending_buf);
+ ds->d_buf = overlay + ds->lit_bufsize/sizeof(ush);
+ ds->l_buf = ds->pending_buf + (1+sizeof(ush))*ds->lit_bufsize;
+
+ ds->l_desc.dyn_tree = ds->dyn_ltree;
+ ds->d_desc.dyn_tree = ds->dyn_dtree;
+ ds->bl_desc.dyn_tree = ds->bl_tree;
+
+ return Z_OK;
+}
+
+/* ===========================================================================
+ * Return the number of bytes of output which are immediately available
+ * for output from the compressor.
+ */
+int deflateOutputPending (strm)
+ z_streamp strm;
+{
+ if (strm == Z_NULL || strm->state == Z_NULL) return 0;
+
+ return ((deflate_state *)(strm->state))->pending;
+}
+
+/* ===========================================================================
+ * Read a new buffer from the current input stream, update the adler32
+ * and total number of bytes read. All deflate() input goes through
+ * this function so some applications may wish to modify it to avoid
+ * allocating a large strm->next_in buffer and copying from it.
+ * (See also flush_pending()).
+ */
+local int read_buf(strm, buf, size)
+ z_streamp strm;
+ charf *buf;
+ unsigned size;
+{
+ unsigned len = strm->avail_in;
+
+ if (len > size) len = size;
+ if (len == 0) return 0;
+
+ strm->avail_in -= len;
+
+ if (!((deflate_state *)(strm->state))->noheader) {
+ strm->adler = adler32(strm->adler, strm->next_in, len);
+ }
+ zmemcpy(buf, strm->next_in, len);
+ strm->next_in += len;
+ strm->total_in += len;
+
+ return (int)len;
+}
+
+/* ===========================================================================
+ * Initialize the "longest match" routines for a new zlib stream
+ */
+local void lm_init (s)
+ deflate_state *s;
+{
+ s->window_size = (ulg)2L*s->w_size;
+
+ CLEAR_HASH(s);
+
+ /* Set the default configuration parameters:
+ */
+ s->max_lazy_match = configuration_table[s->level].max_lazy;
+ s->good_match = configuration_table[s->level].good_length;
+ s->nice_match = configuration_table[s->level].nice_length;
+ s->max_chain_length = configuration_table[s->level].max_chain;
+
+ s->strstart = 0;
+ s->block_start = 0L;
+ s->lookahead = 0;
+ s->match_length = s->prev_length = MIN_MATCH-1;
+ s->match_available = 0;
+ s->ins_h = 0;
+#ifdef ASMV
+ match_init(); /* initialize the asm code */
+#endif
+}
+
+/* ===========================================================================
+ * Set match_start to the longest match starting at the given string and
+ * return its length. Matches shorter or equal to prev_length are discarded,
+ * in which case the result is equal to prev_length and match_start is
+ * garbage.
+ * IN assertions: cur_match is the head of the hash chain for the current
+ * string (strstart) and its distance is <= MAX_DIST, and prev_length >= 1
+ * OUT assertion: the match length is not greater than s->lookahead.
+ */
+#ifndef ASMV
+/* For 80x86 and 680x0, an optimized version will be provided in match.asm or
+ * match.S. The code will be functionally equivalent.
+ */ +local uInt longest_match(s, cur_match) + deflate_state *s; + IPos cur_match; /* current match */ +{ + unsigned chain_length = s->max_chain_length;/* max hash chain length */ + register Bytef *scan = s->window + s->strstart; /* current string */ + register Bytef *match; /* matched string */ + register int len; /* length of current match */ + int best_len = s->prev_length; /* best match length so far */ + int nice_match = s->nice_match; /* stop if match long enough */ + IPos limit = s->strstart > (IPos)MAX_DIST(s) ? + s->strstart - (IPos)MAX_DIST(s) : NIL; + /* Stop when cur_match becomes <= limit. To simplify the code, + * we prevent matches with the string of window index 0. + */ + Posf *prev = s->prev; + uInt wmask = s->w_mask; + +#ifdef UNALIGNED_OK + /* Compare two bytes at a time. Note: this is not always beneficial. + * Try with and without -DUNALIGNED_OK to check. + */ + register Bytef *strend = s->window + s->strstart + MAX_MATCH - 1; + register ush scan_start = *(ushf*)scan; + register ush scan_end = *(ushf*)(scan+best_len-1); +#else + register Bytef *strend = s->window + s->strstart + MAX_MATCH; + register Byte scan_end1 = scan[best_len-1]; + register Byte scan_end = scan[best_len]; +#endif + + /* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16. + * It is easy to get rid of this optimization if necessary. + */ + Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever"); + + /* Do not waste too much time if we already have a good match: */ + if (s->prev_length >= s->good_match) { + chain_length >>= 2; + } + /* Do not look for matches beyond the end of the input. This is necessary + * to make deflate deterministic. + */ + if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; + + Assert((ulg)s->strstart <= s->window_size-MIN_LOOKAHEAD, "need lookahead"); + + do { + Assert(cur_match < s->strstart, "no future"); + match = s->window + cur_match; + + /* Skip to next match if the match length cannot increase + * or if the match length is less than 2: + */ +#if (defined(UNALIGNED_OK) && MAX_MATCH == 258) + /* This code assumes sizeof(unsigned short) == 2. Do not use + * UNALIGNED_OK if your compiler uses a different size. + */ + if (*(ushf*)(match+best_len-1) != scan_end || + *(ushf*)match != scan_start) continue; + + /* It is not necessary to compare scan[2] and match[2] since they are + * always equal when the other bytes match, given that the hash keys + * are equal and that HASH_BITS >= 8. Compare 2 bytes at a time at + * strstart+3, +5, ... up to strstart+257. We check for insufficient + * lookahead only every 4th comparison; the 128th check will be made + * at strstart+257. If MAX_MATCH-2 is not a multiple of 8, it is + * necessary to put more guard bytes at the end of the window, or + * to check more often for insufficient lookahead. 
+ */ + Assert(scan[2] == match[2], "scan[2]?"); + scan++, match++; + do { + } while (*(ushf*)(scan+=2) == *(ushf*)(match+=2) && + *(ushf*)(scan+=2) == *(ushf*)(match+=2) && + *(ushf*)(scan+=2) == *(ushf*)(match+=2) && + *(ushf*)(scan+=2) == *(ushf*)(match+=2) && + scan < strend); + /* The funny "do {}" generates better code on most compilers */ + + /* Here, scan <= window+strstart+257 */ + Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan"); + if (*scan == *match) scan++; + + len = (MAX_MATCH - 1) - (int)(strend-scan); + scan = strend - (MAX_MATCH-1); + +#else /* UNALIGNED_OK */ + + if (match[best_len] != scan_end || + match[best_len-1] != scan_end1 || + *match != *scan || + *++match != scan[1]) continue; + + /* The check at best_len-1 can be removed because it will be made + * again later. (This heuristic is not always a win.) + * It is not necessary to compare scan[2] and match[2] since they + * are always equal when the other bytes match, given that + * the hash keys are equal and that HASH_BITS >= 8. + */ + scan += 2, match++; + Assert(*scan == *match, "match[2]?"); + + /* We check for insufficient lookahead only every 8th comparison; + * the 256th check will be made at strstart+258. + */ + do { + } while (*++scan == *++match && *++scan == *++match && + *++scan == *++match && *++scan == *++match && + *++scan == *++match && *++scan == *++match && + *++scan == *++match && *++scan == *++match && + scan < strend); + + Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan"); + + len = MAX_MATCH - (int)(strend - scan); + scan = strend - MAX_MATCH; + +#endif /* UNALIGNED_OK */ + + if (len > best_len) { + s->match_start = cur_match; + best_len = len; + if (len >= nice_match) break; +#ifdef UNALIGNED_OK + scan_end = *(ushf*)(scan+best_len-1); +#else + scan_end1 = scan[best_len-1]; + scan_end = scan[best_len]; +#endif + } + } while ((cur_match = prev[cur_match & wmask]) > limit + && --chain_length != 0); + + if ((uInt)best_len <= s->lookahead) return best_len; + return s->lookahead; +} +#endif /* ASMV */ + +#ifdef DEBUG_ZLIB +/* =========================================================================== + * Check that the match at match_start is indeed a match. + */ +local void check_match(s, start, match, length) + deflate_state *s; + IPos start, match; + int length; +{ + /* check that the match is indeed a match */ + if (zmemcmp((charf *)s->window + match, + (charf *)s->window + start, length) != EQUAL) { + fprintf(stderr, " start %u, match %u, length %d\n", + start, match, length); + do { + fprintf(stderr, "%c%c", s->window[match++], s->window[start++]); + } while (--length != 0); + z_error("invalid match"); + } + if (z_verbose > 1) { + fprintf(stderr,"\\[%d,%d]", start-match, length); + do { putc(s->window[start++], stderr); } while (--length != 0); + } +} +#else +# define check_match(s, start, match, length) +#endif + +/* =========================================================================== + * Fill the window when the lookahead becomes insufficient. + * Updates strstart and lookahead. + * + * IN assertion: lookahead < MIN_LOOKAHEAD + * OUT assertions: strstart <= window_size-MIN_LOOKAHEAD + * At least one byte has been read, or avail_in == 0; reads are + * performed for at least two bytes (required for the zip translate_eol + * option -- not supported here). + */ +local void fill_window(s) + deflate_state *s; +{ + register unsigned n, m; + register Posf *p; + unsigned more; /* Amount of free space at the end of the window. 
*/ + uInt wsize = s->w_size; + + do { + more = (unsigned)(s->window_size -(ulg)s->lookahead -(ulg)s->strstart); + + /* Deal with !@#$% 64K limit: */ + if (more == 0 && s->strstart == 0 && s->lookahead == 0) { + more = wsize; + + } else if (more == (unsigned)(-1)) { + /* Very unlikely, but possible on 16 bit machine if strstart == 0 + * and lookahead == 1 (input done one byte at time) + */ + more--; + + /* If the window is almost full and there is insufficient lookahead, + * move the upper half to the lower one to make room in the upper half. + */ + } else if (s->strstart >= wsize+MAX_DIST(s)) { + + zmemcpy((charf *)s->window, (charf *)s->window+wsize, + (unsigned)wsize); + s->match_start -= wsize; + s->strstart -= wsize; /* we now have strstart >= MAX_DIST */ + s->block_start -= (long) wsize; + + /* Slide the hash table (could be avoided with 32 bit values + at the expense of memory usage). We slide even when level == 0 + to keep the hash table consistent if we switch back to level > 0 + later. (Using level 0 permanently is not an optimal usage of + zlib, so we don't care about this pathological case.) + */ + n = s->hash_size; + p = &s->head[n]; + do { + m = *--p; + *p = (Pos)(m >= wsize ? m-wsize : NIL); + } while (--n); + + n = wsize; + p = &s->prev[n]; + do { + m = *--p; + *p = (Pos)(m >= wsize ? m-wsize : NIL); + /* If n is not on any hash chain, prev[n] is garbage but + * its value will never be used. + */ + } while (--n); + more += wsize; + } + if (s->strm->avail_in == 0) return; + + /* If there was no sliding: + * strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 && + * more == window_size - lookahead - strstart + * => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1) + * => more >= window_size - 2*WSIZE + 2 + * In the BIG_MEM or MMAP case (not yet supported), + * window_size == input_size + MIN_LOOKAHEAD && + * strstart + s->lookahead <= input_size => more >= MIN_LOOKAHEAD. + * Otherwise, window_size == 2*WSIZE so more >= 2. + * If there was sliding, more >= WSIZE. So in all cases, more >= 2. + */ + Assert(more >= 2, "more < 2"); + + n = read_buf(s->strm, (charf *)s->window + s->strstart + s->lookahead, + more); + s->lookahead += n; + + /* Initialize the hash value now that we have some input: */ + if (s->lookahead >= MIN_MATCH) { + s->ins_h = s->window[s->strstart]; + UPDATE_HASH(s, s->ins_h, s->window[s->strstart+1]); +#if MIN_MATCH != 3 + Call UPDATE_HASH() MIN_MATCH-3 more times +#endif + } + /* If the whole input has less than MIN_MATCH bytes, ins_h is garbage, + * but this is not important since only literal bytes will be emitted. + */ + + } while (s->lookahead < MIN_LOOKAHEAD && s->strm->avail_in != 0); +} + +/* =========================================================================== + * Flush the current block, with given end-of-file flag. + * IN assertion: strstart is set to the end of the current match. + */ +#define FLUSH_BLOCK_ONLY(s, eof) { \ + _tr_flush_block(s, (s->block_start >= 0L ? \ + (charf *)&s->window[(unsigned)s->block_start] : \ + (charf *)Z_NULL), \ + (ulg)((long)s->strstart - s->block_start), \ + (eof)); \ + s->block_start = s->strstart; \ + flush_pending(s->strm); \ + Tracev((stderr,"[FLUSH]")); \ +} + +/* Same but force premature exit if necessary. */ +#define FLUSH_BLOCK(s, eof) { \ + FLUSH_BLOCK_ONLY(s, eof); \ + if (s->strm->avail_out == 0) return (eof) ? 
finish_started : need_more; \ +} + +/* =========================================================================== + * Copy without compression as much as possible from the input stream, return + * the current block state. + * This function does not insert new strings in the dictionary since + * uncompressible data is probably not useful. This function is used + * only for the level=0 compression option. + * NOTE: this function should be optimized to avoid extra copying from + * window to pending_buf. + */ +local block_state deflate_stored(s, flush) + deflate_state *s; + int flush; +{ + /* Stored blocks are limited to 0xffff bytes, pending_buf is limited + * to pending_buf_size, and each stored block has a 5 byte header: + */ + ulg max_block_size = 0xffff; + ulg max_start; + + if (max_block_size > s->pending_buf_size - 5) { + max_block_size = s->pending_buf_size - 5; + } + + /* Copy as much as possible from input to output: */ + for (;;) { + /* Fill the window as much as possible: */ + if (s->lookahead <= 1) { + + Assert(s->strstart < s->w_size+MAX_DIST(s) || + s->block_start >= (long)s->w_size, "slide too late"); + + fill_window(s); + if (s->lookahead == 0 && flush == Z_NO_FLUSH) return need_more; + + if (s->lookahead == 0) break; /* flush the current block */ + } + Assert(s->block_start >= 0L, "block gone"); + + s->strstart += s->lookahead; + s->lookahead = 0; + + /* Emit a stored block if pending_buf will be full: */ + max_start = s->block_start + max_block_size; + if (s->strstart == 0 || (ulg)s->strstart >= max_start) { + /* strstart == 0 is possible when wraparound on 16-bit machine */ + s->lookahead = (uInt)(s->strstart - max_start); + s->strstart = (uInt)max_start; + FLUSH_BLOCK(s, 0); + } + /* Flush if we may have to slide, otherwise block_start may become + * negative and the data will be gone: + */ + if (s->strstart - (uInt)s->block_start >= MAX_DIST(s)) { + FLUSH_BLOCK(s, 0); + } + } + FLUSH_BLOCK(s, flush == Z_FINISH); + return flush == Z_FINISH ? finish_done : block_done; +} + +/* =========================================================================== + * Compress as much as possible from the input stream, return the current + * block state. + * This function does not perform lazy evaluation of matches and inserts + * new strings in the dictionary only for unmatched strings or for short + * matches. It is used only for the fast compression options. + */ +local block_state deflate_fast(s, flush) + deflate_state *s; + int flush; +{ + IPos hash_head = NIL; /* head of the hash chain */ + int bflush; /* set if current block must be flushed */ + + for (;;) { + /* Make sure that we always have enough lookahead, except + * at the end of the input file. We need MAX_MATCH bytes + * for the next match, plus MIN_MATCH bytes to insert the + * string following the next match. + */ + if (s->lookahead < MIN_LOOKAHEAD) { + fill_window(s); + if (s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH) { + return need_more; + } + if (s->lookahead == 0) break; /* flush the current block */ + } + + /* Insert the string window[strstart .. strstart+2] in the + * dictionary, and set hash_head to the head of the hash chain: + */ + if (s->lookahead >= MIN_MATCH) { + INSERT_STRING(s, s->strstart, hash_head); + } + + /* Find the longest match, discarding those <= prev_length. 
+ * At this point we have always match_length < MIN_MATCH + */ + if (hash_head != NIL && s->strstart - hash_head <= MAX_DIST(s)) { + /* To simplify the code, we prevent matches with the string + * of window index 0 (in particular we have to avoid a match + * of the string with itself at the start of the input file). + */ + if (s->strategy != Z_HUFFMAN_ONLY) { + s->match_length = longest_match (s, hash_head); + } + /* longest_match() sets match_start */ + } + if (s->match_length >= MIN_MATCH) { + check_match(s, s->strstart, s->match_start, s->match_length); + + bflush = _tr_tally(s, s->strstart - s->match_start, + s->match_length - MIN_MATCH); + + s->lookahead -= s->match_length; + + /* Insert new strings in the hash table only if the match length + * is not too large. This saves time but degrades compression. + */ + if (s->match_length <= s->max_insert_length && + s->lookahead >= MIN_MATCH) { + s->match_length--; /* string at strstart already in hash table */ + do { + s->strstart++; + INSERT_STRING(s, s->strstart, hash_head); + /* strstart never exceeds WSIZE-MAX_MATCH, so there are + * always MIN_MATCH bytes ahead. + */ + } while (--s->match_length != 0); + s->strstart++; + } else { + s->strstart += s->match_length; + s->match_length = 0; + s->ins_h = s->window[s->strstart]; + UPDATE_HASH(s, s->ins_h, s->window[s->strstart+1]); +#if MIN_MATCH != 3 + Call UPDATE_HASH() MIN_MATCH-3 more times +#endif + /* If lookahead < MIN_MATCH, ins_h is garbage, but it does not + * matter since it will be recomputed at next deflate call. + */ + } + } else { + /* No match, output a literal byte */ + Tracevv((stderr,"%c", s->window[s->strstart])); + bflush = _tr_tally (s, 0, s->window[s->strstart]); + s->lookahead--; + s->strstart++; + } + if (bflush) FLUSH_BLOCK(s, 0); + } + FLUSH_BLOCK(s, flush == Z_FINISH); + return flush == Z_FINISH ? finish_done : block_done; +} + +/* =========================================================================== + * Same as above, but achieves better compression. We use a lazy + * evaluation for matches: a match is finally adopted only if there is + * no better match at the next window position. + */ +local block_state deflate_slow(s, flush) + deflate_state *s; + int flush; +{ + IPos hash_head = NIL; /* head of hash chain */ + int bflush; /* set if current block must be flushed */ + + /* Process the input block. */ + for (;;) { + /* Make sure that we always have enough lookahead, except + * at the end of the input file. We need MAX_MATCH bytes + * for the next match, plus MIN_MATCH bytes to insert the + * string following the next match. + */ + if (s->lookahead < MIN_LOOKAHEAD) { + fill_window(s); + if (s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH) { + return need_more; + } + if (s->lookahead == 0) break; /* flush the current block */ + } + + /* Insert the string window[strstart .. strstart+2] in the + * dictionary, and set hash_head to the head of the hash chain: + */ + if (s->lookahead >= MIN_MATCH) { + INSERT_STRING(s, s->strstart, hash_head); + } + + /* Find the longest match, discarding those <= prev_length. + */ + s->prev_length = s->match_length, s->prev_match = s->match_start; + s->match_length = MIN_MATCH-1; + + if (hash_head != NIL && s->prev_length < s->max_lazy_match && + s->strstart - hash_head <= MAX_DIST(s)) { + /* To simplify the code, we prevent matches with the string + * of window index 0 (in particular we have to avoid a match + * of the string with itself at the start of the input file). 
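+ *
+ * [Editorial aside, not part of the original zlib source: the TOO_FAR
+ * test a few lines below drops MIN_MATCH-long matches whose distance
+ * exceeds 4096 bytes. A 3-byte match that far back needs a long distance
+ * code plus extra bits, which on average costs more than coding the same
+ * 3 bytes as literals, so discarding it usually shrinks the output.]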
+ */ + if (s->strategy != Z_HUFFMAN_ONLY) { + s->match_length = longest_match (s, hash_head); + } + /* longest_match() sets match_start */ + + if (s->match_length <= 5 && (s->strategy == Z_FILTERED || + (s->match_length == MIN_MATCH && + s->strstart - s->match_start > TOO_FAR))) { + + /* If prev_match is also MIN_MATCH, match_start is garbage + * but we will ignore the current match anyway. + */ + s->match_length = MIN_MATCH-1; + } + } + /* If there was a match at the previous step and the current + * match is not better, output the previous match: + */ + if (s->prev_length >= MIN_MATCH && s->match_length <= s->prev_length) { + uInt max_insert = s->strstart + s->lookahead - MIN_MATCH; + /* Do not insert strings in hash table beyond this. */ + + check_match(s, s->strstart-1, s->prev_match, s->prev_length); + + bflush = _tr_tally(s, s->strstart -1 - s->prev_match, + s->prev_length - MIN_MATCH); + + /* Insert in hash table all strings up to the end of the match. + * strstart-1 and strstart are already inserted. If there is not + * enough lookahead, the last two strings are not inserted in + * the hash table. + */ + s->lookahead -= s->prev_length-1; + s->prev_length -= 2; + do { + if (++s->strstart <= max_insert) { + INSERT_STRING(s, s->strstart, hash_head); + } + } while (--s->prev_length != 0); + s->match_available = 0; + s->match_length = MIN_MATCH-1; + s->strstart++; + + if (bflush) FLUSH_BLOCK(s, 0); + + } else if (s->match_available) { + /* If there was no match at the previous position, output a + * single literal. If there was a match but the current match + * is longer, truncate the previous match to a single literal. + */ + Tracevv((stderr,"%c", s->window[s->strstart-1])); + if (_tr_tally (s, 0, s->window[s->strstart-1])) { + FLUSH_BLOCK_ONLY(s, 0); + } + s->strstart++; + s->lookahead--; + if (s->strm->avail_out == 0) return need_more; + } else { + /* There is no previous match to compare with, wait for + * the next step to decide. + */ + s->match_available = 1; + s->strstart++; + s->lookahead--; + } + } + Assert (flush != Z_NO_FLUSH, "no flush?"); + if (s->match_available) { + Tracevv((stderr,"%c", s->window[s->strstart-1])); + _tr_tally (s, 0, s->window[s->strstart-1]); + s->match_available = 0; + } + FLUSH_BLOCK(s, flush == Z_FINISH); + return flush == Z_FINISH ? finish_done : block_done; +} +/* --- deflate.c */ + +/* +++ trees.c */ +/* trees.c -- output deflated data using Huffman coding + * Copyright (C) 1995-1996 Jean-loup Gailly + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +/* + * ALGORITHM + * + * The "deflation" process uses several Huffman trees. The more + * common source values are represented by shorter bit sequences. + * + * Each code tree is stored in a compressed form which is itself + * a Huffman encoding of the lengths of all the code strings (in + * ascending order by source values). The actual code strings are + * reconstructed from the lengths in the inflate process, as described + * in the deflate specification. + * + * REFERENCES + * + * Deutsch, L.P.,"'Deflate' Compressed Data Format Specification". + * Available in ftp.uu.net:/pub/archiving/zip/doc/deflate-1.1.doc + * + * Storer, James A. + * Data Compression: Methods and Theory, pp. 49-50. + * Computer Science Press, 1988. ISBN 0-7167-8156-5. + * + * Sedgewick, R. + * Algorithms, p290. + * Addison-Wesley, 1983. ISBN 0-201-06672-6. 
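+ *
+ * [Editorial aside, not part of the original zlib source: a worked
+ * example of rebuilding codes from lengths alone, as gen_codes() below
+ * does. For symbols A,B,C,D with lengths 2,1,3,3: bl_count[] is
+ * {0,1,1,2}, so next_code[1] = 0, next_code[2] = (0+1)<<1 = 2 and
+ * next_code[3] = (2+1)<<1 = 6, giving B=0, A=10, C=110, D=111
+ * (before the LSB-first bit reversal applied by bi_reverse()).]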
+ */
+
+/* From: trees.c,v 1.11 1996/07/24 13:41:06 me Exp $ */
+
+/* #include "deflate.h" */
+
+#ifdef DEBUG_ZLIB
+# include <ctype.h>
+#endif
+
+/* ===========================================================================
+ * Constants
+ */
+
+#define MAX_BL_BITS 7
+/* Bit length codes must not exceed MAX_BL_BITS bits */
+
+#define END_BLOCK 256
+/* end of block literal code */
+
+#define REP_3_6 16
+/* repeat previous bit length 3-6 times (2 bits of repeat count) */
+
+#define REPZ_3_10 17
+/* repeat a zero length 3-10 times (3 bits of repeat count) */
+
+#define REPZ_11_138 18
+/* repeat a zero length 11-138 times (7 bits of repeat count) */
+
+local int extra_lbits[LENGTH_CODES] /* extra bits for each length code */
+ = {0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0};
+
+local int extra_dbits[D_CODES] /* extra bits for each distance code */
+ = {0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13};
+
+local int extra_blbits[BL_CODES]/* extra bits for each bit length code */
+ = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,3,7};
+
+local uch bl_order[BL_CODES]
+ = {16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15};
+/* The lengths of the bit length codes are sent in order of decreasing
+ * probability, to avoid transmitting the lengths for unused bit length codes.
+ */
+
+#define Buf_size (8 * 2*sizeof(char))
+/* Number of bits used within bi_buf. (bi_buf might be implemented on
+ * more than 16 bits on some systems.)
+ */
+
+/* ===========================================================================
+ * Local data. These are initialized only once.
+ */
+
+local ct_data static_ltree[L_CODES+2];
+/* The static literal tree. Since the bit lengths are imposed, there is no
+ * need for the L_CODES extra codes used during heap construction. However
+ * the codes 286 and 287 are needed to build a canonical tree (see _tr_init
+ * below).
+ */
+
+local ct_data static_dtree[D_CODES];
+/* The static distance tree. (Actually a trivial tree since all codes use
+ * 5 bits.)
+ */
+
+local uch dist_code[512];
+/* distance codes. The first 256 values correspond to the distances
+ * 3 .. 258, the last 256 values correspond to the top 8 bits of
+ * the 15 bit distances.
+ */
+
+local uch length_code[MAX_MATCH-MIN_MATCH+1];
+/* length code for each normalized match length (0 == MIN_MATCH) */
+
+local int base_length[LENGTH_CODES];
+/* First normalized length for each code (0 = MIN_MATCH) */
+
+local int base_dist[D_CODES];
+/* First normalized distance for each code (0 = distance of 1) */
+
+struct static_tree_desc_s {
+ ct_data *static_tree; /* static tree or NULL */
+ intf *extra_bits; /* extra bits for each code or NULL */
+ int extra_base; /* base index for extra_bits */
+ int elems; /* max number of elements in the tree */
+ int max_length; /* max bit length for the codes */
+};
+
+local static_tree_desc static_l_desc =
+{static_ltree, extra_lbits, LITERALS+1, L_CODES, MAX_BITS};
+
+local static_tree_desc static_d_desc =
+{static_dtree, extra_dbits, 0, D_CODES, MAX_BITS};
+
+local static_tree_desc static_bl_desc =
+{(ct_data *)0, extra_blbits, 0, BL_CODES, MAX_BL_BITS};
+
+/* ===========================================================================
+ * Local (static) routines in this file.
+ */ + +local void tr_static_init OF((void)); +local void init_block OF((deflate_state *s)); +local void pqdownheap OF((deflate_state *s, ct_data *tree, int k)); +local void gen_bitlen OF((deflate_state *s, tree_desc *desc)); +local void gen_codes OF((ct_data *tree, int max_code, ushf *bl_count)); +local void build_tree OF((deflate_state *s, tree_desc *desc)); +local void scan_tree OF((deflate_state *s, ct_data *tree, int max_code)); +local void send_tree OF((deflate_state *s, ct_data *tree, int max_code)); +local int build_bl_tree OF((deflate_state *s)); +local void send_all_trees OF((deflate_state *s, int lcodes, int dcodes, + int blcodes)); +local void compress_block OF((deflate_state *s, ct_data *ltree, + ct_data *dtree)); +local void set_data_type OF((deflate_state *s)); +local unsigned bi_reverse OF((unsigned value, int length)); +local void bi_windup OF((deflate_state *s)); +local void bi_flush OF((deflate_state *s)); +local void copy_block OF((deflate_state *s, charf *buf, unsigned len, + int header)); + +#ifndef DEBUG_ZLIB +# define send_code(s, c, tree) send_bits(s, tree[(c)].Code, tree[(c)].Len) + /* Send a code of the given tree. c and tree must not have side effects */ + +#else /* DEBUG_ZLIB */ +# define send_code(s, c, tree) \ + { if (verbose>2) fprintf(stderr,"\ncd %3d ",(c)); \ + send_bits(s, tree[c].Code, tree[c].Len); } +#endif + +#define d_code(dist) \ + ((dist) < 256 ? dist_code[dist] : dist_code[256+((dist)>>7)]) +/* Mapping from a distance to a distance code. dist is the distance - 1 and + * must not have side effects. dist_code[256] and dist_code[257] are never + * used. + */ + +/* =========================================================================== + * Output a short LSB first on the stream. + * IN assertion: there is enough room in pendingBuf. + */ +#define put_short(s, w) { \ + put_byte(s, (uch)((w) & 0xff)); \ + put_byte(s, (uch)((ush)(w) >> 8)); \ +} + +/* =========================================================================== + * Send a value on a given number of bits. + * IN assertion: length <= 16 and value fits in length bits. + */ +#ifdef DEBUG_ZLIB +local void send_bits OF((deflate_state *s, int value, int length)); + +local void send_bits(s, value, length) + deflate_state *s; + int value; /* value to send */ + int length; /* number of bits */ +{ + Tracevv((stderr," l %2d v %4x ", length, value)); + Assert(length > 0 && length <= 15, "invalid length"); + s->bits_sent += (ulg)length; + + /* If not enough room in bi_buf, use (valid) bits from bi_buf and + * (16 - bi_valid) bits from value, leaving (width - (16-bi_valid)) + * unused bits in value. + */ + if (s->bi_valid > (int)Buf_size - length) { + s->bi_buf |= (value << s->bi_valid); + put_short(s, s->bi_buf); + s->bi_buf = (ush)value >> (Buf_size - s->bi_valid); + s->bi_valid += length - Buf_size; + } else { + s->bi_buf |= value << s->bi_valid; + s->bi_valid += length; + } +} +#else /* !DEBUG_ZLIB */ + +#define send_bits(s, value, length) \ +{ int len = (length);\ + if ((s)->bi_valid > (int)Buf_size - len) {\ + int val = (value);\ + (s)->bi_buf |= (val << (s)->bi_valid);\ + put_short((s), (s)->bi_buf);\ + (s)->bi_buf = (ush)val >> (Buf_size - (s)->bi_valid);\ + (s)->bi_valid += len - Buf_size;\ + } else {\ + (s)->bi_buf |= (value) << (s)->bi_valid;\ + (s)->bi_valid += len;\ + }\ +} +#endif /* DEBUG_ZLIB */ + +/* the arguments must not have side effects */ + +/* =========================================================================== + * Initialize the various 'constant' tables. 
In a multi-threaded environment,
+ * this function may be called by two threads concurrently, but this is
+ * harmless since both invocations do exactly the same thing.
+ */
+local void tr_static_init()
+{
+ static int static_init_done = 0;
+ int n; /* iterates over tree elements */
+ int bits; /* bit counter */
+ int length; /* length value */
+ int code; /* code value */
+ int dist; /* distance index */
+ ush bl_count[MAX_BITS+1];
+ /* number of codes at each bit length for an optimal tree */
+
+ if (static_init_done) return;
+
+ /* Initialize the mapping length (0..255) -> length code (0..28) */
+ length = 0;
+ for (code = 0; code < LENGTH_CODES-1; code++) {
+ base_length[code] = length;
+ for (n = 0; n < (1<<extra_lbits[code]); n++) {
+ length_code[length++] = (uch)code;
+ }
+ }
+ Assert (length == 256, "tr_static_init: length != 256");
+ /* Note that the length 255 (match length 258) can be represented
+ * in two different ways: code 284 + 5 bits or code 285, so we
+ * overwrite length_code[255] to use the best encoding:
+ */
+ length_code[length-1] = (uch)code;
+
+ /* Initialize the mapping dist (0..32K) -> dist code (0..29) */
+ dist = 0;
+ for (code = 0 ; code < 16; code++) {
+ base_dist[code] = dist;
+ for (n = 0; n < (1<<extra_dbits[code]); n++) {
+ dist_code[dist++] = (uch)code;
+ }
+ }
+ Assert (dist == 256, "tr_static_init: dist != 256");
+ dist >>= 7; /* from now on, all distances are divided by 128 */
+ for ( ; code < D_CODES; code++) {
+ base_dist[code] = dist << 7;
+ for (n = 0; n < (1<<(extra_dbits[code]-7)); n++) {
+ dist_code[256 + dist++] = (uch)code;
+ }
+ }
+ Assert (dist == 256, "tr_static_init: 256+dist != 512");
+
+ /* Construct the codes of the static literal tree */
+ for (bits = 0; bits <= MAX_BITS; bits++) bl_count[bits] = 0;
+ n = 0;
+ while (n <= 143) static_ltree[n++].Len = 8, bl_count[8]++;
+ while (n <= 255) static_ltree[n++].Len = 9, bl_count[9]++;
+ while (n <= 279) static_ltree[n++].Len = 7, bl_count[7]++;
+ while (n <= 287) static_ltree[n++].Len = 8, bl_count[8]++;
+ /* Codes 286 and 287 do not exist, but we must include them in the
+ * tree construction to get a canonical Huffman tree (longest code
+ * all ones)
+ */
+ gen_codes((ct_data *)static_ltree, L_CODES+1, bl_count);
+
+ /* The static distance tree is trivial: */
+ for (n = 0; n < D_CODES; n++) {
+ static_dtree[n].Len = 5;
+ static_dtree[n].Code = bi_reverse((unsigned)n, 5);
+ }
+ static_init_done = 1;
+}
+
+/* ===========================================================================
+ * Initialize the tree data structures for a new zlib stream.
+ */
+void _tr_init(s)
+ deflate_state *s;
+{
+ tr_static_init();
+
+ s->compressed_len = 0L;
+
+ s->l_desc.dyn_tree = s->dyn_ltree;
+ s->l_desc.stat_desc = &static_l_desc;
+
+ s->d_desc.dyn_tree = s->dyn_dtree;
+ s->d_desc.stat_desc = &static_d_desc;
+
+ s->bl_desc.dyn_tree = s->bl_tree;
+ s->bl_desc.stat_desc = &static_bl_desc;
+
+ s->bi_buf = 0;
+ s->bi_valid = 0;
+ s->last_eob_len = 8; /* enough lookahead for inflate */
+#ifdef DEBUG_ZLIB
+ s->bits_sent = 0L;
+#endif
+
+ /* Initialize the first block of the first file: */
+ init_block(s);
+}
+
+/* ===========================================================================
+ * Initialize a new block.
+ */
+local void init_block(s)
+ deflate_state *s;
+{
+ int n; /* iterates over tree elements */
+
+ /* Initialize the trees. */
+ for (n = 0; n < L_CODES; n++) s->dyn_ltree[n].Freq = 0;
+ for (n = 0; n < D_CODES; n++) s->dyn_dtree[n].Freq = 0;
+ for (n = 0; n < BL_CODES; n++) s->bl_tree[n].Freq = 0;
+
+ s->dyn_ltree[END_BLOCK].Freq = 1;
+ s->opt_len = s->static_len = 0L;
+ s->last_lit = s->matches = 0;
+}
+
+#define SMALLEST 1
+/* Index within the heap array of least frequent node in the Huffman tree */
+
+
+/* ===========================================================================
+ * Remove the smallest element from the heap and recreate the heap with
+ * one less element. Updates heap and heap_len.
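+ *
+ * [Editorial aside, not part of the original zlib source: heap[] is a
+ * classic binary min-heap of tree node indices, ordered by the smaller()
+ * predicate below (frequency first, subtree depth as tie breaker).
+ * pqremove() pops the least frequent node from heap[SMALLEST], moves the
+ * last element to the root and re-sinks it with pqdownheap(); build_tree()
+ * combines the two least frequent nodes per step to grow the Huffman tree
+ * bottom-up.]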
+ */ +#define pqremove(s, tree, top) \ +{\ + top = s->heap[SMALLEST]; \ + s->heap[SMALLEST] = s->heap[s->heap_len--]; \ + pqdownheap(s, tree, SMALLEST); \ +} + +/* =========================================================================== + * Compares to subtrees, using the tree depth as tie breaker when + * the subtrees have equal frequency. This minimizes the worst case length. + */ +#define smaller(tree, n, m, depth) \ + (tree[n].Freq < tree[m].Freq || \ + (tree[n].Freq == tree[m].Freq && depth[n] <= depth[m])) + +/* =========================================================================== + * Restore the heap property by moving down the tree starting at node k, + * exchanging a node with the smallest of its two sons if necessary, stopping + * when the heap property is re-established (each father smaller than its + * two sons). + */ +local void pqdownheap(s, tree, k) + deflate_state *s; + ct_data *tree; /* the tree to restore */ + int k; /* node to move down */ +{ + int v = s->heap[k]; + int j = k << 1; /* left son of k */ + while (j <= s->heap_len) { + /* Set j to the smallest of the two sons: */ + if (j < s->heap_len && + smaller(tree, s->heap[j+1], s->heap[j], s->depth)) { + j++; + } + /* Exit if v is smaller than both sons */ + if (smaller(tree, v, s->heap[j], s->depth)) break; + + /* Exchange v with the smallest son */ + s->heap[k] = s->heap[j]; k = j; + + /* And continue down the tree, setting j to the left son of k */ + j <<= 1; + } + s->heap[k] = v; +} + +/* =========================================================================== + * Compute the optimal bit lengths for a tree and update the total bit length + * for the current block. + * IN assertion: the fields freq and dad are set, heap[heap_max] and + * above are the tree nodes sorted by increasing frequency. + * OUT assertions: the field len is set to the optimal bit length, the + * array bl_count contains the frequencies for each bit length. + * The length opt_len is updated; static_len is also updated if stree is + * not null. + */ +local void gen_bitlen(s, desc) + deflate_state *s; + tree_desc *desc; /* the tree descriptor */ +{ + ct_data *tree = desc->dyn_tree; + int max_code = desc->max_code; + ct_data *stree = desc->stat_desc->static_tree; + intf *extra = desc->stat_desc->extra_bits; + int base = desc->stat_desc->extra_base; + int max_length = desc->stat_desc->max_length; + int h; /* heap index */ + int n, m; /* iterate over the tree elements */ + int bits; /* bit length */ + int xbits; /* extra bits */ + ush f; /* frequency */ + int overflow = 0; /* number of elements with bit length too large */ + + for (bits = 0; bits <= MAX_BITS; bits++) s->bl_count[bits] = 0; + + /* In a first pass, compute the optimal bit lengths (which may + * overflow in the case of the bit length tree). 
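+ *
+ * [Editor's aside, not part of the original zlib/FreeBSD source: the
+ * repair loop further down works because code lengths are feasible
+ * exactly when they satisfy the Kraft inequality, sum(2^-len) <= 1;
+ * demoting one leaf from depth `bits' to `bits+1' frees enough capacity
+ * to pull one overflowed leaf back up to max_length. A hedged sketch of
+ * the invariant, scaled to integers:]
+ */
+#if 0 /* illustrative editor sketch only, kept out of the build */
+    {
+        unsigned long kraft = 0, one = 1UL << MAX_BITS;
+        int i;
+        for (i = 0; i <= max_code; i++)
+            if (tree[i].Len > 0)
+                kraft += one >> tree[i].Len;  /* adds 2^(MAX_BITS-Len) */
+        Assert(kraft <= one, "lengths violate the Kraft inequality");
+    }
+#endif
+    /*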
+ */
+    tree[s->heap[s->heap_max]].Len = 0; /* root of the heap */
+
+    for (h = s->heap_max+1; h < HEAP_SIZE; h++) {
+        n = s->heap[h];
+        bits = tree[tree[n].Dad].Len + 1;
+        if (bits > max_length) bits = max_length, overflow++;
+        tree[n].Len = (ush)bits;
+        /* We overwrite tree[n].Dad which is no longer needed */
+
+        if (n > max_code) continue; /* not a leaf node */
+
+        s->bl_count[bits]++;
+        xbits = 0;
+        if (n >= base) xbits = extra[n-base];
+        f = tree[n].Freq;
+        s->opt_len += (ulg)f * (bits + xbits);
+        if (stree) s->static_len += (ulg)f * (stree[n].Len + xbits);
+    }
+    if (overflow == 0) return;
+
+    Trace((stderr,"\nbit length overflow\n"));
+    /* This happens for example on obj2 and pic of the Calgary corpus */
+
+    /* Find the first bit length which could increase: */
+    do {
+        bits = max_length-1;
+        while (s->bl_count[bits] == 0) bits--;
+        s->bl_count[bits]--;      /* move one leaf down the tree */
+        s->bl_count[bits+1] += 2; /* move one overflow item as its brother */
+        s->bl_count[max_length]--;
+        /* The brother of the overflow item also moves one step up,
+         * but this does not affect bl_count[max_length]
+         */
+        overflow -= 2;
+    } while (overflow > 0);
+
+    /* Now recompute all bit lengths, scanning in increasing frequency.
+     * h is still equal to HEAP_SIZE. (It is simpler to reconstruct all
+     * lengths instead of fixing only the wrong ones. This idea is taken
+     * from 'ar' written by Haruhiko Okumura.)
+     */
+    for (bits = max_length; bits != 0; bits--) {
+        n = s->bl_count[bits];
+        while (n != 0) {
+            m = s->heap[--h];
+            if (m > max_code) continue;
+            if (tree[m].Len != (unsigned) bits) {
+                Trace((stderr,"code %d bits %d->%d\n", m, tree[m].Len, bits));
+                s->opt_len += ((long)bits - (long)tree[m].Len)
+                              *(long)tree[m].Freq;
+                tree[m].Len = (ush)bits;
+            }
+            n--;
+        }
+    }
+}
+
+/* ===========================================================================
+ * Generate the codes for a given tree and bit counts (which need not be
+ * optimal).
+ * IN assertion: the array bl_count contains the bit length statistics for
+ * the given tree and the field len is set for all tree elements.
+ * OUT assertion: the field code is set for all tree elements of non
+ *     zero code length.
+ */
+local void gen_codes (tree, max_code, bl_count)
+    ct_data *tree;             /* the tree to decorate */
+    int max_code;              /* largest code with non zero frequency */
+    ushf *bl_count;            /* number of codes at each bit length */
+{
+    ush next_code[MAX_BITS+1]; /* next code value for each bit length */
+    ush code = 0;              /* running code value */
+    int bits;                  /* bit index */
+    int n;                     /* code index */
+
+    /* The distribution counts are first used to generate the code values
+     * without bit reversal.
+     */
+    for (bits = 1; bits <= MAX_BITS; bits++) {
+        next_code[bits] = code = (code + bl_count[bits-1]) << 1;
+    }
+    /* Check that the bit counts in bl_count are consistent. The last code
+     * must be all ones.
+     */
+    Assert (code + bl_count[MAX_BITS]-1 == (1<<MAX_BITS)-1,
+            "inconsistent bit counts");
+    Tracev((stderr,"\ngen_codes: max_code %d ", max_code));
+
+    for (n = 0; n <= max_code; n++) {
+        int len = tree[n].Len;
+        if (len == 0) continue;
+        /* Now reverse the bits */
+        tree[n].Code = bi_reverse(next_code[len]++, len);
+
+        Tracecv(tree != static_ltree, (stderr,"\nn %3d %c l %2d c %4x (%x) ",
+            n, (isgraph(n) ? n : ' '), len, tree[n].Code, next_code[len]-1));
+    }
+}
+
+/* ===========================================================================
+ * Construct one Huffman tree and assigns the code bit strings and lengths.
+ * Update the total bit length for the current block.
+ * IN assertion: the field freq is set for all tree elements.
+ * OUT assertions: the fields len and code are set to the optimal bit length
+ *     and corresponding code. The length opt_len is updated; static_len is
+ *     also updated if stree is not null. The field max_code is set.
+ */
+local void build_tree(s, desc)
+    deflate_state *s;
+    tree_desc *desc; /* the tree descriptor */
+{
+    ct_data *tree = desc->dyn_tree;
+    ct_data *stree = desc->stat_desc->static_tree;
+    int elems = desc->stat_desc->elems;
+    int n, m;          /* iterate over heap elements */
+    int max_code = -1; /* largest code with non zero frequency */
+    int node;          /* new node being created */
+
+    /* Construct the initial heap, with least frequent element in
+     * heap[SMALLEST]. The sons of heap[n] are heap[2*n] and heap[2*n+1].
+     * heap[0] is not used.
+ */ + s->heap_len = 0, s->heap_max = HEAP_SIZE; + + for (n = 0; n < elems; n++) { + if (tree[n].Freq != 0) { + s->heap[++(s->heap_len)] = max_code = n; + s->depth[n] = 0; + } else { + tree[n].Len = 0; + } + } + + /* The pkzip format requires that at least one distance code exists, + * and that at least one bit should be sent even if there is only one + * possible code. So to avoid special checks later on we force at least + * two codes of non zero frequency. + */ + while (s->heap_len < 2) { + node = s->heap[++(s->heap_len)] = (max_code < 2 ? ++max_code : 0); + tree[node].Freq = 1; + s->depth[node] = 0; + s->opt_len--; if (stree) s->static_len -= stree[node].Len; + /* node is 0 or 1 so it does not have extra bits */ + } + desc->max_code = max_code; + + /* The elements heap[heap_len/2+1 .. heap_len] are leaves of the tree, + * establish sub-heaps of increasing lengths: + */ + for (n = s->heap_len/2; n >= 1; n--) pqdownheap(s, tree, n); + + /* Construct the Huffman tree by repeatedly combining the least two + * frequent nodes. + */ + node = elems; /* next internal node of the tree */ + do { + pqremove(s, tree, n); /* n = node of least frequency */ + m = s->heap[SMALLEST]; /* m = node of next least frequency */ + + s->heap[--(s->heap_max)] = n; /* keep the nodes sorted by frequency */ + s->heap[--(s->heap_max)] = m; + + /* Create a new node father of n and m */ + tree[node].Freq = tree[n].Freq + tree[m].Freq; + s->depth[node] = (uch) (MAX(s->depth[n], s->depth[m]) + 1); + tree[n].Dad = tree[m].Dad = (ush)node; +#ifdef DUMP_BL_TREE + if (tree == s->bl_tree) { + fprintf(stderr,"\nnode %d(%d), sons %d(%d) %d(%d)", + node, tree[node].Freq, n, tree[n].Freq, m, tree[m].Freq); + } +#endif + /* and insert the new node in the heap */ + s->heap[SMALLEST] = node++; + pqdownheap(s, tree, SMALLEST); + + } while (s->heap_len >= 2); + + s->heap[--(s->heap_max)] = s->heap[SMALLEST]; + + /* At this point, the fields freq and dad are set. We can now + * generate the bit lengths. + */ + gen_bitlen(s, (tree_desc *)desc); + + /* The field len is now set, we can generate the bit codes */ + gen_codes ((ct_data *)tree, max_code, s->bl_count); +} + +/* =========================================================================== + * Scan a literal or distance tree to determine the frequencies of the codes + * in the bit length tree. 
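+ *
+ * [Editor's aside, not part of the original source: the bit length
+ * alphabet has three repeat symbols -- REP_3_6 repeats the previous
+ * non-zero length 3..6 times (2 extra bits), REPZ_3_10 repeats zero
+ * 3..10 times (3 extra bits) and REPZ_11_138 repeats zero 11..138 times
+ * (7 extra bits). A simplified classifier for a run of `count' copies of
+ * `curlen', ignoring the curlen == nextlen refinement handled below:]
+ */
+#if 0 /* illustrative editor sketch only, kept out of the build */
+static const char *demo_classify_run(int curlen, int count)
+{
+    if (count < (curlen != 0 ? 4 : 3))
+        return "run too short: emit the length itself, count times";
+    if (curlen != 0) return "REP_3_6";
+    if (count <= 10) return "REPZ_3_10";
+    return "REPZ_11_138";
+}
+#endif
+/*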
+ */ +local void scan_tree (s, tree, max_code) + deflate_state *s; + ct_data *tree; /* the tree to be scanned */ + int max_code; /* and its largest code of non zero frequency */ +{ + int n; /* iterates over all tree elements */ + int prevlen = -1; /* last emitted length */ + int curlen; /* length of current code */ + int nextlen = tree[0].Len; /* length of next code */ + int count = 0; /* repeat count of the current code */ + int max_count = 7; /* max repeat count */ + int min_count = 4; /* min repeat count */ + + if (nextlen == 0) max_count = 138, min_count = 3; + tree[max_code+1].Len = (ush)0xffff; /* guard */ + + for (n = 0; n <= max_code; n++) { + curlen = nextlen; nextlen = tree[n+1].Len; + if (++count < max_count && curlen == nextlen) { + continue; + } else if (count < min_count) { + s->bl_tree[curlen].Freq += count; + } else if (curlen != 0) { + if (curlen != prevlen) s->bl_tree[curlen].Freq++; + s->bl_tree[REP_3_6].Freq++; + } else if (count <= 10) { + s->bl_tree[REPZ_3_10].Freq++; + } else { + s->bl_tree[REPZ_11_138].Freq++; + } + count = 0; prevlen = curlen; + if (nextlen == 0) { + max_count = 138, min_count = 3; + } else if (curlen == nextlen) { + max_count = 6, min_count = 3; + } else { + max_count = 7, min_count = 4; + } + } +} + +/* =========================================================================== + * Send a literal or distance tree in compressed form, using the codes in + * bl_tree. + */ +local void send_tree (s, tree, max_code) + deflate_state *s; + ct_data *tree; /* the tree to be scanned */ + int max_code; /* and its largest code of non zero frequency */ +{ + int n; /* iterates over all tree elements */ + int prevlen = -1; /* last emitted length */ + int curlen; /* length of current code */ + int nextlen = tree[0].Len; /* length of next code */ + int count = 0; /* repeat count of the current code */ + int max_count = 7; /* max repeat count */ + int min_count = 4; /* min repeat count */ + + /* tree[max_code+1].Len = -1; */ /* guard already set */ + if (nextlen == 0) max_count = 138, min_count = 3; + + for (n = 0; n <= max_code; n++) { + curlen = nextlen; nextlen = tree[n+1].Len; + if (++count < max_count && curlen == nextlen) { + continue; + } else if (count < min_count) { + do { send_code(s, curlen, s->bl_tree); } while (--count != 0); + + } else if (curlen != 0) { + if (curlen != prevlen) { + send_code(s, curlen, s->bl_tree); count--; + } + Assert(count >= 3 && count <= 6, " 3_6?"); + send_code(s, REP_3_6, s->bl_tree); send_bits(s, count-3, 2); + + } else if (count <= 10) { + send_code(s, REPZ_3_10, s->bl_tree); send_bits(s, count-3, 3); + + } else { + send_code(s, REPZ_11_138, s->bl_tree); send_bits(s, count-11, 7); + } + count = 0; prevlen = curlen; + if (nextlen == 0) { + max_count = 138, min_count = 3; + } else if (curlen == nextlen) { + max_count = 6, min_count = 3; + } else { + max_count = 7, min_count = 4; + } + } +} + +/* =========================================================================== + * Construct the Huffman tree for the bit lengths and return the index in + * bl_order of the last bit length code to send. 
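+ *
+ * [Editor's aside, not part of the original source: the cost this
+ * function adds to opt_len is 5+5+4 bits for the HLIT/HDIST/HCLEN
+ * counts plus 3 bits per transmitted bit-length-code length; sending
+ * the lengths in bl_order is what allows the trailing zero entries to
+ * be trimmed. As a hypothetical helper:]
+ */
+#if 0 /* illustrative editor sketch only, kept out of the build */
+static unsigned long demo_bl_header_bits(int max_blindex)
+{
+    /* mirrors "s->opt_len += 3*(max_blindex+1) + 5+5+4" below */
+    return 3UL*(max_blindex+1) + 5+5+4;
+}
+#endif
+/*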
+ */ +local int build_bl_tree(s) + deflate_state *s; +{ + int max_blindex; /* index of last bit length code of non zero freq */ + + /* Determine the bit length frequencies for literal and distance trees */ + scan_tree(s, (ct_data *)s->dyn_ltree, s->l_desc.max_code); + scan_tree(s, (ct_data *)s->dyn_dtree, s->d_desc.max_code); + + /* Build the bit length tree: */ + build_tree(s, (tree_desc *)(&(s->bl_desc))); + /* opt_len now includes the length of the tree representations, except + * the lengths of the bit lengths codes and the 5+5+4 bits for the counts. + */ + + /* Determine the number of bit length codes to send. The pkzip format + * requires that at least 4 bit length codes be sent. (appnote.txt says + * 3 but the actual value used is 4.) + */ + for (max_blindex = BL_CODES-1; max_blindex >= 3; max_blindex--) { + if (s->bl_tree[bl_order[max_blindex]].Len != 0) break; + } + /* Update opt_len to include the bit length tree and counts */ + s->opt_len += 3*(max_blindex+1) + 5+5+4; + Tracev((stderr, "\ndyn trees: dyn %ld, stat %ld", + s->opt_len, s->static_len)); + + return max_blindex; +} + +/* =========================================================================== + * Send the header for a block using dynamic Huffman trees: the counts, the + * lengths of the bit length codes, the literal tree and the distance tree. + * IN assertion: lcodes >= 257, dcodes >= 1, blcodes >= 4. + */ +local void send_all_trees(s, lcodes, dcodes, blcodes) + deflate_state *s; + int lcodes, dcodes, blcodes; /* number of codes for each tree */ +{ + int rank; /* index in bl_order */ + + Assert (lcodes >= 257 && dcodes >= 1 && blcodes >= 4, "not enough codes"); + Assert (lcodes <= L_CODES && dcodes <= D_CODES && blcodes <= BL_CODES, + "too many codes"); + Tracev((stderr, "\nbl counts: ")); + send_bits(s, lcodes-257, 5); /* not +255 as stated in appnote.txt */ + send_bits(s, dcodes-1, 5); + send_bits(s, blcodes-4, 4); /* not -3 as stated in appnote.txt */ + for (rank = 0; rank < blcodes; rank++) { + Tracev((stderr, "\nbl code %2d ", bl_order[rank])); + send_bits(s, s->bl_tree[bl_order[rank]].Len, 3); + } + Tracev((stderr, "\nbl tree: sent %ld", s->bits_sent)); + + send_tree(s, (ct_data *)s->dyn_ltree, lcodes-1); /* literal tree */ + Tracev((stderr, "\nlit tree: sent %ld", s->bits_sent)); + + send_tree(s, (ct_data *)s->dyn_dtree, dcodes-1); /* distance tree */ + Tracev((stderr, "\ndist tree: sent %ld", s->bits_sent)); +} + +/* =========================================================================== + * Send a stored block + */ +void _tr_stored_block(s, buf, stored_len, eof) + deflate_state *s; + charf *buf; /* input block */ + ulg stored_len; /* length of input block */ + int eof; /* true if this is the last block for a file */ +{ + send_bits(s, (STORED_BLOCK<<1)+eof, 3); /* send block type */ + s->compressed_len = (s->compressed_len + 3 + 7) & (ulg)~7L; + s->compressed_len += (stored_len + 4) << 3; + + copy_block(s, buf, (unsigned)stored_len, 1); /* with header */ +} + +/* Send just the `stored block' type code without any length bytes or data. + */ +void _tr_stored_type_only(s) + deflate_state *s; +{ + send_bits(s, (STORED_BLOCK << 1), 3); + bi_windup(s); + s->compressed_len = (s->compressed_len + 3) & ~7L; +} + + +/* =========================================================================== + * Send one empty static block to give enough lookahead for inflate. + * This takes 10 bits, of which 7 may remain in the bit buffer. + * The current inflate code requires 9 bits of lookahead. 
If the + * last two codes for the previous block (real code plus EOB) were coded + * on 5 bits or less, inflate may have only 5+3 bits of lookahead to decode + * the last real code. In this case we send two empty static blocks instead + * of one. (There are no problems if the previous block is stored or fixed.) + * To simplify the code, we assume the worst case of last real code encoded + * on one bit only. + */ +void _tr_align(s) + deflate_state *s; +{ + send_bits(s, STATIC_TREES<<1, 3); + send_code(s, END_BLOCK, static_ltree); + s->compressed_len += 10L; /* 3 for block type, 7 for EOB */ + bi_flush(s); + /* Of the 10 bits for the empty block, we have already sent + * (10 - bi_valid) bits. The lookahead for the last real code (before + * the EOB of the previous block) was thus at least one plus the length + * of the EOB plus what we have just sent of the empty static block. + */ + if (1 + s->last_eob_len + 10 - s->bi_valid < 9) { + send_bits(s, STATIC_TREES<<1, 3); + send_code(s, END_BLOCK, static_ltree); + s->compressed_len += 10L; + bi_flush(s); + } + s->last_eob_len = 7; +} + +/* =========================================================================== + * Determine the best encoding for the current block: dynamic trees, static + * trees or store, and output the encoded block to the zip file. This function + * returns the total compressed length for the file so far. + */ +ulg _tr_flush_block(s, buf, stored_len, eof) + deflate_state *s; + charf *buf; /* input block, or NULL if too old */ + ulg stored_len; /* length of input block */ + int eof; /* true if this is the last block for a file */ +{ + ulg opt_lenb, static_lenb; /* opt_len and static_len in bytes */ + int max_blindex = 0; /* index of last bit length code of non zero freq */ + + /* Build the Huffman trees unless a stored block is forced */ + if (s->level > 0) { + + /* Check if the file is ascii or binary */ + if (s->data_type == Z_UNKNOWN) set_data_type(s); + + /* Construct the literal and distance trees */ + build_tree(s, (tree_desc *)(&(s->l_desc))); + Tracev((stderr, "\nlit data: dyn %ld, stat %ld", s->opt_len, + s->static_len)); + + build_tree(s, (tree_desc *)(&(s->d_desc))); + Tracev((stderr, "\ndist data: dyn %ld, stat %ld", s->opt_len, + s->static_len)); + /* At this point, opt_len and static_len are the total bit lengths of + * the compressed block data, excluding the tree representations. + */ + + /* Build the bit length tree for the above two trees, and get the index + * in bl_order of the last bit length code to send. + */ + max_blindex = build_bl_tree(s); + + /* Determine the best encoding. 
Compute first the block length in bytes*/ + opt_lenb = (s->opt_len+3+7)>>3; + static_lenb = (s->static_len+3+7)>>3; + + Tracev((stderr, "\nopt %lu(%lu) stat %lu(%lu) stored %lu lit %u ", + opt_lenb, s->opt_len, static_lenb, s->static_len, stored_len, + s->last_lit)); + + if (static_lenb <= opt_lenb) opt_lenb = static_lenb; + + } else { + Assert(buf != (char*)0, "lost buf"); + opt_lenb = static_lenb = stored_len + 5; /* force a stored block */ + } + + /* If compression failed and this is the first and last block, + * and if the .zip file can be seeked (to rewrite the local header), + * the whole file is transformed into a stored file: + */ +#ifdef STORED_FILE_OK +# ifdef FORCE_STORED_FILE + if (eof && s->compressed_len == 0L) { /* force stored file */ +# else + if (stored_len <= opt_lenb && eof && s->compressed_len==0L && seekable()) { +# endif + /* Since LIT_BUFSIZE <= 2*WSIZE, the input data must be there: */ + if (buf == (charf*)0) error ("block vanished"); + + copy_block(s, buf, (unsigned)stored_len, 0); /* without header */ + s->compressed_len = stored_len << 3; + s->method = STORED; + } else +#endif /* STORED_FILE_OK */ + +#ifdef FORCE_STORED + if (buf != (char*)0) { /* force stored block */ +#else + if (stored_len+4 <= opt_lenb && buf != (char*)0) { + /* 4: two words for the lengths */ +#endif + /* The test buf != NULL is only necessary if LIT_BUFSIZE > WSIZE. + * Otherwise we can't have processed more than WSIZE input bytes since + * the last block flush, because compression would have been + * successful. If LIT_BUFSIZE <= WSIZE, it is never too late to + * transform a block into a stored block. + */ + _tr_stored_block(s, buf, stored_len, eof); + +#ifdef FORCE_STATIC + } else if (static_lenb >= 0) { /* force static trees */ +#else + } else if (static_lenb == opt_lenb) { +#endif + send_bits(s, (STATIC_TREES<<1)+eof, 3); + compress_block(s, (ct_data *)static_ltree, (ct_data *)static_dtree); + s->compressed_len += 3 + s->static_len; + } else { + send_bits(s, (DYN_TREES<<1)+eof, 3); + send_all_trees(s, s->l_desc.max_code+1, s->d_desc.max_code+1, + max_blindex+1); + compress_block(s, (ct_data *)s->dyn_ltree, (ct_data *)s->dyn_dtree); + s->compressed_len += 3 + s->opt_len; + } + Assert (s->compressed_len == s->bits_sent, "bad compressed size"); + init_block(s); + + if (eof) { + bi_windup(s); + s->compressed_len += 7; /* align on byte boundary */ + } + Tracev((stderr,"\ncomprlen %lu(%lu) ", s->compressed_len>>3, + s->compressed_len-7*eof)); + + return s->compressed_len >> 3; +} + +/* =========================================================================== + * Save the match info and tally the frequency counts. Return true if + * the current block must be flushed. 
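+ *
+ * [Editor's aside, not part of the original source: besides the hard
+ * lit_bufsize-1 limit, the function cuts a block early when matches are
+ * rare and a crude output estimate beats half the input size. The
+ * decision, isolated into a hypothetical helper:]
+ */
+#if 0 /* illustrative editor sketch only, kept out of the build */
+static int demo_should_cut_block(unsigned matches, unsigned last_lit,
+                                 unsigned long out_bytes,
+                                 unsigned long in_bytes)
+{
+    /* mirrors "s->matches < s->last_lit/2 && out_length < in_length/2" */
+    return matches < last_lit/2 && out_bytes < in_bytes/2;
+}
+#endif
+/*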
+ */ +int _tr_tally (s, dist, lc) + deflate_state *s; + unsigned dist; /* distance of matched string */ + unsigned lc; /* match length-MIN_MATCH or unmatched char (if dist==0) */ +{ + s->d_buf[s->last_lit] = (ush)dist; + s->l_buf[s->last_lit++] = (uch)lc; + if (dist == 0) { + /* lc is the unmatched char */ + s->dyn_ltree[lc].Freq++; + } else { + s->matches++; + /* Here, lc is the match length - MIN_MATCH */ + dist--; /* dist = match distance - 1 */ + Assert((ush)dist < (ush)MAX_DIST(s) && + (ush)lc <= (ush)(MAX_MATCH-MIN_MATCH) && + (ush)d_code(dist) < (ush)D_CODES, "_tr_tally: bad match"); + + s->dyn_ltree[length_code[lc]+LITERALS+1].Freq++; + s->dyn_dtree[d_code(dist)].Freq++; + } + + /* Try to guess if it is profitable to stop the current block here */ + if (s->level > 2 && (s->last_lit & 0xfff) == 0) { + /* Compute an upper bound for the compressed length */ + ulg out_length = (ulg)s->last_lit*8L; + ulg in_length = (ulg)((long)s->strstart - s->block_start); + int dcode; + for (dcode = 0; dcode < D_CODES; dcode++) { + out_length += (ulg)s->dyn_dtree[dcode].Freq * + (5L+extra_dbits[dcode]); + } + out_length >>= 3; + Tracev((stderr,"\nlast_lit %u, in %ld, out ~%ld(%ld%%) ", + s->last_lit, in_length, out_length, + 100L - out_length*100L/in_length)); + if (s->matches < s->last_lit/2 && out_length < in_length/2) return 1; + } + return (s->last_lit == s->lit_bufsize-1); + /* We avoid equality with lit_bufsize because of wraparound at 64K + * on 16 bit machines and because stored blocks are restricted to + * 64K-1 bytes. + */ +} + +/* =========================================================================== + * Send the block data compressed using the given Huffman trees + */ +local void compress_block(s, ltree, dtree) + deflate_state *s; + ct_data *ltree; /* literal tree */ + ct_data *dtree; /* distance tree */ +{ + unsigned dist; /* distance of matched string */ + int lc; /* match length or unmatched char (if dist == 0) */ + unsigned lx = 0; /* running index in l_buf */ + unsigned code; /* the code to send */ + int extra; /* number of extra bits to send */ + + if (s->last_lit != 0) do { + dist = s->d_buf[lx]; + lc = s->l_buf[lx++]; + if (dist == 0) { + send_code(s, lc, ltree); /* send a literal byte */ + Tracecv(isgraph(lc), (stderr," '%c' ", lc)); + } else { + /* Here, lc is the match length - MIN_MATCH */ + code = length_code[lc]; + send_code(s, code+LITERALS+1, ltree); /* send the length code */ + extra = extra_lbits[code]; + if (extra != 0) { + lc -= base_length[code]; + send_bits(s, lc, extra); /* send the extra length bits */ + } + dist--; /* dist is now the match distance - 1 */ + code = d_code(dist); + Assert (code < D_CODES, "bad d_code"); + + send_code(s, code, dtree); /* send the distance code */ + extra = extra_dbits[code]; + if (extra != 0) { + dist -= base_dist[code]; + send_bits(s, dist, extra); /* send the extra distance bits */ + } + } /* literal or match pair ? */ + + /* Check that the overlay between pending_buf and d_buf+l_buf is ok: */ + Assert(s->pending < s->lit_bufsize + 2*lx, "pendingBuf overflow"); + + } while (lx < s->last_lit); + + send_code(s, END_BLOCK, ltree); + s->last_eob_len = ltree[END_BLOCK].Len; +} + +/* =========================================================================== + * Set the data type to ASCII or BINARY, using a crude approximation: + * binary if more than 20% of the bytes are <= 6 or >= 128, ascii otherwise. 
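+ * [Editor's aside, not part of the original source: the code compares
+ * bin_freq > ascii_freq>>2, i.e. bin_freq > ascii_freq/4, which is the
+ * same as bin_freq/(bin_freq+ascii_freq) > 1/5 = 20%. For example,
+ * ascii_freq == 80 and bin_freq == 21 gives 21 > 20, hence Z_BINARY:]
+ */
+#if 0 /* illustrative editor sketch only, kept out of the build */
+static int demo_is_binary(unsigned bin_freq, unsigned ascii_freq)
+{
+    return bin_freq > (ascii_freq >> 2);  /* the "more than 20%" rule */
+}
+#endif
+/*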
+ * IN assertion: the fields freq of dyn_ltree are set and the total of all + * frequencies does not exceed 64K (to fit in an int on 16 bit machines). + */ +local void set_data_type(s) + deflate_state *s; +{ + int n = 0; + unsigned ascii_freq = 0; + unsigned bin_freq = 0; + while (n < 7) bin_freq += s->dyn_ltree[n++].Freq; + while (n < 128) ascii_freq += s->dyn_ltree[n++].Freq; + while (n < LITERALS) bin_freq += s->dyn_ltree[n++].Freq; + s->data_type = (Byte)(bin_freq > (ascii_freq >> 2) ? Z_BINARY : Z_ASCII); +} + +/* =========================================================================== + * Reverse the first len bits of a code, using straightforward code (a faster + * method would use a table) + * IN assertion: 1 <= len <= 15 + */ +local unsigned bi_reverse(code, len) + unsigned code; /* the value to invert */ + int len; /* its bit length */ +{ + register unsigned res = 0; + do { + res |= code & 1; + code >>= 1, res <<= 1; + } while (--len > 0); + return res >> 1; +} + +/* =========================================================================== + * Flush the bit buffer, keeping at most 7 bits in it. + */ +local void bi_flush(s) + deflate_state *s; +{ + if (s->bi_valid == 16) { + put_short(s, s->bi_buf); + s->bi_buf = 0; + s->bi_valid = 0; + } else if (s->bi_valid >= 8) { + put_byte(s, (Byte)s->bi_buf); + s->bi_buf >>= 8; + s->bi_valid -= 8; + } +} + +/* =========================================================================== + * Flush the bit buffer and align the output on a byte boundary + */ +local void bi_windup(s) + deflate_state *s; +{ + if (s->bi_valid > 8) { + put_short(s, s->bi_buf); + } else if (s->bi_valid > 0) { + put_byte(s, (Byte)s->bi_buf); + } + s->bi_buf = 0; + s->bi_valid = 0; +#ifdef DEBUG_ZLIB + s->bits_sent = (s->bits_sent+7) & ~7; +#endif +} + +/* =========================================================================== + * Copy a stored block, storing first the length and its + * one's complement if requested. + */ +local void copy_block(s, buf, len, header) + deflate_state *s; + charf *buf; /* the input data */ + unsigned len; /* its length */ + int header; /* true if block header must be written */ +{ + bi_windup(s); /* align on byte boundary */ + s->last_eob_len = 8; /* enough lookahead for inflate */ + + if (header) { + put_short(s, (ush)len); + put_short(s, (ush)~len); +#ifdef DEBUG_ZLIB + s->bits_sent += 2*16; +#endif + } +#ifdef DEBUG_ZLIB + s->bits_sent += (ulg)len<<3; +#endif + /* bundle up the put_byte(s, *buf++) calls */ + zmemcpy(&s->pending_buf[s->pending], buf, len); + s->pending += len; +} +/* --- trees.c */ + +/* +++ inflate.c */ +/* inflate.c -- zlib interface to inflate modules + * Copyright (C) 1995-1996 Mark Adler + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +/* #include */ + +/* +++ infblock.h */ +/* infblock.h -- header to use infblock.c + * Copyright (C) 1995-1996 Mark Adler + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +/* WARNING: this file should *not* be used by applications. It is + part of the implementation of the compression library and is + subject to change. Applications should only use zlib.h. 
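+
+   [Editor's aside, not part of the original source: a sketch of the
+   lifecycle that inflate() drives through the declarations just below;
+   the helper name is hypothetical and applications must not call this
+   API themselves:]
+ */
+#if 0 /* illustrative editor sketch only, kept out of the build */
+static int demo_blocks_lifecycle(z_streamp z)
+{
+    uLong check;
+    inflate_blocks_statef *s =
+        inflate_blocks_new(z, adler32, (uInt)1 << 15); /* 32K window */
+    int r = Z_MEM_ERROR;
+    if (s != Z_NULL) {
+        r = inflate_blocks(s, z, Z_OK); /* consumes z->next_in/avail_in */
+        inflate_blocks_free(s, z, &check);
+    }
+    return r;
+}
+#endif
+/*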
+ */ + +struct inflate_blocks_state; +typedef struct inflate_blocks_state FAR inflate_blocks_statef; + +extern inflate_blocks_statef * inflate_blocks_new OF(( + z_streamp z, + check_func c, /* check function */ + uInt w)); /* window size */ + +extern int inflate_blocks OF(( + inflate_blocks_statef *, + z_streamp , + int)); /* initial return code */ + +extern void inflate_blocks_reset OF(( + inflate_blocks_statef *, + z_streamp , + uLongf *)); /* check value on output */ + +extern int inflate_blocks_free OF(( + inflate_blocks_statef *, + z_streamp , + uLongf *)); /* check value on output */ + +extern void inflate_set_dictionary OF(( + inflate_blocks_statef *s, + const Bytef *d, /* dictionary */ + uInt n)); /* dictionary length */ + +extern int inflate_addhistory OF(( + inflate_blocks_statef *, + z_streamp)); + +extern int inflate_packet_flush OF(( + inflate_blocks_statef *)); +/* --- infblock.h */ + +#ifndef NO_DUMMY_DECL +struct inflate_blocks_state {int dummy;}; /* for buggy compilers */ +#endif + +/* inflate private state */ +struct internal_state { + + /* mode */ + enum { + METHOD, /* waiting for method byte */ + FLAG, /* waiting for flag byte */ + DICT4, /* four dictionary check bytes to go */ + DICT3, /* three dictionary check bytes to go */ + DICT2, /* two dictionary check bytes to go */ + DICT1, /* one dictionary check byte to go */ + DICT0, /* waiting for inflateSetDictionary */ + BLOCKS, /* decompressing blocks */ + CHECK4, /* four check bytes to go */ + CHECK3, /* three check bytes to go */ + CHECK2, /* two check bytes to go */ + CHECK1, /* one check byte to go */ + DONE, /* finished check, done */ + BAD} /* got an error--stay here */ + mode; /* current inflate mode */ + + /* mode dependent information */ + union { + uInt method; /* if FLAGS, method byte */ + struct { + uLong was; /* computed check value */ + uLong need; /* stream check value */ + } check; /* if CHECK, check values to compare */ + uInt marker; /* if BAD, inflateSync's marker bytes count */ + } sub; /* submode */ + + /* mode independent information */ + int nowrap; /* flag for no wrapper */ + uInt wbits; /* log2(window size) (8..15, defaults to 15) */ + inflate_blocks_statef + *blocks; /* current inflate_blocks state */ + +}; + + +int inflateReset(z) +z_streamp z; +{ + uLong c; + + if (z == Z_NULL || z->state == Z_NULL) + return Z_STREAM_ERROR; + z->total_in = z->total_out = 0; + z->msg = Z_NULL; + z->state->mode = z->state->nowrap ? 
BLOCKS : METHOD; + inflate_blocks_reset(z->state->blocks, z, &c); + Trace((stderr, "inflate: reset\n")); + return Z_OK; +} + + +int inflateEnd(z) +z_streamp z; +{ + uLong c; + + if (z == Z_NULL || z->state == Z_NULL || z->zfree == Z_NULL) + return Z_STREAM_ERROR; + if (z->state->blocks != Z_NULL) + inflate_blocks_free(z->state->blocks, z, &c); + ZFREE(z, z->state); + z->state = Z_NULL; + Trace((stderr, "inflate: end\n")); + return Z_OK; +} + + +int inflateInit2_(z, w, version, stream_size) +z_streamp z; +int w; +const char *version; +int stream_size; +{ + if (version == Z_NULL || version[0] != ZLIB_VERSION[0] || + stream_size != sizeof(z_stream)) + return Z_VERSION_ERROR; + + /* initialize state */ + if (z == Z_NULL) + return Z_STREAM_ERROR; + z->msg = Z_NULL; +#ifndef NO_ZCFUNCS + if (z->zalloc == Z_NULL) + { + z->zalloc = zcalloc; + z->opaque = (voidpf)0; + } + if (z->zfree == Z_NULL) z->zfree = zcfree; +#endif + if ((z->state = (struct internal_state FAR *) + ZALLOC(z,1,sizeof(struct internal_state))) == Z_NULL) + return Z_MEM_ERROR; + z->state->blocks = Z_NULL; + + /* handle undocumented nowrap option (no zlib header or check) */ + z->state->nowrap = 0; + if (w < 0) + { + w = - w; + z->state->nowrap = 1; + } + + /* set window size */ + if (w < 8 || w > 15) + { + inflateEnd(z); + return Z_STREAM_ERROR; + } + z->state->wbits = (uInt)w; + + /* create inflate_blocks state */ + if ((z->state->blocks = + inflate_blocks_new(z, z->state->nowrap ? Z_NULL : adler32, (uInt)1 << w)) + == Z_NULL) + { + inflateEnd(z); + return Z_MEM_ERROR; + } + Trace((stderr, "inflate: allocated\n")); + + /* reset state */ + inflateReset(z); + return Z_OK; +} + + +int inflateInit_(z, version, stream_size) +z_streamp z; +const char *version; +int stream_size; +{ + return inflateInit2_(z, DEF_WBITS, version, stream_size); +} + + +#define NEEDBYTE {if(z->avail_in==0)goto empty;r=Z_OK;} +#define NEXTBYTE (z->avail_in--,z->total_in++,*z->next_in++) + +int inflate(z, f) +z_streamp z; +int f; +{ + int r; + uInt b; + + if (z == Z_NULL || z->state == Z_NULL || z->next_in == Z_NULL || f < 0) + return Z_STREAM_ERROR; + r = Z_BUF_ERROR; + while (1) switch (z->state->mode) + { + case METHOD: + NEEDBYTE + if (((z->state->sub.method = NEXTBYTE) & 0xf) != Z_DEFLATED) + { + z->state->mode = BAD; + z->msg = (char*)"unknown compression method"; + z->state->sub.marker = 5; /* can't try inflateSync */ + break; + } + if ((z->state->sub.method >> 4) + 8 > z->state->wbits) + { + z->state->mode = BAD; + z->msg = (char*)"invalid window size"; + z->state->sub.marker = 5; /* can't try inflateSync */ + break; + } + z->state->mode = FLAG; + case FLAG: + NEEDBYTE + b = NEXTBYTE; + if (((z->state->sub.method << 8) + b) % 31) + { + z->state->mode = BAD; + z->msg = (char*)"incorrect header check"; + z->state->sub.marker = 5; /* can't try inflateSync */ + break; + } + Trace((stderr, "inflate: zlib header ok\n")); + if (!(b & PRESET_DICT)) + { + z->state->mode = BLOCKS; + break; + } + z->state->mode = DICT4; + case DICT4: + NEEDBYTE + z->state->sub.check.need = (uLong)NEXTBYTE << 24; + z->state->mode = DICT3; + case DICT3: + NEEDBYTE + z->state->sub.check.need += (uLong)NEXTBYTE << 16; + z->state->mode = DICT2; + case DICT2: + NEEDBYTE + z->state->sub.check.need += (uLong)NEXTBYTE << 8; + z->state->mode = DICT1; + case DICT1: + NEEDBYTE + z->state->sub.check.need += (uLong)NEXTBYTE; + z->adler = z->state->sub.check.need; + z->state->mode = DICT0; + return Z_NEED_DICT; + case DICT0: + z->state->mode = BAD; + z->msg = (char*)"need dictionary"; + 
z->state->sub.marker = 0;       /* can try inflateSync */
+      return Z_STREAM_ERROR;
+    case BLOCKS:
+      r = inflate_blocks(z->state->blocks, z, r);
+      if (f == Z_PACKET_FLUSH && z->avail_in == 0 && z->avail_out != 0)
+          r = inflate_packet_flush(z->state->blocks);
+      if (r == Z_DATA_ERROR)
+      {
+        z->state->mode = BAD;
+        z->state->sub.marker = 0;       /* can try inflateSync */
+        break;
+      }
+      if (r != Z_STREAM_END)
+        return r;
+      r = Z_OK;
+      inflate_blocks_reset(z->state->blocks, z, &z->state->sub.check.was);
+      if (z->state->nowrap)
+      {
+        z->state->mode = DONE;
+        break;
+      }
+      z->state->mode = CHECK4;
+    case CHECK4:
+      NEEDBYTE
+      z->state->sub.check.need = (uLong)NEXTBYTE << 24;
+      z->state->mode = CHECK3;
+    case CHECK3:
+      NEEDBYTE
+      z->state->sub.check.need += (uLong)NEXTBYTE << 16;
+      z->state->mode = CHECK2;
+    case CHECK2:
+      NEEDBYTE
+      z->state->sub.check.need += (uLong)NEXTBYTE << 8;
+      z->state->mode = CHECK1;
+    case CHECK1:
+      NEEDBYTE
+      z->state->sub.check.need += (uLong)NEXTBYTE;
+
+      if (z->state->sub.check.was != z->state->sub.check.need)
+      {
+        z->state->mode = BAD;
+        z->msg = (char*)"incorrect data check";
+        z->state->sub.marker = 5;       /* can't try inflateSync */
+        break;
+      }
+      Trace((stderr, "inflate: zlib check ok\n"));
+      z->state->mode = DONE;
+    case DONE:
+      return Z_STREAM_END;
+    case BAD:
+      return Z_DATA_ERROR;
+    default:
+      return Z_STREAM_ERROR;
+  }
+
+ empty:
+  if (f != Z_PACKET_FLUSH)
+    return r;
+  z->state->mode = BAD;
+  z->msg = (char *)"need more for packet flush";
+  z->state->sub.marker = 0;       /* can try inflateSync */
+  return Z_DATA_ERROR;
+}
+
+
+int inflateSetDictionary(z, dictionary, dictLength)
+z_streamp z;
+const Bytef *dictionary;
+uInt dictLength;
+{
+  uInt length = dictLength;
+
+  if (z == Z_NULL || z->state == Z_NULL || z->state->mode != DICT0)
+    return Z_STREAM_ERROR;
+
+  if (adler32(1L, dictionary, dictLength) != z->adler) return Z_DATA_ERROR;
+  z->adler = 1L;
+
+  if (length >= ((uInt)1<<z->state->wbits))
+  {
+    length = (1<<z->state->wbits)-1;
+    dictionary += dictLength - length;
+  }
+  inflate_set_dictionary(z->state->blocks, dictionary, length);
+  z->state->mode = BLOCKS;
+  return Z_OK;
+}
+
+/*
+ * This subroutine adds the data at next_in/avail_in to the output history
+ * without performing any output. The output buffer must be "caught up";
+ * i.e. no pending output (hence s->read equals s->write), and the state must
+ * be BLOCKS (i.e. we should be willing to see the start of a series of
+ * BLOCKS). On exit, the output will also be caught up, and the checksum
+ * will have been updated if need be.
+ */
+
+int inflateIncomp(z)
+z_stream *z;
+{
+    if (z->state->mode != BLOCKS)
+        return Z_DATA_ERROR;
+    return inflate_addhistory(z->state->blocks, z);
+}
+
+
+int inflateSync(z)
+z_streamp z;
+{
+  uInt n;       /* number of bytes to look at */
+  Bytef *p;     /* pointer to bytes */
+  uInt m;       /* number of marker bytes found in a row */
+  uLong r, w;   /* temporaries to save total_in and total_out */
+
+  /* set up */
+  if (z == Z_NULL || z->state == Z_NULL)
+    return Z_STREAM_ERROR;
+  if (z->state->mode != BAD)
+  {
+    z->state->mode = BAD;
+    z->state->sub.marker = 0;
+  }
+  if ((n = z->avail_in) == 0)
+    return Z_BUF_ERROR;
+  p = z->next_in;
+  m = z->state->sub.marker;
+
+  /* search */
+  while (n && m < 4)
+  {
+    if (*p == (Byte)(m < 2 ?
0 : 0xff)) + m++; + else if (*p) + m = 0; + else + m = 4 - m; + p++, n--; + } + + /* restore */ + z->total_in += p - z->next_in; + z->next_in = p; + z->avail_in = n; + z->state->sub.marker = m; + + /* return no joy or set up to restart on a new block */ + if (m != 4) + return Z_DATA_ERROR; + r = z->total_in; w = z->total_out; + inflateReset(z); + z->total_in = r; z->total_out = w; + z->state->mode = BLOCKS; + return Z_OK; +} + +#undef NEEDBYTE +#undef NEXTBYTE +/* --- inflate.c */ + +/* +++ infblock.c */ +/* infblock.c -- interpret and process block types to last block + * Copyright (C) 1995-1996 Mark Adler + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +/* #include */ +/* #include */ + +/* +++ inftrees.h */ +/* inftrees.h -- header to use inftrees.c + * Copyright (C) 1995-1996 Mark Adler + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +/* WARNING: this file should *not* be used by applications. It is + part of the implementation of the compression library and is + subject to change. Applications should only use zlib.h. + */ + +/* Huffman code lookup table entry--this entry is four bytes for machines + that have 16-bit pointers (e.g. PC's in the small or medium model). */ + +typedef struct inflate_huft_s FAR inflate_huft; + +struct inflate_huft_s { + union { + struct { + Byte Exop; /* number of extra bits or operation */ + Byte Bits; /* number of bits in this code or subcode */ + } what; + Bytef *pad; /* pad structure to a power of 2 (4 bytes for */ + } word; /* 16-bit, 8 bytes for 32-bit machines) */ + union { + uInt Base; /* literal, length base, or distance base */ + inflate_huft *Next; /* pointer to next level of table */ + } more; +}; + +#ifdef DEBUG_ZLIB + extern uInt inflate_hufts; +#endif + +extern int inflate_trees_bits OF(( + uIntf *, /* 19 code lengths */ + uIntf *, /* bits tree desired/actual depth */ + inflate_huft * FAR *, /* bits tree result */ + z_streamp )); /* for zalloc, zfree functions */ + +extern int inflate_trees_dynamic OF(( + uInt, /* number of literal/length codes */ + uInt, /* number of distance codes */ + uIntf *, /* that many (total) code lengths */ + uIntf *, /* literal desired/actual bit depth */ + uIntf *, /* distance desired/actual bit depth */ + inflate_huft * FAR *, /* literal/length tree result */ + inflate_huft * FAR *, /* distance tree result */ + z_streamp )); /* for zalloc, zfree functions */ + +extern int inflate_trees_fixed OF(( + uIntf *, /* literal desired/actual bit depth */ + uIntf *, /* distance desired/actual bit depth */ + inflate_huft * FAR *, /* literal/length tree result */ + inflate_huft * FAR *)); /* distance tree result */ + +extern int inflate_trees_free OF(( + inflate_huft *, /* tables to free */ + z_streamp )); /* for zfree function */ + +/* --- inftrees.h */ + +/* +++ infcodes.h */ +/* infcodes.h -- header to use infcodes.c + * Copyright (C) 1995-1996 Mark Adler + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +/* WARNING: this file should *not* be used by applications. It is + part of the implementation of the compression library and is + subject to change. Applications should only use zlib.h. 
+ */
+
+struct inflate_codes_state;
+typedef struct inflate_codes_state FAR inflate_codes_statef;
+
+extern inflate_codes_statef *inflate_codes_new OF((
+    uInt, uInt,
+    inflate_huft *, inflate_huft *,
+    z_streamp ));
+
+extern int inflate_codes OF((
+    inflate_blocks_statef *,
+    z_streamp ,
+    int));
+
+extern void inflate_codes_free OF((
+    inflate_codes_statef *,
+    z_streamp ));
+
+/* --- infcodes.h */
+
+/* +++ infutil.h */
+/* infutil.h -- types and macros common to blocks and codes
+ * Copyright (C) 1995-1996 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/* WARNING: this file should *not* be used by applications. It is
+   part of the implementation of the compression library and is
+   subject to change. Applications should only use zlib.h.
+ */
+
+#ifndef _INFUTIL_H
+#define _INFUTIL_H
+
+typedef enum {
+      TYPE,     /* get type bits (3, including end bit) */
+      LENS,     /* get lengths for stored */
+      STORED,   /* processing stored block */
+      TABLE,    /* get table lengths */
+      BTREE,    /* get bit lengths tree for a dynamic block */
+      DTREE,    /* get length, distance trees for a dynamic block */
+      CODES,    /* processing fixed or dynamic block */
+      DRY,      /* output remaining window bytes */
+      DONEB,    /* finished last block, done */
+      BADB}     /* got a data error--stuck here */
+inflate_block_mode;
+
+/* inflate blocks semi-private state */
+struct inflate_blocks_state {
+
+  /* mode */
+  inflate_block_mode mode;      /* current inflate_block mode */
+
+  /* mode dependent information */
+  union {
+    uInt left;          /* if STORED, bytes left to copy */
+    struct {
+      uInt table;               /* table lengths (14 bits) */
+      uInt index;               /* index into blens (or border) */
+      uIntf *blens;             /* bit lengths of codes */
+      uInt bb;                  /* bit length tree depth */
+      inflate_huft *tb;         /* bit length decoding tree */
+    } trees;            /* if DTREE, decoding info for trees */
+    struct {
+      inflate_huft *tl;
+      inflate_huft *td;         /* trees to free */
+      inflate_codes_statef
+         *codes;
+    } decode;           /* if CODES, current state */
+  } sub;                /* submode */
+  uInt last;            /* true if this block is the last block */
+
+  /* mode independent information */
+  uInt bitk;            /* bits in bit buffer */
+  uLong bitb;           /* bit buffer */
+  Bytef *window;        /* sliding window */
+  Bytef *end;           /* one byte after sliding window */
+  Bytef *read;          /* window read pointer */
+  Bytef *write;         /* window write pointer */
+  check_func checkfn;   /* check function */
+  uLong check;          /* check on output */
+
+};
+
+
+/* defines for inflate input/output */
+/* update pointers and return */
+#define UPDBITS {s->bitb=b;s->bitk=k;}
+#define UPDIN {z->avail_in=n;z->total_in+=p-z->next_in;z->next_in=p;}
+#define UPDOUT {s->write=q;}
+#define UPDATE {UPDBITS UPDIN UPDOUT}
+#define LEAVE {UPDATE return inflate_flush(s,z,r);}
+/* get bytes and bits */
+#define LOADIN {p=z->next_in;n=z->avail_in;b=s->bitb;k=s->bitk;}
+#define NEEDBYTE {if(n)r=Z_OK;else LEAVE}
+#define NEXTBYTE (n--,*p++)
+#define NEEDBITS(j) {while(k<(j)){NEEDBYTE;b|=((uLong)NEXTBYTE)<<k;k+=8;}}
+#define DUMPBITS(j) {b>>=(j);k-=(j);}
+/* output bytes */
+#define WAVAIL (uInt)(q<s->read?s->read-q-1:s->end-q)
+#define LOADOUT {q=s->write;m=(uInt)WAVAIL;}
+#define WWRAP {if(q==s->end&&s->read!=s->window){q=s->window;m=(uInt)WAVAIL;}}
+#define FLUSH {UPDOUT r=inflate_flush(s,z,r); LOADOUT}
+#define NEEDOUT {if(m==0){WWRAP if(m==0){FLUSH WWRAP if(m==0) LEAVE}}r=Z_OK;}
+#define OUTBYTE(a) {*q++=(Byte)(a);m--;}
+/* load local pointers */
+#define LOAD {LOADIN LOADOUT}
+
+/* masks for lower bits (size given to avoid silly warnings with Visual C++) */
+extern uInt
inflate_mask[17]; + +/* copy as much as possible from the sliding window to the output area */ +extern int inflate_flush OF(( + inflate_blocks_statef *, + z_streamp , + int)); + +#ifndef NO_DUMMY_DECL +struct internal_state {int dummy;}; /* for buggy compilers */ +#endif + +#endif +/* --- infutil.h */ + +#ifndef NO_DUMMY_DECL +struct inflate_codes_state {int dummy;}; /* for buggy compilers */ +#endif + +/* Table for deflate from PKZIP's appnote.txt. */ +local const uInt border[] = { /* Order of the bit length code lengths */ + 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15}; + +/* + Notes beyond the 1.93a appnote.txt: + + 1. Distance pointers never point before the beginning of the output + stream. + 2. Distance pointers can point back across blocks, up to 32k away. + 3. There is an implied maximum of 7 bits for the bit length table and + 15 bits for the actual data. + 4. If only one code exists, then it is encoded using one bit. (Zero + would be more efficient, but perhaps a little confusing.) If two + codes exist, they are coded using one bit each (0 and 1). + 5. There is no way of sending zero distance codes--a dummy must be + sent if there are none. (History: a pre 2.0 version of PKZIP would + store blocks with no distance codes, but this was discovered to be + too harsh a criterion.) Valid only for 1.93a. 2.04c does allow + zero distance codes, which is sent as one code of zero bits in + length. + 6. There are up to 286 literal/length codes. Code 256 represents the + end-of-block. Note however that the static length tree defines + 288 codes just to fill out the Huffman codes. Codes 286 and 287 + cannot be used though, since there is no length base or extra bits + defined for them. Similarily, there are up to 30 distance codes. + However, static trees define 32 codes (all 5 bits) to fill out the + Huffman codes, but the last two had better not show up in the data. + 7. Unzip can check dynamic Huffman blocks for complete code sets. + The exception is that a single code would not be complete (see #4). + 8. The five bits following the block type is really the number of + literal codes sent minus 257. + 9. Length codes 8,16,16 are interpreted as 13 length codes of 8 bits + (1+6+6). Therefore, to output three times the length, you output + three codes (1+1+1), whereas to output four times the same length, + you only need two codes (1+3). Hmm. + 10. In the tree reconstruction algorithm, Code = Code + Increment + only if BitLength(i) is not zero. (Pretty obvious.) + 11. Correction: 4 Bits: # of Bit Length codes - 4 (4 - 19) + 12. Note: length code 284 can represent 227-258, but length code 285 + really is 258. The last length deserves its own, short code + since it gets used a lot in very redundant files. The length + 258 is special since 258 - 3 (the min match length) is 255. + 13. The literal/length and distance code bit lengths are read as a + single stream of lengths. It is possible (and advantageous) for + a repeat code (16, 17, or 18) to go across the boundary between + the two sets of lengths. 
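+
+  [Editor's aside, not part of the original source: notes 8 and 11 map
+  onto the 14-bit sub.trees.table field read in the TABLE state further
+  down. A hypothetical helper splitting that field:]
+ */
+#if 0 /* illustrative editor sketch only, kept out of the build */
+static void demo_split_table_field(uInt t,
+                                   uInt *nlit, uInt *ndist, uInt *nclen)
+{
+    *nlit  = 257 + (t & 0x1f);       /* HLIT:  # literal/length codes */
+    *ndist = 1 + ((t >> 5) & 0x1f);  /* HDIST: # distance codes */
+    *nclen = 4 + ((t >> 10) & 0xf);  /* HCLEN: # bit length code lengths */
+}
+#endif
+/*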
+ */ + + +void inflate_blocks_reset(s, z, c) +inflate_blocks_statef *s; +z_streamp z; +uLongf *c; +{ + if (s->checkfn != Z_NULL) + *c = s->check; + if (s->mode == BTREE || s->mode == DTREE) + ZFREE(z, s->sub.trees.blens); + if (s->mode == CODES) + { + inflate_codes_free(s->sub.decode.codes, z); + inflate_trees_free(s->sub.decode.td, z); + inflate_trees_free(s->sub.decode.tl, z); + } + s->mode = TYPE; + s->bitk = 0; + s->bitb = 0; + s->read = s->write = s->window; + if (s->checkfn != Z_NULL) + z->adler = s->check = (*s->checkfn)(0L, Z_NULL, 0); + Trace((stderr, "inflate: blocks reset\n")); +} + + +inflate_blocks_statef *inflate_blocks_new(z, c, w) +z_streamp z; +check_func c; +uInt w; +{ + inflate_blocks_statef *s; + + if ((s = (inflate_blocks_statef *)ZALLOC + (z,1,sizeof(struct inflate_blocks_state))) == Z_NULL) + return s; + if ((s->window = (Bytef *)ZALLOC(z, 1, w)) == Z_NULL) + { + ZFREE(z, s); + return Z_NULL; + } + s->end = s->window + w; + s->checkfn = c; + s->mode = TYPE; + Trace((stderr, "inflate: blocks allocated\n")); + inflate_blocks_reset(s, z, &s->check); + return s; +} + + +#ifdef DEBUG_ZLIB + extern uInt inflate_hufts; +#endif +int inflate_blocks(s, z, r) +inflate_blocks_statef *s; +z_streamp z; +int r; +{ + uInt t; /* temporary storage */ + uLong b; /* bit buffer */ + uInt k; /* bits in bit buffer */ + Bytef *p; /* input data pointer */ + uInt n; /* bytes available there */ + Bytef *q; /* output window write pointer */ + uInt m; /* bytes to end of window or read pointer */ + + /* copy input/output information to locals (UPDATE macro restores) */ + LOAD + + /* process input based on current state */ + while (1) switch (s->mode) + { + case TYPE: + NEEDBITS(3) + t = (uInt)b & 7; + s->last = t & 1; + switch (t >> 1) + { + case 0: /* stored */ + Trace((stderr, "inflate: stored block%s\n", + s->last ? " (last)" : "")); + DUMPBITS(3) + t = k & 7; /* go to byte boundary */ + DUMPBITS(t) + s->mode = LENS; /* get length of stored block */ + break; + case 1: /* fixed */ + Trace((stderr, "inflate: fixed codes block%s\n", + s->last ? " (last)" : "")); + { + uInt bl, bd; + inflate_huft *tl, *td; + + inflate_trees_fixed(&bl, &bd, &tl, &td); + s->sub.decode.codes = inflate_codes_new(bl, bd, tl, td, z); + if (s->sub.decode.codes == Z_NULL) + { + r = Z_MEM_ERROR; + LEAVE + } + s->sub.decode.tl = Z_NULL; /* don't try to free these */ + s->sub.decode.td = Z_NULL; + } + DUMPBITS(3) + s->mode = CODES; + break; + case 2: /* dynamic */ + Trace((stderr, "inflate: dynamic codes block%s\n", + s->last ? " (last)" : "")); + DUMPBITS(3) + s->mode = TABLE; + break; + case 3: /* illegal */ + DUMPBITS(3) + s->mode = BADB; + z->msg = (char*)"invalid block type"; + r = Z_DATA_ERROR; + LEAVE + } + break; + case LENS: + NEEDBITS(32) + if ((((~b) >> 16) & 0xffff) != (b & 0xffff)) + { + s->mode = BADB; + z->msg = (char*)"invalid stored block lengths"; + r = Z_DATA_ERROR; + LEAVE + } + s->sub.left = (uInt)b & 0xffff; + b = k = 0; /* dump bits */ + Tracev((stderr, "inflate: stored length %u\n", s->sub.left)); + s->mode = s->sub.left ? STORED : (s->last ? DRY : TYPE); + break; + case STORED: + if (n == 0) + LEAVE + NEEDOUT + t = s->sub.left; + if (t > n) t = n; + if (t > m) t = m; + zmemcpy(q, p, t); + p += t; n -= t; + q += t; m -= t; + if ((s->sub.left -= t) != 0) + break; + Tracev((stderr, "inflate: stored end, %lu total out\n", + z->total_out + (q >= s->read ? q - s->read : + (s->end - s->read) + (q - s->window)))); + s->mode = s->last ? 
DRY : TYPE; + break; + case TABLE: + NEEDBITS(14) + s->sub.trees.table = t = (uInt)b & 0x3fff; +#ifndef PKZIP_BUG_WORKAROUND + if ((t & 0x1f) > 29 || ((t >> 5) & 0x1f) > 29) + { + s->mode = BADB; + z->msg = (char*)"too many length or distance symbols"; + r = Z_DATA_ERROR; + LEAVE + } +#endif + t = 258 + (t & 0x1f) + ((t >> 5) & 0x1f); + if (t < 19) + t = 19; + if ((s->sub.trees.blens = (uIntf*)ZALLOC(z, t, sizeof(uInt))) == Z_NULL) + { + r = Z_MEM_ERROR; + LEAVE + } + DUMPBITS(14) + s->sub.trees.index = 0; + Tracev((stderr, "inflate: table sizes ok\n")); + s->mode = BTREE; + case BTREE: + while (s->sub.trees.index < 4 + (s->sub.trees.table >> 10)) + { + NEEDBITS(3) + s->sub.trees.blens[border[s->sub.trees.index++]] = (uInt)b & 7; + DUMPBITS(3) + } + while (s->sub.trees.index < 19) + s->sub.trees.blens[border[s->sub.trees.index++]] = 0; + s->sub.trees.bb = 7; + t = inflate_trees_bits(s->sub.trees.blens, &s->sub.trees.bb, + &s->sub.trees.tb, z); + if (t != Z_OK) + { + r = t; + if (r == Z_DATA_ERROR) { + ZFREE(z, s->sub.trees.blens); + s->mode = BADB; + } + LEAVE + } + s->sub.trees.index = 0; + Tracev((stderr, "inflate: bits tree ok\n")); + s->mode = DTREE; + case DTREE: + while (t = s->sub.trees.table, + s->sub.trees.index < 258 + (t & 0x1f) + ((t >> 5) & 0x1f)) + { + inflate_huft *h; + uInt i, j, c; + + t = s->sub.trees.bb; + NEEDBITS(t) + h = s->sub.trees.tb + ((uInt)b & inflate_mask[t]); + t = h->word.what.Bits; + c = h->more.Base; + if (c < 16) + { + DUMPBITS(t) + s->sub.trees.blens[s->sub.trees.index++] = c; + } + else /* c == 16..18 */ + { + i = c == 18 ? 7 : c - 14; + j = c == 18 ? 11 : 3; + NEEDBITS(t + i) + DUMPBITS(t) + j += (uInt)b & inflate_mask[i]; + DUMPBITS(i) + i = s->sub.trees.index; + t = s->sub.trees.table; + if (i + j > 258 + (t & 0x1f) + ((t >> 5) & 0x1f) || + (c == 16 && i < 1)) + { + inflate_trees_free(s->sub.trees.tb, z); + ZFREE(z, s->sub.trees.blens); + s->mode = BADB; + z->msg = (char*)"invalid bit length repeat"; + r = Z_DATA_ERROR; + LEAVE + } + c = c == 16 ? s->sub.trees.blens[i - 1] : 0; + do { + s->sub.trees.blens[i++] = c; + } while (--j); + s->sub.trees.index = i; + } + } + inflate_trees_free(s->sub.trees.tb, z); + s->sub.trees.tb = Z_NULL; + { + uInt bl, bd; + inflate_huft *tl, *td; + inflate_codes_statef *c; + + bl = 9; /* must be <= 9 for lookahead assumptions */ + bd = 6; /* must be <= 9 for lookahead assumptions */ + t = s->sub.trees.table; +#ifdef DEBUG_ZLIB + inflate_hufts = 0; +#endif + t = inflate_trees_dynamic(257 + (t & 0x1f), 1 + ((t >> 5) & 0x1f), + s->sub.trees.blens, &bl, &bd, &tl, &td, z); + if (t != Z_OK) + { + if (t == (uInt)Z_DATA_ERROR) { + ZFREE(z, s->sub.trees.blens); + s->mode = BADB; + } + r = t; + LEAVE + } + Tracev((stderr, "inflate: trees ok, %d * %d bytes used\n", + inflate_hufts, sizeof(inflate_huft))); + if ((c = inflate_codes_new(bl, bd, tl, td, z)) == Z_NULL) + { + inflate_trees_free(td, z); + inflate_trees_free(tl, z); + r = Z_MEM_ERROR; + LEAVE + } + /* + * this ZFREE must occur *BEFORE* we mess with sub.decode, because + * sub.trees is union'd with sub.decode. 
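+       *
+       * [Editor's aside, not part of the original source: in miniature,
+       * the hazard is that sub.trees and sub.decode occupy the same
+       * bytes, so the reversed order would free through a pointer the
+       * union write has just clobbered:]
+       */
+#if 0 /* illustrative editor sketch only, kept out of the build */
+      s->sub.decode.codes = c;           /* overlays sub.trees.blens... */
+      ZFREE(z, s->sub.trees.blens);      /* ...so this would free garbage */
+#endif
+      /*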
+ */ + ZFREE(z, s->sub.trees.blens); + s->sub.decode.codes = c; + s->sub.decode.tl = tl; + s->sub.decode.td = td; + } + s->mode = CODES; + case CODES: + UPDATE + if ((r = inflate_codes(s, z, r)) != Z_STREAM_END) + return inflate_flush(s, z, r); + r = Z_OK; + inflate_codes_free(s->sub.decode.codes, z); + inflate_trees_free(s->sub.decode.td, z); + inflate_trees_free(s->sub.decode.tl, z); + LOAD + Tracev((stderr, "inflate: codes end, %lu total out\n", + z->total_out + (q >= s->read ? q - s->read : + (s->end - s->read) + (q - s->window)))); + if (!s->last) + { + s->mode = TYPE; + break; + } + if (k > 7) /* return unused byte, if any */ + { + Assert(k < 16, "inflate_codes grabbed too many bytes") + k -= 8; + n++; + p--; /* can always return one */ + } + s->mode = DRY; + case DRY: + FLUSH + if (s->read != s->write) + LEAVE + s->mode = DONEB; + case DONEB: + r = Z_STREAM_END; + LEAVE + case BADB: + r = Z_DATA_ERROR; + LEAVE + default: + r = Z_STREAM_ERROR; + LEAVE + } +} + + +int inflate_blocks_free(s, z, c) +inflate_blocks_statef *s; +z_streamp z; +uLongf *c; +{ + inflate_blocks_reset(s, z, c); + ZFREE(z, s->window); + ZFREE(z, s); + Trace((stderr, "inflate: blocks freed\n")); + return Z_OK; +} + + +void inflate_set_dictionary(s, d, n) +inflate_blocks_statef *s; +const Bytef *d; +uInt n; +{ + zmemcpy((charf *)s->window, d, n); + s->read = s->write = s->window + n; +} + +/* + * This subroutine adds the data at next_in/avail_in to the output history + * without performing any output. The output buffer must be "caught up"; + * i.e. no pending output (hence s->read equals s->write), and the state must + * be BLOCKS (i.e. we should be willing to see the start of a series of + * BLOCKS). On exit, the output will also be caught up, and the checksum + * will have been updated if need be. + */ +int inflate_addhistory(s, z) +inflate_blocks_statef *s; +z_stream *z; +{ + uLong b; /* bit buffer */ /* NOT USED HERE */ + uInt k; /* bits in bit buffer */ /* NOT USED HERE */ + uInt t; /* temporary storage */ + Bytef *p; /* input data pointer */ + uInt n; /* bytes available there */ + Bytef *q; /* output window write pointer */ + uInt m; /* bytes to end of window or read pointer */ + + if (s->read != s->write) + return Z_STREAM_ERROR; + if (s->mode != TYPE) + return Z_DATA_ERROR; + + /* we're ready to rock */ + LOAD + /* while there is input ready, copy to output buffer, moving + * pointers as needed. + */ + while (n) { + t = n; /* how many to do */ + /* is there room until end of buffer? */ + if (t > m) t = m; + /* update check information */ + if (s->checkfn != Z_NULL) + s->check = (*s->checkfn)(s->check, q, t); + zmemcpy(q, p, t); + q += t; + p += t; + n -= t; + z->total_out += t; + s->read = q; /* drag read pointer forward */ +/* WWRAP */ /* expand WWRAP macro by hand to handle s->read */ + if (q == s->end) { + s->read = q = s->window; + m = WAVAIL; + } + } + UPDATE + return Z_OK; +} + + +/* + * At the end of a Deflate-compressed PPP packet, we expect to have seen + * a `stored' block type value but not the (zero) length bytes. 
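+ *
+ * [Editor's aside, not part of the original source: at that point the
+ * block interpreter is parked in LENS -- the 3-bit stored-block header
+ * has been read but the length words have not arrived. The function
+ * below simply rewinds LENS to TYPE so the next packet starts cleanly;
+ * a caller-side sketch with a hypothetical name:]
+ */
+#if 0 /* illustrative editor sketch only, kept out of the build */
+static int demo_end_of_packet(inflate_blocks_statef *s)
+{
+    return inflate_packet_flush(s); /* Z_OK, or Z_DATA_ERROR if mid-block */
+}
+#endif
+/*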
+ */ +int inflate_packet_flush(s) + inflate_blocks_statef *s; +{ + if (s->mode != LENS) + return Z_DATA_ERROR; + s->mode = TYPE; + return Z_OK; +} +/* --- infblock.c */ + +/* +++ inftrees.c */ +/* inftrees.c -- generate Huffman trees for efficient decoding + * Copyright (C) 1995-1996 Mark Adler + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +/* #include */ +/* #include */ + +char inflate_copyright[] = " inflate 1.0.4 Copyright 1995-1996 Mark Adler "; +/* + If you use the zlib library in a product, an acknowledgment is welcome + in the documentation of your product. If for some reason you cannot + include such an acknowledgment, I would appreciate that you keep this + copyright string in the executable of your product. + */ + +#ifndef NO_DUMMY_DECL +struct internal_state {int dummy;}; /* for buggy compilers */ +#endif + +/* simplify the use of the inflate_huft type with some defines */ +#define base more.Base +#define next more.Next +#define exop word.what.Exop +#define bits word.what.Bits + + +local int huft_build OF(( + uIntf *, /* code lengths in bits */ + uInt, /* number of codes */ + uInt, /* number of "simple" codes */ + const uIntf *, /* list of base values for non-simple codes */ + const uIntf *, /* list of extra bits for non-simple codes */ + inflate_huft * FAR*,/* result: starting table */ + uIntf *, /* maximum lookup bits (returns actual) */ + z_streamp )); /* for zalloc function */ + +local voidpf falloc OF(( + voidpf, /* opaque pointer (not used) */ + uInt, /* number of items */ + uInt)); /* size of item */ + +/* Tables for deflate from PKZIP's appnote.txt. */ +local const uInt cplens[31] = { /* Copy lengths for literal codes 257..285 */ + 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31, + 35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0, 0}; + /* see note #13 above about 258 */ +local const uInt cplext[31] = { /* Extra bits for literal codes 257..285 */ + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, + 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0, 112, 112}; /* 112==invalid */ +local const uInt cpdist[30] = { /* Copy offsets for distance codes 0..29 */ + 1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193, + 257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145, + 8193, 12289, 16385, 24577}; +local const uInt cpdext[30] = { /* Extra bits for distance codes */ + 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, + 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, + 12, 12, 13, 13}; + +/* + Huffman code decoding is performed using a multi-level table lookup. + The fastest way to decode is to simply build a lookup table whose + size is determined by the longest code. However, the time it takes + to build this table can also be a factor if the data being decoded + is not very long. The most common codes are necessarily the + shortest codes, so those codes dominate the decoding time, and hence + the speed. The idea is you can have a shorter table that decodes the + shorter, more probable codes, and then point to subsidiary tables for + the longer codes. The time it costs to decode the longer codes is + then traded against the time it takes to make longer tables. + + This results of this trade are in the variables lbits and dbits + below. lbits is the number of bits the first level table for literal/ + length codes can decode in one step, and dbits is the same thing for + the distance codes. Subsequent tables are also less than or equal to + those sizes. 
These values may be adjusted either when all of the + codes are shorter than that, in which case the longest code length in + bits is used, or when the shortest code is *longer* than the requested + table size, in which case the length of the shortest code in bits is + used. + + There are two different values for the two tables, since they code a + different number of possibilities each. The literal/length table + codes 286 possible values, or in a flat code, a little over eight + bits. The distance table codes 30 possible values, or a little less + than five bits, flat. The optimum values for speed end up being + about one bit more than those, so lbits is 8+1 and dbits is 5+1. + The optimum values may differ though from machine to machine, and + possibly even between compilers. Your mileage may vary. + */ + + +/* If BMAX needs to be larger than 16, then h and x[] should be uLong. */ +#define BMAX 15 /* maximum bit length of any code */ +#define N_MAX 288 /* maximum number of codes in any set */ + +#ifdef DEBUG_ZLIB + uInt inflate_hufts; +#endif + +local int huft_build(b, n, s, d, e, t, m, zs) +uIntf *b; /* code lengths in bits (all assumed <= BMAX) */ +uInt n; /* number of codes (assumed <= N_MAX) */ +uInt s; /* number of simple-valued codes (0..s-1) */ +const uIntf *d; /* list of base values for non-simple codes */ +const uIntf *e; /* list of extra bits for non-simple codes */ +inflate_huft * FAR *t; /* result: starting table */ +uIntf *m; /* maximum lookup bits, returns actual */ +z_streamp zs; /* for zalloc function */ +/* Given a list of code lengths and a maximum table size, make a set of + tables to decode that set of codes. Return Z_OK on success, Z_BUF_ERROR + if the given code set is incomplete (the tables are still built in this + case), Z_DATA_ERROR if the input is invalid (an over-subscribed set of + lengths), or Z_MEM_ERROR if not enough memory. 
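+
+   A small worked illustration of those return values: the bit-length set
+   {1,2,3,3} exactly fills the code space (1/2 + 1/4 + 1/8 + 1/8 = 1) and
+   builds with Z_OK; {1,2,3} leaves an eighth of the space unused, so the
+   tables are still built but Z_BUF_ERROR is returned; {1,1,2} over-subscribes
+   the space (1/2 + 1/2 + 1/4 > 1) and fails with Z_DATA_ERROR.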
*/ +{ + + uInt a; /* counter for codes of length k */ + uInt c[BMAX+1]; /* bit length count table */ + uInt f; /* i repeats in table every f entries */ + int g; /* maximum code length */ + int h; /* table level */ + register uInt i; /* counter, current code */ + register uInt j; /* counter */ + register int k; /* number of bits in current code */ + int l; /* bits per table (returned in m) */ + register uIntf *p; /* pointer into c[], b[], or v[] */ + inflate_huft *q; /* points to current table */ + struct inflate_huft_s r; /* table entry for structure assignment */ + inflate_huft *u[BMAX]; /* table stack */ + uInt v[N_MAX]; /* values in order of bit length */ + register int w; /* bits before this table == (l * h) */ + uInt x[BMAX+1]; /* bit offsets, then code stack */ + uIntf *xp; /* pointer into x */ + int y; /* number of dummy codes added */ + uInt z; /* number of entries in current table */ + + + /* Generate counts for each bit length */ + p = c; +#define C0 *p++ = 0; +#define C2 C0 C0 C0 C0 +#define C4 C2 C2 C2 C2 + C4 /* clear c[]--assume BMAX+1 is 16 */ + p = b; i = n; + do { + c[*p++]++; /* assume all entries <= BMAX */ + } while (--i); + if (c[0] == n) /* null input--all zero length codes */ + { + *t = (inflate_huft *)Z_NULL; + *m = 0; + return Z_OK; + } + + + /* Find minimum and maximum length, bound *m by those */ + l = *m; + for (j = 1; j <= BMAX; j++) + if (c[j]) + break; + k = j; /* minimum code length */ + if ((uInt)l < j) + l = j; + for (i = BMAX; i; i--) + if (c[i]) + break; + g = i; /* maximum code length */ + if ((uInt)l > i) + l = i; + *m = l; + + + /* Adjust last length count to fill out codes, if needed */ + for (y = 1 << j; j < i; j++, y <<= 1) + if ((y -= c[j]) < 0) + return Z_DATA_ERROR; + if ((y -= c[i]) < 0) + return Z_DATA_ERROR; + c[i] += y; + + + /* Generate starting offsets into the value table for each length */ + x[1] = j = 0; + p = c + 1; xp = x + 2; + while (--i) { /* note that i == g from above */ + *xp++ = (j += *p++); + } + + + /* Make a table of values in order of bit lengths */ + p = b; i = 0; + do { + if ((j = *p++) != 0) + v[x[j]++] = i; + } while (++i < n); + n = x[g]; /* set n to length of v */ + + + /* Generate the Huffman codes and for each, make the table entries */ + x[0] = i = 0; /* first Huffman code is zero */ + p = v; /* grab values in bit order */ + h = -1; /* no tables yet--level -1 */ + w = -l; /* bits decoded == (l * h) */ + u[0] = (inflate_huft *)Z_NULL; /* just to keep compilers happy */ + q = (inflate_huft *)Z_NULL; /* ditto */ + z = 0; /* ditto */ + + /* go through the bit lengths (k already is bits in shortest code) */ + for (; k <= g; k++) + { + a = c[k]; + while (a--) + { + /* here i is the Huffman code of length k bits for value *p */ + /* make tables up to required level */ + while (k > w + l) + { + h++; + w += l; /* previous table always l bits */ + + /* compute minimum size table less than or equal to l bits */ + z = g - w; + z = z > (uInt)l ? 
l : z; /* table size upper limit */ + if ((f = 1 << (j = k - w)) > a + 1) /* try a k-w bit table */ + { /* too few codes for k-w bit table */ + f -= a + 1; /* deduct codes from patterns left */ + xp = c + k; + if (j < z) + while (++j < z) /* try smaller tables up to z bits */ + { + if ((f <<= 1) <= *++xp) + break; /* enough codes to use up j bits */ + f -= *xp; /* else deduct codes from patterns */ + } + } + z = 1 << j; /* table entries for j-bit table */ + + /* allocate and link in new table */ + if ((q = (inflate_huft *)ZALLOC + (zs,z + 1,sizeof(inflate_huft))) == Z_NULL) + { + if (h) + inflate_trees_free(u[0], zs); + return Z_MEM_ERROR; /* not enough memory */ + } +#ifdef DEBUG_ZLIB + inflate_hufts += z + 1; +#endif + *t = q + 1; /* link to list for huft_free() */ + *(t = &(q->next)) = Z_NULL; + u[h] = ++q; /* table starts after link */ + + /* connect to last table, if there is one */ + if (h) + { + x[h] = i; /* save pattern for backing up */ + r.bits = (Byte)l; /* bits to dump before this table */ + r.exop = (Byte)j; /* bits in this table */ + r.next = q; /* pointer to this table */ + j = i >> (w - l); /* (get around Turbo C bug) */ + u[h-1][j] = r; /* connect to last table */ + } + } + + /* set up table entry in r */ + r.bits = (Byte)(k - w); + if (p >= v + n) + r.exop = 128 + 64; /* out of values--invalid code */ + else if (*p < s) + { + r.exop = (Byte)(*p < 256 ? 0 : 32 + 64); /* 256 is end-of-block */ + r.base = *p++; /* simple code is just the value */ + } + else + { + r.exop = (Byte)(e[*p - s] + 16 + 64);/* non-simple--look up in lists */ + r.base = d[*p++ - s]; + } + + /* fill code-like entries with r */ + f = 1 << (k - w); + for (j = i >> w; j < z; j += f) + q[j] = r; + + /* backwards increment the k-bit code i */ + for (j = 1 << (k - 1); i & j; j >>= 1) + i ^= j; + i ^= j; + + /* backup over finished tables */ + while ((i & ((1 << w) - 1)) != x[h]) + { + h--; /* don't need to update q */ + w -= l; + } + } + } + + + /* Return Z_BUF_ERROR if we were given an incomplete table */ + return y != 0 && g != 1 ? 
Z_BUF_ERROR : Z_OK; +} + + +int inflate_trees_bits(c, bb, tb, z) +uIntf *c; /* 19 code lengths */ +uIntf *bb; /* bits tree desired/actual depth */ +inflate_huft * FAR *tb; /* bits tree result */ +z_streamp z; /* for zfree function */ +{ + int r; + + r = huft_build(c, 19, 19, (uIntf*)Z_NULL, (uIntf*)Z_NULL, tb, bb, z); + if (r == Z_DATA_ERROR) + z->msg = (char*)"oversubscribed dynamic bit lengths tree"; + else if (r == Z_BUF_ERROR || *bb == 0) + { + inflate_trees_free(*tb, z); + z->msg = (char*)"incomplete dynamic bit lengths tree"; + r = Z_DATA_ERROR; + } + return r; +} + + +int inflate_trees_dynamic(nl, nd, c, bl, bd, tl, td, z) +uInt nl; /* number of literal/length codes */ +uInt nd; /* number of distance codes */ +uIntf *c; /* that many (total) code lengths */ +uIntf *bl; /* literal desired/actual bit depth */ +uIntf *bd; /* distance desired/actual bit depth */ +inflate_huft * FAR *tl; /* literal/length tree result */ +inflate_huft * FAR *td; /* distance tree result */ +z_streamp z; /* for zfree function */ +{ + int r; + + /* build literal/length tree */ + r = huft_build(c, nl, 257, cplens, cplext, tl, bl, z); + if (r != Z_OK || *bl == 0) + { + if (r == Z_DATA_ERROR) + z->msg = (char*)"oversubscribed literal/length tree"; + else if (r != Z_MEM_ERROR) + { + inflate_trees_free(*tl, z); + z->msg = (char*)"incomplete literal/length tree"; + r = Z_DATA_ERROR; + } + return r; + } + + /* build distance tree */ + r = huft_build(c + nl, nd, 0, cpdist, cpdext, td, bd, z); + if (r != Z_OK || (*bd == 0 && nl > 257)) + { + if (r == Z_DATA_ERROR) + z->msg = (char*)"oversubscribed distance tree"; + else if (r == Z_BUF_ERROR) { +#ifdef PKZIP_BUG_WORKAROUND + r = Z_OK; + } +#else + inflate_trees_free(*td, z); + z->msg = (char*)"incomplete distance tree"; + r = Z_DATA_ERROR; + } + else if (r != Z_MEM_ERROR) + { + z->msg = (char*)"empty distance tree with lengths"; + r = Z_DATA_ERROR; + } + inflate_trees_free(*tl, z); + return r; +#endif + } + + /* done */ + return Z_OK; +} + + +/* build fixed tables only once--keep them here */ +local int fixed_built = 0; +#define FIXEDH 530 /* number of hufts used by fixed tables */ +local inflate_huft fixed_mem[FIXEDH]; +local uInt fixed_bl; +local uInt fixed_bd; +local inflate_huft *fixed_tl; +local inflate_huft *fixed_td; + + +local voidpf falloc(q, n, s) +voidpf q; /* opaque pointer */ +uInt n; /* number of items */ +uInt s; /* size of item */ +{ + Assert(s == sizeof(inflate_huft) && n <= *(intf *)q, + "inflate_trees falloc overflow"); + *(intf *)q -= n+s-s; /* s-s to avoid warning */ + return (voidpf)(fixed_mem + *(intf *)q); +} + + +int inflate_trees_fixed(bl, bd, tl, td) +uIntf *bl; /* literal desired/actual bit depth */ +uIntf *bd; /* distance desired/actual bit depth */ +inflate_huft * FAR *tl; /* literal/length tree result */ +inflate_huft * FAR *td; /* distance tree result */ +{ + /* build fixed tables if not already (multiple overlapped executions ok) */ + if (!fixed_built) + { + int k; /* temporary variable */ + unsigned c[288]; /* length list for huft_build */ + z_stream z; /* for falloc function */ + int f = FIXEDH; /* number of hufts left in fixed_mem */ + + /* set up fake z_stream for memory routines */ + z.zalloc = falloc; + z.zfree = Z_NULL; + z.opaque = (voidpf)&f; + + /* literal table */ + for (k = 0; k < 144; k++) + c[k] = 8; + for (; k < 256; k++) + c[k] = 9; + for (; k < 280; k++) + c[k] = 7; + for (; k < 288; k++) + c[k] = 8; + fixed_bl = 7; + huft_build(c, 288, 257, cplens, cplext, &fixed_tl, &fixed_bl, &z); + + /* distance table */ + for (k = 
0; k < 30; k++)
+      c[k] = 5;
+    fixed_bd = 5;
+    huft_build(c, 30, 0, cpdist, cpdext, &fixed_td, &fixed_bd, &z);
+
+    /* done */
+    Assert(f == 0, "invalid build of fixed tables");
+    fixed_built = 1;
+  }
+  *bl = fixed_bl;
+  *bd = fixed_bd;
+  *tl = fixed_tl;
+  *td = fixed_td;
+  return Z_OK;
+}
+
+
+int inflate_trees_free(t, z)
+inflate_huft *t;        /* table to free */
+z_streamp z;            /* for zfree function */
+/* Free the malloc'ed tables built by huft_build(), which makes a linked
+   list of the tables it made, with the links in a dummy first entry of
+   each table. */
+{
+  register inflate_huft *p, *q, *r;
+
+  /* Reverse linked list */
+  p = Z_NULL;
+  q = t;
+  while (q != Z_NULL)
+  {
+    r = (q - 1)->next;
+    (q - 1)->next = p;
+    p = q;
+    q = r;
+  }
+  /* Go through linked list, freeing from the malloced (t[-1]) address. */
+  while (p != Z_NULL)
+  {
+    q = (--p)->next;
+    ZFREE(z,p);
+    p = q;
+  }
+  return Z_OK;
+}
+/* --- inftrees.c */
+
+/* +++ infcodes.c */
+/* infcodes.c -- process literals and length/distance pairs
+ * Copyright (C) 1995-1996 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/* #include "zutil.h" */
+/* #include "inftrees.h" */
+/* #include "infblock.h" */
+/* #include "infcodes.h" */
+/* #include "infutil.h" */
+
+/* +++ inffast.h */
+/* inffast.h -- header to use inffast.c
+ * Copyright (C) 1995-1996 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/* WARNING: this file should *not* be used by applications. It is
+   part of the implementation of the compression library and is
+   subject to change. Applications should only use zlib.h.
+ */
+
+extern int inflate_fast OF((
+    uInt,
+    uInt,
+    inflate_huft *,
+    inflate_huft *,
+    inflate_blocks_statef *,
+    z_streamp ));
+/* --- inffast.h */
+
+/* simplify the use of the inflate_huft type with some defines */
+#define base more.Base
+#define next more.Next
+#define exop word.what.Exop
+#define bits word.what.Bits
+
+/* inflate codes private state */
+struct inflate_codes_state {
+
+  /* mode */
+  enum {        /* waiting for "i:"=input, "o:"=output, "x:"=nothing */
+      START,    /* x: set up for LEN */
+      LEN,      /* i: get length/literal/eob next */
+      LENEXT,   /* i: getting length extra (have base) */
+      DIST,     /* i: get distance next */
+      DISTEXT,  /* i: getting distance extra */
+      COPY,     /* o: copying bytes in window, waiting for space */
+      LIT,      /* o: got literal, waiting for output space */
+      WASH,     /* o: got eob, possibly still output waiting */
+      END,      /* x: got eob and all data flushed */
+      BADCODE}  /* x: got error */
+    mode;               /* current inflate_codes mode */
+
+  /* mode dependent information */
+  uInt len;
+  union {
+    struct {
+      inflate_huft *tree;       /* pointer into tree */
+      uInt need;                /* bits needed */
+    } code;             /* if LEN or DIST, where in tree */
+    uInt lit;           /* if LIT, literal */
+    struct {
+      uInt get;                 /* bits to get for extra */
+      uInt dist;                /* distance back to copy from */
+    } copy;             /* if EXT or COPY, where and how much */
+  } sub;                /* submode */
+
+  /* mode independent information */
+  Byte lbits;           /* ltree bits decoded per branch */
+  Byte dbits;           /* dtree bits decoded per branch */
+  inflate_huft *ltree;  /* literal/length/eob tree */
+  inflate_huft *dtree;  /* distance tree */
+
+};
+
+
+inflate_codes_statef *inflate_codes_new(bl, bd, tl, td, z)
+uInt bl, bd;
+inflate_huft *tl;
+inflate_huft *td; /* need separate declaration for Borland C++ */
+z_streamp z;
+{
+  inflate_codes_statef *c;
+
+  if ((c = (inflate_codes_statef *)
+       ZALLOC(z,1,sizeof(struct inflate_codes_state))) != Z_NULL)
+  {
+    c->mode = START;
+    c->lbits = (Byte)bl;
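+    /* root-table bit depths, used below to mask the bit buffer when
+       indexing the first-level literal/length and distance tables */
+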
c->dbits = (Byte)bd; + c->ltree = tl; + c->dtree = td; + Tracev((stderr, "inflate: codes new\n")); + } + return c; +} + + +int inflate_codes(s, z, r) +inflate_blocks_statef *s; +z_streamp z; +int r; +{ + uInt j; /* temporary storage */ + inflate_huft *t; /* temporary pointer */ + uInt e; /* extra bits or operation */ + uLong b; /* bit buffer */ + uInt k; /* bits in bit buffer */ + Bytef *p; /* input data pointer */ + uInt n; /* bytes available there */ + Bytef *q; /* output window write pointer */ + uInt m; /* bytes to end of window or read pointer */ + Bytef *f; /* pointer to copy strings from */ + inflate_codes_statef *c = s->sub.decode.codes; /* codes state */ + + /* copy input/output information to locals (UPDATE macro restores) */ + LOAD + + /* process input and output based on current state */ + while (1) switch (c->mode) + { /* waiting for "i:"=input, "o:"=output, "x:"=nothing */ + case START: /* x: set up for LEN */ +#ifndef SLOW + if (m >= 258 && n >= 10) + { + UPDATE + r = inflate_fast(c->lbits, c->dbits, c->ltree, c->dtree, s, z); + LOAD + if (r != Z_OK) + { + c->mode = r == Z_STREAM_END ? WASH : BADCODE; + break; + } + } +#endif /* !SLOW */ + c->sub.code.need = c->lbits; + c->sub.code.tree = c->ltree; + c->mode = LEN; + case LEN: /* i: get length/literal/eob next */ + j = c->sub.code.need; + NEEDBITS(j) + t = c->sub.code.tree + ((uInt)b & inflate_mask[j]); + DUMPBITS(t->bits) + e = (uInt)(t->exop); + if (e == 0) /* literal */ + { + c->sub.lit = t->base; + Tracevv((stderr, t->base >= 0x20 && t->base < 0x7f ? + "inflate: literal '%c'\n" : + "inflate: literal 0x%02x\n", t->base)); + c->mode = LIT; + break; + } + if (e & 16) /* length */ + { + c->sub.copy.get = e & 15; + c->len = t->base; + c->mode = LENEXT; + break; + } + if ((e & 64) == 0) /* next table */ + { + c->sub.code.need = e; + c->sub.code.tree = t->next; + break; + } + if (e & 32) /* end of block */ + { + Tracevv((stderr, "inflate: end of block\n")); + c->mode = WASH; + break; + } + c->mode = BADCODE; /* invalid code */ + z->msg = (char*)"invalid literal/length code"; + r = Z_DATA_ERROR; + LEAVE + case LENEXT: /* i: getting length extra (have base) */ + j = c->sub.copy.get; + NEEDBITS(j) + c->len += (uInt)b & inflate_mask[j]; + DUMPBITS(j) + c->sub.code.need = c->dbits; + c->sub.code.tree = c->dtree; + Tracevv((stderr, "inflate: length %u\n", c->len)); + c->mode = DIST; + case DIST: /* i: get distance next */ + j = c->sub.code.need; + NEEDBITS(j) + t = c->sub.code.tree + ((uInt)b & inflate_mask[j]); + DUMPBITS(t->bits) + e = (uInt)(t->exop); + if (e & 16) /* distance */ + { + c->sub.copy.get = e & 15; + c->sub.copy.dist = t->base; + c->mode = DISTEXT; + break; + } + if ((e & 64) == 0) /* next table */ + { + c->sub.code.need = e; + c->sub.code.tree = t->next; + break; + } + c->mode = BADCODE; /* invalid code */ + z->msg = (char*)"invalid distance code"; + r = Z_DATA_ERROR; + LEAVE + case DISTEXT: /* i: getting distance extra */ + j = c->sub.copy.get; + NEEDBITS(j) + c->sub.copy.dist += (uInt)b & inflate_mask[j]; + DUMPBITS(j) + Tracevv((stderr, "inflate: distance %u\n", c->sub.copy.dist)); + c->mode = COPY; + case COPY: /* o: copying bytes in window, waiting for space */ +#ifndef __TURBOC__ /* Turbo C bug for following expression */ + f = (uInt)(q - s->window) < c->sub.copy.dist ? 
+          s->end - (c->sub.copy.dist - (q - s->window)) :
+          q - c->sub.copy.dist;
+#else
+      f = q - c->sub.copy.dist;
+      if ((uInt)(q - s->window) < c->sub.copy.dist)
+        f = s->end - (c->sub.copy.dist - (uInt)(q - s->window));
+#endif
+      while (c->len)
+      {
+        NEEDOUT
+        OUTBYTE(*f++)
+        if (f == s->end)
+          f = s->window;
+        c->len--;
+      }
+      c->mode = START;
+      break;
+    case LIT:           /* o: got literal, waiting for output space */
+      NEEDOUT
+      OUTBYTE(c->sub.lit)
+      c->mode = START;
+      break;
+    case WASH:          /* o: got eob, possibly more output */
+      FLUSH
+      if (s->read != s->write)
+        LEAVE
+      c->mode = END;
+    case END:
+      r = Z_STREAM_END;
+      LEAVE
+    case BADCODE:       /* x: got error */
+      r = Z_DATA_ERROR;
+      LEAVE
+    default:
+      r = Z_STREAM_ERROR;
+      LEAVE
+  }
+}
+
+
+void inflate_codes_free(c, z)
+inflate_codes_statef *c;
+z_streamp z;
+{
+  ZFREE(z, c);
+  Tracev((stderr, "inflate: codes free\n"));
+}
+/* --- infcodes.c */
+
+/* +++ infutil.c */
+/* inflate_util.c -- data and routines common to blocks and codes
+ * Copyright (C) 1995-1996 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/* #include "zutil.h" */
+/* #include "infblock.h" */
+/* #include "inftrees.h" */
+/* #include "infcodes.h" */
+/* #include "infutil.h" */
+
+#ifndef NO_DUMMY_DECL
+struct inflate_codes_state {int dummy;}; /* for buggy compilers */
+#endif
+
+/* And'ing with mask[n] masks the lower n bits */
+uInt inflate_mask[17] = {
+    0x0000,
+    0x0001, 0x0003, 0x0007, 0x000f, 0x001f, 0x003f, 0x007f, 0x00ff,
+    0x01ff, 0x03ff, 0x07ff, 0x0fff, 0x1fff, 0x3fff, 0x7fff, 0xffff
+};
+
+
+/* copy as much as possible from the sliding window to the output area */
+int inflate_flush(s, z, r)
+inflate_blocks_statef *s;
+z_streamp z;
+int r;
+{
+  uInt n;
+  Bytef *p;
+  Bytef *q;
+
+  /* local copies of source and destination pointers */
+  p = z->next_out;
+  q = s->read;
+
+  /* compute number of bytes to copy as far as end of window */
+  n = (uInt)((q <= s->write ?
s->write : s->end) - q);
+  if (n > z->avail_out) n = z->avail_out;
+  if (n && r == Z_BUF_ERROR) r = Z_OK;
+
+  /* update counters */
+  z->avail_out -= n;
+  z->total_out += n;
+
+  /* update check information */
+  if (s->checkfn != Z_NULL)
+    z->adler = s->check = (*s->checkfn)(s->check, q, n);
+
+  /* copy as far as end of window */
+  if (p != Z_NULL) {
+    zmemcpy(p, q, n);
+    p += n;
+  }
+  q += n;
+
+  /* see if more to copy at beginning of window */
+  if (q == s->end)
+  {
+    /* wrap pointers */
+    q = s->window;
+    if (s->write == s->end)
+      s->write = s->window;
+
+    /* compute bytes to copy */
+    n = (uInt)(s->write - q);
+    if (n > z->avail_out) n = z->avail_out;
+    if (n && r == Z_BUF_ERROR) r = Z_OK;
+
+    /* update counters */
+    z->avail_out -= n;
+    z->total_out += n;
+
+    /* update check information */
+    if (s->checkfn != Z_NULL)
+      z->adler = s->check = (*s->checkfn)(s->check, q, n);
+
+    /* copy */
+    if (p != Z_NULL) {
+      zmemcpy(p, q, n);
+      p += n;
+    }
+    q += n;
+  }
+
+  /* update pointers */
+  z->next_out = p;
+  s->read = q;
+
+  /* done */
+  return r;
+}
+/* --- infutil.c */
+
+/* +++ inffast.c */
+/* inffast.c -- process literals and length/distance pairs fast
+ * Copyright (C) 1995-1996 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/* #include "zutil.h" */
+/* #include "inftrees.h" */
+/* #include "infblock.h" */
+/* #include "infcodes.h" */
+/* #include "infutil.h" */
+/* #include "inffast.h" */
+
+#ifndef NO_DUMMY_DECL
+struct inflate_codes_state {int dummy;}; /* for buggy compilers */
+#endif
+
+/* simplify the use of the inflate_huft type with some defines */
+#define base more.Base
+#define next more.Next
+#define exop word.what.Exop
+#define bits word.what.Bits
+
+/* macros for bit input with no checking and for returning unused bytes */
+#define GRABBITS(j) {while(k<(j)){b|=((uLong)NEXTBYTE)<<k;k+=8;}}
+#define UNGRAB {n+=(c=k>>3);p-=c;k&=7;}
+
+/* Called with number of bytes left to write in window at least 258
+   (the maximum string length) and number of input bytes available
+   at least ten. The ten bytes are six bytes for the longest length/
+   distance pair plus four bytes for overloading the bit buffer. */
+
+int inflate_fast(bl, bd, tl, td, s, z)
+uInt bl, bd;
+inflate_huft *tl;
+inflate_huft *td; /* need separate declaration for Borland C++ */
+inflate_blocks_statef *s;
+z_streamp z;
+{
+  inflate_huft *t;      /* temporary pointer */
+  uInt e;               /* extra bits or operation */
+  uLong b;              /* bit buffer */
+  uInt k;               /* bits in bit buffer */
+  Bytef *p;             /* input data pointer */
+  uInt n;               /* bytes available there */
+  Bytef *q;             /* output window write pointer */
+  uInt m;               /* bytes to end of window or read pointer */
+  uInt ml;              /* mask for literal/length tree */
+  uInt md;              /* mask for distance tree */
+  uInt c;               /* bytes to copy */
+  uInt d;               /* distance back to copy from */
+  Bytef *r;             /* copy source pointer */
+
+  /* load input, output, bit values */
+  LOAD
+
+  /* initialize masks */
+  ml = inflate_mask[bl];
+  md = inflate_mask[bd];
+
+  /* do until not enough input or output space for fast loop */
+  do {                          /* assume called with m >= 258 && n >= 10 */
+    /* get literal/length code */
+    GRABBITS(20)                /* max bits for literal/length code */
+    if ((e = (t = tl + ((uInt)b & ml))->exop) == 0)
+    {
+      DUMPBITS(t->bits)
+      Tracevv((stderr, t->base >= 0x20 && t->base < 0x7f ?
+ "inflate: * literal '%c'\n" : + "inflate: * literal 0x%02x\n", t->base)); + *q++ = (Byte)t->base; + m--; + continue; + } + do { + DUMPBITS(t->bits) + if (e & 16) + { + /* get extra bits for length */ + e &= 15; + c = t->base + ((uInt)b & inflate_mask[e]); + DUMPBITS(e) + Tracevv((stderr, "inflate: * length %u\n", c)); + + /* decode distance base of block to copy */ + GRABBITS(15); /* max bits for distance code */ + e = (t = td + ((uInt)b & md))->exop; + do { + DUMPBITS(t->bits) + if (e & 16) + { + /* get extra bits to add to distance base */ + e &= 15; + GRABBITS(e) /* get extra bits (up to 13) */ + d = t->base + ((uInt)b & inflate_mask[e]); + DUMPBITS(e) + Tracevv((stderr, "inflate: * distance %u\n", d)); + + /* do the copy */ + m -= c; + if ((uInt)(q - s->window) >= d) /* offset before dest */ + { /* just copy */ + r = q - d; + *q++ = *r++; c--; /* minimum count is three, */ + *q++ = *r++; c--; /* so unroll loop a little */ + } + else /* else offset after destination */ + { + e = d - (uInt)(q - s->window); /* bytes from offset to end */ + r = s->end - e; /* pointer to offset */ + if (c > e) /* if source crosses, */ + { + c -= e; /* copy to end of window */ + do { + *q++ = *r++; + } while (--e); + r = s->window; /* copy rest from start of window */ + } + } + do { /* copy all or what's left */ + *q++ = *r++; + } while (--c); + break; + } + else if ((e & 64) == 0) + e = (t = t->next + ((uInt)b & inflate_mask[e]))->exop; + else + { + z->msg = (char*)"invalid distance code"; + UNGRAB + UPDATE + return Z_DATA_ERROR; + } + } while (1); + break; + } + if ((e & 64) == 0) + { + if ((e = (t = t->next + ((uInt)b & inflate_mask[e]))->exop) == 0) + { + DUMPBITS(t->bits) + Tracevv((stderr, t->base >= 0x20 && t->base < 0x7f ? + "inflate: * literal '%c'\n" : + "inflate: * literal 0x%02x\n", t->base)); + *q++ = (Byte)t->base; + m--; + break; + } + } + else if (e & 32) + { + Tracevv((stderr, "inflate: * end of block\n")); + UNGRAB + UPDATE + return Z_STREAM_END; + } + else + { + z->msg = (char*)"invalid literal/length code"; + UNGRAB + UPDATE + return Z_DATA_ERROR; + } + } while (1); + } while (m >= 258 && n >= 10); + + /* not enough input or output--restore pointers and return */ + UNGRAB + UPDATE + return Z_OK; +} +/* --- inffast.c */ + +/* +++ zutil.c */ +/* zutil.c -- target dependent utility functions for the compression library + * Copyright (C) 1995-1996 Jean-loup Gailly. + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +/* From: zutil.c,v 1.17 1996/07/24 13:41:12 me Exp $ */ + +#ifdef DEBUG_ZLIB +#include +#endif + +/* #include */ + +#ifndef NO_DUMMY_DECL +struct internal_state {int dummy;}; /* for buggy compilers */ +#endif + +#ifndef STDC +extern void exit OF((int)); +#endif + +static const char *z_errmsg[10] = { +"need dictionary", /* Z_NEED_DICT 2 */ +"stream end", /* Z_STREAM_END 1 */ +"", /* Z_OK 0 */ +"file error", /* Z_ERRNO (-1) */ +"stream error", /* Z_STREAM_ERROR (-2) */ +"data error", /* Z_DATA_ERROR (-3) */ +"insufficient memory", /* Z_MEM_ERROR (-4) */ +"buffer error", /* Z_BUF_ERROR (-5) */ +"incompatible version",/* Z_VERSION_ERROR (-6) */ +""}; + + +const char *zlibVersion() +{ + return ZLIB_VERSION; +} + +#ifdef DEBUG_ZLIB +void z_error (m) + char *m; +{ + fprintf(stderr, "%s\n", m); + exit(1); +} +#endif + +#ifndef HAVE_MEMCPY + +void zmemcpy(dest, source, len) + Bytef* dest; + Bytef* source; + uInt len; +{ + if (len == 0) return; + do { + *dest++ = *source++; /* ??? 
to be unrolled */ + } while (--len != 0); +} + +int zmemcmp(s1, s2, len) + Bytef* s1; + Bytef* s2; + uInt len; +{ + uInt j; + + for (j = 0; j < len; j++) { + if (s1[j] != s2[j]) return 2*(s1[j] > s2[j])-1; + } + return 0; +} + +void zmemzero(dest, len) + Bytef* dest; + uInt len; +{ + if (len == 0) return; + do { + *dest++ = 0; /* ??? to be unrolled */ + } while (--len != 0); +} +#endif + +#ifdef __TURBOC__ +#if (defined( __BORLANDC__) || !defined(SMALL_MEDIUM)) && !defined(__32BIT__) +/* Small and medium model in Turbo C are for now limited to near allocation + * with reduced MAX_WBITS and MAX_MEM_LEVEL + */ +# define MY_ZCALLOC + +/* Turbo C malloc() does not allow dynamic allocation of 64K bytes + * and farmalloc(64K) returns a pointer with an offset of 8, so we + * must fix the pointer. Warning: the pointer must be put back to its + * original form in order to free it, use zcfree(). + */ + +#define MAX_PTR 10 +/* 10*64K = 640K */ + +local int next_ptr = 0; + +typedef struct ptr_table_s { + voidpf org_ptr; + voidpf new_ptr; +} ptr_table; + +local ptr_table table[MAX_PTR]; +/* This table is used to remember the original form of pointers + * to large buffers (64K). Such pointers are normalized with a zero offset. + * Since MSDOS is not a preemptive multitasking OS, this table is not + * protected from concurrent access. This hack doesn't work anyway on + * a protected system like OS/2. Use Microsoft C instead. + */ + +voidpf zcalloc (voidpf opaque, unsigned items, unsigned size) +{ + voidpf buf = opaque; /* just to make some compilers happy */ + ulg bsize = (ulg)items*size; + + /* If we allocate less than 65520 bytes, we assume that farmalloc + * will return a usable pointer which doesn't have to be normalized. + */ + if (bsize < 65520L) { + buf = farmalloc(bsize); + if (*(ush*)&buf != 0) return buf; + } else { + buf = farmalloc(bsize + 16L); + } + if (buf == NULL || next_ptr >= MAX_PTR) return NULL; + table[next_ptr].org_ptr = buf; + + /* Normalize the pointer to seg:0 */ + *((ush*)&buf+1) += ((ush)((uch*)buf-0) + 15) >> 4; + *(ush*)&buf = 0; + table[next_ptr++].new_ptr = buf; + return buf; +} + +void zcfree (voidpf opaque, voidpf ptr) +{ + int n; + if (*(ush*)&ptr != 0) { /* object < 64K */ + farfree(ptr); + return; + } + /* Find the original pointer */ + for (n = 0; n < next_ptr; n++) { + if (ptr != table[n].new_ptr) continue; + + farfree(table[n].org_ptr); + while (++n < next_ptr) { + table[n-1] = table[n]; + } + next_ptr--; + return; + } + ptr = opaque; /* just to make some compilers happy */ + Assert(0, "zcfree: ptr not found"); +} +#endif +#endif /* __TURBOC__ */ + + +#if defined(M_I86) && !defined(__32BIT__) +/* Microsoft C in 16-bit mode */ + +# define MY_ZCALLOC + +#if (!defined(_MSC_VER) || (_MSC_VER < 600)) +# define _halloc halloc +# define _hfree hfree +#endif + +voidpf zcalloc (voidpf opaque, unsigned items, unsigned size) +{ + if (opaque) opaque = 0; /* to make compiler happy */ + return _halloc((long)items, size); +} + +void zcfree (voidpf opaque, voidpf ptr) +{ + if (opaque) opaque = 0; /* to make compiler happy */ + _hfree(ptr); +} + +#endif /* MSC */ + + +#ifndef MY_ZCALLOC /* Any system without a special alloc function */ + +#ifndef STDC +extern voidp calloc OF((uInt items, uInt size)); +extern void free OF((voidpf ptr)); +#endif + +voidpf zcalloc (opaque, items, size) + voidpf opaque; + unsigned items; + unsigned size; +{ + if (opaque) items += size - size; /* make compiler happy */ + return (voidpf)calloc(items, size); +} + +void zcfree (opaque, ptr) + voidpf opaque; 
+    voidpf ptr;
+{
+    free(ptr);
+    if (opaque) return; /* make compiler happy */
+}
+
+#endif /* MY_ZCALLOC */
+/* --- zutil.c */
+
+/* +++ adler32.c */
+/* adler32.c -- compute the Adler-32 checksum of a data stream
+ * Copyright (C) 1995-1996 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/* From: adler32.c,v 1.10 1996/05/22 11:52:18 me Exp $ */
+
+/* #include "zlib.h" */
+
+#define BASE 65521L /* largest prime smaller than 65536 */
+#define NMAX 5552
+/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
+
+#define DO1(buf,i)  {s1 += buf[(i)]; s2 += s1;}
+#define DO2(buf,i)  DO1(buf,i); DO1(buf,(i)+1);
+#define DO4(buf,i)  DO2(buf,i); DO2(buf,(i)+2);
+#define DO8(buf,i)  DO4(buf,i); DO4(buf,(i)+4);
+#define DO16(buf)   DO8(buf,0); DO8(buf,8);
+
+/* ========================================================================= */
+uLong adler32(adler, buf, len)
+    uLong adler;
+    const Bytef *buf;
+    uInt len;
+{
+    unsigned long s1 = adler & 0xffff;
+    unsigned long s2 = (adler >> 16) & 0xffff;
+    int k;
+
+    if (buf == Z_NULL) return 1L;
+
+    while (len > 0) {
+        k = len < NMAX ? len : NMAX;
+        len -= k;
+        while (k >= 16) {
+            DO16(buf);
+            buf += 16;
+            k -= 16;
+        }
+        if (k != 0) do {
+            s1 += *buf++;
+            s2 += s1;
+        } while (--k);
+        s1 %= BASE;
+        s2 %= BASE;
+    }
+    return (s2 << 16) | s1;
+}
+/* --- adler32.c */
+
+#ifdef _KERNEL
+static int
+zlib_modevent(module_t mod, int type, void *unused)
+{
+	switch (type) {
+	case MOD_LOAD:
+		return 0;
+	case MOD_UNLOAD:
+		return 0;
+	}
+	return EINVAL;
+}
+
+static moduledata_t zlib_mod = {
+	"zlib",
+	zlib_modevent,
+	0
+};
+DECLARE_MODULE(zlib, zlib_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST);
+MODULE_VERSION(zlib, 1);
+#endif /* _KERNEL */
diff --git a/freebsd/sys/net/zlib.h b/freebsd/sys/net/zlib.h
new file mode 100644
index 00000000..3da670fd
--- /dev/null
+++ b/freebsd/sys/net/zlib.h
@@ -0,0 +1,1018 @@
+/* $FreeBSD$ */
+
+/*
+ * This file is derived from zlib.h and zconf.h from the zlib-1.0.4
+ * distribution by Jean-loup Gailly and Mark Adler, with some additions
+ * by Paul Mackerras to aid in implementing Deflate compression and
+ * decompression for PPP packets.
+ */
+
+/*
+ * ==FILEVERSION 971127==
+ *
+ * This marker is used by the Linux installation script to determine
+ * whether an up-to-date version of this file is already installed.
+ */
+
+
+/* +++ zlib.h */
+/*-
+  zlib.h -- interface of the 'zlib' general purpose compression library
+  version 1.0.4, Jul 24th, 1996.
+
+  Copyright (C) 1995-1996 Jean-loup Gailly and Mark Adler
+
+  This software is provided 'as-is', without any express or implied
+  warranty. In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+ + Jean-loup Gailly Mark Adler + gzip@prep.ai.mit.edu madler@alumni.caltech.edu +*/ +/* + The data format used by the zlib library is described by RFCs (Request for + Comments) 1950 to 1952 in the files ftp://ds.internic.net/rfc/rfc1950.txt + (zlib format), rfc1951.txt (deflate format) and rfc1952.txt (gzip format). +*/ + +#ifndef _ZLIB_H +#define _ZLIB_H + +#ifdef __cplusplus +extern "C" { +#endif + + +/* +++ zconf.h */ +/* zconf.h -- configuration of the zlib compression library + * Copyright (C) 1995-1996 Jean-loup Gailly. + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +/* From: zconf.h,v 1.20 1996/07/02 15:09:28 me Exp $ */ + +#ifndef _ZCONF_H +#define _ZCONF_H + +/* + * If you *really* need a unique prefix for all types and library functions, + * compile with -DZ_PREFIX. The "standard" zlib should be compiled without it. + */ +#ifdef Z_PREFIX +# define deflateInit_ z_deflateInit_ +# define deflate z_deflate +# define deflateEnd z_deflateEnd +# define inflateInit_ z_inflateInit_ +# define inflate z_inflate +# define inflateEnd z_inflateEnd +# define deflateInit2_ z_deflateInit2_ +# define deflateSetDictionary z_deflateSetDictionary +# define deflateCopy z_deflateCopy +# define deflateReset z_deflateReset +# define deflateParams z_deflateParams +# define inflateInit2_ z_inflateInit2_ +# define inflateSetDictionary z_inflateSetDictionary +# define inflateSync z_inflateSync +# define inflateReset z_inflateReset +# define compress z_compress +# define uncompress z_uncompress +# define adler32 z_adler32 +#if 0 +# define crc32 z_crc32 +# define get_crc_table z_get_crc_table +#endif + +# define Byte z_Byte +# define uInt z_uInt +# define uLong z_uLong +# define Bytef z_Bytef +# define charf z_charf +# define intf z_intf +# define uIntf z_uIntf +# define uLongf z_uLongf +# define voidpf z_voidpf +# define voidp z_voidp +#endif + +#if (defined(_WIN32) || defined(__WIN32__)) && !defined(WIN32) +# define WIN32 +#endif +#if defined(__GNUC__) || defined(WIN32) || defined(__386__) || defined(i386) +# ifndef __32BIT__ +# define __32BIT__ +# endif +#endif +#if defined(__MSDOS__) && !defined(MSDOS) +# define MSDOS +#endif + +/* + * Compile with -DMAXSEG_64K if the alloc function cannot allocate more + * than 64k bytes at a time (needed on systems with 16-bit int). + */ +#if defined(MSDOS) && !defined(__32BIT__) +# define MAXSEG_64K +#endif +#ifdef MSDOS +# define UNALIGNED_OK +#endif + +#if (defined(MSDOS) || defined(_WINDOWS) || defined(WIN32)) && !defined(STDC) +# define STDC +#endif +#if (defined(__STDC__) || defined(__cplusplus)) && !defined(STDC) +# define STDC +#endif + +#ifndef STDC +# ifndef const /* cannot use !defined(STDC) && !defined(const) on Mac */ +# define const +# endif +#endif + +/* Some Mac compilers merge all .h files incorrectly: */ +#if defined(__MWERKS__) || defined(applec) ||defined(THINK_C) ||defined(__SC__) +# define NO_DUMMY_DECL +#endif + +/* Maximum value for memLevel in deflateInit2 */ +#ifndef MAX_MEM_LEVEL +# ifdef MAXSEG_64K +# define MAX_MEM_LEVEL 8 +# else +# define MAX_MEM_LEVEL 9 +# endif +#endif + +/* Maximum value for windowBits in deflateInit2 and inflateInit2 */ +#ifndef MAX_WBITS +# define MAX_WBITS 15 /* 32K LZ77 window */ +#endif + +/* The memory requirements for deflate are (in bytes): + 1 << (windowBits+2) + 1 << (memLevel+9) + that is: 128K for windowBits=15 + 128K for memLevel = 8 (default values) + plus a few kilobytes for small objects. 
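+ (Spelling that arithmetic out: with the defaults the two terms are
+ 1 << (15+2) and 1 << (8+9), i.e. 128K each and 256K together; with
+ MAX_WBITS=14 and MAX_MEM_LEVEL=7 each term becomes 1 << 16 = 64K,
+ giving the 128K figure quoted below.)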
For example, if you want to reduce
+ the default memory requirements from 256K to 128K, compile with
+     make CFLAGS="-O -DMAX_WBITS=14 -DMAX_MEM_LEVEL=7"
+ Of course this will generally degrade compression (there's no free lunch).
+
+   The memory requirements for inflate are (in bytes) 1 << windowBits
+ that is, 32K for windowBits=15 (default value) plus a few kilobytes
+ for small objects.
+*/
+
+                        /* Type declarations */
+
+#ifndef OF /* function prototypes */
+#  ifdef STDC
+#    define OF(args)  args
+#  else
+#    define OF(args)  ()
+#  endif
+#endif
+
+/* The following definitions for FAR are needed only for MSDOS mixed
+ * model programming (small or medium model with some far allocations).
+ * This was tested only with MSC; for other MSDOS compilers you may have
+ * to define NO_MEMCPY in zutil.h.  If you don't need the mixed model,
+ * just define FAR to be empty.
+ */
+#if (defined(M_I86SM) || defined(M_I86MM)) && !defined(__32BIT__)
+   /* MSC small or medium model */
+#  define SMALL_MEDIUM
+#  ifdef _MSC_VER
+#    define FAR __far
+#  else
+#    define FAR far
+#  endif
+#endif
+#if defined(__BORLANDC__) && (defined(__SMALL__) || defined(__MEDIUM__))
+#  ifndef __32BIT__
+#    define SMALL_MEDIUM
+#    define FAR __far
+#  endif
+#endif
+#ifndef FAR
+#  define FAR
+#endif
+
+typedef unsigned char  Byte;  /* 8 bits */
+typedef unsigned int   uInt;  /* 16 bits or more */
+typedef unsigned long  uLong; /* 32 bits or more */
+
+#if defined(__BORLANDC__) && defined(SMALL_MEDIUM)
+   /* Borland C/C++ ignores FAR inside typedef */
+#  define Bytef Byte FAR
+#else
+   typedef Byte  FAR Bytef;
+#endif
+typedef char  FAR charf;
+typedef int   FAR intf;
+typedef uInt  FAR uIntf;
+typedef uLong FAR uLongf;
+
+#ifdef STDC
+   typedef void FAR *voidpf;
+   typedef void     *voidp;
+#else
+   typedef Byte FAR *voidpf;
+   typedef Byte     *voidp;
+#endif
+
+
+/* Compile with -DZLIB_DLL for Windows DLL support */
+#if (defined(_WINDOWS) || defined(WINDOWS)) && defined(ZLIB_DLL)
+#  include <windows.h>
+#  define EXPORT  WINAPI
+#else
+#  define EXPORT
+#endif
+
+#endif /* _ZCONF_H */
+/* --- zconf.h */
+
+#define ZLIB_VERSION "1.0.4P"
+
+/*
+     The 'zlib' compression library provides in-memory compression and
+  decompression functions, including integrity checks of the uncompressed
+  data.  This version of the library supports only one compression method
+  (deflation) but other algorithms may be added later and will have the same
+  stream interface.
+
+     For compression the application must provide the output buffer and
+  may optionally provide the input buffer for optimization. For decompression,
+  the application must provide the input buffer and may optionally provide
+  the output buffer for optimization.
+
+     Compression can be done in a single step if the buffers are large
+  enough (for example if an input file is mmap'ed), or can be done by
+  repeated calls of the compression function.  In the latter case, the
+  application must provide more input and/or consume the output
+  (providing more output space) before each call.
+
+     The library does not install any signal handler. It is recommended to
+  add at least a handler for SIGSEGV when decompressing; the library checks
+  the consistency of the input data whenever possible but may go nuts
+  for some forms of corrupted input.
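+
+     As a minimal sketch of that repeated-call usage (an illustration only:
+  my_alloc, my_free, out_buf, refill_input and write_output are hypothetical,
+  refill_input is assumed to set next_in/avail_in to the next chunk of
+  compressed data, and error handling is elided):
+
+        z_stream zs;
+        Byte out_buf[4096];
+        int err;
+        zs.zalloc = my_alloc;
+        zs.zfree = my_free;
+        zs.opaque = Z_NULL;
+        err = inflateInit(&zs);
+        while (err == Z_OK && refill_input(&zs)) {
+            do {
+                zs.next_out = out_buf;
+                zs.avail_out = sizeof(out_buf);
+                err = inflate(&zs, Z_PARTIAL_FLUSH);
+                write_output(out_buf, sizeof(out_buf) - zs.avail_out);
+            } while (err == Z_OK && zs.avail_in > 0);
+        }
+        inflateEnd(&zs);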
+*/ + +typedef voidpf (*alloc_func) OF((voidpf opaque, uInt items, uInt size)); +typedef void (*free_func) OF((voidpf opaque, voidpf address)); + +struct internal_state; + +typedef struct z_stream_s { + Bytef *next_in; /* next input byte */ + uInt avail_in; /* number of bytes available at next_in */ + uLong total_in; /* total nb of input bytes read so far */ + + Bytef *next_out; /* next output byte should be put there */ + uInt avail_out; /* remaining free space at next_out */ + uLong total_out; /* total nb of bytes output so far */ + + const char *msg; /* last error message, NULL if no error */ + struct internal_state FAR *state; /* not visible by applications */ + + alloc_func zalloc; /* used to allocate the internal state */ + free_func zfree; /* used to free the internal state */ + voidpf opaque; /* private data object passed to zalloc and zfree */ + + int data_type; /* best guess about the data type: ascii or binary */ + uLong adler; /* adler32 value of the uncompressed data */ + uLong reserved; /* reserved for future use */ +} z_stream; + +typedef z_stream FAR *z_streamp; + +/* + The application must update next_in and avail_in when avail_in has + dropped to zero. It must update next_out and avail_out when avail_out + has dropped to zero. The application must initialize zalloc, zfree and + opaque before calling the init function. All other fields are set by the + compression library and must not be updated by the application. + + The opaque value provided by the application will be passed as the first + parameter for calls of zalloc and zfree. This can be useful for custom + memory management. The compression library attaches no meaning to the + opaque value. + + zalloc must return Z_NULL if there is not enough memory for the object. + On 16-bit systems, the functions zalloc and zfree must be able to allocate + exactly 65536 bytes, but will not be required to allocate more than this + if the symbol MAXSEG_64K is defined (see zconf.h). WARNING: On MSDOS, + pointers returned by zalloc for objects of exactly 65536 bytes *must* + have their offset normalized to zero. The default allocation function + provided by this library ensures this (see zutil.c). To reduce memory + requirements and avoid any allocation of 64K objects, at the expense of + compression ratio, compile the library with -DMAX_WBITS=14 (see zconf.h). + + The fields total_in and total_out can be used for statistics or + progress reports. After compression, total_in holds the total size of + the uncompressed data and may be saved for use in the decompressor + (particularly if the decompressor wants to decompress everything in + a single step). +*/ + + /* constants */ + +#define Z_NO_FLUSH 0 +#define Z_PARTIAL_FLUSH 1 +#define Z_PACKET_FLUSH 2 +#define Z_SYNC_FLUSH 3 +#define Z_FULL_FLUSH 4 +#define Z_FINISH 5 +/* Allowed flush values; see deflate() below for details */ + +#define Z_OK 0 +#define Z_STREAM_END 1 +#define Z_NEED_DICT 2 +#define Z_ERRNO (-1) +#define Z_STREAM_ERROR (-2) +#define Z_DATA_ERROR (-3) +#define Z_MEM_ERROR (-4) +#define Z_BUF_ERROR (-5) +#define Z_VERSION_ERROR (-6) +/* Return codes for the compression/decompression functions. Negative + * values are errors, positive values are used for special but normal events. 
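+ * (The corresponding message strings are kept in the z_errmsg table in
+ * zutil.c, which is laid out so that the string for a return code err
+ * sits at z_errmsg[Z_NEED_DICT - err].)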
+ */ + +#define Z_NO_COMPRESSION 0 +#define Z_BEST_SPEED 1 +#define Z_BEST_COMPRESSION 9 +#define Z_DEFAULT_COMPRESSION (-1) +/* compression levels */ + +#define Z_FILTERED 1 +#define Z_HUFFMAN_ONLY 2 +#define Z_DEFAULT_STRATEGY 0 +/* compression strategy; see deflateInit2() below for details */ + +#define Z_BINARY 0 +#define Z_ASCII 1 +#define Z_UNKNOWN 2 +/* Possible values of the data_type field */ + +#define Z_DEFLATED 8 +/* The deflate compression method (the only one supported in this version) */ + +#define Z_NULL 0 /* for initializing zalloc, zfree, opaque */ + +#define zlib_version zlibVersion() +/* for compatibility with versions < 1.0.2 */ + + /* basic functions */ + +extern const char * EXPORT zlibVersion OF((void)); +/* The application can compare zlibVersion and ZLIB_VERSION for consistency. + If the first character differs, the library code actually used is + not compatible with the zlib.h header file used by the application. + This check is automatically made by deflateInit and inflateInit. + */ + +/* +extern int EXPORT deflateInit OF((z_streamp strm, int level)); + + Initializes the internal stream state for compression. The fields + zalloc, zfree and opaque must be initialized before by the caller. + If zalloc and zfree are set to Z_NULL, deflateInit updates them to + use default allocation functions. + + The compression level must be Z_DEFAULT_COMPRESSION, or between 0 and 9: + 1 gives best speed, 9 gives best compression, 0 gives no compression at + all (the input data is simply copied a block at a time). + Z_DEFAULT_COMPRESSION requests a default compromise between speed and + compression (currently equivalent to level 6). + + deflateInit returns Z_OK if success, Z_MEM_ERROR if there was not + enough memory, Z_STREAM_ERROR if level is not a valid compression level, + Z_VERSION_ERROR if the zlib library version (zlib_version) is incompatible + with the version assumed by the caller (ZLIB_VERSION). + msg is set to null if there is no error message. deflateInit does not + perform any compression: this will be done by deflate(). +*/ + + +extern int EXPORT deflate OF((z_streamp strm, int flush)); +/* + Performs one or both of the following actions: + + - Compress more input starting at next_in and update next_in and avail_in + accordingly. If not all input can be processed (because there is not + enough room in the output buffer), next_in and avail_in are updated and + processing will resume at this point for the next call of deflate(). + + - Provide more output starting at next_out and update next_out and avail_out + accordingly. This action is forced if the parameter flush is non zero. + Forcing flush frequently degrades the compression ratio, so this parameter + should be set only when necessary (in interactive applications). + Some output may be provided even if flush is not set. + + Before the call of deflate(), the application should ensure that at least + one of the actions is possible, by providing more input and/or consuming + more output, and updating avail_in or avail_out accordingly; avail_out + should never be zero before the call. The application can consume the + compressed output when it wants, for example when the output buffer is full + (avail_out == 0), or after each call of deflate(). If deflate returns Z_OK + and with zero avail_out, it must be called again after making room in the + output buffer because there might be more output pending. 
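+
+    A compression loop obeying those rules might look like this (a sketch
+  only, assuming the stream and the first input/output buffers are already
+  set up; refill_input and drain_output are hypothetical helpers that set
+  next_in/avail_in and reset next_out/avail_out after consuming the output):
+
+        while (deflate(&zs, Z_NO_FLUSH) == Z_OK) {
+            if (zs.avail_out == 0)
+                drain_output(&zs);
+            if (zs.avail_in == 0 && !refill_input(&zs))
+                break;
+        }
+        do {
+            drain_output(&zs);
+        } while (deflate(&zs, Z_FINISH) == Z_OK);
+        drain_output(&zs);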
+ + If the parameter flush is set to Z_PARTIAL_FLUSH, the current compression + block is terminated and flushed to the output buffer so that the + decompressor can get all input data available so far. For method 9, a future + variant on method 8, the current block will be flushed but not terminated. + Z_SYNC_FLUSH has the same effect as partial flush except that the compressed + output is byte aligned (the compressor can clear its internal bit buffer) + and the current block is always terminated; this can be useful if the + compressor has to be restarted from scratch after an interruption (in which + case the internal state of the compressor may be lost). + If flush is set to Z_FULL_FLUSH, the compression block is terminated, a + special marker is output and the compression dictionary is discarded; this + is useful to allow the decompressor to synchronize if one compressed block + has been damaged (see inflateSync below). Flushing degrades compression and + so should be used only when necessary. Using Z_FULL_FLUSH too often can + seriously degrade the compression. If deflate returns with avail_out == 0, + this function must be called again with the same value of the flush + parameter and more output space (updated avail_out), until the flush is + complete (deflate returns with non-zero avail_out). + + If the parameter flush is set to Z_PACKET_FLUSH, the compression + block is terminated, and a zero-length stored block is output, + omitting the length bytes (the effect of this is that the 3-bit type + code 000 for a stored block is output, and the output is then + byte-aligned). This is designed for use at the end of a PPP packet. + + If the parameter flush is set to Z_FINISH, pending input is processed, + pending output is flushed and deflate returns with Z_STREAM_END if there + was enough output space; if deflate returns with Z_OK, this function must be + called again with Z_FINISH and more output space (updated avail_out) but no + more input data, until it returns with Z_STREAM_END or an error. After + deflate has returned Z_STREAM_END, the only possible operations on the + stream are deflateReset or deflateEnd. + + Z_FINISH can be used immediately after deflateInit if all the compression + is to be done in a single step. In this case, avail_out must be at least + 0.1% larger than avail_in plus 12 bytes. If deflate does not return + Z_STREAM_END, then it must be called again as described above. + + deflate() may update data_type if it can make a good guess about + the input data type (Z_ASCII or Z_BINARY). In doubt, the data is considered + binary. This field is only for information purposes and does not affect + the compression algorithm in any manner. + + deflate() returns Z_OK if some progress has been made (more input + processed or more output produced), Z_STREAM_END if all input has been + consumed and all output has been produced (only when flush is set to + Z_FINISH), Z_STREAM_ERROR if the stream state was inconsistent (for example + if next_in or next_out was NULL), Z_BUF_ERROR if no progress is possible. +*/ + + +extern int EXPORT deflateEnd OF((z_streamp strm)); +/* + All dynamically allocated data structures for this stream are freed. + This function discards any unprocessed input and does not flush any + pending output. + + deflateEnd returns Z_OK if success, Z_STREAM_ERROR if the + stream state was inconsistent, Z_DATA_ERROR if the stream was freed + prematurely (some input or output was discarded). 
In the error case, + msg may be set but then points to a static string (which must not be + deallocated). +*/ + + +/* +extern int EXPORT inflateInit OF((z_streamp strm)); + + Initializes the internal stream state for decompression. The fields + zalloc, zfree and opaque must be initialized before by the caller. If + zalloc and zfree are set to Z_NULL, inflateInit updates them to use default + allocation functions. + + inflateInit returns Z_OK if success, Z_MEM_ERROR if there was not + enough memory, Z_VERSION_ERROR if the zlib library version is incompatible + with the version assumed by the caller. msg is set to null if there is no + error message. inflateInit does not perform any decompression: this will be + done by inflate(). +*/ + +#if defined(__FreeBSD__) && defined(_KERNEL) +#define inflate inflate_ppp /* FreeBSD already has an inflate :-( */ +#endif + +extern int EXPORT inflate OF((z_streamp strm, int flush)); +/* + Performs one or both of the following actions: + + - Decompress more input starting at next_in and update next_in and avail_in + accordingly. If not all input can be processed (because there is not + enough room in the output buffer), next_in is updated and processing + will resume at this point for the next call of inflate(). + + - Provide more output starting at next_out and update next_out and avail_out + accordingly. inflate() provides as much output as possible, until there + is no more input data or no more space in the output buffer (see below + about the flush parameter). + + Before the call of inflate(), the application should ensure that at least + one of the actions is possible, by providing more input and/or consuming + more output, and updating the next_* and avail_* values accordingly. + The application can consume the uncompressed output when it wants, for + example when the output buffer is full (avail_out == 0), or after each + call of inflate(). If inflate returns Z_OK and with zero avail_out, it + must be called again after making room in the output buffer because there + might be more output pending. + + If the parameter flush is set to Z_PARTIAL_FLUSH or Z_PACKET_FLUSH, + inflate flushes as much output as possible to the output buffer. The + flushing behavior of inflate is not specified for values of the flush + parameter other than Z_PARTIAL_FLUSH, Z_PACKET_FLUSH or Z_FINISH, but the + current implementation actually flushes as much output as possible + anyway. For Z_PACKET_FLUSH, inflate checks that once all the input data + has been consumed, it is expecting to see the length field of a stored + block; if not, it returns Z_DATA_ERROR. + + inflate() should normally be called until it returns Z_STREAM_END or an + error. However if all decompression is to be performed in a single step + (a single call of inflate), the parameter flush should be set to + Z_FINISH. In this case all pending input is processed and all pending + output is flushed; avail_out must be large enough to hold all the + uncompressed data. (The size of the uncompressed data may have been saved + by the compressor for this purpose.) The next operation on this stream must + be inflateEnd to deallocate the decompression state. The use of Z_FINISH + is never required, but can be used to inform inflate that a faster routine + may be used for the single inflate() call. 
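+
+    In the single-step case described above, the sequence reduces to the
+  following (a sketch; src, dst, their sizes and handle_error are
+  hypothetical, the stream is assumed already initialized with inflateInit,
+  and dst_len must be large enough for all of the uncompressed data):
+
+        zs.next_in = src;   zs.avail_in = src_len;
+        zs.next_out = dst;  zs.avail_out = dst_len;
+        if (inflate(&zs, Z_FINISH) != Z_STREAM_END)
+            handle_error(&zs);
+        inflateEnd(&zs);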
+ + inflate() returns Z_OK if some progress has been made (more input + processed or more output produced), Z_STREAM_END if the end of the + compressed data has been reached and all uncompressed output has been + produced, Z_NEED_DICT if a preset dictionary is needed at this point (see + inflateSetDictionary below), Z_DATA_ERROR if the input data was corrupted, + Z_STREAM_ERROR if the stream structure was inconsistent (for example if + next_in or next_out was NULL), Z_MEM_ERROR if there was not enough memory, + Z_BUF_ERROR if no progress is possible or if there was not enough room in + the output buffer when Z_FINISH is used. In the Z_DATA_ERROR case, the + application may then call inflateSync to look for a good compression block. + In the Z_NEED_DICT case, strm->adler is set to the Adler32 value of the + dictionary chosen by the compressor. +*/ + + +extern int EXPORT inflateEnd OF((z_streamp strm)); +/* + All dynamically allocated data structures for this stream are freed. + This function discards any unprocessed input and does not flush any + pending output. + + inflateEnd returns Z_OK if success, Z_STREAM_ERROR if the stream state + was inconsistent. In the error case, msg may be set but then points to a + static string (which must not be deallocated). +*/ + + /* Advanced functions */ + +/* + The following functions are needed only in some special applications. +*/ + +/* +extern int EXPORT deflateInit2 OF((z_streamp strm, + int level, + int method, + int windowBits, + int memLevel, + int strategy)); + + This is another version of deflateInit with more compression options. The + fields next_in, zalloc, zfree and opaque must be initialized before by + the caller. + + The method parameter is the compression method. It must be Z_DEFLATED in + this version of the library. (Method 9 will allow a 64K history buffer and + partial block flushes.) + + The windowBits parameter is the base two logarithm of the window size + (the size of the history buffer). It should be in the range 8..15 for this + version of the library (the value 16 will be allowed for method 9). Larger + values of this parameter result in better compression at the expense of + memory usage. The default value is 15 if deflateInit is used instead. + + The memLevel parameter specifies how much memory should be allocated + for the internal compression state. memLevel=1 uses minimum memory but + is slow and reduces compression ratio; memLevel=9 uses maximum memory + for optimal speed. The default value is 8. See zconf.h for total memory + usage as a function of windowBits and memLevel. + + The strategy parameter is used to tune the compression algorithm. Use the + value Z_DEFAULT_STRATEGY for normal data, Z_FILTERED for data produced by a + filter (or predictor), or Z_HUFFMAN_ONLY to force Huffman encoding only (no + string match). Filtered data consists mostly of small values with a + somewhat random distribution. In this case, the compression algorithm is + tuned to compress them better. The effect of Z_FILTERED is to force more + Huffman coding and less string matching; it is somewhat intermediate + between Z_DEFAULT and Z_HUFFMAN_ONLY. The strategy parameter only affects + the compression ratio but not the correctness of the compressed output even + if it is not set appropriately. + + If next_in is not null, the library will use this buffer to hold also + some history information; the buffer must either hold the entire input + data, or have at least 1<<(windowBits+1) bytes and be writable. 
If next_in
+   is null, the library will allocate its own history buffer (and leave next_in
+   null). next_out need not be provided here but must be provided by the
+   application for the next call of deflate().
+
+     If the history buffer is provided by the application, next_in must
+   never be changed by the application since the compressor maintains
+   information inside this buffer from call to call; the application
+   must provide more input only by increasing avail_in. next_in is always
+   reset by the library in this case.
+
+      deflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was
+   not enough memory, Z_STREAM_ERROR if a parameter is invalid (such as
+   an invalid method). msg is set to null if there is no error message.
+   deflateInit2 does not perform any compression: this will be done by
+   deflate().
+*/
+
+extern int EXPORT deflateSetDictionary OF((z_streamp strm,
+                                           const Bytef *dictionary,
+                                           uInt dictLength));
+/*
+     Initializes the compression dictionary (history buffer) from the given
+   byte sequence without producing any compressed output. This function must
+   be called immediately after deflateInit or deflateInit2, before any call
+   of deflate. The compressor and decompressor must use exactly the same
+   dictionary (see inflateSetDictionary).
+     The dictionary should consist of strings (byte sequences) that are likely
+   to be encountered later in the data to be compressed, with the most commonly
+   used strings preferably put towards the end of the dictionary. Using a
+   dictionary is most useful when the data to be compressed is short and
+   can be predicted with good accuracy; the data can then be compressed better
+   than with the default empty dictionary. In this version of the library,
+   only the last 32K bytes of the dictionary are used.
+     Upon return of this function, strm->adler is set to the Adler32 value
+   of the dictionary; the decompressor may later use this value to determine
+   which dictionary has been used by the compressor. (The Adler32 value
+   applies to the whole dictionary even if only a subset of the dictionary is
+   actually used by the compressor.)
+
+     deflateSetDictionary returns Z_OK if success, or Z_STREAM_ERROR if a
+   parameter is invalid (such as NULL dictionary) or the stream state
+   is inconsistent (for example if deflate has already been called for this
+   stream). deflateSetDictionary does not perform any compression: this will
+   be done by deflate().
+*/
+
+extern int EXPORT deflateCopy OF((z_streamp dest,
+                                  z_streamp source));
+/*
+     Sets the destination stream as a complete copy of the source stream. If
+   the source stream is using an application-supplied history buffer, a new
+   buffer is allocated for the destination stream. The compressed output
+   buffer is always application-supplied. It's the responsibility of the
+   application to provide the correct values of next_out and avail_out for the
+   next call of deflate.
+
+     This function can be useful when several compression strategies will be
+   tried, for example when there are several ways of pre-processing the input
+   data with a filter. The streams that will be discarded should then be freed
+   by calling deflateEnd. Note that deflateCopy duplicates the internal
+   compression state which can be quite large, so this strategy is slow and
+   can consume lots of memory.
+
+     deflateCopy returns Z_OK if success, Z_MEM_ERROR if there was not
+   enough memory, Z_STREAM_ERROR if the source stream state was inconsistent
+   (such as zalloc being NULL). msg is left unchanged in both source and
+   destination.
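+
+     In outline, the strategy-trial pattern mentioned above (a sketch; the
+   trial stream handling is hypothetical):
+
+        z_stream trial;
+        deflateCopy(&trial, &strm);
+        deflateParams(&trial, Z_DEFAULT_COMPRESSION, Z_FILTERED);
+        ... feed the same input to both streams, keep whichever
+        produces less output, and call deflateEnd on the other ...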
+*/ + +extern int EXPORT deflateReset OF((z_streamp strm)); +/* + This function is equivalent to deflateEnd followed by deflateInit, + but does not free and reallocate all the internal compression state. + The stream will keep the same compression level and any other attributes + that may have been set by deflateInit2. + + deflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source + stream state was inconsistent (such as zalloc or state being NULL). +*/ + +extern int EXPORT deflateParams OF((z_streamp strm, int level, int strategy)); +/* + Dynamically update the compression level and compression strategy. + This can be used to switch between compression and straight copy of + the input data, or to switch to a different kind of input data requiring + a different strategy. If the compression level is changed, the input + available so far is compressed with the old level (and may be flushed); + the new level will take effect only at the next call of deflate(). + + Before the call of deflateParams, the stream state must be set as for + a call of deflate(), since the currently available input may have to + be compressed and flushed. In particular, strm->avail_out must be non-zero. + + deflateParams returns Z_OK if success, Z_STREAM_ERROR if the source + stream state was inconsistent or if a parameter was invalid, Z_BUF_ERROR + if strm->avail_out was zero. +*/ + +extern int EXPORT deflateOutputPending OF((z_streamp strm)); +/* + Returns the number of bytes of output which are immediately + available from the compressor (i.e. without any further input + or flush). +*/ + +/* +extern int EXPORT inflateInit2 OF((z_streamp strm, + int windowBits)); + + This is another version of inflateInit with more compression options. The + fields next_out, zalloc, zfree and opaque must be initialized before by + the caller. + + The windowBits parameter is the base two logarithm of the maximum window + size (the size of the history buffer). It should be in the range 8..15 for + this version of the library (the value 16 will be allowed soon). The + default value is 15 if inflateInit is used instead. If a compressed stream + with a larger window size is given as input, inflate() will return with + the error code Z_DATA_ERROR instead of trying to allocate a larger window. + + If next_out is not null, the library will use this buffer for the history + buffer; the buffer must either be large enough to hold the entire output + data, or have at least 1<