diff options
author | Sebastian Huber <sebastian.huber@embedded-brains.de> | 2019-09-24 11:05:03 +0200 |
---|---|---|
committer | Sebastian Huber <sebastian.huber@embedded-brains.de> | 2019-11-13 10:47:04 +0100 |
commit | a5ddb0ea69f21c16b7697a935d7a0c16bb3cffcf (patch) | |
tree | db091fb0f7d091804482156c9f3f55879ac93d5b /freebsd/sys | |
parent | test/syscalls01: Fix sporadic test failures (diff) | |
download | rtems-libbsd-a5ddb0ea69f21c16b7697a935d7a0c16bb3cffcf.tar.bz2 |
Update to FreeBSD head 2019-09-24
Git mirror commit 6b0307a0a5184339393f555d5d424190d8a8277a.
Diffstat (limited to 'freebsd/sys')
447 files changed, 20648 insertions, 10684 deletions
diff --git a/freebsd/sys/arm/include/machine/cpufunc.h b/freebsd/sys/arm/include/machine/cpufunc.h index 9dba8043..f34cfbff 100644 --- a/freebsd/sys/arm/include/machine/cpufunc.h +++ b/freebsd/sys/arm/include/machine/cpufunc.h @@ -360,6 +360,64 @@ extern u_int arm_cache_level; extern u_int arm_cache_loc; extern u_int arm_cache_type[14]; +#if __ARM_ARCH >= 6 +#define HAVE_INLINE_FFS + +static __inline __pure2 int +ffs(int mask) +{ + + return (__builtin_ffs(mask)); +} + +#define HAVE_INLINE_FFSL + +static __inline __pure2 int +ffsl(long mask) +{ + + return (__builtin_ffsl(mask)); +} + +#define HAVE_INLINE_FFSLL + +static __inline __pure2 int +ffsll(long long mask) +{ + + return (__builtin_ffsll(mask)); +} + +#define HAVE_INLINE_FLS + +static __inline __pure2 int +fls(int mask) +{ + + return (mask == 0 ? 0 : + 8 * sizeof(mask) - __builtin_clz((u_int)mask)); +} + +#define HAVE_INLINE_FLSL + +static __inline __pure2 int +flsl(long mask) +{ + + return (mask == 0 ? 0 : + 8 * sizeof(mask) - __builtin_clzl((u_long)mask)); +} + +#define HAVE_INLINE_FLSLL + +static __inline __pure2 int +flsll(long long mask) +{ + + return (mask == 0 ? 0 : + 8 * sizeof(mask) - __builtin_clzll((unsigned long long)mask)); +} +#endif #else /* !_KERNEL */ static __inline void diff --git a/freebsd/sys/arm/ti/am335x/tda19988.c b/freebsd/sys/arm/ti/am335x/tda19988.c index 282353ab..7ff4cf5b 100644 --- a/freebsd/sys/arm/ti/am335x/tda19988.c +++ b/freebsd/sys/arm/ti/am335x/tda19988.c @@ -245,7 +245,6 @@ struct tda19988_softc { uint32_t sc_addr; uint32_t sc_cec_addr; uint16_t sc_version; - struct intr_config_hook enum_hook; int sc_current_page; uint8_t *sc_edid; uint32_t sc_edid_len; @@ -647,15 +646,14 @@ done: } static void -tda19988_start(void *xdev) +tda19988_start(struct tda19988_softc *sc) { - struct tda19988_softc *sc; - device_t dev = (device_t)xdev; + device_t dev; uint8_t data; uint16_t version; - sc = device_get_softc(dev); - + dev = sc->sc_dev; + tda19988_cec_write(sc, TDA_CEC_ENAMODS, ENAMODS_RXSENS | ENAMODS_HDMI); DELAY(1000); tda19988_cec_read(sc, 0xfe, &data); @@ -701,7 +699,7 @@ tda19988_start(void *xdev) break; default: device_printf(dev, "Unknown device: %04x\n", sc->sc_version); - goto done; + return; } tda19988_reg_write(sc, TDA_DDC_CTRL, DDC_ENABLE); @@ -712,16 +710,13 @@ tda19988_start(void *xdev) if (tda19988_read_edid(sc) < 0) { device_printf(dev, "failed to read EDID\n"); - goto done; + return; } /* Default values for RGB 4:4:4 mapping */ tda19988_reg_write(sc, TDA_VIP_CNTRL_0, 0x23); tda19988_reg_write(sc, TDA_VIP_CNTRL_1, 0x01); tda19988_reg_write(sc, TDA_VIP_CNTRL_2, 0x45); - -done: - config_intrhook_disestablish(&sc->enum_hook); } static int @@ -740,15 +735,11 @@ tda19988_attach(device_t dev) device_set_desc(dev, "NXP TDA19988 HDMI transmitter"); - sc->enum_hook.ich_func = tda19988_start; - sc->enum_hook.ich_arg = dev; - - if (config_intrhook_establish(&sc->enum_hook) != 0) - return (ENOMEM); - node = ofw_bus_get_node(dev); OF_device_register_xref(OF_xref_from_node(node), dev); + tda19988_start(sc); + return (0); } diff --git a/freebsd/sys/arm/ti/cpsw/if_cpsw.c b/freebsd/sys/arm/ti/cpsw/if_cpsw.c index 1fbda688..be9ad62b 100644 --- a/freebsd/sys/arm/ti/cpsw/if_cpsw.c +++ b/freebsd/sys/arm/ti/cpsw/if_cpsw.c @@ -84,6 +84,8 @@ __FBSDID("$FreeBSD$"); #include <dev/ofw/ofw_bus.h> #include <dev/ofw/ofw_bus_subr.h> + +#include <dev/fdt/fdt_common.h> #ifdef CPSW_ETHERSWITCH #include <dev/etherswitch/etherswitch.h> @@ -749,7 +751,7 @@ cpsw_get_fdt_data(struct cpsw_softc *sc, int port) phandle_t child; unsigned long mdio_child_addr; - /* Find any slave with phy_id */ + /* Find any slave with phy-handle/phy_id */ phy = -1; vlan = -1; for (child = OF_child(sc->node); child != 0; child = OF_peer(child)) { @@ -760,14 +762,20 @@ cpsw_get_fdt_data(struct cpsw_softc *sc, int port) continue; } OF_prop_free(name); - if (mdio_child_addr != slave_mdio_addr[port]) + + if (mdio_child_addr != slave_mdio_addr[port] && + mdio_child_addr != (slave_mdio_addr[port] & 0xFFF)) continue; - len = OF_getproplen(child, "phy_id"); - if (len / sizeof(pcell_t) == 2) { - /* Get phy address from fdt */ - if (OF_getencprop(child, "phy_id", phy_id, len) > 0) - phy = phy_id[1]; + if (fdt_get_phyaddr(child, NULL, &phy, NULL) != 0){ + /* Users with old DTB will have phy_id instead */ + phy = -1; + len = OF_getproplen(child, "phy_id"); + if (len / sizeof(pcell_t) == 2) { + /* Get phy address from fdt */ + if (OF_getencprop(child, "phy_id", phy_id, len) > 0) + phy = phy_id[1]; + } } len = OF_getproplen(child, "dual_emac_res_vlan"); diff --git a/freebsd/sys/arm/ti/ti_hwmods.c b/freebsd/sys/arm/ti/ti_hwmods.c index 450679a7..a546d762 100644 --- a/freebsd/sys/arm/ti/ti_hwmods.c +++ b/freebsd/sys/arm/ti/ti_hwmods.c @@ -99,6 +99,16 @@ struct hwmod ti_hwmods[] = { {NULL, 0} }; +static inline int +ti_get_hwmods_prop(phandle_t node, void **name) +{ + int len; + + if ((len = OF_getprop_alloc(node, "ti,hwmods", name)) > 0) + return (len); + return (OF_getprop_alloc(OF_parent(node), "ti,hwmods", name)); +} + clk_ident_t ti_hwmods_get_clock(device_t dev) { @@ -112,7 +122,7 @@ ti_hwmods_get_clock(device_t dev) if ((node = ofw_bus_get_node(dev)) == 0) return (INVALID_CLK_IDENT); - if ((len = OF_getprop_alloc(node, "ti,hwmods", (void**)&name)) <= 0) + if ((len = ti_get_hwmods_prop(node, (void **)&name)) <= 0) return (INVALID_CLK_IDENT); buf = name; @@ -150,7 +160,7 @@ int ti_hwmods_contains(device_t dev, const char *hwmod) if ((node = ofw_bus_get_node(dev)) == 0) return (0); - if ((len = OF_getprop_alloc(node, "ti,hwmods", (void**)&name)) <= 0) + if ((len = ti_get_hwmods_prop(node, (void **)&name)) <= 0) return (0); buf = name; @@ -184,7 +194,7 @@ ti_hwmods_get_unit(device_t dev, const char *hwmod) if ((node = ofw_bus_get_node(dev)) == 0) return (0); - if ((len = OF_getprop_alloc(node, "ti,hwmods", (void**)&name)) <= 0) + if ((len = ti_get_hwmods_prop(node, (void **)&name)) <= 0) return (0); buf = name; diff --git a/freebsd/sys/arm/ti/ti_sdhci.c b/freebsd/sys/arm/ti/ti_sdhci.c index c5d29cb6..a2be1f19 100644 --- a/freebsd/sys/arm/ti/ti_sdhci.c +++ b/freebsd/sys/arm/ti/ti_sdhci.c @@ -484,15 +484,14 @@ ti_sdhci_hw_init(device_t dev) * The attach() routine has examined fdt data and set flags in * slot.host.caps to reflect what voltages we can handle. Set those * values in the CAPA register. The manual says that these values can - * only be set once, "before initialization" whatever that means, and - * that they survive a reset. So maybe doing this will be a no-op if - * u-boot has already initialized the hardware. + * only be set once, and that they survive a reset so unless u-boot didn't + * set this register this code is a no-op. */ regval = ti_mmchs_read_4(sc, MMCHS_SD_CAPA); if (sc->slot.host.caps & MMC_OCR_LOW_VOLTAGE) regval |= MMCHS_SD_CAPA_VS18; - if (sc->slot.host.caps & (MMC_OCR_290_300 | MMC_OCR_300_310)) - regval |= MMCHS_SD_CAPA_VS30; + if (sc->slot.host.caps & (MMC_OCR_320_330 | MMC_OCR_330_340)) + regval |= MMCHS_SD_CAPA_VS33; ti_mmchs_write_4(sc, MMCHS_SD_CAPA, regval); /* Set initial host configuration (1-bit, std speed, pwr off). */ @@ -526,17 +525,20 @@ ti_sdhci_attach(device_t dev) } /* - * The hardware can inherently do dual-voltage (1p8v, 3p0v) on the first + * The hardware can inherently do dual-voltage (1p8v, 3p3v) on the first * device, and only 1p8v on other devices unless an external transceiver * is used. The only way we could know about a transceiver is fdt data. * Note that we have to do this before calling ti_sdhci_hw_init() so * that it can set the right values in the CAPA register, which can only * be done once and never reset. */ - sc->slot.host.caps |= MMC_OCR_LOW_VOLTAGE; - if (sc->mmchs_clk_id == MMC1_CLK || OF_hasprop(node, "ti,dual-volt")) { - sc->slot.host.caps |= MMC_OCR_290_300 | MMC_OCR_300_310; - } + if (OF_hasprop(node, "ti,dual-volt")) { + sc->slot.host.caps |= MMC_OCR_LOW_VOLTAGE | MMC_OCR_320_330 | MMC_OCR_330_340; + } else if (OF_hasprop(node, "no-1-8-v")) { + sc->slot.host.caps |= MMC_OCR_320_330 | MMC_OCR_330_340; + } else + sc->slot.host.caps |= MMC_OCR_LOW_VOLTAGE; + /* * Set the offset from the device's memory start to the MMCHS registers. @@ -757,7 +759,7 @@ static driver_t ti_sdhci_driver = { DRIVER_MODULE(sdhci_ti, simplebus, ti_sdhci_driver, ti_sdhci_devclass, NULL, NULL); -MODULE_DEPEND(sdhci_ti, sdhci, 1, 1, 1); +SDHCI_DEPEND(sdhci_ti); #ifndef MMCCAM MMC_DECLARE_BRIDGE(sdhci_ti); diff --git a/freebsd/sys/cam/ata/ata_all.h b/freebsd/sys/cam/ata/ata_all.h index 087d6820..ca635253 100644 --- a/freebsd/sys/cam/ata/ata_all.h +++ b/freebsd/sys/cam/ata/ata_all.h @@ -135,6 +135,7 @@ void ata_read_log(struct ccb_ataio *ataio, uint32_t retries, uint16_t block_count, uint32_t protocol, uint8_t *data_ptr, uint32_t dxfer_len, uint32_t timeout); +void ata_param_fixup(struct ata_params *ident_buf); void ata_bswap(int8_t *buf, int len); void ata_btrim(int8_t *buf, int len); void ata_bpack(int8_t *src, int8_t *dst, int len); diff --git a/freebsd/sys/cam/cam.c b/freebsd/sys/cam/cam.c index 5d07bebf..25f99ae7 100644 --- a/freebsd/sys/cam/cam.c +++ b/freebsd/sys/cam/cam.c @@ -418,7 +418,6 @@ cam_error_string(struct cam_device *device, union ccb *ccb, char *str, switch (ccb->ccb_h.func_code) { case XPT_ATA_IO: ata_command_sbuf(&ccb->ataio, &sb); - sbuf_printf(&sb, "\n"); break; case XPT_SCSI_IO: #ifdef _KERNEL @@ -426,17 +425,22 @@ cam_error_string(struct cam_device *device, union ccb *ccb, char *str, #else /* !_KERNEL */ scsi_command_string(device, &ccb->csio, &sb); #endif /* _KERNEL/!_KERNEL */ - sbuf_printf(&sb, "\n"); break; case XPT_SMP_IO: smp_command_sbuf(&ccb->smpio, &sb, path_str, 79 - strlen(path_str), (proto_flags & CAM_ESMF_PRINT_FULL_CMD) ? 79 : 0); - sbuf_printf(&sb, "\n"); + break; + case XPT_NVME_IO: + case XPT_NVME_ADMIN: + nvme_command_sbuf(&ccb->nvmeio, &sb); break; default: + sbuf_printf(&sb, "CAM func %#x", + ccb->ccb_h.func_code); break; } + sbuf_printf(&sb, "\n"); } if (flags & CAM_ESF_CAM_STATUS) { diff --git a/freebsd/sys/cam/cam_ccb.h b/freebsd/sys/cam/cam_ccb.h index 9119468d..7deeb523 100644 --- a/freebsd/sys/cam/cam_ccb.h +++ b/freebsd/sys/cam/cam_ccb.h @@ -1074,6 +1074,7 @@ struct ccb_trans_settings_mmc { #define MMC_CAP_8_BIT_DATA (1 << 1) /* Can do 8-bit data transfers */ #define MMC_CAP_HSPEED (1 << 2) /* Can do High Speed transfers */ uint32_t host_caps; + uint32_t host_max_data; }; /* Get/Set transfer rate/width/disconnection/tag queueing settings */ diff --git a/freebsd/sys/cam/cam_periph.h b/freebsd/sys/cam/cam_periph.h index 6eb0084a..8cd9f800 100644 --- a/freebsd/sys/cam/cam_periph.h +++ b/freebsd/sys/cam/cam_periph.h @@ -37,6 +37,8 @@ #include <cam/cam_sim.h> #ifdef _KERNEL +#include <sys/lock.h> +#include <sys/mutex.h> #include <sys/sysctl.h> #include <sys/taskqueue.h> @@ -149,6 +151,7 @@ struct cam_periph { struct cam_periph_map_info { int num_bufs_used; + void *orig[CAM_PERIPH_MAXMAPS]; struct buf *bp[CAM_PERIPH_MAXMAPS]; }; diff --git a/freebsd/sys/cam/cam_sim.h b/freebsd/sys/cam/cam_sim.h index 95dedb08..55392e48 100644 --- a/freebsd/sys/cam/cam_sim.h +++ b/freebsd/sys/cam/cam_sim.h @@ -67,6 +67,15 @@ struct cam_sim * cam_sim_alloc(sim_action_func sim_action, int max_dev_transactions, int max_tagged_dev_transactions, struct cam_devq *queue); +struct cam_sim * cam_sim_alloc_dev(sim_action_func sim_action, + sim_poll_func sim_poll, + const char *sim_name, + void *softc, + device_t dev, + struct mtx *mtx, + int max_dev_transactions, + int max_tagged_dev_transactions, + struct cam_devq *queue); void cam_sim_free(struct cam_sim *sim, int free_devq); void cam_sim_hold(struct cam_sim *sim); void cam_sim_release(struct cam_sim *sim); @@ -150,6 +159,7 @@ struct cam_sim { struct callout callout; struct cam_devq *devq; /* Device Queue to use for this SIM */ int refcount; /* References to the SIM. */ + device_t sim_dev; /* For attached peripherals. */ #endif /* __rtems__ */ }; diff --git a/freebsd/sys/cam/nvme/nvme_all.h b/freebsd/sys/cam/nvme/nvme_all.h index e31c1e5e..da40dea1 100644 --- a/freebsd/sys/cam/nvme/nvme_all.h +++ b/freebsd/sys/cam/nvme/nvme_all.h @@ -1,7 +1,7 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * - * Copyright (c) 2015 Netflix, Inc + * Copyright (c) 2015 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -42,8 +42,10 @@ int nvme_identify_match(caddr_t identbuffer, caddr_t table_entry); struct sbuf; void nvme_print_ident(const struct nvme_controller_data *, const struct nvme_namespace_data *, struct sbuf *); -const char *nvme_op_string(const struct nvme_command *); +const char *nvme_op_string(const struct nvme_command *, int admin); const char *nvme_cmd_string(const struct nvme_command *, char *, size_t); +void nvme_cmd_sbuf(const struct nvme_command *, struct sbuf *sb); +int nvme_command_sbuf(struct ccb_nvmeio *nvmeio, struct sbuf *sb); const void *nvme_get_identify_cntrl(struct cam_periph *); const void *nvme_get_identify_ns(struct cam_periph *); diff --git a/freebsd/sys/cam/scsi/scsi_all.c b/freebsd/sys/cam/scsi/scsi_all.c index 0be7e692..99d82fee 100644 --- a/freebsd/sys/cam/scsi/scsi_all.c +++ b/freebsd/sys/cam/scsi/scsi_all.c @@ -382,7 +382,7 @@ static struct op_table_entry scsi_op_codes[] = { { 0x40, D | T | L | P | W | R | O | M | S | C, "CHANGE DEFINITION" }, /* 41 O WRITE SAME(10) */ { 0x41, D, "WRITE SAME(10)" }, - /* 42 O UNMAP */ + /* 42 O UNMAP */ { 0x42, D, "UNMAP" }, /* 42 O READ SUB-CHANNEL */ { 0x42, R, "READ SUB-CHANNEL" }, @@ -397,7 +397,8 @@ static struct op_table_entry scsi_op_codes[] = { { 0x46, R, "GET CONFIGURATION" }, /* 47 O PLAY AUDIO MSF */ { 0x47, R, "PLAY AUDIO MSF" }, - /* 48 */ + /* 48 O SANITIZE */ + { 0x48, D, "SANITIZE" }, /* 49 */ /* 4A M GET EVENT STATUS NOTIFICATION */ { 0x4A, R, "GET EVENT STATUS NOTIFICATION" }, @@ -1165,7 +1166,7 @@ static struct asc_table_entry asc_table[] = { { SST(0x04, 0x1A, SS_RDEF, /* XXX TBD */ "Logical unit not ready, START/STOP UNIT command in progress") }, /* D B */ - { SST(0x04, 0x1B, SS_RDEF, /* XXX TBD */ + { SST(0x04, 0x1B, SS_WAIT | EBUSY, "Logical unit not ready, sanitize in progress") }, /* DT MAEB */ { SST(0x04, 0x1C, SS_START | SSQ_DECREMENT_COUNT | ENXIO, @@ -1456,7 +1457,7 @@ static struct asc_table_entry asc_table[] = { { SST(0x11, 0x14, SS_RDEF, /* XXX TBD */ "Read error - LBA marked bad by application client") }, /* D */ - { SST(0x11, 0x15, SS_RDEF, /* XXX TBD */ + { SST(0x11, 0x15, SS_FATAL | EIO, "Write after sanitize required") }, /* D W O BK */ { SST(0x12, 0x00, SS_RDEF, @@ -2058,7 +2059,7 @@ static struct asc_table_entry asc_table[] = { { SST(0x30, 0x13, SS_RDEF, /* XXX TBD */ "Cleaning volume expired") }, /* DT WRO BK */ - { SST(0x31, 0x00, SS_RDEF, + { SST(0x31, 0x00, SS_FATAL | ENXIO, "Medium format corrupted") }, /* D L RO B */ { SST(0x31, 0x01, SS_RDEF, @@ -2067,7 +2068,7 @@ static struct asc_table_entry asc_table[] = { { SST(0x31, 0x02, SS_RDEF, /* XXX TBD */ "Zoned formatting failed due to spare linking") }, /* D B */ - { SST(0x31, 0x03, SS_RDEF, /* XXX TBD */ + { SST(0x31, 0x03, SS_FATAL | EIO, "SANITIZE command failed") }, /* D W O BK */ { SST(0x32, 0x00, SS_RDEF, @@ -3947,7 +3948,7 @@ scsi_set_sense_data_fixed_va(struct scsi_sense_data *sense_data, } if (len > sizeof(sense->cmd_spec_info)) { data += len - sizeof(sense->cmd_spec_info); - len -= len - sizeof(sense->cmd_spec_info); + len = sizeof(sense->cmd_spec_info); } bcopy(data, &sense->cmd_spec_info[ sizeof(sense->cmd_spec_info) - len], len); @@ -4066,6 +4067,10 @@ scsi_get_sense_info(struct scsi_sense_data *sense_data, u_int sense_len, struct scsi_sense_info *info_desc; info_desc = (struct scsi_sense_info *)desc; + + if ((info_desc->byte2 & SSD_INFO_VALID) == 0) + goto bailout; + *info = scsi_8btou64(info_desc->info); if (signed_info != NULL) *signed_info = *info; @@ -4086,6 +4091,9 @@ scsi_get_sense_info(struct scsi_sense_data *sense_data, u_int sense_len, fru_desc = (struct scsi_sense_fru *)desc; + if (fru_desc->fru == 0) + goto bailout; + *info = fru_desc->fru; if (signed_info != NULL) *signed_info = (int8_t)fru_desc->fru; @@ -4186,10 +4194,9 @@ scsi_get_sks(struct scsi_sense_data *sense_data, u_int sense_len, uint8_t *sks) if (desc == NULL) goto bailout; - /* - * No need to check the SKS valid bit for descriptor sense. - * If the descriptor is present, it is valid. - */ + if ((desc->sense_key_spec[0] & SSD_SKS_VALID) == 0) + goto bailout; + bcopy(desc->sense_key_spec, sks, sizeof(desc->sense_key_spec)); break; } @@ -4266,9 +4273,6 @@ scsi_get_block_info(struct scsi_sense_data *sense_data, u_int sense_len, if (SSD_FIXED_IS_PRESENT(sense, sense_len, flags) == 0) goto bailout; - if ((sense->flags & SSD_ILI) == 0) - goto bailout; - *block_bits = sense->flags & SSD_ILI; break; } @@ -4322,9 +4326,6 @@ scsi_get_stream_info(struct scsi_sense_data *sense_data, u_int sense_len, if (SSD_FIXED_IS_PRESENT(sense, sense_len, flags) == 0) goto bailout; - if ((sense->flags & (SSD_ILI|SSD_EOM|SSD_FILEMARK)) == 0) - goto bailout; - *stream_bits = sense->flags & (SSD_ILI|SSD_EOM|SSD_FILEMARK); break; } @@ -4366,8 +4367,6 @@ scsi_progress_sbuf(struct sbuf *sb, uint16_t progress) int scsi_sks_sbuf(struct sbuf *sb, int sense_key, uint8_t *sks) { - if ((sks[0] & SSD_SKS_VALID) == 0) - return (1); switch (sense_key) { case SSD_KEY_ILLEGAL_REQUEST: { @@ -4464,7 +4463,7 @@ scsi_fru_sbuf(struct sbuf *sb, uint64_t fru) } void -scsi_stream_sbuf(struct sbuf *sb, uint8_t stream_bits, uint64_t info) +scsi_stream_sbuf(struct sbuf *sb, uint8_t stream_bits) { int need_comma; @@ -4472,6 +4471,7 @@ scsi_stream_sbuf(struct sbuf *sb, uint8_t stream_bits, uint64_t info) /* * XXX KDM this needs more descriptive decoding. */ + sbuf_printf(sb, "Stream Command Sense Data: "); if (stream_bits & SSD_DESC_STREAM_FM) { sbuf_printf(sb, "Filemark"); need_comma = 1; @@ -4484,15 +4484,15 @@ scsi_stream_sbuf(struct sbuf *sb, uint8_t stream_bits, uint64_t info) if (stream_bits & SSD_DESC_STREAM_ILI) sbuf_printf(sb, "%sILI", (need_comma) ? "," : ""); - - sbuf_printf(sb, ": Info: %#jx", (uintmax_t) info); } void -scsi_block_sbuf(struct sbuf *sb, uint8_t block_bits, uint64_t info) +scsi_block_sbuf(struct sbuf *sb, uint8_t block_bits) { + + sbuf_printf(sb, "Block Command Sense Data: "); if (block_bits & SSD_DESC_BLOCK_ILI) - sbuf_printf(sb, "ILI: residue %#jx", (uintmax_t) info); + sbuf_printf(sb, "ILI"); } void @@ -4505,6 +4505,9 @@ scsi_sense_info_sbuf(struct sbuf *sb, struct scsi_sense_data *sense, info = (struct scsi_sense_info *)header; + if ((info->byte2 & SSD_INFO_VALID) == 0) + return; + scsi_info_sbuf(sb, cdb, cdb_len, inq_data, scsi_8btou64(info->info)); } @@ -4533,6 +4536,9 @@ scsi_sense_sks_sbuf(struct sbuf *sb, struct scsi_sense_data *sense, sks = (struct scsi_sense_sks *)header; + if ((sks->sense_key_spec[0] & SSD_SKS_VALID) == 0) + return; + scsi_extract_sense_len(sense, sense_len, &error_code, &sense_key, &asc, &ascq, /*show_errors*/ 1); @@ -4549,6 +4555,9 @@ scsi_sense_fru_sbuf(struct sbuf *sb, struct scsi_sense_data *sense, fru = (struct scsi_sense_fru *)header; + if (fru->fru == 0) + return; + scsi_fru_sbuf(sb, (uint64_t)fru->fru); } @@ -4559,14 +4568,9 @@ scsi_sense_stream_sbuf(struct sbuf *sb, struct scsi_sense_data *sense, struct scsi_sense_desc_header *header) { struct scsi_sense_stream *stream; - uint64_t info; stream = (struct scsi_sense_stream *)header; - info = 0; - - scsi_get_sense_info(sense, sense_len, SSD_DESC_INFO, &info, NULL); - - scsi_stream_sbuf(sb, stream->byte3, info); + scsi_stream_sbuf(sb, stream->byte3); } void @@ -4576,14 +4580,9 @@ scsi_sense_block_sbuf(struct sbuf *sb, struct scsi_sense_data *sense, struct scsi_sense_desc_header *header) { struct scsi_sense_block *block; - uint64_t info; block = (struct scsi_sense_block *)header; - info = 0; - - scsi_get_sense_info(sense, sense_len, SSD_DESC_INFO, &info, NULL); - - scsi_block_sbuf(sb, block->byte3, info); + scsi_block_sbuf(sb, block->byte3); } void @@ -4868,7 +4867,7 @@ scsi_sense_only_sbuf(struct scsi_sense_data *sense, u_int sense_len, const char *asc_desc; uint8_t sks[3]; uint64_t val; - int info_valid; + uint8_t bits; /* * Get descriptions for the sense key, ASC, and ASCQ. If @@ -4887,42 +4886,28 @@ scsi_sense_only_sbuf(struct scsi_sense_data *sense, u_int sense_len, sbuf_printf(sb, " asc:%x,%x (%s)\n", asc, ascq, asc_desc); /* - * Get the info field if it is valid. + * Print any block or stream device-specific information. */ - if (scsi_get_sense_info(sense, sense_len, SSD_DESC_INFO, - &val, NULL) == 0) - info_valid = 1; - else - info_valid = 0; - - if (info_valid != 0) { - uint8_t bits; + if (scsi_get_block_info(sense, sense_len, inq_data, + &bits) == 0 && bits != 0) { + sbuf_cat(sb, path_str); + scsi_block_sbuf(sb, bits); + sbuf_printf(sb, "\n"); + } else if (scsi_get_stream_info(sense, sense_len, inq_data, + &bits) == 0 && bits != 0) { + sbuf_cat(sb, path_str); + scsi_stream_sbuf(sb, bits); + sbuf_printf(sb, "\n"); + } - /* - * Determine whether we have any block or stream - * device-specific information. - */ - if (scsi_get_block_info(sense, sense_len, inq_data, - &bits) == 0) { - sbuf_cat(sb, path_str); - scsi_block_sbuf(sb, bits, val); - sbuf_printf(sb, "\n"); - } else if (scsi_get_stream_info(sense, sense_len, - inq_data, &bits) == 0) { - sbuf_cat(sb, path_str); - scsi_stream_sbuf(sb, bits, val); - sbuf_printf(sb, "\n"); - } else if (val != 0) { - /* - * The information field can be valid but 0. - * If the block or stream bits aren't set, - * and this is 0, it isn't terribly useful - * to print it out. - */ - sbuf_cat(sb, path_str); - scsi_info_sbuf(sb, cdb, cdb_len, inq_data, val); - sbuf_printf(sb, "\n"); - } + /* + * Print the info field. + */ + if (scsi_get_sense_info(sense, sense_len, SSD_DESC_INFO, + &val, NULL) == 0) { + sbuf_cat(sb, path_str); + scsi_info_sbuf(sb, cdb, cdb_len, inq_data, val); + sbuf_printf(sb, "\n"); } /* @@ -5600,6 +5585,7 @@ scsi_devid_is_naa_ieee_reg(uint8_t *bufp) { struct scsi_vpd_id_descriptor *descr; struct scsi_vpd_id_naa_basic *naa; + int n; descr = (struct scsi_vpd_id_descriptor *)bufp; naa = (struct scsi_vpd_id_naa_basic *)descr->identifier; @@ -5607,7 +5593,8 @@ scsi_devid_is_naa_ieee_reg(uint8_t *bufp) return 0; if (descr->length < sizeof(struct scsi_vpd_id_naa_ieee_reg)) return 0; - if ((naa->naa >> SVPD_ID_NAA_NAA_SHIFT) != SVPD_ID_NAA_IEEE_REG) + n = naa->naa >> SVPD_ID_NAA_NAA_SHIFT; + if (n != SVPD_ID_NAA_LOCAL_REG && n != SVPD_ID_NAA_IEEE_REG) return 0; return 1; } @@ -8307,10 +8294,10 @@ scsi_ata_identify(struct ccb_scsiio *csio, u_int32_t retries, tag_action, /*protocol*/AP_PROTO_PIO_IN, /*ata_flags*/AP_FLAG_TDIR_FROM_DEV | - AP_FLAG_BYT_BLOK_BYTES | + AP_FLAG_BYT_BLOK_BLOCKS | AP_FLAG_TLEN_SECT_CNT, /*features*/0, - /*sector_count*/dxfer_len, + /*sector_count*/dxfer_len / 512, /*lba*/0, /*command*/ATA_ATA_IDENTIFY, /*device*/ 0, diff --git a/freebsd/sys/cam/scsi/scsi_all.h b/freebsd/sys/cam/scsi/scsi_all.h index b5c45bc0..1e0c75bb 100644 --- a/freebsd/sys/cam/scsi/scsi_all.h +++ b/freebsd/sys/cam/scsi/scsi_all.h @@ -264,7 +264,9 @@ struct scsi_mode_hdr_10 u_int8_t datalen[2]; u_int8_t medium_type; u_int8_t dev_specific; - u_int8_t reserved[2]; + u_int8_t flags; +#define SMH_LONGLBA 0x01 + u_int8_t reserved; u_int8_t block_descr_len[2]; }; @@ -276,6 +278,20 @@ struct scsi_mode_block_descr u_int8_t block_len[3]; }; +struct scsi_mode_block_descr_dshort +{ + u_int8_t num_blocks[4]; + u_int8_t reserved; + u_int8_t block_len[3]; +}; + +struct scsi_mode_block_descr_dlong +{ + u_int8_t num_blocks[8]; + u_int8_t reserved[4]; + u_int8_t block_len[4]; +}; + struct scsi_per_res_in { u_int8_t opcode; @@ -568,6 +584,7 @@ struct scsi_log_sense #define SLS_ERROR_NONMEDIUM_PAGE 0x06 #define SLS_ERROR_LASTN_PAGE 0x07 #define SLS_LOGICAL_BLOCK_PROVISIONING 0x0c +#define SLS_TEMPERATURE 0x0d #define SLS_SELF_TEST_PAGE 0x10 #define SLS_SOLID_STATE_MEDIA 0x11 #define SLS_STAT_AND_PERF 0x19 @@ -683,6 +700,14 @@ struct scsi_log_informational_exceptions { uint8_t temperature; }; +struct scsi_log_temperature { + struct scsi_log_param_header hdr; +#define SLP_TEMPERATURE 0x0000 +#define SLP_REFTEMPERATURE 0x0001 + uint8_t reserved; + uint8_t temperature; +}; + struct scsi_control_page { u_int8_t page_code; u_int8_t page_length; @@ -2763,6 +2788,19 @@ struct scsi_vpd_tpc }; /* + * SCSI Feature Sets VPD Page + */ +struct scsi_vpd_sfs +{ + uint8_t device; + uint8_t page_code; +#define SVPD_SCSI_SFS 0x92 + uint8_t page_length[2]; + uint8_t reserved[4]; + uint8_t codes[]; +}; + +/* * Block Device Characteristics VPD Page based on * T10/1799-D Revision 31 */ @@ -2803,11 +2841,15 @@ struct scsi_vpd_block_device_characteristics uint8_t flags; #define SVPD_VBULS 0x01 #define SVPD_FUAB 0x02 +#define SVPD_BOCS 0x04 +#define SVPD_RBWZ 0x08 #define SVPD_ZBC_NR 0x00 /* Not Reported */ #define SVPD_HAW_ZBC 0x10 /* Host Aware */ #define SVPD_DM_ZBC 0x20 /* Drive Managed */ #define SVPD_ZBC_MASK 0x30 /* Zoned mask */ - uint8_t reserved[55]; + uint8_t reserved[3]; + uint8_t depopulation_time[4]; + uint8_t reserved2[48]; }; #define SBDC_IS_PRESENT(bdc, length, field) \ @@ -2844,7 +2886,7 @@ struct scsi_vpd_logical_block_prov }; /* - * Block Limits VDP Page based on SBC-4 Revision 2 + * Block Limits VDP Page based on SBC-4 Revision 17 */ struct scsi_vpd_block_limits { @@ -2854,7 +2896,8 @@ struct scsi_vpd_block_limits u_int8_t page_length[2]; #define SVPD_BL_PL_BASIC 0x10 #define SVPD_BL_PL_TP 0x3C - u_int8_t reserved1; + u_int8_t flags; +#define SVPD_BL_WSNZ 0x01 u_int8_t max_cmp_write_len; u_int8_t opt_txfer_len_grain[2]; u_int8_t max_txfer_len[4]; @@ -2931,6 +2974,7 @@ struct scsi_read_capacity_data_long uint8_t length[4]; #define SRC16_PROT_EN 0x01 #define SRC16_P_TYPE 0x0e +#define SRC16_P_TYPE_SHIFT 1 #define SRC16_PTYPE_1 0x00 #define SRC16_PTYPE_2 0x02 #define SRC16_PTYPE_3 0x04 @@ -3578,7 +3622,9 @@ struct scsi_mode_header_10 u_int8_t data_length[2];/* Sense data length */ u_int8_t medium_type; u_int8_t dev_spec; - u_int8_t unused[2]; + u_int8_t flags; +#define SMH_LONGLBA 0x01 + u_int8_t unused; u_int8_t blk_desc_len[2]; }; @@ -3749,8 +3795,8 @@ void scsi_command_sbuf(struct sbuf *sb, uint8_t *cdb, int cdb_len, void scsi_progress_sbuf(struct sbuf *sb, uint16_t progress); int scsi_sks_sbuf(struct sbuf *sb, int sense_key, uint8_t *sks); void scsi_fru_sbuf(struct sbuf *sb, uint64_t fru); -void scsi_stream_sbuf(struct sbuf *sb, uint8_t stream_bits, uint64_t info); -void scsi_block_sbuf(struct sbuf *sb, uint8_t block_bits, uint64_t info); +void scsi_stream_sbuf(struct sbuf *sb, uint8_t stream_bits); +void scsi_block_sbuf(struct sbuf *sb, uint8_t block_bits); void scsi_sense_info_sbuf(struct sbuf *sb, struct scsi_sense_data *sense, u_int sense_len, uint8_t *cdb, int cdb_len, struct scsi_inquiry_data *inq_data, diff --git a/freebsd/sys/crypto/blowfish/bf_skey.c b/freebsd/sys/crypto/blowfish/bf_skey.c index f793d689..9bccaaf2 100644 --- a/freebsd/sys/crypto/blowfish/bf_skey.c +++ b/freebsd/sys/crypto/blowfish/bf_skey.c @@ -75,11 +75,11 @@ void BF_set_key(key, len, data) BF_KEY *key; int len; - unsigned char *data; + const unsigned char *data; { int i; BF_LONG *p, ri, in[2]; - unsigned char *d, *end; + const unsigned char *d, *end; memcpy((char *)key, (const char *)&bf_init, sizeof(BF_KEY)); p = key->P; diff --git a/freebsd/sys/crypto/blowfish/blowfish.h b/freebsd/sys/crypto/blowfish/blowfish.h index ecc14075..d09f83cf 100644 --- a/freebsd/sys/crypto/blowfish/blowfish.h +++ b/freebsd/sys/crypto/blowfish/blowfish.h @@ -80,7 +80,7 @@ typedef struct bf_key_st { BF_LONG S[4*256]; } BF_KEY; -void BF_set_key(BF_KEY *, int, unsigned char *); +void BF_set_key(BF_KEY *, int, const unsigned char *); void BF_encrypt(BF_LONG *, BF_KEY *); void BF_decrypt(BF_LONG *, BF_KEY *); void BF_ecb_encrypt(const unsigned char *, unsigned char *, diff --git a/freebsd/sys/crypto/chacha20/chacha-sw.c b/freebsd/sys/crypto/chacha20/chacha-sw.c index 0a03d91b..f610dac0 100644 --- a/freebsd/sys/crypto/chacha20/chacha-sw.c +++ b/freebsd/sys/crypto/chacha20/chacha-sw.c @@ -9,7 +9,7 @@ __FBSDID("$FreeBSD$"); #include <opencrypto/xform_enc.h> static int -chacha20_xform_setkey(u_int8_t **sched, u_int8_t *key, int len) +chacha20_xform_setkey(u_int8_t **sched, const u_int8_t *key, int len) { struct chacha_ctx *ctx; @@ -26,7 +26,7 @@ chacha20_xform_setkey(u_int8_t **sched, u_int8_t *key, int len) } static void -chacha20_xform_reinit(caddr_t key, u_int8_t *iv) +chacha20_xform_reinit(caddr_t key, const u_int8_t *iv) { struct chacha_ctx *ctx; diff --git a/freebsd/sys/crypto/chacha20/chacha.c b/freebsd/sys/crypto/chacha20/chacha.c index 154726c2..ff7c4f81 100644 --- a/freebsd/sys/crypto/chacha20/chacha.c +++ b/freebsd/sys/crypto/chacha20/chacha.c @@ -90,12 +90,32 @@ chacha_keysetup(chacha_ctx *x,const u8 *k,u_int kbits) LOCAL void chacha_ivsetup(chacha_ctx *x, const u8 *iv, const u8 *counter) { +#ifndef CHACHA_NONCE0_CTR128 x->input[12] = counter == NULL ? 0 : U8TO32_LITTLE(counter + 0); x->input[13] = counter == NULL ? 0 : U8TO32_LITTLE(counter + 4); x->input[14] = U8TO32_LITTLE(iv + 0); x->input[15] = U8TO32_LITTLE(iv + 4); +#else + // CHACHA_STATELEN + (void)iv; + x->input[12] = U8TO32_LITTLE(counter + 0); + x->input[13] = U8TO32_LITTLE(counter + 4); + x->input[14] = U8TO32_LITTLE(counter + 8); + x->input[15] = U8TO32_LITTLE(counter + 12); +#endif } +#ifdef CHACHA_NONCE0_CTR128 +LOCAL void +chacha_ctrsave(const chacha_ctx *x, u8 *counter) +{ + U32TO8_LITTLE(counter + 0, x->input[12]); + U32TO8_LITTLE(counter + 4, x->input[13]); + U32TO8_LITTLE(counter + 8, x->input[14]); + U32TO8_LITTLE(counter + 12, x->input[15]); +} +#endif + LOCAL void #ifndef __rtems__ chacha_encrypt_bytes(chacha_ctx *x,const u8 *m,u8 *c,u32 bytes) @@ -202,7 +222,16 @@ chacha_encrypt_bytes(chacha_ctx *x,const u8 *m,u8 *c,u_int bytes) j12 = PLUSONE(j12); if (!j12) { j13 = PLUSONE(j13); +#ifndef CHACHA_NONCE0_CTR128 /* stopping at 2^70 bytes per nonce is user's responsibility */ +#else + if (!j13) { + j14 = PLUSONE(j14); + if (!j14) { + j15 = PLUSONE(j15); + } + } +#endif } U32TO8_LITTLE(c + 0,x0); @@ -228,6 +257,10 @@ chacha_encrypt_bytes(chacha_ctx *x,const u8 *m,u8 *c,u_int bytes) } x->input[12] = j12; x->input[13] = j13; +#ifdef CHACHA_NONCE0_CTR128 + x->input[14] = j14; + x->input[15] = j15; +#endif return; } bytes -= 64; diff --git a/freebsd/sys/crypto/chacha20/chacha.h b/freebsd/sys/crypto/chacha20/chacha.h index 73548331..32262b04 100644 --- a/freebsd/sys/crypto/chacha20/chacha.h +++ b/freebsd/sys/crypto/chacha20/chacha.h @@ -26,10 +26,19 @@ Public domain. #define LOCAL #endif +#ifdef CHACHA_NONCE0_CTR128 +#define CHACHA_UNUSED __unused +#else +#define CHACHA_UNUSED +#endif + LOCAL void chacha_keysetup(struct chacha_ctx *x, const u_char *k, u_int kbits); -LOCAL void chacha_ivsetup(struct chacha_ctx *x, const u_char *iv, const u_char *ctr); +LOCAL void chacha_ivsetup(struct chacha_ctx *x, const u_char *iv CHACHA_UNUSED, + const u_char *ctr); LOCAL void chacha_encrypt_bytes(struct chacha_ctx *x, const u_char *m, u_char *c, u_int bytes); +#undef CHACHA_UNUSED + #endif /* CHACHA_H */ diff --git a/freebsd/sys/crypto/des/des.h b/freebsd/sys/crypto/des/des.h index 81c7bfbe..339921a7 100644 --- a/freebsd/sys/crypto/des/des.h +++ b/freebsd/sys/crypto/des/des.h @@ -82,7 +82,7 @@ typedef struct des_ks_struct extern int des_check_key; /* defaults to false */ char *des_options(void); -void des_ecb_encrypt(des_cblock *, des_cblock *, des_key_schedule, int); +void des_ecb_encrypt(unsigned char *, unsigned char *, des_key_schedule, int); void des_encrypt1(DES_LONG *, des_key_schedule, int); void des_encrypt2(DES_LONG *, des_key_schedule, int); @@ -91,24 +91,17 @@ void des_encrypt3(DES_LONG *, des_key_schedule, des_key_schedule, void des_decrypt3(DES_LONG *, des_key_schedule, des_key_schedule, des_key_schedule); -void des_ecb3_encrypt(des_cblock *, des_cblock *, des_key_schedule, +void des_ecb3_encrypt(unsigned char *, unsigned char *, des_key_schedule, des_key_schedule, des_key_schedule, int); -void des_ncbc_encrypt(const unsigned char *, unsigned char *, long, - des_key_schedule, des_cblock *, int); - -void des_ede3_cbc_encrypt(const unsigned char *, unsigned char *, long, - des_key_schedule, des_key_schedule, - des_key_schedule, des_cblock *, int); - -void des_set_odd_parity(des_cblock *); -void des_fixup_key_parity(des_cblock *); -int des_is_weak_key(des_cblock *); -int des_set_key(des_cblock *, des_key_schedule); -int des_key_sched(des_cblock *, des_key_schedule); -int des_set_key_checked(des_cblock *, des_key_schedule); -void des_set_key_unchecked(des_cblock *, des_key_schedule); -int des_check_key_parity(des_cblock *); +void des_set_odd_parity(unsigned char *); +void des_fixup_key_parity(unsigned char *); +int des_is_weak_key(const unsigned char *); +int des_set_key(const unsigned char *, des_key_schedule); +int des_key_sched(const unsigned char *, des_key_schedule); +int des_set_key_checked(const unsigned char *, des_key_schedule); +void des_set_key_unchecked(const unsigned char *, des_key_schedule); +int des_check_key_parity(const unsigned char *); #ifdef __cplusplus } diff --git a/freebsd/sys/crypto/des/des_ecb.c b/freebsd/sys/crypto/des/des_ecb.c index 4c383f1d..3819d91b 100644 --- a/freebsd/sys/crypto/des/des_ecb.c +++ b/freebsd/sys/crypto/des/des_ecb.c @@ -99,13 +99,13 @@ char *des_options(void) } return(buf); } -void des_ecb_encrypt(des_cblock *input, des_cblock *output, +void des_ecb_encrypt(unsigned char *input, unsigned char *output, des_key_schedule ks, int enc) { register DES_LONG l; DES_LONG ll[2]; - const unsigned char *in=&(*input)[0]; - unsigned char *out = &(*output)[0]; + const unsigned char *in = input; + unsigned char *out = output; c2l(in,l); ll[0]=l; c2l(in,l); ll[1]=l; @@ -115,14 +115,14 @@ void des_ecb_encrypt(des_cblock *input, des_cblock *output, l=ll[0]=ll[1]=0; } -void des_ecb3_encrypt(des_cblock *input, des_cblock *output, +void des_ecb3_encrypt(unsigned char *input, unsigned char *output, des_key_schedule ks1, des_key_schedule ks2, des_key_schedule ks3, int enc) { register DES_LONG l0,l1; DES_LONG ll[2]; - const unsigned char *in = &(*input)[0]; - unsigned char *out = &(*output)[0]; + const unsigned char *in = input; + unsigned char *out = output; c2l(in,l0); c2l(in,l1); diff --git a/freebsd/sys/crypto/des/des_setkey.c b/freebsd/sys/crypto/des/des_setkey.c index 966b17d0..32a55ab1 100644 --- a/freebsd/sys/crypto/des/des_setkey.c +++ b/freebsd/sys/crypto/des/des_setkey.c @@ -69,21 +69,21 @@ __FBSDID("$FreeBSD$"); int des_check_key=0; -void des_set_odd_parity(des_cblock *key) +void des_set_odd_parity(unsigned char *key) { int i; for (i=0; i<DES_KEY_SZ; i++) - (*key)[i]=odd_parity[(*key)[i]]; + key[i]=odd_parity[key[i]]; } -int des_check_key_parity(des_cblock *key) +int des_check_key_parity(const unsigned char *key) { int i; for (i=0; i<DES_KEY_SZ; i++) { - if ((*key)[i] != odd_parity[(*key)[i]]) + if (key[i] != odd_parity[key[i]]) return(0); } return(1); @@ -119,7 +119,7 @@ static des_cblock weak_keys[NUM_WEAK_KEY]={ {0xE0,0xFE,0xE0,0xFE,0xF1,0xFE,0xF1,0xFE}, {0xFE,0xE0,0xFE,0xE0,0xFE,0xF1,0xFE,0xF1}}; -int des_is_weak_key(des_cblock *key) +int des_is_weak_key(const unsigned char *key) { int i; @@ -144,7 +144,7 @@ int des_is_weak_key(des_cblock *key) #define HPERM_OP(a,t,n,m) ((t)=((((a)<<(16-(n)))^(a))&(m)),\ (a)=(a)^(t)^(t>>(16-(n)))) -int des_set_key(des_cblock *key, des_key_schedule schedule) +int des_set_key(const unsigned char *key, des_key_schedule schedule) { if (des_check_key) { @@ -161,7 +161,7 @@ int des_set_key(des_cblock *key, des_key_schedule schedule) * return -1 if key parity error, * return -2 if illegal weak key. */ -int des_set_key_checked(des_cblock *key, des_key_schedule schedule) +int des_set_key_checked(const unsigned char *key, des_key_schedule schedule) { if (!des_check_key_parity(key)) return(-1); @@ -171,7 +171,7 @@ int des_set_key_checked(des_cblock *key, des_key_schedule schedule) return 0; } -void des_set_key_unchecked(des_cblock *key, des_key_schedule schedule) +void des_set_key_unchecked(const unsigned char *key, des_key_schedule schedule) { static int shifts2[16]={0,0,1,1,1,1,1,1,0,1,1,1,1,1,1,0}; DES_LONG c,d,t,s,t2; @@ -180,7 +180,7 @@ void des_set_key_unchecked(des_cblock *key, des_key_schedule schedule) int i; k = &schedule->ks.deslong[0]; - in = &(*key)[0]; + in = key; c2l(in,c); c2l(in,d); @@ -227,12 +227,12 @@ void des_set_key_unchecked(des_cblock *key, des_key_schedule schedule) } } -int des_key_sched(des_cblock *key, des_key_schedule schedule) +int des_key_sched(const unsigned char *key, des_key_schedule schedule) { return(des_set_key(key,schedule)); } -void des_fixup_key_parity(des_cblock *key) +void des_fixup_key_parity(unsigned char *key) { des_set_odd_parity(key); } diff --git a/freebsd/sys/dev/bge/if_bge.c b/freebsd/sys/dev/bge/if_bge.c index a4a937b2..79e930ca 100644 --- a/freebsd/sys/dev/bge/if_bge.c +++ b/freebsd/sys/dev/bge/if_bge.c @@ -2929,10 +2929,14 @@ bge_dma_ring_alloc(struct bge_softc *sc, bus_size_t alignment, bus_addr_t *paddr, const char *msg) { struct bge_dmamap_arg ctx; + bus_addr_t lowaddr; + bus_size_t ring_end; int error; + lowaddr = BUS_SPACE_MAXADDR; +again: error = bus_dma_tag_create(sc->bge_cdata.bge_parent_tag, - alignment, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, + alignment, 0, lowaddr, BUS_SPACE_MAXADDR, NULL, NULL, maxsize, 1, maxsize, 0, NULL, NULL, tag); if (error != 0) { device_printf(sc->bge_dev, @@ -2957,6 +2961,25 @@ bge_dma_ring_alloc(struct bge_softc *sc, bus_size_t alignment, return (ENOMEM); } *paddr = ctx.bge_busaddr; + ring_end = *paddr + maxsize; + if ((sc->bge_flags & BGE_FLAG_4G_BNDRY_BUG) != 0 && + BGE_ADDR_HI(*paddr) != BGE_ADDR_HI(ring_end)) { + /* + * 4GB boundary crossed. Limit maximum allowable DMA + * address space to 32bit and try again. + */ + bus_dmamap_unload(*tag, *map); + bus_dmamem_free(*tag, *ring, *map); + bus_dma_tag_destroy(*tag); + if (bootverbose) + device_printf(sc->bge_dev, "4GB boundary crossed, " + "limit DMA address space to 32bit for %s\n", msg); + *ring = NULL; + *tag = NULL; + *map = NULL; + lowaddr = BUS_SPACE_MAXADDR_32BIT; + goto again; + } return (0); } @@ -2964,7 +2987,7 @@ static int bge_dma_alloc(struct bge_softc *sc) { bus_addr_t lowaddr; - bus_size_t rxmaxsegsz, sbsz, txsegsz, txmaxsegsz; + bus_size_t boundary, sbsz, rxmaxsegsz, txsegsz, txmaxsegsz; int i, error; lowaddr = BUS_SPACE_MAXADDR; @@ -3051,7 +3074,9 @@ bge_dma_alloc(struct bge_softc *sc) } /* Create parent tag for buffers. */ + boundary = 0; if ((sc->bge_flags & BGE_FLAG_4G_BNDRY_BUG) != 0) { + boundary = BGE_DMA_BNDRY; /* * XXX * watchdog timeout issue was observed on BCM5704 which @@ -3062,10 +3087,10 @@ bge_dma_alloc(struct bge_softc *sc) if (sc->bge_pcixcap != 0) lowaddr = BUS_SPACE_MAXADDR_32BIT; } - error = bus_dma_tag_create(bus_get_dma_tag(sc->bge_dev), 1, 0, lowaddr, - BUS_SPACE_MAXADDR, NULL, NULL, BUS_SPACE_MAXSIZE_32BIT, 0, - BUS_SPACE_MAXSIZE_32BIT, 0, NULL, NULL, - &sc->bge_cdata.bge_buffer_tag); + error = bus_dma_tag_create(bus_get_dma_tag(sc->bge_dev), + 1, boundary, lowaddr, BUS_SPACE_MAXADDR, NULL, + NULL, BUS_SPACE_MAXSIZE_32BIT, 0, BUS_SPACE_MAXSIZE_32BIT, + 0, NULL, NULL, &sc->bge_cdata.bge_buffer_tag); if (error != 0) { device_printf(sc->bge_dev, "could not allocate buffer dma tag\n"); @@ -3253,6 +3278,8 @@ bge_mbox_reorder(struct bge_softc *sc) bus = device_get_parent(dev); if (device_get_devclass(dev) != pcib) break; + if (device_get_devclass(bus) != pci) + break; for (i = 0; i < nitems(mbox_reorder_lists); i++) { if (pci_get_vendor(dev) == mbox_reorder_lists[i].vendor && @@ -3264,8 +3291,6 @@ bge_mbox_reorder(struct bge_softc *sc) return (1); } } - if (device_get_devclass(bus) != pci) - break; } return (0); } diff --git a/freebsd/sys/dev/bge/if_bgereg.h b/freebsd/sys/dev/bge/if_bgereg.h index eb7686e7..58fe8040 100644 --- a/freebsd/sys/dev/bge/if_bgereg.h +++ b/freebsd/sys/dev/bge/if_bgereg.h @@ -2866,6 +2866,12 @@ struct bge_gib { #define BGE_DMA_MAXADDR 0xFFFFFFFFFF #endif +#if (BUS_SPACE_MAXSIZE > 0xFFFFFFFF) +#define BGE_DMA_BNDRY 0x100000000 +#else +#define BGE_DMA_BNDRY 0 +#endif + /* * Ring structures. Most of these reside in host memory and we tell * the NIC where they are via the ring control blocks. The exceptions diff --git a/freebsd/sys/dev/cadence/if_cgem.c b/freebsd/sys/dev/cadence/if_cgem.c index 191362c4..34340f22 100644 --- a/freebsd/sys/dev/cadence/if_cgem.c +++ b/freebsd/sys/dev/cadence/if_cgem.c @@ -107,6 +107,14 @@ __FBSDID("$FreeBSD$"); #define CGEM_CKSUM_ASSIST (CSUM_IP | CSUM_TCP | CSUM_UDP | \ CSUM_TCP_IPV6 | CSUM_UDP_IPV6) +#ifndef __rtems__ +static struct ofw_compat_data compat_data[] = { + { "cadence,gem", 1 }, + { "cdns,macb", 1 }, + { NULL, 0 }, +}; +#endif /* __rtems__ */ + struct cgem_softc { if_t ifp; struct mtx sc_mtx; @@ -1724,7 +1732,7 @@ cgem_probe(device_t dev) if (!ofw_bus_status_okay(dev)) return (ENXIO); - if (!ofw_bus_is_compatible(dev, "cadence,gem")) + if (ofw_bus_search_compatible(dev, compat_data)->ocd_data == 0) return (ENXIO); #endif /* __rtems__ */ diff --git a/freebsd/sys/dev/e1000/em_txrx.c b/freebsd/sys/dev/e1000/em_txrx.c index c2b60743..4faf806e 100644 --- a/freebsd/sys/dev/e1000/em_txrx.c +++ b/freebsd/sys/dev/e1000/em_txrx.c @@ -459,16 +459,11 @@ em_isc_txd_credits_update(void *arg, uint16_t txqid, bool clear) prev = txr->tx_cidx_processed; ntxd = scctx->isc_ntxd[0]; do { + MPASS(prev != cur); delta = (int32_t)cur - (int32_t)prev; - /* - * XXX This appears to be a hack for first-packet. - * A correct fix would prevent prev == cur in the first place. - */ - MPASS(prev == 0 || delta != 0); - if (prev == 0 && cur == 0) - delta += 1; if (delta < 0) delta += ntxd; + MPASS(delta > 0); DPRINTF(iflib_get_dev(adapter->ctx), "%s: cidx_processed=%u cur=%u clear=%d delta=%d\n", __FUNCTION__, prev, cur, clear, delta); diff --git a/freebsd/sys/dev/e1000/if_em.c b/freebsd/sys/dev/e1000/if_em.c index 803b68ec..32eb4afe 100644 --- a/freebsd/sys/dev/e1000/if_em.c +++ b/freebsd/sys/dev/e1000/if_em.c @@ -251,6 +251,7 @@ static int em_if_mtu_set(if_ctx_t ctx, uint32_t mtu); static void em_if_timer(if_ctx_t ctx, uint16_t qid); static void em_if_vlan_register(if_ctx_t ctx, u16 vtag); static void em_if_vlan_unregister(if_ctx_t ctx, u16 vtag); +static void em_if_watchdog_reset(if_ctx_t ctx); static void em_identify_hardware(if_ctx_t ctx); static int em_allocate_pci_resources(if_ctx_t ctx); @@ -262,10 +263,14 @@ static int em_setup_msix(if_ctx_t ctx); static void em_initialize_transmit_unit(if_ctx_t ctx); static void em_initialize_receive_unit(if_ctx_t ctx); -static void em_if_enable_intr(if_ctx_t ctx); -static void em_if_disable_intr(if_ctx_t ctx); +static void em_if_intr_enable(if_ctx_t ctx); +static void em_if_intr_disable(if_ctx_t ctx); +static void igb_if_intr_enable(if_ctx_t ctx); +static void igb_if_intr_disable(if_ctx_t ctx); static int em_if_rx_queue_intr_enable(if_ctx_t ctx, uint16_t rxqid); static int em_if_tx_queue_intr_enable(if_ctx_t ctx, uint16_t txqid); +static int igb_if_rx_queue_intr_enable(if_ctx_t ctx, uint16_t rxqid); +static int igb_if_tx_queue_intr_enable(if_ctx_t ctx, uint16_t txqid); static void em_if_multi_set(if_ctx_t ctx); static void em_if_update_admin_status(if_ctx_t ctx); static void em_if_debug(if_ctx_t ctx); @@ -295,7 +300,7 @@ static void em_disable_aspm(struct adapter *); int em_intr(void *arg); static void em_disable_promisc(if_ctx_t ctx); -/* MSIX handlers */ +/* MSI-X handlers */ static int em_if_msix_intr_assign(if_ctx_t, int); static int em_msix_link(void *); static void em_handle_link(void *context); @@ -376,8 +381,8 @@ static device_method_t em_if_methods[] = { DEVMETHOD(ifdi_init, em_if_init), DEVMETHOD(ifdi_stop, em_if_stop), DEVMETHOD(ifdi_msix_intr_assign, em_if_msix_intr_assign), - DEVMETHOD(ifdi_intr_enable, em_if_enable_intr), - DEVMETHOD(ifdi_intr_disable, em_if_disable_intr), + DEVMETHOD(ifdi_intr_enable, em_if_intr_enable), + DEVMETHOD(ifdi_intr_disable, em_if_intr_disable), DEVMETHOD(ifdi_tx_queues_alloc, em_if_tx_queues_alloc), DEVMETHOD(ifdi_rx_queues_alloc, em_if_rx_queues_alloc), DEVMETHOD(ifdi_queues_free, em_if_queues_free), @@ -388,6 +393,7 @@ static device_method_t em_if_methods[] = { DEVMETHOD(ifdi_mtu_set, em_if_mtu_set), DEVMETHOD(ifdi_promisc_set, em_if_set_promisc), DEVMETHOD(ifdi_timer, em_if_timer), + DEVMETHOD(ifdi_watchdog_reset, em_if_watchdog_reset), DEVMETHOD(ifdi_vlan_register, em_if_vlan_register), DEVMETHOD(ifdi_vlan_unregister, em_if_vlan_unregister), DEVMETHOD(ifdi_get_counter, em_if_get_counter), @@ -398,14 +404,47 @@ static device_method_t em_if_methods[] = { DEVMETHOD_END }; -/* - * note that if (adapter->msix_mem) is replaced by: - * if (adapter->intr_type == IFLIB_INTR_MSIX) - */ static driver_t em_if_driver = { "em_if", em_if_methods, sizeof(struct adapter) }; +static device_method_t igb_if_methods[] = { + DEVMETHOD(ifdi_attach_pre, em_if_attach_pre), + DEVMETHOD(ifdi_attach_post, em_if_attach_post), + DEVMETHOD(ifdi_detach, em_if_detach), + DEVMETHOD(ifdi_shutdown, em_if_shutdown), + DEVMETHOD(ifdi_suspend, em_if_suspend), + DEVMETHOD(ifdi_resume, em_if_resume), + DEVMETHOD(ifdi_init, em_if_init), + DEVMETHOD(ifdi_stop, em_if_stop), + DEVMETHOD(ifdi_msix_intr_assign, em_if_msix_intr_assign), + DEVMETHOD(ifdi_intr_enable, igb_if_intr_enable), + DEVMETHOD(ifdi_intr_disable, igb_if_intr_disable), + DEVMETHOD(ifdi_tx_queues_alloc, em_if_tx_queues_alloc), + DEVMETHOD(ifdi_rx_queues_alloc, em_if_rx_queues_alloc), + DEVMETHOD(ifdi_queues_free, em_if_queues_free), + DEVMETHOD(ifdi_update_admin_status, em_if_update_admin_status), + DEVMETHOD(ifdi_multi_set, em_if_multi_set), + DEVMETHOD(ifdi_media_status, em_if_media_status), + DEVMETHOD(ifdi_media_change, em_if_media_change), + DEVMETHOD(ifdi_mtu_set, em_if_mtu_set), + DEVMETHOD(ifdi_promisc_set, em_if_set_promisc), + DEVMETHOD(ifdi_timer, em_if_timer), + DEVMETHOD(ifdi_watchdog_reset, em_if_watchdog_reset), + DEVMETHOD(ifdi_vlan_register, em_if_vlan_register), + DEVMETHOD(ifdi_vlan_unregister, em_if_vlan_unregister), + DEVMETHOD(ifdi_get_counter, em_if_get_counter), + DEVMETHOD(ifdi_led_func, em_if_led_func), + DEVMETHOD(ifdi_rx_queue_intr_enable, igb_if_rx_queue_intr_enable), + DEVMETHOD(ifdi_tx_queue_intr_enable, igb_if_tx_queue_intr_enable), + DEVMETHOD(ifdi_debug, em_if_debug), + DEVMETHOD_END +}; + +static driver_t igb_if_driver = { + "igb_if", igb_if_methods, sizeof(struct adapter) +}; + /********************************************************************* * Tunable default values. *********************************************************************/ @@ -525,7 +564,7 @@ static struct if_shared_ctx igb_sctx_init = { .isc_admin_intrcnt = 1, .isc_vendor_info = igb_vendor_info_array, .isc_driver_version = em_driver_version, - .isc_driver = &em_if_driver, + .isc_driver = &igb_if_driver, .isc_flags = IFLIB_NEED_SCRATCH | IFLIB_TSO_INIT_IP | IFLIB_NEED_ZERO_CSUM, .isc_nrxd_min = {EM_MIN_RXD}, @@ -723,7 +762,6 @@ em_set_num_queues(if_ctx_t ctx) * * return 0 on success, positive on failure *********************************************************************/ - static int em_if_attach_pre(if_ctx_t ctx) { @@ -733,15 +771,10 @@ em_if_attach_pre(if_ctx_t ctx) struct e1000_hw *hw; int error = 0; - INIT_DEBUGOUT("em_if_attach_pre begin"); + INIT_DEBUGOUT("em_if_attach_pre: begin"); dev = iflib_get_dev(ctx); adapter = iflib_get_softc(ctx); - if (resource_disabled("em", device_get_unit(dev))) { - device_printf(dev, "Disabled by device hint\n"); - return (ENXIO); - } - adapter->ctx = adapter->osdep.ctx = ctx; adapter->dev = adapter->osdep.dev = dev; scctx = adapter->shared = iflib_get_softc_ctx(ctx); @@ -779,14 +812,13 @@ em_if_attach_pre(if_ctx_t ctx) /* Determine hardware and mac info */ em_identify_hardware(ctx); - scctx->isc_msix_bar = PCIR_BAR(EM_MSIX_BAR); scctx->isc_tx_nsegments = EM_MAX_SCATTER; scctx->isc_nrxqsets_max = scctx->isc_ntxqsets_max = em_set_num_queues(ctx); - device_printf(dev, "attach_pre capping queues at %d\n", scctx->isc_ntxqsets_max); + if (bootverbose) + device_printf(dev, "attach_pre capping queues at %d\n", + scctx->isc_ntxqsets_max); if (adapter->hw.mac.type >= igb_mac_min) { - int try_second_bar; - scctx->isc_txqsizes[0] = roundup2(scctx->isc_ntxd[0] * sizeof(union e1000_adv_tx_desc), EM_DBA_ALIGN); scctx->isc_rxqsizes[0] = roundup2(scctx->isc_nrxd[0] * sizeof(union e1000_adv_rx_desc), EM_DBA_ALIGN); scctx->isc_txd_size[0] = sizeof(union e1000_adv_tx_desc); @@ -800,14 +832,13 @@ em_if_attach_pre(if_ctx_t ctx) CSUM_IP6_TCP | CSUM_IP6_UDP; if (adapter->hw.mac.type != e1000_82575) scctx->isc_tx_csum_flags |= CSUM_SCTP | CSUM_IP6_SCTP; - /* ** Some new devices, as with ixgbe, now may ** use a different BAR, so we need to keep ** track of which is used. */ - try_second_bar = pci_read_config(dev, scctx->isc_msix_bar, 4); - if (try_second_bar == 0) + scctx->isc_msix_bar = PCIR_BAR(EM_MSIX_BAR); + if (pci_read_config(dev, scctx->isc_msix_bar, 4) == 0) scctx->isc_msix_bar += 4; } else if (adapter->hw.mac.type >= em_mac_min) { scctx->isc_txqsizes[0] = roundup2(scctx->isc_ntxd[0]* sizeof(struct e1000_tx_desc), EM_DBA_ALIGN); @@ -837,6 +868,16 @@ em_if_attach_pre(if_ctx_t ctx) */ scctx->isc_capenable &= ~(IFCAP_TSO4 | IFCAP_VLAN_HWTSO); scctx->isc_tx_csum_flags = CSUM_TCP | CSUM_UDP | CSUM_IP_TSO; + /* + * We support MSI-X with 82574 only, but indicate to iflib(4) + * that it shall give MSI at least a try with other devices. + */ + if (adapter->hw.mac.type == e1000_82574) { + scctx->isc_msix_bar = PCIR_BAR(EM_MSIX_BAR); + } else { + scctx->isc_msix_bar = -1; + scctx->isc_disable_msix = 1; + } } else { scctx->isc_txqsizes[0] = roundup2((scctx->isc_ntxd[0] + 1) * sizeof(struct e1000_tx_desc), EM_DBA_ALIGN); scctx->isc_rxqsizes[0] = roundup2((scctx->isc_nrxd[0] + 1) * sizeof(struct e1000_rx_desc), EM_DBA_ALIGN); @@ -847,6 +888,7 @@ em_if_attach_pre(if_ctx_t ctx) scctx->isc_capabilities = scctx->isc_capenable = LEM_CAPS; if (adapter->hw.mac.type < e1000_82543) scctx->isc_capenable &= ~(IFCAP_HWCSUM|IFCAP_VLAN_HWCSUM); + /* INTx only */ scctx->isc_msix_bar = 0; } @@ -1092,13 +1134,12 @@ err_late: * * return 0 on success, positive on failure *********************************************************************/ - static int em_if_detach(if_ctx_t ctx) { struct adapter *adapter = iflib_get_softc(ctx); - INIT_DEBUGOUT("em_detach: begin"); + INIT_DEBUGOUT("em_if_detach: begin"); e1000_phy_hw_reset(&adapter->hw); @@ -1203,16 +1244,16 @@ em_if_mtu_set(if_ctx_t ctx, uint32_t mtu) * by the driver as a hw/sw initialization routine to get to a * consistent state. * - * return 0 on success, positive on failure **********************************************************************/ - static void em_if_init(if_ctx_t ctx) { struct adapter *adapter = iflib_get_softc(ctx); + if_softc_ctx_t scctx = adapter->shared; struct ifnet *ifp = iflib_get_ifp(ctx); struct em_tx_queue *tx_que; int i; + INIT_DEBUGOUT("em_if_init: begin"); /* Get the latest mac address, User can use a LAA */ @@ -1242,7 +1283,14 @@ em_if_init(if_ctx_t ctx) for (i = 0, tx_que = adapter->tx_queues; i < adapter->tx_num_queues; i++, tx_que++) { struct tx_ring *txr = &tx_que->txr; - txr->tx_rs_cidx = txr->tx_rs_pidx = txr->tx_cidx_processed = 0; + txr->tx_rs_cidx = txr->tx_rs_pidx; + + /* Initialize the last processed descriptor to be the end of + * the ring, rather than the start, so that we avoid an + * off-by-one error when calculating how many descriptors are + * done in the credits_update function. + */ + txr->tx_cidx_processed = scctx->isc_ntxd[0] - 1; } /* Setup VLAN support, basic and offload if available */ @@ -1261,21 +1309,7 @@ em_if_init(if_ctx_t ctx) /* Setup Multicast table */ em_if_multi_set(ctx); - /* - * Figure out the desired mbuf - * pool for doing jumbos - */ - if (adapter->hw.mac.max_frame_size <= 2048) - adapter->rx_mbuf_sz = MCLBYTES; -#ifndef CONTIGMALLOC_WORKS - else - adapter->rx_mbuf_sz = MJUMPAGESIZE; -#else - else if (adapter->hw.mac.max_frame_size <= 4096) - adapter->rx_mbuf_sz = MJUMPAGESIZE; - else - adapter->rx_mbuf_sz = MJUM9BYTES; -#endif + adapter->rx_mbuf_sz = iflib_get_rx_mbuf_sz(ctx); em_initialize_receive_unit(ctx); /* Use real VLAN Filter support? */ @@ -1295,7 +1329,7 @@ em_if_init(if_ctx_t ctx) em_if_set_promisc(ctx, IFF_PROMISC); e1000_clear_hw_cntrs_base_generic(&adapter->hw); - /* MSI/X configuration for 82574 */ + /* MSI-X configuration for 82574 */ if (adapter->hw.mac.type == e1000_82574) { int tmp = E1000_READ_REG(&adapter->hw, E1000_CTRL_EXT); @@ -1338,8 +1372,6 @@ em_intr(void *arg) reg_icr = E1000_READ_REG(&adapter->hw, E1000_ICR); - if (adapter->intr_type != IFLIB_INTR_LEGACY) - goto skip_stray; /* Hot eject? */ if (reg_icr == 0xffffffff) return FILTER_STRAY; @@ -1356,7 +1388,14 @@ em_intr(void *arg) (reg_icr & E1000_ICR_INT_ASSERTED) == 0) return FILTER_STRAY; -skip_stray: + /* + * Only MSI-X interrupts have one-shot behavior by taking advantage + * of the EIAC register. Thus, explicitly disable interrupts. This + * also works around the MSI message reordering errata on certain + * systems. + */ + IFDI_INTR_DISABLE(ctx); + /* Link status change */ if (reg_icr & (E1000_ICR_RXSEQ | E1000_ICR_LSC)) { adapter->hw.mac.get_link_status = 1; @@ -1369,59 +1408,49 @@ skip_stray: return (FILTER_SCHEDULE_THREAD); } -static void -igb_rx_enable_queue(struct adapter *adapter, struct em_rx_queue *rxq) +static int +em_if_rx_queue_intr_enable(if_ctx_t ctx, uint16_t rxqid) { - E1000_WRITE_REG(&adapter->hw, E1000_EIMS, rxq->eims); -} + struct adapter *adapter = iflib_get_softc(ctx); + struct em_rx_queue *rxq = &adapter->rx_queues[rxqid]; -static void -em_rx_enable_queue(struct adapter *adapter, struct em_rx_queue *rxq) -{ E1000_WRITE_REG(&adapter->hw, E1000_IMS, rxq->eims); + return (0); } -static void -igb_tx_enable_queue(struct adapter *adapter, struct em_tx_queue *txq) +static int +em_if_tx_queue_intr_enable(if_ctx_t ctx, uint16_t txqid) { - E1000_WRITE_REG(&adapter->hw, E1000_EIMS, txq->eims); -} + struct adapter *adapter = iflib_get_softc(ctx); + struct em_tx_queue *txq = &adapter->tx_queues[txqid]; -static void -em_tx_enable_queue(struct adapter *adapter, struct em_tx_queue *txq) -{ E1000_WRITE_REG(&adapter->hw, E1000_IMS, txq->eims); + return (0); } static int -em_if_rx_queue_intr_enable(if_ctx_t ctx, uint16_t rxqid) +igb_if_rx_queue_intr_enable(if_ctx_t ctx, uint16_t rxqid) { struct adapter *adapter = iflib_get_softc(ctx); struct em_rx_queue *rxq = &adapter->rx_queues[rxqid]; - if (adapter->hw.mac.type >= igb_mac_min) - igb_rx_enable_queue(adapter, rxq); - else - em_rx_enable_queue(adapter, rxq); + E1000_WRITE_REG(&adapter->hw, E1000_EIMS, rxq->eims); return (0); } static int -em_if_tx_queue_intr_enable(if_ctx_t ctx, uint16_t txqid) +igb_if_tx_queue_intr_enable(if_ctx_t ctx, uint16_t txqid) { struct adapter *adapter = iflib_get_softc(ctx); struct em_tx_queue *txq = &adapter->tx_queues[txqid]; - if (adapter->hw.mac.type >= igb_mac_min) - igb_tx_enable_queue(adapter, txq); - else - em_tx_enable_queue(adapter, txq); + E1000_WRITE_REG(&adapter->hw, E1000_EIMS, txq->eims); return (0); } /********************************************************************* * - * MSIX RX Interrupt Service routine + * MSI-X RX Interrupt Service routine * **********************************************************************/ static int @@ -1436,7 +1465,7 @@ em_msix_que(void *arg) /********************************************************************* * - * MSIX Link Fast Interrupt Service routine + * MSI-X Link Fast Interrupt Service routine * **********************************************************************/ static int @@ -1689,37 +1718,24 @@ em_if_multi_set(if_ctx_t ctx) } } - /********************************************************************* * Timer routine * - * This routine checks for link status and updates statistics. + * This routine schedules em_if_update_admin_status() to check for + * link status and to gather statistics as well as to perform some + * controller-specific hardware patting. * **********************************************************************/ - static void em_if_timer(if_ctx_t ctx, uint16_t qid) { - struct adapter *adapter = iflib_get_softc(ctx); - struct em_rx_queue *que; - int i; - int trigger = 0; if (qid != 0) return; iflib_admin_intr_deferred(ctx); - - /* Mask to use in the irq trigger */ - if (adapter->intr_type == IFLIB_INTR_MSIX) { - for (i = 0, que = adapter->rx_queues; i < adapter->rx_num_queues; i++, que++) - trigger |= que->eims; - } else { - trigger = E1000_ICS_RXDMT0; - } } - static void em_if_update_admin_status(if_ctx_t ctx) { @@ -1825,21 +1841,30 @@ em_if_update_admin_status(if_ctx_t ctx) E1000_WRITE_REG(&adapter->hw, E1000_IMS, EM_MSIX_LINK | E1000_IMS_LSC); } +static void +em_if_watchdog_reset(if_ctx_t ctx) +{ + struct adapter *adapter = iflib_get_softc(ctx); + + /* + * Just count the event; iflib(4) will already trigger a + * sufficient reset of the controller. + */ + adapter->watchdog_events++; +} + /********************************************************************* * * This routine disables all traffic on the adapter by issuing a - * global reset on the MAC and deallocates TX/RX buffers. + * global reset on the MAC. * - * This routine should always be called with BOTH the CORE - * and TX locks. **********************************************************************/ - static void em_if_stop(if_ctx_t ctx) { struct adapter *adapter = iflib_get_softc(ctx); - INIT_DEBUGOUT("em_stop: begin"); + INIT_DEBUGOUT("em_if_stop: begin"); e1000_reset_hw(&adapter->hw); if (adapter->hw.mac.type >= e1000_82544) @@ -1849,7 +1874,6 @@ em_if_stop(if_ctx_t ctx) e1000_cleanup_led(&adapter->hw); } - /********************************************************************* * * Determine hardware revision. @@ -1906,7 +1930,6 @@ em_allocate_pci_resources(if_ctx_t ctx) for (rid = PCIR_BAR(0); rid < PCIR_CIS;) { val = pci_read_config(dev, rid, 4); if (EM_BAR_TYPE(val) == EM_BAR_TYPE_IO) { - adapter->io_rid = rid; break; } rid += 4; @@ -1918,8 +1941,8 @@ em_allocate_pci_resources(if_ctx_t ctx) device_printf(dev, "Unable to locate IO BAR\n"); return (ENXIO); } - adapter->ioport = bus_alloc_resource_any(dev, - SYS_RES_IOPORT, &adapter->io_rid, RF_ACTIVE); + adapter->ioport = bus_alloc_resource_any(dev, SYS_RES_IOPORT, + &rid, RF_ACTIVE); if (adapter->ioport == NULL) { device_printf(dev, "Unable to allocate bus resource: " "ioport\n"); @@ -1939,7 +1962,7 @@ em_allocate_pci_resources(if_ctx_t ctx) /********************************************************************* * - * Setup the MSIX Interrupt handlers + * Set up the MSI-X Interrupt handlers * **********************************************************************/ static int @@ -1968,7 +1991,7 @@ em_if_msix_intr_assign(if_ctx_t ctx, int msix) * Set the bit to enable interrupt * in E1000_IMS -- bits 20 and 21 * are for RX0 and RX1, note this has - * NOTHING to do with the MSIX vector + * NOTHING to do with the MSI-X vector */ if (adapter->hw.mac.type == e1000_82574) { rx_que->eims = 1 << (20 + i); @@ -1989,22 +2012,22 @@ em_if_msix_intr_assign(if_ctx_t ctx, int msix) &adapter->rx_queues[i % adapter->rx_num_queues].que_irq, IFLIB_INTR_TX, tx_que, tx_que->me, buf); - tx_que->msix = (vector % adapter->tx_num_queues); + tx_que->msix = (vector % adapter->rx_num_queues); /* * Set the bit to enable interrupt * in E1000_IMS -- bits 22 and 23 * are for TX0 and TX1, note this has - * NOTHING to do with the MSIX vector + * NOTHING to do with the MSI-X vector */ if (adapter->hw.mac.type == e1000_82574) { tx_que->eims = 1 << (22 + i); adapter->ims |= tx_que->eims; adapter->ivars |= (8 | tx_que->msix) << (8 + (i * 4)); } else if (adapter->hw.mac.type == e1000_82575) { - tx_que->eims = E1000_EICR_TX_QUEUE0 << (i % adapter->tx_num_queues); + tx_que->eims = E1000_EICR_TX_QUEUE0 << i; } else { - tx_que->eims = 1 << (i % adapter->tx_num_queues); + tx_que->eims = 1 << i; } } @@ -2044,7 +2067,7 @@ igb_configure_queues(struct adapter *adapter) E1000_GPIE_MSIX_MODE | E1000_GPIE_EIAME | E1000_GPIE_PBA | E1000_GPIE_NSICR); - /* Turn on MSIX */ + /* Turn on MSI-X */ switch (adapter->hw.mac.type) { case e1000_82580: case e1000_i350: @@ -2178,7 +2201,7 @@ em_free_pci_resources(if_ctx_t ctx) struct em_rx_queue *que = adapter->rx_queues; device_t dev = iflib_get_dev(ctx); - /* Release all msix queue resources */ + /* Release all MSI-X queue resources */ if (adapter->intr_type == IFLIB_INTR_MSIX) iflib_irq_free(ctx, &adapter->irq); @@ -2186,24 +2209,26 @@ em_free_pci_resources(if_ctx_t ctx) iflib_irq_free(ctx, &que->que_irq); } - /* First release all the interrupt resources */ if (adapter->memory != NULL) { bus_release_resource(dev, SYS_RES_MEMORY, - PCIR_BAR(0), adapter->memory); + rman_get_rid(adapter->memory), adapter->memory); adapter->memory = NULL; } if (adapter->flash != NULL) { bus_release_resource(dev, SYS_RES_MEMORY, - EM_FLASH, adapter->flash); + rman_get_rid(adapter->flash), adapter->flash); adapter->flash = NULL; } - if (adapter->ioport != NULL) + + if (adapter->ioport != NULL) { bus_release_resource(dev, SYS_RES_IOPORT, - adapter->io_rid, adapter->ioport); + rman_get_rid(adapter->ioport), adapter->ioport); + adapter->ioport = NULL; + } } -/* Setup MSI or MSI/X */ +/* Set up MSI or MSI-X */ static int em_setup_msix(if_ctx_t ctx) { @@ -2217,11 +2242,9 @@ em_setup_msix(if_ctx_t ctx) /********************************************************************* * - * Initialize the hardware to a configuration - * as specified by the adapter structure. + * Workaround for SmartSpeed on 82541 and 82547 controllers * **********************************************************************/ - static void lem_smartspeed(struct adapter *adapter) { @@ -2386,6 +2409,12 @@ igb_init_dmac(struct adapter *adapter, u32 pba) } } +/********************************************************************* + * + * Initialize the hardware to a configuration as specified by the + * adapter structure. + * + **********************************************************************/ static void em_reset(if_ctx_t ctx) { @@ -2620,6 +2649,11 @@ em_reset(if_ctx_t ctx) e1000_check_for_link(hw); } +/* + * Initialise the RSS mapping for NICs that support multiple transmit/ + * receive rings. + */ + #define RSSKEYLEN 10 static void em_initialize_rss_mapping(struct adapter *adapter) @@ -2660,7 +2694,6 @@ em_initialize_rss_mapping(struct adapter *adapter) E1000_MRQC_RSS_FIELD_IPV6_TCP_EX | E1000_MRQC_RSS_FIELD_IPV6_EX | E1000_MRQC_RSS_FIELD_IPV6); - } static void @@ -2760,7 +2793,7 @@ igb_initialize_rss_mapping(struct adapter *adapter) /********************************************************************* * - * Setup networking device structure and register an interface. + * Setup networking device structure and register interface media. * **********************************************************************/ static int @@ -2845,7 +2878,9 @@ em_if_tx_queues_alloc(if_ctx_t ctx, caddr_t *vaddrs, uint64_t *paddrs, int ntxqs txr->tx_paddr = paddrs[i*ntxqs]; } - device_printf(iflib_get_dev(ctx), "allocated for %d tx_queues\n", adapter->tx_num_queues); + if (bootverbose) + device_printf(iflib_get_dev(ctx), + "allocated for %d tx_queues\n", adapter->tx_num_queues); return (0); fail: em_if_queues_free(ctx); @@ -2883,8 +2918,10 @@ em_if_rx_queues_alloc(if_ctx_t ctx, caddr_t *vaddrs, uint64_t *paddrs, int nrxqs rxr->rx_base = (union e1000_rx_desc_extended *)vaddrs[i*nrxqs]; rxr->rx_paddr = paddrs[i*nrxqs]; } - - device_printf(iflib_get_dev(ctx), "allocated for %d rx_queues\n", adapter->rx_num_queues); + + if (bootverbose) + device_printf(iflib_get_dev(ctx), + "allocated for %d rx_queues\n", adapter->rx_num_queues); return (0); fail: @@ -3127,7 +3164,7 @@ em_initialize_receive_unit(if_ctx_t ctx) rfctl = E1000_READ_REG(hw, E1000_RFCTL); rfctl |= E1000_RFCTL_EXTEN; /* - * When using MSIX interrupts we need to throttle + * When using MSI-X interrupts we need to throttle * using the EITR register (82574 only) */ if (hw->mac.type == e1000_82574) { @@ -3371,7 +3408,7 @@ em_setup_vlan_hw_support(struct adapter *adapter) } static void -em_if_enable_intr(if_ctx_t ctx) +em_if_intr_enable(if_ctx_t ctx) { struct adapter *adapter = iflib_get_softc(ctx); struct e1000_hw *hw = &adapter->hw; @@ -3380,30 +3417,51 @@ em_if_enable_intr(if_ctx_t ctx) if (hw->mac.type == e1000_82574) { E1000_WRITE_REG(hw, EM_EIAC, EM_MSIX_MASK); ims_mask |= adapter->ims; - } else if (adapter->intr_type == IFLIB_INTR_MSIX && hw->mac.type >= igb_mac_min) { - u32 mask = (adapter->que_mask | adapter->link_mask); - - E1000_WRITE_REG(&adapter->hw, E1000_EIAC, mask); - E1000_WRITE_REG(&adapter->hw, E1000_EIAM, mask); - E1000_WRITE_REG(&adapter->hw, E1000_EIMS, mask); - ims_mask = E1000_IMS_LSC; } - E1000_WRITE_REG(hw, E1000_IMS, ims_mask); } static void -em_if_disable_intr(if_ctx_t ctx) +em_if_intr_disable(if_ctx_t ctx) +{ + struct adapter *adapter = iflib_get_softc(ctx); + struct e1000_hw *hw = &adapter->hw; + + if (hw->mac.type == e1000_82574) + E1000_WRITE_REG(hw, EM_EIAC, 0); + E1000_WRITE_REG(hw, E1000_IMC, 0xffffffff); +} + +static void +igb_if_intr_enable(if_ctx_t ctx) +{ + struct adapter *adapter = iflib_get_softc(ctx); + struct e1000_hw *hw = &adapter->hw; + u32 mask; + + if (__predict_true(adapter->intr_type == IFLIB_INTR_MSIX)) { + mask = (adapter->que_mask | adapter->link_mask); + E1000_WRITE_REG(hw, E1000_EIAC, mask); + E1000_WRITE_REG(hw, E1000_EIAM, mask); + E1000_WRITE_REG(hw, E1000_EIMS, mask); + E1000_WRITE_REG(hw, E1000_IMS, E1000_IMS_LSC); + } else + E1000_WRITE_REG(hw, E1000_IMS, IMS_ENABLE_MASK); + E1000_WRITE_FLUSH(hw); +} + +static void +igb_if_intr_disable(if_ctx_t ctx) { struct adapter *adapter = iflib_get_softc(ctx); struct e1000_hw *hw = &adapter->hw; - if (adapter->intr_type == IFLIB_INTR_MSIX) { - if (hw->mac.type >= igb_mac_min) - E1000_WRITE_REG(&adapter->hw, E1000_EIMC, ~0); - E1000_WRITE_REG(&adapter->hw, E1000_EIAC, 0); + if (__predict_true(adapter->intr_type == IFLIB_INTR_MSIX)) { + E1000_WRITE_REG(hw, E1000_EIMC, 0xffffffff); + E1000_WRITE_REG(hw, E1000_EIAC, 0); } - E1000_WRITE_REG(&adapter->hw, E1000_IMC, 0xffffffff); + E1000_WRITE_REG(hw, E1000_IMC, 0xffffffff); + E1000_WRITE_FLUSH(hw); } /* @@ -4001,13 +4059,7 @@ em_add_hw_stats(struct adapter *adapter) "Driver dropped packets"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "link_irq", CTLFLAG_RD, &adapter->link_irq, - "Link MSIX IRQ Handled"); - SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "mbuf_defrag_fail", - CTLFLAG_RD, &adapter->mbuf_defrag_failed, - "Defragmenting mbuf chain failed"); - SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "tx_dma_fail", - CTLFLAG_RD, &adapter->no_tx_dma_setup, - "Driver tx dma failure in xmit"); + "Link MSI-X IRQ Handled"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "rx_overruns", CTLFLAG_RD, &adapter->rx_overruns, "RX overruns"); @@ -4518,7 +4570,7 @@ em_print_debug_info(struct adapter *adapter) /* * 82574 only: - * Write a new value to the EEPROM increasing the number of MSIX + * Write a new value to the EEPROM increasing the number of MSI-X * vectors from 3 to 5, for proper multiqueue support. */ static void @@ -4530,10 +4582,11 @@ em_enable_vectors_82574(if_ctx_t ctx) u16 edata; e1000_read_nvm(hw, EM_NVM_PCIE_CTRL, 1, &edata); - printf("Current cap: %#06x\n", edata); + if (bootverbose) + device_printf(dev, "EM_NVM_PCIE_CTRL = %#06x\n", edata); if (((edata & EM_NVM_MSIX_N_MASK) >> EM_NVM_MSIX_N_SHIFT) != 4) { device_printf(dev, "Writing to eeprom: increasing " - "reported MSIX vectors from 3 to 5...\n"); + "reported MSI-X vectors from 3 to 5...\n"); edata &= ~(EM_NVM_MSIX_N_MASK); edata |= 4 << EM_NVM_MSIX_N_SHIFT; e1000_write_nvm(hw, EM_NVM_PCIE_CTRL, 1, &edata); diff --git a/freebsd/sys/dev/e1000/if_em.h b/freebsd/sys/dev/e1000/if_em.h index d573107b..392f6b3a 100644 --- a/freebsd/sys/dev/e1000/if_em.h +++ b/freebsd/sys/dev/e1000/if_em.h @@ -352,8 +352,8 @@ /* * 82574 has a nonstandard address for EIAC - * and since its only used in MSIX, and in - * the em driver only 82574 uses MSIX we can + * and since its only used in MSI-X, and in + * the em driver only 82574 uses MSI-X we can * solve it just using this define. */ #define EM_EIAC 0x000DC @@ -468,7 +468,6 @@ struct adapter { struct resource *memory; struct resource *flash; struct resource *ioport; - int io_rid; struct resource *res; void *tag; @@ -520,7 +519,6 @@ struct adapter { u64 que_mask; - struct em_int_delay_info tx_int_delay; struct em_int_delay_info tx_abs_int_delay; struct em_int_delay_info rx_int_delay; @@ -530,9 +528,6 @@ struct adapter { /* Misc stats maintained by the driver */ unsigned long dropped_pkts; unsigned long link_irq; - unsigned long mbuf_defrag_failed; - unsigned long no_tx_dma_setup; - unsigned long no_tx_map_avail; unsigned long rx_overruns; unsigned long watchdog_events; diff --git a/freebsd/sys/dev/e1000/igb_txrx.c b/freebsd/sys/dev/e1000/igb_txrx.c index c54315f0..6da52b7e 100644 --- a/freebsd/sys/dev/e1000/igb_txrx.c +++ b/freebsd/sys/dev/e1000/igb_txrx.c @@ -334,16 +334,11 @@ igb_isc_txd_credits_update(void *arg, uint16_t txqid, bool clear) prev = txr->tx_cidx_processed; ntxd = scctx->isc_ntxd[0]; do { + MPASS(prev != cur); delta = (int32_t)cur - (int32_t)prev; - /* - * XXX This appears to be a hack for first-packet. - * A correct fix would prevent prev == cur in the first place. - */ - MPASS(prev == 0 || delta != 0); - if (prev == 0 && cur == 0) - delta += 1; if (delta < 0) delta += ntxd; + MPASS(delta > 0); processed += delta; prev = cur; diff --git a/freebsd/sys/dev/evdev/evdev.c b/freebsd/sys/dev/evdev/evdev.c index 63e651a2..90d6423d 100644 --- a/freebsd/sys/dev/evdev/evdev.c +++ b/freebsd/sys/dev/evdev/evdev.c @@ -75,14 +75,16 @@ int evdev_rcpt_mask = EVDEV_RCPT_HW_MOUSE | EVDEV_RCPT_HW_KBD; #endif /* __rtems__ */ int evdev_sysmouse_t_axis = 0; -#ifdef EVDEV_SUPPORT SYSCTL_NODE(_kern, OID_AUTO, evdev, CTLFLAG_RW, 0, "Evdev args"); +#ifdef EVDEV_SUPPORT SYSCTL_INT(_kern_evdev, OID_AUTO, rcpt_mask, CTLFLAG_RW, &evdev_rcpt_mask, 0, "Who is receiving events: bit0 - sysmouse, bit1 - kbdmux, " "bit2 - mouse hardware, bit3 - keyboard hardware"); SYSCTL_INT(_kern_evdev, OID_AUTO, sysmouse_t_axis, CTLFLAG_RW, &evdev_sysmouse_t_axis, 0, "Extract T-axis from 0-none, 1-ums, 2-psm"); #endif +SYSCTL_NODE(_kern_evdev, OID_AUTO, input, CTLFLAG_RD, 0, + "Evdev input devices"); static void evdev_start_repeat(struct evdev_dev *, uint16_t); static void evdev_stop_repeat(struct evdev_dev *); @@ -202,6 +204,87 @@ evdev_estimate_report_size(struct evdev_dev *evdev) return (size); } +static void +evdev_sysctl_create(struct evdev_dev *evdev) +{ + struct sysctl_oid *ev_sysctl_tree; + char ev_unit_str[8]; + + snprintf(ev_unit_str, sizeof(ev_unit_str), "%d", evdev->ev_unit); + sysctl_ctx_init(&evdev->ev_sysctl_ctx); + + ev_sysctl_tree = SYSCTL_ADD_NODE_WITH_LABEL(&evdev->ev_sysctl_ctx, + SYSCTL_STATIC_CHILDREN(_kern_evdev_input), OID_AUTO, + ev_unit_str, CTLFLAG_RD, NULL, "", "device index"); + + SYSCTL_ADD_STRING(&evdev->ev_sysctl_ctx, + SYSCTL_CHILDREN(ev_sysctl_tree), OID_AUTO, "name", CTLFLAG_RD, + evdev->ev_name, 0, + "Input device name"); + + SYSCTL_ADD_STRUCT(&evdev->ev_sysctl_ctx, + SYSCTL_CHILDREN(ev_sysctl_tree), OID_AUTO, "id", CTLFLAG_RD, + &evdev->ev_id, input_id, + "Input device identification"); + + /* ioctl returns ENOENT if phys is not set. sysctl returns "" here */ + SYSCTL_ADD_STRING(&evdev->ev_sysctl_ctx, + SYSCTL_CHILDREN(ev_sysctl_tree), OID_AUTO, "phys", CTLFLAG_RD, + evdev->ev_shortname, 0, + "Input device short name"); + + /* ioctl returns ENOENT if uniq is not set. sysctl returns "" here */ + SYSCTL_ADD_STRING(&evdev->ev_sysctl_ctx, + SYSCTL_CHILDREN(ev_sysctl_tree), OID_AUTO, "uniq", CTLFLAG_RD, + evdev->ev_serial, 0, + "Input device unique number"); + + SYSCTL_ADD_OPAQUE(&evdev->ev_sysctl_ctx, + SYSCTL_CHILDREN(ev_sysctl_tree), OID_AUTO, "props", CTLFLAG_RD, + evdev->ev_prop_flags, sizeof(evdev->ev_prop_flags), "", + "Input device properties"); + + SYSCTL_ADD_OPAQUE(&evdev->ev_sysctl_ctx, + SYSCTL_CHILDREN(ev_sysctl_tree), OID_AUTO, "type_bits", CTLFLAG_RD, + evdev->ev_type_flags, sizeof(evdev->ev_type_flags), "", + "Input device supported events types"); + + SYSCTL_ADD_OPAQUE(&evdev->ev_sysctl_ctx, + SYSCTL_CHILDREN(ev_sysctl_tree), OID_AUTO, "key_bits", CTLFLAG_RD, + evdev->ev_key_flags, sizeof(evdev->ev_key_flags), + "", "Input device supported keys"); + + SYSCTL_ADD_OPAQUE(&evdev->ev_sysctl_ctx, + SYSCTL_CHILDREN(ev_sysctl_tree), OID_AUTO, "rel_bits", CTLFLAG_RD, + evdev->ev_rel_flags, sizeof(evdev->ev_rel_flags), "", + "Input device supported relative events"); + + SYSCTL_ADD_OPAQUE(&evdev->ev_sysctl_ctx, + SYSCTL_CHILDREN(ev_sysctl_tree), OID_AUTO, "abs_bits", CTLFLAG_RD, + evdev->ev_abs_flags, sizeof(evdev->ev_abs_flags), "", + "Input device supported absolute events"); + + SYSCTL_ADD_OPAQUE(&evdev->ev_sysctl_ctx, + SYSCTL_CHILDREN(ev_sysctl_tree), OID_AUTO, "msc_bits", CTLFLAG_RD, + evdev->ev_msc_flags, sizeof(evdev->ev_msc_flags), "", + "Input device supported miscellaneous events"); + + SYSCTL_ADD_OPAQUE(&evdev->ev_sysctl_ctx, + SYSCTL_CHILDREN(ev_sysctl_tree), OID_AUTO, "led_bits", CTLFLAG_RD, + evdev->ev_led_flags, sizeof(evdev->ev_led_flags), "", + "Input device supported LED events"); + + SYSCTL_ADD_OPAQUE(&evdev->ev_sysctl_ctx, + SYSCTL_CHILDREN(ev_sysctl_tree), OID_AUTO, "snd_bits", CTLFLAG_RD, + evdev->ev_snd_flags, sizeof(evdev->ev_snd_flags), "", + "Input device supported sound events"); + + SYSCTL_ADD_OPAQUE(&evdev->ev_sysctl_ctx, + SYSCTL_CHILDREN(ev_sysctl_tree), OID_AUTO, "sw_bits", CTLFLAG_RD, + evdev->ev_sw_flags, sizeof(evdev->ev_sw_flags), "", + "Input device supported switch events"); +} + static int evdev_register_common(struct evdev_dev *evdev) { @@ -241,6 +324,12 @@ evdev_register_common(struct evdev_dev *evdev) /* Create char device node */ ret = evdev_cdev_create(evdev); + if (ret != 0) + goto bail_out; + + /* Create sysctls (for device enumeration without /dev/input access rights) */ + evdev_sysctl_create(evdev); + bail_out: return (ret); } @@ -278,6 +367,8 @@ evdev_unregister(struct evdev_dev *evdev) debugf(evdev, "%s: unregistered evdev provider: %s\n", evdev->ev_shortname, evdev->ev_name); + sysctl_ctx_free(&evdev->ev_sysctl_ctx); + EVDEV_LOCK(evdev); evdev->ev_cdev->si_drv1 = NULL; /* Wake up sleepers */ diff --git a/freebsd/sys/dev/evdev/evdev_private.h b/freebsd/sys/dev/evdev/evdev_private.h index 71bdecaa..d7f0b4ea 100644 --- a/freebsd/sys/dev/evdev/evdev_private.h +++ b/freebsd/sys/dev/evdev/evdev_private.h @@ -32,9 +32,12 @@ #include <sys/bitstring.h> #include <sys/kbio.h> +#include <sys/lock.h> #include <sys/malloc.h> +#include <sys/mutex.h> #include <sys/queue.h> #include <sys/selinfo.h> +#include <sys/sysctl.h> #include <dev/evdev/evdev.h> #include <dev/evdev/input.h> @@ -132,6 +135,9 @@ struct evdev_dev const struct evdev_methods * ev_methods; void * ev_softc; + /* Sysctl: */ + struct sysctl_ctx_list ev_sysctl_ctx; + LIST_ENTRY(evdev_dev) ev_link; LIST_HEAD(, evdev_client) ev_clients; }; diff --git a/freebsd/sys/dev/extres/clk/clk.h b/freebsd/sys/dev/extres/clk/clk.h index 617cd5e7..7e9fe4eb 100644 --- a/freebsd/sys/dev/extres/clk/clk.h +++ b/freebsd/sys/dev/extres/clk/clk.h @@ -48,6 +48,7 @@ #define CLK_SET_ROUND_EXACT 0 #define CLK_SET_ROUND_UP 0x00000001 #define CLK_SET_ROUND_DOWN 0x00000002 +#define CLK_SET_ROUND_MULTIPLE 0x00000004 #define CLK_SET_ROUND_ANY (CLK_SET_ROUND_UP | CLK_SET_ROUND_DOWN) #define CLK_SET_USER_MASK 0x0000FFFF diff --git a/freebsd/sys/dev/fb/fbd.c b/freebsd/sys/dev/fb/fbd.c index 871e193c..56f3605d 100644 --- a/freebsd/sys/dev/fb/fbd.c +++ b/freebsd/sys/dev/fb/fbd.c @@ -44,6 +44,7 @@ __FBSDID("$FreeBSD$"); #include <sys/systm.h> #include <sys/bus.h> #include <sys/conf.h> +#include <sys/eventhandler.h> #include <sys/kernel.h> #include <sys/malloc.h> #include <sys/module.h> diff --git a/freebsd/sys/dev/fdt/fdt_common.c b/freebsd/sys/dev/fdt/fdt_common.c index ff32dc0a..12979fda 100644 --- a/freebsd/sys/dev/fdt/fdt_common.c +++ b/freebsd/sys/dev/fdt/fdt_common.c @@ -401,6 +401,9 @@ fdt_get_phyaddr(phandle_t node, device_t dev, int *phy_addr, void **phy_sc) *phy_addr = phy_reg; + if (phy_sc == NULL) + return (0); + /* * Search for softc used to communicate with phy. */ diff --git a/freebsd/sys/dev/gpio/gpiobus.c b/freebsd/sys/dev/gpio/gpiobus.c index 2b1899e6..d256ee4a 100644 --- a/freebsd/sys/dev/gpio/gpiobus.c +++ b/freebsd/sys/dev/gpio/gpiobus.c @@ -257,13 +257,6 @@ gpiobus_alloc_ivars(struct gpiobus_ivar *devi) M_NOWAIT | M_ZERO); if (devi->pins == NULL) return (ENOMEM); - devi->flags = malloc(sizeof(uint32_t) * devi->npins, M_DEVBUF, - M_NOWAIT | M_ZERO); - if (devi->flags == NULL) { - free(devi->pins, M_DEVBUF); - return (ENOMEM); - } - return (0); } @@ -271,14 +264,11 @@ void gpiobus_free_ivars(struct gpiobus_ivar *devi) { - if (devi->flags) { - free(devi->flags, M_DEVBUF); - devi->flags = NULL; - } if (devi->pins) { free(devi->pins, M_DEVBUF); devi->pins = NULL; } + devi->npins = 0; } int @@ -328,6 +318,34 @@ gpiobus_release_pin(device_t bus, uint32_t pin) } static int +gpiobus_acquire_child_pins(device_t dev, device_t child) +{ + struct gpiobus_ivar *devi = GPIOBUS_IVAR(child); + int i; + + for (i = 0; i < devi->npins; i++) { + /* Reserve the GPIO pin. */ + if (gpiobus_acquire_pin(dev, devi->pins[i]) != 0) { + device_printf(child, "cannot acquire pin %d\n", + devi->pins[i]); + while (--i >= 0) { + (void)gpiobus_release_pin(dev, + devi->pins[i]); + } + gpiobus_free_ivars(devi); + return (EBUSY); + } + } + for (i = 0; i < devi->npins; i++) { + /* Use the child name as pin name. */ + GPIOBUS_PIN_SETNAME(dev, devi->pins[i], + device_get_nameunit(child)); + + } + return (0); +} + +static int gpiobus_parse_pins(struct gpiobus_softc *sc, device_t child, int mask) { struct gpiobus_ivar *devi = GPIOBUS_IVAR(child); @@ -351,17 +369,66 @@ gpiobus_parse_pins(struct gpiobus_softc *sc, device_t child, int mask) for (i = 0; i < 32; i++) { if ((mask & (1 << i)) == 0) continue; - /* Reserve the GPIO pin. */ - if (gpiobus_acquire_pin(sc->sc_busdev, i) != 0) { - gpiobus_free_ivars(devi); - return (EINVAL); - } devi->pins[npins++] = i; - /* Use the child name as pin name. */ - GPIOBUS_PIN_SETNAME(sc->sc_busdev, i, - device_get_nameunit(child)); } + if (gpiobus_acquire_child_pins(sc->sc_busdev, child) != 0) + return (EINVAL); + return (0); +} + +static int +gpiobus_parse_pin_list(struct gpiobus_softc *sc, device_t child, + const char *pins) +{ + struct gpiobus_ivar *devi = GPIOBUS_IVAR(child); + const char *p; + char *endp; + unsigned long pin; + int i, npins; + + npins = 0; + p = pins; + for (;;) { + pin = strtoul(p, &endp, 0); + if (endp == p) + break; + npins++; + if (*endp == '\0') + break; + p = endp + 1; + } + + if (*endp != '\0') { + device_printf(child, "garbage in the pin list: %s\n", endp); + return (EINVAL); + } + if (npins == 0) { + device_printf(child, "empty pin list\n"); + return (EINVAL); + } + + devi->npins = npins; + if (gpiobus_alloc_ivars(devi) != 0) { + device_printf(child, "cannot allocate device ivars\n"); + return (EINVAL); + } + + i = 0; + p = pins; + for (;;) { + pin = strtoul(p, &endp, 0); + + devi->pins[i] = pin; + + if (*endp == '\0') + break; + i++; + p = endp + 1; + } + + if (gpiobus_acquire_child_pins(sc->sc_busdev, child) != 0) + return (EINVAL); return (0); } @@ -541,15 +608,26 @@ gpiobus_hinted_child(device_t bus, const char *dname, int dunit) struct gpiobus_softc *sc = GPIOBUS_SOFTC(bus); struct gpiobus_ivar *devi; device_t child; - int irq, pins; + const char *pins; + int irq, pinmask; child = BUS_ADD_CHILD(bus, 0, dname, dunit); devi = GPIOBUS_IVAR(child); - resource_int_value(dname, dunit, "pins", &pins); - if (gpiobus_parse_pins(sc, child, pins)) { - resource_list_free(&devi->rl); - free(devi, M_DEVBUF); - device_delete_child(bus, child); + if (resource_int_value(dname, dunit, "pins", &pinmask) == 0) { + if (gpiobus_parse_pins(sc, child, pinmask)) { + resource_list_free(&devi->rl); + free(devi, M_DEVBUF); + device_delete_child(bus, child); + return; + } + } + else if (resource_string_value(dname, dunit, "pin_list", &pins) == 0) { + if (gpiobus_parse_pin_list(sc, child, pins)) { + resource_list_free(&devi->rl); + free(devi, M_DEVBUF); + device_delete_child(bus, child); + return; + } } if (resource_int_value(dname, dunit, "irq", &irq) == 0) { if (bus_set_resource(child, SYS_RES_IRQ, 0, irq, 1) != 0) @@ -576,6 +654,61 @@ gpiobus_set_resource(device_t dev, device_t child, int type, int rid, return (0); } +static int +gpiobus_read_ivar(device_t dev, device_t child, int which, uintptr_t *result) +{ + struct gpiobus_ivar *devi; + + devi = GPIOBUS_IVAR(child); + switch (which) { + case GPIOBUS_IVAR_NPINS: + *result = devi->npins; + break; + case GPIOBUS_IVAR_PINS: + /* Children do not ever need to directly examine this. */ + return (ENOTSUP); + default: + return (ENOENT); + } + + return (0); +} + +static int +gpiobus_write_ivar(device_t dev, device_t child, int which, uintptr_t value) +{ + struct gpiobus_ivar *devi; + const uint32_t *ptr; + int i; + + devi = GPIOBUS_IVAR(child); + switch (which) { + case GPIOBUS_IVAR_NPINS: + /* GPIO ivars are set once. */ + if (devi->npins != 0) { + return (EBUSY); + } + devi->npins = value; + if (gpiobus_alloc_ivars(devi) != 0) { + device_printf(child, "cannot allocate device ivars\n"); + devi->npins = 0; + return (ENOMEM); + } + break; + case GPIOBUS_IVAR_PINS: + ptr = (const uint32_t *)value; + for (i = 0; i < devi->npins; i++) + devi->pins[i] = ptr[i]; + if (gpiobus_acquire_child_pins(dev, child) != 0) + return (EBUSY); + break; + default: + return (ENOENT); + } + + return (0); +} + static struct resource * gpiobus_alloc_resource(device_t bus, device_t child, int type, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags) @@ -835,6 +968,8 @@ static device_method_t gpiobus_methods[] = { DEVMETHOD(bus_child_pnpinfo_str, gpiobus_child_pnpinfo_str), DEVMETHOD(bus_child_location_str, gpiobus_child_location_str), DEVMETHOD(bus_hinted_child, gpiobus_hinted_child), + DEVMETHOD(bus_read_ivar, gpiobus_read_ivar), + DEVMETHOD(bus_write_ivar, gpiobus_write_ivar), /* GPIO protocol */ DEVMETHOD(gpiobus_acquire_bus, gpiobus_acquire_bus), diff --git a/freebsd/sys/dev/gpio/gpiobusvar.h b/freebsd/sys/dev/gpio/gpiobusvar.h index 29677298..3ba8993e 100644 --- a/freebsd/sys/dev/gpio/gpiobusvar.h +++ b/freebsd/sys/dev/gpio/gpiobusvar.h @@ -107,10 +107,22 @@ struct gpiobus_ivar { struct resource_list rl; /* isr resource list */ uint32_t npins; /* pins total */ - uint32_t *flags; /* pins flags */ uint32_t *pins; /* pins map */ }; +enum gpiobus_ivars { + GPIOBUS_IVAR_NPINS = 10500, + GPIOBUS_IVAR_PINS, +}; + +#define GPIOBUS_ACCESSOR(var, ivar, type) \ + __BUS_ACCESSOR(gpiobus, var, GPIOBUS, ivar, type) + +GPIOBUS_ACCESSOR(npins, NPINS, uint32_t) +GPIOBUS_ACCESSOR(pins, PINS, const uint32_t *) + +#undef GPIOBUS_ACCESSOR + #ifdef FDT struct ofw_gpiobus_devinfo { struct gpiobus_ivar opd_dinfo; diff --git a/freebsd/sys/dev/gpio/ofw_gpiobus.c b/freebsd/sys/dev/gpio/ofw_gpiobus.c index ac0ea9bf..1cf3aa82 100644 --- a/freebsd/sys/dev/gpio/ofw_gpiobus.c +++ b/freebsd/sys/dev/gpio/ofw_gpiobus.c @@ -327,10 +327,8 @@ ofw_gpiobus_setup_devinfo(device_t bus, device_t child, phandle_t node) ofw_gpiobus_destroy_devinfo(bus, dinfo); return (NULL); } - for (i = 0; i < devi->npins; i++) { - devi->flags[i] = pins[i].flags; + for (i = 0; i < devi->npins; i++) devi->pins[i] = pins[i].pin; - } free(pins, M_DEVBUF); /* Parse the interrupt resources. */ if (ofw_bus_intr_to_rl(bus, node, &dinfo->opd_dinfo.rl, NULL) != 0) { diff --git a/freebsd/sys/dev/iicbus/iicbus.c b/freebsd/sys/dev/iicbus/iicbus.c index d1e55483..db8c8d92 100644 --- a/freebsd/sys/dev/iicbus/iicbus.c +++ b/freebsd/sys/dev/iicbus/iicbus.c @@ -196,9 +196,6 @@ iicbus_read_ivar(device_t bus, device_t child, int which, uintptr_t *result) case IICBUS_IVAR_ADDR: *result = devi->addr; break; - case IICBUS_IVAR_NOSTOP: - *result = devi->nostop; - break; } return (0); } @@ -215,9 +212,6 @@ iicbus_write_ivar(device_t bus, device_t child, int which, uintptr_t value) if (devi->addr != 0) return (EINVAL); devi->addr = value; - case IICBUS_IVAR_NOSTOP: - devi->nostop = value; - break; } return (0); } diff --git a/freebsd/sys/dev/iicbus/iicbus.h b/freebsd/sys/dev/iicbus/iicbus.h index 503305c7..c6382b63 100644 --- a/freebsd/sys/dev/iicbus/iicbus.h +++ b/freebsd/sys/dev/iicbus/iicbus.h @@ -41,6 +41,7 @@ struct iicbus_softc { device_t dev; /* Myself */ device_t owner; /* iicbus owner device structure */ + device_t busydev; /* iicbus_release_bus calls unbusy on this */ u_int owncount; /* iicbus ownership nesting count */ u_char started; /* address of the 'started' slave * 0 if no start condition succeeded */ @@ -54,24 +55,27 @@ struct iicbus_ivar { uint32_t addr; struct resource_list rl; - bool nostop; }; enum { - IICBUS_IVAR_ADDR, /* Address or base address */ - IICBUS_IVAR_NOSTOP, /* nostop defaults */ + IICBUS_IVAR_ADDR /* Address or base address */ }; #define IICBUS_ACCESSOR(A, B, T) \ __BUS_ACCESSOR(iicbus, A, IICBUS, B, T) IICBUS_ACCESSOR(addr, ADDR, uint32_t) -IICBUS_ACCESSOR(nostop, NOSTOP, bool) #define IICBUS_LOCK(sc) mtx_lock(&(sc)->lock) #define IICBUS_UNLOCK(sc) mtx_unlock(&(sc)->lock) #define IICBUS_ASSERT_LOCKED(sc) mtx_assert(&(sc)->lock, MA_OWNED) +#ifdef FDT +#define IICBUS_FDT_PNP_INFO(t) FDTCOMPAT_PNP_INFO(t, iicbus) +#else +#define IICBUS_FDT_PNP_INFO(t) +#endif + int iicbus_generic_intr(device_t dev, int event, char *buf); void iicbus_init_frequency(device_t dev, u_int bus_freq); diff --git a/freebsd/sys/dev/iicbus/iiconf.c b/freebsd/sys/dev/iicbus/iiconf.c index afdce118..c4926852 100644 --- a/freebsd/sys/dev/iicbus/iiconf.c +++ b/freebsd/sys/dev/iicbus/iiconf.c @@ -44,6 +44,18 @@ __FBSDID("$FreeBSD$"); #include <rtems/bsd/local/iicbus_if.h> /* + * Encode a system errno value into the IIC_Exxxxx space by setting the + * IIC_ERRNO marker bit, so that iic2errno() can turn it back into a plain + * system errno value later. This lets controller- and bus-layer code get + * important system errno values (such as EINTR/ERESTART) back to the caller. + */ +int +errno2iic(int error) +{ + return ((error == 0) ? 0 : error | IIC_ERRNO); +} + +/* * Translate IIC_Exxxxx status values to vaguely-equivelent errno values. */ int @@ -61,7 +73,22 @@ iic2errno(int iic_status) case IIC_ENOTSUPP: return (EOPNOTSUPP); case IIC_ENOADDR: return (EADDRNOTAVAIL); case IIC_ERESOURCE: return (ENOMEM); - default: return (EIO); + default: + /* + * If the high bit is set, that means it's a system errno value + * that was encoded into the IIC_Exxxxxx space by setting the + * IIC_ERRNO marker bit. If lots of high-order bits are set, + * then it's one of the negative pseudo-errors such as ERESTART + * and we return it as-is. Otherwise it's a plain "small + * positive integer" errno, so just remove the IIC_ERRNO marker + * bit. If it's some unknown number without the high bit set, + * there isn't much we can do except call it an I/O error. + */ + if ((iic_status & IIC_ERRNO) == 0) + return (EIO); + if ((iic_status & 0xFFFF0000) != 0) + return (iic_status); + return (iic_status & ~IIC_ERRNO); } } @@ -99,7 +126,7 @@ iicbus_poll(struct iicbus_softc *sc, int how) return (IIC_EBUSBSY); } - return (error); + return (errno2iic(error)); } /* @@ -130,6 +157,18 @@ iicbus_request_bus(device_t bus, device_t dev, int how) ++sc->owncount; if (sc->owner == NULL) { sc->owner = dev; + /* + * Mark the device busy while it owns the bus, to + * prevent detaching the device, bus, or hardware + * controller, until ownership is relinquished. If the + * device is doing IO from its probe method before + * attaching, it cannot be busied; mark the bus busy. + */ + if (device_get_state(dev) < DS_ATTACHING) + sc->busydev = bus; + else + sc->busydev = dev; + device_busy(sc->busydev); /* * Drop the lock around the call to the bus driver, it * should be allowed to sleep in the IIC_WAIT case. @@ -146,6 +185,7 @@ iicbus_request_bus(device_t bus, device_t dev, int how) sc->owner = NULL; sc->owncount = 0; wakeup_one(sc); + device_unbusy(sc->busydev); } } } @@ -179,6 +219,7 @@ iicbus_release_bus(device_t bus, device_t dev) IICBUS_LOCK(sc); sc->owner = NULL; wakeup_one(sc); + device_unbusy(sc->busydev); } IICBUS_UNLOCK(sc); return (0); @@ -422,7 +463,7 @@ iicbus_transfer_gen(device_t dev, struct iic_msg *msgs, uint32_t nmsgs) { int i, error, lenread, lenwrote, nkid, rpstart, addr; device_t *children, bus; - bool nostop, started; + bool started; if ((error = device_get_children(dev, &children, &nkid)) != 0) return (IIC_ERESOURCE); @@ -433,7 +474,6 @@ iicbus_transfer_gen(device_t dev, struct iic_msg *msgs, uint32_t nmsgs) bus = children[0]; rpstart = 0; free(children, M_TEMP); - nostop = iicbus_get_nostop(dev); started = false; for (i = 0, error = 0; i < nmsgs && error == 0; i++) { addr = msgs[i].slave; @@ -461,12 +501,11 @@ iicbus_transfer_gen(device_t dev, struct iic_msg *msgs, uint32_t nmsgs) if (error != 0) break; - if ((msgs[i].flags & IIC_M_NOSTOP) != 0 || - (nostop && i + 1 < nmsgs)) { - rpstart = 1; /* Next message gets repeated start */ - } else { + if (!(msgs[i].flags & IIC_M_NOSTOP)) { rpstart = 0; iicbus_stop(bus); + } else { + rpstart = 1; /* Next message gets repeated start */ } } if (error != 0 && started) diff --git a/freebsd/sys/dev/iicbus/iiconf.h b/freebsd/sys/dev/iicbus/iiconf.h index c264183e..5fbfcea5 100644 --- a/freebsd/sys/dev/iicbus/iiconf.h +++ b/freebsd/sys/dev/iicbus/iiconf.h @@ -96,12 +96,14 @@ #define IIC_ENOTSUPP 0x8 /* request not supported */ #define IIC_ENOADDR 0x9 /* no address assigned to the interface */ #define IIC_ERESOURCE 0xa /* resources (memory, whatever) unavailable */ +#define IIC_ERRNO INT_MIN /* marker bit: errno is in low-order bits */ /* * Note that all iicbus functions return IIC_Exxxxx status values, * except iic2errno() (obviously) and iicbus_started() (returns bool). */ extern int iic2errno(int); +extern int errno2iic(int); extern int iicbus_request_bus(device_t, device_t, int); extern int iicbus_release_bus(device_t, device_t); extern device_t iicbus_alloc_bus(device_t); diff --git a/freebsd/sys/dev/led/led.c b/freebsd/sys/dev/led/led.c index 70de95bb..43ae2f66 100644 --- a/freebsd/sys/dev/led/led.c +++ b/freebsd/sys/dev/led/led.c @@ -17,16 +17,19 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/conf.h> +#include <sys/ctype.h> #include <sys/kernel.h> -#include <sys/systm.h> #include <sys/limits.h> +#include <sys/lock.h> #include <sys/malloc.h> -#include <sys/ctype.h> -#include <sys/sbuf.h> +#include <sys/mutex.h> #include <sys/queue.h> -#include <dev/led/led.h> -#include <sys/uio.h> +#include <sys/sbuf.h> #include <sys/sx.h> +#include <sys/systm.h> +#include <sys/uio.h> + +#include <dev/led/led.h> struct ledsc { LIST_ENTRY(ledsc) list; @@ -263,7 +266,7 @@ led_set(char const *name, char const *cmd) mtx_unlock(&led_mtx); if (sb != NULL) sbuf_delete(sb); - return (0); + return (error); } static struct cdevsw led_cdevsw = { diff --git a/freebsd/sys/dev/mii/micphy.c b/freebsd/sys/dev/mii/micphy.c index 01e75357..f9493332 100644 --- a/freebsd/sys/dev/mii/micphy.c +++ b/freebsd/sys/dev/mii/micphy.c @@ -1,7 +1,7 @@ #include <machine/rtems-bsd-kernel-space.h> /*- - * Copyright (c) 2014 Ruslan Bukin <br@bsdpad.com> + * Copyright (c) 2014,2019 Ruslan Bukin <br@bsdpad.com> * All rights reserved. * * This software was developed by SRI International and the University of @@ -34,7 +34,7 @@ __FBSDID("$FreeBSD$"); /* - * Micrel KSZ9021 Gigabit Ethernet Transceiver + * Micrel KSZ8081/KSZ9021/KSZ9031 Gigabit Ethernet Transceiver */ #include <sys/param.h> @@ -61,6 +61,7 @@ __FBSDID("$FreeBSD$"); #include <dev/ofw/openfirm.h> #include <dev/ofw/ofw_bus.h> #include <dev/ofw/ofw_bus_subr.h> +#include <dev/mii/mii_fdt.h> #define MII_KSZPHY_EXTREG 0x0b #define KSZPHY_EXTREG_WRITE (1 << 15) @@ -253,6 +254,7 @@ micphy_probe(device_t dev) static int micphy_attach(device_t dev) { + mii_fdt_phy_config_t *cfg; struct mii_softc *sc; phandle_t node; device_t miibus; @@ -273,10 +275,12 @@ micphy_attach(device_t dev) if ((node = ofw_bus_get_node(parent)) == -1) return (ENXIO); + cfg = mii_fdt_get_config(dev); + if (sc->mii_mpd_model == MII_MODEL_MICREL_KSZ9031) - ksz9031_load_values(sc, node); + ksz9031_load_values(sc, cfg->phynode); else - ksz9021_load_values(sc, node); + ksz9021_load_values(sc, cfg->phynode); return (0); } diff --git a/freebsd/sys/dev/mmc/bridge.h b/freebsd/sys/dev/mmc/bridge.h index 7af811f1..d32abbac 100644 --- a/freebsd/sys/dev/mmc/bridge.h +++ b/freebsd/sys/dev/mmc/bridge.h @@ -1,7 +1,7 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * - * Copyright (c) 2006 M. Warner Losh. All rights reserved. + * Copyright (c) 2006 M. Warner Losh. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions diff --git a/freebsd/sys/dev/mmc/mmc.c b/freebsd/sys/dev/mmc/mmc.c index 4c8aefcf..5bc3bbf7 100644 --- a/freebsd/sys/dev/mmc/mmc.c +++ b/freebsd/sys/dev/mmc/mmc.c @@ -4,7 +4,7 @@ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2006 Bernd Walter. All rights reserved. - * Copyright (c) 2006 M. Warner Losh. All rights reserved. + * Copyright (c) 2006 M. Warner Losh. * Copyright (c) 2017 Marius Strobl <marius@FreeBSD.org> * * Redistribution and use in source and binary forms, with or without diff --git a/freebsd/sys/dev/mmc/mmc_private.h b/freebsd/sys/dev/mmc/mmc_private.h index 633d0784..a633d235 100644 --- a/freebsd/sys/dev/mmc/mmc_private.h +++ b/freebsd/sys/dev/mmc/mmc_private.h @@ -1,6 +1,6 @@ /*- * Copyright (c) 2006 Bernd Walter. All rights reserved. - * Copyright (c) 2006 M. Warner Losh. All rights reserved. + * Copyright (c) 2006 M. Warner Losh. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions diff --git a/freebsd/sys/dev/mmc/mmc_subr.c b/freebsd/sys/dev/mmc/mmc_subr.c index 76a14028..1e3bef16 100644 --- a/freebsd/sys/dev/mmc/mmc_subr.c +++ b/freebsd/sys/dev/mmc/mmc_subr.c @@ -2,7 +2,7 @@ /*- * Copyright (c) 2006 Bernd Walter. All rights reserved. - * Copyright (c) 2006 M. Warner Losh. All rights reserved. + * Copyright (c) 2006 M. Warner Losh. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions diff --git a/freebsd/sys/dev/mmc/mmc_subr.h b/freebsd/sys/dev/mmc/mmc_subr.h index 33ea6760..80c1ce2f 100644 --- a/freebsd/sys/dev/mmc/mmc_subr.h +++ b/freebsd/sys/dev/mmc/mmc_subr.h @@ -1,6 +1,6 @@ /*- * Copyright (c) 2006 Bernd Walter. All rights reserved. - * Copyright (c) 2006 M. Warner Losh. All rights reserved. + * Copyright (c) 2006 M. Warner Losh. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions diff --git a/freebsd/sys/dev/mmc/mmcbrvar.h b/freebsd/sys/dev/mmc/mmcbrvar.h index acddd3a3..f2f107c7 100644 --- a/freebsd/sys/dev/mmc/mmcbrvar.h +++ b/freebsd/sys/dev/mmc/mmcbrvar.h @@ -2,7 +2,7 @@ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2006 Bernd Walter. All rights reserved. - * Copyright (c) 2006 M. Warner Losh. All rights reserved. + * Copyright (c) 2006 M. Warner Losh. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions diff --git a/freebsd/sys/dev/mmc/mmcreg.h b/freebsd/sys/dev/mmc/mmcreg.h index 4b1f8a0e..f6031410 100644 --- a/freebsd/sys/dev/mmc/mmcreg.h +++ b/freebsd/sys/dev/mmc/mmcreg.h @@ -1,7 +1,7 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * - * Copyright (c) 2006 M. Warner Losh. All rights reserved. + * Copyright (c) 2006 M. Warner Losh. * Copyright (c) 2017 Marius Strobl <marius@FreeBSD.org> * Copyright (c) 2015-2016 Ilya Bakulin <kibab@FreeBSD.org> * @@ -200,7 +200,10 @@ struct mmc_data { #define MMC_DATA_READ (1UL << 1) #define MMC_DATA_STREAM (1UL << 2) #define MMC_DATA_MULTI (1UL << 3) +#define MMC_DATA_BLOCK_SIZE (1UL << 4) struct mmc_request *mrq; + size_t block_size; /* block size for CMD53 */ + size_t block_count; /* block count for CMD53 */ }; struct mmc_request { @@ -554,30 +557,39 @@ struct mmc_request { #define SD_IO_RW_LEN(x) (((x) & 0xFF) << 0) #define SD_IOE_RW_LEN(x) (((x) & 0x1FF) << 0) +#define SD_IOE_RW_ADR(x) (((x) & 0x1FFFF) << 9) +#define SD_IOE_RW_INCR (1u << 26) #define SD_IOE_RW_BLK (1u << 27) +#define SD_IOE_RW_FUNC(x) (((x) & 0x7) << 28) +#define SD_IOE_RW_WR (1u << 31) /* Card Common Control Registers (CCCR) */ -#define SD_IO_CCCR_START 0x00000 -#define SD_IO_CCCR_SIZE 0x100 -#define SD_IO_CCCR_FN_ENABLE 0x02 -#define SD_IO_CCCR_FN_READY 0x03 -#define SD_IO_CCCR_INT_ENABLE 0x04 -#define SD_IO_CCCR_INT_PENDING 0x05 -#define SD_IO_CCCR_CTL 0x06 -#define CCCR_CTL_RES (1 << 3) -#define SD_IO_CCCR_BUS_WIDTH 0x07 +#define SD_IO_CCCR_START 0x00000 /* Offset in F0 address space */ +#define SD_IO_CCCR_SIZE 0x100 /* Total size of CCCR */ +#define SD_IO_CCCR_FN_ENABLE 0x02 /* Enabled functions */ +#define SD_IO_CCCR_FN_READY 0x03 /* Function ready status */ +#define SD_IO_CCCR_INT_ENABLE 0x04 /* Per-function interrupt enable */ +#define SD_IO_CCCR_INT_PENDING 0x05 /* Per-function interrupt pending */ +#define SD_IO_CCCR_CTL 0x06 /* I/O Abort register */ +#define CCCR_CTL_RES (1 << 3) /* Perform SDIO reset */ +#define SD_IO_CCCR_BUS_WIDTH 0x07 /* Bus Width register */ #define CCCR_BUS_WIDTH_4 (1 << 1) #define CCCR_BUS_WIDTH_1 (1 << 0) -#define SD_IO_CCCR_CARDCAP 0x08 -#define SD_IO_CCCR_CISPTR 0x09 /* XXX 9-10, 10-11, or 9-12 */ - +#define SD_IO_CCCR_CARDCAP 0x08 /* SDIO card capabilities */ +#define CCCR_CC_SMB (1 << 1) /* CMD53 block mode support */ +#define SD_IO_CCCR_CISPTR 0x09 /* 0x09 - 0x0B */ +#define SD_IO_CCCR_FN0_BLKSZ 0x10 /* 0x10 - 0x11 */ /* Function Basic Registers (FBR) */ -#define SD_IO_FBR_START 0x00100 -#define SD_IO_FBR_SIZE 0x00700 +#define SD_IO_FBR_START 0x00100 /* Offset in F0 address space */ +#define SD_IO_FBR_SIZE 0x00700 /* Total size of FBR */ +#define SD_IO_FBR_F_SIZE 0x00100 /* Size of each function */ +#define SD_IO_FBR_START_F(n) (SD_IO_FBR_START + (n-1) * SD_IO_FBR_F_SIZE) +#define SD_IO_FBR_CIS_OFFSET 0x9 /* Offset of this function's info block within CIS area */ +#define SD_IO_FBR_IOBLKSZ 0x10 /* Block size for CMD53 block mode operations */ /* Card Information Structure (CIS) */ -#define SD_IO_CIS_START 0x01000 -#define SD_IO_CIS_SIZE 0x17000 +#define SD_IO_CIS_START 0x01000 /* Offset in F0 address space */ +#define SD_IO_CIS_SIZE 0x17000 /* Total size of CIS */ /* CIS tuple codes (based on PC Card 16) */ #define SD_IO_CISTPL_VERS_1 0x15 diff --git a/freebsd/sys/dev/mmc/mmcsd.c b/freebsd/sys/dev/mmc/mmcsd.c index e469c1d5..8fc06eb2 100644 --- a/freebsd/sys/dev/mmc/mmcsd.c +++ b/freebsd/sys/dev/mmc/mmcsd.c @@ -4,7 +4,7 @@ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2006 Bernd Walter. All rights reserved. - * Copyright (c) 2006 M. Warner Losh. All rights reserved. + * Copyright (c) 2006 M. Warner Losh. * Copyright (c) 2017 Marius Strobl <marius@FreeBSD.org> * * Redistribution and use in source and binary forms, with or without @@ -813,7 +813,7 @@ mmcsd_add_part(struct mmcsd_softc *sc, u_int type, const char *name, u_int cnt, speed / 1000000, (speed / 100000) % 10, mmcsd_bus_bit_width(dev), sc->max_data); } else if (type == EXT_CSD_PART_CONFIG_ACC_RPMB) { - printf("%s: %ju%sB partion %d%s at %s\n", part->name, bytes, + printf("%s: %ju%sB partition %d%s at %s\n", part->name, bytes, unit, type, ro ? " (read-only)" : "", device_get_nameunit(dev)); } else { @@ -849,12 +849,12 @@ mmcsd_add_part(struct mmcsd_softc *sc, u_int type, const char *name, u_int cnt, } } if (ext == NULL) - printf("%s%d: %ju%sB partion %d%s%s at %s\n", + printf("%s%d: %ju%sB partition %d%s%s at %s\n", part->name, cnt, bytes, unit, type, enh ? " enhanced" : "", ro ? " (read-only)" : "", device_get_nameunit(dev)); else - printf("%s%d: %ju%sB partion %d extended 0x%x " + printf("%s%d: %ju%sB partition %d extended 0x%x " "(%s)%s at %s\n", part->name, cnt, bytes, unit, type, extattr, ext, ro ? " (read-only)" : "", device_get_nameunit(dev)); diff --git a/freebsd/sys/dev/mmc/mmcvar.h b/freebsd/sys/dev/mmc/mmcvar.h index 1604c306..8d8c5547 100644 --- a/freebsd/sys/dev/mmc/mmcvar.h +++ b/freebsd/sys/dev/mmc/mmcvar.h @@ -2,7 +2,7 @@ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2006 Bernd Walter. All rights reserved. - * Copyright (c) 2006 M. Warner Losh. All rights reserved. + * Copyright (c) 2006 M. Warner Losh. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions diff --git a/freebsd/sys/dev/nvme/nvme.h b/freebsd/sys/dev/nvme/nvme.h index 845ba75b..0d4a18b7 100644 --- a/freebsd/sys/dev/nvme/nvme.h +++ b/freebsd/sys/dev/nvme/nvme.h @@ -40,6 +40,7 @@ #define NVME_PASSTHROUGH_CMD _IOWR('n', 0, struct nvme_pt_command) #define NVME_RESET_CONTROLLER _IO('n', 1) +#define NVME_GET_NSID _IOR('n', 2, struct nvme_get_nsid) #define NVME_IO_TEST _IOWR('n', 100, struct nvme_io_test) #define NVME_BIO_TEST _IOWR('n', 101, struct nvme_io_test) @@ -69,15 +70,39 @@ #define NVME_CAP_LO_REG_AMS_MASK (0x3) #define NVME_CAP_LO_REG_TO_SHIFT (24) #define NVME_CAP_LO_REG_TO_MASK (0xFF) +#define NVME_CAP_LO_MQES(x) \ + (((x) >> NVME_CAP_LO_REG_MQES_SHIFT) & NVME_CAP_LO_REG_MQES_MASK) +#define NVME_CAP_LO_CQR(x) \ + (((x) >> NVME_CAP_LO_REG_CQR_SHIFT) & NVME_CAP_LO_REG_CQR_MASK) +#define NVME_CAP_LO_AMS(x) \ + (((x) >> NVME_CAP_LO_REG_AMS_SHIFT) & NVME_CAP_LO_REG_AMS_MASK) +#define NVME_CAP_LO_TO(x) \ + (((x) >> NVME_CAP_LO_REG_TO_SHIFT) & NVME_CAP_LO_REG_TO_MASK) #define NVME_CAP_HI_REG_DSTRD_SHIFT (0) #define NVME_CAP_HI_REG_DSTRD_MASK (0xF) +#define NVME_CAP_HI_REG_NSSRS_SHIFT (4) +#define NVME_CAP_HI_REG_NSSRS_MASK (0x1) #define NVME_CAP_HI_REG_CSS_NVM_SHIFT (5) #define NVME_CAP_HI_REG_CSS_NVM_MASK (0x1) +#define NVME_CAP_HI_REG_BPS_SHIFT (13) +#define NVME_CAP_HI_REG_BPS_MASK (0x1) #define NVME_CAP_HI_REG_MPSMIN_SHIFT (16) #define NVME_CAP_HI_REG_MPSMIN_MASK (0xF) #define NVME_CAP_HI_REG_MPSMAX_SHIFT (20) #define NVME_CAP_HI_REG_MPSMAX_MASK (0xF) +#define NVME_CAP_HI_REG_PMRS_SHIFT (24) +#define NVME_CAP_HI_REG_PMRS_MASK (0x1) +#define NVME_CAP_HI_REG_CMBS_SHIFT (25) +#define NVME_CAP_HI_REG_CMBS_MASK (0x1) +#define NVME_CAP_HI_DSTRD(x) \ + (((x) >> NVME_CAP_HI_REG_DSTRD_SHIFT) & NVME_CAP_HI_REG_DSTRD_MASK) +#define NVME_CAP_HI_CSS_NVM(x) \ + (((x) >> NVME_CAP_HI_REG_CSS_NVM_SHIFT) & NVME_CAP_HI_REG_CSS_NVM_MASK) +#define NVME_CAP_HI_MPSMIN(x) \ + (((x) >> NVME_CAP_HI_REG_MPSMIN_SHIFT) & NVME_CAP_HI_REG_MPSMIN_MASK) +#define NVME_CAP_HI_MPSMAX(x) \ + (((x) >> NVME_CAP_HI_REG_MPSMAX_SHIFT) & NVME_CAP_HI_REG_MPSMAX_MASK) #define NVME_CC_REG_EN_SHIFT (0) #define NVME_CC_REG_EN_MASK (0x1) @@ -100,6 +125,10 @@ #define NVME_CSTS_REG_CFS_MASK (0x1) #define NVME_CSTS_REG_SHST_SHIFT (2) #define NVME_CSTS_REG_SHST_MASK (0x3) +#define NVME_CSTS_REG_NVSRO_SHIFT (4) +#define NVME_CSTS_REG_NVSRO_MASK (0x1) +#define NVME_CSTS_REG_PP_SHIFT (5) +#define NVME_CSTS_REG_PP_MASK (0x1) #define NVME_CSTS_GET_SHST(csts) (((csts) >> NVME_CSTS_REG_SHST_SHIFT) & NVME_CSTS_REG_SHST_MASK) @@ -119,6 +148,8 @@ #define NVME_STATUS_SC_MASK (0xFF) #define NVME_STATUS_SCT_SHIFT (9) #define NVME_STATUS_SCT_MASK (0x7) +#define NVME_STATUS_CRD_SHIFT (12) +#define NVME_STATUS_CRD_MASK (0x3) #define NVME_STATUS_M_SHIFT (14) #define NVME_STATUS_M_MASK (0x1) #define NVME_STATUS_DNR_SHIFT (15) @@ -159,6 +190,9 @@ /* SR-IOV Virtual Function */ #define NVME_CTRLR_DATA_MIC_SRIOVVF_SHIFT (2) #define NVME_CTRLR_DATA_MIC_SRIOVVF_MASK (0x1) +/* Asymmetric Namespace Access Reporting */ +#define NVME_CTRLR_DATA_MIC_ANAR_SHIFT (3) +#define NVME_CTRLR_DATA_MIC_ANAR_MASK (0x1) /** OACS - optional admin command support */ /* supports security send/receive commands */ @@ -188,6 +222,9 @@ /* supports Doorbell Buffer Config */ #define NVME_CTRLR_DATA_OACS_DBBUFFER_SHIFT (8) #define NVME_CTRLR_DATA_OACS_DBBUFFER_MASK (0x1) +/* supports Get LBA Status */ +#define NVME_CTRLR_DATA_OACS_GETLBA_SHIFT (9) +#define NVME_CTRLR_DATA_OACS_GETLBA_MASK (0x1) /** firmware updates */ /* first slot is read-only */ @@ -196,6 +233,9 @@ /* number of firmware slots */ #define NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT (1) #define NVME_CTRLR_DATA_FRMW_NUM_SLOTS_MASK (0x7) +/* firmware activation without reset */ +#define NVME_CTRLR_DATA_FRMW_ACT_WO_RESET_SHIFT (4) +#define NVME_CTRLR_DATA_FRMW_ACT_WO_RESET_MASK (0x1) /** log page attributes */ /* per namespace smart/health log page */ @@ -212,6 +252,26 @@ #define NVME_CTRLR_DATA_APSTA_APST_SUPP_SHIFT (0) #define NVME_CTRLR_DATA_APSTA_APST_SUPP_MASK (0x1) +/** Sanitize Capabilities */ +/* Crypto Erase Support */ +#define NVME_CTRLR_DATA_SANICAP_CES_SHIFT (0) +#define NVME_CTRLR_DATA_SANICAP_CES_MASK (0x1) +/* Block Erase Support */ +#define NVME_CTRLR_DATA_SANICAP_BES_SHIFT (1) +#define NVME_CTRLR_DATA_SANICAP_BES_MASK (0x1) +/* Overwrite Support */ +#define NVME_CTRLR_DATA_SANICAP_OWS_SHIFT (2) +#define NVME_CTRLR_DATA_SANICAP_OWS_MASK (0x1) +/* No-Deallocate Inhibited */ +#define NVME_CTRLR_DATA_SANICAP_NDI_SHIFT (29) +#define NVME_CTRLR_DATA_SANICAP_NDI_MASK (0x1) +/* No-Deallocate Modifies Media After Sanitize */ +#define NVME_CTRLR_DATA_SANICAP_NODMMAS_SHIFT (30) +#define NVME_CTRLR_DATA_SANICAP_NODMMAS_MASK (0x3) +#define NVME_CTRLR_DATA_SANICAP_NODMMAS_UNDEF (0) +#define NVME_CTRLR_DATA_SANICAP_NODMMAS_NO (1) +#define NVME_CTRLR_DATA_SANICAP_NODMMAS_YES (2) + /** submission queue entry size */ #define NVME_CTRLR_DATA_SQES_MIN_SHIFT (0) #define NVME_CTRLR_DATA_SQES_MIN_MASK (0xF) @@ -239,6 +299,8 @@ #define NVME_CTRLR_DATA_ONCS_RESERV_MASK (0x1) #define NVME_CTRLR_DATA_ONCS_TIMESTAMP_SHIFT (6) #define NVME_CTRLR_DATA_ONCS_TIMESTAMP_MASK (0x1) +#define NVME_CTRLR_DATA_ONCS_VERIFY_SHIFT (7) +#define NVME_CTRLR_DATA_ONCS_VERIFY_MASK (0x1) /** Fused Operation Support */ #define NVME_CTRLR_DATA_FUSES_CNW_SHIFT (0) @@ -253,8 +315,15 @@ #define NVME_CTRLR_DATA_FNA_CRYPTO_ERASE_MASK (0x1) /** volatile write cache */ +/* volatile write cache present */ #define NVME_CTRLR_DATA_VWC_PRESENT_SHIFT (0) #define NVME_CTRLR_DATA_VWC_PRESENT_MASK (0x1) +/* flush all namespaces supported */ +#define NVME_CTRLR_DATA_VWC_ALL_SHIFT (1) +#define NVME_CTRLR_DATA_VWC_ALL_MASK (0x3) +#define NVME_CTRLR_DATA_VWC_ALL_UNKNOWN (0) +#define NVME_CTRLR_DATA_VWC_ALL_NO (2) +#define NVME_CTRLR_DATA_VWC_ALL_YES (3) /** namespace features */ /* thin provisioning */ @@ -269,6 +338,9 @@ /* NGUID and EUI64 fields are not reusable */ #define NVME_NS_DATA_NSFEAT_NO_ID_REUSE_SHIFT (3) #define NVME_NS_DATA_NSFEAT_NO_ID_REUSE_MASK (0x1) +/* NPWG, NPWA, NPDG, NPDA, and NOWS are valid */ +#define NVME_NS_DATA_NSFEAT_NPVALID_SHIFT (4) +#define NVME_NS_DATA_NSFEAT_NPVALID_MASK (0x1) /** formatted lba size */ #define NVME_NS_DATA_FLBAS_FORMAT_SHIFT (0) @@ -349,6 +421,20 @@ #define NVME_NS_DATA_FPI_SUPP_SHIFT (7) #define NVME_NS_DATA_FPI_SUPP_MASK (0x1) +/** Deallocate Logical Block Features */ +/* deallocated logical block read behavior */ +#define NVME_NS_DATA_DLFEAT_READ_SHIFT (0) +#define NVME_NS_DATA_DLFEAT_READ_MASK (0x07) +#define NVME_NS_DATA_DLFEAT_READ_NR (0x00) +#define NVME_NS_DATA_DLFEAT_READ_00 (0x01) +#define NVME_NS_DATA_DLFEAT_READ_FF (0x02) +/* supports the Deallocate bit in the Write Zeroes */ +#define NVME_NS_DATA_DLFEAT_DWZ_SHIFT (3) +#define NVME_NS_DATA_DLFEAT_DWZ_MASK (0x01) +/* Guard field for deallocated logical blocks is set to the CRC */ +#define NVME_NS_DATA_DLFEAT_GCRC_SHIFT (4) +#define NVME_NS_DATA_DLFEAT_GCRC_MASK (0x01) + /** lba format support */ /* metadata size */ #define NVME_NS_DATA_LBAF_MS_SHIFT (0) @@ -373,6 +459,35 @@ enum nvme_critical_warning_state { #define NVME_FIRMWARE_PAGE_AFI_SLOT_SHIFT (0) #define NVME_FIRMWARE_PAGE_AFI_SLOT_MASK (0x7) +/* Commands Supported and Effects */ +#define NVME_CE_PAGE_CSUP_SHIFT (0) +#define NVME_CE_PAGE_CSUP_MASK (0x1) +#define NVME_CE_PAGE_LBCC_SHIFT (1) +#define NVME_CE_PAGE_LBCC_MASK (0x1) +#define NVME_CE_PAGE_NCC_SHIFT (2) +#define NVME_CE_PAGE_NCC_MASK (0x1) +#define NVME_CE_PAGE_NIC_SHIFT (3) +#define NVME_CE_PAGE_NIC_MASK (0x1) +#define NVME_CE_PAGE_CCC_SHIFT (4) +#define NVME_CE_PAGE_CCC_MASK (0x1) +#define NVME_CE_PAGE_CSE_SHIFT (16) +#define NVME_CE_PAGE_CSE_MASK (0x7) +#define NVME_CE_PAGE_UUID_SHIFT (19) +#define NVME_CE_PAGE_UUID_MASK (0x1) + +/* Sanitize Status */ +#define NVME_SS_PAGE_SSTAT_STATUS_SHIFT (0) +#define NVME_SS_PAGE_SSTAT_STATUS_MASK (0x7) +#define NVME_SS_PAGE_SSTAT_STATUS_NEVER (0) +#define NVME_SS_PAGE_SSTAT_STATUS_COMPLETED (1) +#define NVME_SS_PAGE_SSTAT_STATUS_INPROG (2) +#define NVME_SS_PAGE_SSTAT_STATUS_FAILED (3) +#define NVME_SS_PAGE_SSTAT_STATUS_COMPLETEDWD (4) +#define NVME_SS_PAGE_SSTAT_PASSES_SHIFT (3) +#define NVME_SS_PAGE_SSTAT_PASSES_MASK (0x1f) +#define NVME_SS_PAGE_SSTAT_GDE_SHIFT (8) +#define NVME_SS_PAGE_SSTAT_GDE_MASK (0x1) + /* CC register SHN field values */ enum shn_value { NVME_SHN_NORMAL = 0x1, @@ -388,34 +503,37 @@ enum shst_value { struct nvme_registers { - /** controller capabilities */ - uint32_t cap_lo; - uint32_t cap_hi; - - uint32_t vs; /* version */ - uint32_t intms; /* interrupt mask set */ - uint32_t intmc; /* interrupt mask clear */ - - /** controller configuration */ - uint32_t cc; - - uint32_t reserved1; - - /** controller status */ - uint32_t csts; - - uint32_t reserved2; - - /** admin queue attributes */ - uint32_t aqa; - - uint64_t asq; /* admin submission queue base addr */ - uint64_t acq; /* admin completion queue base addr */ - uint32_t reserved3[0x3f2]; - + uint32_t cap_lo; /* controller capabilities */ + uint32_t cap_hi; + uint32_t vs; /* version */ + uint32_t intms; /* interrupt mask set */ + uint32_t intmc; /* interrupt mask clear */ + uint32_t cc; /* controller configuration */ + uint32_t reserved1; + uint32_t csts; /* controller status */ + uint32_t nssr; /* NVM Subsystem Reset */ + uint32_t aqa; /* admin queue attributes */ + uint64_t asq; /* admin submission queue base addr */ + uint64_t acq; /* admin completion queue base addr */ + uint32_t cmbloc; /* Controller Memory Buffer Location */ + uint32_t cmbsz; /* Controller Memory Buffer Size */ + uint32_t bpinfo; /* Boot Partition Information */ + uint32_t bprsel; /* Boot Partition Read Select */ + uint64_t bpmbl; /* Boot Partition Memory Buffer Location */ + uint64_t cmbmsc; /* Controller Memory Buffer Memory Space Control */ + uint32_t cmbsts; /* Controller Memory Buffer Status */ + uint8_t reserved3[3492]; /* 5Ch - DFFh */ + uint32_t pmrcap; /* Persistent Memory Capabilities */ + uint32_t pmrctl; /* Persistent Memory Region Control */ + uint32_t pmrsts; /* Persistent Memory Region Status */ + uint32_t pmrebs; /* Persistent Memory Region Elasticity Buffer Size */ + uint32_t pmrswtp; /* Persistent Memory Region Sustained Write Throughput */ + uint32_t pmrmsc_lo; /* Persistent Memory Region Controller Memory Space Control */ + uint32_t pmrmsc_hi; + uint8_t reserved4[484]; /* E1Ch - FFFh */ struct { - uint32_t sq_tdbl; /* submission queue tail doorbell */ - uint32_t cq_hdbl; /* completion queue head doorbell */ + uint32_t sq_tdbl; /* submission queue tail doorbell */ + uint32_t cq_hdbl; /* completion queue head doorbell */ } doorbell[1] __packed; } __packed; @@ -490,6 +608,7 @@ enum nvme_status_code_type { NVME_SCT_GENERIC = 0x0, NVME_SCT_COMMAND_SPECIFIC = 0x1, NVME_SCT_MEDIA_ERROR = 0x2, + NVME_SCT_PATH_RELATED = 0x3, /* 0x3-0x6 - reserved */ NVME_SCT_VENDOR_SPECIFIC = 0x7, }; @@ -528,6 +647,9 @@ enum nvme_generic_command_status_code { NVME_SC_SANITIZE_IN_PROGRESS = 0x1d, NVME_SC_SGL_DATA_BLOCK_GRAN_INVALID = 0x1e, NVME_SC_NOT_SUPPORTED_IN_CMB = 0x1f, + NVME_SC_NAMESPACE_IS_WRITE_PROTECTED = 0x20, + NVME_SC_COMMAND_INTERRUPTED = 0x21, + NVME_SC_TRANSIENT_TRANSPORT_ERROR = 0x22, NVME_SC_LBA_OUT_OF_RANGE = 0x80, NVME_SC_CAPACITY_EXCEEDED = 0x81, @@ -573,6 +695,9 @@ enum nvme_command_specific_status_code { NVME_SC_INVALID_SEC_CTRLR_STATE = 0x20, NVME_SC_INVALID_NUM_OF_CTRLR_RESRC = 0x21, NVME_SC_INVALID_RESOURCE_ID = 0x22, + NVME_SC_SANITIZE_PROHIBITED_WPMRE = 0x23, + NVME_SC_ANA_GROUP_ID_INVALID = 0x24, + NVME_SC_ANA_ATTACH_FAILED = 0x25, NVME_SC_CONFLICTING_ATTRIBUTES = 0x80, NVME_SC_INVALID_PROTECTION_INFO = 0x81, @@ -591,6 +716,17 @@ enum nvme_media_error_status_code { NVME_SC_DEALLOCATED_OR_UNWRITTEN = 0x87, }; +/* path related status codes */ +enum nvme_path_related_status_code { + NVME_SC_INTERNAL_PATH_ERROR = 0x00, + NVME_SC_ASYMMETRIC_ACCESS_PERSISTENT_LOSS = 0x01, + NVME_SC_ASYMMETRIC_ACCESS_INACCESSIBLE = 0x02, + NVME_SC_ASYMMETRIC_ACCESS_TRANSITION = 0x03, + NVME_SC_CONTROLLER_PATHING_ERROR = 0x60, + NVME_SC_HOST_PATHING_ERROR = 0x70, + NVME_SC_COMMAND_ABOTHED_BY_HOST = 0x71, +}; + /* admin opcodes */ enum nvme_admin_opcode { NVME_OPC_DELETE_IO_SQ = 0x00, @@ -610,20 +746,27 @@ enum nvme_admin_opcode { /* 0x0e-0x0f - reserved */ NVME_OPC_FIRMWARE_ACTIVATE = 0x10, NVME_OPC_FIRMWARE_IMAGE_DOWNLOAD = 0x11, + /* 0x12-0x13 - reserved */ NVME_OPC_DEVICE_SELF_TEST = 0x14, NVME_OPC_NAMESPACE_ATTACHMENT = 0x15, + /* 0x16-0x17 - reserved */ NVME_OPC_KEEP_ALIVE = 0x18, NVME_OPC_DIRECTIVE_SEND = 0x19, NVME_OPC_DIRECTIVE_RECEIVE = 0x1a, + /* 0x1b - reserved */ NVME_OPC_VIRTUALIZATION_MANAGEMENT = 0x1c, NVME_OPC_NVME_MI_SEND = 0x1d, NVME_OPC_NVME_MI_RECEIVE = 0x1e, + /* 0x1f-0x7b - reserved */ NVME_OPC_DOORBELL_BUFFER_CONFIG = 0x7c, NVME_OPC_FORMAT_NVM = 0x80, NVME_OPC_SECURITY_SEND = 0x81, NVME_OPC_SECURITY_RECEIVE = 0x82, + /* 0x83 - reserved */ NVME_OPC_SANITIZE = 0x84, + /* 0x85 - reserved */ + NVME_OPC_GET_LBA_STATUS = 0x86, }; /* nvme nvm opcodes */ @@ -634,11 +777,11 @@ enum nvme_nvm_opcode { /* 0x03 - reserved */ NVME_OPC_WRITE_UNCORRECTABLE = 0x04, NVME_OPC_COMPARE = 0x05, - /* 0x06 - reserved */ + /* 0x06-0x07 - reserved */ NVME_OPC_WRITE_ZEROES = 0x08, - /* 0x07 - reserved */ NVME_OPC_DATASET_MANAGEMENT = 0x09, - /* 0x0a-0x0c - reserved */ + /* 0x0a-0x0b - reserved */ + NVME_OPC_VERIFY = 0x0c, NVME_OPC_RESERVATION_REGISTER = 0x0d, NVME_OPC_RESERVATION_REPORT = 0x0e, /* 0x0f-0x10 - reserved */ @@ -666,10 +809,21 @@ enum nvme_feature { NVME_FEAT_KEEP_ALIVE_TIMER = 0x0F, NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT = 0x10, NVME_FEAT_NON_OP_POWER_STATE_CONFIG = 0x11, - /* 0x12-0x77 - reserved */ + NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG = 0x12, + NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG = 0x13, + NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW = 0x14, + NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES = 0x15, + NVME_FEAT_HOST_BEHAVIOR_SUPPORT = 0x16, + NVME_FEAT_SANITIZE_CONFIG = 0x17, + NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION = 0x18, + /* 0x19-0x77 - reserved */ /* 0x78-0x7f - NVMe Management Interface */ NVME_FEAT_SOFTWARE_PROGRESS_MARKER = 0x80, - /* 0x81-0xBF - command set specific (reserved) */ + NVME_FEAT_HOST_IDENTIFIER = 0x81, + NVME_FEAT_RESERVATION_NOTIFICATION_MASK = 0x82, + NVME_FEAT_RESERVATION_PERSISTENCE = 0x83, + NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG = 0x84, + /* 0x85-0xBF - command set specific (reserved) */ /* 0xC0-0xFF - vendor specific */ }; @@ -763,12 +917,27 @@ struct nvme_controller_data { /** Controller Attributes */ uint32_t ctratt; /* bitfield really */ - uint8_t reserved1[12]; + /** Read Recovery Levels Supported */ + uint16_t rrls; + + uint8_t reserved1[9]; + + /** Controller Type */ + uint8_t cntrltype; /** FRU Globally Unique Identifier */ uint8_t fguid[16]; - uint8_t reserved2[128]; + /** Command Retry Delay Time 1 */ + uint16_t crdt1; + + /** Command Retry Delay Time 2 */ + uint16_t crdt2; + + /** Command Retry Delay Time 3 */ + uint16_t crdt3; + + uint8_t reserved2[122]; /* bytes 256-511: admin command set attributes */ @@ -848,7 +1017,34 @@ struct nvme_controller_data { /** Sanitize Capabilities */ uint32_t sanicap; /* Really a bitfield */ - uint8_t reserved3[180]; + /** Host Memory Buffer Minimum Descriptor Entry Size */ + uint32_t hmminds; + + /** Host Memory Maximum Descriptors Entries */ + uint16_t hmmaxd; + + /** NVM Set Identifier Maximum */ + uint16_t nsetidmax; + + /** Endurance Group Identifier Maximum */ + uint16_t endgidmax; + + /** ANA Transition Time */ + uint8_t anatt; + + /** Asymmetric Namespace Access Capabilities */ + uint8_t anacap; + + /** ANA Group Identifier Maximum */ + uint32_t anagrpmax; + + /** Number of ANA Group Identifiers */ + uint32_t nanagrpid; + + /** Persistent Event Log Size */ + uint32_t pels; + + uint8_t reserved3[156]; /* bytes 512-703: nvm command set attributes */ /** submission queue entry size */ @@ -883,7 +1079,9 @@ struct nvme_controller_data { /** NVM Vendor Specific Command Configuration */ uint8_t nvscc; - uint8_t reserved5; + + /** Namespace Write Protection Capabilities */ + uint8_t nwpc; /** Atomic Compare & Write Unit */ uint16_t acwu; @@ -892,8 +1090,11 @@ struct nvme_controller_data { /** SGL Support */ uint32_t sgls; + /** Maximum Number of Allowed Namespaces */ + uint32_t mnan; + /* bytes 540-767: Reserved */ - uint8_t reserved7[228]; + uint8_t reserved7[224]; /** NVM Subsystem NVMe Qualified Name */ uint8_t subnqn[256]; @@ -978,8 +1179,38 @@ struct nvme_namespace_data { /** NVM Capacity */ uint8_t nvmcap[16]; - /* bytes 64-103: Reserved */ - uint8_t reserved5[40]; + /** Namespace Preferred Write Granularity */ + uint16_t npwg; + + /** Namespace Preferred Write Alignment */ + uint16_t npwa; + + /** Namespace Preferred Deallocate Granularity */ + uint16_t npdg; + + /** Namespace Preferred Deallocate Alignment */ + uint16_t npda; + + /** Namespace Optimal Write Size */ + uint16_t nows; + + /* bytes 74-91: Reserved */ + uint8_t reserved5[18]; + + /** ANA Group Identifier */ + uint32_t anagrpid; + + /* bytes 96-98: Reserved */ + uint8_t reserved6[3]; + + /** Namespace Attributes */ + uint8_t nsattr; + + /** NVM Set Identifier */ + uint16_t nvmsetid; + + /** Endurance Group Identifier */ + uint16_t endgid; /** Namespace Globally Unique Identifier */ uint8_t nguid[16]; @@ -990,7 +1221,7 @@ struct nvme_namespace_data { /** lba format support */ uint32_t lbaf[16]; - uint8_t reserved6[192]; + uint8_t reserved7[192]; uint8_t vendor_specific[3712]; } __packed __aligned(4); @@ -1005,9 +1236,21 @@ enum nvme_log_page { NVME_LOG_FIRMWARE_SLOT = 0x03, NVME_LOG_CHANGED_NAMESPACE = 0x04, NVME_LOG_COMMAND_EFFECT = 0x05, + NVME_LOG_DEVICE_SELF_TEST = 0x06, + NVME_LOG_TELEMETRY_HOST_INITIATED = 0x07, + NVME_LOG_TELEMETRY_CONTROLLER_INITIATED = 0x08, + NVME_LOG_ENDURANCE_GROUP_INFORMATION = 0x09, + NVME_LOG_PREDICTABLE_LATENCY_PER_NVM_SET = 0x0a, + NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE = 0x0b, + NVME_LOG_ASYMMETRIC_NAMESPAVE_ACCESS = 0x0c, + NVME_LOG_PERSISTENT_EVENT_LOG = 0x0d, + NVME_LOG_LBA_STATUS_INFORMATION = 0x0e, + NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE = 0x0f, /* 0x06-0x7F - reserved */ /* 0x80-0xBF - I/O command set specific */ NVME_LOG_RES_NOTIFICATION = 0x80, + NVME_LOG_SANITIZE_STATUS = 0x81, + /* 0x82-0xBF - reserved */ /* 0xC0-0xFF - vendor specific */ /* @@ -1036,7 +1279,11 @@ struct nvme_error_information_entry { uint64_t lba; uint32_t nsid; uint8_t vendor_specific; - uint8_t reserved[35]; + uint8_t trtype; + uint16_t reserved30; + uint64_t csi; + uint16_t ttsi; + uint8_t reserved[22]; } __packed __aligned(4); _Static_assert(sizeof(struct nvme_error_information_entry) == 64, "bad size for nvme_error_information_entry"); @@ -1072,8 +1319,16 @@ struct nvme_health_information_page { uint32_t warning_temp_time; uint32_t error_temp_time; uint16_t temp_sensor[8]; - - uint8_t reserved2[296]; + /* Thermal Management Temperature 1 Transition Count */ + uint32_t tmt1tc; + /* Thermal Management Temperature 2 Transition Count */ + uint32_t tmt2tc; + /* Total Time For Thermal Management Temperature 1 */ + uint32_t ttftmt1; + /* Total Time For Thermal Management Temperature 2 */ + uint32_t ttftmt2; + + uint8_t reserved2[280]; } __packed __aligned(4); _Static_assert(sizeof(struct nvme_health_information_page) == 512, "bad size for nvme_health_information_page"); @@ -1094,6 +1349,43 @@ struct nvme_ns_list { _Static_assert(sizeof(struct nvme_ns_list) == 4096, "bad size for nvme_ns_list"); +struct nvme_command_effects_page { + uint32_t acs[256]; + uint32_t iocs[256]; + uint8_t reserved[2048]; +} __packed __aligned(4); + +_Static_assert(sizeof(struct nvme_command_effects_page) == 4096, + "bad size for nvme_command_effects_page"); + +struct nvme_res_notification_page { + uint64_t log_page_count; + uint8_t log_page_type; + uint8_t available_log_pages; + uint8_t reserved2; + uint32_t nsid; + uint8_t reserved[48]; +} __packed __aligned(4); + +_Static_assert(sizeof(struct nvme_res_notification_page) == 64, + "bad size for nvme_res_notification_page"); + +struct nvme_sanitize_status_page { + uint16_t sprog; + uint16_t sstat; + uint32_t scdw10; + uint32_t etfo; + uint32_t etfbe; + uint32_t etfce; + uint32_t etfownd; + uint32_t etfbewnd; + uint32_t etfcewnd; + uint8_t reserved[480]; +} __packed __aligned(4); + +_Static_assert(sizeof(struct nvme_sanitize_status_page) == 512, + "bad size for nvme_sanitize_status_page"); + struct intel_log_temp_stats { uint64_t current; @@ -1109,6 +1401,56 @@ struct intel_log_temp_stats _Static_assert(sizeof(struct intel_log_temp_stats) == 13 * 8, "bad size for intel_log_temp_stats"); +struct nvme_resv_reg_ctrlr +{ + uint16_t ctrlr_id; /* Controller ID */ + uint8_t rcsts; /* Reservation Status */ + uint8_t reserved3[5]; + uint64_t hostid; /* Host Identifier */ + uint64_t rkey; /* Reservation Key */ +} __packed __aligned(4); + +_Static_assert(sizeof(struct nvme_resv_reg_ctrlr) == 24, "bad size for nvme_resv_reg_ctrlr"); + +struct nvme_resv_reg_ctrlr_ext +{ + uint16_t ctrlr_id; /* Controller ID */ + uint8_t rcsts; /* Reservation Status */ + uint8_t reserved3[5]; + uint64_t rkey; /* Reservation Key */ + uint64_t hostid[2]; /* Host Identifier */ + uint8_t reserved32[32]; +} __packed __aligned(4); + +_Static_assert(sizeof(struct nvme_resv_reg_ctrlr_ext) == 64, "bad size for nvme_resv_reg_ctrlr_ext"); + +struct nvme_resv_status +{ + uint32_t gen; /* Generation */ + uint8_t rtype; /* Reservation Type */ + uint8_t regctl[2]; /* Number of Registered Controllers */ + uint8_t reserved7[2]; + uint8_t ptpls; /* Persist Through Power Loss State */ + uint8_t reserved10[14]; + struct nvme_resv_reg_ctrlr ctrlr[0]; +} __packed __aligned(4); + +_Static_assert(sizeof(struct nvme_resv_status) == 24, "bad size for nvme_resv_status"); + +struct nvme_resv_status_ext +{ + uint32_t gen; /* Generation */ + uint8_t rtype; /* Reservation Type */ + uint8_t regctl[2]; /* Number of Registered Controllers */ + uint8_t reserved7[2]; + uint8_t ptpls; /* Persist Through Power Loss State */ + uint8_t reserved10[14]; + uint8_t reserved24[40]; + struct nvme_resv_reg_ctrlr_ext ctrlr[0]; +} __packed __aligned(4); + +_Static_assert(sizeof(struct nvme_resv_status_ext) == 64, "bad size for nvme_resv_status_ext"); + #define NVME_TEST_MAX_THREADS 128 struct nvme_io_test { @@ -1184,6 +1526,11 @@ struct nvme_pt_command { struct mtx * driver_lock; }; +struct nvme_get_nsid { + char cdev[SPECNAMELEN + 1]; + uint32_t nsid; +}; + #define nvme_completion_is_error(cpl) \ (NVME_STATUS_GET_SC((cpl)->status) != 0 || NVME_STATUS_GET_SCT((cpl)->status) != 0) @@ -1192,6 +1539,7 @@ void nvme_strvis(uint8_t *dst, const uint8_t *src, int dstlen, int srclen); #ifdef _KERNEL struct bio; +struct thread; struct nvme_namespace; struct nvme_controller; @@ -1281,6 +1629,8 @@ uint32_t nvme_ns_get_stripesize(struct nvme_namespace *ns); int nvme_ns_bio_process(struct nvme_namespace *ns, struct bio *bp, nvme_cb_fn_t cb_fn); +int nvme_ns_ioctl_process(struct nvme_namespace *ns, u_long cmd, + caddr_t arg, int flag, struct thread *td); /* * Command building helper functions -- shared with CAM @@ -1372,6 +1722,10 @@ void nvme_controller_data_swapbytes(struct nvme_controller_data *s) s->rtd3e = le32toh(s->rtd3e); s->oaes = le32toh(s->oaes); s->ctratt = le32toh(s->ctratt); + s->rrls = le16toh(s->rrls); + s->crdt1 = le16toh(s->crdt1); + s->crdt2 = le16toh(s->crdt2); + s->crdt3 = le16toh(s->crdt3); s->oacs = le16toh(s->oacs); s->wctemp = le16toh(s->wctemp); s->cctemp = le16toh(s->cctemp); @@ -1385,6 +1739,13 @@ void nvme_controller_data_swapbytes(struct nvme_controller_data *s) s->mntmt = le16toh(s->mntmt); s->mxtmt = le16toh(s->mxtmt); s->sanicap = le32toh(s->sanicap); + s->hmminds = le32toh(s->hmminds); + s->hmmaxd = le16toh(s->hmmaxd); + s->nsetidmax = le16toh(s->nsetidmax); + s->endgidmax = le16toh(s->endgidmax); + s->anagrpmax = le32toh(s->anagrpmax); + s->nanagrpid = le32toh(s->nanagrpid); + s->pels = le32toh(s->pels); s->maxcmd = le16toh(s->maxcmd); s->nn = le32toh(s->nn); s->oncs = le16toh(s->oncs); @@ -1393,6 +1754,7 @@ void nvme_controller_data_swapbytes(struct nvme_controller_data *s) s->awupf = le16toh(s->awupf); s->acwu = le16toh(s->acwu); s->sgls = le32toh(s->sgls); + s->mnan = le32toh(s->mnan); for (i = 0; i < 32; i++) nvme_power_state_swapbytes(&s->power_state[i]); } @@ -1412,6 +1774,14 @@ void nvme_namespace_data_swapbytes(struct nvme_namespace_data *s) s->nabo = le16toh(s->nabo); s->nabspf = le16toh(s->nabspf); s->noiob = le16toh(s->noiob); + s->npwg = le16toh(s->npwg); + s->npwa = le16toh(s->npwa); + s->npdg = le16toh(s->npdg); + s->npda = le16toh(s->npda); + s->nows = le16toh(s->nows); + s->anagrpid = le32toh(s->anagrpid); + s->nvmsetid = le16toh(s->nvmsetid); + s->endgid = le16toh(s->endgid); for (i = 0; i < 16; i++) s->lbaf[i] = le32toh(s->lbaf[i]); } @@ -1427,6 +1797,8 @@ void nvme_error_information_entry_swapbytes(struct nvme_error_information_entry s->error_location = le16toh(s->error_location); s->lba = le64toh(s->lba); s->nsid = le32toh(s->nsid); + s->csi = le64toh(s->csi); + s->ttsi = le16toh(s->ttsi); } static inline @@ -1467,6 +1839,10 @@ void nvme_health_information_page_swapbytes(struct nvme_health_information_page s->error_temp_time = le32toh(s->error_temp_time); for (i = 0; i < 8; i++) s->temp_sensor[i] = le16toh(s->temp_sensor[i]); + s->tmt1tc = le32toh(s->tmt1tc); + s->tmt2tc = le32toh(s->tmt2tc); + s->ttftmt1 = le32toh(s->ttftmt1); + s->ttftmt2 = le32toh(s->ttftmt2); } @@ -1489,6 +1865,38 @@ void nvme_ns_list_swapbytes(struct nvme_ns_list *s) } static inline +void nvme_command_effects_page_swapbytes(struct nvme_command_effects_page *s) +{ + int i; + + for (i = 0; i < 256; i++) + s->acs[i] = le32toh(s->acs[i]); + for (i = 0; i < 256; i++) + s->iocs[i] = le32toh(s->iocs[i]); +} + +static inline +void nvme_res_notification_page_swapbytes(struct nvme_res_notification_page *s) +{ + s->log_page_count = le64toh(s->log_page_count); + s->nsid = le32toh(s->nsid); +} + +static inline +void nvme_sanitize_status_page_swapbytes(struct nvme_sanitize_status_page *s) +{ + s->sprog = le16toh(s->sprog); + s->sstat = le16toh(s->sstat); + s->scdw10 = le32toh(s->scdw10); + s->etfo = le32toh(s->etfo); + s->etfbe = le32toh(s->etfbe); + s->etfce = le32toh(s->etfce); + s->etfownd = le32toh(s->etfownd); + s->etfbewnd = le32toh(s->etfbewnd); + s->etfcewnd = le32toh(s->etfcewnd); +} + +static inline void intel_log_temp_stats_swapbytes(struct intel_log_temp_stats *s) { @@ -1503,4 +1911,34 @@ void intel_log_temp_stats_swapbytes(struct intel_log_temp_stats *s) s->est_offset = le64toh(s->est_offset); } +static inline +void nvme_resv_status_swapbytes(struct nvme_resv_status *s, size_t size) +{ + u_int i, n; + + s->gen = le32toh(s->gen); + n = (s->regctl[1] << 8) | s->regctl[0]; + n = MIN(n, (size - sizeof(s)) / sizeof(s->ctrlr[0])); + for (i = 0; i < n; i++) { + s->ctrlr[i].ctrlr_id = le16toh(s->ctrlr[i].ctrlr_id); + s->ctrlr[i].hostid = le64toh(s->ctrlr[i].hostid); + s->ctrlr[i].rkey = le64toh(s->ctrlr[i].rkey); + } +} + +static inline +void nvme_resv_status_ext_swapbytes(struct nvme_resv_status_ext *s, size_t size) +{ + u_int i, n; + + s->gen = le32toh(s->gen); + n = (s->regctl[1] << 8) | s->regctl[0]; + n = MIN(n, (size - sizeof(s)) / sizeof(s->ctrlr[0])); + for (i = 0; i < n; i++) { + s->ctrlr[i].ctrlr_id = le16toh(s->ctrlr[i].ctrlr_id); + s->ctrlr[i].rkey = le64toh(s->ctrlr[i].rkey); + nvme_le128toh((void *)s->ctrlr[i].hostid); + } +} + #endif /* __NVME_H__ */ diff --git a/freebsd/sys/dev/ofw/ofw_bus_subr.h b/freebsd/sys/dev/ofw/ofw_bus_subr.h index 468fdc39..218ba710 100644 --- a/freebsd/sys/dev/ofw/ofw_bus_subr.h +++ b/freebsd/sys/dev/ofw/ofw_bus_subr.h @@ -65,9 +65,11 @@ struct intr_map_data_fdt { }; #endif -#define SIMPLEBUS_PNP_DESCR "Z:compat;P:#;" -#define SIMPLEBUS_PNP_INFO(t) \ - MODULE_PNP_INFO(SIMPLEBUS_PNP_DESCR, simplebus, t, t, sizeof(t) / sizeof(t[0])); +#define FDTCOMPAT_PNP_DESCR "Z:compat;P:#;" +#define FDTCOMPAT_PNP_INFO(t, busname) \ + MODULE_PNP_INFO(FDTCOMPAT_PNP_DESCR, busname, t, t, sizeof(t) / sizeof(t[0])); + +#define SIMPLEBUS_PNP_INFO(t) FDTCOMPAT_PNP_INFO(t, simplebus) /* Generic implementation of ofw_bus_if.m methods and helper routines */ int ofw_bus_gen_setup_devinfo(struct ofw_bus_devinfo *, phandle_t); diff --git a/freebsd/sys/dev/ofw/ofw_subr.c b/freebsd/sys/dev/ofw/ofw_subr.c index 4a20727c..7483a2d2 100644 --- a/freebsd/sys/dev/ofw/ofw_subr.c +++ b/freebsd/sys/dev/ofw/ofw_subr.c @@ -81,7 +81,8 @@ int ofw_reg_to_paddr(phandle_t dev, int regno, bus_addr_t *paddr, bus_size_t *psize, pcell_t *ppci_hi) { - pcell_t cell[32], pci_hi; + static pcell_t cell[256]; + pcell_t pci_hi; uint64_t addr, raddr, baddr; uint64_t size, rsize; uint32_t c, nbridge, naddr, nsize; diff --git a/freebsd/sys/dev/pci/pci.c b/freebsd/sys/dev/pci/pci.c index f2a46d03..5402cb66 100644 --- a/freebsd/sys/dev/pci/pci.c +++ b/freebsd/sys/dev/pci/pci.c @@ -33,20 +33,22 @@ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); +#include <rtems/bsd/local/opt_acpi.h> #include <rtems/bsd/local/opt_bus.h> #include <sys/param.h> -#include <sys/systm.h> -#include <sys/malloc.h> -#include <sys/module.h> -#include <sys/limits.h> -#include <sys/linker.h> -#include <sys/fcntl.h> #include <sys/conf.h> +#include <sys/endian.h> +#include <sys/eventhandler.h> +#include <sys/fcntl.h> #include <sys/kernel.h> +#include <sys/limits.h> +#include <sys/linker.h> +#include <sys/malloc.h> +#include <sys/module.h> #include <sys/queue.h> #include <sys/sysctl.h> -#include <sys/endian.h> +#include <sys/systm.h> #include <vm/vm.h> #include <vm/pmap.h> @@ -137,6 +139,10 @@ static int pci_remap_intr_method(device_t bus, device_t dev, static void pci_hint_device_unit(device_t acdev, device_t child, const char *name, int *unitp); #endif /* __rtems__ */ +static int pci_reset_post(device_t dev, device_t child); +static int pci_reset_prepare(device_t dev, device_t child); +static int pci_reset_child(device_t dev, device_t child, + int flags); static int pci_get_id_method(device_t dev, device_t child, enum pci_id_type type, uintptr_t *rid); @@ -161,6 +167,9 @@ static device_method_t pci_methods[] = { DEVMETHOD(bus_driver_added, pci_driver_added), DEVMETHOD(bus_setup_intr, pci_setup_intr), DEVMETHOD(bus_teardown_intr, pci_teardown_intr), + DEVMETHOD(bus_reset_prepare, pci_reset_prepare), + DEVMETHOD(bus_reset_post, pci_reset_post), + DEVMETHOD(bus_reset_child, pci_reset_child), DEVMETHOD(bus_get_dma_tag, pci_get_dma_tag), DEVMETHOD(bus_get_resource_list,pci_get_resource_list), @@ -355,7 +364,7 @@ SYSCTL_INT(_hw_pci, OID_AUTO, enable_io_modes, CTLFLAG_RWTUN, " enable these bits correctly. We'd like to do this all the time, but" " there are some peripherals that this causes problems with."); -static int pci_do_realloc_bars = 0; +static int pci_do_realloc_bars = 1; SYSCTL_INT(_hw_pci, OID_AUTO, realloc_bars, CTLFLAG_RWTUN, &pci_do_realloc_bars, 0, "Attempt to allocate a new range for any BARs whose original " @@ -1678,10 +1687,13 @@ pci_mask_msix(device_t dev, u_int index) KASSERT(msix->msix_msgnum > index, ("bogus index")); offset = msix->msix_table_offset + index * 16 + 12; val = bus_read_4(msix->msix_table_res, offset); - if (!(val & PCIM_MSIX_VCTRL_MASK)) { - val |= PCIM_MSIX_VCTRL_MASK; - bus_write_4(msix->msix_table_res, offset, val); - } + val |= PCIM_MSIX_VCTRL_MASK; + + /* + * Some devices (e.g. Samsung PM961) do not support reads of this + * register, so always write the new value. + */ + bus_write_4(msix->msix_table_res, offset, val); } void @@ -1694,10 +1706,13 @@ pci_unmask_msix(device_t dev, u_int index) KASSERT(msix->msix_table_len > index, ("bogus index")); offset = msix->msix_table_offset + index * 16 + 12; val = bus_read_4(msix->msix_table_res, offset); - if (val & PCIM_MSIX_VCTRL_MASK) { - val &= ~PCIM_MSIX_VCTRL_MASK; - bus_write_4(msix->msix_table_res, offset, val); - } + val &= ~PCIM_MSIX_VCTRL_MASK; + + /* + * Some devices (e.g. Samsung PM961) do not support reads of this + * register, so always write the new value. + */ + bus_write_4(msix->msix_table_res, offset, val); } int @@ -4359,9 +4374,6 @@ pci_attach_common(device_t dev) { struct pci_softc *sc; int busno, domain; -#ifdef PCI_DMA_BOUNDARY - int error, tag_valid; -#endif #ifdef PCI_RES_BUS int rid; #endif @@ -4381,23 +4393,7 @@ pci_attach_common(device_t dev) if (bootverbose) device_printf(dev, "domain=%d, physical bus=%d\n", domain, busno); -#ifdef PCI_DMA_BOUNDARY - tag_valid = 0; - if (device_get_devclass(device_get_parent(device_get_parent(dev))) != - devclass_find("pci")) { - error = bus_dma_tag_create(bus_get_dma_tag(dev), 1, - PCI_DMA_BOUNDARY, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, - NULL, NULL, BUS_SPACE_MAXSIZE, BUS_SPACE_UNRESTRICTED, - BUS_SPACE_MAXSIZE, 0, NULL, NULL, &sc->sc_dma_tag); - if (error) - device_printf(dev, "Failed to create DMA tag: %d\n", - error); - else - tag_valid = 1; - } - if (!tag_valid) -#endif - sc->sc_dma_tag = bus_get_dma_tag(dev); + sc->sc_dma_tag = bus_get_dma_tag(dev); return (0); } @@ -5730,6 +5726,26 @@ pci_get_resource_list (device_t dev, device_t child) return (&dinfo->resources); } +#ifdef ACPI_DMAR +bus_dma_tag_t dmar_get_dma_tag(device_t dev, device_t child); +bus_dma_tag_t +pci_get_dma_tag(device_t bus, device_t dev) +{ + bus_dma_tag_t tag; + struct pci_softc *sc; + + if (device_get_parent(dev) == bus) { + /* try dmar and return if it works */ + tag = dmar_get_dma_tag(bus, dev); + } else + tag = NULL; + if (tag == NULL) { + sc = device_get_softc(bus); + tag = sc->sc_dma_tag; + } + return (tag); +} +#else bus_dma_tag_t pci_get_dma_tag(device_t bus, device_t dev) { @@ -5737,6 +5753,7 @@ pci_get_dma_tag(device_t bus, device_t dev) return (sc->sc_dma_tag); } +#endif uint32_t pci_read_config_method(device_t dev, device_t child, int reg, int width) @@ -6402,6 +6419,94 @@ pcie_flr(device_t dev, u_int max_delay, bool force) return (true); } +/* + * Attempt a power-management reset by cycling the device in/out of D3 + * state. PCI spec says we can only go into D3 state from D0 state. + * Transition from D[12] into D0 before going to D3 state. + */ +int +pci_power_reset(device_t dev) +{ + int ps; + + ps = pci_get_powerstate(dev); + if (ps != PCI_POWERSTATE_D0 && ps != PCI_POWERSTATE_D3) + pci_set_powerstate(dev, PCI_POWERSTATE_D0); + pci_set_powerstate(dev, PCI_POWERSTATE_D3); + pci_set_powerstate(dev, ps); + return (0); +} + +/* + * Try link drop and retrain of the downstream port of upstream + * switch, for PCIe. According to the PCIe 3.0 spec 6.6.1, this must + * cause Conventional Hot reset of the device in the slot. + * Alternative, for PCIe, could be the secondary bus reset initiatied + * on the upstream switch PCIR_BRIDGECTL_1, bit 6. + */ +int +pcie_link_reset(device_t port, int pcie_location) +{ + uint16_t v; + + v = pci_read_config(port, pcie_location + PCIER_LINK_CTL, 2); + v |= PCIEM_LINK_CTL_LINK_DIS; + pci_write_config(port, pcie_location + PCIER_LINK_CTL, v, 2); + pause_sbt("pcier1", mstosbt(20), 0, 0); + v &= ~PCIEM_LINK_CTL_LINK_DIS; + v |= PCIEM_LINK_CTL_RETRAIN_LINK; + pci_write_config(port, pcie_location + PCIER_LINK_CTL, v, 2); + pause_sbt("pcier2", mstosbt(100), 0, 0); /* 100 ms */ + v = pci_read_config(port, pcie_location + PCIER_LINK_STA, 2); + return ((v & PCIEM_LINK_STA_TRAINING) != 0 ? ETIMEDOUT : 0); +} + +static int +pci_reset_post(device_t dev, device_t child) +{ + + if (dev == device_get_parent(child)) + pci_restore_state(child); + return (0); +} + +static int +pci_reset_prepare(device_t dev, device_t child) +{ + + if (dev == device_get_parent(child)) + pci_save_state(child); + return (0); +} + +static int +pci_reset_child(device_t dev, device_t child, int flags) +{ + int error; + + if (dev == NULL || device_get_parent(child) != dev) + return (0); + if ((flags & DEVF_RESET_DETACH) != 0) { + error = device_get_state(child) == DS_ATTACHED ? + device_detach(child) : 0; + } else { + error = BUS_SUSPEND_CHILD(dev, child); + } + if (error == 0) { + if (!pcie_flr(child, 1000, false)) { + error = BUS_RESET_PREPARE(dev, child); + if (error == 0) + pci_power_reset(child); + BUS_RESET_POST(dev, child); + } + if ((flags & DEVF_RESET_DETACH) != 0) + device_probe_and_attach(child); + else + BUS_RESUME_CHILD(dev, child); + } + return (error); +} + const struct pci_device_table * pci_match_device(device_t child, const struct pci_device_table *id, size_t nelt) { diff --git a/freebsd/sys/dev/pci/pci_pci.c b/freebsd/sys/dev/pci/pci_pci.c index 607a0614..cdcba150 100644 --- a/freebsd/sys/dev/pci/pci_pci.c +++ b/freebsd/sys/dev/pci/pci_pci.c @@ -44,8 +44,11 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/bus.h> #include <sys/kernel.h> +#include <sys/lock.h> #include <sys/malloc.h> #include <sys/module.h> +#include <sys/mutex.h> +#include <sys/pciio.h> #include <sys/rman.h> #include <sys/sysctl.h> #include <sys/systm.h> @@ -82,6 +85,7 @@ static void pcib_pcie_dll_timeout(void *arg); #endif static int pcib_request_feature_default(device_t pcib, device_t dev, enum pci_feature feature); +static int pcib_reset_child(device_t dev, device_t child, int flags); static device_method_t pcib_methods[] = { /* Device interface */ @@ -108,6 +112,7 @@ static device_method_t pcib_methods[] = { DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource), DEVMETHOD(bus_setup_intr, bus_generic_setup_intr), DEVMETHOD(bus_teardown_intr, bus_generic_teardown_intr), + DEVMETHOD(bus_reset_child, pcib_reset_child), /* pcib interface */ DEVMETHOD(pcib_maxslots, pcib_ari_maxslots), @@ -1167,9 +1172,11 @@ pcib_pcie_intr_hotplug(void *arg) { struct pcib_softc *sc; device_t dev; + uint16_t old_slot_sta; sc = arg; dev = sc->dev; + old_slot_sta = sc->pcie_slot_sta; sc->pcie_slot_sta = pcie_read_config(dev, PCIER_SLOT_STA, 2); /* Clear the events just reported. */ @@ -1185,7 +1192,8 @@ pcib_pcie_intr_hotplug(void *arg) "Attention Button Pressed: Detach Cancelled\n"); sc->flags &= ~PCIB_DETACH_PENDING; callout_stop(&sc->pcie_ab_timer); - } else { + } else if (old_slot_sta & PCIEM_SLOT_STA_PDS) { + /* Only initiate detach sequence if device present. */ device_printf(dev, "Attention Button Pressed: Detaching in 5 seconds\n"); sc->flags |= PCIB_DETACH_PENDING; @@ -1266,11 +1274,8 @@ pcib_pcie_cc_timeout(void *arg) mtx_assert(&Giant, MA_OWNED); sta = pcie_read_config(dev, PCIER_SLOT_STA, 2); if (!(sta & PCIEM_SLOT_STA_CC)) { - device_printf(dev, - "HotPlug Command Timed Out - forcing detach\n"); - sc->flags &= ~(PCIB_HOTPLUG_CMD_PENDING | PCIB_DETACH_PENDING); - sc->flags |= PCIB_DETACHING; - pcib_pcie_hotplug_update(sc, 0, 0, true); + device_printf(dev, "HotPlug Command Timed Out\n"); + sc->flags &= ~PCIB_HOTPLUG_CMD_PENDING; } else { device_printf(dev, "Missed HotPlug interrupt waiting for Command Completion\n"); @@ -2911,3 +2916,31 @@ pcib_request_feature_default(device_t pcib, device_t dev, bus = device_get_parent(pcib); return (PCIB_REQUEST_FEATURE(device_get_parent(bus), dev, feature)); } + +static int +pcib_reset_child(device_t dev, device_t child, int flags) +{ + struct pci_devinfo *pdinfo; + int error; + + error = 0; + if (dev == NULL || device_get_parent(child) != dev) + goto out; + error = ENXIO; + if (device_get_devclass(child) != devclass_find("pci")) + goto out; + pdinfo = device_get_ivars(dev); + if (pdinfo->cfg.pcie.pcie_location != 0 && + (pdinfo->cfg.pcie.pcie_type == PCIEM_TYPE_DOWNSTREAM_PORT || + pdinfo->cfg.pcie.pcie_type == PCIEM_TYPE_ROOT_PORT)) { + error = bus_helper_reset_prepare(child, flags); + if (error == 0) { + error = pcie_link_reset(dev, + pdinfo->cfg.pcie.pcie_location); + /* XXXKIB call _post even if error != 0 ? */ + bus_helper_reset_post(child, flags); + } + } +out: + return (error); +} diff --git a/freebsd/sys/dev/pci/pci_user.c b/freebsd/sys/dev/pci/pci_user.c index 3e2c3c7e..bdf8a935 100644 --- a/freebsd/sys/dev/pci/pci_user.c +++ b/freebsd/sys/dev/pci/pci_user.c @@ -858,6 +858,7 @@ pci_bar_mmap(device_t pcidev, struct pci_bar_mmap *pbm) struct thread *td; struct sglist *sg; struct pci_map *pm; + vm_paddr_t membase; vm_paddr_t pbase; vm_size_t plen; vm_offset_t addr; @@ -880,8 +881,9 @@ pci_bar_mmap(device_t pcidev, struct pci_bar_mmap *pbm) return (EBUSY); /* XXXKIB enable if _ACTIVATE */ if (!PCI_BAR_MEM(pm->pm_value)) return (EIO); - pbase = trunc_page(pm->pm_value); - plen = round_page(pm->pm_value + ((pci_addr_t)1 << pm->pm_size)) - + membase = pm->pm_value & PCIM_BAR_MEM_BASE; + pbase = trunc_page(membase); + plen = round_page(membase + ((pci_addr_t)1 << pm->pm_size)) - pbase; prot = VM_PROT_READ | (((pbm->pbm_flags & PCIIO_BAR_MMAP_RW) != 0) ? VM_PROT_WRITE : 0); @@ -913,7 +915,7 @@ pci_bar_mmap(device_t pcidev, struct pci_bar_mmap *pbm) } pbm->pbm_map_base = (void *)addr; pbm->pbm_map_length = plen; - pbm->pbm_bar_off = pm->pm_value - pbase; + pbm->pbm_bar_off = membase - pbase; pbm->pbm_bar_length = (pci_addr_t)1 << pm->pm_size; out: diff --git a/freebsd/sys/dev/pci/pcivar.h b/freebsd/sys/dev/pci/pcivar.h index 2ea7b877..d27cd1d2 100644 --- a/freebsd/sys/dev/pci/pcivar.h +++ b/freebsd/sys/dev/pci/pcivar.h @@ -33,7 +33,7 @@ #define _PCIVAR_H_ #include <sys/queue.h> -#include <sys/eventhandler.h> +#include <sys/_eventhandler.h> /* some PCI bus constants */ #define PCI_MAXMAPS_0 6 /* max. no. of memory/port maps */ @@ -259,6 +259,13 @@ typedef struct { extern uint32_t pci_numdevs; +/* + * The bitfield has to be stable and match the fields below (so that + * match_flag_vendor must be bit 0) so we have to do the endian dance. We can't + * use enums or #define constants because then the macros for subsetting matches + * wouldn't work. These tables are parsed by devmatch and others to connect + * modules with devices on the PCI bus. + */ struct pci_device_table { #if BYTE_ORDER == LITTLE_ENDIAN uint16_t @@ -674,6 +681,7 @@ int pci_get_max_read_req(device_t dev); void pci_restore_state(device_t dev); void pci_save_state(device_t dev); int pci_set_max_read_req(device_t dev, int size); +int pci_power_reset(device_t dev); uint32_t pcie_read_config(device_t dev, int reg, int width); void pcie_write_config(device_t dev, int reg, uint32_t value, int width); uint32_t pcie_adjust_config(device_t dev, int reg, uint32_t mask, @@ -681,17 +689,10 @@ uint32_t pcie_adjust_config(device_t dev, int reg, uint32_t mask, bool pcie_flr(device_t dev, u_int max_delay, bool force); int pcie_get_max_completion_timeout(device_t dev); bool pcie_wait_for_pending_transactions(device_t dev, u_int max_delay); +int pcie_link_reset(device_t port, int pcie_location); void pci_print_faulted_dev(void); -#ifdef BUS_SPACE_MAXADDR -#if (BUS_SPACE_MAXADDR > 0xFFFFFFFF) -#define PCI_DMA_BOUNDARY 0x100000000 -#else -#define PCI_DMA_BOUNDARY 0 -#endif -#endif - #endif /* _SYS_BUS_H_ */ /* diff --git a/freebsd/sys/dev/rtwn/if_rtwn.c b/freebsd/sys/dev/rtwn/if_rtwn.c index 79868dc0..0ca83e8a 100644 --- a/freebsd/sys/dev/rtwn/if_rtwn.c +++ b/freebsd/sys/dev/rtwn/if_rtwn.c @@ -155,9 +155,6 @@ static void rtwn_stop(struct rtwn_softc *); MALLOC_DEFINE(M_RTWN_PRIV, "rtwn_priv", "rtwn driver private state"); -static const uint8_t rtwn_chan_2ghz[] = - { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }; - static const uint16_t wme2reg[] = { R92C_EDCA_BE_PARAM, R92C_EDCA_BK_PARAM, R92C_EDCA_VI_PARAM, R92C_EDCA_VO_PARAM }; @@ -1536,9 +1533,8 @@ rtwn_getradiocaps(struct ieee80211com *ic, setbit(bands, IEEE80211_MODE_11B); setbit(bands, IEEE80211_MODE_11G); setbit(bands, IEEE80211_MODE_11NG); - ieee80211_add_channel_list_2ghz(chans, maxchans, nchans, - rtwn_chan_2ghz, nitems(rtwn_chan_2ghz), bands, - !!(ic->ic_htcaps & IEEE80211_HTCAP_CHWIDTH40)); + ieee80211_add_channels_default_2ghz(chans, maxchans, nchans, + bands, !!(ic->ic_htcaps & IEEE80211_HTCAP_CHWIDTH40)); /* XXX workaround add_channel_list() limitations */ setbit(bands, IEEE80211_MODE_11A); @@ -1566,10 +1562,6 @@ rtwn_set_channel(struct ieee80211com *ic) RTWN_LOCK(sc); rtwn_set_chan(sc, c); - sc->sc_rxtap.wr_chan_freq = htole16(c->ic_freq); - sc->sc_rxtap.wr_chan_flags = htole16(c->ic_flags); - sc->sc_txtap.wt_chan_freq = htole16(c->ic_freq); - sc->sc_txtap.wt_chan_flags = htole16(c->ic_flags); RTWN_UNLOCK(sc); } diff --git a/freebsd/sys/dev/rtwn/if_rtwnvar.h b/freebsd/sys/dev/rtwn/if_rtwnvar.h index 3ebcba52..a6e7ea9f 100644 --- a/freebsd/sys/dev/rtwn/if_rtwnvar.h +++ b/freebsd/sys/dev/rtwn/if_rtwnvar.h @@ -62,9 +62,10 @@ struct rtwn_rx_radiotap_header { struct rtwn_tx_radiotap_header { struct ieee80211_radiotap_header wt_ihdr; uint8_t wt_flags; + uint8_t wt_pad; uint16_t wt_chan_freq; uint16_t wt_chan_flags; -} __packed __aligned(8); +} __packed; #define RTWN_TX_RADIOTAP_PRESENT \ (1 << IEEE80211_RADIOTAP_FLAGS | \ diff --git a/freebsd/sys/dev/rtwn/pci/rtwn_pci_attach.h b/freebsd/sys/dev/rtwn/pci/rtwn_pci_attach.h index 6df5812e..796d8c4d 100644 --- a/freebsd/sys/dev/rtwn/pci/rtwn_pci_attach.h +++ b/freebsd/sys/dev/rtwn/pci/rtwn_pci_attach.h @@ -17,9 +17,11 @@ */ void r92ce_attach(struct rtwn_pci_softc *); +void r88ee_attach(struct rtwn_pci_softc *); enum { RTWN_CHIP_RTL8192CE, + RTWN_CHIP_RTL8188EE, RTWN_CHIP_MAX_PCI }; @@ -32,13 +34,16 @@ struct rtwn_pci_ident { static const struct rtwn_pci_ident rtwn_pci_ident_table[] = { { 0x10ec, 0x8176, "Realtek RTL8188CE", RTWN_CHIP_RTL8192CE }, + { 0x10ec, 0x8179, "Realtek RTL8188EE", RTWN_CHIP_RTL8188EE }, + { 0x10ec, 0x8178, "Realtek RTL8192CE", RTWN_CHIP_RTL8192CE }, { 0, 0, NULL, RTWN_CHIP_MAX_PCI } }; typedef void (*chip_pci_attach)(struct rtwn_pci_softc *); static const chip_pci_attach rtwn_chip_pci_attach[RTWN_CHIP_MAX_PCI] = { - [RTWN_CHIP_RTL8192CE] = r92ce_attach + [RTWN_CHIP_RTL8192CE] = r92ce_attach, + [RTWN_CHIP_RTL8188EE] = r88ee_attach }; static __inline void diff --git a/freebsd/sys/dev/rtwn/pci/rtwn_pci_reg.c b/freebsd/sys/dev/rtwn/pci/rtwn_pci_reg.c index 8ce54f0d..5b998b5a 100644 --- a/freebsd/sys/dev/rtwn/pci/rtwn_pci_reg.c +++ b/freebsd/sys/dev/rtwn/pci/rtwn_pci_reg.c @@ -120,6 +120,6 @@ rtwn_pci_delay(struct rtwn_softc *sc, int usec) DELAY(usec); else { (void) mtx_sleep(sc, &sc->sc_mtx, 0, "rtwn_pci", - MAX(msecs_to_ticks(usec / 1000), 1)); + msecs_to_ticks(usec / 1000)); } } diff --git a/freebsd/sys/dev/rtwn/pci/rtwn_pci_rx.c b/freebsd/sys/dev/rtwn/pci/rtwn_pci_rx.c index 1934b741..f97fea44 100644 --- a/freebsd/sys/dev/rtwn/pci/rtwn_pci_rx.c +++ b/freebsd/sys/dev/rtwn/pci/rtwn_pci_rx.c @@ -85,12 +85,12 @@ rtwn_pci_setup_rx_desc(struct rtwn_pci_softc *pc, } static void -rtwn_pci_rx_frame(struct rtwn_softc *sc, struct rtwn_rx_stat_pci *rx_desc, - int desc_idx) +rtwn_pci_rx_frame(struct rtwn_pci_softc *pc) { - struct rtwn_pci_softc *pc = RTWN_PCI_SOFTC(sc); + struct rtwn_softc *sc = &pc->pc_sc; struct rtwn_rx_ring *ring = &pc->rx_ring; - struct rtwn_rx_data *rx_data = &ring->rx_data[desc_idx]; + struct rtwn_rx_stat_pci *rx_desc = &ring->desc[ring->cur]; + struct rtwn_rx_data *rx_data = &ring->rx_data[ring->cur]; struct ieee80211com *ic = &sc->sc_ic; struct ieee80211_node *ni; uint32_t rxdw0; @@ -150,9 +150,6 @@ rtwn_pci_rx_frame(struct rtwn_softc *sc, struct rtwn_rx_stat_pci *rx_desc, panic("%s: could not load old RX mbuf", device_get_name(sc->sc_dev)); - /* Physical address may have changed. */ - rtwn_pci_setup_rx_desc(pc, rx_desc, rx_data->paddr, - MJUMPAGESIZE, desc_idx); goto fail; } @@ -167,10 +164,6 @@ rtwn_pci_rx_frame(struct rtwn_softc *sc, struct rtwn_rx_stat_pci *rx_desc, "%s: Rx frame len %d, infosz %d, shift %d\n", __func__, pktlen, infosz, shift); - /* Update RX descriptor. */ - rtwn_pci_setup_rx_desc(pc, rx_desc, rx_data->paddr, MJUMPAGESIZE, - desc_idx); - /* Send the frame to the 802.11 layer. */ RTWN_UNLOCK(sc); if (ni != NULL) { @@ -188,6 +181,72 @@ fail: counter_u64_add(ic->ic_ierrors, 1); } +static int +rtwn_pci_rx_buf_copy(struct rtwn_pci_softc *pc) +{ + struct rtwn_rx_ring *ring = &pc->rx_ring; + struct rtwn_rx_stat_pci *rx_desc = &ring->desc[ring->cur]; + struct rtwn_rx_data *rx_data = &ring->rx_data[ring->cur]; + uint32_t rxdw0; + int desc_size, pktlen; + + /* + * NB: tx_report() / c2h_report() expects to see USB Rx + * descriptor - same as for PCIe, but without rxbufaddr* fields. + */ + desc_size = sizeof(struct rtwn_rx_stat_common); + KASSERT(sizeof(pc->pc_rx_buf) >= desc_size, + ("adjust size for PCIe Rx buffer!")); + + memcpy(pc->pc_rx_buf, rx_desc, desc_size); + + rxdw0 = le32toh(rx_desc->rxdw0); + pktlen = MS(rxdw0, RTWN_RXDW0_PKTLEN); + + if (pktlen > sizeof(pc->pc_rx_buf) - desc_size) + { + /* Looks like an ordinary Rx frame. */ + return (desc_size); + } + + bus_dmamap_sync(ring->data_dmat, rx_data->map, BUS_DMASYNC_POSTREAD); + memcpy(pc->pc_rx_buf + desc_size, mtod(rx_data->m, void *), pktlen); + + return (desc_size + pktlen); +} + +static void +rtwn_pci_tx_report(struct rtwn_pci_softc *pc, int len) +{ + struct rtwn_softc *sc = &pc->pc_sc; + + if (sc->sc_ratectl != RTWN_RATECTL_NET80211) { + /* shouldn't happen */ + device_printf(sc->sc_dev, + "%s called while ratectl = %d!\n", + __func__, sc->sc_ratectl); + return; + } + + RTWN_NT_LOCK(sc); + rtwn_handle_tx_report(sc, pc->pc_rx_buf, len); + RTWN_NT_UNLOCK(sc); + +#ifdef IEEE80211_SUPPORT_SUPERG + /* + * NB: this will executed only when 'report' bit is set. + */ + if (sc->sc_tx_n_active > 0 && --sc->sc_tx_n_active <= 1) + rtwn_cmd_sleepable(sc, NULL, 0, rtwn_ff_flush_all); +#endif +} + +static void +rtwn_pci_c2h_report(struct rtwn_pci_softc *pc, int len) +{ + rtwn_handle_c2h_report(&pc->pc_sc, pc->pc_rx_buf, len); +} + static void rtwn_pci_tx_done(struct rtwn_softc *sc, int qid) { @@ -199,7 +258,8 @@ rtwn_pci_tx_done(struct rtwn_softc *sc, int qid) RTWN_DPRINTF(sc, RTWN_DEBUG_INTR, "%s: qid %d, last %d, cur %d\n", __func__, qid, ring->last, ring->cur); - bus_dmamap_sync(ring->desc_dmat, ring->desc_map, BUS_DMASYNC_POSTREAD); + bus_dmamap_sync(ring->desc_dmat, ring->desc_map, + BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); while(ring->last != ring->cur) { data = &ring->tx_data[ring->last]; @@ -264,21 +324,50 @@ rtwn_pci_rx_done(struct rtwn_softc *sc) { struct rtwn_pci_softc *pc = RTWN_PCI_SOFTC(sc); struct rtwn_rx_ring *ring = &pc->rx_ring; + struct rtwn_rx_stat_pci *rx_desc; + struct rtwn_rx_data *rx_data; + int len; bus_dmamap_sync(ring->desc_dmat, ring->desc_map, BUS_DMASYNC_POSTREAD); for (;;) { - struct rtwn_rx_stat_pci *rx_desc = &ring->desc[ring->cur]; + rx_desc = &ring->desc[ring->cur]; + rx_data = &ring->rx_data[ring->cur]; if (le32toh(rx_desc->rxdw0) & RTWN_RXDW0_OWN) break; - rtwn_pci_rx_frame(sc, rx_desc, ring->cur); + len = rtwn_pci_rx_buf_copy(pc); + + switch (rtwn_classify_intr(sc, pc->pc_rx_buf, len)) { + case RTWN_RX_DATA: + rtwn_pci_rx_frame(pc); + break; + case RTWN_RX_TX_REPORT: + rtwn_pci_tx_report(pc, len); + break; + case RTWN_RX_OTHER: + rtwn_pci_c2h_report(pc, len); + break; + default: + /* NOTREACHED */ + KASSERT(0, ("unknown Rx classification code")); + break; + } + + /* Update / reset RX descriptor (and set OWN bit). */ + rtwn_pci_setup_rx_desc(pc, rx_desc, rx_data->paddr, + MJUMPAGESIZE, ring->cur); if (!(sc->sc_flags & RTWN_RUNNING)) return; - ring->cur = (ring->cur + 1) % RTWN_PCI_RX_LIST_COUNT; + /* NB: device can reuse current descriptor. */ + bus_dmamap_sync(ring->desc_dmat, ring->desc_map, + BUS_DMASYNC_POSTREAD); + + if (le32toh(rx_desc->rxdw0) & RTWN_RXDW0_OWN) + ring->cur = (ring->cur + 1) % RTWN_PCI_RX_LIST_COUNT; } } @@ -290,13 +379,13 @@ rtwn_pci_intr(void *arg) int i, status, tx_rings; RTWN_LOCK(sc); - status = rtwn_classify_intr(sc, &tx_rings, 0); + status = rtwn_pci_get_intr_status(pc, &tx_rings); RTWN_DPRINTF(sc, RTWN_DEBUG_INTR, "%s: status %08X, tx_rings %08X\n", __func__, status, tx_rings); if (status == 0 && tx_rings == 0) goto unlock; - if (status & RTWN_PCI_INTR_RX) { + if (status & (RTWN_PCI_INTR_RX | RTWN_PCI_INTR_TX_REPORT)) { rtwn_pci_rx_done(sc); if (!(sc->sc_flags & RTWN_RUNNING)) goto unlock; diff --git a/freebsd/sys/dev/rtwn/pci/rtwn_pci_tx.c b/freebsd/sys/dev/rtwn/pci/rtwn_pci_tx.c index 50e915ee..1d3852ca 100644 --- a/freebsd/sys/dev/rtwn/pci/rtwn_pci_tx.c +++ b/freebsd/sys/dev/rtwn/pci/rtwn_pci_tx.c @@ -176,8 +176,8 @@ rtwn_pci_tx_start_frame(struct rtwn_softc *sc, struct ieee80211_node *ni, rtwn_dump_tx_desc(sc, txd); bus_dmamap_sync(ring->desc_dmat, ring->desc_map, - BUS_DMASYNC_POSTWRITE); - bus_dmamap_sync(ring->data_dmat, data->map, BUS_DMASYNC_POSTWRITE); + BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); + bus_dmamap_sync(ring->data_dmat, data->map, BUS_DMASYNC_PREWRITE); data->m = m; data->ni = ni; diff --git a/freebsd/sys/dev/rtwn/pci/rtwn_pci_var.h b/freebsd/sys/dev/rtwn/pci/rtwn_pci_var.h index 194fab4a..95b2effe 100644 --- a/freebsd/sys/dev/rtwn/pci/rtwn_pci_var.h +++ b/freebsd/sys/dev/rtwn/pci/rtwn_pci_var.h @@ -26,6 +26,9 @@ #define RTWN_PCI_RX_LIST_COUNT 256 #define RTWN_PCI_TX_LIST_COUNT 256 +/* sizeof(struct rtwn_rx_stat_common) + R88E_INTR_MSG_LEN */ +#define RTWN_PCI_RX_TMP_BUF_SIZE 84 + struct rtwn_rx_data { bus_dmamap_t map; struct mbuf *m; @@ -95,8 +98,8 @@ enum { /* Shortcuts */ /* Vendor driver treats RX errors like ROK... */ #define RTWN_PCI_INTR_RX \ - (RTWN_PCI_INTR_RX_OVERFLOW | RTWN_PCI_INTR_RX_DESC_UNAVAIL | \ - RTWN_PCI_INTR_RX_DONE) + (RTWN_PCI_INTR_RX_ERROR | RTWN_PCI_INTR_RX_OVERFLOW | \ + RTWN_PCI_INTR_RX_DESC_UNAVAIL | RTWN_PCI_INTR_RX_DONE) struct rtwn_pci_softc { @@ -109,6 +112,7 @@ struct rtwn_pci_softc { void *pc_ih; bus_size_t pc_mapsize; + uint8_t pc_rx_buf[RTWN_PCI_RX_TMP_BUF_SIZE]; struct rtwn_rx_ring rx_ring; struct rtwn_tx_ring tx_ring[RTWN_PCI_NTXQUEUES]; @@ -122,6 +126,8 @@ struct rtwn_pci_softc { void *, bus_dma_segment_t *); void (*pc_copy_tx_desc)(void *, const void *); void (*pc_enable_intr)(struct rtwn_pci_softc *); + int (*pc_get_intr_status)(struct rtwn_pci_softc *, + int *); }; #define RTWN_PCI_SOFTC(sc) ((struct rtwn_pci_softc *)(sc)) @@ -133,5 +139,7 @@ struct rtwn_pci_softc { (((_pc)->pc_copy_tx_desc)((_dest), (_src))) #define rtwn_pci_enable_intr(_pc) \ (((_pc)->pc_enable_intr)((_pc))) +#define rtwn_pci_get_intr_status(_pc, _tx_rings) \ + (((_pc)->pc_get_intr_status)((_pc), (_tx_rings))) #endif /* RTWN_PCI_VAR_H */ diff --git a/freebsd/sys/dev/rtwn/rtl8188e/r88e.h b/freebsd/sys/dev/rtwn/rtl8188e/r88e.h index 3a3c0865..ce9fa19a 100644 --- a/freebsd/sys/dev/rtwn/rtl8188e/r88e.h +++ b/freebsd/sys/dev/rtwn/rtl8188e/r88e.h @@ -24,9 +24,7 @@ /* * Global definitions. */ -#define R88E_PUBQ_NPAGES 142 #define R88E_TXPKTBUF_COUNT 177 -#define R88E_TX_PAGE_COUNT 169 #define R88E_MACID_MAX 63 #define R88E_RX_DMA_BUFFER_SIZE 0x2400 @@ -67,9 +65,8 @@ int r88e_set_pwrmode(struct rtwn_softc *, struct ieee80211vap *, int); #endif /* r88e_init.c */ -void r88e_init_bb(struct rtwn_softc *); +void r88e_init_bb_common(struct rtwn_softc *); void r88e_init_rf(struct rtwn_softc *); -int r88e_power_on(struct rtwn_softc *); /* r88e_led.c */ void r88e_set_led(struct rtwn_softc *, int, int); @@ -81,6 +78,7 @@ void r88e_rf_write(struct rtwn_softc *, int, uint8_t, uint32_t); void r88e_parse_rom(struct rtwn_softc *, uint8_t *); /* r88e_rx.c */ +int r88e_classify_intr(struct rtwn_softc *, void *, int); void r88e_ratectl_tx_complete(struct rtwn_softc *, uint8_t *, int); void r88e_handle_c2h_report(struct rtwn_softc *, uint8_t *, int); int8_t r88e_get_rssi_cck(struct rtwn_softc *, void *); diff --git a/freebsd/sys/dev/rtwn/rtl8188e/r88e_calib.c b/freebsd/sys/dev/rtwn/rtl8188e/r88e_calib.c index 592f391a..94974983 100644 --- a/freebsd/sys/dev/rtwn/rtl8188e/r88e_calib.c +++ b/freebsd/sys/dev/rtwn/rtl8188e/r88e_calib.c @@ -1,7 +1,7 @@ #include <machine/rtems-bsd-kernel-space.h> /*- - * Copyright (c) 2016 Andriy Voskoboinyk <avos@FreeBSD.org> + * Copyright (c) 2016-2019 Andriy Voskoboinyk <avos@FreeBSD.org> * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -44,16 +44,343 @@ __FBSDID("$FreeBSD$"); #include <dev/rtwn/if_rtwnreg.h> #include <dev/rtwn/if_rtwnvar.h> +#include <dev/rtwn/if_rtwn_debug.h> #include <dev/rtwn/rtl8188e/r88e.h> #include <dev/rtwn/rtl8188e/r88e_reg.h> +/* Registers to save and restore during IQ calibration. */ +struct r88e_iq_cal_reg_vals { + uint32_t adda[16]; + uint8_t txpause; + uint8_t bcn_ctrl[2]; + uint32_t gpio_muxcfg; + uint32_t cck0_afesetting; + uint32_t ofdm0_trxpathena; + uint32_t ofdm0_trmuxpar; + uint32_t fpga0_rfifacesw0; + uint32_t fpga0_rfifacesw1; + uint32_t fpga0_rfifaceoe0; + uint32_t fpga0_rfifaceoe1; + uint32_t config_ant0; + uint32_t config_ant1; +}; + +static int +r88e_iq_calib_chain(struct rtwn_softc *sc, uint16_t tx[2], uint16_t rx[2]) +{ + uint32_t status; + + /* Set Rx IQ calibration mode table. */ + rtwn_bb_write(sc, R92C_FPGA0_IQK, 0); + rtwn_rf_write(sc, 0, R88E_RF_WE_LUT, 0x800a0); + rtwn_rf_write(sc, 0, R92C_RF_RCK_OS, 0x30000); + rtwn_rf_write(sc, 0, R92C_RF_TXPA_G(0), 0xf); + rtwn_rf_write(sc, 0, R92C_RF_TXPA_G(1), 0xf117b); + rtwn_bb_write(sc, R92C_FPGA0_IQK, 0x80800000); + + /* IQ calibration settings. */ + rtwn_bb_write(sc, R92C_TX_IQK, 0x01007c00); + rtwn_bb_write(sc, R92C_RX_IQK, 0x81004800); + + /* IQ calibration settings for chain 0. */ + rtwn_bb_write(sc, R92C_TX_IQK_TONE(0), 0x10008c1c); + rtwn_bb_write(sc, R92C_RX_IQK_TONE(0), 0x30008c1c); + rtwn_bb_write(sc, R92C_TX_IQK_PI(0), 0x82160804); + rtwn_bb_write(sc, R92C_RX_IQK_PI(0), 0x28160000); + + /* LO calibration settings. */ + rtwn_bb_write(sc, R92C_IQK_AGC_RSP, 0x0046a911); + + /* We're doing LO and IQ calibration in one shot. */ + rtwn_bb_write(sc, R92C_IQK_AGC_PTS, 0xf9000000); + rtwn_bb_write(sc, R92C_IQK_AGC_PTS, 0xf8000000); + + /* Give LO and IQ calibrations the time to complete. */ + rtwn_delay(sc, 10000); + + /* Read IQ calibration status. */ + status = rtwn_bb_read(sc, R92C_RX_POWER_IQK_AFTER(0)); + if (status & (1 << 28)) + return (0); /* Tx failed. */ + + /* Read Tx IQ calibration results. */ + tx[0] = MS(rtwn_bb_read(sc, R92C_TX_POWER_IQK_BEFORE(0)), + R92C_POWER_IQK_RESULT); + tx[1] = MS(rtwn_bb_read(sc, R92C_TX_POWER_IQK_AFTER(0)), + R92C_POWER_IQK_RESULT); + if (tx[0] == 0x142 || tx[1] == 0x042) + return (0); /* Tx failed. */ + + rtwn_bb_write(sc, R92C_TX_IQK, 0x80007c00 | (tx[0] << 16) | tx[1]); + + /* Set Rx IQ calibration mode table. */ + rtwn_bb_write(sc, R92C_FPGA0_IQK, 0); + rtwn_rf_write(sc, 0, R88E_RF_WE_LUT, 0x800a0); + rtwn_rf_write(sc, 0, R92C_RF_RCK_OS, 0x30000); + rtwn_rf_write(sc, 0, R92C_RF_TXPA_G(0), 0xf); + rtwn_rf_write(sc, 0, R92C_RF_TXPA_G(1), 0xf7ffa); + rtwn_bb_write(sc, R92C_FPGA0_IQK, 0x80800000); + + /* IQ calibration settings. */ + rtwn_bb_write(sc, R92C_RX_IQK, 0x01004800); + + /* IQ calibration settings for chain 0. */ + rtwn_bb_write(sc, R92C_TX_IQK_TONE(0), 0x30008c1c); + rtwn_bb_write(sc, R92C_RX_IQK_TONE(0), 0x10008c1c); + rtwn_bb_write(sc, R92C_TX_IQK_PI(0), 0x82160c05); + rtwn_bb_write(sc, R92C_RX_IQK_PI(0), 0x28160c05); + + /* LO calibration settings. */ + rtwn_bb_write(sc, R92C_IQK_AGC_RSP, 0x0046a911); + + /* We're doing LO and IQ calibration in one shot. */ + rtwn_bb_write(sc, R92C_IQK_AGC_PTS, 0xf9000000); + rtwn_bb_write(sc, R92C_IQK_AGC_PTS, 0xf8000000); + + /* Give LO and IQ calibrations the time to complete. */ + rtwn_delay(sc, 10000); + + /* Read IQ calibration status. */ + status = rtwn_bb_read(sc, R92C_RX_POWER_IQK_AFTER(0)); + if (status & (1 << 27)) + return (1); /* Rx failed. */ + + /* Read Rx IQ calibration results. */ + rx[0] = MS(rtwn_bb_read(sc, R92C_RX_POWER_IQK_BEFORE(0)), + R92C_POWER_IQK_RESULT); + rx[1] = MS(status, R92C_POWER_IQK_RESULT); + if (rx[0] == 0x132 || rx[1] == 0x036) + return (1); /* Rx failed. */ + + return (3); /* Both Tx and Rx succeeded. */ +} + +static void +r88e_iq_calib_run(struct rtwn_softc *sc, int n, uint16_t tx[2], + uint16_t rx[2], struct r88e_iq_cal_reg_vals *vals) +{ + /* Registers to save and restore during IQ calibration. */ + static const uint16_t reg_adda[16] = { + 0x85c, 0xe6c, 0xe70, 0xe74, + 0xe78, 0xe7c, 0xe80, 0xe84, + 0xe88, 0xe8c, 0xed0, 0xed4, + 0xed8, 0xedc, 0xee0, 0xeec + }; + int i; + uint32_t hssi_param1; + + if (n == 0) { + for (i = 0; i < nitems(reg_adda); i++) + vals->adda[i] = rtwn_bb_read(sc, reg_adda[i]); + + vals->txpause = rtwn_read_1(sc, R92C_TXPAUSE); + vals->bcn_ctrl[0] = rtwn_read_1(sc, R92C_BCN_CTRL(0)); + vals->bcn_ctrl[1] = rtwn_read_1(sc, R92C_BCN_CTRL(1)); + vals->gpio_muxcfg = rtwn_read_4(sc, R92C_GPIO_MUXCFG); + } + + rtwn_bb_write(sc, reg_adda[0], 0x0b1b25a0); + for (i = 1; i < nitems(reg_adda); i++) + rtwn_bb_write(sc, reg_adda[i], 0x0bdb25a0); + + hssi_param1 = rtwn_bb_read(sc, R92C_HSSI_PARAM1(0)); + if (!(hssi_param1 & R92C_HSSI_PARAM1_PI)) { + rtwn_bb_write(sc, R92C_HSSI_PARAM1(0), + hssi_param1 | R92C_HSSI_PARAM1_PI); + rtwn_bb_write(sc, R92C_HSSI_PARAM1(1), + hssi_param1 | R92C_HSSI_PARAM1_PI); + } + + if (n == 0) { + vals->cck0_afesetting = rtwn_bb_read(sc, R92C_CCK0_AFESETTING); + vals->ofdm0_trxpathena = + rtwn_bb_read(sc, R92C_OFDM0_TRXPATHENA); + vals->ofdm0_trmuxpar = rtwn_bb_read(sc, R92C_OFDM0_TRMUXPAR); + vals->fpga0_rfifacesw0 = + rtwn_bb_read(sc, R92C_FPGA0_RFIFACESW(0)); + vals->fpga0_rfifacesw1 = + rtwn_bb_read(sc, R92C_FPGA0_RFIFACESW(1)); + vals->fpga0_rfifaceoe0 = + rtwn_bb_read(sc, R92C_FPGA0_RFIFACEOE(0)); + vals->fpga0_rfifaceoe1 = + rtwn_bb_read(sc, R92C_FPGA0_RFIFACEOE(1)); + vals->config_ant0 = rtwn_bb_read(sc, R92C_CONFIG_ANT(0)); + vals->config_ant1 = rtwn_bb_read(sc, R92C_CONFIG_ANT(1)); + } + + rtwn_bb_setbits(sc, R92C_CCK0_AFESETTING, 0, 0x0f000000); + rtwn_bb_write(sc, R92C_OFDM0_TRXPATHENA, 0x03a05600); + rtwn_bb_write(sc, R92C_OFDM0_TRMUXPAR, 0x000800e4); + rtwn_bb_write(sc, R92C_FPGA0_RFIFACESW(1), 0x22204000); + rtwn_bb_setbits(sc, R92C_FPGA0_RFIFACESW(0), 0, 0x04000400); + rtwn_bb_setbits(sc, R92C_FPGA0_RFIFACEOE(0), 0x400, 0); + rtwn_bb_setbits(sc, R92C_FPGA0_RFIFACEOE(1), 0x400, 0); + + rtwn_write_1(sc, R92C_TXPAUSE, + R92C_TX_QUEUE_AC | R92C_TX_QUEUE_MGT | R92C_TX_QUEUE_HIGH); + rtwn_write_1(sc, R92C_BCN_CTRL(0), + vals->bcn_ctrl[0] & ~R92C_BCN_CTRL_EN_BCN); + rtwn_write_1(sc, R92C_BCN_CTRL(1), + vals->bcn_ctrl[1] & ~R92C_BCN_CTRL_EN_BCN); + rtwn_write_1(sc, R92C_GPIO_MUXCFG, + vals->gpio_muxcfg & ~R92C_GPIO_MUXCFG_ENBT); + + rtwn_bb_write(sc, R92C_CONFIG_ANT(0), 0x0f600000); + + rtwn_bb_write(sc, R92C_FPGA0_IQK, 0x80800000); + rtwn_bb_write(sc, R92C_TX_IQK, 0x01007c00); + rtwn_bb_write(sc, R92C_RX_IQK, 0x01004800); + + /* Run IQ calibration twice. */ + for (i = 0; i < 2; i++) { + int ret; + + ret = r88e_iq_calib_chain(sc, tx, rx); + if (ret == 0) { + RTWN_DPRINTF(sc, RTWN_DEBUG_CALIB, "%s: Tx failed.\n", + __func__); + tx[0] = 0xff; + tx[1] = 0xff; + rx[0] = 0xff; + rx[1] = 0xff; + } else if (ret == 1) { + RTWN_DPRINTF(sc, RTWN_DEBUG_CALIB, "%s: Rx failed.\n", + __func__); + rx[0] = 0xff; + rx[1] = 0xff; + } else if (ret == 3) { + RTWN_DPRINTF(sc, RTWN_DEBUG_CALIB, "%s: Both Tx and Rx" + " succeeded.\n", __func__); + } + } + + RTWN_DPRINTF(sc, RTWN_DEBUG_CALIB, + "%s: results for run %d: tx[0] 0x%x, tx[1] 0x%x, rx[0] 0x%x, " + "rx[1] 0x%x\n", __func__, n, tx[0], tx[1], rx[0], rx[1]); + + rtwn_bb_write(sc, R92C_CCK0_AFESETTING, vals->cck0_afesetting); + rtwn_bb_write(sc, R92C_OFDM0_TRXPATHENA, vals->ofdm0_trxpathena); + rtwn_bb_write(sc, R92C_FPGA0_RFIFACESW(0), vals->fpga0_rfifacesw0); + rtwn_bb_write(sc, R92C_FPGA0_RFIFACESW(1), vals->fpga0_rfifacesw1); + rtwn_bb_write(sc, R92C_OFDM0_TRMUXPAR, vals->ofdm0_trmuxpar); + rtwn_bb_write(sc, R92C_FPGA0_RFIFACEOE(0), vals->fpga0_rfifaceoe0); + rtwn_bb_write(sc, R92C_FPGA0_RFIFACEOE(1), vals->fpga0_rfifaceoe1); + rtwn_bb_write(sc, R92C_CONFIG_ANT(0), vals->config_ant0); + rtwn_bb_write(sc, R92C_CONFIG_ANT(1), vals->config_ant1); + + rtwn_bb_write(sc, R92C_FPGA0_IQK, 0); + rtwn_bb_write(sc, R92C_LSSI_PARAM(0), 0x00032ed3); + + if (n != 0) { + if (!(hssi_param1 & R92C_HSSI_PARAM1_PI)) { + rtwn_bb_write(sc, R92C_HSSI_PARAM1(0), hssi_param1); + rtwn_bb_write(sc, R92C_HSSI_PARAM1(1), hssi_param1); + } + + for (i = 0; i < nitems(reg_adda); i++) + rtwn_bb_write(sc, reg_adda[i], vals->adda[i]); + + rtwn_write_1(sc, R92C_TXPAUSE, vals->txpause); + rtwn_write_1(sc, R92C_BCN_CTRL(0), vals->bcn_ctrl[0]); + rtwn_write_1(sc, R92C_BCN_CTRL(1), vals->bcn_ctrl[1]); + rtwn_write_4(sc, R92C_GPIO_MUXCFG, vals->gpio_muxcfg); + } +} + +#define RTWN_IQ_CAL_MAX_TOLERANCE 5 +static int +r88e_iq_calib_compare_results(struct rtwn_softc *sc, uint16_t tx1[2], + uint16_t rx1[2], uint16_t tx2[2], uint16_t rx2[2]) +{ + int i, tx_ok, rx_ok; + + tx_ok = rx_ok = 0; + for (i = 0; i < 2; i++) { + if (tx1[i] == 0xff || tx2[i] == 0xff || + rx1[i] == 0xff || rx2[i] == 0xff) + continue; + + tx_ok = (abs(tx1[i] - tx2[i]) <= RTWN_IQ_CAL_MAX_TOLERANCE); + rx_ok = (abs(rx1[i] - rx2[i]) <= RTWN_IQ_CAL_MAX_TOLERANCE); + } + + return (tx_ok && rx_ok); +} +#undef RTWN_IQ_CAL_MAX_TOLERANCE + +static void +r88e_iq_calib_write_results(struct rtwn_softc *sc, uint16_t tx[2], + uint16_t rx[2]) +{ + uint32_t reg, val, x; + long y, tx_c; + + if (tx[0] == 0xff || tx[1] == 0xff) + return; + + reg = rtwn_bb_read(sc, R92C_OFDM0_TXIQIMBALANCE(0)); + val = ((reg >> 22) & 0x3ff); + x = tx[0]; + if (x & 0x00000200) + x |= 0xfffffc00; + reg = (((x * val) >> 8) & 0x3ff); + rtwn_bb_setbits(sc, R92C_OFDM0_TXIQIMBALANCE(0), 0x3ff, reg); + rtwn_bb_setbits(sc, R92C_OFDM0_ECCATHRESHOLD, 0x80000000, + ((x * val) & 0x80) << 24); + + y = tx[1]; + if (y & 0x00000200) + y |= 0xfffffc00; + tx_c = (y * val) >> 8; + rtwn_bb_setbits(sc, R92C_OFDM0_TXAFE(0), 0xf0000000, + (tx_c & 0x3c0) << 22); + rtwn_bb_setbits(sc, R92C_OFDM0_TXIQIMBALANCE(0), 0x003f0000, + (tx_c & 0x3f) << 16); + rtwn_bb_setbits(sc, R92C_OFDM0_ECCATHRESHOLD, 0x20000000, + ((y * val) & 0x80) << 22); + + if (rx[0] == 0xff || rx[1] == 0xff) + return; + + rtwn_bb_setbits(sc, R92C_OFDM0_RXIQIMBALANCE(0), 0x3ff, + rx[0] & 0x3ff); + rtwn_bb_setbits(sc, R92C_OFDM0_RXIQIMBALANCE(0), 0xfc00, + (rx[1] & 0x3f) << 10); + rtwn_bb_setbits(sc, R92C_OFDM0_RXIQEXTANTA, 0xf0000000, + (rx[1] & 0x3c0) << 22); +} + +#define RTWN_IQ_CAL_NRUN 3 void r88e_iq_calib(struct rtwn_softc *sc) { - /* XXX TODO */ + struct r88e_iq_cal_reg_vals vals; + uint16_t tx[RTWN_IQ_CAL_NRUN][2], rx[RTWN_IQ_CAL_NRUN][2]; + int n, valid; + + KASSERT(sc->ntxchains == 1, + ("%s: only 1T1R configuration is supported!\n", __func__)); + + valid = 0; + for (n = 0; n < RTWN_IQ_CAL_NRUN; n++) { + r88e_iq_calib_run(sc, n, tx[n], rx[n], &vals); + + if (n == 0) + continue; + + /* Valid results remain stable after consecutive runs. */ + valid = r88e_iq_calib_compare_results(sc, tx[n - 1], + rx[n - 1], tx[n], rx[n]); + if (valid) + break; + } + + if (valid) + r88e_iq_calib_write_results(sc, tx[n], rx[n]); } +#undef RTWN_IQ_CAL_NRUN void r88e_temp_measure(struct rtwn_softc *sc) diff --git a/freebsd/sys/dev/rtwn/rtl8188e/r88e_fw.c b/freebsd/sys/dev/rtwn/rtl8188e/r88e_fw.c index ddc9d0a4..ff48d244 100644 --- a/freebsd/sys/dev/rtwn/rtl8188e/r88e_fw.c +++ b/freebsd/sys/dev/rtwn/rtl8188e/r88e_fw.c @@ -111,7 +111,11 @@ r88e_fw_reset(struct rtwn_softc *sc, int reason) reg = rtwn_read_2(sc, R92C_SYS_FUNC_EN); rtwn_write_2(sc, R92C_SYS_FUNC_EN, reg & ~R92C_SYS_FUNC_EN_CPUEN); - rtwn_write_2(sc, R92C_SYS_FUNC_EN, reg | R92C_SYS_FUNC_EN_CPUEN); + + if (reason != RTWN_FW_RESET_SHUTDOWN) { + rtwn_write_2(sc, R92C_SYS_FUNC_EN, + reg | R92C_SYS_FUNC_EN_CPUEN); + } } void diff --git a/freebsd/sys/dev/rtwn/rtl8188e/r88e_init.c b/freebsd/sys/dev/rtwn/rtl8188e/r88e_init.c index 5225c43f..8c5bbac2 100644 --- a/freebsd/sys/dev/rtwn/rtl8188e/r88e_init.c +++ b/freebsd/sys/dev/rtwn/rtl8188e/r88e_init.c @@ -72,20 +72,8 @@ r88e_crystalcap_write(struct rtwn_softc *sc) } void -r88e_init_bb(struct rtwn_softc *sc) +r88e_init_bb_common(struct rtwn_softc *sc) { - - /* Enable BB and RF. */ - rtwn_setbits_2(sc, R92C_SYS_FUNC_EN, 0, - R92C_SYS_FUNC_EN_BBRSTB | R92C_SYS_FUNC_EN_BB_GLB_RST | - R92C_SYS_FUNC_EN_DIO_RF); - - rtwn_write_1(sc, R92C_RF_CTRL, - R92C_RF_CTRL_EN | R92C_RF_CTRL_RSTB | R92C_RF_CTRL_SDMRSTB); - rtwn_write_1(sc, R92C_SYS_FUNC_EN, - R92C_SYS_FUNC_EN_USBA | R92C_SYS_FUNC_EN_USBD | - R92C_SYS_FUNC_EN_BB_GLB_RST | R92C_SYS_FUNC_EN_BBRSTB); - r92c_init_bb_common(sc); rtwn_bb_write(sc, R92C_OFDM0_AGCCORE1(0), 0x69553422); @@ -95,66 +83,3 @@ r88e_init_bb(struct rtwn_softc *sc) r88e_crystalcap_write(sc); } - -int -r88e_power_on(struct rtwn_softc *sc) -{ -#define RTWN_CHK(res) do { \ - if (res != 0) \ - return (EIO); \ -} while(0) - int ntries; - - /* Wait for power ready bit. */ - for (ntries = 0; ntries < 5000; ntries++) { - if (rtwn_read_4(sc, R92C_APS_FSMCO) & R92C_APS_FSMCO_SUS_HOST) - break; - rtwn_delay(sc, 10); - } - if (ntries == 5000) { - device_printf(sc->sc_dev, - "timeout waiting for chip power up\n"); - return (ETIMEDOUT); - } - - /* Reset BB. */ - RTWN_CHK(rtwn_setbits_1(sc, R92C_SYS_FUNC_EN, - R92C_SYS_FUNC_EN_BBRSTB | R92C_SYS_FUNC_EN_BB_GLB_RST, 0)); - - RTWN_CHK(rtwn_setbits_1(sc, R92C_AFE_XTAL_CTRL + 2, 0, 0x80)); - - /* Disable HWPDN. */ - RTWN_CHK(rtwn_setbits_1_shift(sc, R92C_APS_FSMCO, - R92C_APS_FSMCO_APDM_HPDN, 0, 1)); - - /* Disable WL suspend. */ - RTWN_CHK(rtwn_setbits_1_shift(sc, R92C_APS_FSMCO, - R92C_APS_FSMCO_AFSM_HSUS | R92C_APS_FSMCO_AFSM_PCIE, 0, 1)); - - RTWN_CHK(rtwn_setbits_1_shift(sc, R92C_APS_FSMCO, - 0, R92C_APS_FSMCO_APFM_ONMAC, 1)); - for (ntries = 0; ntries < 5000; ntries++) { - if (!(rtwn_read_2(sc, R92C_APS_FSMCO) & - R92C_APS_FSMCO_APFM_ONMAC)) - break; - rtwn_delay(sc, 10); - } - if (ntries == 5000) - return (ETIMEDOUT); - - /* Enable LDO normal mode. */ - RTWN_CHK(rtwn_setbits_1(sc, R92C_LPLDO_CTRL, - R92C_LPLDO_CTRL_SLEEP, 0)); - - /* Enable MAC DMA/WMAC/SCHEDULE/SEC blocks. */ - RTWN_CHK(rtwn_write_2(sc, R92C_CR, 0)); - RTWN_CHK(rtwn_setbits_2(sc, R92C_CR, 0, - R92C_CR_HCI_TXDMA_EN | R92C_CR_TXDMA_EN | - R92C_CR_HCI_RXDMA_EN | R92C_CR_RXDMA_EN | - R92C_CR_PROTOCOL_EN | R92C_CR_SCHEDULE_EN | - ((sc->sc_hwcrypto != RTWN_CRYPTO_SW) ? R92C_CR_ENSEC : 0) | - R92C_CR_CALTMR_EN)); - - return (0); -#undef RTWN_CHK -} diff --git a/freebsd/sys/dev/rtwn/rtl8188e/r88e_priv.h b/freebsd/sys/dev/rtwn/rtl8188e/r88e_priv.h index 28f4b1fb..8ec5502d 100644 --- a/freebsd/sys/dev/rtwn/rtl8188e/r88e_priv.h +++ b/freebsd/sys/dev/rtwn/rtl8188e/r88e_priv.h @@ -37,7 +37,7 @@ struct rtwn_r88e_txpwr { /* * MAC initialization values. */ -static const struct rtwn_mac_prog rtl8188eu_mac[] = { +static const struct rtwn_mac_prog rtl8188e_mac[] = { { 0x026, 0x41 }, { 0x027, 0x35 }, { 0x040, 0x00 }, { 0x428, 0x0a }, { 0x429, 0x10 }, { 0x430, 0x00 }, { 0x431, 0x01 }, { 0x432, 0x02 }, { 0x433, 0x04 }, { 0x434, 0x05 }, { 0x435, 0x06 }, { 0x436, 0x07 }, @@ -66,7 +66,7 @@ static const struct rtwn_mac_prog rtl8188eu_mac[] = { /* * Baseband initialization values. */ -static const uint16_t rtl8188eu_bb_regs[] = { +static const uint16_t rtl8188e_bb_regs[] = { 0x800, 0x804, 0x808, 0x80c, 0x810, 0x814, 0x818, 0x81c, 0x820, 0x824, 0x828, 0x82c, 0x830, 0x834, 0x838, 0x83c, 0x840, 0x844, 0x848, 0x84c, 0x850, 0x854, 0x858, 0x85c, @@ -93,7 +93,7 @@ static const uint16_t rtl8188eu_bb_regs[] = { 0xed8, 0xedc, 0xee0, 0xee8, 0xeec, 0xf14, 0xf4c, 0xf00 }; -static const uint32_t rtl8188eu_bb_vals[] = { +static const uint32_t rtl8188e_bb_vals[] = { 0x80040000, 0x00000003, 0x0000fc00, 0x0000000a, 0x10001331, 0x020c3d10, 0x02200385, 0x00000000, 0x01000100, 0x00390204, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, @@ -135,17 +135,17 @@ static const uint32_t rtl8188eu_bb_vals[] = { 0x00000000, 0x00000300 }; -static const struct rtwn_bb_prog rtl8188eu_bb[] = { +static const struct rtwn_bb_prog rtl8188e_bb[] = { { - nitems(rtl8188eu_bb_regs), - rtl8188eu_bb_regs, - rtl8188eu_bb_vals, + nitems(rtl8188e_bb_regs), + rtl8188e_bb_regs, + rtl8188e_bb_vals, { 0 }, NULL } }; -static const uint32_t rtl8188eu_agc_vals[] = { +static const uint32_t rtl8188e_agc_vals[] = { 0xfb000001, 0xfb010001, 0xfb020001, 0xfb030001, 0xfb040001, 0xfb050001, 0xfa060001, 0xf9070001, 0xf8080001, 0xf7090001, 0xf60a0001, 0xf50b0001, 0xf40c0001, 0xf30d0001, 0xf20e0001, @@ -174,10 +174,10 @@ static const uint32_t rtl8188eu_agc_vals[] = { 0x407d0001, 0x407e0001, 0x407f0001 }; -static const struct rtwn_agc_prog rtl8188eu_agc[] = { +static const struct rtwn_agc_prog rtl8188e_agc[] = { { - nitems(rtl8188eu_agc_vals), - rtl8188eu_agc_vals, + nitems(rtl8188e_agc_vals), + rtl8188e_agc_vals, { 0 }, NULL } @@ -186,7 +186,7 @@ static const struct rtwn_agc_prog rtl8188eu_agc[] = { /* * RF initialization values. */ -static const uint8_t rtl8188eu_rf_regs[] = { +static const uint8_t rtl8188e_rf_regs[] = { 0x00, 0x08, 0x18, 0x19, 0x1e, 0x1f, 0x2f, 0x3f, 0x42, 0x57, 0x58, 0x67, 0x83, 0xb0, 0xb1, 0xb2, 0xb4, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbf, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, @@ -199,7 +199,7 @@ static const uint8_t rtl8188eu_rf_regs[] = { 0x1f, 0xfe, 0xfe, 0x1e, 0x1f, 0x00 }; -static const uint32_t rtl8188eu_rf_vals[] = { +static const uint32_t rtl8188e_rf_vals[] = { 0x30000, 0x84000, 0x00407, 0x00012, 0x80009, 0x00880, 0x1a060, 0x00000, 0x060c0, 0xd0000, 0xbe180, 0x01552, 0x00000, 0xff8fc, 0x54400, 0xccc19, 0x43003, 0x4953e, 0x1c718, 0x060ff, 0x80001, @@ -216,11 +216,11 @@ static const uint32_t rtl8188eu_rf_vals[] = { 0x0c350, 0x0c350, 0x00001, 0x80000, 0x33e60 }; -static const struct rtwn_rf_prog rtl8188eu_rf[] = { +static const struct rtwn_rf_prog rtl8188e_rf[] = { { - nitems(rtl8188eu_rf_regs), - rtl8188eu_rf_regs, - rtl8188eu_rf_vals, + nitems(rtl8188e_rf_regs), + rtl8188e_rf_regs, + rtl8188e_rf_vals, { 0 }, NULL }, diff --git a/freebsd/sys/dev/rtwn/rtl8188e/r88e_reg.h b/freebsd/sys/dev/rtwn/rtl8188e/r88e_reg.h index f6f26fa4..94b9a660 100644 --- a/freebsd/sys/dev/rtwn/rtl8188e/r88e_reg.h +++ b/freebsd/sys/dev/rtwn/rtl8188e/r88e_reg.h @@ -32,6 +32,7 @@ #define R88E_HISR 0x0b4 #define R88E_HIMRE 0x0b8 #define R88E_HISRE 0x0bc +#define R88E_XCK_OUT_CTRL 0x07c /* MAC General Configuration. */ #define R88E_32K_CTRL 0x194 #define R88E_HMEBOX_EXT(idx) (0x1f0 + (idx) * 4) @@ -45,16 +46,49 @@ /* Bits for R88E_HIMR. */ -#define R88E_HIMR_CPWM 0x00000100 -#define R88E_HIMR_CPWM2 0x00000200 -#define R88E_HIMR_TBDER 0x04000000 -#define R88E_HIMR_PSTIMEOUT 0x20000000 +#define R88E_HIMR_ROK 0x00000001 /* receive DMA OK */ +#define R88E_HIMR_RDU 0x00000002 /* Rx descriptor unavailable */ +#define R88E_HIMR_VODOK 0x00000004 /* AC_VO DMA OK */ +#define R88E_HIMR_VIDOK 0x00000008 /* AC_VI DMA OK */ +#define R88E_HIMR_BEDOK 0x00000010 /* AC_BE DMA OK */ +#define R88E_HIMR_BKDOK 0x00000020 /* AC_BK DMA OK */ +#define R88E_HIMR_MGNTDOK 0x00000040 /* management queue DMA OK */ +#define R88E_HIMR_HIGHDOK 0x00000080 /* high queue DMA OK */ +#define R88E_HIMR_CPWM 0x00000100 /* CPU power mode intr 1 */ +#define R88E_HIMR_CPWM2 0x00000200 /* CPU power mode intr 2 */ +#define R88E_HIMR_C2HCMD 0x00000400 /* C2H command interrupt */ +#define R88E_HIMR_HISR 0x00000800 /* (HISR & HIMR) != 0 */ +#define R88E_HIMR_ATIMEND 0x00001000 /* ATIM window end interrupt */ +#define R88E_HIMR_HSISR 0x00008000 /* (HSIMR & HSISR) != 0 */ +#define R88E_HIMR_BCNDERR 0x00010000 /* beacon queue DMA error */ +#define R88E_HIMR_BCNINT 0x00100000 /* beacon DMA interrupt 0 */ +#define R88E_HIMR_TSF32 0x01000000 /* TSF 32 bit interrupt */ +#define R88E_HIMR_TBDOK 0x02000000 /* beacon transmit OK */ +#define R88E_HIMR_TBDER 0x04000000 /* beacon transmit error */ +#define R88E_HIMR_GTIMER3 0x08000000 /* GTIMER3 interrupt */ +#define R88E_HIMR_GTIMER4 0x10000000 /* GTIMER4 interrupt */ +#define R88E_HIMR_PSTIMEOUT 0x20000000 /* powersave timeout */ +#define R88E_HIMR_TXRPT 0x40000000 /* Tx report interrupt */ /* Bits for R88E_HIMRE.*/ -#define R88E_HIMRE_RXFOVW 0x00000100 -#define R88E_HIMRE_TXFOVW 0x00000200 -#define R88E_HIMRE_RXERR 0x00000400 -#define R88E_HIMRE_TXERR 0x00000800 +#define R88E_HIMRE_RXFOVW 0x00000100 /* receive FIFO overflow */ +#define R88E_HIMRE_TXFOVW 0x00000200 /* transmit FIFO overflow */ +#define R88E_HIMRE_RXERR 0x00000400 /* receive error */ +#define R88E_HIMRE_TXERR 0x00000800 /* transmit error */ +#define R88E_HIMRE_BCNDOK1 0x00004000 /* beacon queue DMA OK (1) */ +#define R88E_HIMRE_BCNDOK2 0x00008000 /* beacon queue DMA OK (2) */ +#define R88E_HIMRE_BCNDOK3 0x00010000 /* beacon queue DMA OK (3) */ +#define R88E_HIMRE_BCNDOK4 0x00020000 /* beacon queue DMA OK (4) */ +#define R88E_HIMRE_BCNDOK5 0x00040000 /* beacon queue DMA OK (5) */ +#define R88E_HIMRE_BCNDOK6 0x00080000 /* beacon queue DMA OK (6) */ +#define R88E_HIMRE_BCNDOK7 0x00100000 /* beacon queue DMA OK (7) */ +#define R88E_HIMRE_BCNDMAINT1 0x00200000 /* beacon DMA interrupt 1 */ +#define R88E_HIMRE_BCNDMAINT2 0x00400000 /* beacon DMA interrupt 2 */ +#define R88E_HIMRE_BCNDMAINT3 0x00800000 /* beacon DMA interrupt 3 */ +#define R88E_HIMRE_BCNDMAINT4 0x01000000 /* beacon DMA interrupt 4 */ +#define R88E_HIMRE_BCNDMAINT5 0x02000000 /* beacon DMA interrupt 5 */ +#define R88E_HIMRE_BCNDMAINT6 0x04000000 /* beacon DMA interrupt 6 */ +#define R88E_HIMRE_BCNDMAINT7 0x08000000 /* beacon DMA interrupt 7 */ /* Bits for R88E_TX_RPT_CTRL. */ #define R88E_TX_RPT1_ENA 0x01 @@ -79,6 +113,7 @@ * RF (6052) registers. */ #define R88E_RF_T_METER 0x42 +#define R88E_RF_WE_LUT 0xef /* Bits for R92C_RF_CHNLBW. */ #define R88E_RF_CHNLBW_BW20 0x00c00 @@ -88,4 +123,7 @@ #define R88E_RF_T_METER_VAL_S 10 #define R88E_RF_T_METER_START 0x30000 +/* Bits for R88E_XCK_OUT_CTRL. */ +#define R88E_XCK_OUT_CTRL_EN 1 + #endif /* R88E_REG_H */ diff --git a/freebsd/sys/dev/rtwn/rtl8188e/r88e_rom.c b/freebsd/sys/dev/rtwn/rtl8188e/r88e_rom.c index e1337dba..3afa2910 100644 --- a/freebsd/sys/dev/rtwn/rtl8188e/r88e_rom.c +++ b/freebsd/sys/dev/rtwn/rtl8188e/r88e_rom.c @@ -83,5 +83,6 @@ r88e_parse_rom(struct rtwn_softc *sc, uint8_t *buf) __func__,rs->regulatory); sc->thermal_meter = rom->thermal_meter; - IEEE80211_ADDR_COPY(sc->sc_ic.ic_macaddr, rom->macaddr); + + rtwn_r92c_set_rom_opts(sc, buf); } diff --git a/freebsd/sys/dev/rtwn/rtl8188e/r88e_rom_image.h b/freebsd/sys/dev/rtwn/rtl8188e/r88e_rom_image.h index c80028e0..d5d97ffb 100644 --- a/freebsd/sys/dev/rtwn/rtl8188e/r88e_rom_image.h +++ b/freebsd/sys/dev/rtwn/rtl8188e/r88e_rom_image.h @@ -44,11 +44,24 @@ struct r88e_rom { uint8_t reserved4[3]; uint8_t rf_ant_opt; uint8_t reserved5[6]; - uint16_t vid; - uint16_t pid; - uint8_t usb_opt; - uint8_t reserved6[2]; - uint8_t macaddr[IEEE80211_ADDR_LEN]; + + union { + struct { + uint16_t vid; + uint16_t pid; + uint8_t usb_opt; + uint8_t reserved6[2]; + uint8_t macaddr[IEEE80211_ADDR_LEN]; + } __packed usb; + + struct { + uint8_t macaddr[IEEE80211_ADDR_LEN]; + uint16_t vid; + uint16_t pid; + uint8_t reserved6[3]; + } __packed pci; + } __packed diff_d0; + uint8_t reserved7[2]; uint8_t string[33]; /* "realtek 802.11n NIC" */ uint8_t reserved8[256]; diff --git a/freebsd/sys/dev/rtwn/rtl8188e/r88e_rx.c b/freebsd/sys/dev/rtwn/rtl8188e/r88e_rx.c index 53cc722f..eee2f63f 100644 --- a/freebsd/sys/dev/rtwn/rtl8188e/r88e_rx.c +++ b/freebsd/sys/dev/rtwn/rtl8188e/r88e_rx.c @@ -58,6 +58,25 @@ __FBSDID("$FreeBSD$"); #include <dev/rtwn/rtl8188e/r88e_rx_desc.h> +int +r88e_classify_intr(struct rtwn_softc *sc, void *buf, int len) +{ + struct r92c_rx_stat *stat = buf; + int report_sel = MS(le32toh(stat->rxdw3), R88E_RXDW3_RPT); + + switch (report_sel) { + case R88E_RXDW3_RPT_RX: + return (RTWN_RX_DATA); + case R88E_RXDW3_RPT_TX1: /* per-packet Tx report */ + case R88E_RXDW3_RPT_TX2: /* periodical Tx report */ + return (RTWN_RX_TX_REPORT); + case R88E_RXDW3_RPT_HIS: + return (RTWN_RX_OTHER); + default: /* shut up the compiler */ + return (RTWN_RX_DATA); + } +} + void r88e_ratectl_tx_complete(struct rtwn_softc *sc, uint8_t *buf, int len) { diff --git a/freebsd/sys/dev/rtwn/rtl8188e/usb/r88eu.h b/freebsd/sys/dev/rtwn/rtl8188e/usb/r88eu.h index 85b637cb..e79b1387 100644 --- a/freebsd/sys/dev/rtwn/rtl8188e/usb/r88eu.h +++ b/freebsd/sys/dev/rtwn/rtl8188e/usb/r88eu.h @@ -25,15 +25,21 @@ /* + * Global definitions. + */ +#define R88EU_PUBQ_NPAGES 142 +#define R88EU_TX_PAGE_COUNT 169 + + +/* * Function declarations. */ /* r88eu_init.c */ +void r88eu_init_bb(struct rtwn_softc *); +int r88eu_power_on(struct rtwn_softc *); void r88eu_power_off(struct rtwn_softc *); void r88eu_init_intr(struct rtwn_softc *); void r88eu_init_rx_agg(struct rtwn_softc *); void r88eu_post_init(struct rtwn_softc *); -/* r88eu_rx.c */ -int r88eu_classify_intr(struct rtwn_softc *, void *, int); - #endif /* RTL8188EU_H */ diff --git a/freebsd/sys/dev/rtwn/rtl8188e/usb/r88eu_attach.c b/freebsd/sys/dev/rtwn/rtl8188e/usb/r88eu_attach.c index 73cc7856..6b153bcd 100644 --- a/freebsd/sys/dev/rtwn/rtl8188e/usb/r88eu_attach.c +++ b/freebsd/sys/dev/rtwn/rtl8188e/usb/r88eu_attach.c @@ -59,6 +59,7 @@ __FBSDID("$FreeBSD$"); #include <dev/rtwn/rtl8192c/usb/r92cu_tx_desc.h> #include <dev/rtwn/rtl8188e/r88e_priv.h> +#include <dev/rtwn/rtl8188e/r88e_rom_image.h> /* for 'macaddr' field */ #include <dev/rtwn/rtl8188e/usb/r88eu.h> @@ -68,6 +69,14 @@ static struct rtwn_r88e_txpwr r88e_txpwr; void r88eu_attach(struct rtwn_usb_softc *); static void +r88eu_set_macaddr(struct rtwn_softc *sc, uint8_t *buf) +{ + struct r88e_rom *rom = (struct r88e_rom *)buf; + + IEEE80211_ADDR_COPY(sc->sc_ic.ic_macaddr, rom->diff_d0.usb.macaddr); +} + +static void r88e_postattach(struct rtwn_softc *sc) { struct r92c_softc *rs = sc->sc_priv; @@ -95,7 +104,7 @@ r88eu_attach_private(struct rtwn_softc *sc) rs->rs_tx_enable_ampdu = r88e_tx_enable_ampdu; rs->rs_tx_setup_hwseq = r88e_tx_setup_hwseq; rs->rs_tx_setup_macid = r88e_tx_setup_macid; - rs->rs_set_name = rtwn_nop_softc; /* not used */ + rs->rs_set_rom_opts = r88eu_set_macaddr; rs->rf_read_delay[0] = 10; rs->rf_read_delay[1] = 100; @@ -132,7 +141,7 @@ r88eu_attach(struct rtwn_usb_softc *uc) sc->sc_get_rx_stats = r88e_get_rx_stats; sc->sc_get_rssi_cck = r88e_get_rssi_cck; sc->sc_get_rssi_ofdm = r88e_get_rssi_ofdm; - sc->sc_classify_intr = r88eu_classify_intr; + sc->sc_classify_intr = r88e_classify_intr; sc->sc_handle_tx_report = r88e_ratectl_tx_complete; sc->sc_handle_c2h_report = r88e_handle_c2h_report; sc->sc_check_frame = rtwn_nop_int_softc_mbuf; @@ -142,7 +151,7 @@ r88eu_attach(struct rtwn_usb_softc *uc) sc->sc_efuse_postread = rtwn_nop_softc; sc->sc_parse_rom = r88e_parse_rom; sc->sc_set_led = r88e_set_led; - sc->sc_power_on = r88e_power_on; + sc->sc_power_on = r88eu_power_on; sc->sc_power_off = r88eu_power_off; #ifndef RTWN_WITHOUT_UCODE sc->sc_fw_reset = r88e_fw_reset; @@ -151,7 +160,7 @@ r88eu_attach(struct rtwn_usb_softc *uc) sc->sc_llt_init = r92c_llt_init; sc->sc_set_page_size = r92c_set_page_size; sc->sc_lc_calib = r92c_lc_calib; - sc->sc_iq_calib = r88e_iq_calib; /* XXX TODO */ + sc->sc_iq_calib = r88e_iq_calib; sc->sc_read_chipid_vendor = rtwn_nop_softc_uint32; sc->sc_adj_devcaps = r88eu_adj_devcaps; sc->sc_vap_preattach = rtwn_nop_softc_vap; @@ -174,29 +183,29 @@ r88eu_attach(struct rtwn_usb_softc *uc) sc->sc_init_ampdu = rtwn_nop_softc; sc->sc_init_intr = r88eu_init_intr; sc->sc_init_edca = r92c_init_edca; - sc->sc_init_bb = r88e_init_bb; + sc->sc_init_bb = r88eu_init_bb; sc->sc_init_rf = r92c_init_rf; sc->sc_init_antsel = rtwn_nop_softc; sc->sc_post_init = r88eu_post_init; sc->sc_init_bcnq1_boundary = rtwn_nop_int_softc; - sc->mac_prog = &rtl8188eu_mac[0]; - sc->mac_size = nitems(rtl8188eu_mac); - sc->bb_prog = &rtl8188eu_bb[0]; - sc->bb_size = nitems(rtl8188eu_bb); - sc->agc_prog = &rtl8188eu_agc[0]; - sc->agc_size = nitems(rtl8188eu_agc); - sc->rf_prog = &rtl8188eu_rf[0]; + sc->mac_prog = &rtl8188e_mac[0]; + sc->mac_size = nitems(rtl8188e_mac); + sc->bb_prog = &rtl8188e_bb[0]; + sc->bb_size = nitems(rtl8188e_bb); + sc->agc_prog = &rtl8188e_agc[0]; + sc->agc_size = nitems(rtl8188e_agc); + sc->rf_prog = &rtl8188e_rf[0]; sc->name = "RTL8188EU"; sc->fwname = "rtwn-rtl8188eufw"; sc->fwsig = 0x88e; - sc->page_count = R88E_TX_PAGE_COUNT; + sc->page_count = R88EU_TX_PAGE_COUNT; sc->pktbuf_count = R88E_TXPKTBUF_COUNT; sc->ackto = 0x40; - sc->npubqpages = R88E_PUBQ_NPAGES; + sc->npubqpages = R88EU_PUBQ_NPAGES; sc->page_size = R92C_TX_PAGE_SIZE; sc->txdesc_len = sizeof(struct r92cu_tx_desc); diff --git a/freebsd/sys/dev/rtwn/rtl8188e/usb/r88eu_init.c b/freebsd/sys/dev/rtwn/rtl8188e/usb/r88eu_init.c index 2c776671..bfe3681e 100644 --- a/freebsd/sys/dev/rtwn/rtl8188e/usb/r88eu_init.c +++ b/freebsd/sys/dev/rtwn/rtl8188e/usb/r88eu_init.c @@ -57,6 +57,87 @@ __FBSDID("$FreeBSD$"); void +r88eu_init_bb(struct rtwn_softc *sc) +{ + + /* Enable BB and RF. */ + rtwn_setbits_2(sc, R92C_SYS_FUNC_EN, 0, + R92C_SYS_FUNC_EN_BBRSTB | R92C_SYS_FUNC_EN_BB_GLB_RST | + R92C_SYS_FUNC_EN_DIO_RF); + + rtwn_write_1(sc, R92C_RF_CTRL, + R92C_RF_CTRL_EN | R92C_RF_CTRL_RSTB | R92C_RF_CTRL_SDMRSTB); + rtwn_write_1(sc, R92C_SYS_FUNC_EN, + R92C_SYS_FUNC_EN_USBA | R92C_SYS_FUNC_EN_USBD | + R92C_SYS_FUNC_EN_BB_GLB_RST | R92C_SYS_FUNC_EN_BBRSTB); + + r88e_init_bb_common(sc); +} + +int +r88eu_power_on(struct rtwn_softc *sc) +{ +#define RTWN_CHK(res) do { \ + if (res != 0) \ + return (EIO); \ +} while(0) + int ntries; + + /* Wait for power ready bit. */ + for (ntries = 0; ntries < 5000; ntries++) { + if (rtwn_read_4(sc, R92C_APS_FSMCO) & R92C_APS_FSMCO_SUS_HOST) + break; + rtwn_delay(sc, 10); + } + if (ntries == 5000) { + device_printf(sc->sc_dev, + "timeout waiting for chip power up\n"); + return (ETIMEDOUT); + } + + /* Reset BB. */ + RTWN_CHK(rtwn_setbits_1(sc, R92C_SYS_FUNC_EN, + R92C_SYS_FUNC_EN_BBRSTB | R92C_SYS_FUNC_EN_BB_GLB_RST, 0)); + + RTWN_CHK(rtwn_setbits_1(sc, R92C_AFE_XTAL_CTRL + 2, 0, 0x80)); + + /* Disable HWPDN. */ + RTWN_CHK(rtwn_setbits_1_shift(sc, R92C_APS_FSMCO, + R92C_APS_FSMCO_APDM_HPDN, 0, 1)); + + /* Disable WL suspend. */ + RTWN_CHK(rtwn_setbits_1_shift(sc, R92C_APS_FSMCO, + R92C_APS_FSMCO_AFSM_HSUS | R92C_APS_FSMCO_AFSM_PCIE, 0, 1)); + + RTWN_CHK(rtwn_setbits_1_shift(sc, R92C_APS_FSMCO, + 0, R92C_APS_FSMCO_APFM_ONMAC, 1)); + for (ntries = 0; ntries < 5000; ntries++) { + if (!(rtwn_read_2(sc, R92C_APS_FSMCO) & + R92C_APS_FSMCO_APFM_ONMAC)) + break; + rtwn_delay(sc, 10); + } + if (ntries == 5000) + return (ETIMEDOUT); + + /* Enable LDO normal mode. */ + RTWN_CHK(rtwn_setbits_1(sc, R92C_LPLDO_CTRL, + R92C_LPLDO_CTRL_SLEEP, 0)); + + /* Enable MAC DMA/WMAC/SCHEDULE/SEC blocks. */ + RTWN_CHK(rtwn_write_2(sc, R92C_CR, 0)); + RTWN_CHK(rtwn_setbits_2(sc, R92C_CR, 0, + R92C_CR_HCI_TXDMA_EN | R92C_CR_TXDMA_EN | + R92C_CR_HCI_RXDMA_EN | R92C_CR_RXDMA_EN | + R92C_CR_PROTOCOL_EN | R92C_CR_SCHEDULE_EN | + ((sc->sc_hwcrypto != RTWN_CRYPTO_SW) ? R92C_CR_ENSEC : 0) | + R92C_CR_CALTMR_EN)); + + return (0); +#undef RTWN_CHK +} + +void r88eu_power_off(struct rtwn_softc *sc) { uint8_t reg; diff --git a/freebsd/sys/dev/rtwn/rtl8192c/pci/r92ce.h b/freebsd/sys/dev/rtwn/rtl8192c/pci/r92ce.h index 93379f8b..5d13f160 100644 --- a/freebsd/sys/dev/rtwn/rtl8192c/pci/r92ce.h +++ b/freebsd/sys/dev/rtwn/rtl8192c/pci/r92ce.h @@ -60,7 +60,7 @@ void r92ce_post_init(struct rtwn_softc *); void r92ce_set_led(struct rtwn_softc *, int, int); /* r92ce_rx.c */ -int r92ce_classify_intr(struct rtwn_softc *, void *, int); +int r92ce_get_intr_status(struct rtwn_pci_softc *, int *); void r92ce_enable_intr(struct rtwn_pci_softc *); void r92ce_start_xfers(struct rtwn_softc *); diff --git a/freebsd/sys/dev/rtwn/rtl8192c/pci/r92ce_attach.c b/freebsd/sys/dev/rtwn/rtl8192c/pci/r92ce_attach.c index e43d3ac0..61791c41 100644 --- a/freebsd/sys/dev/rtwn/rtl8192c/pci/r92ce_attach.c +++ b/freebsd/sys/dev/rtwn/rtl8192c/pci/r92ce_attach.c @@ -94,7 +94,7 @@ r92ce_postattach(struct rtwn_softc *sc) } static void -r92ce_set_name(struct rtwn_softc *sc) +r92ce_set_name(struct rtwn_softc *sc, uint8_t *buf) { struct r92c_softc *rs = sc->sc_priv; @@ -119,7 +119,7 @@ r92ce_attach_private(struct rtwn_softc *sc) rs->rs_tx_enable_ampdu = r92c_tx_enable_ampdu; rs->rs_tx_setup_hwseq = r92c_tx_setup_hwseq; rs->rs_tx_setup_macid = r92c_tx_setup_macid; - rs->rs_set_name = r92ce_set_name; + rs->rs_set_rom_opts = r92ce_set_name; /* XXX TODO: test with net80211 ratectl! */ #ifndef RTWN_WITHOUT_UCODE @@ -157,6 +157,7 @@ r92ce_attach(struct rtwn_pci_softc *pc) pc->pc_tx_postsetup = r92ce_tx_postsetup; pc->pc_copy_tx_desc = r92ce_copy_tx_desc; pc->pc_enable_intr = r92ce_enable_intr; + pc->pc_get_intr_status = r92ce_get_intr_status; pc->pc_qmap = 0xf771; pc->tcr = @@ -177,7 +178,7 @@ r92ce_attach(struct rtwn_pci_softc *pc) sc->sc_get_rx_stats = r92c_get_rx_stats; sc->sc_get_rssi_cck = r92c_get_rssi_cck; sc->sc_get_rssi_ofdm = r92c_get_rssi_ofdm; - sc->sc_classify_intr = r92ce_classify_intr; + sc->sc_classify_intr = r92c_classify_intr; sc->sc_handle_tx_report = rtwn_nop_softc_uint8_int; sc->sc_handle_c2h_report = rtwn_nop_softc_uint8_int; sc->sc_check_frame = rtwn_nop_int_softc_mbuf; diff --git a/freebsd/sys/dev/rtwn/rtl8192c/pci/r92ce_calib.c b/freebsd/sys/dev/rtwn/rtl8192c/pci/r92ce_calib.c index 070f6e1d..f1cd42e8 100644 --- a/freebsd/sys/dev/rtwn/rtl8192c/pci/r92ce_calib.c +++ b/freebsd/sys/dev/rtwn/rtl8192c/pci/r92ce_calib.c @@ -77,55 +77,58 @@ r92ce_iq_calib_chain(struct rtwn_softc *sc, int chain, uint16_t tx[2], uint16_t rx[2]) { uint32_t status; - int offset = chain * 0x20; if (chain == 0) { /* IQ calibration for chain 0. */ /* IQ calibration settings for chain 0. */ - rtwn_bb_write(sc, 0xe30, 0x10008c1f); - rtwn_bb_write(sc, 0xe34, 0x10008c1f); - rtwn_bb_write(sc, 0xe38, 0x82140102); + rtwn_bb_write(sc, R92C_TX_IQK_TONE(0), 0x10008c1f); + rtwn_bb_write(sc, R92C_RX_IQK_TONE(0), 0x10008c1f); + rtwn_bb_write(sc, R92C_TX_IQK_PI(0), 0x82140102); if (sc->ntxchains > 1) { - rtwn_bb_write(sc, 0xe3c, 0x28160202); /* 2T */ + rtwn_bb_write(sc, R92C_RX_IQK_PI(0), 0x28160202); /* IQ calibration settings for chain 1. */ - rtwn_bb_write(sc, 0xe50, 0x10008c22); - rtwn_bb_write(sc, 0xe54, 0x10008c22); - rtwn_bb_write(sc, 0xe58, 0x82140102); - rtwn_bb_write(sc, 0xe5c, 0x28160202); + rtwn_bb_write(sc, R92C_TX_IQK_TONE(1), 0x10008c22); + rtwn_bb_write(sc, R92C_RX_IQK_TONE(1), 0x10008c22); + rtwn_bb_write(sc, R92C_TX_IQK_PI(1), 0x82140102); + rtwn_bb_write(sc, R92C_RX_IQK_PI(1), 0x28160202); } else - rtwn_bb_write(sc, 0xe3c, 0x28160502); /* 1T */ + rtwn_bb_write(sc, R92C_RX_IQK_PI(0), 0x28160502); /* LO calibration settings. */ - rtwn_bb_write(sc, 0xe4c, 0x001028d1); + rtwn_bb_write(sc, R92C_IQK_AGC_RSP, 0x001028d1); /* We're doing LO and IQ calibration in one shot. */ - rtwn_bb_write(sc, 0xe48, 0xf9000000); - rtwn_bb_write(sc, 0xe48, 0xf8000000); + rtwn_bb_write(sc, R92C_IQK_AGC_PTS, 0xf9000000); + rtwn_bb_write(sc, R92C_IQK_AGC_PTS, 0xf8000000); } else { /* IQ calibration for chain 1. */ /* We're doing LO and IQ calibration in one shot. */ - rtwn_bb_write(sc, 0xe60, 0x00000002); - rtwn_bb_write(sc, 0xe60, 0x00000000); + rtwn_bb_write(sc, R92C_IQK_AGC_CONT, 2); + rtwn_bb_write(sc, R92C_IQK_AGC_CONT, 0); } /* Give LO and IQ calibrations the time to complete. */ rtwn_delay(sc, 1000); /* Read IQ calibration status. */ - status = rtwn_bb_read(sc, 0xeac); + status = rtwn_bb_read(sc, R92C_RX_POWER_IQK_AFTER(0)); if (status & (1 << (28 + chain * 3))) return (0); /* Tx failed. */ /* Read Tx IQ calibration results. */ - tx[0] = (rtwn_bb_read(sc, 0xe94 + offset) >> 16) & 0x3ff; - tx[1] = (rtwn_bb_read(sc, 0xe9c + offset) >> 16) & 0x3ff; + tx[0] = MS(rtwn_bb_read(sc, R92C_TX_POWER_IQK_BEFORE(chain)), + R92C_POWER_IQK_RESULT); + tx[1] = MS(rtwn_bb_read(sc, R92C_TX_POWER_IQK_AFTER(chain)), + R92C_POWER_IQK_RESULT); if (tx[0] == 0x142 || tx[1] == 0x042) return (0); /* Tx failed. */ if (status & (1 << (27 + chain * 3))) return (1); /* Rx failed. */ /* Read Rx IQ calibration results. */ - rx[0] = (rtwn_bb_read(sc, 0xea4 + offset) >> 16) & 0x3ff; - rx[1] = (rtwn_bb_read(sc, 0xeac + offset) >> 16) & 0x3ff; + rx[0] = MS(rtwn_bb_read(sc, R92C_RX_POWER_IQK_BEFORE(chain)), + R92C_POWER_IQK_RESULT); + rx[1] = MS(rtwn_bb_read(sc, R92C_RX_POWER_IQK_AFTER(chain)), + R92C_POWER_IQK_RESULT); if (rx[0] == 0x132 || rx[1] == 0x036) return (1); /* Rx failed. */ @@ -202,18 +205,18 @@ r92ce_iq_calib_run(struct rtwn_softc *sc, int n, uint16_t tx[2][2], if (sc->ntxchains > 1) rtwn_bb_write(sc, 0x0b6c, 0x00080000); - rtwn_bb_write(sc, 0x0e28, 0x80800000); - rtwn_bb_write(sc, 0x0e40, 0x01007c00); - rtwn_bb_write(sc, 0x0e44, 0x01004800); + rtwn_bb_write(sc, R92C_FPGA0_IQK, 0x80800000); + rtwn_bb_write(sc, R92C_TX_IQK, 0x01007c00); + rtwn_bb_write(sc, R92C_RX_IQK, 0x01004800); rtwn_bb_write(sc, 0x0b68, 0x00080000); for (chain = 0; chain < sc->ntxchains; chain++) { if (chain > 0) { /* Put chain 0 on standby. */ - rtwn_bb_write(sc, 0x0e28, 0x00); + rtwn_bb_write(sc, R92C_FPGA0_IQK, 0); rtwn_bb_write(sc, R92C_LSSI_PARAM(0), 0x00010000); - rtwn_bb_write(sc, 0x0e28, 0x80800000); + rtwn_bb_write(sc, R92C_FPGA0_IQK, 0x80800000); /* Enable chain 1. */ for (i = 0; i < nitems(reg_adda); i++) @@ -259,7 +262,7 @@ r92ce_iq_calib_run(struct rtwn_softc *sc, int n, uint16_t tx[2][2], vals->fpga0_rfifacesw1); rtwn_bb_write(sc, R92C_OFDM0_TRMUXPAR, vals->ofdm0_trmuxpar); - rtwn_bb_write(sc, 0x0e28, 0x00); + rtwn_bb_write(sc, R92C_FPGA0_IQK, 0); rtwn_bb_write(sc, R92C_LSSI_PARAM(0), 0x00032ed3); if (sc->ntxchains > 1) rtwn_bb_write(sc, R92C_LSSI_PARAM(1), 0x00032ed3); diff --git a/freebsd/sys/dev/rtwn/rtl8192c/pci/r92ce_rx.c b/freebsd/sys/dev/rtwn/rtl8192c/pci/r92ce_rx.c index 203b0bf8..0f65d1bf 100644 --- a/freebsd/sys/dev/rtwn/rtl8192c/pci/r92ce_rx.c +++ b/freebsd/sys/dev/rtwn/rtl8192c/pci/r92ce_rx.c @@ -60,10 +60,10 @@ __FBSDID("$FreeBSD$"); int -r92ce_classify_intr(struct rtwn_softc *sc, void *arg, int len __unused) +r92ce_get_intr_status(struct rtwn_pci_softc *pc, int *rings) { + struct rtwn_softc *sc = &pc->pc_sc; uint32_t status; - int *rings = arg; int ret; *rings = 0; diff --git a/freebsd/sys/dev/rtwn/rtl8192c/r92c.h b/freebsd/sys/dev/rtwn/rtl8192c/r92c.h index d8f7afc8..f215e34f 100644 --- a/freebsd/sys/dev/rtwn/rtl8192c/r92c.h +++ b/freebsd/sys/dev/rtwn/rtl8192c/r92c.h @@ -102,6 +102,7 @@ void r92c_efuse_postread(struct rtwn_softc *); void r92c_parse_rom(struct rtwn_softc *, uint8_t *); /* r92c_rx.c */ +int r92c_classify_intr(struct rtwn_softc *, void *, int); int8_t r92c_get_rssi_cck(struct rtwn_softc *, void *); int8_t r92c_get_rssi_ofdm(struct rtwn_softc *, void *); uint8_t r92c_rx_radiotap_flags(const void *); diff --git a/freebsd/sys/dev/rtwn/rtl8192c/r92c_calib.c b/freebsd/sys/dev/rtwn/rtl8192c/r92c_calib.c index 6c18a606..ebb4f5cf 100644 --- a/freebsd/sys/dev/rtwn/rtl8192c/r92c_calib.c +++ b/freebsd/sys/dev/rtwn/rtl8192c/r92c_calib.c @@ -48,16 +48,366 @@ __FBSDID("$FreeBSD$"); #include <dev/rtwn/if_rtwnreg.h> #include <dev/rtwn/if_rtwnvar.h> +#include <dev/rtwn/if_rtwn_debug.h> #include <dev/rtwn/rtl8192c/r92c.h> #include <dev/rtwn/rtl8192c/r92c_reg.h> +/* Registers to save and restore during IQ calibration. */ +struct r92c_iq_cal_reg_vals { + uint32_t adda[16]; + uint8_t txpause; + uint8_t bcn_ctrl[2]; + uint32_t gpio_muxcfg; + uint32_t cck0_afesetting; + uint32_t ofdm0_trxpathena; + uint32_t ofdm0_trmuxpar; + uint32_t fpga0_rfifacesw0; + uint32_t fpga0_rfifacesw1; + uint32_t fpga0_rfifaceoe0; + uint32_t fpga0_rfifaceoe1; + uint32_t config_ant0; + uint32_t config_ant1; +}; + +/* XXX TODO: merge */ +static int +r92c_iq_calib_chain(struct rtwn_softc *sc, int chain, uint16_t tx[2], + uint16_t rx[2]) +{ + uint32_t status; + + if (chain == 0) { /* IQ calibration for chain 0. */ + /* IQ calibration settings for chain 0. */ + rtwn_bb_write(sc, R92C_TX_IQK_TONE(0), 0x10008c1f); + rtwn_bb_write(sc, R92C_RX_IQK_TONE(0), 0x10008c1f); + rtwn_bb_write(sc, R92C_TX_IQK_PI(0), 0x82140102); + + if (sc->ntxchains > 1) { + rtwn_bb_write(sc, R92C_RX_IQK_PI(0), 0x28160202); + /* IQ calibration settings for chain 1. */ + rtwn_bb_write(sc, R92C_TX_IQK_TONE(1), 0x10008c22); + rtwn_bb_write(sc, R92C_RX_IQK_TONE(1), 0x10008c22); + rtwn_bb_write(sc, R92C_TX_IQK_PI(1), 0x82140102); + rtwn_bb_write(sc, R92C_RX_IQK_PI(1), 0x28160202); + } else + rtwn_bb_write(sc, R92C_RX_IQK_PI(0), 0x28160502); + + /* LO calibration settings. */ + rtwn_bb_write(sc, R92C_IQK_AGC_RSP, 0x001028d1); + /* We're doing LO and IQ calibration in one shot. */ + rtwn_bb_write(sc, R92C_IQK_AGC_PTS, 0xf9000000); + rtwn_bb_write(sc, R92C_IQK_AGC_PTS, 0xf8000000); + + } else { /* IQ calibration for chain 1. */ + /* We're doing LO and IQ calibration in one shot. */ + rtwn_bb_write(sc, R92C_IQK_AGC_CONT, 2); + rtwn_bb_write(sc, R92C_IQK_AGC_CONT, 0); + } + + /* Give LO and IQ calibrations the time to complete. */ + rtwn_delay(sc, 10000); + + /* Read IQ calibration status. */ + status = rtwn_bb_read(sc, R92C_RX_POWER_IQK_AFTER(0)); + + if (status & (1 << (28 + chain * 3))) + return (0); /* Tx failed. */ + /* Read Tx IQ calibration results. */ + tx[0] = MS(rtwn_bb_read(sc, R92C_TX_POWER_IQK_BEFORE(chain)), + R92C_POWER_IQK_RESULT); + tx[1] = MS(rtwn_bb_read(sc, R92C_TX_POWER_IQK_AFTER(chain)), + R92C_POWER_IQK_RESULT); + if (tx[0] == 0x142 || tx[1] == 0x042) + return (0); /* Tx failed. */ + + if (status & (1 << (27 + chain * 3))) + return (1); /* Rx failed. */ + /* Read Rx IQ calibration results. */ + rx[0] = MS(rtwn_bb_read(sc, R92C_RX_POWER_IQK_BEFORE(chain)), + R92C_POWER_IQK_RESULT); + rx[1] = MS(rtwn_bb_read(sc, R92C_RX_POWER_IQK_AFTER(chain)), + R92C_POWER_IQK_RESULT); + if (rx[0] == 0x132 || rx[1] == 0x036) + return (1); /* Rx failed. */ + + return (3); /* Both Tx and Rx succeeded. */ +} + +static void +r92c_iq_calib_run(struct rtwn_softc *sc, int n, uint16_t tx[2][2], + uint16_t rx[2][2], struct r92c_iq_cal_reg_vals *vals) +{ + /* Registers to save and restore during IQ calibration. */ + static const uint16_t reg_adda[16] = { + 0x85c, 0xe6c, 0xe70, 0xe74, + 0xe78, 0xe7c, 0xe80, 0xe84, + 0xe88, 0xe8c, 0xed0, 0xed4, + 0xed8, 0xedc, 0xee0, 0xeec + }; + int i, chain; + uint32_t hssi_param1; + + if (n == 0) { + for (i = 0; i < nitems(reg_adda); i++) + vals->adda[i] = rtwn_bb_read(sc, reg_adda[i]); + + vals->txpause = rtwn_read_1(sc, R92C_TXPAUSE); + vals->bcn_ctrl[0] = rtwn_read_1(sc, R92C_BCN_CTRL(0)); + vals->bcn_ctrl[1] = rtwn_read_1(sc, R92C_BCN_CTRL(1)); + vals->gpio_muxcfg = rtwn_read_4(sc, R92C_GPIO_MUXCFG); + } + + if (sc->ntxchains == 1) { + rtwn_bb_write(sc, reg_adda[0], 0x0b1b25a0); + for (i = 1; i < nitems(reg_adda); i++) + rtwn_bb_write(sc, reg_adda[i], 0x0bdb25a0); + } else { + for (i = 0; i < nitems(reg_adda); i++) + rtwn_bb_write(sc, reg_adda[i], 0x04db25a4); + } + + hssi_param1 = rtwn_bb_read(sc, R92C_HSSI_PARAM1(0)); + if (!(hssi_param1 & R92C_HSSI_PARAM1_PI)) { + rtwn_bb_write(sc, R92C_HSSI_PARAM1(0), + hssi_param1 | R92C_HSSI_PARAM1_PI); + rtwn_bb_write(sc, R92C_HSSI_PARAM1(1), + hssi_param1 | R92C_HSSI_PARAM1_PI); + } + + if (n == 0) { + vals->cck0_afesetting = rtwn_bb_read(sc, R92C_CCK0_AFESETTING); + vals->ofdm0_trxpathena = + rtwn_bb_read(sc, R92C_OFDM0_TRXPATHENA); + vals->ofdm0_trmuxpar = rtwn_bb_read(sc, R92C_OFDM0_TRMUXPAR); + vals->fpga0_rfifacesw0 = + rtwn_bb_read(sc, R92C_FPGA0_RFIFACESW(0)); + vals->fpga0_rfifacesw1 = + rtwn_bb_read(sc, R92C_FPGA0_RFIFACESW(1)); + vals->fpga0_rfifaceoe0 = + rtwn_bb_read(sc, R92C_FPGA0_RFIFACEOE(0)); + vals->fpga0_rfifaceoe1 = + rtwn_bb_read(sc, R92C_FPGA0_RFIFACEOE(1)); + vals->config_ant0 = rtwn_bb_read(sc, R92C_CONFIG_ANT(0)); + vals->config_ant1 = rtwn_bb_read(sc, R92C_CONFIG_ANT(1)); + } + + rtwn_bb_setbits(sc, R92C_CCK0_AFESETTING, 0, 0x0f000000); + rtwn_bb_write(sc, R92C_OFDM0_TRXPATHENA, 0x03a05600); + rtwn_bb_write(sc, R92C_OFDM0_TRMUXPAR, 0x000800e4); + rtwn_bb_write(sc, R92C_FPGA0_RFIFACESW(1), 0x22204000); + rtwn_bb_setbits(sc, R92C_FPGA0_RFIFACESW(0), 0, 0x04000400); + rtwn_bb_setbits(sc, R92C_FPGA0_RFIFACEOE(0), 0x400, 0); + rtwn_bb_setbits(sc, R92C_FPGA0_RFIFACEOE(1), 0x400, 0); + + if (sc->ntxchains > 1) { + rtwn_bb_write(sc, R92C_LSSI_PARAM(0), 0x00010000); + rtwn_bb_write(sc, R92C_LSSI_PARAM(1), 0x00010000); + } + + rtwn_write_1(sc, R92C_TXPAUSE, + R92C_TX_QUEUE_AC | R92C_TX_QUEUE_MGT | R92C_TX_QUEUE_HIGH); + rtwn_write_1(sc, R92C_BCN_CTRL(0), + vals->bcn_ctrl[0] & ~R92C_BCN_CTRL_EN_BCN); + rtwn_write_1(sc, R92C_BCN_CTRL(1), + vals->bcn_ctrl[1] & ~R92C_BCN_CTRL_EN_BCN); + rtwn_write_1(sc, R92C_GPIO_MUXCFG, + vals->gpio_muxcfg & ~R92C_GPIO_MUXCFG_ENBT); + + rtwn_bb_write(sc, R92C_CONFIG_ANT(0), 0x00080000); + if (sc->ntxchains > 1) + rtwn_bb_write(sc, R92C_CONFIG_ANT(1), 0x00080000); + + rtwn_bb_write(sc, R92C_FPGA0_IQK, 0x80800000); + rtwn_bb_write(sc, R92C_TX_IQK, 0x01007c00); + rtwn_bb_write(sc, R92C_RX_IQK, 0x01004800); + + for (chain = 0; chain < sc->ntxchains; chain++) { + if (chain > 0) { + /* Put chain 0 on standby. */ + rtwn_bb_write(sc, R92C_FPGA0_IQK, 0); + rtwn_bb_write(sc, R92C_LSSI_PARAM(0), 0x00010000); + rtwn_bb_write(sc, R92C_FPGA0_IQK, 0x80800000); + + /* Enable chain 1. */ + for (i = 0; i < nitems(reg_adda); i++) + rtwn_bb_write(sc, reg_adda[i], 0x0b1b25a4); + } + + /* Run IQ calibration twice. */ + for (i = 0; i < 2; i++) { + int ret; + + ret = r92c_iq_calib_chain(sc, chain, + tx[chain], rx[chain]); + if (ret == 0) { + RTWN_DPRINTF(sc, RTWN_DEBUG_CALIB, + "%s: chain %d: Tx failed.\n", + __func__, chain); + tx[chain][0] = 0xff; + tx[chain][1] = 0xff; + rx[chain][0] = 0xff; + rx[chain][1] = 0xff; + } else if (ret == 1) { + RTWN_DPRINTF(sc, RTWN_DEBUG_CALIB, + "%s: chain %d: Rx failed.\n", + __func__, chain); + rx[chain][0] = 0xff; + rx[chain][1] = 0xff; + } else if (ret == 3) { + RTWN_DPRINTF(sc, RTWN_DEBUG_CALIB, + "%s: chain %d: Both Tx and Rx " + "succeeded.\n", __func__, chain); + } + } + + RTWN_DPRINTF(sc, RTWN_DEBUG_CALIB, + "%s: results for run %d chain %d: tx[0] 0x%x, " + "tx[1] 0x%x, rx[0] 0x%x, rx[1] 0x%x\n", __func__, n, chain, + tx[chain][0], tx[chain][1], rx[chain][0], rx[chain][1]); + } + + rtwn_bb_write(sc, R92C_CCK0_AFESETTING, vals->cck0_afesetting); + rtwn_bb_write(sc, R92C_OFDM0_TRXPATHENA, vals->ofdm0_trxpathena); + rtwn_bb_write(sc, R92C_FPGA0_RFIFACESW(0), vals->fpga0_rfifacesw0); + rtwn_bb_write(sc, R92C_FPGA0_RFIFACESW(1), vals->fpga0_rfifacesw1); + rtwn_bb_write(sc, R92C_OFDM0_TRMUXPAR, vals->ofdm0_trmuxpar); + rtwn_bb_write(sc, R92C_FPGA0_RFIFACEOE(0), vals->fpga0_rfifaceoe0); + rtwn_bb_write(sc, R92C_FPGA0_RFIFACEOE(1), vals->fpga0_rfifaceoe1); + rtwn_bb_write(sc, R92C_CONFIG_ANT(0), vals->config_ant0); + rtwn_bb_write(sc, R92C_CONFIG_ANT(1), vals->config_ant1); + + rtwn_bb_write(sc, R92C_FPGA0_IQK, 0); + rtwn_bb_write(sc, R92C_LSSI_PARAM(0), 0x00032ed3); + if (sc->ntxchains > 1) + rtwn_bb_write(sc, R92C_LSSI_PARAM(1), 0x00032ed3); + + if (n != 0) { + if (!(hssi_param1 & R92C_HSSI_PARAM1_PI)) { + rtwn_bb_write(sc, R92C_HSSI_PARAM1(0), hssi_param1); + rtwn_bb_write(sc, R92C_HSSI_PARAM1(1), hssi_param1); + } + + for (i = 0; i < nitems(reg_adda); i++) + rtwn_bb_write(sc, reg_adda[i], vals->adda[i]); + + rtwn_write_1(sc, R92C_TXPAUSE, vals->txpause); + rtwn_write_1(sc, R92C_BCN_CTRL(0), vals->bcn_ctrl[0]); + rtwn_write_1(sc, R92C_BCN_CTRL(1), vals->bcn_ctrl[1]); + rtwn_write_4(sc, R92C_GPIO_MUXCFG, vals->gpio_muxcfg); + + rtwn_bb_write(sc, R92C_TX_IQK_TONE(0), 0x01008c00); + rtwn_bb_write(sc, R92C_RX_IQK_TONE(0), 0x01008c00); + } +} + +#define RTWN_IQ_CAL_MAX_TOLERANCE 5 +static int +r92c_iq_calib_compare_results(struct rtwn_softc *sc, uint16_t tx1[2][2], + uint16_t rx1[2][2], uint16_t tx2[2][2], uint16_t rx2[2][2]) +{ + int chain, i, tx_ok[2], rx_ok[2]; + + tx_ok[0] = tx_ok[1] = rx_ok[0] = rx_ok[1] = 0; + for (chain = 0; chain < sc->ntxchains; chain++) { + for (i = 0; i < 2; i++) { + if (tx1[chain][i] == 0xff || tx2[chain][i] == 0xff || + rx1[chain][i] == 0xff || rx2[chain][i] == 0xff) + continue; + + tx_ok[chain] = (abs(tx1[chain][i] - tx2[chain][i]) <= + RTWN_IQ_CAL_MAX_TOLERANCE); + + rx_ok[chain] = (abs(rx1[chain][i] - rx2[chain][i]) <= + RTWN_IQ_CAL_MAX_TOLERANCE); + } + } + + if (sc->ntxchains > 1) + return (tx_ok[0] && tx_ok[1] && rx_ok[0] && rx_ok[1]); + else + return (tx_ok[0] && rx_ok[0]); +} +#undef RTWN_IQ_CAL_MAX_TOLERANCE + +static void +r92c_iq_calib_write_results(struct rtwn_softc *sc, uint16_t tx[2], + uint16_t rx[2], int chain) +{ + uint32_t reg, val, x; + long y, tx_c; + + if (tx[0] == 0xff || tx[1] == 0xff) + return; + + reg = rtwn_bb_read(sc, R92C_OFDM0_TXIQIMBALANCE(chain)); + val = ((reg >> 22) & 0x3ff); + x = tx[0]; + if (x & 0x00000200) + x |= 0xfffffc00; + reg = (((x * val) >> 8) & 0x3ff); + rtwn_bb_setbits(sc, R92C_OFDM0_TXIQIMBALANCE(chain), 0x3ff, reg); + rtwn_bb_setbits(sc, R92C_OFDM0_ECCATHRESHOLD, 0x80000000, + ((x * val) & 0x80) << 24); + + y = tx[1]; + if (y & 0x00000200) + y |= 0xfffffc00; + tx_c = (y * val) >> 8; + rtwn_bb_setbits(sc, R92C_OFDM0_TXAFE(chain), 0xf0000000, + (tx_c & 0x3c0) << 22); + rtwn_bb_setbits(sc, R92C_OFDM0_TXIQIMBALANCE(chain), 0x003f0000, + (tx_c & 0x3f) << 16); + rtwn_bb_setbits(sc, R92C_OFDM0_ECCATHRESHOLD, 0x20000000, + ((y * val) & 0x80) << 22); + + if (rx[0] == 0xff || rx[1] == 0xff) + return; + + rtwn_bb_setbits(sc, R92C_OFDM0_RXIQIMBALANCE(chain), 0x3ff, + rx[0] & 0x3ff); + rtwn_bb_setbits(sc, R92C_OFDM0_RXIQIMBALANCE(chain), 0xfc00, + (rx[1] & 0x3f) << 10); + + if (chain == 0) { + rtwn_bb_setbits(sc, R92C_OFDM0_RXIQEXTANTA, 0xf0000000, + (rx[1] & 0x3c0) << 22); + } else { + rtwn_bb_setbits(sc, R92C_OFDM0_AGCRSSITABLE, 0xf000, + (rx[1] & 0x3c0) << 6); + } +} + +#define RTWN_IQ_CAL_NRUN 3 void r92c_iq_calib(struct rtwn_softc *sc) { - /* XXX TODO */ + struct r92c_iq_cal_reg_vals vals; + uint16_t tx[RTWN_IQ_CAL_NRUN][2][2], rx[RTWN_IQ_CAL_NRUN][2][2]; + int n, valid; + + valid = 0; + for (n = 0; n < RTWN_IQ_CAL_NRUN; n++) { + r92c_iq_calib_run(sc, n, tx[n], rx[n], &vals); + + if (n == 0) + continue; + + /* Valid results remain stable after consecutive runs. */ + valid = r92c_iq_calib_compare_results(sc, tx[n - 1], + rx[n - 1], tx[n], rx[n]); + if (valid) + break; + } + + if (valid) { + r92c_iq_calib_write_results(sc, tx[n][0], rx[n][0], 0); + if (sc->ntxchains > 1) + r92c_iq_calib_write_results(sc, tx[n][1], rx[n][1], 1); + } } +#undef RTWN_IQ_CAL_NRUN void r92c_lc_calib(struct rtwn_softc *sc) diff --git a/freebsd/sys/dev/rtwn/rtl8192c/r92c_init.c b/freebsd/sys/dev/rtwn/rtl8192c/r92c_init.c index 199a419f..8cd55fbf 100644 --- a/freebsd/sys/dev/rtwn/rtl8192c/r92c_init.c +++ b/freebsd/sys/dev/rtwn/rtl8192c/r92c_init.c @@ -324,6 +324,7 @@ r92c_init_antsel(struct rtwn_softc *sc) rtwn_bb_setbits(sc, R92C_FPGA0_RFPARAM(0), 0, 0x2000); reg = rtwn_bb_read(sc, R92C_FPGA0_RFIFACEOE(0)); sc->sc_ant = MS(reg, R92C_FPGA0_RFIFACEOE0_ANT); /* XXX */ + rtwn_setbits_1(sc, R92C_LEDCFG2, 0x80, 0); } void diff --git a/freebsd/sys/dev/rtwn/rtl8192c/r92c_reg.h b/freebsd/sys/dev/rtwn/rtl8192c/r92c_reg.h index 34a4b80c..c3def33e 100644 --- a/freebsd/sys/dev/rtwn/rtl8192c/r92c_reg.h +++ b/freebsd/sys/dev/rtwn/rtl8192c/r92c_reg.h @@ -66,6 +66,7 @@ #define R92C_HSIMR 0x058 #define R92C_HSISR 0x05c #define R92C_MULTI_FUNC_CTRL 0x068 +#define R92C_AFE_XTAL_CTRL_EXT 0x078 #define R92C_LDO_SWR_CTRL 0x07c #define R92C_MCUFWDL 0x080 #define R92C_HMEBOX_EXT(idx) (0x088 + (idx) * 2) @@ -147,6 +148,7 @@ #define R92C_RD_RESP_PKT_TH 0x463 #define R92C_INIRTS_RATE_SEL 0x480 #define R92C_INIDATA_RATE_SEL(macid) (0x484 + (macid)) +#define R92C_POWER_STATUS 0x4a4 #define R92C_QUEUE_CTRL 0x4c6 #define R92C_MAX_AGGR_NUM 0x4ca #define R92C_BAR_MODE_CTRL 0x4cc @@ -347,6 +349,7 @@ /* Bits for R92C_GPIO_MUXCFG. */ #define R92C_GPIO_MUXCFG_ENBT 0x0020 +#define R92C_GPIO_MUXCFG_ENSIC 0x1000 /* Bits for R92C_LEDCFG0. */ #define R92C_LEDCFG0_DIS 0x08 @@ -691,6 +694,7 @@ #define R92C_FPGA1_TXINFO 0x90c #define R92C_CCK0_SYSTEM 0xa00 #define R92C_CCK0_AFESETTING 0xa04 +#define R92C_CONFIG_ANT(chain) (0xb68 + (chain) * 4) #define R92C_OFDM0_TRXPATHENA 0xc04 #define R92C_OFDM0_TRMUXPAR 0xc08 #define R92C_OFDM0_RXIQIMBALANCE(chain) (0xc14 + (chain) * 8) @@ -703,6 +707,20 @@ #define R92C_OFDM0_RXIQEXTANTA 0xca0 #define R92C_OFDM0_TXPSEUDONOISEWGT 0xce4 #define R92C_OFDM1_LSTF 0xd00 +#define R92C_FPGA0_IQK 0xe28 +#define R92C_TX_IQK_TONE(chain) (0xe30 + (chain) * 32) +#define R92C_RX_IQK_TONE(chain) (0xe34 + (chain) * 32) +#define R92C_TX_IQK_PI(chain) (0xe38 + (chain) * 32) +#define R92C_RX_IQK_PI(chain) (0xe3c + (chain) * 32) +#define R92C_TX_IQK 0xe40 +#define R92C_RX_IQK 0xe44 +#define R92C_IQK_AGC_PTS 0xe48 +#define R92C_IQK_AGC_RSP 0xe4c +#define R92C_IQK_AGC_CONT 0xe60 +#define R92C_TX_POWER_IQK_BEFORE(chain) (0xe94 + (chain) * 32) +#define R92C_TX_POWER_IQK_AFTER(chain) (0xe9c + (chain) * 32) +#define R92C_RX_POWER_IQK_BEFORE(chain) (0xea4 + (chain) * 32) +#define R92C_RX_POWER_IQK_AFTER(chain) (0xeac + (chain) * 32) /* Bits for R92C_FPGA[01]_RFMOD. */ #define R92C_RFMOD_40MHZ 0x00000001 @@ -828,6 +846,10 @@ #define R92C_OFDM0_AGCCORE1_GAIN_M 0x0000007f #define R92C_OFDM0_AGCCORE1_GAIN_S 0 +/* Bits for R92C_[RT]X_POWER_IQK*. */ +#define R92C_POWER_IQK_RESULT_S 16 +#define R92C_POWER_IQK_RESULT_M 0x03ff0000 + /* * RF (6052) registers. diff --git a/freebsd/sys/dev/rtwn/rtl8192c/r92c_rom.c b/freebsd/sys/dev/rtwn/rtl8192c/r92c_rom.c index 1eb5ca12..bc355e30 100644 --- a/freebsd/sys/dev/rtwn/rtl8192c/r92c_rom.c +++ b/freebsd/sys/dev/rtwn/rtl8192c/r92c_rom.c @@ -98,7 +98,7 @@ r92c_parse_rom(struct rtwn_softc *sc, uint8_t *buf) __func__, rs->regulatory); /* Need to be set before postinit() (but after preinit()). */ - rtwn_r92c_set_name(sc); + rtwn_r92c_set_rom_opts(sc, buf); r92c_set_chains(sc); for (j = 0; j < R92C_GROUP_2G; j++) { diff --git a/freebsd/sys/dev/rtwn/rtl8192c/r92c_rx.c b/freebsd/sys/dev/rtwn/rtl8192c/r92c_rx.c index c98a0203..cbc8eb56 100644 --- a/freebsd/sys/dev/rtwn/rtl8192c/r92c_rx.c +++ b/freebsd/sys/dev/rtwn/rtl8192c/r92c_rx.c @@ -54,6 +54,13 @@ __FBSDID("$FreeBSD$"); #include <dev/rtwn/rtl8192c/r92c_rx_desc.h> +int +r92c_classify_intr(struct rtwn_softc *sc, void *buf, int len) +{ + /* NB: reports are fetched from C2H_MSG register. */ + return (RTWN_RX_DATA); +} + int8_t r92c_get_rssi_cck(struct rtwn_softc *sc, void *physt) { diff --git a/freebsd/sys/dev/rtwn/rtl8192c/r92c_tx.c b/freebsd/sys/dev/rtwn/rtl8192c/r92c_tx.c index 5c80af90..36450e80 100644 --- a/freebsd/sys/dev/rtwn/rtl8192c/r92c_tx.c +++ b/freebsd/sys/dev/rtwn/rtl8192c/r92c_tx.c @@ -105,7 +105,7 @@ r92c_tx_protection(struct rtwn_softc *sc, struct r92c_tx_desc *txd, rate = rtwn_ctl_mcsrate(ic->ic_rt, ridx); else rate = ieee80211_ctl_rate(ic->ic_rt, ridx2rate[ridx]); - ridx = rate2ridx(rate); + ridx = rate2ridx(IEEE80211_RV(rate)); txd->txdw4 |= htole32(SM(R92C_TXDW4_RTSRATE, ridx)); /* RTS rate fallback limit (max). */ @@ -213,6 +213,12 @@ r92c_tx_setup_macid(void *buf, int id) struct r92c_tx_desc *txd = (struct r92c_tx_desc *)buf; txd->txdw1 |= htole32(SM(R92C_TXDW1_MACID, id)); + + /* XXX does not belong here */ + /* XXX temporary (I hope) */ + /* Force CCK1 for RTS / CTS frames (driver bug) */ + txd->txdw4 &= ~htole32(SM(R92C_TXDW4_RTSRATE, R92C_TXDW4_RTSRATE_M)); + txd->txdw4 &= ~htole32(R92C_TXDW4_RTS_SHORT); } void diff --git a/freebsd/sys/dev/rtwn/rtl8192c/r92c_var.h b/freebsd/sys/dev/rtwn/rtl8192c/r92c_var.h index 79592449..2b35ccea 100644 --- a/freebsd/sys/dev/rtwn/rtl8192c/r92c_var.h +++ b/freebsd/sys/dev/rtwn/rtl8192c/r92c_var.h @@ -58,7 +58,7 @@ struct r92c_softc { void (*rs_tx_enable_ampdu)(void *, int); void (*rs_tx_setup_hwseq)(void *); void (*rs_tx_setup_macid)(void *, int); - void (*rs_set_name)(struct rtwn_softc *); + void (*rs_set_rom_opts)(struct rtwn_softc *, uint8_t *); int rf_read_delay[3]; uint32_t rf_chnlbw[R92C_MAX_CHAINS]; @@ -77,7 +77,7 @@ struct r92c_softc { ((R92C_SOFTC(_sc)->rs_tx_setup_hwseq)((_buf))) #define rtwn_r92c_tx_setup_macid(_sc, _buf, _id) \ ((R92C_SOFTC(_sc)->rs_tx_setup_macid)((_buf), (_id))) -#define rtwn_r92c_set_name(_sc) \ - ((R92C_SOFTC(_sc)->rs_set_name)((_sc))) +#define rtwn_r92c_set_rom_opts(_sc, _buf) \ + ((R92C_SOFTC(_sc)->rs_set_rom_opts)((_sc), (_buf))) #endif /* R92C_VAR_H */ diff --git a/freebsd/sys/dev/rtwn/rtl8192c/usb/r92cu.h b/freebsd/sys/dev/rtwn/rtl8192c/usb/r92cu.h index 2486d7fa..4cc01049 100644 --- a/freebsd/sys/dev/rtwn/rtl8192c/usb/r92cu.h +++ b/freebsd/sys/dev/rtwn/rtl8192c/usb/r92cu.h @@ -47,7 +47,6 @@ void r92cu_post_init(struct rtwn_softc *); void r92cu_set_led(struct rtwn_softc *, int, int); /* r92cu_rx.c */ -int r92cu_classify_intr(struct rtwn_softc *, void *, int); int r92cu_align_rx(int, int); /* r92cu_tx.c */ diff --git a/freebsd/sys/dev/rtwn/rtl8192c/usb/r92cu_attach.c b/freebsd/sys/dev/rtwn/rtl8192c/usb/r92cu_attach.c index 4696e8c4..c8270184 100644 --- a/freebsd/sys/dev/rtwn/rtl8192c/usb/r92cu_attach.c +++ b/freebsd/sys/dev/rtwn/rtl8192c/usb/r92cu_attach.c @@ -96,7 +96,7 @@ r92cu_postattach(struct rtwn_softc *sc) } static void -r92cu_set_name(struct rtwn_softc *sc) +r92cu_set_name(struct rtwn_softc *sc, uint8_t *buf) { struct r92c_softc *rs = sc->sc_priv; @@ -126,7 +126,7 @@ r92cu_attach_private(struct rtwn_softc *sc) rs->rs_tx_enable_ampdu = r92c_tx_enable_ampdu; rs->rs_tx_setup_hwseq = r92c_tx_setup_hwseq; rs->rs_tx_setup_macid = r92c_tx_setup_macid; - rs->rs_set_name = r92cu_set_name; + rs->rs_set_rom_opts = r92cu_set_name; #ifndef RTWN_WITHOUT_UCODE rs->rs_c2h_timeout = hz; @@ -170,7 +170,7 @@ r92cu_attach(struct rtwn_usb_softc *uc) sc->sc_get_rx_stats = r92c_get_rx_stats; sc->sc_get_rssi_cck = r92c_get_rssi_cck; sc->sc_get_rssi_ofdm = r92c_get_rssi_ofdm; - sc->sc_classify_intr = r92cu_classify_intr; + sc->sc_classify_intr = r92c_classify_intr; sc->sc_handle_tx_report = rtwn_nop_softc_uint8_int; sc->sc_handle_c2h_report = rtwn_nop_softc_uint8_int; sc->sc_check_frame = rtwn_nop_int_softc_mbuf; @@ -189,7 +189,7 @@ r92cu_attach(struct rtwn_usb_softc *uc) sc->sc_llt_init = r92c_llt_init; sc->sc_set_page_size = r92c_set_page_size; sc->sc_lc_calib = r92c_lc_calib; - sc->sc_iq_calib = r92c_iq_calib; /* XXX TODO */ + sc->sc_iq_calib = r92c_iq_calib; sc->sc_read_chipid_vendor = r92c_read_chipid_vendor; sc->sc_adj_devcaps = r92cu_adj_devcaps; sc->sc_vap_preattach = rtwn_nop_softc_vap; diff --git a/freebsd/sys/dev/rtwn/rtl8192c/usb/r92cu_init.c b/freebsd/sys/dev/rtwn/rtl8192c/usb/r92cu_init.c index 61dd1aa0..08259875 100644 --- a/freebsd/sys/dev/rtwn/rtl8192c/usb/r92cu_init.c +++ b/freebsd/sys/dev/rtwn/rtl8192c/usb/r92cu_init.c @@ -359,6 +359,8 @@ void r92cu_post_init(struct rtwn_softc *sc) { + rtwn_write_4(sc, R92C_POWER_STATUS, 0x5); + /* Perform LO and IQ calibrations. */ r92c_iq_calib(sc); /* Perform LC calibration. */ diff --git a/freebsd/sys/dev/rtwn/rtl8192c/usb/r92cu_rx.c b/freebsd/sys/dev/rtwn/rtl8192c/usb/r92cu_rx.c index 4d040a4e..8d8489c2 100644 --- a/freebsd/sys/dev/rtwn/rtl8192c/usb/r92cu_rx.c +++ b/freebsd/sys/dev/rtwn/rtl8192c/usb/r92cu_rx.c @@ -52,13 +52,6 @@ __FBSDID("$FreeBSD$"); int -r92cu_classify_intr(struct rtwn_softc *sc, void *buf, int len) -{ - /* NB: reports are fetched from C2H_MSG register. */ - return (RTWN_RX_DATA); -} - -int r92cu_align_rx(int totlen, int len) { return (roundup2(totlen, 128)); diff --git a/freebsd/sys/dev/rtwn/rtl8192e/usb/r92eu_attach.c b/freebsd/sys/dev/rtwn/rtl8192e/usb/r92eu_attach.c index ce1e49f4..f906d9fa 100644 --- a/freebsd/sys/dev/rtwn/rtl8192e/usb/r92eu_attach.c +++ b/freebsd/sys/dev/rtwn/rtl8192e/usb/r92eu_attach.c @@ -137,7 +137,7 @@ r92eu_attach(struct rtwn_usb_softc *uc) sc->sc_llt_init = r92e_llt_init; sc->sc_set_page_size = rtwn_nop_int_softc; sc->sc_lc_calib = r92c_lc_calib; - sc->sc_iq_calib = r88e_iq_calib; /* XXX TODO */ + sc->sc_iq_calib = rtwn_nop_softc; /* XXX TODO */ sc->sc_read_chipid_vendor = rtwn_nop_softc_uint32; sc->sc_adj_devcaps = r92eu_adj_devcaps; sc->sc_vap_preattach = rtwn_nop_softc_vap; diff --git a/freebsd/sys/dev/rtwn/rtl8812a/r12a_tx.c b/freebsd/sys/dev/rtwn/rtl8812a/r12a_tx.c index 77e9e423..910a06eb 100644 --- a/freebsd/sys/dev/rtwn/rtl8812a/r12a_tx.c +++ b/freebsd/sys/dev/rtwn/rtl8812a/r12a_tx.c @@ -113,7 +113,7 @@ r12a_tx_protection(struct rtwn_softc *sc, struct r12a_tx_desc *txd, rate = rtwn_ctl_mcsrate(ic->ic_rt, ridx); else rate = ieee80211_ctl_rate(ic->ic_rt, ridx2rate[ridx]); - ridx = rate2ridx(rate); + ridx = rate2ridx(IEEE80211_RV(rate)); txd->txdw4 |= htole32(SM(R12A_TXDW4_RTSRATE, ridx)); /* RTS rate fallback limit (max). */ diff --git a/freebsd/sys/dev/rtwn/usb/rtwn_usb_attach.h b/freebsd/sys/dev/rtwn/usb/rtwn_usb_attach.h index 432d7e97..c757cb79 100644 --- a/freebsd/sys/dev/rtwn/usb/rtwn_usb_attach.h +++ b/freebsd/sys/dev/rtwn/usb/rtwn_usb_attach.h @@ -107,6 +107,7 @@ static const STRUCT_USB_HOST_ID rtwn_devs[] = { { USB_VPI(USB_VENDOR_##v, USB_PRODUCT_##v##_##p, RTWN_CHIP_RTL8192EU) } RTWN_RTL8192EU_DEV(DLINK, DWA131E1), RTWN_RTL8192EU_DEV(REALTEK, RTL8192EU), + RTWN_RTL8192EU_DEV(TPLINK, WN821NV5), RTWN_RTL8192EU_DEV(TPLINK, WN822NV4), RTWN_RTL8192EU_DEV(TPLINK, WN823NV2), #undef RTWN_RTL8192EU_DEV @@ -155,7 +156,10 @@ static const STRUCT_USB_HOST_ID rtwn_devs[] = { RTWN_RTL8821AU_DEV(EDIMAX, EW7811UTC_2), RTWN_RTL8821AU_DEV(HAWKING, HD65U), RTWN_RTL8821AU_DEV(MELCO, WIU2433DM), - RTWN_RTL8821AU_DEV(NETGEAR, A6100) + RTWN_RTL8821AU_DEV(NETGEAR, A6100), + RTWN_RTL8821AU_DEV(REALTEK, RTL8821AU_1), + RTWN_RTL8821AU_DEV(REALTEK, RTL8821AU_2), + RTWN_RTL8821AU_DEV(TPLINK, T2UNANO) #undef RTWN_RTL8821AU_DEV }; diff --git a/freebsd/sys/dev/rtwn/usb/rtwn_usb_reg.c b/freebsd/sys/dev/rtwn/usb/rtwn_usb_reg.c index 8225282e..c7cb255e 100644 --- a/freebsd/sys/dev/rtwn/usb/rtwn_usb_reg.c +++ b/freebsd/sys/dev/rtwn/usb/rtwn_usb_reg.c @@ -174,8 +174,6 @@ rtwn_usb_delay(struct rtwn_softc *sc, int usec) /* 1ms delay as default is too big. */ if (usec < 1000) DELAY(usec); - else { - usb_pause_mtx(&sc->sc_mtx, - MAX(msecs_to_ticks(usec / 1000), 1)); - } + else + usb_pause_mtx(&sc->sc_mtx, msecs_to_ticks(usec / 1000)); } diff --git a/freebsd/sys/dev/sdhci/fsl_sdhci.c b/freebsd/sys/dev/sdhci/fsl_sdhci.c index 84665b41..be3d1de3 100644 --- a/freebsd/sys/dev/sdhci/fsl_sdhci.c +++ b/freebsd/sys/dev/sdhci/fsl_sdhci.c @@ -1016,7 +1016,7 @@ static driver_t fsl_sdhci_driver = { DRIVER_MODULE(sdhci_fsl, simplebus, fsl_sdhci_driver, fsl_sdhci_devclass, NULL, NULL); -MODULE_DEPEND(sdhci_fsl, sdhci, 1, 1, 1); +SDHCI_DEPEND(sdhci_fsl); #ifndef MMCCAM MMC_DECLARE_BRIDGE(sdhci_fsl); diff --git a/freebsd/sys/dev/sdhci/sdhci.c b/freebsd/sys/dev/sdhci/sdhci.c index 0bb9edc7..cf853360 100644 --- a/freebsd/sys/dev/sdhci/sdhci.c +++ b/freebsd/sys/dev/sdhci/sdhci.c @@ -38,6 +38,7 @@ __FBSDID("$FreeBSD$"); #include <sys/conf.h> #include <sys/kernel.h> #include <sys/kobj.h> +#include <sys/libkern.h> #include <sys/lock.h> #include <sys/malloc.h> #include <sys/module.h> @@ -110,19 +111,20 @@ static void sdhci_retune(void *arg); static void sdhci_set_clock(struct sdhci_slot *slot, uint32_t clock); static void sdhci_set_power(struct sdhci_slot *slot, u_char power); static void sdhci_set_transfer_mode(struct sdhci_slot *slot, - struct mmc_data *data); + const struct mmc_data *data); static void sdhci_start(struct sdhci_slot *slot); static void sdhci_timeout(void *arg); static void sdhci_start_command(struct sdhci_slot *slot, struct mmc_command *cmd); -static void sdhci_start_data(struct sdhci_slot *slot, struct mmc_data *data); +static void sdhci_start_data(struct sdhci_slot *slot, + const struct mmc_data *data); static void sdhci_write_block_pio(struct sdhci_slot *slot); static void sdhci_transfer_pio(struct sdhci_slot *slot); #ifdef MMCCAM /* CAM-related */ static void sdhci_cam_action(struct cam_sim *sim, union ccb *ccb); -static int sdhci_cam_get_possible_host_clock(struct sdhci_slot *slot, +static int sdhci_cam_get_possible_host_clock(const struct sdhci_slot *slot, int proposed_clock); static void sdhci_cam_handle_mmcio(struct cam_sim *sim, union ccb *ccb); static void sdhci_cam_poll(struct cam_sim *sim); @@ -132,12 +134,14 @@ static int sdhci_cam_update_ios(struct sdhci_slot *slot); #endif /* helper routines */ +static int sdhci_dma_alloc(struct sdhci_slot *slot); +static void sdhci_dma_free(struct sdhci_slot *slot); static void sdhci_dumpregs(struct sdhci_slot *slot); static void sdhci_getaddr(void *arg, bus_dma_segment_t *segs, int nsegs, int error); -static int slot_printf(struct sdhci_slot *slot, const char * fmt, ...) +static int slot_printf(const struct sdhci_slot *slot, const char * fmt, ...) __printflike(2, 3); -static uint32_t sdhci_tuning_intmask(struct sdhci_slot *slot); +static uint32_t sdhci_tuning_intmask(const struct sdhci_slot *slot); #define SDHCI_LOCK(_slot) mtx_lock(&(_slot)->mtx) #define SDHCI_UNLOCK(_slot) mtx_unlock(&(_slot)->mtx) @@ -181,17 +185,22 @@ sdhci_getaddr(void *arg, bus_dma_segment_t *segs, int nsegs, int error) } static int -slot_printf(struct sdhci_slot *slot, const char * fmt, ...) +slot_printf(const struct sdhci_slot *slot, const char * fmt, ...) { + char buf[128]; va_list ap; int retval; - retval = printf("%s-slot%d: ", - device_get_nameunit(slot->bus), slot->num); - + /* + * Make sure we print a single line all together rather than in two + * halves to avoid console gibberish bingo. + */ va_start(ap, fmt); - retval += vprintf(fmt, ap); + retval = vsnprintf(buf, sizeof(buf), fmt, ap); va_end(ap); + + retval += printf("%s-slot%d: %s", + device_get_nameunit(slot->bus), slot->num, buf); return (retval); } @@ -292,7 +301,7 @@ sdhci_reset(struct sdhci_slot *slot, uint8_t mask) } static uint32_t -sdhci_tuning_intmask(struct sdhci_slot *slot) +sdhci_tuning_intmask(const struct sdhci_slot *slot) { uint32_t intmask; @@ -474,7 +483,7 @@ sdhci_set_power(struct sdhci_slot *slot, u_char power) DELAY(100); } if (!(RD1(slot, SDHCI_POWER_CONTROL) & SDHCI_POWER_ON)) - slot_printf(slot, "Bus power failed to enable"); + slot_printf(slot, "Bus power failed to enable\n"); if (slot->quirks & SDHCI_QUIRK_INTEL_POWER_UP_RESET) { WR1(slot, SDHCI_POWER_CONTROL, pwr | 0x10); @@ -494,7 +503,13 @@ sdhci_read_block_pio(struct sdhci_slot *slot) buffer = slot->curcmd->data->data; buffer += slot->offset; /* Transfer one block at a time. */ - left = min(512, slot->curcmd->data->len - slot->offset); +#ifdef MMCCAM + if (slot->curcmd->data->flags & MMC_DATA_BLOCK_SIZE) + left = min(slot->curcmd->data->block_size, + slot->curcmd->data->len - slot->offset); + else +#endif + left = min(512, slot->curcmd->data->len - slot->offset); slot->offset += left; /* If we are too fast, broken controllers return zeroes. */ @@ -537,7 +552,13 @@ sdhci_write_block_pio(struct sdhci_slot *slot) buffer = slot->curcmd->data->data; buffer += slot->offset; /* Transfer one block at a time. */ - left = min(512, slot->curcmd->data->len - slot->offset); +#ifdef MMCCAM + if (slot->curcmd->data->flags & MMC_DATA_BLOCK_SIZE) { + left = min(slot->curcmd->data->block_size, + slot->curcmd->data->len - slot->offset); + } else +#endif + left = min(512, slot->curcmd->data->len - slot->offset); slot->offset += left; /* Handle unaligned and aligned buffer cases. */ @@ -739,55 +760,94 @@ sdhci_card_poll(void *arg) sdhci_card_poll, slot); } -int -sdhci_init_slot(device_t dev, struct sdhci_slot *slot, int num) +static int +sdhci_dma_alloc(struct sdhci_slot *slot) { - kobjop_desc_t kobj_desc; - kobj_method_t *kobj_method; - uint32_t caps, caps2, freq, host_caps; int err; - SDHCI_LOCK_INIT(slot); - - slot->num = num; - slot->bus = dev; + if (!(slot->quirks & SDHCI_QUIRK_BROKEN_SDMA_BOUNDARY)) { + if (MAXPHYS <= 1024 * 4) + slot->sdma_boundary = SDHCI_BLKSZ_SDMA_BNDRY_4K; + else if (MAXPHYS <= 1024 * 8) + slot->sdma_boundary = SDHCI_BLKSZ_SDMA_BNDRY_8K; + else if (MAXPHYS <= 1024 * 16) + slot->sdma_boundary = SDHCI_BLKSZ_SDMA_BNDRY_16K; + else if (MAXPHYS <= 1024 * 32) + slot->sdma_boundary = SDHCI_BLKSZ_SDMA_BNDRY_32K; + else if (MAXPHYS <= 1024 * 64) + slot->sdma_boundary = SDHCI_BLKSZ_SDMA_BNDRY_64K; + else if (MAXPHYS <= 1024 * 128) + slot->sdma_boundary = SDHCI_BLKSZ_SDMA_BNDRY_128K; + else if (MAXPHYS <= 1024 * 256) + slot->sdma_boundary = SDHCI_BLKSZ_SDMA_BNDRY_256K; + else + slot->sdma_boundary = SDHCI_BLKSZ_SDMA_BNDRY_512K; + } + slot->sdma_bbufsz = SDHCI_SDMA_BNDRY_TO_BBUFSZ(slot->sdma_boundary); - /* Allocate DMA tag. */ - err = bus_dma_tag_create(bus_get_dma_tag(dev), - DMA_BLOCK_SIZE, 0, BUS_SPACE_MAXADDR_32BIT, - BUS_SPACE_MAXADDR, NULL, NULL, - DMA_BLOCK_SIZE, 1, DMA_BLOCK_SIZE, - BUS_DMA_ALLOCNOW, NULL, NULL, - &slot->dmatag); + /* + * Allocate the DMA tag for an SDMA bounce buffer. + * Note that the SDHCI specification doesn't state any alignment + * constraint for the SDMA system address. However, controllers + * typically ignore the SDMA boundary bits in SDHCI_DMA_ADDRESS when + * forming the actual address of data, requiring the SDMA buffer to + * be aligned to the SDMA boundary. + */ + err = bus_dma_tag_create(bus_get_dma_tag(slot->bus), slot->sdma_bbufsz, + 0, BUS_SPACE_MAXADDR_32BIT, BUS_SPACE_MAXADDR, NULL, NULL, + slot->sdma_bbufsz, 1, slot->sdma_bbufsz, BUS_DMA_ALLOCNOW, + NULL, NULL, &slot->dmatag); if (err != 0) { - device_printf(dev, "Can't create DMA tag\n"); - SDHCI_LOCK_DESTROY(slot); + slot_printf(slot, "Can't create DMA tag for SDMA\n"); return (err); } - /* Allocate DMA memory. */ + /* Allocate DMA memory for the SDMA bounce buffer. */ err = bus_dmamem_alloc(slot->dmatag, (void **)&slot->dmamem, BUS_DMA_NOWAIT, &slot->dmamap); if (err != 0) { - device_printf(dev, "Can't alloc DMA memory\n"); + slot_printf(slot, "Can't alloc DMA memory for SDMA\n"); bus_dma_tag_destroy(slot->dmatag); - SDHCI_LOCK_DESTROY(slot); return (err); } - /* Map the memory. */ + /* Map the memory of the SDMA bounce buffer. */ err = bus_dmamap_load(slot->dmatag, slot->dmamap, - (void *)slot->dmamem, DMA_BLOCK_SIZE, - sdhci_getaddr, &slot->paddr, 0); + (void *)slot->dmamem, slot->sdma_bbufsz, sdhci_getaddr, + &slot->paddr, 0); if (err != 0 || slot->paddr == 0) { - device_printf(dev, "Can't load DMA memory\n"); + slot_printf(slot, "Can't load DMA memory for SDMA\n"); bus_dmamem_free(slot->dmatag, slot->dmamem, slot->dmamap); bus_dma_tag_destroy(slot->dmatag); - SDHCI_LOCK_DESTROY(slot); if (err) return (err); else return (EFAULT); } + return (0); +} + +static void +sdhci_dma_free(struct sdhci_slot *slot) +{ + + bus_dmamap_unload(slot->dmatag, slot->dmamap); + bus_dmamem_free(slot->dmatag, slot->dmamem, slot->dmamap); + bus_dma_tag_destroy(slot->dmatag); +} + +int +sdhci_init_slot(device_t dev, struct sdhci_slot *slot, int num) +{ + kobjop_desc_t kobj_desc; + kobj_method_t *kobj_method; + uint32_t caps, caps2, freq, host_caps; + int err; + + SDHCI_LOCK_INIT(slot); + + slot->num = num; + slot->bus = dev; + slot->version = (RD2(slot, SDHCI_HOST_VERSION) >> SDHCI_SPEC_VER_SHIFT) & SDHCI_SPEC_VER_MASK; if (slot->quirks & SDHCI_QUIRK_MISSING_CAPS) { @@ -803,12 +863,8 @@ sdhci_init_slot(device_t dev, struct sdhci_slot *slot, int num) if (slot->version >= SDHCI_SPEC_300) { if ((caps & SDHCI_SLOTTYPE_MASK) != SDHCI_SLOTTYPE_REMOVABLE && (caps & SDHCI_SLOTTYPE_MASK) != SDHCI_SLOTTYPE_EMBEDDED) { - device_printf(dev, + slot_printf(slot, "Driver doesn't support shared bus slots\n"); - bus_dmamap_unload(slot->dmatag, slot->dmamap); - bus_dmamem_free(slot->dmatag, slot->dmamem, - slot->dmamap); - bus_dma_tag_destroy(slot->dmatag); SDHCI_LOCK_DESTROY(slot); return (ENXIO); } else if ((caps & SDHCI_SLOTTYPE_MASK) == @@ -832,7 +888,7 @@ sdhci_init_slot(device_t dev, struct sdhci_slot *slot, int num) */ if (slot->max_clk == 0) { slot->max_clk = SDHCI_DEFAULT_MAX_FREQ * 1000000; - device_printf(dev, "Hardware doesn't specify base clock " + slot_printf(slot, "Hardware doesn't specify base clock " "frequency, using %dMHz as default.\n", SDHCI_DEFAULT_MAX_FREQ); } @@ -853,7 +909,7 @@ sdhci_init_slot(device_t dev, struct sdhci_slot *slot, int num) * max timeout, but still mention it. */ if (slot->timeout_clk == 0) { - device_printf(dev, "Hardware doesn't specify timeout clock " + slot_printf(slot, "Hardware doesn't specify timeout clock " "frequency, setting BROKEN_TIMEOUT quirk.\n"); slot->quirks |= SDHCI_QUIRK_BROKEN_TIMEOUT_VAL; } @@ -869,7 +925,7 @@ sdhci_init_slot(device_t dev, struct sdhci_slot *slot, int num) if ((caps & SDHCI_CAN_VDD_180) && (slot->opt & SDHCI_SLOT_EMBEDDED)) slot->host.host_ocr |= MMC_OCR_LOW_VOLTAGE; if (slot->host.host_ocr == 0) { - device_printf(dev, "Hardware doesn't report any " + slot_printf(slot, "Hardware doesn't report any " "support voltages.\n"); } @@ -955,7 +1011,7 @@ no_tuning: slot->retune_count = (caps2 & SDHCI_RETUNE_CNT_MASK) >> SDHCI_RETUNE_CNT_SHIFT; if (slot->retune_count > 0xb) { - device_printf(dev, "Unknown re-tuning count " + slot_printf(slot, "Unknown re-tuning count " "%x, using 1 sec\n", slot->retune_count); slot->retune_count = 1; } else if (slot->retune_count != 0) @@ -1014,6 +1070,19 @@ no_tuning: if (slot->opt & SDHCI_PLATFORM_TRANSFER) slot->opt &= ~SDHCI_HAVE_DMA; + if (slot->opt & SDHCI_HAVE_DMA) { + err = sdhci_dma_alloc(slot); + if (err != 0) { + if (slot->opt & SDHCI_TUNING_SUPPORTED) { + free(slot->tune_req, M_DEVBUF); + free(slot->tune_cmd, M_DEVBUF); + free(slot->tune_data, M_DEVBUF); + } + SDHCI_LOCK_DESTROY(slot); + return (err); + } + } + if (bootverbose || sdhci_debug) { slot_printf(slot, "%uMHz%s %s VDD:%s%s%s VCCQ: 3.3V%s%s DRV: B%s%s%s %s %s\n", @@ -1061,7 +1130,7 @@ no_tuning: slot->timeout = 10; SYSCTL_ADD_INT(device_get_sysctl_ctx(slot->bus), SYSCTL_CHILDREN(device_get_sysctl_tree(slot->bus)), OID_AUTO, - "timeout", CTLFLAG_RW, &slot->timeout, 0, + "timeout", CTLFLAG_RWTUN, &slot->timeout, 0, "Maximum timeout for SDHCI transfers (in secs)"); TASK_INIT(&slot->card_task, 0, sdhci_card_task, slot); TIMEOUT_TASK_INIT(taskqueue_swi_giant, &slot->card_delayed_task, 0, @@ -1111,9 +1180,8 @@ sdhci_cleanup_slot(struct sdhci_slot *slot) SDHCI_LOCK(slot); sdhci_reset(slot, SDHCI_RESET_ALL); SDHCI_UNLOCK(slot); - bus_dmamap_unload(slot->dmatag, slot->dmamap); - bus_dmamem_free(slot->dmatag, slot->dmamem, slot->dmamap); - bus_dma_tag_destroy(slot->dmatag); + if (slot->opt & SDHCI_HAVE_DMA) + sdhci_dma_free(slot); if (slot->opt & SDHCI_TUNING_SUPPORTED) { free(slot->tune_req, M_DEVBUF); free(slot->tune_cmd, M_DEVBUF); @@ -1177,7 +1245,7 @@ sdhci_generic_get_card_present(device_t brdev __unused, struct sdhci_slot *slot) void sdhci_generic_set_uhs_timing(device_t brdev __unused, struct sdhci_slot *slot) { - struct mmc_ios *ios; + const struct mmc_ios *ios; uint16_t hostctrl2; if (slot->version < SDHCI_SPEC_300) @@ -1310,7 +1378,7 @@ int sdhci_generic_tune(device_t brdev __unused, device_t reqdev, bool hs400) { struct sdhci_slot *slot = device_get_ivars(reqdev); - struct mmc_ios *ios = &slot->host.ios; + const struct mmc_ios *ios = &slot->host.ios; struct mmc_command *tune_cmd; struct mmc_data *tune_data; uint32_t opcode; @@ -1513,23 +1581,23 @@ sdhci_retune(void *arg) static void sdhci_req_done(struct sdhci_slot *slot) { - union ccb *ccb; + union ccb *ccb; if (__predict_false(sdhci_debug > 1)) slot_printf(slot, "%s\n", __func__); if (slot->ccb != NULL && slot->curcmd != NULL) { callout_stop(&slot->timeout_callout); - ccb = slot->ccb; - slot->ccb = NULL; + ccb = slot->ccb; + slot->ccb = NULL; slot->curcmd = NULL; - /* Tell CAM the request is finished */ - struct ccb_mmcio *mmcio; - mmcio = &ccb->mmcio; + /* Tell CAM the request is finished */ + struct ccb_mmcio *mmcio; + mmcio = &ccb->mmcio; - ccb->ccb_h.status = - (mmcio->cmd.error == 0 ? CAM_REQ_CMP : CAM_REQ_CMP_ERR); - xpt_done(ccb); + ccb->ccb_h.status = + (mmcio->cmd.error == 0 ? CAM_REQ_CMP : CAM_REQ_CMP_ERR); + xpt_done(ccb); } } #else @@ -1579,7 +1647,7 @@ sdhci_timeout(void *arg) } static void -sdhci_set_transfer_mode(struct sdhci_slot *slot, struct mmc_data *data) +sdhci_set_transfer_mode(struct sdhci_slot *slot, const struct mmc_data *data) { uint16_t mode; @@ -1587,9 +1655,9 @@ sdhci_set_transfer_mode(struct sdhci_slot *slot, struct mmc_data *data) return; mode = SDHCI_TRNS_BLK_CNT_EN; - if (data->len > 512) { + if (data->len > 512 || data->block_count > 1) { mode |= SDHCI_TRNS_MULTI; - if (__predict_true( + if (data->block_count == 0 && __predict_true( #ifdef MMCCAM slot->ccb->mmcio.stop.opcode == MMC_STOP_TRANSMISSION && #else @@ -1714,7 +1782,9 @@ sdhci_start_command(struct sdhci_slot *slot, struct mmc_command *cmd) /* Set data transfer mode. */ sdhci_set_transfer_mode(slot, cmd->data); if (__predict_false(sdhci_debug > 1)) - slot_printf(slot, "Starting command!\n"); + slot_printf(slot, "Starting command opcode %#04x flags %#04x\n", + cmd->opcode, flags); + /* Start command. */ WR2(slot, SDHCI_COMMAND_FLAGS, (cmd->opcode << 8) | (flags & 0xff)); /* Start timeout callout. */ @@ -1730,7 +1800,7 @@ sdhci_finish_command(struct sdhci_slot *slot) uint8_t extra; if (__predict_false(sdhci_debug > 1)) - slot_printf(slot, "%s: called, err %d flags %d\n", + slot_printf(slot, "%s: called, err %d flags %#04x\n", __func__, slot->curcmd->error, slot->curcmd->flags); slot->cmd_done = 1; /* @@ -1771,7 +1841,7 @@ sdhci_finish_command(struct sdhci_slot *slot) slot->curcmd->resp[0] = RD4(slot, SDHCI_RESPONSE); } if (__predict_false(sdhci_debug > 1)) - printf("Resp: %02x %02x %02x %02x\n", + slot_printf(slot, "Resp: %#04x %#04x %#04x %#04x\n", slot->curcmd->resp[0], slot->curcmd->resp[1], slot->curcmd->resp[2], slot->curcmd->resp[3]); @@ -1781,9 +1851,9 @@ sdhci_finish_command(struct sdhci_slot *slot) } static void -sdhci_start_data(struct sdhci_slot *slot, struct mmc_data *data) +sdhci_start_data(struct sdhci_slot *slot, const struct mmc_data *data) { - uint32_t target_timeout, current_timeout; + uint32_t blkcnt, blksz, current_timeout, sdma_bbufsz, target_timeout; uint8_t div; if (data == NULL && (slot->curcmd->flags & MMC_RSP_BUSY) == 0) { @@ -1819,7 +1889,7 @@ sdhci_start_data(struct sdhci_slot *slot, struct mmc_data *data) /* Use DMA if possible. */ if ((slot->opt & SDHCI_HAVE_DMA)) slot->flags |= SDHCI_USE_DMA; - /* If data is small, broken DMA may return zeroes instead of data, */ + /* If data is small, broken DMA may return zeroes instead of data. */ if ((slot->quirks & SDHCI_QUIRK_BROKEN_TIMINGS) && (data->len <= 512)) slot->flags &= ~SDHCI_USE_DMA; @@ -1829,20 +1899,22 @@ sdhci_start_data(struct sdhci_slot *slot, struct mmc_data *data) slot->flags &= ~SDHCI_USE_DMA; /* Load DMA buffer. */ if (slot->flags & SDHCI_USE_DMA) { + sdma_bbufsz = slot->sdma_bbufsz; if (data->flags & MMC_DATA_READ) bus_dmamap_sync(slot->dmatag, slot->dmamap, BUS_DMASYNC_PREREAD); else { - memcpy(slot->dmamem, data->data, - (data->len < DMA_BLOCK_SIZE) ? - data->len : DMA_BLOCK_SIZE); + memcpy(slot->dmamem, data->data, ulmin(data->len, + sdma_bbufsz)); bus_dmamap_sync(slot->dmatag, slot->dmamap, BUS_DMASYNC_PREWRITE); } WR4(slot, SDHCI_DMA_ADDRESS, slot->paddr); - /* Interrupt aggregation: Mask border interrupt - * for the last page and unmask else. */ - if (data->len == DMA_BLOCK_SIZE) + /* + * Interrupt aggregation: Mask border interrupt for the last + * bounce buffer and unmask otherwise. + */ + if (data->len == sdma_bbufsz) slot->intmask &= ~SDHCI_INT_DMA_END; else slot->intmask |= SDHCI_INT_DMA_END; @@ -1850,16 +1922,27 @@ sdhci_start_data(struct sdhci_slot *slot, struct mmc_data *data) } /* Current data offset for both PIO and DMA. */ slot->offset = 0; - /* Set block size and request IRQ on 4K border. */ - WR2(slot, SDHCI_BLOCK_SIZE, SDHCI_MAKE_BLKSZ(DMA_BOUNDARY, - (data->len < 512) ? data->len : 512)); - /* Set block count. */ - WR2(slot, SDHCI_BLOCK_COUNT, (data->len + 511) / 512); +#ifdef MMCCAM + if (data->flags & MMC_DATA_BLOCK_SIZE) { + /* Set block size and request border interrupts on the SDMA boundary. */ + blksz = SDHCI_MAKE_BLKSZ(slot->sdma_boundary, data->block_size); + blkcnt = data->block_count; + if (__predict_false(sdhci_debug > 0)) + slot_printf(slot, "SDIO Custom block params: blksz: " + "%#10x, blk cnt: %#10x\n", blksz, blkcnt); + } else +#endif + { + /* Set block size and request border interrupts on the SDMA boundary. */ + blksz = SDHCI_MAKE_BLKSZ(slot->sdma_boundary, ulmin(data->len, 512)); + blkcnt = howmany(data->len, 512); + } + WR2(slot, SDHCI_BLOCK_SIZE, blksz); + WR2(slot, SDHCI_BLOCK_COUNT, blkcnt); if (__predict_false(sdhci_debug > 1)) - slot_printf(slot, "Block size: %02x, count %lu\n", - (unsigned int)SDHCI_MAKE_BLKSZ(DMA_BOUNDARY, (data->len < 512) ? data->len : 512), - (unsigned long)(data->len + 511) / 512); + slot_printf(slot, "Blk size: 0x%08x | Blk cnt: 0x%08x\n", + blksz, blkcnt); } void @@ -1883,7 +1966,7 @@ sdhci_finish_data(struct sdhci_slot *slot) bus_dmamap_sync(slot->dmatag, slot->dmamap, BUS_DMASYNC_POSTREAD); memcpy((u_char*)data->data + slot->offset, slot->dmamem, - (left < DMA_BLOCK_SIZE) ? left : DMA_BLOCK_SIZE); + ulmin(left, slot->sdma_bbufsz)); } else bus_dmamap_sync(slot->dmatag, slot->dmamap, BUS_DMASYNC_POSTWRITE); @@ -1907,15 +1990,14 @@ sdhci_finish_data(struct sdhci_slot *slot) static void sdhci_start(struct sdhci_slot *slot) { - union ccb *ccb; + union ccb *ccb; + struct ccb_mmcio *mmcio; ccb = slot->ccb; if (ccb == NULL) return; - struct ccb_mmcio *mmcio; mmcio = &ccb->mmcio; - if (!(slot->flags & CMD_STARTED)) { slot->flags |= CMD_STARTED; sdhci_start_command(slot, &mmcio->cmd); @@ -1947,7 +2029,7 @@ sdhci_start(struct sdhci_slot *slot) static void sdhci_start(struct sdhci_slot *slot) { - struct mmc_request *req; + const struct mmc_request *req; req = slot->req; if (req == NULL) @@ -2076,6 +2158,7 @@ sdhci_data_irq(struct sdhci_slot *slot, uint32_t intmask) { struct mmc_data *data; size_t left; + uint32_t sdma_bbufsz; if (!slot->curcmd) { slot_printf(slot, "Got data interrupt 0x%08x, but " @@ -2130,6 +2213,7 @@ sdhci_data_irq(struct sdhci_slot *slot, uint32_t intmask) /* Handle DMA border. */ if (intmask & SDHCI_INT_DMA_END) { data = slot->curcmd->data; + sdma_bbufsz = slot->sdma_bbufsz; /* Unload DMA buffer ... */ left = data->len - slot->offset; @@ -2137,26 +2221,28 @@ sdhci_data_irq(struct sdhci_slot *slot, uint32_t intmask) bus_dmamap_sync(slot->dmatag, slot->dmamap, BUS_DMASYNC_POSTREAD); memcpy((u_char*)data->data + slot->offset, slot->dmamem, - (left < DMA_BLOCK_SIZE) ? left : DMA_BLOCK_SIZE); + ulmin(left, sdma_bbufsz)); } else { bus_dmamap_sync(slot->dmatag, slot->dmamap, BUS_DMASYNC_POSTWRITE); } /* ... and reload it again. */ - slot->offset += DMA_BLOCK_SIZE; + slot->offset += sdma_bbufsz; left = data->len - slot->offset; if (data->flags & MMC_DATA_READ) { bus_dmamap_sync(slot->dmatag, slot->dmamap, BUS_DMASYNC_PREREAD); } else { memcpy(slot->dmamem, (u_char*)data->data + slot->offset, - (left < DMA_BLOCK_SIZE)? left : DMA_BLOCK_SIZE); + ulmin(left, sdma_bbufsz)); bus_dmamap_sync(slot->dmatag, slot->dmamap, BUS_DMASYNC_PREWRITE); } - /* Interrupt aggregation: Mask border interrupt - * for the last page. */ - if (left == DMA_BLOCK_SIZE) { + /* + * Interrupt aggregation: Mask border interrupt for the last + * bounce buffer. + */ + if (left == sdma_bbufsz) { slot->intmask &= ~SDHCI_INT_DMA_END; WR4(slot, SDHCI_SIGNAL_ENABLE, slot->intmask); } @@ -2279,7 +2365,7 @@ int sdhci_generic_read_ivar(device_t bus, device_t child, int which, uintptr_t *result) { - struct sdhci_slot *slot = device_get_ivars(child); + const struct sdhci_slot *slot = device_get_ivars(child); switch (which) { default: @@ -2442,47 +2528,46 @@ sdhci_generic_write_ivar(device_t bus, device_t child, int which, void sdhci_start_slot(struct sdhci_slot *slot) { - if ((slot->devq = cam_simq_alloc(1)) == NULL) { - goto fail; - } - - mtx_init(&slot->sim_mtx, "sdhcisim", NULL, MTX_DEF); - slot->sim = cam_sim_alloc(sdhci_cam_action, sdhci_cam_poll, - "sdhci_slot", slot, device_get_unit(slot->bus), - &slot->sim_mtx, 1, 1, slot->devq); - - if (slot->sim == NULL) { - cam_simq_free(slot->devq); - slot_printf(slot, "cannot allocate CAM SIM\n"); - goto fail; - } - - mtx_lock(&slot->sim_mtx); - if (xpt_bus_register(slot->sim, slot->bus, 0) != 0) { - slot_printf(slot, - "cannot register SCSI pass-through bus\n"); - cam_sim_free(slot->sim, FALSE); - cam_simq_free(slot->devq); - mtx_unlock(&slot->sim_mtx); - goto fail; - } - - mtx_unlock(&slot->sim_mtx); - /* End CAM-specific init */ + + if ((slot->devq = cam_simq_alloc(1)) == NULL) + goto fail; + + mtx_init(&slot->sim_mtx, "sdhcisim", NULL, MTX_DEF); + slot->sim = cam_sim_alloc_dev(sdhci_cam_action, sdhci_cam_poll, + "sdhci_slot", slot, slot->bus, + &slot->sim_mtx, 1, 1, slot->devq); + + if (slot->sim == NULL) { + cam_simq_free(slot->devq); + slot_printf(slot, "cannot allocate CAM SIM\n"); + goto fail; + } + + mtx_lock(&slot->sim_mtx); + if (xpt_bus_register(slot->sim, slot->bus, 0) != 0) { + slot_printf(slot, "cannot register SCSI pass-through bus\n"); + cam_sim_free(slot->sim, FALSE); + cam_simq_free(slot->devq); + mtx_unlock(&slot->sim_mtx); + goto fail; + } + mtx_unlock(&slot->sim_mtx); + + /* End CAM-specific init */ slot->card_present = 0; sdhci_card_task(slot, 0); - return; + return; fail: - if (slot->sim != NULL) { - mtx_lock(&slot->sim_mtx); - xpt_bus_deregister(cam_sim_path(slot->sim)); - cam_sim_free(slot->sim, FALSE); - mtx_unlock(&slot->sim_mtx); - } - - if (slot->devq != NULL) - cam_simq_free(slot->devq); + if (slot->sim != NULL) { + mtx_lock(&slot->sim_mtx); + xpt_bus_deregister(cam_sim_path(slot->sim)); + cam_sim_free(slot->sim, FALSE); + mtx_unlock(&slot->sim_mtx); + } + + if (slot->devq != NULL) + cam_simq_free(slot->devq); } static void @@ -2541,6 +2626,7 @@ sdhci_cam_action(struct cam_sim *sim, union ccb *ccb) case XPT_GET_TRAN_SETTINGS: { struct ccb_trans_settings *cts = &ccb->cts; + uint32_t max_data; if (sdhci_debug > 1) slot_printf(slot, "Got XPT_GET_TRAN_SETTINGS\n"); @@ -2554,6 +2640,19 @@ sdhci_cam_action(struct cam_sim *sim, union ccb *ccb) cts->proto_specific.mmc.host_f_min = slot->host.f_min; cts->proto_specific.mmc.host_f_max = slot->host.f_max; cts->proto_specific.mmc.host_caps = slot->host.caps; + /* + * Re-tuning modes 1 and 2 restrict the maximum data length + * per read/write command to 4 MiB. + */ + if (slot->opt & SDHCI_TUNING_ENABLED && + (slot->retune_mode == SDHCI_RETUNE_MODE_1 || + slot->retune_mode == SDHCI_RETUNE_MODE_2)) { + max_data = 4 * 1024 * 1024 / MMC_SECTOR_SIZE; + } else { + max_data = 65535; + } + cts->proto_specific.mmc.host_max_data = max_data; + memcpy(&cts->proto_specific.mmc.ios, &slot->host.ios, sizeof(struct mmc_ios)); ccb->ccb_h.status = CAM_REQ_CMP; break; @@ -2601,7 +2700,8 @@ sdhci_cam_poll(struct cam_sim *sim) } static int -sdhci_cam_get_possible_host_clock(struct sdhci_slot *slot, int proposed_clock) +sdhci_cam_get_possible_host_clock(const struct sdhci_slot *slot, + int proposed_clock) { int max_clock, clock, i; @@ -2611,15 +2711,13 @@ sdhci_cam_get_possible_host_clock(struct sdhci_slot *slot, int proposed_clock) clock = max_clock; if (slot->version < SDHCI_SPEC_300) { - for (i = 0; i < SDHCI_200_MAX_DIVIDER; - i <<= 1) { + for (i = 0; i < SDHCI_200_MAX_DIVIDER; i <<= 1) { if (clock <= proposed_clock) break; clock >>= 1; } } else { - for (i = 0; i < SDHCI_300_MAX_DIVIDER; - i += 2) { + for (i = 0; i < SDHCI_300_MAX_DIVIDER; i += 2) { if (clock <= proposed_clock) break; clock = max_clock / (i + 2); @@ -2628,15 +2726,14 @@ sdhci_cam_get_possible_host_clock(struct sdhci_slot *slot, int proposed_clock) return clock; } -int +static int sdhci_cam_settran_settings(struct sdhci_slot *slot, union ccb *ccb) { struct mmc_ios *ios; - struct mmc_ios *new_ios; - struct ccb_trans_settings_mmc *cts; + const struct mmc_ios *new_ios; + const struct ccb_trans_settings_mmc *cts; ios = &slot->host.ios; - cts = &ccb->cts.proto_specific.mmc; new_ios = &cts->ios; @@ -2670,11 +2767,11 @@ sdhci_cam_settran_settings(struct sdhci_slot *slot, union ccb *ccb) slot_printf(slot, "Bus mode => %d\n", ios->bus_mode); } - /* XXX Provide a way to call a chip-specific IOS update, required for TI */ + /* XXX Provide a way to call a chip-specific IOS update, required for TI */ return (sdhci_cam_update_ios(slot)); } -int +static int sdhci_cam_update_ios(struct sdhci_slot *slot) { struct mmc_ios *ios = &slot->host.ios; @@ -2716,10 +2813,10 @@ sdhci_cam_update_ios(struct sdhci_slot *slot) return (0); } -int +static int sdhci_cam_request(struct sdhci_slot *slot, union ccb *ccb) { - struct ccb_mmcio *mmcio; + const struct ccb_mmcio *mmcio; mmcio = &ccb->mmcio; @@ -2730,15 +2827,18 @@ sdhci_cam_request(struct sdhci_slot *slot, union ccb *ccb) } */ if (__predict_false(sdhci_debug > 1)) { - slot_printf(slot, "CMD%u arg %#x flags %#x dlen %u dflags %#x\n", - mmcio->cmd.opcode, mmcio->cmd.arg, mmcio->cmd.flags, - mmcio->cmd.data != NULL ? (unsigned int) mmcio->cmd.data->len : 0, - mmcio->cmd.data != NULL ? mmcio->cmd.data->flags: 0); + slot_printf(slot, "CMD%u arg %#x flags %#x dlen %u dflags %#x " + "blksz=%zu blkcnt=%zu\n", + mmcio->cmd.opcode, mmcio->cmd.arg, mmcio->cmd.flags, + mmcio->cmd.data != NULL ? (unsigned int) mmcio->cmd.data->len : 0, + mmcio->cmd.data != NULL ? mmcio->cmd.data->flags : 0, + mmcio->cmd.data != NULL ? mmcio->cmd.data->block_size : 0, + mmcio->cmd.data != NULL ? mmcio->cmd.data->block_count : 0); } if (mmcio->cmd.data != NULL) { if (mmcio->cmd.data->len == 0 || mmcio->cmd.data->flags == 0) panic("data->len = %d, data->flags = %d -- something is b0rked", - (int)mmcio->cmd.data->len, mmcio->cmd.data->flags); + (int)mmcio->cmd.data->len, mmcio->cmd.data->flags); } slot->ccb = ccb; slot->flags = 0; @@ -2754,4 +2854,4 @@ sdhci_cam_request(struct sdhci_slot *slot, union ccb *ccb) } #endif /* MMCCAM */ -MODULE_VERSION(sdhci, 1); +MODULE_VERSION(sdhci, SDHCI_VERSION); diff --git a/freebsd/sys/dev/sdhci/sdhci.h b/freebsd/sys/dev/sdhci/sdhci.h index a22e0235..38c5e1b9 100644 --- a/freebsd/sys/dev/sdhci/sdhci.h +++ b/freebsd/sys/dev/sdhci/sdhci.h @@ -32,8 +32,8 @@ #include <rtems/bsd/local/opt_mmccam.h> -#define DMA_BLOCK_SIZE 4096 -#define DMA_BOUNDARY 0 /* DMA reload every 4K */ +/* Macro for sizing the SDMA bounce buffer on the SDMA buffer boundary. */ +#define SDHCI_SDMA_BNDRY_TO_BBUFSZ(bndry) (4096 * (1 << bndry)) /* Controller doesn't honor resets unless we touch the clock register */ #define SDHCI_QUIRK_CLOCK_BEFORE_RESET (1 << 0) @@ -95,6 +95,8 @@ #define SDHCI_QUIRK_BROKEN_AUTO_STOP (1 << 28) /* Controller supports eMMC HS400 mode if SDHCI_CAN_SDR104 is set. */ #define SDHCI_QUIRK_MMC_HS400_IF_CAN_SDR104 (1 << 29) +/* SDMA boundary in SDHCI_BLOCK_SIZE broken - use front-end supplied value. */ +#define SDHCI_QUIRK_BROKEN_SDMA_BOUNDARY (1 << 30) /* * Controller registers @@ -102,6 +104,14 @@ #define SDHCI_DMA_ADDRESS 0x00 #define SDHCI_BLOCK_SIZE 0x04 +#define SDHCI_BLKSZ_SDMA_BNDRY_4K 0x00 +#define SDHCI_BLKSZ_SDMA_BNDRY_8K 0x01 +#define SDHCI_BLKSZ_SDMA_BNDRY_16K 0x02 +#define SDHCI_BLKSZ_SDMA_BNDRY_32K 0x03 +#define SDHCI_BLKSZ_SDMA_BNDRY_64K 0x04 +#define SDHCI_BLKSZ_SDMA_BNDRY_128K 0x05 +#define SDHCI_BLKSZ_SDMA_BNDRY_256K 0x06 +#define SDHCI_BLKSZ_SDMA_BNDRY_512K 0x07 #define SDHCI_MAKE_BLKSZ(dma, blksz) (((dma & 0x7) << 12) | (blksz & 0xFFF)) #define SDHCI_BLOCK_COUNT 0x06 @@ -362,6 +372,8 @@ struct sdhci_slot { bus_dmamap_t dmamap; u_char *dmamem; bus_addr_t paddr; /* DMA buffer address */ + uint32_t sdma_bbufsz; /* SDMA bounce buffer size */ + uint8_t sdma_boundary; /* SDMA boundary */ struct task card_task; /* Card presence check task */ struct timeout_task card_delayed_task;/* Card insert delayed task */ @@ -401,10 +413,10 @@ struct sdhci_slot { #ifdef MMCCAM /* CAM stuff */ union ccb *ccb; - struct cam_devq *devq; - struct cam_sim *sim; - struct mtx sim_mtx; - u_char card_present; /* XXX Maybe derive this from elsewhere? */ + struct cam_devq *devq; + struct cam_sim *sim; + struct mtx sim_mtx; + u_char card_present; /* XXX Maybe derive this from elsewhere? */ #endif }; @@ -434,4 +446,9 @@ bool sdhci_generic_get_card_present(device_t brdev, struct sdhci_slot *slot); void sdhci_generic_set_uhs_timing(device_t brdev, struct sdhci_slot *slot); void sdhci_handle_card_present(struct sdhci_slot *slot, bool is_present); +#define SDHCI_VERSION 2 + +#define SDHCI_DEPEND(name) \ + MODULE_DEPEND(name, sdhci, SDHCI_VERSION, SDHCI_VERSION, SDHCI_VERSION); + #endif /* __SDHCI_H__ */ diff --git a/freebsd/sys/dev/usb/net/if_aue.c b/freebsd/sys/dev/usb/net/if_aue.c index 5454e2aa..d33176d9 100644 --- a/freebsd/sys/dev/usb/net/if_aue.c +++ b/freebsd/sys/dev/usb/net/if_aue.c @@ -94,6 +94,10 @@ __FBSDID("$FreeBSD$"); #include <net/if.h> #include <net/if_var.h> +#include <net/if_media.h> + +#include <dev/mii/mii.h> +#include <dev/mii/miivar.h> #include <dev/usb/usb.h> #include <dev/usb/usbdi.h> @@ -107,6 +111,8 @@ __FBSDID("$FreeBSD$"); #include <dev/usb/net/usb_ethernet.h> #include <dev/usb/net/if_auereg.h> +#include <rtems/bsd/local/miibus_if.h> + #ifdef USB_DEBUG static int aue_debug = 0; diff --git a/freebsd/sys/dev/usb/net/if_axe.c b/freebsd/sys/dev/usb/net/if_axe.c index cf54e96e..643b46ce 100644 --- a/freebsd/sys/dev/usb/net/if_axe.c +++ b/freebsd/sys/dev/usb/net/if_axe.c @@ -118,6 +118,8 @@ __FBSDID("$FreeBSD$"); #include <dev/usb/net/usb_ethernet.h> #include <dev/usb/net/if_axereg.h> +#include <rtems/bsd/local/miibus_if.h> + /* * AXE_178_MAX_FRAME_BURST * max frame burst size for Ax88178 and Ax88772 @@ -1149,7 +1151,7 @@ axe_rxeof(struct usb_ether *ue, struct usb_page_cache *pc, unsigned int offset, } } - _IF_ENQUEUE(&ue->ue_rxq, m); + (void)mbufq_enqueue(&ue->ue_rxq, m); return (0); } diff --git a/freebsd/sys/dev/usb/net/if_axge.c b/freebsd/sys/dev/usb/net/if_axge.c index f8ed34ae..564d5221 100644 --- a/freebsd/sys/dev/usb/net/if_axge.c +++ b/freebsd/sys/dev/usb/net/if_axge.c @@ -50,6 +50,10 @@ __FBSDID("$FreeBSD$"); #include <net/if.h> #include <net/if_var.h> +#include <net/if_media.h> + +#include <dev/mii/mii.h> +#include <dev/mii/miivar.h> #include <dev/usb/usb.h> #include <dev/usb/usbdi.h> @@ -63,6 +67,8 @@ __FBSDID("$FreeBSD$"); #include <dev/usb/net/usb_ethernet.h> #include <dev/usb/net/if_axgereg.h> +#include <rtems/bsd/local/miibus_if.h> + /* * Various supported device vendors/products. */ @@ -1037,7 +1043,7 @@ axge_rxeof(struct usb_ether *ue, struct usb_page_cache *pc, unsigned int offset, } if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); - _IF_ENQUEUE(&ue->ue_rxq, m); + (void)mbufq_enqueue(&ue->ue_rxq, m); } static void diff --git a/freebsd/sys/dev/usb/net/if_cdce.c b/freebsd/sys/dev/usb/net/if_cdce.c index 9a15da69..6abe6653 100644 --- a/freebsd/sys/dev/usb/net/if_cdce.c +++ b/freebsd/sys/dev/usb/net/if_cdce.c @@ -52,11 +52,11 @@ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); +#include <sys/gsb_crc32.h> +#include <sys/eventhandler.h> #include <sys/stdint.h> #include <sys/stddef.h> -#include <sys/param.h> #include <sys/queue.h> -#include <sys/types.h> #include <sys/systm.h> #include <sys/socket.h> #include <sys/kernel.h> @@ -278,6 +278,7 @@ static const STRUCT_USB_HOST_ID cdce_host_devs[] = { {USB_VPI(USB_VENDOR_SHARP, USB_PRODUCT_SHARP_SLA300, CDCE_FLAG_ZAURUS | CDCE_FLAG_NO_UNION)}, {USB_VPI(USB_VENDOR_SHARP, USB_PRODUCT_SHARP_SLC700, CDCE_FLAG_ZAURUS | CDCE_FLAG_NO_UNION)}, {USB_VPI(USB_VENDOR_SHARP, USB_PRODUCT_SHARP_SLC750, CDCE_FLAG_ZAURUS | CDCE_FLAG_NO_UNION)}, + {USB_VPI(USB_VENDOR_REALTEK, USB_PRODUCT_REALTEK_RTL8156, 0)}, {USB_VENDOR(USB_VENDOR_HUAWEI), USB_IFACE_CLASS(UICLASS_VENDOR), USB_IFACE_SUBCLASS(0x02), USB_IFACE_PROTOCOL(0x16), diff --git a/freebsd/sys/dev/usb/net/if_mos.c b/freebsd/sys/dev/usb/net/if_mos.c index 3dd0e5ef..3ab0e62e 100644 --- a/freebsd/sys/dev/usb/net/if_mos.c +++ b/freebsd/sys/dev/usb/net/if_mos.c @@ -122,6 +122,10 @@ __FBSDID("$FreeBSD$"); #include <net/if.h> #include <net/if_var.h> +#include <net/if_media.h> + +#include <dev/mii/mii.h> +#include <dev/mii/miivar.h> #include <dev/usb/usb.h> #include <dev/usb/usbdi.h> @@ -134,6 +138,8 @@ __FBSDID("$FreeBSD$"); #include <dev/usb/net/usb_ethernet.h> +#include <rtems/bsd/local/miibus_if.h> + //#include <dev/usb/net/if_mosreg.h> #include "if_mosreg.h" diff --git a/freebsd/sys/dev/usb/net/if_rue.c b/freebsd/sys/dev/usb/net/if_rue.c index 810a98c8..8d095c86 100644 --- a/freebsd/sys/dev/usb/net/if_rue.c +++ b/freebsd/sys/dev/usb/net/if_rue.c @@ -91,6 +91,10 @@ __FBSDID("$FreeBSD$"); #include <net/if.h> #include <net/if_var.h> +#include <net/if_media.h> + +#include <dev/mii/mii.h> +#include <dev/mii/miivar.h> #include <dev/usb/usb.h> #include <dev/usb/usbdi.h> @@ -104,6 +108,8 @@ __FBSDID("$FreeBSD$"); #include <dev/usb/net/usb_ethernet.h> #include <dev/usb/net/if_ruereg.h> +#include <rtems/bsd/local/miibus_if.h> + #ifdef USB_DEBUG static int rue_debug = 0; diff --git a/freebsd/sys/dev/usb/net/if_smsc.c b/freebsd/sys/dev/usb/net/if_smsc.c index 87e181d8..bbf28ad8 100644 --- a/freebsd/sys/dev/usb/net/if_smsc.c +++ b/freebsd/sys/dev/usb/net/if_smsc.c @@ -89,6 +89,10 @@ __FBSDID("$FreeBSD$"); #include <net/if.h> #include <net/if_var.h> +#include <net/if_media.h> + +#include <dev/mii/mii.h> +#include <dev/mii/miivar.h> #include <netinet/in.h> #include <netinet/ip.h> @@ -99,6 +103,7 @@ __FBSDID("$FreeBSD$"); #include <dev/fdt/fdt_common.h> #include <dev/ofw/ofw_bus.h> #include <dev/ofw/ofw_bus_subr.h> +#include <dev/usb/usb_fdt_support.h> #endif #include <dev/usb/usb.h> @@ -114,6 +119,8 @@ __FBSDID("$FreeBSD$"); #include <dev/usb/net/if_smscreg.h> +#include <rtems/bsd/local/miibus_if.h> + #ifdef USB_DEBUG static int smsc_debug = 0; @@ -1561,147 +1568,6 @@ smsc_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) return (rc); } -#ifdef FDT -/* - * This is FreeBSD-specific compatibility strings for RPi/RPi2 - */ -static phandle_t -smsc_fdt_find_eth_node(phandle_t start) -{ - phandle_t child, node; - - /* Traverse through entire tree to find usb ethernet nodes. */ - for (node = OF_child(start); node != 0; node = OF_peer(node)) { - if ((ofw_bus_node_is_compatible(node, "net,ethernet") && - ofw_bus_node_is_compatible(node, "usb,device")) || - ofw_bus_node_is_compatible(node, "usb424,ec00")) - return (node); - child = smsc_fdt_find_eth_node(node); - if (child != -1) - return (child); - } - - return (-1); -} - -/* - * Check if node's path is <*>/usb/hub/ethernet - */ -static int -smsc_fdt_is_usb_eth(phandle_t node) -{ - char name[16]; - int len; - - memset(name, 0, sizeof(name)); - len = OF_getprop(node, "name", name, sizeof(name)); - if (len <= 0) - return (0); - - if (strcmp(name, "ethernet")) - return (0); - - node = OF_parent(node); - if (node == -1) - return (0); - len = OF_getprop(node, "name", name, sizeof(name)); - if (len <= 0) - return (0); - - if (strcmp(name, "hub")) - return (0); - - node = OF_parent(node); - if (node == -1) - return (0); - len = OF_getprop(node, "name", name, sizeof(name)); - if (len <= 0) - return (0); - - if (strcmp(name, "usb")) - return (0); - - return (1); -} - -static phandle_t -smsc_fdt_find_eth_node_by_path(phandle_t start) -{ - phandle_t child, node; - - /* Traverse through entire tree to find usb ethernet nodes. */ - for (node = OF_child(start); node != 0; node = OF_peer(node)) { - if (smsc_fdt_is_usb_eth(node)) - return (node); - child = smsc_fdt_find_eth_node_by_path(node); - if (child != -1) - return (child); - } - - return (-1); -} - -/* - * Look through known names that can contain mac address - * return 0 if valid MAC address has been found - */ -static int -smsc_fdt_read_mac_property(phandle_t node, unsigned char *mac) -{ - int len; - - /* Check if there is property */ - if ((len = OF_getproplen(node, "local-mac-address")) > 0) { - if (len != ETHER_ADDR_LEN) - return (EINVAL); - - OF_getprop(node, "local-mac-address", mac, - ETHER_ADDR_LEN); - return (0); - } - - if ((len = OF_getproplen(node, "mac-address")) > 0) { - if (len != ETHER_ADDR_LEN) - return (EINVAL); - - OF_getprop(node, "mac-address", mac, - ETHER_ADDR_LEN); - return (0); - } - - return (ENXIO); -} - -/** - * Get MAC address from FDT blob. Firmware or loader should fill - * mac-address or local-mac-address property. Returns 0 if MAC address - * obtained, error code otherwise. - */ -static int -smsc_fdt_find_mac(unsigned char *mac) -{ - phandle_t node, root; - - root = OF_finddevice("/"); - node = smsc_fdt_find_eth_node(root); - if (node != -1) { - if (smsc_fdt_read_mac_property(node, mac) == 0) - return (0); - } - - /* - * If it's not FreeBSD FDT blob for RPi, try more - * generic .../usb/hub/ethernet - */ - node = smsc_fdt_find_eth_node_by_path(root); - - if (node != -1) - return smsc_fdt_read_mac_property(node, mac); - - return (ENXIO); -} -#endif - /** * smsc_attach_post - Called after the driver attached to the USB interface * @ue: the USB ethernet device @@ -1750,7 +1616,7 @@ smsc_attach_post(struct usb_ether *ue) err = smsc_eeprom_read(sc, 0x01, sc->sc_ue.ue_eaddr, ETHER_ADDR_LEN); #ifdef FDT if ((err != 0) || (!ETHER_IS_VALID(sc->sc_ue.ue_eaddr))) - err = smsc_fdt_find_mac(sc->sc_ue.ue_eaddr); + err = usb_fdt_get_mac_addr(sc->sc_ue.ue_dev, &sc->sc_ue); #endif if ((err != 0) || (!ETHER_IS_VALID(sc->sc_ue.ue_eaddr))) { read_random(sc->sc_ue.ue_eaddr, ETHER_ADDR_LEN); diff --git a/freebsd/sys/dev/usb/net/if_udav.c b/freebsd/sys/dev/usb/net/if_udav.c index a4e683ac..8a2ad81f 100644 --- a/freebsd/sys/dev/usb/net/if_udav.c +++ b/freebsd/sys/dev/usb/net/if_udav.c @@ -72,12 +72,18 @@ __FBSDID("$FreeBSD$"); #include <net/if.h> #include <net/if_var.h> +#include <net/if_media.h> + +#include <dev/mii/mii.h> +#include <dev/mii/miivar.h> #include <dev/usb/usb.h> #include <dev/usb/usbdi.h> #include <dev/usb/usbdi_util.h> #include <rtems/bsd/local/usbdevs.h> +#include <rtems/bsd/local/miibus_if.h> + #define USB_DEBUG_VAR udav_debug #include <dev/usb/usb_debug.h> #include <dev/usb/usb_process.h> diff --git a/freebsd/sys/dev/usb/net/if_ure.c b/freebsd/sys/dev/usb/net/if_ure.c index 136b61f9..0130a7bb 100644 --- a/freebsd/sys/dev/usb/net/if_ure.c +++ b/freebsd/sys/dev/usb/net/if_ure.c @@ -43,6 +43,10 @@ __FBSDID("$FreeBSD$"); #include <net/if.h> #include <net/if_var.h> +#include <net/if_media.h> + +#include <dev/mii/mii.h> +#include <dev/mii/miivar.h> #include <dev/usb/usb.h> #include <dev/usb/usbdi.h> @@ -56,6 +60,8 @@ __FBSDID("$FreeBSD$"); #include <dev/usb/net/usb_ethernet.h> #include <dev/usb/net/if_urereg.h> +#include <rtems/bsd/local/miibus_if.h> + #ifdef USB_DEBUG static int ure_debug = 0; @@ -64,6 +70,9 @@ SYSCTL_INT(_hw_usb_ure, OID_AUTO, debug, CTLFLAG_RWTUN, &ure_debug, 0, "Debug level"); #endif +#define ETHER_IS_ZERO(addr) \ + (!(addr[0] | addr[1] | addr[2] | addr[3] | addr[4] | addr[5])) + /* * Various supported device vendors/products. */ @@ -71,6 +80,7 @@ static const STRUCT_USB_HOST_ID ure_devs[] = { #define URE_DEV(v,p,i) { USB_VPI(USB_VENDOR_##v, USB_PRODUCT_##v##_##p, i) } URE_DEV(LENOVO, RTL8153, 0), URE_DEV(LENOVO, TBT3LAN, 0), + URE_DEV(LENOVO, ONELINK, 0), URE_DEV(LENOVO, USBCLAN, 0), URE_DEV(NVIDIA, RTL8153, 0), URE_DEV(REALTEK, RTL8152, URE_FLAG_8152), @@ -674,12 +684,20 @@ ure_attach_post(struct usb_ether *ue) else ure_rtl8153_init(sc); - if (sc->sc_chip & URE_CHIP_VER_4C00) + if ((sc->sc_chip & URE_CHIP_VER_4C00) || + (sc->sc_chip & URE_CHIP_VER_4C10)) ure_read_mem(sc, URE_PLA_IDR, URE_MCU_TYPE_PLA, ue->ue_eaddr, 8); else ure_read_mem(sc, URE_PLA_BACKUP, URE_MCU_TYPE_PLA, ue->ue_eaddr, 8); + + if (ETHER_IS_ZERO(sc->sc_ue.ue_eaddr)) { + device_printf(sc->sc_ue.ue_dev, "MAC assigned randomly\n"); + arc4rand(sc->sc_ue.ue_eaddr, ETHER_ADDR_LEN, 0); + sc->sc_ue.ue_eaddr[0] &= ~0x01; /* unicast */ + sc->sc_ue.ue_eaddr[0] |= 0x02; /* locally administered */ + } } static int @@ -725,8 +743,10 @@ ure_init(struct usb_ether *ue) ure_reset(sc); /* Set MAC address. */ + ure_write_1(sc, URE_PLA_CRWECR, URE_MCU_TYPE_PLA, URE_CRWECR_CONFIG); ure_write_mem(sc, URE_PLA_IDR, URE_MCU_TYPE_PLA | URE_BYTE_EN_SIX_BYTES, IF_LLADDR(ifp), 8); + ure_write_1(sc, URE_PLA_CRWECR, URE_MCU_TYPE_PLA, URE_CRWECR_NORAML); /* Reset the packet filter. */ ure_write_2(sc, URE_PLA_FMC, URE_MCU_TYPE_PLA, diff --git a/freebsd/sys/dev/usb/net/if_urereg.h b/freebsd/sys/dev/usb/net/if_urereg.h index 8eff1c25..cc70093f 100644 --- a/freebsd/sys/dev/usb/net/if_urereg.h +++ b/freebsd/sys/dev/usb/net/if_urereg.h @@ -176,7 +176,7 @@ #define URE_EEEP_CR_EEEP_TX 0x0002 /* PLA_WDT6_CTRL */ -#define URE_WDT6_SET_MODE 0x001 +#define URE_WDT6_SET_MODE 0x0010 /* PLA_TCR0 */ #define URE_TCR0_TX_EMPTY 0x0800 diff --git a/freebsd/sys/dev/usb/net/usb_ethernet.c b/freebsd/sys/dev/usb/net/usb_ethernet.c index 9ce60eff..2bbcdedc 100644 --- a/freebsd/sys/dev/usb/net/usb_ethernet.c +++ b/freebsd/sys/dev/usb/net/usb_ethernet.c @@ -221,6 +221,7 @@ ue_attach_post_task(struct usb_proc_msg *_task) ue->ue_unit = alloc_unr(ueunit); usb_callout_init_mtx(&ue->ue_watchdog, ue->ue_mtx, 0); sysctl_ctx_init(&ue->ue_sysctl_ctx); + mbufq_init(&ue->ue_rxq, 0 /* unlimited length */); error = 0; CURVNET_SET_QUIET(vnet0); @@ -286,6 +287,11 @@ ue_attach_post_task(struct usb_proc_msg *_task) fail: CURVNET_RESTORE(); + + /* drain mbuf queue */ + mbufq_drain(&ue->ue_rxq); + + /* free unit */ free_unr(ueunit, ue->ue_unit); if (ue->ue_ifp != NULL) { if_free(ue->ue_ifp); @@ -332,6 +338,9 @@ uether_ifdetach(struct usb_ether *ue) /* free sysctl */ sysctl_ctx_free(&ue->ue_sysctl_ctx); + /* drain mbuf queue */ + mbufq_drain(&ue->ue_rxq); + /* free unit */ free_unr(ueunit, ue->ue_unit); } @@ -600,7 +609,7 @@ uether_rxmbuf(struct usb_ether *ue, struct mbuf *m, m->m_pkthdr.len = m->m_len = len; /* enqueue for later when the lock can be released */ - _IF_ENQUEUE(&ue->ue_rxq, m); + (void)mbufq_enqueue(&ue->ue_rxq, m); return (0); } @@ -630,7 +639,7 @@ uether_rxbuf(struct usb_ether *ue, struct usb_page_cache *pc, m->m_pkthdr.len = m->m_len = len; /* enqueue for later when the lock can be released */ - _IF_ENQUEUE(&ue->ue_rxq, m); + (void)mbufq_enqueue(&ue->ue_rxq, m); return (0); } @@ -643,7 +652,7 @@ uether_rxflush(struct usb_ether *ue) UE_LOCK_ASSERT(ue, MA_OWNED); for (;;) { - _IF_DEQUEUE(&ue->ue_rxq, m); + m = mbufq_dequeue(&ue->ue_rxq); if (m == NULL) break; diff --git a/freebsd/sys/dev/usb/net/usb_ethernet.h b/freebsd/sys/dev/usb/net/usb_ethernet.h index 9839db16..87886559 100644 --- a/freebsd/sys/dev/usb/net/usb_ethernet.h +++ b/freebsd/sys/dev/usb/net/usb_ethernet.h @@ -48,11 +48,7 @@ #include <net/bpf.h> #include <net/ethernet.h> -#include <rtems/bsd/local/miibus_if.h> - -#include <dev/mii/mii.h> -#include <dev/mii/miivar.h> - +struct mii_data; struct usb_ether; struct usb_device_request; @@ -91,7 +87,7 @@ struct usb_ether { struct usb_process ue_tq; struct sysctl_ctx_list ue_sysctl_ctx; - struct ifqueue ue_rxq; + struct mbufq ue_rxq; struct usb_callout ue_watchdog; struct usb_ether_cfg_task ue_sync_task[2]; struct usb_ether_cfg_task ue_media_task[2]; diff --git a/freebsd/sys/dev/usb/quirk/usb_quirk.c b/freebsd/sys/dev/usb/quirk/usb_quirk.c index 2aea57c8..6c1589e9 100644 --- a/freebsd/sys/dev/usb/quirk/usb_quirk.c +++ b/freebsd/sys/dev/usb/quirk/usb_quirk.c @@ -98,10 +98,12 @@ static struct usb_quirk_entry usb_quirks[USB_DEV_QUIRKS_MAX] = { USB_QUIRK(TELEX, MIC1, 0x009, 0x009, UQ_AU_NO_FRAC), USB_QUIRK(SILICONPORTALS, YAPPHONE, 0x100, 0x100, UQ_AU_INP_ASYNC), USB_QUIRK(LOGITECH, UN53B, 0x0000, 0xffff, UQ_NO_STRINGS), + USB_QUIRK(LOGITECH, G510S, 0x0000, 0xFFFF, UQ_KBD_BOOTPROTO), USB_QUIRK(REALTEK, RTL8196EU, 0x0000, 0xffff, UQ_CFG_INDEX_1), USB_QUIRK(ELSA, MODEM1, 0x0000, 0xffff, UQ_CFG_INDEX_1), USB_QUIRK(PLANEX2, MZKUE150N, 0x0000, 0xffff, UQ_CFG_INDEX_1), USB_QUIRK(CISCOLINKSYS, USB3GIGV1, 0x0000, 0xffff, UQ_CFG_INDEX_1), + USB_QUIRK(REALTEK, RTL8156, 0x0000, 0xffff, UQ_CFG_INDEX_2), /* Quirks for printer devices */ USB_QUIRK(HP, 895C, 0x0000, 0xffff, UQ_BROKEN_BIDIR), USB_QUIRK(HP, 880C, 0x0000, 0xffff, UQ_BROKEN_BIDIR), @@ -112,8 +114,19 @@ static struct usb_quirk_entry usb_quirks[USB_DEV_QUIRKS_MAX] = { USB_QUIRK(XEROX, WCM15, 0x0000, 0xffff, UQ_BROKEN_BIDIR), /* Devices which should be ignored by uhid */ USB_QUIRK(APC, UPS, 0x0000, 0xffff, UQ_HID_IGNORE), + USB_QUIRK(BELKIN, F6H375USB, 0x0000, 0xffff, UQ_HID_IGNORE), USB_QUIRK(BELKIN, F6C550AVR, 0x0000, 0xffff, UQ_HID_IGNORE), + USB_QUIRK(BELKIN, F6C1250TWRK, 0x0000, 0xffff, UQ_HID_IGNORE), + USB_QUIRK(BELKIN, F6C1500TWRK, 0x0000, 0xffff, UQ_HID_IGNORE), + USB_QUIRK(BELKIN, F6C900UNV, 0x0000, 0xffff, UQ_HID_IGNORE), + USB_QUIRK(BELKIN, F6C100UNV, 0x0000, 0xffff, UQ_HID_IGNORE), + USB_QUIRK(BELKIN, F6C120UNV, 0x0000, 0xffff, UQ_HID_IGNORE), + USB_QUIRK(BELKIN, F6C800UNV, 0x0000, 0xffff, UQ_HID_IGNORE), + USB_QUIRK(BELKIN, F6C1100UNV, 0x0000, 0xffff, UQ_HID_IGNORE), + USB_QUIRK(CYBERPOWER, BC900D, 0x0000, 0xffff, UQ_HID_IGNORE), USB_QUIRK(CYBERPOWER, 1500CAVRLCD, 0x0000, 0xffff, UQ_HID_IGNORE), + USB_QUIRK(CYBERPOWER, OR2200LCDRM2U, 0x0000, 0xffff, UQ_HID_IGNORE), + USB_QUIRK(DELL2, VARIOUS_UPS, 0x0000, 0xffff, UQ_HID_IGNORE), USB_QUIRK(CYPRESS, SILVERSHIELD, 0x0000, 0xffff, UQ_HID_IGNORE), USB_QUIRK(DELORME, EARTHMATE, 0x0000, 0xffff, UQ_HID_IGNORE), USB_QUIRK(DREAMLINK, DL100B, 0x0000, 0xffff, UQ_HID_IGNORE), @@ -121,8 +134,26 @@ static struct usb_quirk_entry usb_quirks[USB_DEV_QUIRKS_MAX] = { USB_QUIRK(ITUNERNET, USBLCD4X20, 0x0000, 0xffff, UQ_HID_IGNORE), USB_QUIRK(LIEBERT, POWERSURE_PXT, 0x0000, 0xffff, UQ_HID_IGNORE), USB_QUIRK(LIEBERT2, PSI1000, 0x0000, 0xffff, UQ_HID_IGNORE), + USB_QUIRK(LIEBERT2, POWERSURE_PSA, 0x0000, 0xffff, UQ_HID_IGNORE), USB_QUIRK(MGE, UPS1, 0x0000, 0xffff, UQ_HID_IGNORE), USB_QUIRK(MGE, UPS2, 0x0000, 0xffff, UQ_HID_IGNORE), + USB_QUIRK(POWERCOM, IMPERIAL_SERIES, 0x0000, 0xffff, UQ_HID_IGNORE), + USB_QUIRK(POWERCOM, SMART_KING_PRO, 0x0000, 0xffff, UQ_HID_IGNORE), + USB_QUIRK(POWERCOM, WOW, 0x0000, 0xffff, UQ_HID_IGNORE), + USB_QUIRK(POWERCOM, VANGUARD, 0x0000, 0xffff, UQ_HID_IGNORE), + USB_QUIRK(POWERCOM, BLACK_KNIGHT_PRO, 0x0000, 0xffff, UQ_HID_IGNORE), + USB_QUIRK(TRIPPLITE2, AVR550U, 0x0000, 0xffff, UQ_HID_IGNORE), + USB_QUIRK(TRIPPLITE2, AVR750U, 0x0000, 0xffff, UQ_HID_IGNORE), + USB_QUIRK(TRIPPLITE2, ECO550UPS, 0x0000, 0xffff, UQ_HID_IGNORE), + USB_QUIRK(TRIPPLITE2, T750_INTL, 0x0000, 0xffff, UQ_HID_IGNORE), + USB_QUIRK(TRIPPLITE2, RT_2200_INTL, 0x0000, 0xffff, UQ_HID_IGNORE), + USB_QUIRK(TRIPPLITE2, OMNI1000LCD, 0x0000, 0xffff, UQ_HID_IGNORE), + USB_QUIRK(TRIPPLITE2, OMNI900LCD, 0x0000, 0xffff, UQ_HID_IGNORE), + USB_QUIRK(TRIPPLITE2, SMART_2200RMXL2U, 0x0000, 0xffff, UQ_HID_IGNORE), + USB_QUIRK(TRIPPLITE2, UPS_3014, 0x0000, 0xffff, UQ_HID_IGNORE), + USB_QUIRK(TRIPPLITE2, SU1500RTXL2UA, 0x0000, 0xffff, UQ_HID_IGNORE), + USB_QUIRK(TRIPPLITE2, SU6000RT4U, 0x0000, 0xffff, UQ_HID_IGNORE), + USB_QUIRK(TRIPPLITE2, SU1500RTXL2UA_2, 0x0000, 0xffff, UQ_HID_IGNORE), USB_QUIRK(APPLE, IPHONE, 0x0000, 0xffff, UQ_HID_IGNORE), USB_QUIRK(APPLE, IPHONE_3G, 0x0000, 0xffff, UQ_HID_IGNORE), USB_QUIRK(MEGATEC, UPS, 0x0000, 0xffff, UQ_HID_IGNORE), @@ -137,12 +168,15 @@ static struct usb_quirk_entry usb_quirks[USB_DEV_QUIRKS_MAX] = { USB_QUIRK(MICROSOFT, WLINTELLIMOUSE, 0x0000, 0xffff, UQ_MS_LEADING_BYTE), /* Quirk for Corsair Vengeance K60 keyboard */ USB_QUIRK(CORSAIR, K60, 0x0000, 0xffff, UQ_KBD_BOOTPROTO), + /* Quirk for Corsair Gaming K68 keyboard */ + USB_QUIRK(CORSAIR, K68, 0x0000, 0xffff, UQ_KBD_BOOTPROTO), /* Quirk for Corsair Vengeance K70 keyboard */ USB_QUIRK(CORSAIR, K70, 0x0000, 0xffff, UQ_KBD_BOOTPROTO), /* Quirk for Corsair K70 RGB keyboard */ USB_QUIRK(CORSAIR, K70_RGB, 0x0000, 0xffff, UQ_KBD_BOOTPROTO), /* Quirk for Corsair STRAFE Gaming keyboard */ USB_QUIRK(CORSAIR, STRAFE, 0x0000, 0xffff, UQ_KBD_BOOTPROTO), + USB_QUIRK(CORSAIR, STRAFE2, 0x0000, 0xffff, UQ_KBD_BOOTPROTO), /* umodem(4) device quirks */ USB_QUIRK(METRICOM, RICOCHET_GS, 0x100, 0x100, UQ_ASSUME_CM_OVER_DATA), USB_QUIRK(SANYO, SCP4900, 0x000, 0x000, UQ_ASSUME_CM_OVER_DATA), @@ -188,6 +222,8 @@ static struct usb_quirk_entry usb_quirks[USB_DEV_QUIRKS_MAX] = { USB_QUIRK(CENTURY, EX35QUAT, 0x0000, 0xffff, UQ_MSC_FORCE_WIRE_BBB, UQ_MSC_FORCE_PROTO_SCSI, UQ_MSC_FORCE_SHORT_INQ, UQ_MSC_NO_START_STOP, UQ_MSC_IGNORE_RESIDUE), + USB_QUIRK(CREATIVE, NOMAD, 0x0001, 0xffff, UQ_MSC_FORCE_WIRE_BBB, + UQ_MSC_FORCE_PROTO_SCSI, UQ_MSC_READ_CAP_OFFBY1), USB_QUIRK(CYPRESS, XX6830XX, 0x0000, 0xffff, UQ_MSC_NO_GETMAXLUN, UQ_MSC_NO_SYNC_CACHE), USB_QUIRK(DESKNOTE, UCR_61S2B, 0x0000, 0xffff, UQ_MSC_FORCE_WIRE_BBB, @@ -240,7 +276,7 @@ static struct usb_quirk_entry usb_quirks[USB_DEV_QUIRKS_MAX] = { UQ_MSC_FORCE_PROTO_RBC), USB_QUIRK(INSYSTEM, STORAGE_V2, 0x0000, 0xffff, UQ_MSC_FORCE_WIRE_CBI, UQ_MSC_FORCE_PROTO_RBC), - USB_QUIRK(INTENSO, MEMORY_BOX, 0x0000, 0xffff, UQ_MSC_NO_INQUIRY), + USB_QUIRK(VIALABS, VL701, 0x0000, 0xffff, UQ_MSC_NO_INQUIRY), USB_QUIRK(IODATA, IU_CD2, 0x0000, 0xffff, UQ_MSC_FORCE_WIRE_BBB, UQ_MSC_FORCE_PROTO_SCSI), USB_QUIRK(IODATA, DVR_UEH8, 0x0000, 0xffff, UQ_MSC_FORCE_WIRE_BBB, @@ -248,6 +284,7 @@ static struct usb_quirk_entry usb_quirks[USB_DEV_QUIRKS_MAX] = { USB_QUIRK(IOMEGA, ZIP100, 0x0000, 0xffff, UQ_MSC_FORCE_WIRE_BBB, UQ_MSC_FORCE_PROTO_SCSI, UQ_MSC_NO_TEST_UNIT_READY), /* XXX ZIP drives can also use ATAPI */ + USB_QUIRK(JMICRON, JMS566, 0x0000, 0xffff, UQ_MSC_NO_GETMAXLUN), USB_QUIRK(JMICRON, JMS567, 0x0000, 0xffff, UQ_MSC_NO_GETMAXLUN), USB_QUIRK(JMICRON, JM20337, 0x0000, 0xffff, UQ_MSC_FORCE_WIRE_BBB, UQ_MSC_FORCE_PROTO_SCSI, @@ -362,6 +399,8 @@ static struct usb_quirk_entry usb_quirks[USB_DEV_QUIRKS_MAX] = { UQ_MSC_FORCE_PROTO_SCSI, UQ_MSC_IGNORE_RESIDUE), USB_QUIRK(SANDISK, SDCZ4_256, 0x0000, 0xffff, UQ_MSC_FORCE_WIRE_BBB, UQ_MSC_FORCE_PROTO_SCSI, UQ_MSC_IGNORE_RESIDUE), + USB_QUIRK(SANDISK, SDCZ48_32, 0x0000, 0xffff, UQ_MSC_NO_SYNC_CACHE, + UQ_MSC_NO_TEST_UNIT_READY), USB_QUIRK(SANDISK, SDDR31, 0x0000, 0xffff, UQ_MSC_FORCE_WIRE_BBB, UQ_MSC_FORCE_PROTO_SCSI, UQ_MSC_READ_CAP_OFFBY1), USB_QUIRK(SANDISK, IMAGEMATE_SDDR289, 0x0000, 0xffff, @@ -498,6 +537,7 @@ static struct usb_quirk_entry usb_quirks[USB_DEV_QUIRKS_MAX] = { USB_QUIRK(VIALABS, USB30SATABRIDGE, 0x0000, 0xffff, UQ_MSC_NO_SYNC_CACHE), USB_QUIRK(QUALCOMMINC, ZTE_MF730M, 0x0000, 0xffff, UQ_MSC_NO_GETMAXLUN, UQ_MSC_NO_INQUIRY, UQ_CFG_INDEX_0), + USB_QUIRK(SMART2, G2MEMKEY, 0x0000, 0xffff, UQ_MSC_NO_INQUIRY), /* Non-standard USB MIDI devices */ USB_QUIRK(ROLAND, UM1, 0x0000, 0xffff, UQ_AU_VENDOR_CLASS), USB_QUIRK(ROLAND, SC8850, 0x0000, 0xffff, UQ_AU_VENDOR_CLASS), @@ -528,6 +568,8 @@ static struct usb_quirk_entry usb_quirks[USB_DEV_QUIRKS_MAX] = { USB_QUIRK(MAUDIO, FASTTRACKULTRA, 0x0000, 0xffff, UQ_AU_VENDOR_CLASS), USB_QUIRK(MAUDIO, FASTTRACKULTRA8R, 0x0000, 0xffff, UQ_AU_VENDOR_CLASS), USB_QUIRK(CMEDIA, CM6206, 0x0000, 0xffff, UQ_AU_SET_SPDIF_CM6206), + USB_QUIRK(PLOYTEC, SPL_CRIMSON_1, 0x0000, 0xffff, UQ_CFG_INDEX_1), + USB_QUIRK(ROLAND, UA25EX_AD, 0x0000, 0xffff, UQ_AU_VENDOR_CLASS), /* * Quirks for manufacturers which USB devices does not respond diff --git a/freebsd/sys/dev/usb/serial/u3g.c b/freebsd/sys/dev/usb/serial/u3g.c index 8d72ef49..dd82a2b2 100644 --- a/freebsd/sys/dev/usb/serial/u3g.c +++ b/freebsd/sys/dev/usb/serial/u3g.c @@ -33,11 +33,11 @@ */ +#include <sys/param.h> +#include <sys/eventhandler.h> #include <sys/stdint.h> #include <sys/stddef.h> -#include <sys/param.h> #include <sys/queue.h> -#include <sys/types.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/bus.h> diff --git a/freebsd/sys/dev/usb/serial/ugensa.c b/freebsd/sys/dev/usb/serial/ugensa.c index 2737227c..10a3ea33 100644 --- a/freebsd/sys/dev/usb/serial/ugensa.c +++ b/freebsd/sys/dev/usb/serial/ugensa.c @@ -72,6 +72,7 @@ #define UGENSA_CONFIG_INDEX 0 #define UGENSA_IFACE_INDEX 0 #define UGENSA_IFACE_MAX 8 /* exclusivly */ +#define UGENSA_PORT_MAX 8 /* exclusivly */ enum { UGENSA_BULK_DT_WR, @@ -86,11 +87,11 @@ struct ugensa_sub_softc { struct ugensa_softc { struct ucom_super_softc sc_super_ucom; - struct ucom_softc sc_ucom[UGENSA_IFACE_MAX]; - struct ugensa_sub_softc sc_sub[UGENSA_IFACE_MAX]; + struct ucom_softc sc_ucom[UGENSA_PORT_MAX]; + struct ugensa_sub_softc sc_sub[UGENSA_PORT_MAX]; struct mtx sc_mtx; - uint8_t sc_niface; + uint8_t sc_nports; }; /* prototypes */ @@ -156,12 +157,13 @@ static driver_t ugensa_driver = { .size = sizeof(struct ugensa_softc), }; +/* Driver-info is max number of serial ports per interface */ static const STRUCT_USB_HOST_ID ugensa_devs[] = { - {USB_VPI(USB_VENDOR_AIRPRIME, USB_PRODUCT_AIRPRIME_PC5220, 0)}, - {USB_VPI(USB_VENDOR_CMOTECH, USB_PRODUCT_CMOTECH_CDMA_MODEM1, 0)}, - {USB_VPI(USB_VENDOR_KYOCERA2, USB_PRODUCT_KYOCERA2_CDMA_MSM_K, 0)}, - {USB_VPI(USB_VENDOR_HP, USB_PRODUCT_HP_49GPLUS, 0)}, - {USB_VPI(USB_VENDOR_NOVATEL2, USB_PRODUCT_NOVATEL2_FLEXPACKGPS, 0)}, + {USB_VPI(USB_VENDOR_AIRPRIME, USB_PRODUCT_AIRPRIME_PC5220, 1)}, + {USB_VPI(USB_VENDOR_CMOTECH, USB_PRODUCT_CMOTECH_CDMA_MODEM1, 1)}, + {USB_VPI(USB_VENDOR_KYOCERA2, USB_PRODUCT_KYOCERA2_CDMA_MSM_K, 1)}, + {USB_VPI(USB_VENDOR_HP, USB_PRODUCT_HP_49GPLUS, 1)}, + {USB_VPI(USB_VENDOR_NOVATEL2, USB_PRODUCT_NOVATEL2_FLEXPACKGPS, 3)}, }; DRIVER_MODULE(ugensa, uhub, ugensa_driver, ugensa_devclass, NULL, 0); @@ -194,65 +196,68 @@ ugensa_attach(device_t dev) struct ugensa_softc *sc = device_get_softc(dev); struct ugensa_sub_softc *ssc; struct usb_interface *iface; + struct usb_config xfer_config[UGENSA_N_TRANSFER]; int32_t error; uint8_t iface_index; - int x, cnt; + int x, maxports; + maxports = USB_GET_DRIVER_INFO(uaa); device_set_usb_desc(dev); mtx_init(&sc->sc_mtx, "ugensa", NULL, MTX_DEF); ucom_ref(&sc->sc_super_ucom); - /* Figure out how many interfaces this device has got */ - for (cnt = 0; cnt < UGENSA_IFACE_MAX; cnt++) { - if ((usbd_get_endpoint(uaa->device, cnt, ugensa_xfer_config + 0) == NULL) || - (usbd_get_endpoint(uaa->device, cnt, ugensa_xfer_config + 1) == NULL)) { - /* we have reached the end */ - break; - } - } + for (iface_index = UGENSA_IFACE_INDEX; iface_index < UGENSA_IFACE_MAX; iface_index++) { - if (cnt == 0) { - device_printf(dev, "No interfaces\n"); - goto detach; - } - for (x = 0; x < cnt; x++) { - iface = usbd_get_iface(uaa->device, x); - if (iface->idesc->bInterfaceClass != UICLASS_VENDOR) + iface = usbd_get_iface(uaa->device, iface_index); + if (iface == NULL || iface->idesc->bInterfaceClass != UICLASS_VENDOR) /* Not a serial port, most likely a SD reader */ continue; - ssc = sc->sc_sub + sc->sc_niface; - ssc->sc_ucom_ptr = sc->sc_ucom + sc->sc_niface; - - iface_index = (UGENSA_IFACE_INDEX + x); - error = usbd_transfer_setup(uaa->device, - &iface_index, ssc->sc_xfer, ugensa_xfer_config, - UGENSA_N_TRANSFER, ssc, &sc->sc_mtx); - - if (error) { - device_printf(dev, "allocating USB " - "transfers failed\n"); - goto detach; + /* Loop over all endpoints pairwise */ + for (x = 0; x < maxports && sc->sc_nports < UGENSA_PORT_MAX; x++) { + + ssc = sc->sc_sub + sc->sc_nports; + ssc->sc_ucom_ptr = sc->sc_ucom + sc->sc_nports; + + memcpy(xfer_config, ugensa_xfer_config, sizeof ugensa_xfer_config); + xfer_config[UGENSA_BULK_DT_RD].ep_index = x; + xfer_config[UGENSA_BULK_DT_WR].ep_index = x; + + error = usbd_transfer_setup(uaa->device, + &iface_index, ssc->sc_xfer, xfer_config, + UGENSA_N_TRANSFER, ssc, &sc->sc_mtx); + + if (error) { + if (x == 0) { + device_printf(dev, "allocating USB " + "transfers failed (%d)\n", error); + goto detach; + } + break; + } + + /* clear stall at first run */ + mtx_lock(&sc->sc_mtx); + usbd_xfer_set_stall(ssc->sc_xfer[UGENSA_BULK_DT_WR]); + usbd_xfer_set_stall(ssc->sc_xfer[UGENSA_BULK_DT_RD]); + mtx_unlock(&sc->sc_mtx); + + /* initialize port number */ + ssc->sc_ucom_ptr->sc_portno = sc->sc_nports; + if (iface_index != uaa->info.bIfaceIndex) { + usbd_set_parent_iface(uaa->device, iface_index, + uaa->info.bIfaceIndex); + } + sc->sc_nports++; } - /* clear stall at first run */ - mtx_lock(&sc->sc_mtx); - usbd_xfer_set_stall(ssc->sc_xfer[UGENSA_BULK_DT_WR]); - usbd_xfer_set_stall(ssc->sc_xfer[UGENSA_BULK_DT_RD]); - mtx_unlock(&sc->sc_mtx); - - /* initialize port number */ - ssc->sc_ucom_ptr->sc_portno = sc->sc_niface; - sc->sc_niface++; - if (x != uaa->info.bIfaceIndex) - usbd_set_parent_iface(uaa->device, x, - uaa->info.bIfaceIndex); } - device_printf(dev, "Found %d interfaces.\n", sc->sc_niface); + device_printf(dev, "Found %d serial ports.\n", sc->sc_nports); - error = ucom_attach(&sc->sc_super_ucom, sc->sc_ucom, sc->sc_niface, sc, + error = ucom_attach(&sc->sc_super_ucom, sc->sc_ucom, sc->sc_nports, sc, &ugensa_callback, &sc->sc_mtx); + if (error) { - DPRINTF("attach failed\n"); + DPRINTF("ucom attach failed\n"); goto detach; } ucom_set_pnpinfo_usb(&sc->sc_super_ucom, dev); @@ -272,7 +277,7 @@ ugensa_detach(device_t dev) ucom_detach(&sc->sc_super_ucom, sc->sc_ucom); - for (x = 0; x < sc->sc_niface; x++) { + for (x = 0; x < sc->sc_nports; x++) { usbd_transfer_unsetup(sc->sc_sub[x].sc_xfer, UGENSA_N_TRANSFER); } diff --git a/freebsd/sys/dev/usb/serial/umcs.c b/freebsd/sys/dev/usb/serial/umcs.c index 8f083ce3..3a5fc70f 100644 --- a/freebsd/sys/dev/usb/serial/umcs.c +++ b/freebsd/sys/dev/usb/serial/umcs.c @@ -501,7 +501,9 @@ umcs7840_cfg_open(struct ucom_softc *ucom) * Enable DTR/RTS on modem control, enable modem interrupts -- * documented */ - sc->sc_ports[pn].sc_mcr = MCS7840_UART_MCR_DTR | MCS7840_UART_MCR_RTS | MCS7840_UART_MCR_IE; + sc->sc_ports[pn].sc_mcr = MCS7840_UART_MCR_IE; + if (ucom->sc_tty == NULL || (ucom->sc_tty->t_termios.c_cflag & CNO_RTSDTR) == 0) + sc->sc_ports[pn].sc_mcr |= MCS7840_UART_MCR_DTR | MCS7840_UART_MCR_RTS; if (umcs7840_set_UART_reg_sync(sc, pn, MCS7840_UART_REG_MCR, sc->sc_ports[pn].sc_mcr)) return; diff --git a/freebsd/sys/dev/usb/serial/usb_serial.c b/freebsd/sys/dev/usb/serial/usb_serial.c index a3f9b5de..c649056a 100644 --- a/freebsd/sys/dev/usb/serial/usb_serial.c +++ b/freebsd/sys/dev/usb/serial/usb_serial.c @@ -810,7 +810,8 @@ ucom_open(struct tty *tp) &sc->sc_start_task[0].hdr, &sc->sc_start_task[1].hdr); - ucom_modem(tp, SER_DTR | SER_RTS, 0); + if (sc->sc_tty == NULL || (sc->sc_tty->t_termios.c_cflag & CNO_RTSDTR) == 0) + ucom_modem(tp, SER_DTR | SER_RTS, 0); ucom_ring(sc, 0); diff --git a/freebsd/sys/dev/usb/usb.h b/freebsd/sys/dev/usb/usb.h index ff33cf00..0075d429 100644 --- a/freebsd/sys/dev/usb/usb.h +++ b/freebsd/sys/dev/usb/usb.h @@ -444,6 +444,7 @@ typedef struct usb_interface_assoc_descriptor usb_interface_assoc_descriptor_t; #define UIPROTO_CDC_NONE 0 #define UIPROTO_CDC_AT 1 +#define UIPROTO_CDC_EEM 7 #define UICLASS_HID 0x03 #define UISUBCLASS_BOOT 1 diff --git a/freebsd/sys/dev/usb/usb_bus.h b/freebsd/sys/dev/usb/usb_bus.h index 710436c1..9f8586e6 100644 --- a/freebsd/sys/dev/usb/usb_bus.h +++ b/freebsd/sys/dev/usb/usb_bus.h @@ -131,6 +131,7 @@ struct usb_bus { uint8_t do_probe; /* set if USB should be re-probed */ uint8_t no_explore; /* don't explore USB ports */ uint8_t dma_bits; /* number of DMA address lines */ + uint8_t control_ep_quirk; /* need 64kByte buffer for data stage */ }; #endif /* _USB_BUS_H_ */ diff --git a/freebsd/sys/dev/usb/usb_device.c b/freebsd/sys/dev/usb/usb_device.c index 5d6b9d0f..ee240949 100644 --- a/freebsd/sys/dev/usb/usb_device.c +++ b/freebsd/sys/dev/usb/usb_device.c @@ -34,8 +34,8 @@ #include <sys/stdint.h> #include <sys/stddef.h> #include <sys/param.h> +#include <sys/eventhandler.h> #include <sys/queue.h> -#include <sys/types.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/bus.h> diff --git a/freebsd/sys/sys/capability.h b/freebsd/sys/dev/usb/usb_fdt_support.h index 39195e03..e4249e85 100644 --- a/freebsd/sys/sys/capability.h +++ b/freebsd/sys/dev/usb/usb_fdt_support.h @@ -1,11 +1,7 @@ /*- - * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * SPDX-License-Identifier: BSD-2-Clause * - * Copyright (c) 2014 Robert N. M. Watson - * All rights reserved. - * - * This software was developed at the University of Cambridge Computer - * Laboratory with support from a grant from Google, Inc. + * Copyright (c) 2019 Ian Lepore <ian@FreeBSD.org> * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -31,17 +27,22 @@ * $FreeBSD$ */ +#ifndef _USB_FDT_SUPPORT_H_ +#define _USB_FDT_SUPPORT_H_ + +struct usb_device; +struct usb_ether; + /* - * Historically, the key userspace and kernel Capsicum definitions were found - * in this file. However, it conflicted with POSIX.1e's capability.h, so has - * been renamed capsicum.h. The file remains for backwards compatibility - * reasons as a nested include. It is expected to be removed before - * FreeBSD 13. + * Get the device's MAC address from the FDT data. Fills in ue->ue_eaddr and + * returns 0 on success, otherwise leaves ue_eaddr untouched and returns + * non-zero. This first attempts to get the address from the "mac-address" + * property, and if that's not valid it tries the "local-mac-address" property; + * this matches the linux interpretation of the precedence of those properties. */ -#ifndef _SYS_CAPABILITY_H_ -#define _SYS_CAPABILITY_H_ +int usb_fdt_get_mac_addr(device_t dev, struct usb_ether* ue); -#warning this file includes <sys/capability.h> which is deprecated -#include <sys/capsicum.h> +/* Get the FDT node for dev. Returns -1 if dev is not in the FDT data. */ +phandle_t usb_fdt_get_node(device_t dev, struct usb_device* udev); -#endif /* !_SYS_CAPABILITY_H_ */ +#endif diff --git a/freebsd/sys/dev/usb/usb_generic.c b/freebsd/sys/dev/usb/usb_generic.c index 29a27d53..6d359226 100644 --- a/freebsd/sys/dev/usb/usb_generic.c +++ b/freebsd/sys/dev/usb/usb_generic.c @@ -185,7 +185,8 @@ ugen_open(struct usb_fifo *f, int fflags) struct usb_endpoint_descriptor *ed = ep->edesc; uint8_t type; - DPRINTFN(6, "flag=0x%x\n", fflags); + DPRINTFN(1, "flag=0x%x pid=%d name=%s\n", fflags, + curthread->td_proc->p_pid, curthread->td_proc->p_comm); mtx_lock(f->priv_mtx); switch (usbd_get_speed(f->udev)) { @@ -215,7 +216,9 @@ ugen_open(struct usb_fifo *f, int fflags) static void ugen_close(struct usb_fifo *f, int fflags) { - DPRINTFN(6, "flag=0x%x\n", fflags); + + DPRINTFN(1, "flag=0x%x pid=%d name=%s\n", fflags, + curthread->td_proc->p_pid, curthread->td_proc->p_comm); /* cleanup */ @@ -1218,6 +1221,40 @@ complete: } static int +ugen_fs_copy_out_cancelled(struct usb_fs_endpoint *fs_ep_uptr) +{ + struct usb_fs_endpoint fs_ep; + int error; + + error = copyin(fs_ep_uptr, &fs_ep, sizeof(fs_ep)); + if (error) + return (error); + + fs_ep.status = USB_ERR_CANCELLED; + fs_ep.aFrames = 0; + fs_ep.isoc_time_complete = 0; + + /* update "aFrames" */ + error = copyout(&fs_ep.aFrames, &fs_ep_uptr->aFrames, + sizeof(fs_ep.aFrames)); + if (error) + goto done; + + /* update "isoc_time_complete" */ + error = copyout(&fs_ep.isoc_time_complete, + &fs_ep_uptr->isoc_time_complete, + sizeof(fs_ep.isoc_time_complete)); + if (error) + goto done; + + /* update "status" */ + error = copyout(&fs_ep.status, &fs_ep_uptr->status, + sizeof(fs_ep.status)); +done: + return (error); +} + +static int ugen_fs_copy_out(struct usb_fifo *f, uint8_t ep_index) { struct usb_device_request *req; @@ -1242,7 +1279,12 @@ ugen_fs_copy_out(struct usb_fifo *f, uint8_t ep_index) return (EINVAL); mtx_lock(f->priv_mtx); - if (usbd_transfer_pending(xfer)) { + if (!xfer->flags_int.transferring && + !xfer->flags_int.started) { + mtx_unlock(f->priv_mtx); + DPRINTF("Returning fake cancel event\n"); + return (ugen_fs_copy_out_cancelled(f->fs_ep_ptr + ep_index)); + } else if (usbd_transfer_pending(xfer)) { mtx_unlock(f->priv_mtx); return (EBUSY); /* should not happen */ } @@ -1363,6 +1405,7 @@ complete: sizeof(fs_ep.isoc_time_complete)); if (error) goto done; + /* update "status" */ error = copyout(&fs_ep.status, &fs_ep_uptr->status, sizeof(fs_ep.status)); @@ -1451,12 +1494,15 @@ ugen_ioctl(struct usb_fifo *f, u_long cmd, void *addr, int fflags) xfer = f->fs_xfer[u.pstart->ep_index]; if (usbd_transfer_pending(xfer)) { usbd_transfer_stop(xfer); + /* * Check if the USB transfer was stopped - * before it was even started. Else a cancel - * callback will be pending. + * before it was even started and fake a + * cancel event. */ - if (!xfer->flags_int.transferring) { + if (!xfer->flags_int.transferring && + !xfer->flags_int.started) { + DPRINTF("Issuing fake completion event\n"); ugen_fs_set_complete(xfer->priv_sc, USB_P2U(xfer->priv_fifo)); } diff --git a/freebsd/sys/dev/usb/usb_hid.c b/freebsd/sys/dev/usb/usb_hid.c index 7ae052b7..21289f0d 100644 --- a/freebsd/sys/dev/usb/usb_hid.c +++ b/freebsd/sys/dev/usb/usb_hid.c @@ -76,7 +76,7 @@ static uint8_t hid_get_byte(struct hid_data *s, const uint16_t wSize); #define MAXUSAGE 64 #define MAXPUSH 4 #define MAXID 16 -#define MAXLOCCNT 1024 +#define MAXLOCCNT 2048 struct hid_pos_data { int32_t rid; diff --git a/freebsd/sys/dev/usb/usb_hub.c b/freebsd/sys/dev/usb/usb_hub.c index a0ad462f..b3543a8f 100644 --- a/freebsd/sys/dev/usb/usb_hub.c +++ b/freebsd/sys/dev/usb/usb_hub.c @@ -77,14 +77,9 @@ #include <dev/usb/usb_bus.h> #endif /* USB_GLOBAL_INCLUDE_FILE */ -#define UHUB_INTR_INTERVAL 250 /* ms */ -enum { - UHUB_INTR_TRANSFER, -#if USB_HAVE_TT_SUPPORT - UHUB_RESET_TT_TRANSFER, -#endif - UHUB_N_TRANSFER, -}; + +#include <dev/usb/usb_hub_private.h> + #ifdef USB_DEBUG static int uhub_debug = 0; @@ -113,27 +108,6 @@ SYSCTL_INT(_hw_usb, OID_AUTO, disable_port_power, CTLFLAG_RWTUN, &usb_disable_port_power, 0, "Set to disable all USB port power."); #endif -struct uhub_current_state { - uint16_t port_change; - uint16_t port_status; -}; - -struct uhub_softc { - struct uhub_current_state sc_st;/* current state */ -#if (USB_HAVE_FIXED_PORT != 0) - struct usb_hub sc_hub; -#endif - device_t sc_dev; /* base device */ - struct mtx sc_mtx; /* our mutex */ - struct usb_device *sc_udev; /* USB device */ - struct usb_xfer *sc_xfer[UHUB_N_TRANSFER]; /* interrupt xfer */ -#if USB_HAVE_DISABLE_ENUM - int sc_disable_enumeration; - int sc_disable_port_power; -#endif - uint8_t sc_flags; -#define UHUB_FLAG_DID_EXPLORE 0x01 -}; #define UHUB_PROTO(sc) ((sc)->sc_udev->ddesc.bDeviceProtocol) #define UHUB_IS_HIGH_SPEED(sc) (UHUB_PROTO(sc) != UDPROTO_FSHUB) @@ -143,14 +117,10 @@ struct uhub_softc { /* prototypes for type checking: */ -static device_probe_t uhub_probe; -static device_attach_t uhub_attach; -static device_detach_t uhub_detach; static device_suspend_t uhub_suspend; static device_resume_t uhub_resume; static bus_driver_added_t uhub_driver_added; -static bus_child_location_str_t uhub_child_location_string; static bus_child_pnpinfo_str_t uhub_child_pnpinfo_string; static usb_callback_t uhub_intr_callback; @@ -207,7 +177,7 @@ static device_method_t uhub_methods[] = { DEVMETHOD_END }; -static driver_t uhub_driver = { +driver_t uhub_driver = { .name = "uhub", .methods = uhub_methods, .size = sizeof(struct uhub_softc) @@ -589,13 +559,25 @@ uhub_read_port_status(struct uhub_softc *sc, uint8_t portno) struct usb_port_status ps; usb_error_t err; + if (sc->sc_usb_port_errors >= UHUB_USB_PORT_ERRORS_MAX) { + DPRINTFN(4, "port %d, HUB looks dead, too many errors\n", portno); + sc->sc_st.port_status = 0; + sc->sc_st.port_change = 0; + return (USB_ERR_TIMEOUT); + } + err = usbd_req_get_port_status( sc->sc_udev, NULL, &ps, portno); - /* update status regardless of error */ - - sc->sc_st.port_status = UGETW(ps.wPortStatus); - sc->sc_st.port_change = UGETW(ps.wPortChange); + if (err == 0) { + sc->sc_st.port_status = UGETW(ps.wPortStatus); + sc->sc_st.port_change = UGETW(ps.wPortChange); + sc->sc_usb_port_errors = 0; + } else { + sc->sc_st.port_status = 0; + sc->sc_st.port_change = 0; + sc->sc_usb_port_errors++; + } /* debugging print */ @@ -1126,7 +1108,7 @@ uhub_explore(struct usb_device *udev) return (USB_ERR_NORMAL_COMPLETION); } -static int +int uhub_probe(device_t dev) { struct usb_attach_arg *uaa = device_get_ivars(dev); @@ -1140,7 +1122,7 @@ uhub_probe(device_t dev) */ if (uaa->info.bConfigIndex == 0 && uaa->info.bDeviceClass == UDCLASS_HUB) - return (0); + return (BUS_PROBE_DEFAULT); return (ENXIO); } @@ -1206,7 +1188,7 @@ uhub_query_info(struct usb_device *udev, uint8_t *pnports, uint8_t *ptt) return (err); } -static int +int uhub_attach(device_t dev) { struct uhub_softc *sc = device_get_softc(dev); @@ -1552,7 +1534,7 @@ error: * Called from process context when the hub is gone. * Detach all devices on active ports. */ -static int +int uhub_detach(device_t dev) { struct uhub_softc *sc = device_get_softc(dev); @@ -1622,13 +1604,7 @@ uhub_driver_added(device_t dev, driver_t *driver) usb_needs_explore_all(); } -struct hub_result { - struct usb_device *udev; - uint8_t portno; - uint8_t iface_index; -}; - -static void +void uhub_find_iface_index(struct usb_hub *hub, device_t child, struct hub_result *res) { @@ -1661,7 +1637,7 @@ uhub_find_iface_index(struct usb_hub *hub, device_t child, res->portno = 0; } -static int +int uhub_child_location_string(device_t parent, device_t child, char *buf, size_t buflen) { diff --git a/freebsd/sys/dev/usb/usb_hub_private.h b/freebsd/sys/dev/usb/usb_hub_private.h new file mode 100644 index 00000000..1151ed7d --- /dev/null +++ b/freebsd/sys/dev/usb/usb_hub_private.h @@ -0,0 +1,86 @@ +/* $FreeBSD$ */ +/*- + * SPDX-License-Identifier: BSD-2-Clause-NetBSD + * + * Copyright (c) 1998 The NetBSD Foundation, Inc. All rights reserved. + * Copyright (c) 1998 Lennart Augustsson. All rights reserved. + * Copyright (c) 2008-2010 Hans Petter Selasky. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * USB spec: http://www.usb.org/developers/docs/usbspec.zip + */ + +#ifndef USB_HUB_PRIVATE_H_ +#define USB_HUB_PRIVATE_H_ +#define UHUB_INTR_INTERVAL 250 /* ms */ + +enum { + UHUB_INTR_TRANSFER, +#if USB_HAVE_TT_SUPPORT + UHUB_RESET_TT_TRANSFER, +#endif + UHUB_N_TRANSFER, +}; + + +struct uhub_current_state { + uint16_t port_change; + uint16_t port_status; +}; + +struct uhub_softc { + struct uhub_current_state sc_st; /* current state */ +#if (USB_HAVE_FIXED_PORT != 0) + struct usb_hub sc_hub; +#endif + device_t sc_dev; /* base device */ + struct mtx sc_mtx; /* our mutex */ + struct usb_device *sc_udev; /* USB device */ + struct usb_xfer *sc_xfer[UHUB_N_TRANSFER]; /* interrupt xfer */ +#if USB_HAVE_DISABLE_ENUM + int sc_disable_enumeration; + int sc_disable_port_power; +#endif + uint8_t sc_usb_port_errors; /* error counter */ +#define UHUB_USB_PORT_ERRORS_MAX 4 + uint8_t sc_flags; +#define UHUB_FLAG_DID_EXPLORE 0x01 +}; +struct hub_result { + struct usb_device *udev; + uint8_t portno; + uint8_t iface_index; +}; + +void +uhub_find_iface_index(struct usb_hub *hub, device_t child, + struct hub_result *res); + +device_probe_t uhub_probe; +device_attach_t uhub_attach; +device_detach_t uhub_detach; +bus_child_location_str_t uhub_child_location_string; + +#endif diff --git a/freebsd/sys/dev/usb/usb_ioctl.h b/freebsd/sys/dev/usb/usb_ioctl.h index fcd31e31..c4023cab 100644 --- a/freebsd/sys/dev/usb/usb_ioctl.h +++ b/freebsd/sys/dev/usb/usb_ioctl.h @@ -70,6 +70,7 @@ enum { USB_TEMP_SERIALNET, /* USB CDC Ethernet and Modem */ USB_TEMP_MIDI, /* USB MIDI */ USB_TEMP_MULTI, /* USB Ethernet, serial, and storage */ + USB_TEMP_CDCEEM, /* USB Ethernet Emulation Model */ USB_TEMP_MAX, }; @@ -223,7 +224,7 @@ struct usb_fs_uninit { } USB_IOCTL_STRUCT_ALIGN(1); struct usb_fs_open { -#define USB_FS_MAX_BUFSIZE (1 << 18) +#define USB_FS_MAX_BUFSIZE (1 << 25) /* 32 MBytes */ uint32_t max_bufsize; #define USB_FS_MAX_FRAMES (1U << 12) #define USB_FS_MAX_FRAMES_PRE_SCALE (1U << 31) /* for ISOCHRONOUS transfers */ diff --git a/freebsd/sys/dev/usb/usb_request.c b/freebsd/sys/dev/usb/usb_request.c index d2a15f3c..f288378e 100644 --- a/freebsd/sys/dev/usb/usb_request.c +++ b/freebsd/sys/dev/usb/usb_request.c @@ -1603,8 +1603,9 @@ usbd_req_get_port_status(struct usb_device *udev, struct mtx *mtx, USETW(req.wValue, 0); req.wIndex[0] = port; req.wIndex[1] = 0; - USETW(req.wLength, sizeof *ps); - return (usbd_do_request(udev, mtx, &req, ps)); + USETW(req.wLength, sizeof(*ps)); + + return (usbd_do_request_flags(udev, mtx, &req, ps, 0, NULL, 1000)); } /*------------------------------------------------------------------------* diff --git a/freebsd/sys/dev/usb/usb_transfer.c b/freebsd/sys/dev/usb/usb_transfer.c index 7ea25337..3b67c20c 100644 --- a/freebsd/sys/dev/usb/usb_transfer.c +++ b/freebsd/sys/dev/usb/usb_transfer.c @@ -111,6 +111,33 @@ static const struct usb_config usb_control_ep_cfg[USB_CTRL_XFER_MAX] = { }, }; +static const struct usb_config usb_control_ep_quirk_cfg[USB_CTRL_XFER_MAX] = { + + /* This transfer is used for generic control endpoint transfers */ + + [0] = { + .type = UE_CONTROL, + .endpoint = 0x00, /* Control endpoint */ + .direction = UE_DIR_ANY, + .bufsize = 65535, /* bytes */ + .callback = &usb_request_callback, + .usb_mode = USB_MODE_DUAL, /* both modes */ + }, + + /* This transfer is used for generic clear stall only */ + + [1] = { + .type = UE_CONTROL, + .endpoint = 0x00, /* Control pipe */ + .direction = UE_DIR_ANY, + .bufsize = sizeof(struct usb_device_request), + .callback = &usb_do_clear_stall_callback, + .timeout = 1000, /* 1 second */ + .interval = 50, /* 50ms */ + .usb_mode = USB_MODE_HOST, + }, +}; + /* function prototypes */ static void usbd_update_max_frame_size(struct usb_xfer *); @@ -1051,7 +1078,8 @@ usbd_transfer_setup(struct usb_device *udev, * context, else there is a chance of * deadlock! */ - if (setup_start == usb_control_ep_cfg) + if (setup_start == usb_control_ep_cfg || + setup_start == usb_control_ep_quirk_cfg) info->done_p = USB_BUS_CONTROL_XFER_PROC(udev->bus); else if (xfer_mtx == &Giant) @@ -3179,7 +3207,8 @@ repeat: */ iface_index = 0; if (usbd_transfer_setup(udev, &iface_index, - udev->ctrl_xfer, usb_control_ep_cfg, USB_CTRL_XFER_MAX, NULL, + udev->ctrl_xfer, udev->bus->control_ep_quirk ? + usb_control_ep_quirk_cfg : usb_control_ep_cfg, USB_CTRL_XFER_MAX, NULL, &udev->device_mtx)) { DPRINTFN(0, "could not setup default " "USB transfer\n"); diff --git a/freebsd/sys/dev/usb/usbdi.h b/freebsd/sys/dev/usb/usbdi.h index d5648c03..0a393844 100644 --- a/freebsd/sys/dev/usb/usbdi.h +++ b/freebsd/sys/dev/usb/usbdi.h @@ -105,7 +105,7 @@ typedef void (usb_fifo_filter_t)(struct usb_fifo *fifo, struct usb_mbuf *m); /* USB events */ #ifndef USB_GLOBAL_INCLUDE_FILE -#include <sys/eventhandler.h> +#include <sys/_eventhandler.h> #endif typedef void (*usb_dev_configured_t)(void *, struct usb_device *, struct usb_attach_arg *); diff --git a/freebsd/sys/dev/usb/wlan/if_rsu.c b/freebsd/sys/dev/usb/wlan/if_rsu.c index b730ce59..112b8675 100644 --- a/freebsd/sys/dev/usb/wlan/if_rsu.c +++ b/freebsd/sys/dev/usb/wlan/if_rsu.c @@ -44,13 +44,9 @@ __FBSDID("$FreeBSD$"); #include <sys/systm.h> #include <sys/conf.h> #include <sys/bus.h> -#include <sys/rman.h> #include <sys/firmware.h> #include <sys/module.h> -#include <machine/bus.h> -#include <machine/resource.h> - #include <net/bpf.h> #include <net/if.h> #include <net/if_var.h> @@ -291,9 +287,6 @@ MODULE_DEPEND(rsu, firmware, 1, 1, 1); MODULE_VERSION(rsu, 1); USB_PNP_HOST_INFO(rsu_devs); -static const uint8_t rsu_chan_2ghz[] = - { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }; - static uint8_t rsu_wme_ac_xfer_map[4] = { [WME_AC_BE] = RSU_BULK_TX_BE_BK, [WME_AC_BK] = RSU_BULK_TX_BE_BK, @@ -789,9 +782,8 @@ rsu_getradiocaps(struct ieee80211com *ic, setbit(bands, IEEE80211_MODE_11G); if (sc->sc_ht) setbit(bands, IEEE80211_MODE_11NG); - ieee80211_add_channel_list_2ghz(chans, maxchans, nchans, - rsu_chan_2ghz, nitems(rsu_chan_2ghz), bands, - (ic->ic_htcaps & IEEE80211_HTCAP_CHWIDTH40) != 0); + ieee80211_add_channels_default_2ghz(chans, maxchans, nchans, + bands, (ic->ic_htcaps & IEEE80211_HTCAP_CHWIDTH40) != 0); } static void @@ -2460,8 +2452,6 @@ rsu_rx_frame(struct rsu_softc *sc, struct mbuf *m) tap->wr_rate = rxs.c_rate; tap->wr_dbm_antsignal = rssi; - tap->wr_chan_freq = htole16(ic->ic_curchan->ic_freq); - tap->wr_chan_flags = htole16(ic->ic_curchan->ic_flags); }; (void) ieee80211_add_rx_params(m, &rxs); @@ -2762,15 +2752,16 @@ static int rsu_tx_start(struct rsu_softc *sc, struct ieee80211_node *ni, struct mbuf *m0, struct rsu_data *data) { - struct ieee80211com *ic = &sc->sc_ic; + const struct ieee80211_txparam *tp = ni->ni_txparms; struct ieee80211vap *vap = ni->ni_vap; struct ieee80211_frame *wh; struct ieee80211_key *k = NULL; struct r92s_tx_desc *txd; - uint8_t type, cipher; + uint8_t rate, ridx, type, cipher, qos; int prio = 0; uint8_t which; int hasqos; + int ismcast; int xferlen; int qid; @@ -2778,10 +2769,26 @@ rsu_tx_start(struct rsu_softc *sc, struct ieee80211_node *ni, wh = mtod(m0, struct ieee80211_frame *); type = wh->i_fc[0] & IEEE80211_FC0_TYPE_MASK; + ismcast = IEEE80211_IS_MULTICAST(wh->i_addr1); RSU_DPRINTF(sc, RSU_DEBUG_TX, "%s: data=%p, m=%p\n", __func__, data, m0); + /* Choose a TX rate index. */ + if (type == IEEE80211_FC0_TYPE_MGT || + type == IEEE80211_FC0_TYPE_CTL || + (m0->m_flags & M_EAPOL) != 0) + rate = tp->mgmtrate; + else if (ismcast) + rate = tp->mcastrate; + else if (tp->ucastrate != IEEE80211_FIXED_RATE_NONE) + rate = tp->ucastrate; + else + rate = 0; + + if (rate != 0) + ridx = rate2ridx(rate); + if (wh->i_fc[1] & IEEE80211_FC1_PROTECTED) { k = ieee80211_crypto_encap(ni, m0); if (k == NULL) { @@ -2799,12 +2806,14 @@ rsu_tx_start(struct rsu_softc *sc, struct ieee80211_node *ni, prio = M_WME_GETAC(m0); which = rsu_wme_ac_xfer_map[prio]; hasqos = 1; + qos = ((const struct ieee80211_qosframe *)wh)->i_qos[0]; } else { /* Non-QoS TID */ /* XXX TODO: tid=0 for non-qos TID? */ which = rsu_wme_ac_xfer_map[WME_AC_BE]; hasqos = 0; prio = 0; + qos = 0; } qid = rsu_ac2qid[prio]; @@ -2860,8 +2869,23 @@ rsu_tx_start(struct rsu_softc *sc, struct ieee80211_node *ni, } /* XXX todo: set AGGEN bit if appropriate? */ txd->txdw2 |= htole32(R92S_TXDW2_BK); - if (IEEE80211_IS_MULTICAST(wh->i_addr1)) + if (ismcast) txd->txdw2 |= htole32(R92S_TXDW2_BMCAST); + + if (!ismcast && (!qos || (qos & IEEE80211_QOS_ACKPOLICY) != + IEEE80211_QOS_ACKPOLICY_NOACK)) { + txd->txdw2 |= htole32(R92S_TXDW2_RTY_LMT_ENA); + txd->txdw2 |= htole32(SM(R92S_TXDW2_RTY_LMT, tp->maxretry)); + } + + /* Force mgmt / mcast / ucast rate if needed. */ + if (rate != 0) { + /* Data rate fallback limit (max). */ + txd->txdw5 |= htole32(SM(R92S_TXDW5_DATARATE_FB_LMT, 0x1f)); + txd->txdw5 |= htole32(SM(R92S_TXDW5_DATARATE, ridx)); + txd->txdw4 |= htole32(R92S_TXDW4_DRVRATE); + } + /* * Firmware will use and increment the sequence number for the * specified priority. @@ -2872,8 +2896,6 @@ rsu_tx_start(struct rsu_softc *sc, struct ieee80211_node *ni, struct rsu_tx_radiotap_header *tap = &sc->sc_txtap; tap->wt_flags = 0; - tap->wt_chan_freq = htole16(ic->ic_curchan->ic_freq); - tap->wt_chan_flags = htole16(ic->ic_curchan->ic_flags); ieee80211_radiotap_tx(vap, m0); } diff --git a/freebsd/sys/dev/usb/wlan/if_rsureg.h b/freebsd/sys/dev/usb/wlan/if_rsureg.h index 973280cf..246b06b7 100644 --- a/freebsd/sys/dev/usb/wlan/if_rsureg.h +++ b/freebsd/sys/dev/usb/wlan/if_rsureg.h @@ -688,6 +688,9 @@ struct r92s_tx_desc { #define R92S_TXDW1_HWPC 0x80000000 uint32_t txdw2; +#define R92S_TXDW2_RTY_LMT_M 0x0000003f +#define R92S_TXDW2_RTY_LMT_S 0 +#define R92S_TXDW2_RTY_LMT_ENA 0x00000040 #define R92S_TXDW2_BMCAST 0x00000080 #define R92S_TXDW2_AGGEN 0x20000000 #define R92S_TXDW2_BK 0x40000000 @@ -700,9 +703,14 @@ struct r92s_tx_desc { uint32_t txdw4; #define R92S_TXDW4_TXBW 0x00040000 +#define R92S_TXDW4_DRVRATE 0x80000000 uint32_t txdw5; -#define R92S_TXDW5_DISFB 0x00008000 +#define R92S_TXDW5_DATARATE_M 0x00007e00 +#define R92S_TXDW5_DATARATE_S 9 +#define R92S_TXDW5_DISFB 0x00008000 +#define R92S_TXDW5_DATARATE_FB_LMT_M 0x001f0000 +#define R92S_TXDW5_DATARATE_FB_LMT_S 16 uint16_t ipchksum; uint16_t tcpchksum; @@ -792,9 +800,10 @@ struct rsu_rx_radiotap_header { struct rsu_tx_radiotap_header { struct ieee80211_radiotap_header wt_ihdr; uint8_t wt_flags; + uint8_t wt_pad; uint16_t wt_chan_freq; uint16_t wt_chan_flags; -} __packed __aligned(8); +} __packed; #define RSU_TX_RADIOTAP_PRESENT \ (1 << IEEE80211_RADIOTAP_FLAGS | \ diff --git a/freebsd/sys/dev/usb/wlan/if_rum.c b/freebsd/sys/dev/usb/wlan/if_rum.c index 5c826cac..c2f68406 100644 --- a/freebsd/sys/dev/usb/wlan/if_rum.c +++ b/freebsd/sys/dev/usb/wlan/if_rum.c @@ -46,10 +46,6 @@ __FBSDID("$FreeBSD$"); #include <sys/endian.h> #include <sys/kdb.h> -#include <machine/bus.h> -#include <machine/resource.h> -#include <sys/rman.h> - #include <net/bpf.h> #include <net/if.h> #include <net/if_var.h> @@ -344,9 +340,6 @@ static const struct { { 107, 0x04 } }; -static const uint8_t rum_chan_2ghz[] = - { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }; - static const uint8_t rum_chan_5ghz[] = { 34, 36, 38, 40, 42, 44, 46, 48, 52, 56, 60, 64, 100, 104, 108, 112, 116, 120, 124, 128, 132, 136, 140, @@ -3222,8 +3215,7 @@ rum_getradiocaps(struct ieee80211com *ic, memset(bands, 0, sizeof(bands)); setbit(bands, IEEE80211_MODE_11B); setbit(bands, IEEE80211_MODE_11G); - ieee80211_add_channel_list_2ghz(chans, maxchans, nchans, - rum_chan_2ghz, nitems(rum_chan_2ghz), bands, 0); + ieee80211_add_channels_default_2ghz(chans, maxchans, nchans, bands, 0); if (sc->rf_rev == RT2573_RF_5225 || sc->rf_rev == RT2573_RF_5226) { setbit(bands, IEEE80211_MODE_11A); diff --git a/freebsd/sys/dev/usb/wlan/if_rumvar.h b/freebsd/sys/dev/usb/wlan/if_rumvar.h index 4ff831f4..e19a7088 100644 --- a/freebsd/sys/dev/usb/wlan/if_rumvar.h +++ b/freebsd/sys/dev/usb/wlan/if_rumvar.h @@ -49,7 +49,7 @@ struct rum_tx_radiotap_header { uint16_t wt_chan_freq; uint16_t wt_chan_flags; uint8_t wt_antenna; -} __packed __aligned(8); +} __packed; #define RT2573_TX_RADIOTAP_PRESENT \ ((1 << IEEE80211_RADIOTAP_FLAGS) | \ diff --git a/freebsd/sys/dev/usb/wlan/if_run.c b/freebsd/sys/dev/usb/wlan/if_run.c index 3bd247e3..9f11a3a3 100644 --- a/freebsd/sys/dev/usb/wlan/if_run.c +++ b/freebsd/sys/dev/usb/wlan/if_run.c @@ -30,6 +30,7 @@ __FBSDID("$FreeBSD$"); #include <rtems/bsd/local/opt_wlan.h> #include <sys/param.h> +#include <sys/eventhandler.h> #include <sys/sockio.h> #include <sys/sysctl.h> #include <sys/lock.h> @@ -46,10 +47,6 @@ __FBSDID("$FreeBSD$"); #include <sys/firmware.h> #include <sys/kdb.h> -#include <machine/bus.h> -#include <machine/resource.h> -#include <sys/rman.h> - #include <net/bpf.h> #include <net/if.h> #include <net/if_var.h> @@ -470,6 +467,7 @@ static void run_usb_timeout_cb(void *); static void run_reset_livelock(struct run_softc *); static void run_enable_tsf_sync(struct run_softc *); static void run_enable_tsf(struct run_softc *); +static void run_disable_tsf(struct run_softc *); static void run_get_tsf(struct run_softc *, uint64_t *); static void run_enable_mrr(struct run_softc *); static void run_set_txpreamble(struct run_softc *); @@ -2035,7 +2033,8 @@ run_read_eeprom(struct run_softc *sc) static struct ieee80211_node * run_node_alloc(struct ieee80211vap *vap, const uint8_t mac[IEEE80211_ADDR_LEN]) { - return malloc(sizeof (struct run_node), M_DEVBUF, M_NOWAIT | M_ZERO); + return malloc(sizeof (struct run_node), M_80211_NODE, + M_NOWAIT | M_ZERO); } static int @@ -2095,7 +2094,6 @@ run_newstate(struct ieee80211vap *vap, enum ieee80211_state nstate, int arg) struct run_vap *rvp = RUN_VAP(vap); enum ieee80211_state ostate; uint32_t sta[3]; - uint32_t tmp; uint8_t ratectl; uint8_t restart_ratectl = 0; uint8_t bid = 1 << rvp->rvp_id; @@ -2128,12 +2126,8 @@ run_newstate(struct ieee80211vap *vap, enum ieee80211_state nstate, int arg) sc->runbmap &= ~bid; /* abort TSF synchronization if there is no vap running */ - if (--sc->running == 0) { - run_read(sc, RT2860_BCN_TIME_CFG, &tmp); - run_write(sc, RT2860_BCN_TIME_CFG, - tmp & ~(RT2860_BCN_TX_EN | RT2860_TSF_TIMER_EN | - RT2860_TBTT_TIMER_EN)); - } + if (--sc->running == 0) + run_disable_tsf(sc); break; case IEEE80211_S_RUN: @@ -2826,76 +2820,83 @@ run_rx_frame(struct run_softc *sc, struct mbuf *m, uint32_t dmalen) uint8_t ant, rssi; int8_t nf; - rxwi = mtod(m, struct rt2860_rxwi *); - len = le16toh(rxwi->len) & 0xfff; rxwisize = sizeof(struct rt2860_rxwi); if (sc->mac_ver == 0x5592) rxwisize += sizeof(uint64_t); else if (sc->mac_ver == 0x3593) rxwisize += sizeof(uint32_t); - if (__predict_false(len > dmalen)) { - m_freem(m); - counter_u64_add(ic->ic_ierrors, 1); + + if (__predict_false(dmalen < + rxwisize + sizeof(struct ieee80211_frame_ack))) { + RUN_DPRINTF(sc, RUN_DEBUG_RECV, + "payload is too short: dma length %u < %zu\n", + dmalen, rxwisize + sizeof(struct ieee80211_frame_ack)); + goto fail; + } + + rxwi = mtod(m, struct rt2860_rxwi *); + len = le16toh(rxwi->len) & 0xfff; + + if (__predict_false(len > dmalen - rxwisize)) { RUN_DPRINTF(sc, RUN_DEBUG_RECV, "bad RXWI length %u > %u\n", len, dmalen); - return; + goto fail; } + /* Rx descriptor is located at the end */ rxd = (struct rt2870_rxd *)(mtod(m, caddr_t) + dmalen); flags = le32toh(rxd->flags); if (__predict_false(flags & (RT2860_RX_CRCERR | RT2860_RX_ICVERR))) { - m_freem(m); - counter_u64_add(ic->ic_ierrors, 1); RUN_DPRINTF(sc, RUN_DEBUG_RECV, "%s error.\n", (flags & RT2860_RX_CRCERR)?"CRC":"ICV"); - return; + goto fail; + } + + if (flags & RT2860_RX_L2PAD) { + RUN_DPRINTF(sc, RUN_DEBUG_RECV, + "received RT2860_RX_L2PAD frame\n"); + len += 2; } m->m_data += rxwisize; - m->m_pkthdr.len = m->m_len -= rxwisize; + m->m_pkthdr.len = m->m_len = len; wh = mtod(m, struct ieee80211_frame *); - if (wh->i_fc[1] & IEEE80211_FC1_PROTECTED) { + if ((wh->i_fc[1] & IEEE80211_FC1_PROTECTED) != 0 && + (flags & RT2860_RX_DEC) != 0) { wh->i_fc[1] &= ~IEEE80211_FC1_PROTECTED; m->m_flags |= M_WEP; } - if (flags & RT2860_RX_L2PAD) { - RUN_DPRINTF(sc, RUN_DEBUG_RECV, - "received RT2860_RX_L2PAD frame\n"); - len += 2; - } - - ni = ieee80211_find_rxnode(ic, - mtod(m, struct ieee80211_frame_min *)); + if (len >= sizeof(struct ieee80211_frame_min)) { + ni = ieee80211_find_rxnode(ic, + mtod(m, struct ieee80211_frame_min *)); + } else + ni = NULL; if (__predict_false(flags & RT2860_RX_MICERR)) { /* report MIC failures to net80211 for TKIP */ if (ni != NULL) ieee80211_notify_michael_failure(ni->ni_vap, wh, rxwi->keyidx); - m_freem(m); - counter_u64_add(ic->ic_ierrors, 1); RUN_DPRINTF(sc, RUN_DEBUG_RECV, "MIC error. Someone is lying.\n"); - return; + goto fail; } ant = run_maxrssi_chain(sc, rxwi); rssi = rxwi->rssi[ant]; nf = run_rssi2dbm(sc, rssi, ant); - m->m_pkthdr.len = m->m_len = len; - if (__predict_false(ieee80211_radiotap_active(ic))) { struct run_rx_radiotap_header *tap = &sc->sc_rxtap; uint16_t phy; tap->wr_flags = 0; - tap->wr_chan_freq = htole16(ic->ic_curchan->ic_freq); - tap->wr_chan_flags = htole16(ic->ic_curchan->ic_flags); + if (flags & RT2860_RX_L2PAD) + tap->wr_flags |= IEEE80211_RADIOTAP_F_DATAPAD; tap->wr_antsignal = rssi; tap->wr_antenna = ant; tap->wr_dbm_antsignal = run_rssi2dbm(sc, rssi, ant); @@ -2936,6 +2937,12 @@ run_rx_frame(struct run_softc *sc, struct mbuf *m, uint32_t dmalen) } else { (void)ieee80211_input_all(ic, m, rssi, nf); } + + return; + +fail: + m_freem(m); + counter_u64_add(ic->ic_ierrors, 1); } static void @@ -2945,7 +2952,7 @@ run_bulk_rx_callback(struct usb_xfer *xfer, usb_error_t error) struct ieee80211com *ic = &sc->sc_ic; struct mbuf *m = NULL; struct mbuf *m0; - uint32_t dmalen; + uint32_t dmalen, mbuf_len; uint16_t rxwisize; int xferlen; @@ -3051,6 +3058,14 @@ tr_setup: break; } + mbuf_len = dmalen + sizeof(struct rt2870_rxd); + if (__predict_false(mbuf_len > MCLBYTES)) { + RUN_DPRINTF(sc, RUN_DEBUG_RECV_DESC | RUN_DEBUG_USB, + "payload is too big: mbuf_len %u\n", mbuf_len); + counter_u64_add(ic->ic_ierrors, 1); + break; + } + /* copy aggregated frames to another mbuf */ m0 = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (__predict_false(m0 == NULL)) { @@ -3060,14 +3075,13 @@ tr_setup: break; } m_copydata(m, 4 /* skip 32-bit DMA-len header */, - dmalen + sizeof(struct rt2870_rxd), mtod(m0, caddr_t)); - m0->m_pkthdr.len = m0->m_len = - dmalen + sizeof(struct rt2870_rxd); + mbuf_len, mtod(m0, caddr_t)); + m0->m_pkthdr.len = m0->m_len = mbuf_len; run_rx_frame(sc, m0, dmalen); /* update data ptr */ - m->m_data += dmalen + 8; - m->m_pkthdr.len = m->m_len -= dmalen + 8; + m->m_data += mbuf_len + 4; + m->m_pkthdr.len = m->m_len -= mbuf_len + 4; } /* make sure we free the source buffer, if any */ @@ -3149,16 +3163,23 @@ tr_setup: vap = data->ni->ni_vap; if (ieee80211_radiotap_active_vap(vap)) { + const struct ieee80211_frame *wh; struct run_tx_radiotap_header *tap = &sc->sc_txtap; struct rt2860_txwi *txwi = (struct rt2860_txwi *)(&data->desc + sizeof(struct rt2870_txd)); + int has_l2pad; + + wh = mtod(m, struct ieee80211_frame *); + has_l2pad = IEEE80211_HAS_ADDR4(wh) != + IEEE80211_QOS_HAS_SEQ(wh); + tap->wt_flags = 0; tap->wt_rate = rt2860_rates[data->ridx].rate; - tap->wt_chan_freq = htole16(ic->ic_curchan->ic_freq); - tap->wt_chan_flags = htole16(ic->ic_curchan->ic_flags); tap->wt_hwqueue = index; if (le16toh(txwi->phy) & RT2860_PHY_SHPRE) tap->wt_flags |= IEEE80211_RADIOTAP_F_SHORTPRE; + if (has_l2pad) + tap->wt_flags |= IEEE80211_RADIOTAP_F_DATAPAD; ieee80211_radiotap_tx(vap, m); } @@ -3350,11 +3371,7 @@ run_tx(struct run_softc *sc, struct mbuf *m, struct ieee80211_node *ni) if ((hasqos = IEEE80211_QOS_HAS_SEQ(wh))) { uint8_t *frm; - if(IEEE80211_HAS_ADDR4(wh)) - frm = ((struct ieee80211_qosframe_addr4 *)wh)->i_qos; - else - frm =((struct ieee80211_qosframe *)wh)->i_qos; - + frm = ieee80211_getqos(wh); qos = le16toh(*(const uint16_t *)frm); tid = qos & IEEE80211_QOS_TID; qid = TID_TO_WME_AC(tid); @@ -4837,8 +4854,7 @@ run_getradiocaps(struct ieee80211com *ic, memset(bands, 0, sizeof(bands)); setbit(bands, IEEE80211_MODE_11B); setbit(bands, IEEE80211_MODE_11G); - ieee80211_add_channel_list_2ghz(chans, maxchans, nchans, - run_chan_2ghz, nitems(run_chan_2ghz), bands, 0); + ieee80211_add_channels_default_2ghz(chans, maxchans, nchans, bands, 0); if (sc->rf_rev == RT2860_RF_2750 || sc->rf_rev == RT2860_RF_2850 || sc->rf_rev == RT3070_RF_3052 || sc->rf_rev == RT3593_RF_3053 || @@ -4853,15 +4869,11 @@ static void run_scan_start(struct ieee80211com *ic) { struct run_softc *sc = ic->ic_softc; - uint32_t tmp; RUN_LOCK(sc); /* abort TSF synchronization */ - run_read(sc, RT2860_BCN_TIME_CFG, &tmp); - run_write(sc, RT2860_BCN_TIME_CFG, - tmp & ~(RT2860_BCN_TX_EN | RT2860_TSF_TIMER_EN | - RT2860_TBTT_TIMER_EN)); + run_disable_tsf(sc); run_set_bssid(sc, ieee80211broadcastaddr); RUN_UNLOCK(sc); @@ -5148,6 +5160,18 @@ run_enable_tsf(struct run_softc *sc) } static void +run_disable_tsf(struct run_softc *sc) +{ + uint32_t tmp; + + if (run_read(sc, RT2860_BCN_TIME_CFG, &tmp) == 0) { + tmp &= ~(RT2860_BCN_TX_EN | RT2860_TSF_TIMER_EN | + RT2860_TBTT_TIMER_EN); + run_write(sc, RT2860_BCN_TIME_CFG, tmp); + } +} + +static void run_get_tsf(struct run_softc *sc, uint64_t *buf) { run_read_region_1(sc, RT2860_TSF_TIMER_DW0, (uint8_t *)buf, @@ -6098,10 +6122,7 @@ run_init_locked(struct run_softc *sc) } /* abort TSF synchronization */ - run_read(sc, RT2860_BCN_TIME_CFG, &tmp); - tmp &= ~(RT2860_BCN_TX_EN | RT2860_TSF_TIMER_EN | - RT2860_TBTT_TIMER_EN); - run_write(sc, RT2860_BCN_TIME_CFG, tmp); + run_disable_tsf(sc); /* clear RX WCID search table */ run_set_region_4(sc, RT2860_WCID_ENTRY(0), 0, 512); diff --git a/freebsd/sys/dev/usb/wlan/if_runreg.h b/freebsd/sys/dev/usb/wlan/if_runreg.h index c09aac8f..8561d2c1 100644 --- a/freebsd/sys/dev/usb/wlan/if_runreg.h +++ b/freebsd/sys/dev/usb/wlan/if_runreg.h @@ -1086,9 +1086,6 @@ struct rt2860_rxwi { /* * Channel map for run(4) driver; taken from the table below. */ -static const uint8_t run_chan_2ghz[] = - { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }; - static const uint8_t run_chan_5ghz[] = { 36, 38, 40, 44, 46, 48, 52, 54, 56, 60, 62, 64, 100, 102, 104, 108, 110, 112, 116, 118, 120, 124, 126, 128, 132, 134, 136, 140, diff --git a/freebsd/sys/dev/usb/wlan/if_runvar.h b/freebsd/sys/dev/usb/wlan/if_runvar.h index 7209bfc7..a17d5b46 100644 --- a/freebsd/sys/dev/usb/wlan/if_runvar.h +++ b/freebsd/sys/dev/usb/wlan/if_runvar.h @@ -71,7 +71,7 @@ struct run_tx_radiotap_header { uint16_t wt_chan_freq; uint16_t wt_chan_flags; uint8_t wt_hwqueue; -} __packed __aligned(8); +} __packed; #define IEEE80211_RADIOTAP_HWQUEUE 15 diff --git a/freebsd/sys/dev/usb/wlan/if_uath.c b/freebsd/sys/dev/usb/wlan/if_uath.c index 2b97060e..9f0c9d5d 100644 --- a/freebsd/sys/dev/usb/wlan/if_uath.c +++ b/freebsd/sys/dev/usb/wlan/if_uath.c @@ -87,10 +87,6 @@ __FBSDID("$FreeBSD$"); #include <sys/endian.h> #include <sys/kdb.h> -#include <machine/bus.h> -#include <machine/resource.h> -#include <sys/rman.h> - #include <net/bpf.h> #include <net/if.h> #include <net/if_var.h> @@ -1282,8 +1278,8 @@ uath_watchdog(void *arg) if (sc->sc_tx_timer > 0) { if (--sc->sc_tx_timer == 0) { device_printf(sc->sc_dev, "device timeout\n"); - /*uath_init(sc); XXX needs a process context! */ counter_u64_add(ic->ic_oerrors, 1); + ieee80211_restart_all(ic); return; } callout_reset(&sc->watchdog_ch, hz, uath_watchdog, sc); diff --git a/freebsd/sys/dev/usb/wlan/if_uathvar.h b/freebsd/sys/dev/usb/wlan/if_uathvar.h index a38f54fc..a0ef4eab 100644 --- a/freebsd/sys/dev/usb/wlan/if_uathvar.h +++ b/freebsd/sys/dev/usb/wlan/if_uathvar.h @@ -67,9 +67,10 @@ struct uath_rx_radiotap_header { struct uath_tx_radiotap_header { struct ieee80211_radiotap_header wt_ihdr; uint8_t wt_flags; + uint8_t wt_pad; uint16_t wt_chan_freq; uint16_t wt_chan_flags; -} __packed __aligned(8); +} __packed; #define UATH_TX_RADIOTAP_PRESENT \ ((1 << IEEE80211_RADIOTAP_FLAGS) | \ diff --git a/freebsd/sys/dev/usb/wlan/if_upgt.c b/freebsd/sys/dev/usb/wlan/if_upgt.c index e1923bab..c556d108 100644 --- a/freebsd/sys/dev/usb/wlan/if_upgt.c +++ b/freebsd/sys/dev/usb/wlan/if_upgt.c @@ -43,7 +43,6 @@ #include <net/if_types.h> #include <sys/bus.h> -#include <machine/bus.h> #include <net80211/ieee80211_var.h> #include <net80211/ieee80211_phy.h> @@ -1618,7 +1617,7 @@ upgt_fw_load(struct upgt_softc *sc) data_cmd->buflen = bsize; upgt_bulk_tx(sc, data_cmd); - DPRINTF(sc, UPGT_DEBUG_FW, "FW offset=%d, read=%d, sent=%d\n", + DPRINTF(sc, UPGT_DEBUG_FW, "FW offset=%zu, read=%d, sent=%d\n", offset, n, bsize); bsize = n; } @@ -1775,7 +1774,7 @@ upgt_fw_verify(struct upgt_softc *sc) } DPRINTF(sc, UPGT_DEBUG_FW, - "firmware Boot Record Area found at offset %d\n", offset); + "firmware Boot Record Area found at offset %zu\n", offset); /* * Parse Boot Record Area (BRA) options. diff --git a/freebsd/sys/dev/usb/wlan/if_upgtvar.h b/freebsd/sys/dev/usb/wlan/if_upgtvar.h index ce996f6a..9d4c85e6 100644 --- a/freebsd/sys/dev/usb/wlan/if_upgtvar.h +++ b/freebsd/sys/dev/usb/wlan/if_upgtvar.h @@ -394,7 +394,7 @@ struct upgt_tx_radiotap_header { uint8_t wt_rate; uint16_t wt_chan_freq; uint16_t wt_chan_flags; -} __packed __aligned(8); +} __packed; #define UPGT_TX_RADIOTAP_PRESENT \ ((1 << IEEE80211_RADIOTAP_FLAGS) | \ diff --git a/freebsd/sys/dev/usb/wlan/if_ural.c b/freebsd/sys/dev/usb/wlan/if_ural.c index 4de0a9c5..8897f148 100644 --- a/freebsd/sys/dev/usb/wlan/if_ural.c +++ b/freebsd/sys/dev/usb/wlan/if_ural.c @@ -47,10 +47,6 @@ __FBSDID("$FreeBSD$"); #include <sys/endian.h> #include <sys/kdb.h> -#include <machine/bus.h> -#include <machine/resource.h> -#include <sys/rman.h> - #include <net/bpf.h> #include <net/if.h> #include <net/if_var.h> @@ -363,9 +359,6 @@ static const struct { { 161, 0x08808, 0x0242f, 0x00281 } }; -static const uint8_t ural_chan_2ghz[] = - { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }; - static const uint8_t ural_chan_5ghz[] = { 36, 40, 44, 48, 52, 56, 60, 64, 100, 104, 108, 112, 116, 120, 124, 128, 132, 136, 140, @@ -1593,8 +1586,7 @@ ural_getradiocaps(struct ieee80211com *ic, memset(bands, 0, sizeof(bands)); setbit(bands, IEEE80211_MODE_11B); setbit(bands, IEEE80211_MODE_11G); - ieee80211_add_channel_list_2ghz(chans, maxchans, nchans, - ural_chan_2ghz, nitems(ural_chan_2ghz), bands, 0); + ieee80211_add_channels_default_2ghz(chans, maxchans, nchans, bands, 0); if (sc->rf_rev == RAL_RF_5222) { setbit(bands, IEEE80211_MODE_11A); diff --git a/freebsd/sys/dev/usb/wlan/if_uralvar.h b/freebsd/sys/dev/usb/wlan/if_uralvar.h index dd863fe0..b59b7911 100644 --- a/freebsd/sys/dev/usb/wlan/if_uralvar.h +++ b/freebsd/sys/dev/usb/wlan/if_uralvar.h @@ -51,7 +51,7 @@ struct ural_tx_radiotap_header { uint16_t wt_chan_freq; uint16_t wt_chan_flags; uint8_t wt_antenna; -} __packed __aligned(8); +} __packed; #define RAL_TX_RADIOTAP_PRESENT \ ((1 << IEEE80211_RADIOTAP_FLAGS) | \ diff --git a/freebsd/sys/dev/usb/wlan/if_urtw.c b/freebsd/sys/dev/usb/wlan/if_urtw.c index 309375f0..aba334af 100644 --- a/freebsd/sys/dev/usb/wlan/if_urtw.c +++ b/freebsd/sys/dev/usb/wlan/if_urtw.c @@ -36,10 +36,6 @@ __FBSDID("$FreeBSD$"); #include <sys/endian.h> #include <sys/kdb.h> -#include <machine/bus.h> -#include <machine/resource.h> -#include <sys/rman.h> - #include <net/if.h> #include <net/if_var.h> #include <net/if_arp.h> @@ -217,9 +213,6 @@ static uint8_t urtw_8225z2_agc[] = { 0x31, 0x31, 0x31, 0x31, 0x31, 0x31, 0x31 }; -static const uint8_t urtw_chan_2ghz[] = - { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }; - static uint32_t urtw_8225_channel[] = { 0x0000, /* dummy channel 0 */ 0x085c, /* 1 */ @@ -679,6 +672,7 @@ static void urtw_scan_end(struct ieee80211com *); static void urtw_getradiocaps(struct ieee80211com *, int, int *, struct ieee80211_channel[]); static void urtw_set_channel(struct ieee80211com *); +static void urtw_update_promisc(struct ieee80211com *); static void urtw_update_mcast(struct ieee80211com *); static int urtw_tx_start(struct urtw_softc *, struct ieee80211_node *, struct mbuf *, @@ -760,6 +754,7 @@ static void urtw_free_tx_data_list(struct urtw_softc *); static void urtw_free_rx_data_list(struct urtw_softc *); static void urtw_free_data_list(struct urtw_softc *, struct urtw_data data[], int, int); +static usb_error_t urtw_set_macaddr(struct urtw_softc *, const uint8_t *); static usb_error_t urtw_adapter_start(struct urtw_softc *); static usb_error_t urtw_adapter_start_b(struct urtw_softc *); static usb_error_t urtw_set_mode(struct urtw_softc *, uint32_t); @@ -905,6 +900,7 @@ urtw_attach(device_t dev) ic->ic_updateslot = urtw_updateslot; ic->ic_vap_create = urtw_vap_create; ic->ic_vap_delete = urtw_vap_delete; + ic->ic_update_promisc = urtw_update_promisc; ic->ic_update_mcast = urtw_update_mcast; ic->ic_parent = urtw_parent; ic->ic_transmit = urtw_transmit; @@ -1194,9 +1190,23 @@ fail: } static usb_error_t +urtw_set_macaddr(struct urtw_softc *sc, const uint8_t *macaddr) +{ + usb_error_t error; + + urtw_write32_m(sc, URTW_MAC0, ((const uint32_t *)macaddr)[0]); + urtw_write16_m(sc, URTW_MAC4, ((const uint32_t *)macaddr)[1] & 0xffff); + +fail: + return (error); +} + +static usb_error_t urtw_adapter_start(struct urtw_softc *sc) { struct ieee80211com *ic = &sc->sc_ic; + struct ieee80211vap *vap = TAILQ_FIRST(&ic->ic_vaps); + const uint8_t *macaddr; usb_error_t error; error = urtw_reset(sc); @@ -1216,8 +1226,11 @@ urtw_adapter_start(struct urtw_softc *sc) if (error) goto fail; /* applying MAC address again. */ - urtw_write32_m(sc, URTW_MAC0, ((uint32_t *)ic->ic_macaddr)[0]); - urtw_write16_m(sc, URTW_MAC4, ((uint32_t *)ic->ic_macaddr)[1] & 0xffff); + macaddr = vap ? vap->iv_myaddr : ic->ic_macaddr; + urtw_set_macaddr(sc, macaddr); + if (error) + goto fail; + error = urtw_set_mode(sc, URTW_EPROM_CMD_NORMAL); if (error) goto fail; @@ -1587,8 +1600,7 @@ urtw_getradiocaps(struct ieee80211com *ic, memset(bands, 0, sizeof(bands)); setbit(bands, IEEE80211_MODE_11B); setbit(bands, IEEE80211_MODE_11G); - ieee80211_add_channel_list_2ghz(chans, maxchans, nchans, - urtw_chan_2ghz, nitems(urtw_chan_2ghz), bands, 0); + ieee80211_add_channels_default_2ghz(chans, maxchans, nchans, bands, 0); } static void @@ -1641,6 +1653,17 @@ fail: } static void +urtw_update_promisc(struct ieee80211com *ic) +{ + struct urtw_softc *sc = ic->ic_softc; + + URTW_LOCK(sc); + if (sc->sc_flags & URTW_RUNNING) + urtw_rx_setconf(sc); + URTW_UNLOCK(sc); +} + +static void urtw_update_mcast(struct ieee80211com *ic) { @@ -1694,11 +1717,7 @@ urtw_tx_start(struct urtw_softc *sc, struct ieee80211_node *ni, struct mbuf *m0, if (ieee80211_radiotap_active_vap(vap)) { struct urtw_tx_radiotap_header *tap = &sc->sc_txtap; - /* XXX Are variables correct? */ tap->wt_flags = 0; - tap->wt_chan_freq = htole16(ic->ic_curchan->ic_freq); - tap->wt_chan_flags = htole16(ic->ic_curchan->ic_flags); - ieee80211_radiotap_tx(vap, m0); } @@ -1892,11 +1911,13 @@ static void urtw_watchdog(void *arg) { struct urtw_softc *sc = arg; + struct ieee80211com *ic = &sc->sc_ic; if (sc->sc_txtimer > 0) { if (--sc->sc_txtimer == 0) { device_printf(sc->sc_dev, "device timeout\n"); - counter_u64_add(sc->sc_ic.ic_oerrors, 1); + counter_u64_add(ic->ic_oerrors, 1); + ieee80211_restart_all(ic); return; } callout_reset(&sc->sc_watchdog_ch, hz, urtw_watchdog, sc); @@ -3179,6 +3200,8 @@ static usb_error_t urtw_8225v2b_rf_init(struct urtw_softc *sc) { struct ieee80211com *ic = &sc->sc_ic; + struct ieee80211vap *vap = TAILQ_FIRST(&ic->ic_vaps); + const uint8_t *macaddr; unsigned int i; uint8_t data8; usb_error_t error; @@ -3226,8 +3249,10 @@ urtw_8225v2b_rf_init(struct urtw_softc *sc) urtw_write8_m(sc, URTW_CONFIG1, data8); /* applying MAC address again. */ - urtw_write32_m(sc, URTW_MAC0, ((uint32_t *)ic->ic_macaddr)[0]); - urtw_write16_m(sc, URTW_MAC4, ((uint32_t *)ic->ic_macaddr)[1] & 0xffff); + macaddr = vap ? vap->iv_myaddr : ic->ic_macaddr; + error = urtw_set_macaddr(sc, macaddr); + if (error) + goto fail; error = urtw_set_mode(sc, URTW_EPROM_CMD_NORMAL); if (error) @@ -3887,7 +3912,6 @@ urtw_rx_setconf(struct urtw_softc *sc) if (sc->sc_flags & URTW_RTL8187B) { data = data | URTW_RX_FILTER_MNG | URTW_RX_FILTER_DATA | URTW_RX_FILTER_MCAST | URTW_RX_FILTER_BCAST | - URTW_RX_FILTER_NICMAC | URTW_RX_CHECK_BSSID | URTW_RX_FIFO_THRESHOLD_NONE | URTW_MAX_RX_DMA_2048 | URTW_RX_AUTORESETPHY | URTW_RCR_ONLYERLPKT; @@ -3902,14 +3926,6 @@ urtw_rx_setconf(struct urtw_softc *sc) if (sc->sc_crcmon == 1 && ic->ic_opmode == IEEE80211_M_MONITOR) data = data | URTW_RX_FILTER_CRCERR; - if (ic->ic_opmode == IEEE80211_M_MONITOR || - ic->ic_promisc > 0 || ic->ic_allmulti > 0) { - data = data | URTW_RX_FILTER_ALLMAC; - } else { - data = data | URTW_RX_FILTER_NICMAC; - data = data | URTW_RX_CHECK_BSSID; - } - data = data &~ URTW_RX_FIFO_THRESHOLD_MASK; data = data | URTW_RX_FIFO_THRESHOLD_NONE | URTW_RX_AUTORESETPHY; @@ -3917,6 +3933,16 @@ urtw_rx_setconf(struct urtw_softc *sc) data = data | URTW_MAX_RX_DMA_2048 | URTW_RCR_ONLYERLPKT; } + /* XXX allmulti should not be checked here... */ + if (ic->ic_opmode == IEEE80211_M_MONITOR || + ic->ic_promisc > 0 || ic->ic_allmulti > 0) { + data = data | URTW_RX_FILTER_CTL; + data = data | URTW_RX_FILTER_ALLMAC; + } else { + data = data | URTW_RX_FILTER_NICMAC; + data = data | URTW_RX_CHECK_BSSID; + } + urtw_write32_m(sc, URTW_RX, data); fail: return (error); @@ -3932,50 +3958,56 @@ urtw_rxeof(struct usb_xfer *xfer, struct urtw_data *data, int *rssi_p, struct urtw_softc *sc = data->sc; struct ieee80211com *ic = &sc->sc_ic; uint8_t noise = 0, rate; + uint64_t mactime; usbd_xfer_status(xfer, &actlen, NULL, NULL, NULL); - if (actlen < (int)URTW_MIN_RXBUFSZ) { - counter_u64_add(ic->ic_ierrors, 1); - return (NULL); - } - if (sc->sc_flags & URTW_RTL8187B) { struct urtw_8187b_rxhdr *rx; + if (actlen < sizeof(*rx) + IEEE80211_ACK_LEN) + goto fail; + rx = (struct urtw_8187b_rxhdr *)(data->buf + (actlen - (sizeof(struct urtw_8187b_rxhdr)))); flen = le32toh(rx->flag) & 0xfff; - if (flen > actlen) { - counter_u64_add(ic->ic_ierrors, 1); - return (NULL); - } + if (flen > actlen - sizeof(*rx)) + goto fail; + rate = (le32toh(rx->flag) >> URTW_RX_FLAG_RXRATE_SHIFT) & 0xf; /* XXX correct? */ rssi = rx->rssi & URTW_RX_RSSI_MASK; noise = rx->noise; + + if (ieee80211_radiotap_active(ic)) + mactime = rx->mactime; } else { struct urtw_8187l_rxhdr *rx; + if (actlen < sizeof(*rx) + IEEE80211_ACK_LEN) + goto fail; + rx = (struct urtw_8187l_rxhdr *)(data->buf + (actlen - (sizeof(struct urtw_8187l_rxhdr)))); flen = le32toh(rx->flag) & 0xfff; - if (flen > actlen) { - counter_u64_add(ic->ic_ierrors, 1); - return (NULL); - } + if (flen > actlen - sizeof(*rx)) + goto fail; rate = (le32toh(rx->flag) >> URTW_RX_FLAG_RXRATE_SHIFT) & 0xf; /* XXX correct? */ rssi = rx->rssi & URTW_RX_8187L_RSSI_MASK; noise = rx->noise; + + if (ieee80211_radiotap_active(ic)) + mactime = rx->mactime; } + if (flen < IEEE80211_ACK_LEN) + goto fail; + mnew = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); - if (mnew == NULL) { - counter_u64_add(ic->ic_ierrors, 1); - return (NULL); - } + if (mnew == NULL) + goto fail; m = data->m; data->m = mnew; @@ -3987,20 +4019,23 @@ urtw_rxeof(struct usb_xfer *xfer, struct urtw_data *data, int *rssi_p, if (ieee80211_radiotap_active(ic)) { struct urtw_rx_radiotap_header *tap = &sc->sc_rxtap; - /* XXX Are variables correct? */ - tap->wr_chan_freq = htole16(ic->ic_curchan->ic_freq); - tap->wr_chan_flags = htole16(ic->ic_curchan->ic_flags); + tap->wr_tsf = mactime; + tap->wr_flags = 0; tap->wr_dbm_antsignal = (int8_t)rssi; } wh = mtod(m, struct ieee80211_frame *); - if ((wh->i_fc[0] & IEEE80211_FC0_TYPE_MASK) == IEEE80211_FC0_TYPE_DATA) + if (IEEE80211_IS_DATA(wh)) sc->sc_currate = (rate > 0) ? rate : sc->sc_currate; *rssi_p = rssi; *nf_p = noise; /* XXX correct? */ return (m); + +fail: + counter_u64_add(ic->ic_ierrors, 1); + return (NULL); } static void @@ -4008,7 +4043,6 @@ urtw_bulk_rx_callback(struct usb_xfer *xfer, usb_error_t error) { struct urtw_softc *sc = usbd_xfer_softc(xfer); struct ieee80211com *ic = &sc->sc_ic; - struct ieee80211_frame *wh; struct ieee80211_node *ni; struct mbuf *m = NULL; struct urtw_data *data; @@ -4046,9 +4080,13 @@ setup: */ URTW_UNLOCK(sc); if (m != NULL) { - wh = mtod(m, struct ieee80211_frame *); - ni = ieee80211_find_rxnode(ic, - (struct ieee80211_frame_min *)wh); + if (m->m_pkthdr.len >= + sizeof(struct ieee80211_frame_min)) { + ni = ieee80211_find_rxnode(ic, + mtod(m, struct ieee80211_frame_min *)); + } else + ni = NULL; + if (ni != NULL) { (void) ieee80211_input(ni, m, rssi, nf); /* node is no longer needed */ diff --git a/freebsd/sys/dev/usb/wlan/if_urtwvar.h b/freebsd/sys/dev/usb/wlan/if_urtwvar.h index 08ffc8f3..87c24d64 100644 --- a/freebsd/sys/dev/usb/wlan/if_urtwvar.h +++ b/freebsd/sys/dev/usb/wlan/if_urtwvar.h @@ -47,10 +47,6 @@ struct urtw_data { }; typedef STAILQ_HEAD(, urtw_data) urtw_datahead; -/* XXX not correct.. */ -#define URTW_MIN_RXBUFSZ \ - (sizeof(struct ieee80211_frame_min)) - #define URTW_RX_DATA_LIST_COUNT 4 #define URTW_TX_DATA_LIST_COUNT 16 #define URTW_RX_MAXSIZE 0x9c4 @@ -59,23 +55,27 @@ typedef STAILQ_HEAD(, urtw_data) urtw_datahead; struct urtw_rx_radiotap_header { struct ieee80211_radiotap_header wr_ihdr; + uint64_t wr_tsf; uint8_t wr_flags; + uint8_t wr_pad; uint16_t wr_chan_freq; uint16_t wr_chan_flags; int8_t wr_dbm_antsignal; } __packed __aligned(8); #define URTW_RX_RADIOTAP_PRESENT \ - ((1 << IEEE80211_RADIOTAP_FLAGS) | \ + ((1 << IEEE80211_RADIOTAP_TSFT) | \ + (1 << IEEE80211_RADIOTAP_FLAGS) | \ (1 << IEEE80211_RADIOTAP_CHANNEL) | \ (1 << IEEE80211_RADIOTAP_DBM_ANTSIGNAL)) struct urtw_tx_radiotap_header { struct ieee80211_radiotap_header wt_ihdr; uint8_t wt_flags; + uint8_t wt_pad; uint16_t wt_chan_freq; uint16_t wt_chan_flags; -} __packed __aligned(8); +} __packed; #define URTW_TX_RADIOTAP_PRESENT \ ((1 << IEEE80211_RADIOTAP_FLAGS) | \ diff --git a/freebsd/sys/dev/usb/wlan/if_zyd.c b/freebsd/sys/dev/usb/wlan/if_zyd.c index 1835b58b..bb4a9e40 100644 --- a/freebsd/sys/dev/usb/wlan/if_zyd.c +++ b/freebsd/sys/dev/usb/wlan/if_zyd.c @@ -46,10 +46,6 @@ __FBSDID("$FreeBSD$"); #include <sys/endian.h> #include <sys/kdb.h> -#include <machine/bus.h> -#include <machine/resource.h> -#include <sys/rman.h> - #include <net/bpf.h> #include <net/if.h> #include <net/if_var.h> @@ -2891,8 +2887,7 @@ zyd_getradiocaps(struct ieee80211com *ic, memset(bands, 0, sizeof(bands)); setbit(bands, IEEE80211_MODE_11B); setbit(bands, IEEE80211_MODE_11G); - ieee80211_add_channel_list_2ghz(chans, maxchans, nchans, - zyd_chan_2ghz, nitems(zyd_chan_2ghz), bands, 0); + ieee80211_add_channels_default_2ghz(chans, maxchans, nchans, bands, 0); } static void diff --git a/freebsd/sys/dev/usb/wlan/if_zydreg.h b/freebsd/sys/dev/usb/wlan/if_zydreg.h index 724b8c57..a4523199 100644 --- a/freebsd/sys/dev/usb/wlan/if_zydreg.h +++ b/freebsd/sys/dev/usb/wlan/if_zydreg.h @@ -421,10 +421,6 @@ #define ZYD_CR254 0x93f8 #define ZYD_CR255 0x93fc -/* nitems(ZYD_*_CHANTABLE) */ -static const uint8_t zyd_chan_2ghz[] = - { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }; - /* copied nearly verbatim from the Linux driver rewrite */ #define ZYD_DEF_PHY \ { \ @@ -1204,7 +1200,7 @@ struct zyd_tx_radiotap_header { uint8_t wt_rate; uint16_t wt_chan_freq; uint16_t wt_chan_flags; -} __packed __aligned(8); +} __packed; #define ZYD_TX_RADIOTAP_PRESENT \ ((1 << IEEE80211_RADIOTAP_FLAGS) | \ diff --git a/freebsd/sys/fs/devfs/devfs_vnops.c b/freebsd/sys/fs/devfs/devfs_vnops.c index 176664db..ac1f21ee 100644 --- a/freebsd/sys/fs/devfs/devfs_vnops.c +++ b/freebsd/sys/fs/devfs/devfs_vnops.c @@ -47,12 +47,14 @@ #include <sys/systm.h> #include <sys/conf.h> #include <sys/dirent.h> +#include <sys/eventhandler.h> #include <sys/fcntl.h> #include <sys/file.h> #include <sys/filedesc.h> #include <sys/filio.h> #include <sys/jail.h> #include <sys/kernel.h> +#include <sys/limits.h> #include <sys/lock.h> #include <sys/malloc.h> #include <sys/mman.h> @@ -1441,7 +1443,6 @@ devfs_reclaim(struct vop_reclaim_args *ap) vp->v_data = NULL; } mtx_unlock(&devfs_de_interlock); - vnode_destroy_vobject(vp); return (0); } diff --git a/freebsd/sys/i386/include/machine/cpufunc.h b/freebsd/sys/i386/include/machine/cpufunc.h index c640b569..a029da3b 100644 --- a/freebsd/sys/i386/include/machine/cpufunc.h +++ b/freebsd/sys/i386/include/machine/cpufunc.h @@ -110,23 +110,49 @@ disable_intr(void) __asm __volatile("cli" : : : "memory"); } +#ifdef _KERNEL static __inline void do_cpuid(u_int ax, u_int *p) { __asm __volatile("cpuid" - : "=a" (p[0]), "=b" (p[1]), "=c" (p[2]), "=d" (p[3]) - : "0" (ax)); + : "=a" (p[0]), "=b" (p[1]), "=c" (p[2]), "=d" (p[3]) + : "0" (ax)); } static __inline void cpuid_count(u_int ax, u_int cx, u_int *p) { __asm __volatile("cpuid" - : "=a" (p[0]), "=b" (p[1]), "=c" (p[2]), "=d" (p[3]) - : "0" (ax), "c" (cx)); + : "=a" (p[0]), "=b" (p[1]), "=c" (p[2]), "=d" (p[3]) + : "0" (ax), "c" (cx)); +} +#else +static __inline void +do_cpuid(u_int ax, u_int *p) +{ + __asm __volatile( + "pushl\t%%ebx\n\t" + "cpuid\n\t" + "movl\t%%ebx,%1\n\t" + "popl\t%%ebx" + : "=a" (p[0]), "=DS" (p[1]), "=c" (p[2]), "=d" (p[3]) + : "0" (ax)); } static __inline void +cpuid_count(u_int ax, u_int cx, u_int *p) +{ + __asm __volatile( + "pushl\t%%ebx\n\t" + "cpuid\n\t" + "movl\t%%ebx,%1\n\t" + "popl\t%%ebx" + : "=a" (p[0]), "=DS" (p[1]), "=c" (p[2]), "=d" (p[3]) + : "0" (ax), "c" (cx)); +} +#endif + +static __inline void enable_intr(void) { @@ -708,6 +734,22 @@ intr_restore(register_t eflags) } #endif /* __rtems__ */ +static __inline uint32_t +rdpkru(void) +{ + uint32_t res; + + __asm __volatile("rdpkru" : "=a" (res) : "c" (0) : "edx"); + return (res); +} + +static __inline void +wrpkru(uint32_t mask) +{ + + __asm __volatile("wrpkru" : : "a" (mask), "c" (0), "d" (0)); +} + #else /* !(__GNUCLIKE_ASM && __CC_SUPPORTS___INLINE) */ #ifndef __rtems__ diff --git a/freebsd/sys/i386/include/machine/md_var.h b/freebsd/sys/i386/include/machine/md_var.h index 53e1861c..b20446dc 100644 --- a/freebsd/sys/i386/include/machine/md_var.h +++ b/freebsd/sys/i386/include/machine/md_var.h @@ -69,6 +69,8 @@ void doreti_popl_fs_fault(void) __asm(__STRING(doreti_popl_fs_fault)); void fill_based_sd(struct segment_descriptor *sdp, uint32_t base); void i686_pagezero(void *addr); void sse2_pagezero(void *addr); +int minidumpsys_nopae(struct dumperinfo *); +int minidumpsys_pae(struct dumperinfo *); void init_AMD_Elan_sc520(void); vm_paddr_t kvtop(void *addr); void panicifcpuunsupported(void); diff --git a/freebsd/sys/kern/init_main.c b/freebsd/sys/kern/init_main.c index c6a9e310..2265d89a 100644 --- a/freebsd/sys/kern/init_main.c +++ b/freebsd/sys/kern/init_main.c @@ -55,6 +55,7 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/kernel.h> #include <sys/epoch.h> +#include <sys/eventhandler.h> #include <sys/exec.h> #include <sys/file.h> #include <sys/filedesc.h> @@ -110,6 +111,14 @@ struct thread0_storage thread0_st __aligned(32); struct vmspace vmspace0; struct proc *initproc; +int +linux_alloc_current_noop(struct thread *td __unused, int flags __unused) +{ + return (0); +} +int (*lkpi_alloc_current)(struct thread *, int) = linux_alloc_current_noop; + + #ifndef BOOTHOWTO #define BOOTHOWTO 0 #endif @@ -155,11 +164,6 @@ SET_DECLARE(sysinit_set, struct sysinit); struct sysinit **sysinit, **sysinit_end; struct sysinit **newsysinit, **newsysinit_end; -EVENTHANDLER_LIST_DECLARE(process_init); -EVENTHANDLER_LIST_DECLARE(thread_init); -EVENTHANDLER_LIST_DECLARE(process_ctor); -EVENTHANDLER_LIST_DECLARE(thread_ctor); - /* * Merge a new sysinit set into the current set, reallocating it if * necessary. This can only be called after malloc is running. @@ -440,7 +444,6 @@ struct sysentvec null_sysvec = { .sv_coredump = NULL, .sv_imgact_try = NULL, .sv_minsigstksz = 0, - .sv_pagesize = PAGE_SIZE, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = VM_MAXUSER_ADDRESS, .sv_usrstack = USRSTACK, @@ -482,7 +485,7 @@ proc0_init(void *dummy __unused) GIANT_REQUIRED; p = &proc0; td = &thread0; - + /* * Initialize magic number and osrel. */ @@ -822,11 +825,9 @@ start_init(void *dummy) } /* - * Like kproc_create(), but runs in its own address space. - * We do this early to reserve pid 1. - * - * Note special case - do not make it runnable yet. Other work - * in progress will change this more. + * Like kproc_create(), but runs in its own address space. We do this + * early to reserve pid 1. Note special case - do not make it + * runnable yet, init execution is started when userspace can be served. */ static void create_init(const void *udata __unused) diff --git a/freebsd/sys/kern/kern_conf.c b/freebsd/sys/kern/kern_conf.c index 560a450a..26718648 100644 --- a/freebsd/sys/kern/kern_conf.c +++ b/freebsd/sys/kern/kern_conf.c @@ -656,7 +656,7 @@ prep_cdevsw(struct cdevsw *devsw, int flags) return (0); } - if (devsw->d_version != D_VERSION_03) { + if (devsw->d_version != D_VERSION_04) { printf( "WARNING: Device driver \"%s\" has wrong version %s\n", devsw->d_name == NULL ? "???" : devsw->d_name, diff --git a/freebsd/sys/kern/kern_event.c b/freebsd/sys/kern/kern_event.c index 5c75c657..f0700e55 100644 --- a/freebsd/sys/kern/kern_event.c +++ b/freebsd/sys/kern/kern_event.c @@ -44,6 +44,7 @@ __FBSDID("$FreeBSD$"); #include <sys/systm.h> #include <sys/capsicum.h> #include <sys/kernel.h> +#include <sys/limits.h> #include <sys/lock.h> #include <sys/mutex.h> #include <sys/rwlock.h> diff --git a/freebsd/sys/kern/kern_intr.c b/freebsd/sys/kern/kern_intr.c index 122e82bd..65b633f6 100644 --- a/freebsd/sys/kern/kern_intr.c +++ b/freebsd/sys/kern/kern_intr.c @@ -101,7 +101,7 @@ struct proc *intrproc; static MALLOC_DEFINE(M_ITHREAD, "ithread", "Interrupt Threads"); -static int intr_storm_threshold = 1000; +static int intr_storm_threshold = 0; SYSCTL_INT(_hw, OID_AUTO, intr_storm_threshold, CTLFLAG_RWTUN, &intr_storm_threshold, 0, "Number of consecutive interrupts before storm protection is enabled"); @@ -231,10 +231,20 @@ intr_event_update(struct intr_event *ie) } /* - * If the handler names were too long, add +'s to indicate missing - * names. If we run out of room and still have +'s to add, change - * the last character from a + to a *. + * If there is only one handler and its name is too long, just copy in + * as much of the end of the name (includes the unit number) as will + * fit. Otherwise, we have multiple handlers and not all of the names + * will fit. Add +'s to indicate missing names. If we run out of room + * and still have +'s to add, change the last character from a + to a *. */ + if (missed == 1 && space == 1) { + ih = CK_SLIST_FIRST(&ie->ie_handlers); + missed = strlen(ie->ie_fullname) + strlen(ih->ih_name) + 2 - + sizeof(ie->ie_fullname); + strcat(ie->ie_fullname, (missed == 0) ? " " : "-"); + strcat(ie->ie_fullname, &ih->ih_name[missed]); + missed = 0; + } last = &ie->ie_fullname[sizeof(ie->ie_fullname) - 2]; while (missed-- > 0) { if (strlen(ie->ie_fullname) + 1 == sizeof(ie->ie_fullname)) { @@ -393,6 +403,25 @@ intr_event_bind_ithread(struct intr_event *ie, int cpu) return (_intr_event_bind(ie, cpu, false, true)); } +/* + * Bind an interrupt event's ithread to the specified cpuset. + */ +int +intr_event_bind_ithread_cpuset(struct intr_event *ie, cpuset_t *cs) +{ + lwpid_t id; + + mtx_lock(&ie->ie_lock); + if (ie->ie_thread != NULL) { + id = ie->ie_thread->it_thread->td_tid; + mtx_unlock(&ie->ie_lock); + return (cpuset_setthread(id, cs)); + } else { + mtx_unlock(&ie->ie_lock); + } + return (ENODEV); +} + static struct intr_event * intr_lookup(int irq) { diff --git a/freebsd/sys/kern/kern_mbuf.c b/freebsd/sys/kern/kern_mbuf.c index f94eda5b..85846acd 100644 --- a/freebsd/sys/kern/kern_mbuf.c +++ b/freebsd/sys/kern/kern_mbuf.c @@ -33,6 +33,7 @@ __FBSDID("$FreeBSD$"); #include <rtems/bsd/local/opt_param.h> +#include <rtems/bsd/local/opt_kern_tls.h> #include <sys/param.h> #include <sys/conf.h> @@ -43,13 +44,20 @@ __FBSDID("$FreeBSD$"); #include <sys/domain.h> #include <sys/eventhandler.h> #include <sys/kernel.h> +#include <sys/ktls.h> #include <sys/limits.h> #include <sys/lock.h> #include <sys/mutex.h> #include <sys/protosw.h> +#include <sys/refcount.h> +#include <sys/sf_buf.h> #include <sys/smp.h> +#include <sys/socket.h> #include <sys/sysctl.h> +#include <net/if.h> +#include <net/if_var.h> + #include <vm/vm.h> #include <vm/vm_extern.h> #include <vm/vm_kern.h> @@ -112,11 +120,20 @@ int nmbjumbop; /* limits number of page size jumbo clusters */ int nmbjumbo9; /* limits number of 9k jumbo clusters */ int nmbjumbo16; /* limits number of 16k jumbo clusters */ +bool mb_use_ext_pgs; /* use EXT_PGS mbufs for sendfile & TLS */ +SYSCTL_BOOL(_kern_ipc, OID_AUTO, mb_use_ext_pgs, CTLFLAG_RWTUN, + &mb_use_ext_pgs, 0, + "Use unmapped mbufs for sendfile(2) and TLS offload"); + static quad_t maxmbufmem; /* overall real memory limit for all mbufs */ SYSCTL_QUAD(_kern_ipc, OID_AUTO, maxmbufmem, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &maxmbufmem, 0, "Maximum real memory allocatable to various mbuf types"); +static counter_u64_t snd_tag_count; +SYSCTL_COUNTER_U64(_kern_ipc, OID_AUTO, num_snd_tags, CTLFLAG_RW, + &snd_tag_count, "# of active mbuf send tags"); + /* * tunable_mbinit() has to be run before any mbuf allocations are done. */ @@ -285,6 +302,7 @@ uma_zone_t zone_pack; uma_zone_t zone_jumbop; uma_zone_t zone_jumbo9; uma_zone_t zone_jumbo16; +uma_zone_t zone_extpgs; /* * Local prototypes. @@ -302,6 +320,9 @@ static void *mbuf_jumbo_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); /* Ensure that MSIZE is a power of 2. */ CTASSERT((((MSIZE - 1) ^ MSIZE) + 1) >> 1 == MSIZE); +_Static_assert(sizeof(struct mbuf_ext_pgs) == 256, + "mbuf_ext_pgs size mismatch"); + /* * Initialize FreeBSD Network buffer allocation. */ @@ -383,6 +404,15 @@ mbuf_init(void *dummy) uma_zone_set_warning(zone_jumbo16, "kern.ipc.nmbjumbo16 limit reached"); uma_zone_set_maxaction(zone_jumbo16, mb_reclaim); + zone_extpgs = uma_zcreate(MBUF_EXTPGS_MEM_NAME, + sizeof(struct mbuf_ext_pgs), +#ifdef INVARIANTS + trash_ctor, trash_dtor, trash_init, trash_fini, +#else + NULL, NULL, NULL, NULL, +#endif + UMA_ALIGN_CACHE, 0); + /* * Hook event handler for low-memory situation, used to * drain protocols and push data back to the caches (UMA @@ -392,6 +422,8 @@ mbuf_init(void *dummy) EVENTHANDLER_REGISTER(vm_lowmem, mb_reclaim, NULL, EVENTHANDLER_PRI_FIRST); #endif /* __rtems__ */ + + snd_tag_count = counter_u64_alloc(M_WAITOK); } SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL); @@ -697,14 +729,14 @@ mb_dtor_pack(void *mem, int size, void *arg) #endif /* * If there are processes blocked on zone_clust, waiting for pages - * to be freed up, * cause them to be woken up by draining the - * packet zone. We are exposed to a race here * (in the check for + * to be freed up, cause them to be woken up by draining the + * packet zone. We are exposed to a race here (in the check for * the UMA_ZFLAG_FULL) where we might miss the flag set, but that * is deliberate. We don't want to acquire the zone lock for every * mbuf free. */ if (uma_zone_exhausted_nolock(zone_clust)) - zone_drain(zone_pack); + uma_zone_reclaim(zone_pack, UMA_RECLAIM_DRAIN); } /* @@ -832,6 +864,384 @@ mb_reclaim(uma_zone_t zone __unused, int pending __unused) } /* + * Free "count" units of I/O from an mbuf chain. They could be held + * in EXT_PGS or just as a normal mbuf. This code is intended to be + * called in an error path (I/O error, closed connection, etc). + */ +void +mb_free_notready(struct mbuf *m, int count) +{ + int i; + + for (i = 0; i < count && m != NULL; i++) { +#ifndef __rtems__ + if ((m->m_flags & M_EXT) != 0 && + m->m_ext.ext_type == EXT_PGS) { + m->m_ext.ext_pgs->nrdy--; + if (m->m_ext.ext_pgs->nrdy != 0) + continue; + } +#endif /* __rtems__ */ + m = m_free(m); + } + KASSERT(i == count, ("Removed only %d items from %p", i, m)); +} + +#ifndef __rtems__ +/* + * Compress an unmapped mbuf into a simple mbuf when it holds a small + * amount of data. This is used as a DOS defense to avoid having + * small packets tie up wired pages, an ext_pgs structure, and an + * mbuf. Since this converts the existing mbuf in place, it can only + * be used if there are no other references to 'm'. + */ +int +mb_unmapped_compress(struct mbuf *m) +{ + volatile u_int *refcnt; + struct mbuf m_temp; + + /* + * Assert that 'm' does not have a packet header. If 'm' had + * a packet header, it would only be able to hold MHLEN bytes + * and m_data would have to be initialized differently. + */ + KASSERT((m->m_flags & M_PKTHDR) == 0 && (m->m_flags & M_EXT) && + m->m_ext.ext_type == EXT_PGS, + ("%s: m %p !M_EXT or !EXT_PGS or M_PKTHDR", __func__, m)); + KASSERT(m->m_len <= MLEN, ("m_len too large %p", m)); + + if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) { + refcnt = &m->m_ext.ext_count; + } else { + KASSERT(m->m_ext.ext_cnt != NULL, + ("%s: no refcounting pointer on %p", __func__, m)); + refcnt = m->m_ext.ext_cnt; + } + + if (*refcnt != 1) + return (EBUSY); + + /* + * Copy mbuf header and m_ext portion of 'm' to 'm_temp' to + * create a "fake" EXT_PGS mbuf that can be used with + * m_copydata() as well as the ext_free callback. + */ + memcpy(&m_temp, m, offsetof(struct mbuf, m_ext) + sizeof (m->m_ext)); + m_temp.m_next = NULL; + m_temp.m_nextpkt = NULL; + + /* Turn 'm' into a "normal" mbuf. */ + m->m_flags &= ~(M_EXT | M_RDONLY | M_NOMAP); + m->m_data = m->m_dat; + + /* Copy data from template's ext_pgs. */ + m_copydata(&m_temp, 0, m_temp.m_len, mtod(m, caddr_t)); + + /* Free the backing pages. */ + m_temp.m_ext.ext_free(&m_temp); + + /* Finally, free the ext_pgs struct. */ + uma_zfree(zone_extpgs, m_temp.m_ext.ext_pgs); + return (0); +} + +/* + * These next few routines are used to permit downgrading an unmapped + * mbuf to a chain of mapped mbufs. This is used when an interface + * doesn't supported unmapped mbufs or if checksums need to be + * computed in software. + * + * Each unmapped mbuf is converted to a chain of mbufs. First, any + * TLS header data is stored in a regular mbuf. Second, each page of + * unmapped data is stored in an mbuf with an EXT_SFBUF external + * cluster. These mbufs use an sf_buf to provide a valid KVA for the + * associated physical page. They also hold a reference on the + * original EXT_PGS mbuf to ensure the physical page doesn't go away. + * Finally, any TLS trailer data is stored in a regular mbuf. + * + * mb_unmapped_free_mext() is the ext_free handler for the EXT_SFBUF + * mbufs. It frees the associated sf_buf and releases its reference + * on the original EXT_PGS mbuf. + * + * _mb_unmapped_to_ext() is a helper function that converts a single + * unmapped mbuf into a chain of mbufs. + * + * mb_unmapped_to_ext() is the public function that walks an mbuf + * chain converting any unmapped mbufs to mapped mbufs. It returns + * the new chain of unmapped mbufs on success. On failure it frees + * the original mbuf chain and returns NULL. + */ +static void +mb_unmapped_free_mext(struct mbuf *m) +{ + struct sf_buf *sf; + struct mbuf *old_m; + + sf = m->m_ext.ext_arg1; + sf_buf_free(sf); + + /* Drop the reference on the backing EXT_PGS mbuf. */ + old_m = m->m_ext.ext_arg2; + mb_free_ext(old_m); +} + +static struct mbuf * +_mb_unmapped_to_ext(struct mbuf *m) +{ + struct mbuf_ext_pgs *ext_pgs; + struct mbuf *m_new, *top, *prev, *mref; + struct sf_buf *sf; + vm_page_t pg; + int i, len, off, pglen, pgoff, seglen, segoff; + volatile u_int *refcnt; + u_int ref_inc = 0; + + MBUF_EXT_PGS_ASSERT(m); + ext_pgs = m->m_ext.ext_pgs; + len = m->m_len; + KASSERT(ext_pgs->tls == NULL, ("%s: can't convert TLS mbuf %p", + __func__, m)); + + /* See if this is the mbuf that holds the embedded refcount. */ + if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) { + refcnt = &m->m_ext.ext_count; + mref = m; + } else { + KASSERT(m->m_ext.ext_cnt != NULL, + ("%s: no refcounting pointer on %p", __func__, m)); + refcnt = m->m_ext.ext_cnt; + mref = __containerof(refcnt, struct mbuf, m_ext.ext_count); + } + + /* Skip over any data removed from the front. */ + off = mtod(m, vm_offset_t); + + top = NULL; + if (ext_pgs->hdr_len != 0) { + if (off >= ext_pgs->hdr_len) { + off -= ext_pgs->hdr_len; + } else { + seglen = ext_pgs->hdr_len - off; + segoff = off; + seglen = min(seglen, len); + off = 0; + len -= seglen; + m_new = m_get(M_NOWAIT, MT_DATA); + if (m_new == NULL) + goto fail; + m_new->m_len = seglen; + prev = top = m_new; + memcpy(mtod(m_new, void *), &ext_pgs->hdr[segoff], + seglen); + } + } + pgoff = ext_pgs->first_pg_off; + for (i = 0; i < ext_pgs->npgs && len > 0; i++) { + pglen = mbuf_ext_pg_len(ext_pgs, i, pgoff); + if (off >= pglen) { + off -= pglen; + pgoff = 0; + continue; + } + seglen = pglen - off; + segoff = pgoff + off; + off = 0; + seglen = min(seglen, len); + len -= seglen; + + pg = PHYS_TO_VM_PAGE(ext_pgs->pa[i]); + m_new = m_get(M_NOWAIT, MT_DATA); + if (m_new == NULL) + goto fail; + if (top == NULL) { + top = prev = m_new; + } else { + prev->m_next = m_new; + prev = m_new; + } + sf = sf_buf_alloc(pg, SFB_NOWAIT); + if (sf == NULL) + goto fail; + + ref_inc++; + m_extadd(m_new, (char *)sf_buf_kva(sf), PAGE_SIZE, + mb_unmapped_free_mext, sf, mref, M_RDONLY, EXT_SFBUF); + m_new->m_data += segoff; + m_new->m_len = seglen; + + pgoff = 0; + }; + if (len != 0) { + KASSERT((off + len) <= ext_pgs->trail_len, + ("off + len > trail (%d + %d > %d)", off, len, + ext_pgs->trail_len)); + m_new = m_get(M_NOWAIT, MT_DATA); + if (m_new == NULL) + goto fail; + if (top == NULL) + top = m_new; + else + prev->m_next = m_new; + m_new->m_len = len; + memcpy(mtod(m_new, void *), &ext_pgs->trail[off], len); + } + + if (ref_inc != 0) { + /* + * Obtain an additional reference on the old mbuf for + * each created EXT_SFBUF mbuf. They will be dropped + * in mb_unmapped_free_mext(). + */ + if (*refcnt == 1) + *refcnt += ref_inc; + else + atomic_add_int(refcnt, ref_inc); + } + m_free(m); + return (top); + +fail: + if (ref_inc != 0) { + /* + * Obtain an additional reference on the old mbuf for + * each created EXT_SFBUF mbuf. They will be + * immediately dropped when these mbufs are freed + * below. + */ + if (*refcnt == 1) + *refcnt += ref_inc; + else + atomic_add_int(refcnt, ref_inc); + } + m_free(m); + m_freem(top); + return (NULL); +} + +struct mbuf * +mb_unmapped_to_ext(struct mbuf *top) +{ + struct mbuf *m, *next, *prev = NULL; + + prev = NULL; + for (m = top; m != NULL; m = next) { + /* m might be freed, so cache the next pointer. */ + next = m->m_next; + if (m->m_flags & M_NOMAP) { + if (prev != NULL) { + /* + * Remove 'm' from the new chain so + * that the 'top' chain terminates + * before 'm' in case 'top' is freed + * due to an error. + */ + prev->m_next = NULL; + } + m = _mb_unmapped_to_ext(m); + if (m == NULL) { + m_freem(top); + m_freem(next); + return (NULL); + } + if (prev == NULL) { + top = m; + } else { + prev->m_next = m; + } + + /* + * Replaced one mbuf with a chain, so we must + * find the end of chain. + */ + prev = m_last(m); + } else { + if (prev != NULL) { + prev->m_next = m; + } + prev = m; + } + } + return (top); +} + +/* + * Allocate an empty EXT_PGS mbuf. The ext_free routine is + * responsible for freeing any pages backing this mbuf when it is + * freed. + */ +struct mbuf * +mb_alloc_ext_pgs(int how, bool pkthdr, m_ext_free_t ext_free) +{ + struct mbuf *m; + struct mbuf_ext_pgs *ext_pgs; + + if (pkthdr) + m = m_gethdr(how, MT_DATA); + else + m = m_get(how, MT_DATA); + if (m == NULL) + return (NULL); + + ext_pgs = uma_zalloc(zone_extpgs, how); + if (ext_pgs == NULL) { + m_free(m); + return (NULL); + } + ext_pgs->npgs = 0; + ext_pgs->nrdy = 0; + ext_pgs->first_pg_off = 0; + ext_pgs->last_pg_len = 0; + ext_pgs->hdr_len = 0; + ext_pgs->trail_len = 0; + ext_pgs->tls = NULL; + ext_pgs->so = NULL; + m->m_data = NULL; + m->m_flags |= (M_EXT | M_RDONLY | M_NOMAP); + m->m_ext.ext_type = EXT_PGS; + m->m_ext.ext_flags = EXT_FLAG_EMBREF; + m->m_ext.ext_count = 1; + m->m_ext.ext_pgs = ext_pgs; + m->m_ext.ext_size = 0; + m->m_ext.ext_free = ext_free; + return (m); +} + +#ifdef INVARIANT_SUPPORT +void +mb_ext_pgs_check(struct mbuf_ext_pgs *ext_pgs) +{ + + /* + * NB: This expects a non-empty buffer (npgs > 0 and + * last_pg_len > 0). + */ + KASSERT(ext_pgs->npgs > 0, + ("ext_pgs with no valid pages: %p", ext_pgs)); + KASSERT(ext_pgs->npgs <= nitems(ext_pgs->pa), + ("ext_pgs with too many pages: %p", ext_pgs)); + KASSERT(ext_pgs->nrdy <= ext_pgs->npgs, + ("ext_pgs with too many ready pages: %p", ext_pgs)); + KASSERT(ext_pgs->first_pg_off < PAGE_SIZE, + ("ext_pgs with too large page offset: %p", ext_pgs)); + KASSERT(ext_pgs->last_pg_len > 0, + ("ext_pgs with zero last page length: %p", ext_pgs)); + KASSERT(ext_pgs->last_pg_len <= PAGE_SIZE, + ("ext_pgs with too large last page length: %p", ext_pgs)); + if (ext_pgs->npgs == 1) { + KASSERT(ext_pgs->first_pg_off + ext_pgs->last_pg_len <= + PAGE_SIZE, ("ext_pgs with single page too large: %p", + ext_pgs)); + } + KASSERT(ext_pgs->hdr_len <= sizeof(ext_pgs->hdr), + ("ext_pgs with too large header length: %p", ext_pgs)); + KASSERT(ext_pgs->trail_len <= sizeof(ext_pgs->trail), + ("ext_pgs with too large header length: %p", ext_pgs)); +} +#endif +#endif /* __rtems__ */ + +/* * Clean up after mbufs with M_EXT storage attached to them if the * reference count hits 1. */ @@ -865,7 +1275,8 @@ mb_free_ext(struct mbuf *m) */ if (m->m_flags & M_NOFREE) { freembuf = 0; - KASSERT(m->m_ext.ext_type == EXT_EXTREF, + KASSERT(m->m_ext.ext_type == EXT_EXTREF || + m->m_ext.ext_type == EXT_RXRING, ("%s: no-free mbuf %p has wrong type", __func__, m)); } else freembuf = 1; @@ -896,6 +1307,27 @@ mb_free_ext(struct mbuf *m) uma_zfree(zone_mbuf, mref); break; #ifndef __rtems__ + case EXT_PGS: { +#ifdef KERN_TLS + struct mbuf_ext_pgs *pgs; + struct ktls_session *tls; +#endif + + KASSERT(mref->m_ext.ext_free != NULL, + ("%s: ext_free not set", __func__)); + mref->m_ext.ext_free(mref); +#ifdef KERN_TLS + pgs = mref->m_ext.ext_pgs; + tls = pgs->tls; + if (tls != NULL && + !refcount_release_if_not_last(&tls->refcount)) + ktls_enqueue_to_free(pgs); + else +#endif + uma_zfree(zone_extpgs, mref->m_ext.ext_pgs); + uma_zfree(zone_mbuf, mref); + break; + } case EXT_SFBUF: #endif /* __rtems__ */ case EXT_NET_DRV: @@ -911,6 +1343,10 @@ mb_free_ext(struct mbuf *m) ("%s: ext_free not set", __func__)); m->m_ext.ext_free(m); break; + case EXT_RXRING: + KASSERT(m->m_ext.ext_free == NULL, + ("%s: ext_free is set", __func__)); + break; default: KASSERT(m->m_ext.ext_type == 0, ("%s: unknown ext_type", __func__)); @@ -950,7 +1386,7 @@ m_clget(struct mbuf *m, int how) * we might be able to loosen a few clusters up on the drain. */ if ((how & M_NOWAIT) && (m->m_ext.ext_buf == NULL)) { - zone_drain(zone_pack); + uma_zone_reclaim(zone_pack, UMA_RECLAIM_DRAIN); uma_zalloc_arg(zone_clust, m, how); } MBUF_PROBE2(m__clget, m, how); @@ -1051,8 +1487,7 @@ m_getjcl(int how, short type, int flags, int size) * Allocate a given length worth of mbufs and/or clusters (whatever fits * best) and return a pointer to the top of the allocated chain. If an * existing mbuf chain is provided, then we will append the new chain - * to the existing one but still return the top of the newly allocated - * chain. + * to the existing one and return a pointer to the provided mbuf. */ struct mbuf * m_getm2(struct mbuf *m, int len, int how, short type, int flags) @@ -1165,3 +1600,24 @@ m_freem(struct mbuf *mb) while (mb != NULL) mb = m_free(mb); } + +void +m_snd_tag_init(struct m_snd_tag *mst, struct ifnet *ifp) +{ + + if_ref(ifp); + mst->ifp = ifp; + refcount_init(&mst->refcount, 1); + counter_u64_add(snd_tag_count, 1); +} + +void +m_snd_tag_destroy(struct m_snd_tag *mst) +{ + struct ifnet *ifp; + + ifp = mst->ifp; + ifp->if_snd_tag_free(mst); + if_rele(ifp); + counter_u64_add(snd_tag_count, -1); +} diff --git a/freebsd/sys/kern/kern_mib.c b/freebsd/sys/kern/kern_mib.c index 52aa32fb..cd9e6285 100644 --- a/freebsd/sys/kern/kern_mib.c +++ b/freebsd/sys/kern/kern_mib.c @@ -46,8 +46,10 @@ __FBSDID("$FreeBSD$"); #include <rtems/bsd/local/opt_config.h> #include <sys/param.h> +#include <sys/boot.h> #include <sys/jail.h> #include <sys/kernel.h> +#include <sys/limits.h> #include <sys/lock.h> #include <sys/mutex.h> #include <sys/proc.h> @@ -147,7 +149,7 @@ SYSCTL_INT(_kern, KERN_SAVED_IDS, saved_ids, CTLFLAG_RD|CTLFLAG_CAPRD, SYSCTL_NULL_INT_PTR, 0, "Whether saved set-group/user ID is available"); #endif -char kernelname[MAXPATHLEN] = "/kernel"; /* XXX bloat */ +char kernelname[MAXPATHLEN] = PATH_KERNEL; /* XXX bloat */ SYSCTL_STRING(_kern, KERN_BOOTFILE, bootfile, CTLFLAG_RW | CTLFLAG_MPSAFE, kernelname, sizeof kernelname, "Name of kernel file booted"); @@ -170,15 +172,8 @@ sysctl_kern_arnd(SYSCTL_HANDLER_ARGS) char buf[256]; size_t len; - /*- - * This is one of the very few legitimate uses of read_random(9). - * Use of arc4random(9) is not recommended as that will ignore - * an unsafe (i.e. unseeded) random(4). - * - * If random(4) is not seeded, then this returns 0, so the - * sysctl will return a zero-length buffer. - */ - len = read_random(buf, MIN(req->oldlen, sizeof(buf))); + len = MIN(req->oldlen, sizeof(buf)); + read_random(buf, len); return (SYSCTL_OUT(req, buf, len)); } @@ -189,37 +184,51 @@ SYSCTL_PROC(_kern, KERN_ARND, arandom, static int sysctl_hw_physmem(SYSCTL_HANDLER_ARGS) { - u_long val; + u_long val, p; - val = ctob(physmem); + p = SIZE_T_MAX >> PAGE_SHIFT; + if (physmem < p) + p = physmem; + val = ctob(p); return (sysctl_handle_long(oidp, &val, 0, req)); } - SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_ULONG | CTLFLAG_RD, - 0, 0, sysctl_hw_physmem, "LU", ""); + 0, 0, sysctl_hw_physmem, "LU", + "Amount of physical memory (in bytes)"); static int sysctl_hw_realmem(SYSCTL_HANDLER_ARGS) { - u_long val; - val = ctob(realmem); + u_long val, p; + + p = SIZE_T_MAX >> PAGE_SHIFT; + if (realmem < p) + p = realmem; + val = ctob(p); return (sysctl_handle_long(oidp, &val, 0, req)); } SYSCTL_PROC(_hw, HW_REALMEM, realmem, CTLTYPE_ULONG | CTLFLAG_RD, - 0, 0, sysctl_hw_realmem, "LU", ""); + 0, 0, sysctl_hw_realmem, "LU", + "Amount of memory (in bytes) reported by the firmware"); + static int sysctl_hw_usermem(SYSCTL_HANDLER_ARGS) { - u_long val; + u_long val, p, p1; - val = ctob(physmem - vm_wire_count()); + p1 = physmem - vm_wire_count(); + p = SIZE_T_MAX >> PAGE_SHIFT; + if (p1 < p) + p = p1; + val = ctob(p); return (sysctl_handle_long(oidp, &val, 0, req)); } - SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_ULONG | CTLFLAG_RD, - 0, 0, sysctl_hw_usermem, "LU", ""); + 0, 0, sysctl_hw_usermem, "LU", + "Amount of memory (in bytes) which is not wired"); -SYSCTL_LONG(_hw, OID_AUTO, availpages, CTLFLAG_RD, &physmem, 0, ""); +SYSCTL_LONG(_hw, OID_AUTO, availpages, CTLFLAG_RD, &physmem, 0, + "Amount of physical memory (in pages)"); u_long pagesizes[MAXPAGESIZES] = { PAGE_SIZE }; @@ -501,6 +510,54 @@ sysctl_osreldate(SYSCTL_HANDLER_ARGS) SYSCTL_PROC(_kern, KERN_OSRELDATE, osreldate, CTLTYPE_INT | CTLFLAG_CAPRD | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_osreldate, "I", "Kernel release date"); + +/* + * The build-id is copied from the ELF section .note.gnu.build-id. The linker + * script defines two variables to expose the beginning and end. LLVM + * currently uses a SHA-1 hash, but other formats can be supported by checking + * the length of the section. + */ + +extern char __build_id_start[]; +extern char __build_id_end[]; + +#define BUILD_ID_HEADER_LEN 0x10 +#define BUILD_ID_HASH_MAXLEN 0x14 + +static int +sysctl_build_id(SYSCTL_HANDLER_ARGS) +{ + uintptr_t sectionlen = (uintptr_t)(__build_id_end - __build_id_start); + int hashlen; + char buf[2*BUILD_ID_HASH_MAXLEN+1]; + + /* + * The ELF note section has a four byte length for the vendor name, + * four byte length for the value, and a four byte vendor specific + * type. The name for the build id is "GNU\0". We skip the first 16 + * bytes to read the build hash. We will return the remaining bytes up + * to 20 (SHA-1) hash size. If the hash happens to be a custom number + * of bytes we will pad the value with zeros, as the section should be + * four byte aligned. + */ + if (sectionlen <= BUILD_ID_HEADER_LEN || + sectionlen > (BUILD_ID_HEADER_LEN + BUILD_ID_HASH_MAXLEN)) { + return (ENOENT); + } + + + hashlen = sectionlen - BUILD_ID_HEADER_LEN; + for (int i = 0; i < hashlen; i++) { + uint8_t c = __build_id_start[i+BUILD_ID_HEADER_LEN]; + snprintf(&buf[2*i], 3, "%02x", c); + } + + return (SYSCTL_OUT(req, buf, strlen(buf) + 1)); +} + +SYSCTL_PROC(_kern, OID_AUTO, build_id, + CTLTYPE_STRING | CTLFLAG_CAPRD | CTLFLAG_RD | CTLFLAG_MPSAFE, + NULL, 0, sysctl_build_id, "A", "Operating system build-id"); #endif /* __rtems__ */ SYSCTL_NODE(_kern, OID_AUTO, features, CTLFLAG_RD, 0, "Kernel Features"); diff --git a/freebsd/sys/kern/kern_mtxpool.c b/freebsd/sys/kern/kern_mtxpool.c index 7f6c4dce..bc47d826 100644 --- a/freebsd/sys/kern/kern_mtxpool.c +++ b/freebsd/sys/kern/kern_mtxpool.c @@ -64,14 +64,14 @@ static MALLOC_DEFINE(M_MTXPOOL, "mtx_pool", "mutex pool"); /* Pool sizes must be a power of two */ #ifndef MTX_POOL_SLEEP_SIZE -#define MTX_POOL_SLEEP_SIZE 128 +#define MTX_POOL_SLEEP_SIZE 1024 #endif struct mtxpool_header { int mtxpool_size; int mtxpool_mask; int mtxpool_shift; - int mtxpool_next; + int mtxpool_next __aligned(CACHE_LINE_SIZE); }; struct mtx_pool { diff --git a/freebsd/sys/kern/kern_synch.c b/freebsd/sys/kern/kern_synch.c index 2597f91d..7d24c248 100644 --- a/freebsd/sys/kern/kern_synch.c +++ b/freebsd/sys/kern/kern_synch.c @@ -54,6 +54,7 @@ __FBSDID("$FreeBSD$"); #include <sys/mutex.h> #include <sys/proc.h> #include <sys/resourcevar.h> +#include <sys/refcount.h> #include <sys/sched.h> #include <sys/sdt.h> #include <sys/signalvar.h> @@ -366,6 +367,75 @@ pause_sbt(const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags) } /* + * Potentially release the last reference for refcount. Check for + * unlikely conditions and signal the caller as to whether it was + * the final ref. + */ +bool +refcount_release_last(volatile u_int *count, u_int n, u_int old) +{ + u_int waiter; + + waiter = old & REFCOUNT_WAITER; + old = REFCOUNT_COUNT(old); + if (__predict_false(n > old || REFCOUNT_SATURATED(old))) { + /* + * Avoid multiple destructor invocations if underflow occurred. + * This is not perfect since the memory backing the containing + * object may already have been reallocated. + */ + _refcount_update_saturated(count); + return (false); + } + + /* + * Attempt to atomically clear the waiter bit. Wakeup waiters + * if we are successful. + */ + if (waiter != 0 && atomic_cmpset_int(count, REFCOUNT_WAITER, 0)) + wakeup(__DEVOLATILE(u_int *, count)); + + /* + * Last reference. Signal the user to call the destructor. + * + * Ensure that the destructor sees all updates. The fence_rel + * at the start of refcount_releasen synchronizes with this fence. + */ + atomic_thread_fence_acq(); + return (true); +} + +/* + * Wait for a refcount wakeup. This does not guarantee that the ref is still + * zero on return and may be subject to transient wakeups. Callers wanting + * a precise answer should use refcount_wait(). + */ +void +refcount_sleep(volatile u_int *count, const char *wmesg, int pri) +{ + void *wchan; + u_int old; + + if (REFCOUNT_COUNT(*count) == 0) + return; + wchan = __DEVOLATILE(void *, count); + sleepq_lock(wchan); + old = *count; + for (;;) { + if (REFCOUNT_COUNT(old) == 0) { + sleepq_release(wchan); + return; + } + if (old & REFCOUNT_WAITER) + break; + if (atomic_fcmpset_int(count, &old, old | REFCOUNT_WAITER)) + break; + } + sleepq_add(wchan, NULL, wmesg, 0, 0); + sleepq_wait(wchan, pri); +} + +/* * Make all threads sleeping on the specified identifier runnable. */ void @@ -402,6 +472,19 @@ wakeup_one(void *ident) kick_proc0(); } +void +wakeup_any(void *ident) +{ + int wakeup_swapper; + + sleepq_lock(ident); + wakeup_swapper = sleepq_signal(ident, SLEEPQ_SLEEP | SLEEPQ_UNFAIR, + 0, 0); + sleepq_release(ident); + if (wakeup_swapper) + kick_proc0(); +} + #ifndef __rtems__ static void kdb_switch(void) diff --git a/freebsd/sys/kern/kern_sysctl.c b/freebsd/sys/kern/kern_sysctl.c index dc7c4c72..1135d7f3 100644 --- a/freebsd/sys/kern/kern_sysctl.c +++ b/freebsd/sys/kern/kern_sysctl.c @@ -43,6 +43,7 @@ __FBSDID("$FreeBSD$"); #include <rtems/bsd/local/opt_capsicum.h> +#include <rtems/bsd/local/opt_ddb.h> #include <rtems/bsd/local/opt_ktrace.h> #include <sys/param.h> @@ -50,11 +51,13 @@ __FBSDID("$FreeBSD$"); #include <sys/systm.h> #include <sys/capsicum.h> #include <sys/kernel.h> +#include <sys/limits.h> #include <sys/sysctl.h> #include <sys/malloc.h> #include <sys/priv.h> #include <sys/proc.h> #include <sys/jail.h> +#include <sys/kdb.h> #include <sys/lock.h> #include <sys/mutex.h> #include <sys/rmlock.h> @@ -66,6 +69,11 @@ __FBSDID("$FreeBSD$"); #include <sys/ktrace.h> #endif +#ifdef DDB +#include <ddb/ddb.h> +#include <ddb/db_lex.h> +#endif + #include <net/vnet.h> #include <security/mac/mac_framework.h> @@ -326,13 +334,6 @@ sysctl_load_tunable_by_oid_locked(struct sysctl_oid *oidp) } #endif /* __rtems__ */ -static int -sbuf_printf_drain(void *arg __unused, const char *data, int len) -{ - - return (printf("%.*s", len, data)); -} - /* * Locate the path to a given oid. Returns the length of the resulting path, * or -1 if the oid was not found. nodes must have room for CTL_MAXNAME @@ -940,13 +941,18 @@ SYSINIT(sysctl, SI_SUB_KMEM, SI_ORDER_FIRST, sysctl_register_all, NULL); * (be aware though, that the proper interface isn't as obvious as it * may seem, there are various conflicting requirements. * - * {0,0} printf the entire MIB-tree. - * {0,1,...} return the name of the "..." OID. - * {0,2,...} return the next OID. - * {0,3} return the OID of the name in "new" - * {0,4,...} return the kind & format info for the "..." OID. - * {0,5,...} return the description of the "..." OID. - * {0,6,...} return the aggregation label of the "..." OID. + * {CTL_SYSCTL, CTL_SYSCTL_DEBUG} printf the entire MIB-tree. + * {CTL_SYSCTL, CTL_SYSCTL_NAME, ...} return the name of the "..." + * OID. + * {CTL_SYSCTL, CTL_SYSCTL_NEXT, ...} return the next OID. + * {CTL_SYSCTL, CTL_SYSCTL_NAME2OID} return the OID of the name in + * "new" + * {CTL_SYSCTL, CTL_SYSCTL_OIDFMT, ...} return the kind & format info + * for the "..." OID. + * {CTL_SYSCTL, CTL_SYSCTL_OIDDESCR, ...} return the description of the + * "..." OID. + * {CTL_SYSCTL, CTL_SYSCTL_OIDLABEL, ...} return the aggregation label of + * the "..." OID. */ #ifdef SYSCTL_DEBUG @@ -1014,8 +1020,8 @@ sysctl_sysctl_debug(SYSCTL_HANDLER_ARGS) return (ENOENT); } -SYSCTL_PROC(_sysctl, 0, debug, CTLTYPE_STRING|CTLFLAG_RD|CTLFLAG_MPSAFE, - 0, 0, sysctl_sysctl_debug, "-", ""); +SYSCTL_PROC(_sysctl, CTL_SYSCTL_DEBUG, debug, CTLTYPE_STRING | CTLFLAG_RD | + CTLFLAG_MPSAFE, 0, 0, sysctl_sysctl_debug, "-", ""); #endif static int @@ -1080,8 +1086,8 @@ sysctl_sysctl_name(SYSCTL_HANDLER_ARGS) * XXXRW/JA: Shouldn't return name data for nodes that we don't permit in * capability mode. */ -static SYSCTL_NODE(_sysctl, 1, name, CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_CAPRD, - sysctl_sysctl_name, ""); +static SYSCTL_NODE(_sysctl, CTL_SYSCTL_NAME, name, CTLFLAG_RD | + CTLFLAG_MPSAFE | CTLFLAG_CAPRD, sysctl_sysctl_name, ""); static int sysctl_sysctl_next_ls(struct sysctl_oid_list *lsp, int *name, u_int namelen, @@ -1167,8 +1173,8 @@ sysctl_sysctl_next(SYSCTL_HANDLER_ARGS) * XXXRW/JA: Shouldn't return next data for nodes that we don't permit in * capability mode. */ -static SYSCTL_NODE(_sysctl, 2, next, CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_CAPRD, - sysctl_sysctl_next, ""); +static SYSCTL_NODE(_sysctl, CTL_SYSCTL_NEXT, next, CTLFLAG_RD | + CTLFLAG_MPSAFE | CTLFLAG_CAPRD, sysctl_sysctl_next, ""); static int name2oid(char *name, int *oid, int *len, struct sysctl_oid **oidpp) @@ -1254,9 +1260,9 @@ sysctl_sysctl_name2oid(SYSCTL_HANDLER_ARGS) * XXXRW/JA: Shouldn't return name2oid data for nodes that we don't permit in * capability mode. */ -SYSCTL_PROC(_sysctl, 3, name2oid, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_MPSAFE - | CTLFLAG_CAPRW, 0, 0, sysctl_sysctl_name2oid, "I", ""); +SYSCTL_PROC(_sysctl, CTL_SYSCTL_NAME2OID, name2oid, CTLTYPE_INT | CTLFLAG_RW | + CTLFLAG_ANYBODY | CTLFLAG_MPSAFE | CTLFLAG_CAPRW, 0, 0, + sysctl_sysctl_name2oid, "I", ""); static int sysctl_sysctl_oidfmt(SYSCTL_HANDLER_ARGS) @@ -1284,8 +1290,8 @@ sysctl_sysctl_oidfmt(SYSCTL_HANDLER_ARGS) } -static SYSCTL_NODE(_sysctl, 4, oidfmt, CTLFLAG_RD|CTLFLAG_MPSAFE|CTLFLAG_CAPRD, - sysctl_sysctl_oidfmt, ""); +static SYSCTL_NODE(_sysctl, CTL_SYSCTL_OIDFMT, oidfmt, CTLFLAG_RD | + CTLFLAG_MPSAFE | CTLFLAG_CAPRD, sysctl_sysctl_oidfmt, ""); static int sysctl_sysctl_oiddescr(SYSCTL_HANDLER_ARGS) @@ -1309,8 +1315,8 @@ sysctl_sysctl_oiddescr(SYSCTL_HANDLER_ARGS) return (error); } -static SYSCTL_NODE(_sysctl, 5, oiddescr, CTLFLAG_RD|CTLFLAG_MPSAFE|CTLFLAG_CAPRD, - sysctl_sysctl_oiddescr, ""); +static SYSCTL_NODE(_sysctl, CTL_SYSCTL_OIDDESCR, oiddescr, CTLFLAG_RD | + CTLFLAG_MPSAFE|CTLFLAG_CAPRD, sysctl_sysctl_oiddescr, ""); static int sysctl_sysctl_oidlabel(SYSCTL_HANDLER_ARGS) @@ -1334,8 +1340,8 @@ sysctl_sysctl_oidlabel(SYSCTL_HANDLER_ARGS) return (error); } -static SYSCTL_NODE(_sysctl, 6, oidlabel, - CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_CAPRD, sysctl_sysctl_oidlabel, ""); +static SYSCTL_NODE(_sysctl, CTL_SYSCTL_OIDLABEL, oidlabel, CTLFLAG_RD | + CTLFLAG_MPSAFE | CTLFLAG_CAPRD, sysctl_sysctl_oidlabel, ""); /* * Default "handler" functions. @@ -1622,9 +1628,10 @@ sysctl_handle_string(SYSCTL_HANDLER_ARGS) /* * A zero-length buffer indicates a fixed size read-only - * string: + * string. In ddb, don't worry about trying to make a malloced + * snapshot. */ - if (arg2 == 0) { + if (arg2 == 0 || kdb_active) { arg2 = strlen((char *)arg1) + 1; ro_string = 1; } @@ -1751,6 +1758,29 @@ sysctl_msec_to_sbintime(SYSCTL_HANDLER_ARGS) return (0); } +/* + * Convert seconds to a struct timeval. Intended for use with + * intervals and thus does not permit negative seconds. + */ +int +sysctl_sec_to_timeval(SYSCTL_HANDLER_ARGS) +{ + struct timeval *tv; + int error, secs; + + tv = arg1; + secs = tv->tv_sec; + + error = sysctl_handle_int(oidp, &secs, 0, req); + if (error || req->newptr == NULL) + return (error); + + if (secs < 0) + return (EINVAL); + tv->tv_sec = secs; + + return (0); +} /* * Transfer functions to/from kernel space. @@ -1853,8 +1883,8 @@ kernel_sysctlbyname(struct thread *td, char *name, void *old, size_t *oldlenp, size_t oidlen, plen; int error; - oid[0] = 0; /* sysctl internal magic */ - oid[1] = 3; /* name2oid */ + oid[0] = CTL_SYSCTL; + oid[1] = CTL_SYSCTL_NAME2OID; oidlen = sizeof(oid); error = kernel_sysctl(td, oid, 2, oid, &oidlen, @@ -2149,6 +2179,68 @@ sys___sysctl(struct thread *td, struct sysctl_args *uap) return (error); } +int +kern___sysctlbyname(struct thread *td, const char *oname, size_t namelen, + void *old, size_t *oldlenp, void *new, size_t newlen, size_t *retval, + int flags, bool inkernel) +{ + int oid[CTL_MAXNAME]; + char namebuf[16]; + char *name; + size_t oidlen; + int error; + + if (namelen > MAXPATHLEN || namelen == 0) + return (EINVAL); + name = namebuf; + if (namelen > sizeof(namebuf)) + name = malloc(namelen, M_SYSCTL, M_WAITOK); + error = copyin(oname, name, namelen); + if (error != 0) + goto out; + + oid[0] = CTL_SYSCTL; + oid[1] = CTL_SYSCTL_NAME2OID; + oidlen = sizeof(oid); + error = kernel_sysctl(td, oid, 2, oid, &oidlen, (void *)name, namelen, + retval, flags); + if (error != 0) + goto out; + error = userland_sysctl(td, oid, *retval / sizeof(int), old, oldlenp, + inkernel, new, newlen, retval, flags); + +out: + if (namelen > sizeof(namebuf)) + free(name, M_SYSCTL); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct __sysctlbyname_args { + const char *name; + size_t namelen; + void *old; + size_t *oldlenp; + void *new; + size_t newlen; +}; +#endif +int +sys___sysctlbyname(struct thread *td, struct __sysctlbyname_args *uap) +{ + size_t rv; + int error; + + error = kern___sysctlbyname(td, uap->name, uap->namelen, uap->old, + uap->oldlenp, uap->new, uap->newlen, &rv, 0, 0); + if (error != 0) + return (error); + if (uap->oldlenp != NULL) + error = copyout(&rv, uap->oldlenp, sizeof(rv)); + + return (error); +} + /* * This is used from various compatibility syscalls too. That's why name * must be in kernel space. @@ -2254,3 +2346,528 @@ sbuf_new_for_sysctl(struct sbuf *s, char *buf, int length, sbuf_set_drain(s, sbuf_sysctl_drain, req); return (s); } + +#ifdef DDB + +/* The current OID the debugger is working with */ +static struct sysctl_oid *g_ddb_oid; + +/* The current flags specified by the user */ +static int g_ddb_sysctl_flags; + +/* Check to see if the last sysctl printed */ +static int g_ddb_sysctl_printed; + +static const int ctl_sign[CTLTYPE+1] = { + [CTLTYPE_INT] = 1, + [CTLTYPE_LONG] = 1, + [CTLTYPE_S8] = 1, + [CTLTYPE_S16] = 1, + [CTLTYPE_S32] = 1, + [CTLTYPE_S64] = 1, +}; + +static const int ctl_size[CTLTYPE+1] = { + [CTLTYPE_INT] = sizeof(int), + [CTLTYPE_UINT] = sizeof(u_int), + [CTLTYPE_LONG] = sizeof(long), + [CTLTYPE_ULONG] = sizeof(u_long), + [CTLTYPE_S8] = sizeof(int8_t), + [CTLTYPE_S16] = sizeof(int16_t), + [CTLTYPE_S32] = sizeof(int32_t), + [CTLTYPE_S64] = sizeof(int64_t), + [CTLTYPE_U8] = sizeof(uint8_t), + [CTLTYPE_U16] = sizeof(uint16_t), + [CTLTYPE_U32] = sizeof(uint32_t), + [CTLTYPE_U64] = sizeof(uint64_t), +}; + +#define DB_SYSCTL_NAME_ONLY 0x001 /* Compare with -N */ +#define DB_SYSCTL_VALUE_ONLY 0x002 /* Compare with -n */ +#define DB_SYSCTL_OPAQUE 0x004 /* Compare with -o */ +#define DB_SYSCTL_HEX 0x008 /* Compare with -x */ + +#define DB_SYSCTL_SAFE_ONLY 0x100 /* Only simple types */ + +static const char db_sysctl_modifs[] = { + 'N', 'n', 'o', 'x', +}; + +static const int db_sysctl_modif_values[] = { + DB_SYSCTL_NAME_ONLY, DB_SYSCTL_VALUE_ONLY, + DB_SYSCTL_OPAQUE, DB_SYSCTL_HEX, +}; + +/* Handlers considered safe to print while recursing */ +static int (* const db_safe_handlers[])(SYSCTL_HANDLER_ARGS) = { + sysctl_handle_bool, + sysctl_handle_8, + sysctl_handle_16, + sysctl_handle_32, + sysctl_handle_64, + sysctl_handle_int, + sysctl_handle_long, + sysctl_handle_string, + sysctl_handle_opaque, +}; + +/* + * Use in place of sysctl_old_kernel to print sysctl values. + * + * Compare to the output handling in show_var from sbin/sysctl/sysctl.c + */ +static int +sysctl_old_ddb(struct sysctl_req *req, const void *ptr, size_t len) +{ + const u_char *val, *p; + const char *sep1; + size_t intlen, slen; + uintmax_t umv; + intmax_t mv; + int sign, ctltype, hexlen, xflag, error; + + /* Suppress false-positive GCC uninitialized variable warnings */ + mv = 0; + umv = 0; + + slen = len; + val = p = ptr; + + if (ptr == NULL) { + error = 0; + goto out; + } + + /* We are going to print */ + g_ddb_sysctl_printed = 1; + + xflag = g_ddb_sysctl_flags & DB_SYSCTL_HEX; + + ctltype = (g_ddb_oid->oid_kind & CTLTYPE); + sign = ctl_sign[ctltype]; + intlen = ctl_size[ctltype]; + + switch (ctltype) { + case CTLTYPE_NODE: + case CTLTYPE_STRING: + db_printf("%.*s", (int) len, (const char *) p); + error = 0; + goto out; + + case CTLTYPE_INT: + case CTLTYPE_UINT: + case CTLTYPE_LONG: + case CTLTYPE_ULONG: + case CTLTYPE_S8: + case CTLTYPE_S16: + case CTLTYPE_S32: + case CTLTYPE_S64: + case CTLTYPE_U8: + case CTLTYPE_U16: + case CTLTYPE_U32: + case CTLTYPE_U64: + hexlen = 2 + (intlen * CHAR_BIT + 3) / 4; + sep1 = ""; + while (len >= intlen) { + switch (ctltype) { + case CTLTYPE_INT: + case CTLTYPE_UINT: + umv = *(const u_int *)p; + mv = *(const int *)p; + break; + case CTLTYPE_LONG: + case CTLTYPE_ULONG: + umv = *(const u_long *)p; + mv = *(const long *)p; + break; + case CTLTYPE_S8: + case CTLTYPE_U8: + umv = *(const uint8_t *)p; + mv = *(const int8_t *)p; + break; + case CTLTYPE_S16: + case CTLTYPE_U16: + umv = *(const uint16_t *)p; + mv = *(const int16_t *)p; + break; + case CTLTYPE_S32: + case CTLTYPE_U32: + umv = *(const uint32_t *)p; + mv = *(const int32_t *)p; + break; + case CTLTYPE_S64: + case CTLTYPE_U64: + umv = *(const uint64_t *)p; + mv = *(const int64_t *)p; + break; + } + + db_printf("%s", sep1); + if (xflag) + db_printf("%#0*jx", hexlen, umv); + else if (!sign) + db_printf("%ju", umv); + else if (g_ddb_oid->oid_fmt[1] == 'K') { + /* Kelvins are currently unsupported. */ + error = EOPNOTSUPP; + goto out; + } else + db_printf("%jd", mv); + + sep1 = " "; + len -= intlen; + p += intlen; + } + error = 0; + goto out; + + case CTLTYPE_OPAQUE: + /* TODO: Support struct functions. */ + + /* FALLTHROUGH */ + default: + db_printf("Format:%s Length:%zu Dump:0x", + g_ddb_oid->oid_fmt, len); + while (len-- && (xflag || p < val + 16)) + db_printf("%02x", *p++); + if (!xflag && len > 16) + db_printf("..."); + error = 0; + goto out; + } + +out: + req->oldidx += slen; + return (error); +} + +/* + * Avoid setting new sysctl values from the debugger + */ +static int +sysctl_new_ddb(struct sysctl_req *req, void *p, size_t l) +{ + + if (!req->newptr) + return (0); + + /* Changing sysctls from the debugger is currently unsupported */ + return (EPERM); +} + +/* + * Run a sysctl handler with the DDB oldfunc and newfunc attached. + * Instead of copying any output to a buffer we'll dump it right to + * the console. + */ +static int +db_sysctl(struct sysctl_oid *oidp, int *name, u_int namelen, + void *old, size_t *oldlenp, size_t *retval, int flags) +{ + struct sysctl_req req; + int error; + + /* Setup the request */ + bzero(&req, sizeof req); + req.td = kdb_thread; + req.oldfunc = sysctl_old_ddb; + req.newfunc = sysctl_new_ddb; + req.lock = REQ_UNWIRED; + if (oldlenp) { + req.oldlen = *oldlenp; + } + req.validlen = req.oldlen; + if (old) { + req.oldptr = old; + } + + /* Setup our globals for sysctl_old_ddb */ + g_ddb_oid = oidp; + g_ddb_sysctl_flags = flags; + g_ddb_sysctl_printed = 0; + + error = sysctl_root(0, name, namelen, &req); + + /* Reset globals */ + g_ddb_oid = NULL; + g_ddb_sysctl_flags = 0; + + if (retval) { + if (req.oldptr && req.oldidx > req.validlen) + *retval = req.validlen; + else + *retval = req.oldidx; + } + return (error); +} + +/* + * Show a sysctl's name + */ +static void +db_show_oid_name(int *oid, size_t nlen) +{ + struct sysctl_oid *oidp; + int qoid[CTL_MAXNAME+2]; + int error; + + qoid[0] = 0; + memcpy(qoid + 2, oid, nlen * sizeof(int)); + qoid[1] = 1; + + error = sysctl_find_oid(qoid, nlen + 2, &oidp, NULL, NULL); + if (error) + db_error("sysctl name oid"); + + error = db_sysctl(oidp, qoid, nlen + 2, NULL, NULL, NULL, 0); + if (error) + db_error("sysctl name"); +} + +/* + * Check to see if an OID is safe to print from ddb. + */ +static bool +db_oid_safe(const struct sysctl_oid *oidp) +{ + for (unsigned int i = 0; i < nitems(db_safe_handlers); ++i) { + if (oidp->oid_handler == db_safe_handlers[i]) + return (true); + } + + return (false); +} + +/* + * Show a sysctl at a specific OID + * Compare to the input handling in show_var from sbin/sysctl/sysctl.c + */ +static int +db_show_oid(struct sysctl_oid *oidp, int *oid, size_t nlen, int flags) +{ + int error, xflag, oflag, Nflag, nflag; + size_t len; + + xflag = flags & DB_SYSCTL_HEX; + oflag = flags & DB_SYSCTL_OPAQUE; + nflag = flags & DB_SYSCTL_VALUE_ONLY; + Nflag = flags & DB_SYSCTL_NAME_ONLY; + + if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_OPAQUE && + (!xflag && !oflag)) + return (0); + + if (Nflag) { + db_show_oid_name(oid, nlen); + error = 0; + goto out; + } + + if (!nflag) { + db_show_oid_name(oid, nlen); + db_printf(": "); + } + + if ((flags & DB_SYSCTL_SAFE_ONLY) && !db_oid_safe(oidp)) { + db_printf("Skipping, unsafe to print while recursing."); + error = 0; + goto out; + } + + /* Try once, and ask about the size */ + len = 0; + error = db_sysctl(oidp, oid, nlen, + NULL, NULL, &len, flags); + if (error) + goto out; + + if (!g_ddb_sysctl_printed) + /* Lie about the size */ + error = db_sysctl(oidp, oid, nlen, + (void *) 1, &len, NULL, flags); + +out: + db_printf("\n"); + return (error); +} + +/* + * Show all sysctls under a specific OID + * Compare to sysctl_all from sbin/sysctl/sysctl.c + */ +static int +db_show_sysctl_all(int *oid, size_t len, int flags) +{ + struct sysctl_oid *oidp; + int name1[CTL_MAXNAME + 2], name2[CTL_MAXNAME + 2]; + size_t l1, l2; + + name1[0] = CTL_SYSCTL; + name1[1] = CTL_SYSCTL_NEXT; + l1 = 2; + if (len) { + memcpy(name1+2, oid, len * sizeof(int)); + l1 +=len; + } else { + name1[2] = 1; + l1++; + } + for (;;) { + int i, error; + + l2 = sizeof(name2); + error = kernel_sysctl(kdb_thread, name1, l1, + name2, &l2, NULL, 0, &l2, 0); + if (error != 0) { + if (error == ENOENT) + return (0); + else + db_error("sysctl(getnext)"); + } + + l2 /= sizeof(int); + + if (l2 < (unsigned int)len) + return (0); + + for (i = 0; i < len; i++) + if (name2[i] != oid[i]) + return (0); + + /* Find the OID in question */ + error = sysctl_find_oid(name2, l2, &oidp, NULL, NULL); + if (error) + return (error); + + i = db_show_oid(oidp, name2, l2, flags | DB_SYSCTL_SAFE_ONLY); + + if (db_pager_quit) + return (0); + + memcpy(name1+2, name2, l2 * sizeof(int)); + l1 = 2 + l2; + } +} + +/* + * Show a sysctl by its user facing string + */ +static int +db_sysctlbyname(char *name, int flags) +{ + struct sysctl_oid *oidp; + int oid[CTL_MAXNAME]; + int error, nlen; + + error = name2oid(name, oid, &nlen, &oidp); + if (error) { + return (error); + } + + if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) { + db_show_sysctl_all(oid, nlen, flags); + } else { + error = db_show_oid(oidp, oid, nlen, flags); + } + + return (error); +} + +static void +db_sysctl_cmd_usage(void) +{ + db_printf( + " sysctl [/Nnox] <sysctl> \n" + " \n" + " <sysctl> The name of the sysctl to show. \n" + " \n" + " Show a sysctl by hooking into SYSCTL_IN and SYSCTL_OUT. \n" + " This will work for most sysctls, but should not be used \n" + " with sysctls that are known to malloc. \n" + " \n" + " While recursing any \"unsafe\" sysctls will be skipped. \n" + " Call sysctl directly on the sysctl to try printing the \n" + " skipped sysctl. This is unsafe and may make the ddb \n" + " session unusable. \n" + " \n" + " Arguments: \n" + " /N Display only the name of the sysctl. \n" + " /n Display only the value of the sysctl. \n" + " /o Display opaque values. \n" + " /x Display the sysctl in hex. \n" + " \n" + "For example: \n" + "sysctl vm.v_free_min \n" + "vn.v_free_min: 12669 \n" + ); +} + +/* + * Show a specific sysctl similar to sysctl (8). + */ +DB_FUNC(sysctl, db_sysctl_cmd, db_cmd_table, CS_OWN, NULL) +{ + char name[TOK_STRING_SIZE]; + int error, i, t, flags; + + /* Parse the modifiers */ + t = db_read_token(); + if (t == tSLASH || t == tMINUS) { + t = db_read_token(); + if (t != tIDENT) { + db_printf("Bad modifier\n"); + error = EINVAL; + goto out; + } + db_strcpy(modif, db_tok_string); + } + else { + db_unread_token(t); + modif[0] = '\0'; + } + + flags = 0; + for (i = 0; i < nitems(db_sysctl_modifs); i++) { + if (strchr(modif, db_sysctl_modifs[i])) { + flags |= db_sysctl_modif_values[i]; + } + } + + /* Parse the sysctl names */ + t = db_read_token(); + if (t != tIDENT) { + db_printf("Need sysctl name\n"); + error = EINVAL; + goto out; + } + + /* Copy the name into a temporary buffer */ + db_strcpy(name, db_tok_string); + + /* Ensure there is no trailing cruft */ + t = db_read_token(); + if (t != tEOL) { + db_printf("Unexpected sysctl argument\n"); + error = EINVAL; + goto out; + } + + error = db_sysctlbyname(name, flags); + if (error == ENOENT) { + db_printf("unknown oid: '%s'\n", db_tok_string); + goto out; + } else if (error) { + db_printf("%s: error: %d\n", db_tok_string, error); + goto out; + } + +out: + /* Ensure we eat all of our text */ + db_flush_lex(); + + if (error == EINVAL) { + db_sysctl_cmd_usage(); + } +} + +#endif /* DDB */ diff --git a/freebsd/sys/kern/kern_time.c b/freebsd/sys/kern/kern_time.c index 74b144cb..47eb9032 100644 --- a/freebsd/sys/kern/kern_time.c +++ b/freebsd/sys/kern/kern_time.c @@ -422,7 +422,9 @@ kern_clock_settime(struct thread *td, clockid_t clock_id, struct timespec *ats) if (ats->tv_nsec < 0 || ats->tv_nsec >= 1000000000 || ats->tv_sec < 0) return (EINVAL); - if (!allow_insane_settime && ats->tv_sec > 8000ULL * 365 * 24 * 60 * 60) + if (!allow_insane_settime && + (ats->tv_sec > 8000ULL * 365 * 24 * 60 * 60 || + ats->tv_sec < utc_offset())) return (EINVAL); /* XXX Don't convert nsec->usec and back */ TIMESPEC_TO_TIMEVAL(&atv, ats); @@ -673,8 +675,8 @@ sys_gettimeofday(struct thread *td, struct gettimeofday_args *uap) error = copyout(&atv, uap->tp, sizeof (atv)); } if (error == 0 && uap->tzp != NULL) { - rtz.tz_minuteswest = tz_minuteswest; - rtz.tz_dsttime = tz_dsttime; + rtz.tz_minuteswest = 0; + rtz.tz_dsttime = 0; error = copyout(&rtz, uap->tzp, sizeof (rtz)); } return (error); @@ -726,10 +728,6 @@ kern_settimeofday(struct thread *td, struct timeval *tv, struct timezone *tzp) return (EINVAL); error = settime(td, tv); } - if (tzp && error == 0) { - tz_minuteswest = tzp->tz_minuteswest; - tz_dsttime = tzp->tz_dsttime; - } return (error); } diff --git a/freebsd/sys/kern/kern_timeout.c b/freebsd/sys/kern/kern_timeout.c index 2f478afc..983abba2 100644 --- a/freebsd/sys/kern/kern_timeout.c +++ b/freebsd/sys/kern/kern_timeout.c @@ -52,6 +52,7 @@ __FBSDID("$FreeBSD$"); #include <sys/systm.h> #include <sys/bus.h> #include <sys/callout.h> +#include <sys/domainset.h> #include <sys/file.h> #include <sys/interrupt.h> #include <sys/kernel.h> @@ -67,6 +68,7 @@ __FBSDID("$FreeBSD$"); #ifdef DDB #include <ddb/ddb.h> +#include <ddb/db_sym.h> #include <machine/_inttypes.h> #endif @@ -154,12 +156,16 @@ u_int callwheelsize, callwheelmask; struct cc_exec { struct callout *cc_curr; void (*cc_drain)(void *); +#ifndef __rtems__ + void *cc_last_func; + void *cc_last_arg; +#endif /* __rtems__ */ #ifdef SMP void (*ce_migration_func)(void *); void *ce_migration_arg; - int ce_migration_cpu; sbintime_t ce_migration_time; sbintime_t ce_migration_prec; + int ce_migration_cpu; #endif bool cc_cancel; bool cc_waiting; @@ -200,6 +206,8 @@ struct callout_cpu { #ifndef __rtems__ #define cc_exec_curr(cc, dir) cc->cc_exec_entity[dir].cc_curr +#define cc_exec_last_func(cc, dir) cc->cc_exec_entity[dir].cc_last_func +#define cc_exec_last_arg(cc, dir) cc->cc_exec_entity[dir].cc_last_arg #define cc_exec_drain(cc, dir) cc->cc_exec_entity[dir].cc_drain #else /* __rtems__ */ #define cc_exec_curr(cc, dir) cc->cc_exec_entity.cc_curr @@ -426,8 +434,9 @@ callout_cpu_init(struct callout_cpu *cc, int cpu) SLIST_INIT(&cc->cc_callfree); cc->cc_inited = 1; #ifndef __rtems__ - cc->cc_callwheel = malloc(sizeof(struct callout_list) * callwheelsize, - M_CALLOUT, M_WAITOK); + cc->cc_callwheel = malloc_domainset(sizeof(struct callout_list) * + callwheelsize, M_CALLOUT, + DOMAINSET_PREF(pcpu_find(cpu)->pc_domain), M_WAITOK); #endif /* __rtems__ */ for (i = 0; i < callwheelsize; i++) LIST_INIT(&cc->cc_callwheel[i]); @@ -821,6 +830,10 @@ softclock_call_cc(struct callout *c, struct callout_cpu *cc, c->c_iflags &= ~CALLOUT_PENDING; cc_exec_curr(cc, direct) = c; +#ifndef __rtems__ + cc_exec_last_func(cc, direct) = c_func; + cc_exec_last_arg(cc, direct) = c_arg; +#endif /* __rtems__ */ cc_exec_cancel(cc, direct) = false; cc_exec_drain(cc, direct) = NULL; CC_UNLOCK(cc); @@ -1876,4 +1889,42 @@ DB_SHOW_COMMAND(callout, db_show_callout) _show_callout((struct callout *)addr); } + +static void +_show_last_callout(int cpu, int direct, const char *dirstr) +{ + struct callout_cpu *cc; + void *func, *arg; + + cc = CC_CPU(cpu); + func = cc_exec_last_func(cc, direct); + arg = cc_exec_last_arg(cc, direct); + db_printf("cpu %d last%s callout function: %p ", cpu, dirstr, func); + db_printsym((db_expr_t)func, DB_STGY_ANY); + db_printf("\ncpu %d last%s callout argument: %p\n", cpu, dirstr, arg); +} + +DB_SHOW_COMMAND(callout_last, db_show_callout_last) +{ + int cpu, last; + + if (have_addr) { + if (addr < 0 || addr > mp_maxid || CPU_ABSENT(addr)) { + db_printf("no such cpu: %d\n", (int)addr); + return; + } + cpu = last = addr; + } else { + cpu = 0; + last = mp_maxid; + } + + while (cpu <= last) { + if (!CPU_ABSENT(cpu)) { + _show_last_callout(cpu, 0, ""); + _show_last_callout(cpu, 1, " direct"); + } + cpu++; + } +} #endif /* DDB */ diff --git a/freebsd/sys/kern/kern_uuid.c b/freebsd/sys/kern/kern_uuid.c index a2316b16..c2a5986a 100644 --- a/freebsd/sys/kern/kern_uuid.c +++ b/freebsd/sys/kern/kern_uuid.c @@ -301,7 +301,7 @@ sbuf_printf_uuid(struct sbuf *sb, struct uuid *uuid) char buf[38]; snprintf_uuid(buf, sizeof(buf), uuid); - return (sbuf_printf(sb, "%s", buf)); + return (sbuf_cat(sb, buf)); } /* diff --git a/freebsd/sys/kern/subr_blist.c b/freebsd/sys/kern/subr_blist.c index 807a7f3c..8b073bf8 100644 --- a/freebsd/sys/kern/subr_blist.c +++ b/freebsd/sys/kern/subr_blist.c @@ -111,6 +111,7 @@ __FBSDID("$FreeBSD$"); #include <sys/types.h> #include <sys/malloc.h> #include <sys/sbuf.h> +#include <assert.h> #include <stdio.h> #include <string.h> #include <stddef.h> @@ -122,19 +123,20 @@ __FBSDID("$FreeBSD$"); #define malloc(a,b,c) calloc(a, 1) #define free(a,b) free(a) #define ummin(a,b) ((a) < (b) ? (a) : (b)) +#define imin(a,b) ((a) < (b) ? (a) : (b)) +#define KASSERT(a,b) assert(a) #include <sys/blist.h> -void panic(const char *ctl, ...); - #endif /* * static support functions */ -static daddr_t blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count); -static daddr_t blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count, - u_daddr_t radix); +static daddr_t blst_leaf_alloc(blmeta_t *scan, daddr_t blk, + int *count, int maxcount); +static daddr_t blst_meta_alloc(blmeta_t *scan, daddr_t cursor, int *count, + int maxcount, u_daddr_t radix); static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count); static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, u_daddr_t radix); @@ -194,30 +196,40 @@ bitrange(int n, int count) /* - * Use binary search, or a faster method, to find the 1 bit in a u_daddr_t. - * Assumes that the argument has only one bit set. + * Find the first bit set in a u_daddr_t. */ static inline int -bitpos(u_daddr_t mask) +generic_bitpos(u_daddr_t mask) { int hi, lo, mid; + lo = 0; + hi = BLIST_BMAP_RADIX; + while (lo + 1 < hi) { + mid = (lo + hi) >> 1; + if (mask & bitrange(0, mid)) + hi = mid; + else + lo = mid; + } + return (lo); +} + +static inline int +bitpos(u_daddr_t mask) +{ + switch (sizeof(mask)) { #ifdef HAVE_INLINE_FFSLL case sizeof(long long): return (ffsll(mask) - 1); #endif +#ifdef HAVE_INLINE_FFS + case sizeof(int): + return (ffs(mask) - 1); +#endif default: - lo = 0; - hi = BLIST_BMAP_RADIX; - while (lo + 1 < hi) { - mid = (lo + hi) >> 1; - if ((mask >> mid) != 0) - lo = mid; - else - hi = mid; - } - return (lo); + return (generic_bitpos(mask)); } } @@ -237,8 +249,7 @@ blist_create(daddr_t blocks, int flags) blist_t bl; u_daddr_t nodes, radix; - if (blocks == 0) - panic("invalid block count"); + KASSERT(blocks > 0, ("invalid block count")); /* * Calculate the radix and node count used for scanning. @@ -286,12 +297,14 @@ blist_destroy(blist_t bl) * not be allocated. */ daddr_t -blist_alloc(blist_t bl, daddr_t count) +blist_alloc(blist_t bl, int *count, int maxcount) { - daddr_t blk; + daddr_t blk, cursor; - if (count > BLIST_MAX_ALLOC) - panic("allocation too large"); + KASSERT(*count <= maxcount, + ("invalid parameters %d > %d", *count, maxcount)); + KASSERT(*count <= BLIST_MAX_ALLOC, + ("minimum allocation too large: %d", *count)); /* * This loop iterates at most twice. An allocation failure in the @@ -299,18 +312,18 @@ blist_alloc(blist_t bl, daddr_t count) * non-zero. When the cursor is zero, an allocation failure will * stop further iterations. */ - for (;;) { - blk = blst_meta_alloc(bl->bl_root, bl->bl_cursor, count, + for (cursor = bl->bl_cursor;; cursor = 0) { + blk = blst_meta_alloc(bl->bl_root, cursor, count, maxcount, bl->bl_radix); if (blk != SWAPBLK_NONE) { - bl->bl_avail -= count; - bl->bl_cursor = blk + count; + bl->bl_avail -= *count; + bl->bl_cursor = blk + *count; if (bl->bl_cursor == bl->bl_blocks) bl->bl_cursor = 0; return (blk); - } else if (bl->bl_cursor == 0) + } + if (cursor == 0) return (SWAPBLK_NONE); - bl->bl_cursor = 0; } } @@ -326,15 +339,15 @@ blist_avail(blist_t bl) /* * blist_free() - free up space in the block bitmap. Return the base - * of a contiguous region. Panic if an inconsistancy is - * found. + * of a contiguous region. */ void blist_free(blist_t bl, daddr_t blkno, daddr_t count) { - if (blkno < 0 || blkno + count > bl->bl_blocks) - panic("freeing invalid range"); + KASSERT(blkno >= 0 && blkno + count <= bl->bl_blocks, + ("freeing invalid range: blkno %jx, count %d, blocks %jd", + (uintmax_t)blkno, (int)count, (uintmax_t)bl->bl_blocks)); blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix); bl->bl_avail += count; } @@ -350,8 +363,9 @@ blist_fill(blist_t bl, daddr_t blkno, daddr_t count) { daddr_t filled; - if (blkno < 0 || blkno + count > bl->bl_blocks) - panic("filling invalid range"); + KASSERT(blkno >= 0 && blkno + count <= bl->bl_blocks, + ("filling invalid range: blkno %jx, count %d, blocks %jd", + (uintmax_t)blkno, (int)count, (uintmax_t)bl->bl_blocks)); filled = blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix); bl->bl_avail -= filled; return (filled); @@ -533,7 +547,8 @@ blist_stats(blist_t bl, struct sbuf *s) struct gap_stats gstats; struct gap_stats *stats = &gstats; daddr_t i, nodes, radix; - u_daddr_t bit, diff, mask; + u_daddr_t diff, mask; + int digit; init_gap_stats(stats); nodes = 0; @@ -571,9 +586,9 @@ blist_stats(blist_t bl, struct sbuf *s) if (gap_stats_counting(stats)) diff ^= 1; while (diff != 0) { - bit = diff & -diff; - update_gap_stats(stats, i + bitpos(bit)); - diff ^= bit; + digit = bitpos(diff); + update_gap_stats(stats, i + digit); + diff ^= bitrange(digit, 1); } } nodes += radix_to_skip(radix); @@ -594,53 +609,104 @@ blist_stats(blist_t bl, struct sbuf *s) */ /* - * BLST_NEXT_LEAF_ALLOC() - allocate the first few blocks in the next leaf. + * BLST_NEXT_LEAF_ALLOC() - allocate the blocks starting with the next leaf. * - * 'scan' is a leaf node, associated with a block containing 'blk'. - * The next leaf node could be adjacent, or several nodes away if the - * least common ancestor of 'scan' and its neighbor is several levels - * up. Use 'blk' to determine how many meta-nodes lie between the - * leaves. If the next leaf has enough initial bits set, clear them - * and clear the bits in the meta nodes on the path up to the least - * common ancestor to mark any subtrees made completely empty. + * 'scan' is a leaf node, and its first block is at address 'start'. The + * next leaf node could be adjacent, or several nodes away if the least + * common ancestor of 'scan' and its neighbor is several levels up. Use + * addresses to determine how many meta-nodes lie between the leaves. If + * sequence of leaves starting with the next one has enough initial bits + * set, clear them and clear the bits in the meta nodes on the path up to + * the least common ancestor to mark any subtrees made completely empty. */ static int -blst_next_leaf_alloc(blmeta_t *scan, daddr_t blk, int count) +blst_next_leaf_alloc(blmeta_t *scan, daddr_t start, int count, int maxcount) { - blmeta_t *next; - daddr_t skip; u_daddr_t radix; - int digit; + daddr_t blk; + int avail, digit; - next = scan + 1; - blk += BLIST_BMAP_RADIX; - radix = BLIST_BMAP_RADIX; - while ((digit = ((blk / radix) & BLIST_META_MASK)) == 0 && - (next->bm_bitmap & 1) == 1) { - next++; - radix *= BLIST_META_RADIX; - } - if (((next->bm_bitmap + 1) & ~((u_daddr_t)-1 << count)) != 0) { - /* - * The next leaf doesn't have enough free blocks at the - * beginning to complete the spanning allocation. - */ - return (ENOMEM); + start += BLIST_BMAP_RADIX; + for (blk = start; blk - start < maxcount; blk += BLIST_BMAP_RADIX) { + /* Skip meta-nodes, as long as they promise more free blocks. */ + radix = BLIST_BMAP_RADIX; + while (((++scan)->bm_bitmap & 1) == 1 && + ((blk / radix) & BLIST_META_MASK) == 0) + radix *= BLIST_META_RADIX; + if (~scan->bm_bitmap != 0) { + /* + * Either there is no next leaf with any free blocks, + * or we've reached the next leaf and found that some + * of its blocks are not free. In the first case, + * bitpos() returns zero here. + */ + avail = blk - start + bitpos(~scan->bm_bitmap); + if (avail < count || avail == 0) { + /* + * There isn't a next leaf with enough free + * blocks at its beginning to bother + * allocating. + */ + return (avail); + } + maxcount = imin(avail, maxcount); + if (maxcount % BLIST_BMAP_RADIX == 0) { + /* + * There was no next leaf. Back scan up to + * last leaf. + */ + --scan; + while (radix != BLIST_BMAP_RADIX) { + radix /= BLIST_META_RADIX; + --scan; + } + blk -= BLIST_BMAP_RADIX; + } + } } - /* Clear the first 'count' bits in the next leaf to allocate. */ - next->bm_bitmap &= (u_daddr_t)-1 << count; - + /* - * Update bitmaps of next-ancestors, up to least common ancestor. + * 'scan' is the last leaf that provides blocks. Clear from 1 to + * BLIST_BMAP_RADIX bits to represent the allocation of those last + * blocks. */ - skip = radix_to_skip(radix); - while (radix != BLIST_BMAP_RADIX && next->bm_bitmap == 0) { - (--next)->bm_bitmap ^= 1; - radix /= BLIST_META_RADIX; + if (maxcount % BLIST_BMAP_RADIX != 0) + scan->bm_bitmap &= ~bitrange(0, maxcount % BLIST_BMAP_RADIX); + else + scan->bm_bitmap = 0; + + for (;;) { + /* Back up over meta-nodes, clearing bits if necessary. */ + blk -= BLIST_BMAP_RADIX; + radix = BLIST_BMAP_RADIX; + while ((digit = ((blk / radix) & BLIST_META_MASK)) == 0) { + if ((scan--)->bm_bitmap == 0) + scan->bm_bitmap ^= 1; + radix *= BLIST_META_RADIX; + } + if ((scan--)->bm_bitmap == 0) + scan[-digit * radix_to_skip(radix)].bm_bitmap ^= + (u_daddr_t)1 << digit; + + if (blk == start) + break; + /* Clear all the bits of this leaf. */ + scan->bm_bitmap = 0; } - if (next->bm_bitmap == 0) - scan[-digit * skip].bm_bitmap ^= (u_daddr_t)1 << digit; - return (0); + return (maxcount); +} + +/* + * Given a bitmask, flip all the bits from the least-significant 1-bit to the + * most significant bit. If the result is non-zero, then the least-significant + * 1-bit of the result is in the same position as the least-signification 0-bit + * in mask that is followed by a 1-bit. + */ +static inline u_daddr_t +flip_hibits(u_daddr_t mask) +{ + + return (-mask & ~mask); } /* @@ -651,16 +717,16 @@ blst_next_leaf_alloc(blmeta_t *scan, daddr_t blk, int count) * crosses a leaf boundary. */ static daddr_t -blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count) +blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int *count, int maxcount) { u_daddr_t cursor_mask, mask; int count1, hi, lo, num_shifts, range1, range_ext; range1 = 0; - count1 = count - 1; + count1 = *count - 1; num_shifts = fls(count1); mask = scan->bm_bitmap; - while ((-mask & ~mask) != 0 && num_shifts > 0) { + while (flip_hibits(mask) != 0 && num_shifts > 0) { /* * If bit i is set in mask, then bits in [i, i+range1] are set * in scan->bm_bitmap. The value of range1 is equal to count1 @@ -712,40 +778,50 @@ blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count) /* * The least significant set bit in mask marks the start of the first - * available range of sufficient size. Clear all the bits but that one, - * and then find its position. + * available range of sufficient size. Find its position. */ - mask &= -mask; lo = bitpos(mask); - hi = lo + count; - if (hi > BLIST_BMAP_RADIX) { - /* - * An allocation within this leaf is impossible, so a successful - * allocation depends on the next leaf providing some of the blocks. - */ - if (blst_next_leaf_alloc(scan, blk, hi - BLIST_BMAP_RADIX) != 0) + /* + * Find how much space is available starting at that position. + */ + if (flip_hibits(mask) != 0) { + /* Count the 1 bits starting at position lo. */ + hi = bitpos(flip_hibits(mask)) + count1; + if (maxcount < hi - lo) + hi = lo + maxcount; + *count = hi - lo; + mask = bitrange(lo, *count); + } else if (maxcount <= BLIST_BMAP_RADIX - lo) { + /* All the blocks we can use are available here. */ + hi = lo + maxcount; + *count = maxcount; + mask = bitrange(lo, *count); + } else { + /* Check next leaf for some of the blocks we want or need. */ + count1 = *count - (BLIST_BMAP_RADIX - lo); + maxcount -= BLIST_BMAP_RADIX - lo; + hi = blst_next_leaf_alloc(scan, blk, count1, maxcount); + if (hi < count1) /* - * The hint cannot be updated, because the same - * allocation request could be satisfied later, by this - * leaf, if the state of the next leaf changes, and - * without any changes to this leaf. + * The next leaf cannot supply enough blocks to reach + * the minimum required allocation. The hint cannot be + * updated, because the same allocation request could + * be satisfied later, by this leaf, if the state of + * the next leaf changes, and without any changes to + * this leaf. */ return (SWAPBLK_NONE); + *count = BLIST_BMAP_RADIX - lo + hi; hi = BLIST_BMAP_RADIX; } - /* Set the bits of mask at position 'lo' and higher. */ - mask = -mask; if (hi == BLIST_BMAP_RADIX) { /* * Update bighint. There is no allocation bigger than range1 * available in this leaf after this allocation completes. */ scan->bm_bighint = range1; - } else { - /* Clear the bits of mask at position 'hi' and higher. */ - mask &= (u_daddr_t)-1 >> (BLIST_BMAP_RADIX - hi); } /* Clear the allocated bits from this leaf. */ scan->bm_bitmap &= ~mask; @@ -761,15 +837,16 @@ blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count) * and we have a few optimizations strewn in as well. */ static daddr_t -blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count, u_daddr_t radix) +blst_meta_alloc(blmeta_t *scan, daddr_t cursor, int *count, + int maxcount, u_daddr_t radix) { daddr_t blk, i, r, skip; - u_daddr_t bit, mask; + u_daddr_t mask; bool scan_from_start; int digit; if (radix == BLIST_BMAP_RADIX) - return (blst_leaf_alloc(scan, cursor, count)); + return (blst_leaf_alloc(scan, cursor, count, maxcount)); blk = cursor & -radix; scan_from_start = (cursor == blk); radix /= BLIST_META_RADIX; @@ -796,23 +873,22 @@ blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count, u_daddr_t radix) * Examine the nonempty subtree associated with each bit set in mask. */ do { - bit = mask & -mask; - digit = bitpos(bit); + digit = bitpos(mask); i = 1 + digit * skip; - if (count <= scan[i].bm_bighint) { + if (*count <= scan[i].bm_bighint) { /* * The allocation might fit beginning in the i'th subtree. */ r = blst_meta_alloc(&scan[i], cursor + digit * radix, - count, radix); + count, maxcount, radix); if (r != SWAPBLK_NONE) { if (scan[i].bm_bitmap == 0) - scan->bm_bitmap ^= bit; + scan->bm_bitmap ^= bitrange(digit, 1); return (r); } } cursor = blk; - } while ((mask ^= bit) != 0); + } while ((mask ^= bitrange(digit, 1)) != 0); /* * We couldn't allocate count in this subtree. If the whole tree was @@ -820,7 +896,7 @@ blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count, u_daddr_t radix) */ if (scan_from_start && !(digit == BLIST_META_RADIX - 1 && scan[i].bm_bighint == BLIST_MAX_ALLOC)) - scan->bm_bighint = count - 1; + scan->bm_bighint = *count - 1; return (SWAPBLK_NONE); } @@ -841,8 +917,9 @@ blst_leaf_free(blmeta_t *scan, daddr_t blk, int count) * count n */ mask = bitrange(blk & BLIST_BMAP_MASK, count); - if (scan->bm_bitmap & mask) - panic("freeing free block"); + KASSERT((scan->bm_bitmap & mask) == 0, + ("freeing free block: %jx, size %d, mask %jx", + (uintmax_t)blk, count, (uintmax_t)scan->bm_bitmap & mask)); scan->bm_bitmap |= mask; } @@ -1006,7 +1083,7 @@ static void blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int tab) { daddr_t skip; - u_daddr_t bit, mask; + u_daddr_t mask; int digit; if (radix == BLIST_BMAP_RADIX) { @@ -1038,11 +1115,10 @@ blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int tab) mask = scan->bm_bitmap; /* Examine the nonempty subtree associated with each bit set in mask */ do { - bit = mask & -mask; - digit = bitpos(bit); + digit = bitpos(mask); blst_radix_print(&scan[1 + digit * skip], blk + digit * radix, radix, tab); - } while ((mask ^= bit) != 0); + } while ((mask ^= bitrange(digit, 1)) != 0); tab -= 4; printf( @@ -1079,7 +1155,7 @@ main(int ac, char **av) for (;;) { char buf[1024]; long long da = 0; - long long count = 0; + int count = 0, maxcount = 0; printf("%lld/%lld/%lld> ", (long long)blist_avail(bl), (long long)size, (long long)bl->bl_radix); @@ -1088,7 +1164,7 @@ main(int ac, char **av) break; switch(buf[0]) { case 'r': - if (sscanf(buf + 1, "%lld", &count) == 1) { + if (sscanf(buf + 1, "%d", &count) == 1) { blist_resize(&bl, count, 1, M_WAITOK); } else { printf("?\n"); @@ -1104,22 +1180,23 @@ main(int ac, char **av) sbuf_delete(s); break; case 'a': - if (sscanf(buf + 1, "%lld", &count) == 1) { - daddr_t blk = blist_alloc(bl, count); - printf(" R=%08llx\n", (long long)blk); + if (sscanf(buf + 1, "%d%d", &count, &maxcount) == 2) { + daddr_t blk = blist_alloc(bl, &count, maxcount); + printf(" R=%08llx, c=%08d\n", + (long long)blk, count); } else { printf("?\n"); } break; case 'f': - if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) { + if (sscanf(buf + 1, "%llx %d", &da, &count) == 2) { blist_free(bl, da, count); } else { printf("?\n"); } break; case 'l': - if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) { + if (sscanf(buf + 1, "%llx %d", &da, &count) == 2) { printf(" n=%jd\n", (intmax_t)blist_fill(bl, da, count)); } else { @@ -1131,31 +1208,24 @@ main(int ac, char **av) puts( "p -print\n" "s -stats\n" - "a %d -allocate\n" + "a %d %d -allocate\n" "f %x %d -free\n" "l %x %d -fill\n" "r %d -resize\n" - "h/? -help" + "h/? -help\n" + "q -quit" ); break; + case 'q': + break; default: printf("?\n"); break; } + if (buf[0] == 'q') + break; } - return(0); -} - -void -panic(const char *ctl, ...) -{ - va_list va; - - va_start(va, ctl); - vfprintf(stderr, ctl, va); - fprintf(stderr, "\n"); - va_end(va); - exit(1); + return (0); } #endif diff --git a/freebsd/sys/kern/subr_bus.c b/freebsd/sys/kern/subr_bus.c index a87c02a5..244f1af3 100644 --- a/freebsd/sys/kern/subr_bus.c +++ b/freebsd/sys/kern/subr_bus.c @@ -2472,13 +2472,31 @@ device_print_prettyname(device_t dev) int device_printf(device_t dev, const char * fmt, ...) { + char buf[128]; + struct sbuf sb; + const char *name; va_list ap; - int retval; + size_t retval; + + retval = 0; + + sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN); + sbuf_set_drain(&sb, sbuf_printf_drain, &retval); + + name = device_get_name(dev); + + if (name == NULL) + sbuf_cat(&sb, "unknown: "); + else + sbuf_printf(&sb, "%s%d: ", name, device_get_unit(dev)); - retval = device_print_prettyname(dev); va_start(ap, fmt); - retval += vprintf(fmt, ap); + sbuf_vprintf(&sb, fmt, ap); va_end(ap); + + sbuf_finish(&sb); + sbuf_delete(&sb); + return (retval); } @@ -3050,6 +3068,10 @@ device_detach(device_t dev) PDEBUG(("%s", DEVICENAME(dev))); if (dev->state == DS_BUSY) return (EBUSY); + if (dev->state == DS_ATTACHING) { + device_printf(dev, "device in attaching state! Deferring detach.\n"); + return (EBUSY); + } if (dev->state != DS_ATTACHED) return (0); @@ -3914,6 +3936,95 @@ bus_generic_resume(device_t dev) return (0); } + +/** + * @brief Helper function for implementing BUS_RESET_POST + * + * Bus can use this function to implement common operations of + * re-attaching or resuming the children after the bus itself was + * reset, and after restoring bus-unique state of children. + * + * @param dev The bus + * #param flags DEVF_RESET_* + */ +int +bus_helper_reset_post(device_t dev, int flags) +{ + device_t child; + int error, error1; + + error = 0; + TAILQ_FOREACH(child, &dev->children,link) { + BUS_RESET_POST(dev, child); + error1 = (flags & DEVF_RESET_DETACH) != 0 ? + device_probe_and_attach(child) : + BUS_RESUME_CHILD(dev, child); + if (error == 0 && error1 != 0) + error = error1; + } + return (error); +} + +static void +bus_helper_reset_prepare_rollback(device_t dev, device_t child, int flags) +{ + + child = TAILQ_NEXT(child, link); + if (child == NULL) + return; + TAILQ_FOREACH_FROM(child, &dev->children,link) { + BUS_RESET_POST(dev, child); + if ((flags & DEVF_RESET_DETACH) != 0) + device_probe_and_attach(child); + else + BUS_RESUME_CHILD(dev, child); + } +} + +/** + * @brief Helper function for implementing BUS_RESET_PREPARE + * + * Bus can use this function to implement common operations of + * detaching or suspending the children before the bus itself is + * reset, and then save bus-unique state of children that must + * persists around reset. + * + * @param dev The bus + * #param flags DEVF_RESET_* + */ +int +bus_helper_reset_prepare(device_t dev, int flags) +{ + device_t child; + int error; + + if (dev->state != DS_ATTACHED) + return (EBUSY); + + TAILQ_FOREACH_REVERSE(child, &dev->children, device_list, link) { + if ((flags & DEVF_RESET_DETACH) != 0) { + error = device_get_state(child) == DS_ATTACHED ? + device_detach(child) : 0; + } else { + error = BUS_SUSPEND_CHILD(dev, child); + } + if (error == 0) { + error = BUS_RESET_PREPARE(dev, child); + if (error != 0) { + if ((flags & DEVF_RESET_DETACH) != 0) + device_probe_and_attach(child); + else + BUS_RESUME_CHILD(dev, child); + } + } + if (error != 0) { + bus_helper_reset_prepare_rollback(dev, child, flags); + return (error); + } + } + return (0); +} + /** * @brief Helper function for implementing BUS_PRINT_CHILD(). * @@ -5613,6 +5724,7 @@ devctl2_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, case DEV_CLEAR_DRIVER: case DEV_RESCAN: case DEV_DELETE: + case DEV_RESET: error = priv_check(td, PRIV_DRIVER); if (error == 0) error = find_device(req, &dev); @@ -5839,6 +5951,14 @@ devctl2_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, device_frozen = false; } break; + case DEV_RESET: + if ((req->dr_flags & ~(DEVF_RESET_DETACH)) != 0) { + error = EINVAL; + break; + } + error = BUS_RESET_CHILD(device_get_parent(dev), dev, + req->dr_flags); + break; } #endif /* __rtems__ */ mtx_unlock(&Giant); @@ -5864,8 +5984,9 @@ devctl2_init(void) */ static int obsolete_panic = 0; SYSCTL_INT(_debug, OID_AUTO, obsolete_panic, CTLFLAG_RWTUN, &obsolete_panic, 0, - "Bus debug level"); -/* 0 - don't panic, 1 - panic if already obsolete, 2 - panic if deprecated */ + "Panic when obsolete features are used (0 = never, 1 = if osbolete, " + "2 = if deprecated)"); + static void gone_panic(int major, int running, const char *msg) { @@ -5890,7 +6011,7 @@ _gone_in(int major, const char *msg) gone_panic(major, P_OSREL_MAJOR(__FreeBSD_version), msg); if (P_OSREL_MAJOR(__FreeBSD_version) >= major) printf("Obsolete code will removed soon: %s\n", msg); - else if (P_OSREL_MAJOR(__FreeBSD_version) + 1 == major) + else printf("Deprecated code (to be removed in FreeBSD %d): %s\n", major, msg); } @@ -5903,7 +6024,7 @@ _gone_in_dev(device_t dev, int major, const char *msg) if (P_OSREL_MAJOR(__FreeBSD_version) >= major) device_printf(dev, "Obsolete code will removed soon: %s\n", msg); - else if (P_OSREL_MAJOR(__FreeBSD_version) + 1 == major) + else device_printf(dev, "Deprecated code (to be removed in FreeBSD %d): %s\n", major, msg); diff --git a/freebsd/sys/kern/subr_eventhandler.c b/freebsd/sys/kern/subr_eventhandler.c index e07248bf..6d36653d 100644 --- a/freebsd/sys/kern/subr_eventhandler.c +++ b/freebsd/sys/kern/subr_eventhandler.c @@ -33,6 +33,7 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/kernel.h> +#include <sys/ktr.h> #include <sys/lock.h> #include <sys/malloc.h> #include <sys/mutex.h> diff --git a/freebsd/sys/kern/subr_gtaskqueue.c b/freebsd/sys/kern/subr_gtaskqueue.c index 3f80cd2c..af9b65d4 100644 --- a/freebsd/sys/kern/subr_gtaskqueue.c +++ b/freebsd/sys/kern/subr_gtaskqueue.c @@ -35,7 +35,6 @@ __FBSDID("$FreeBSD$"); #include <sys/systm.h> #include <sys/bus.h> #include <sys/cpuset.h> -#include <sys/interrupt.h> #include <sys/kernel.h> #include <sys/kthread.h> #include <sys/libkern.h> @@ -69,6 +68,8 @@ struct gtaskqueue_busy { static struct gtask * const TB_DRAIN_WAITER = (struct gtask *)0x1; +typedef void (*gtaskqueue_enqueue_fn)(void *context); + struct gtaskqueue { STAILQ_HEAD(, gtask) tq_queue; gtaskqueue_enqueue_fn tq_enqueue; @@ -697,7 +698,7 @@ taskqgroup_find(struct taskqgroup *qgroup, void *uniq) } } if (idx == -1) - panic("taskqgroup_find: Failed to pick a qid."); + panic("%s: failed to pick a qid.", __func__); return (idx); } @@ -733,36 +734,36 @@ SYSINIT(tqg_record_smp_started, SI_SUB_SMP, SI_ORDER_FOURTH, void taskqgroup_attach(struct taskqgroup *qgroup, struct grouptask *gtask, - void *uniq, int irq, const char *name) + void *uniq, device_t dev, struct resource *irq, const char *name) { #ifndef __rtems__ - cpuset_t mask; - int qid, error; + int cpu, qid, error; #else /* __rtems__ */ int qid; #endif /* __rtems__ */ gtask->gt_uniq = uniq; snprintf(gtask->gt_name, GROUPTASK_NAMELEN, "%s", name ? name : "grouptask"); +#ifndef __rtems__ + gtask->gt_dev = dev; gtask->gt_irq = irq; gtask->gt_cpu = -1; +#endif /* __rtems__ */ mtx_lock(&qgroup->tqg_lock); qid = taskqgroup_find(qgroup, uniq); qgroup->tqg_queue[qid].tgc_cnt++; LIST_INSERT_HEAD(&qgroup->tqg_queue[qid].tgc_tasks, gtask, gt_list); gtask->gt_taskqueue = qgroup->tqg_queue[qid].tgc_taskq; #ifndef __rtems__ - if (irq != -1 && tqg_smp_started) { - gtask->gt_cpu = qgroup->tqg_queue[qid].tgc_cpu; - CPU_ZERO(&mask); - CPU_SET(qgroup->tqg_queue[qid].tgc_cpu, &mask); + if (dev != NULL && irq != NULL && tqg_smp_started) { + cpu = qgroup->tqg_queue[qid].tgc_cpu; + gtask->gt_cpu = cpu; mtx_unlock(&qgroup->tqg_lock); - error = intr_setaffinity(irq, CPU_WHICH_IRQ, &mask); + error = bus_bind_intr(dev, irq, cpu); if (error) - printf("%s: setaffinity failed for %s: %d\n", __func__, gtask->gt_name, error); + printf("%s: binding interrupt failed for %s: %d\n", + __func__, gtask->gt_name, error); } else -#else /* __rtems__ */ - BSD_ASSERT(irq == -1); #endif /* __rtems__ */ mtx_unlock(&qgroup->tqg_lock); } @@ -771,7 +772,6 @@ static void taskqgroup_attach_deferred(struct taskqgroup *qgroup, struct grouptask *gtask) { #ifndef __rtems__ - cpuset_t mask; int qid, cpu, error; #else /* __rtems__ */ int qid; @@ -781,24 +781,18 @@ taskqgroup_attach_deferred(struct taskqgroup *qgroup, struct grouptask *gtask) qid = taskqgroup_find(qgroup, gtask->gt_uniq); #ifndef __rtems__ cpu = qgroup->tqg_queue[qid].tgc_cpu; - if (gtask->gt_irq != -1) { + if (gtask->gt_dev != NULL && gtask->gt_irq != NULL) { mtx_unlock(&qgroup->tqg_lock); - - CPU_ZERO(&mask); - CPU_SET(cpu, &mask); - error = intr_setaffinity(gtask->gt_irq, CPU_WHICH_IRQ, &mask); + error = bus_bind_intr(gtask->gt_dev, gtask->gt_irq, cpu); mtx_lock(&qgroup->tqg_lock); if (error) - printf("%s: %s setaffinity failed: %d\n", __func__, gtask->gt_name, error); + printf("%s: binding interrupt failed for %s: %d\n", + __func__, gtask->gt_name, error); } -#else /* __rtems__ */ - BSD_ASSERT(gtask->gt_irq == -1); #endif /* __rtems__ */ qgroup->tqg_queue[qid].tgc_cnt++; - - LIST_INSERT_HEAD(&qgroup->tqg_queue[qid].tgc_tasks, gtask, - gt_list); + LIST_INSERT_HEAD(&qgroup->tqg_queue[qid].tgc_tasks, gtask, gt_list); MPASS(qgroup->tqg_queue[qid].tgc_taskq != NULL); gtask->gt_taskqueue = qgroup->tqg_queue[qid].tgc_taskq; mtx_unlock(&qgroup->tqg_lock); @@ -806,10 +800,9 @@ taskqgroup_attach_deferred(struct taskqgroup *qgroup, struct grouptask *gtask) int taskqgroup_attach_cpu(struct taskqgroup *qgroup, struct grouptask *gtask, - void *uniq, int cpu, int irq, const char *name) + void *uniq, int cpu, device_t dev, struct resource *irq, const char *name) { #ifndef __rtems__ - cpuset_t mask; int i, qid, error; #else /* __rtems__ */ int i, qid; @@ -818,8 +811,11 @@ taskqgroup_attach_cpu(struct taskqgroup *qgroup, struct grouptask *gtask, qid = -1; gtask->gt_uniq = uniq; snprintf(gtask->gt_name, GROUPTASK_NAMELEN, "%s", name ? name : "grouptask"); +#ifndef __rtems__ + gtask->gt_dev = dev; gtask->gt_irq = irq; gtask->gt_cpu = cpu; +#endif /* __rtems__ */ mtx_lock(&qgroup->tqg_lock); if (tqg_smp_started) { for (i = 0; i < qgroup->tqg_cnt; i++) @@ -843,30 +839,28 @@ taskqgroup_attach_cpu(struct taskqgroup *qgroup, struct grouptask *gtask, mtx_unlock(&qgroup->tqg_lock); #ifndef __rtems__ - CPU_ZERO(&mask); - CPU_SET(cpu, &mask); - if (irq != -1 && tqg_smp_started) { - error = intr_setaffinity(irq, CPU_WHICH_IRQ, &mask); + if (dev != NULL && irq != NULL && tqg_smp_started) { + error = bus_bind_intr(dev, irq, cpu); if (error) - printf("%s: setaffinity failed: %d\n", __func__, error); + printf("%s: binding interrupt failed for %s: %d\n", + __func__, gtask->gt_name, error); } #else /* __rtems__ */ - BSD_ASSERT(irq == -1); + BSD_ASSERT(irq == NULL); #endif /* __rtems__ */ return (0); } +#ifndef __rtems__ static int taskqgroup_attach_cpu_deferred(struct taskqgroup *qgroup, struct grouptask *gtask) { -#ifndef __rtems__ - cpuset_t mask; - int i, qid, irq, cpu, error; -#else /* __rtems__ */ - int i, qid, irq, cpu; -#endif /* __rtems__ */ + device_t dev; + struct resource *irq; + int cpu, error, i, qid; qid = -1; + dev = gtask->gt_dev; irq = gtask->gt_irq; cpu = gtask->gt_cpu; MPASS(tqg_smp_started); @@ -887,20 +881,15 @@ taskqgroup_attach_cpu_deferred(struct taskqgroup *qgroup, struct grouptask *gtas gtask->gt_taskqueue = qgroup->tqg_queue[qid].tgc_taskq; mtx_unlock(&qgroup->tqg_lock); -#ifndef __rtems__ - CPU_ZERO(&mask); - CPU_SET(cpu, &mask); - - if (irq != -1) { - error = intr_setaffinity(irq, CPU_WHICH_IRQ, &mask); + if (dev != NULL && irq != NULL) { + error = bus_bind_intr(dev, irq, cpu); if (error) - printf("%s: setaffinity failed: %d\n", __func__, error); + printf("%s: binding interrupt failed for %s: %d\n", + __func__, gtask->gt_name, error); } -#else /* __rtems__ */ - BSD_ASSERT(irq == -1); -#endif /* __rtems__ */ return (0); } +#endif /* __rtems__ */ void taskqgroup_detach(struct taskqgroup *qgroup, struct grouptask *gtask) @@ -913,7 +902,7 @@ taskqgroup_detach(struct taskqgroup *qgroup, struct grouptask *gtask) if (qgroup->tqg_queue[i].tgc_taskq == gtask->gt_taskqueue) break; if (i == qgroup->tqg_cnt) - panic("taskqgroup_detach: task %s not in group\n", gtask->gt_name); + panic("%s: task %s not in group", __func__, gtask->gt_name); qgroup->tqg_queue[i].tgc_cnt--; LIST_REMOVE(gtask, gt_list); mtx_unlock(&qgroup->tqg_lock); @@ -941,8 +930,7 @@ taskqgroup_binder(void *ctx) thread_unlock(curthread); if (error) - printf("%s: setaffinity failed: %d\n", __func__, - error); + printf("%s: binding curthread failed: %d\n", __func__, error); #else /* __rtems__ */ sc = rtems_task_set_affinity(RTEMS_SELF, sizeof(mask), &mask); if (sc != RTEMS_SUCCESSFUL) @@ -1053,10 +1041,14 @@ _taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride) while ((gtask = LIST_FIRST(>ask_head))) { LIST_REMOVE(gtask, gt_list); +#ifndef __rtems__ if (gtask->gt_cpu == -1) +#endif /* __rtems__ */ taskqgroup_attach_deferred(qgroup, gtask); +#ifndef __rtems__ else if (taskqgroup_attach_cpu_deferred(qgroup, gtask)) taskqgroup_attach_deferred(qgroup, gtask); +#endif /* __rtems__ */ } #ifdef INVARIANTS @@ -1115,15 +1107,16 @@ taskqgroup_destroy(struct taskqgroup *qgroup) void taskqgroup_config_gtask_init(void *ctx, struct grouptask *gtask, gtask_fn_t *fn, - const char *name) + const char *name) { GROUPTASK_INIT(gtask, 0, fn, ctx); - taskqgroup_attach(qgroup_config, gtask, gtask, -1, name); + taskqgroup_attach(qgroup_config, gtask, gtask, NULL, NULL, name); } void taskqgroup_config_gtask_deinit(struct grouptask *gtask) { + taskqgroup_detach(qgroup_config, gtask); } diff --git a/freebsd/sys/kern/subr_kobj.c b/freebsd/sys/kern/subr_kobj.c index a6a888d5..3736f64c 100644 --- a/freebsd/sys/kern/subr_kobj.c +++ b/freebsd/sys/kern/subr_kobj.c @@ -127,35 +127,40 @@ kobj_class_compile_common(kobj_class_t cls, kobj_ops_t ops) cls->ops = ops; } -void -kobj_class_compile(kobj_class_t cls) +static int +kobj_class_compile1(kobj_class_t cls, int mflags) { kobj_ops_t ops; KOBJ_ASSERT(MA_NOTOWNED); - /* - * Allocate space for the compiled ops table. - */ - ops = malloc(sizeof(struct kobj_ops), M_KOBJ, M_NOWAIT); - if (!ops) - panic("%s: out of memory", __func__); + ops = malloc(sizeof(struct kobj_ops), M_KOBJ, mflags); + if (ops == NULL) + return (ENOMEM); - KOBJ_LOCK(); - /* * We may have lost a race for kobj_class_compile here - check * to make sure someone else hasn't already compiled this * class. */ + KOBJ_LOCK(); if (cls->ops) { KOBJ_UNLOCK(); free(ops, M_KOBJ); - return; + return (0); } - kobj_class_compile_common(cls, ops); KOBJ_UNLOCK(); + return (0); +} + +void +kobj_class_compile(kobj_class_t cls) +{ + int error; + + error = kobj_class_compile1(cls, M_WAITOK); + KASSERT(error == 0, ("kobj_class_compile1 returned %d", error)); } void @@ -256,24 +261,6 @@ kobj_class_free(kobj_class_t cls) free(ops, M_KOBJ); } -kobj_t -kobj_create(kobj_class_t cls, - struct malloc_type *mtype, - int mflags) -{ - kobj_t obj; - - /* - * Allocate and initialise the new object. - */ - obj = malloc(cls->size, mtype, mflags | M_ZERO); - if (!obj) - return NULL; - kobj_init(obj, cls); - - return obj; -} - static void kobj_init_common(kobj_t obj, kobj_class_t cls) { @@ -282,30 +269,52 @@ kobj_init_common(kobj_t obj, kobj_class_t cls) cls->refs++; } -void -kobj_init(kobj_t obj, kobj_class_t cls) +static int +kobj_init1(kobj_t obj, kobj_class_t cls, int mflags) { - KOBJ_ASSERT(MA_NOTOWNED); - retry: - KOBJ_LOCK(); + int error; - /* - * Consider compiling the class' method table. - */ - if (!cls->ops) { + KOBJ_LOCK(); + while (cls->ops == NULL) { /* * kobj_class_compile doesn't want the lock held * because of the call to malloc - we drop the lock * and re-try. */ KOBJ_UNLOCK(); - kobj_class_compile(cls); - goto retry; + error = kobj_class_compile1(cls, mflags); + if (error != 0) + return (error); + KOBJ_LOCK(); } - kobj_init_common(obj, cls); - KOBJ_UNLOCK(); + return (0); +} + +kobj_t +kobj_create(kobj_class_t cls, struct malloc_type *mtype, int mflags) +{ + kobj_t obj; + + obj = malloc(cls->size, mtype, mflags | M_ZERO); + if (obj == NULL) + return (NULL); + if (kobj_init1(obj, cls, mflags) != 0) { + free(obj, mtype); + return (NULL); + } + return (obj); +} + +void +kobj_init(kobj_t obj, kobj_class_t cls) +{ + int error; + + error = kobj_init1(obj, cls, M_NOWAIT); + if (error != 0) + panic("kobj_init1 failed: error %d", error); } void diff --git a/freebsd/sys/kern/subr_lock.c b/freebsd/sys/kern/subr_lock.c index c2587cd0..53d99743 100644 --- a/freebsd/sys/kern/subr_lock.c +++ b/freebsd/sys/kern/subr_lock.c @@ -4,7 +4,6 @@ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2006 John Baldwin <jhb@FreeBSD.org> - * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -258,7 +257,9 @@ struct lock_prof_cpu { struct lock_prof_type lpc_types[2]; /* One for spin one for other. */ }; -struct lock_prof_cpu *lp_cpu[MAXCPU]; +DPCPU_DEFINE_STATIC(struct lock_prof_cpu, lp); +#define LP_CPU_SELF (DPCPU_PTR(lp)) +#define LP_CPU(cpu) (DPCPU_ID_PTR((cpu), lp)) volatile int __read_mostly lock_prof_enable; static volatile int lock_prof_resetting; @@ -304,11 +305,9 @@ lock_prof_init(void *arg) { int cpu; - for (cpu = 0; cpu <= mp_maxid; cpu++) { - lp_cpu[cpu] = malloc(sizeof(*lp_cpu[cpu]), M_DEVBUF, - M_WAITOK | M_ZERO); - lock_prof_init_type(&lp_cpu[cpu]->lpc_types[0]); - lock_prof_init_type(&lp_cpu[cpu]->lpc_types[1]); + CPU_FOREACH(cpu) { + lock_prof_init_type(&LP_CPU(cpu)->lpc_types[0]); + lock_prof_init_type(&LP_CPU(cpu)->lpc_types[1]); } } SYSINIT(lockprof, SI_SUB_SMP, SI_ORDER_ANY, lock_prof_init, NULL); @@ -347,15 +346,15 @@ lock_prof_reset(void) * before we zero the structures. Some items may still be linked * into per-thread lists as well. */ - for (cpu = 0; cpu <= mp_maxid; cpu++) { - lpc = lp_cpu[cpu]; + CPU_FOREACH(cpu) { + lpc = LP_CPU(cpu); for (i = 0; i < LPROF_CACHE_SIZE; i++) { LIST_REMOVE(&lpc->lpc_types[0].lpt_objs[i], lpo_link); LIST_REMOVE(&lpc->lpc_types[1].lpt_objs[i], lpo_link); } } - for (cpu = 0; cpu <= mp_maxid; cpu++) { - lpc = lp_cpu[cpu]; + CPU_FOREACH(cpu) { + lpc = LP_CPU(cpu); bzero(lpc, sizeof(*lpc)); lock_prof_init_type(&lpc->lpc_types[0]); lock_prof_init_type(&lpc->lpc_types[1]); @@ -395,10 +394,8 @@ lock_prof_sum(struct lock_prof *match, struct lock_prof *dst, int hash, dst->class = match->class; dst->name = match->name; - for (cpu = 0; cpu <= mp_maxid; cpu++) { - if (lp_cpu[cpu] == NULL) - continue; - type = &lp_cpu[cpu]->lpc_types[spin]; + CPU_FOREACH(cpu) { + type = &LP_CPU(cpu)->lpc_types[spin]; SLIST_FOREACH(l, &type->lpt_hash[hash], link) { if (l->ticks == t) continue; @@ -416,7 +413,6 @@ lock_prof_sum(struct lock_prof *match, struct lock_prof *dst, int hash, dst->cnt_contest_locking += l->cnt_contest_locking; } } - } static void @@ -455,11 +451,9 @@ dump_lock_prof_stats(SYSCTL_HANDLER_ARGS) lock_prof_enable = 0; quiesce_all_cpus("profstat", 0); t = ticks; - for (cpu = 0; cpu <= mp_maxid; cpu++) { - if (lp_cpu[cpu] == NULL) - continue; - lock_prof_type_stats(&lp_cpu[cpu]->lpc_types[0], sb, 0, t); - lock_prof_type_stats(&lp_cpu[cpu]->lpc_types[1], sb, 1, t); + CPU_FOREACH(cpu) { + lock_prof_type_stats(&LP_CPU(cpu)->lpc_types[0], sb, 0, t); + lock_prof_type_stats(&LP_CPU(cpu)->lpc_types[1], sb, 1, t); } lock_prof_enable = enabled; @@ -525,7 +519,7 @@ lock_profile_lookup(struct lock_object *lo, int spin, const char *file, p = unknown; hash = (uintptr_t)lo->lo_name * 31 + (uintptr_t)p * 31 + line; hash &= LPROF_HASH_MASK; - type = &lp_cpu[PCPU_GET(cpuid)]->lpc_types[spin]; + type = &LP_CPU_SELF->lpc_types[spin]; head = &type->lpt_hash[hash]; SLIST_FOREACH(lp, head, link) { if (lp->line == line && lp->file == p && @@ -560,7 +554,7 @@ lock_profile_object_lookup(struct lock_object *lo, int spin, const char *file, if (l->lpo_obj == lo && l->lpo_file == file && l->lpo_line == line) return (l); - type = &lp_cpu[PCPU_GET(cpuid)]->lpc_types[spin]; + type = &LP_CPU_SELF->lpc_types[spin]; l = LIST_FIRST(&type->lpt_lpoalloc); if (l == NULL) { lock_prof_rejected++; @@ -696,7 +690,7 @@ lock_profile_release_lock(struct lock_object *lo) lp->cnt_cur += l->lpo_cnt; release: LIST_REMOVE(l, lpo_link); - type = &lp_cpu[PCPU_GET(cpuid)]->lpc_types[spin]; + type = &LP_CPU_SELF->lpc_types[spin]; LIST_INSERT_HEAD(&type->lpt_lpoalloc, l, lpo_link); out: critical_exit(); diff --git a/freebsd/sys/kern/subr_pcpu.c b/freebsd/sys/kern/subr_pcpu.c index 0ab77996..a3a06c78 100644 --- a/freebsd/sys/kern/subr_pcpu.c +++ b/freebsd/sys/kern/subr_pcpu.c @@ -136,24 +136,20 @@ SYSINIT(dpcpu, SI_SUB_KLD, SI_ORDER_FIRST, dpcpu_startup, NULL); /* * UMA_PCPU_ZONE zones, that are available for all kernel * consumers. Right now 64 bit zone is used for counter(9) - * and pointer zone is used by flowtable. + * and int zone is used for mount point counters. */ +uma_zone_t pcpu_zone_int; uma_zone_t pcpu_zone_64; -uma_zone_t pcpu_zone_ptr; static void pcpu_zones_startup(void) { + pcpu_zone_int = uma_zcreate("int pcpu", sizeof(int), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_PCPU); pcpu_zone_64 = uma_zcreate("64 pcpu", sizeof(uint64_t), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_PCPU); - - if (sizeof(uint64_t) == sizeof(void *)) - pcpu_zone_ptr = pcpu_zone_64; - else - pcpu_zone_ptr = uma_zcreate("ptr pcpu", sizeof(void *), - NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_PCPU); } SYSINIT(pcpu_zones, SI_SUB_VM, SI_ORDER_ANY, pcpu_zones_startup, NULL); diff --git a/freebsd/sys/kern/subr_prf.c b/freebsd/sys/kern/subr_prf.c index 2b45c13e..ed3c8498 100644 --- a/freebsd/sys/kern/subr_prf.c +++ b/freebsd/sys/kern/subr_prf.c @@ -70,6 +70,8 @@ __FBSDID("$FreeBSD$"); #include <sys/cons.h> #endif /* __rtems__ */ #include <sys/uio.h> +#else /* !_KERNEL */ +#include <errno.h> #endif #include <sys/ctype.h> #include <sys/sbuf.h> @@ -1300,3 +1302,46 @@ sbuf_putbuf(struct sbuf *sb) printf("%s", sbuf_data(sb)); } #endif /* __rtems__ */ + +int +sbuf_printf_drain(void *arg, const char *data, int len) +{ +#ifndef __rtems__ + size_t *retvalptr; + int r; +#ifdef _KERNEL + char *dataptr; + char oldchr; + + /* + * This is allowed as an extra byte is always resvered for + * terminating NUL byte. Save and restore the byte because + * we might be flushing a record, and there may be valid + * data after the buffer. + */ + oldchr = data[len]; + dataptr = __DECONST(char *, data); + dataptr[len] = '\0'; + + prf_putbuf(dataptr, TOLOG | TOCONS, -1); + r = len; + + dataptr[len] = oldchr; + +#else /* !_KERNEL */ + + r = printf("%.*s", len, data); + if (r < 0) + return (-errno); + +#endif + + retvalptr = arg; + if (retvalptr != NULL) + *retvalptr += r; + + return (r); +#else /* __rtems__ */ + return (printf("%.*s", len, data)); +#endif /* __rtems__ */ +} diff --git a/freebsd/sys/kern/subr_sbuf.c b/freebsd/sys/kern/subr_sbuf.c index b51ed52c..42e6f8f0 100644 --- a/freebsd/sys/kern/subr_sbuf.c +++ b/freebsd/sys/kern/subr_sbuf.c @@ -58,11 +58,11 @@ __FBSDID("$FreeBSD$"); #ifdef _KERNEL static MALLOC_DEFINE(M_SBUF, "sbuf", "string buffers"); -#define SBMALLOC(size) malloc(size, M_SBUF, M_WAITOK|M_ZERO) +#define SBMALLOC(size, flags) malloc(size, M_SBUF, (flags) | M_ZERO) #define SBFREE(buf) free(buf, M_SBUF) #else /* _KERNEL */ #define KASSERT(e, m) -#define SBMALLOC(size) calloc(1, size) +#define SBMALLOC(size, flags) calloc(1, size) #define SBFREE(buf) free(buf) #endif /* _KERNEL */ @@ -72,6 +72,7 @@ static MALLOC_DEFINE(M_SBUF, "sbuf", "string buffers"); #define SBUF_ISDYNAMIC(s) ((s)->s_flags & SBUF_DYNAMIC) #define SBUF_ISDYNSTRUCT(s) ((s)->s_flags & SBUF_DYNSTRUCT) #define SBUF_ISFINISHED(s) ((s)->s_flags & SBUF_FINISHED) +#define SBUF_ISDRAINATEOL(s) ((s)->s_flags & SBUF_DRAINATEOL) #define SBUF_HASROOM(s) ((s)->s_len < (s)->s_size - 1) #define SBUF_FREESPACE(s) ((s)->s_size - ((s)->s_len + 1)) #define SBUF_CANEXTEND(s) ((s)->s_flags & SBUF_AUTOEXTEND) @@ -79,6 +80,8 @@ static MALLOC_DEFINE(M_SBUF, "sbuf", "string buffers"); #define SBUF_NULINCLUDED(s) ((s)->s_flags & SBUF_INCLUDENUL) #define SBUF_ISDRAINTOEOR(s) ((s)->s_flags & SBUF_DRAINTOEOR) #define SBUF_DODRAINTOEOR(s) (SBUF_ISSECTION(s) && SBUF_ISDRAINTOEOR(s)) +#define SBUF_MALLOCFLAG(s) \ + (((s)->s_flags & SBUF_NOWAIT) ? M_NOWAIT : M_WAITOK) /* * Set / clear flags @@ -173,7 +176,7 @@ sbuf_extend(struct sbuf *s, int addlen) if (!SBUF_CANEXTEND(s)) return (-1); newsize = sbuf_extendsize(s->s_size + addlen); - newbuf = SBMALLOC(newsize); + newbuf = SBMALLOC(newsize, SBUF_MALLOCFLAG(s)); if (newbuf == NULL) return (-1); memcpy(newbuf, s->s_buf, s->s_size); @@ -187,39 +190,6 @@ sbuf_extend(struct sbuf *s, int addlen) } /* - * Initialize the internals of an sbuf. - * If buf is non-NULL, it points to a static or already-allocated string - * big enough to hold at least length characters. - */ -static struct sbuf * -sbuf_newbuf(struct sbuf *s, char *buf, int length, int flags) -{ - - memset(s, 0, sizeof(*s)); - s->s_flags = flags; - s->s_size = length; - s->s_buf = buf; - - if ((s->s_flags & SBUF_AUTOEXTEND) == 0) { - KASSERT(s->s_size >= SBUF_MINSIZE, - ("attempt to create an sbuf smaller than %d bytes", - SBUF_MINSIZE)); - } - - if (s->s_buf != NULL) - return (s); - - if ((flags & SBUF_AUTOEXTEND) != 0) - s->s_size = sbuf_extendsize(s->s_size); - - s->s_buf = SBMALLOC(s->s_size); - if (s->s_buf == NULL) - return (NULL); - SBUF_SETFLAG(s, SBUF_DYNAMIC); - return (s); -} - -/* * Initialize an sbuf. * If buf is non-NULL, it points to a static or already-allocated string * big enough to hold at least length characters. @@ -232,19 +202,56 @@ sbuf_new(struct sbuf *s, char *buf, int length, int flags) ("attempt to create an sbuf of negative length (%d)", length)); KASSERT((flags & ~SBUF_USRFLAGMSK) == 0, ("%s called with invalid flags", __func__)); + KASSERT((flags & SBUF_AUTOEXTEND) || length >= SBUF_MINSIZE, + ("sbuf buffer %d smaller than minimum %d bytes", length, + SBUF_MINSIZE)); flags &= SBUF_USRFLAGMSK; - if (s != NULL) - return (sbuf_newbuf(s, buf, length, flags)); - s = SBMALLOC(sizeof(*s)); - if (s == NULL) - return (NULL); - if (sbuf_newbuf(s, buf, length, flags) == NULL) { - SBFREE(s); - return (NULL); + /* + * Allocate 'DYNSTRUCT' sbuf from the heap, if NULL 's' was provided. + */ + if (s == NULL) { + s = SBMALLOC(sizeof(*s), + (flags & SBUF_NOWAIT) ? M_NOWAIT : M_WAITOK); + if (s == NULL) + goto out; + SBUF_SETFLAG(s, SBUF_DYNSTRUCT); + } else { + /* + * DYNSTRUCT SBMALLOC sbufs are allocated with M_ZERO, but + * user-provided sbuf objects must be initialized. + */ + memset(s, 0, sizeof(*s)); + } + + s->s_flags |= flags; + s->s_size = length; + s->s_buf = buf; + /* + * Never-written sbufs do not need \n termination. + */ + SBUF_SETFLAG(s, SBUF_DRAINATEOL); + + /* + * Allocate DYNAMIC, i.e., heap data buffer backing the sbuf, if no + * buffer was provided. + */ + if (s->s_buf == NULL) { + if (SBUF_CANEXTEND(s)) + s->s_size = sbuf_extendsize(s->s_size); + s->s_buf = SBMALLOC(s->s_size, SBUF_MALLOCFLAG(s)); + if (s->s_buf == NULL) + goto out; + SBUF_SETFLAG(s, SBUF_DYNAMIC); + } + +out: + if (s != NULL && s->s_buf == NULL) { + if (SBUF_ISDYNSTRUCT(s)) + SBFREE(s); + s = NULL; } - SBUF_SETFLAG(s, SBUF_DYNSTRUCT); return (s); } @@ -310,6 +317,8 @@ sbuf_clear(struct sbuf *s) assert_sbuf_integrity(s); /* don't care if it's finished or not */ + KASSERT(s->s_drain_func == NULL, + ("%s makes no sense on sbuf %p with drain", __func__, s)); SBUF_CLEARFLAG(s, SBUF_FINISHED); s->s_error = 0; @@ -344,6 +353,21 @@ sbuf_setpos(struct sbuf *s, ssize_t pos) } /* + * Drain into a counter. Counts amount of data without producing output. + * Useful for cases like sysctl, where user may first request only size. + * This allows to avoid pointless allocation/freeing of large buffers. + */ +int +sbuf_count_drain(void *arg, const char *data __unused, int len) +{ + size_t *sizep; + + sizep = (size_t *)arg; + *sizep += len; + return (len); +} + +/* * Set up a drain function and argument on an sbuf to flush data to * when the sbuf buffer overflows. */ @@ -369,6 +393,7 @@ sbuf_drain(struct sbuf *s) KASSERT(s->s_len > 0, ("Shouldn't drain empty sbuf %p", s)); KASSERT(s->s_error == 0, ("Called %s with error on %p", __func__, s)); + if (SBUF_DODRAINTOEOR(s) && s->s_rec_off == 0) return (s->s_error = EDEADLK); len = s->s_drain_func(s->s_drain_arg, s->s_buf, @@ -385,8 +410,18 @@ sbuf_drain(struct sbuf *s) * Fast path for the expected case where all the data was * drained. */ - if (s->s_len == 0) + if (s->s_len == 0) { + /* + * When the s_buf is entirely drained, we need to remember if + * the last character was a '\n' or not for + * sbuf_nl_terminate(). + */ + if (s->s_buf[len - 1] == '\n') + SBUF_SETFLAG(s, SBUF_DRAINATEOL); + else + SBUF_CLEARFLAG(s, SBUF_DRAINATEOL); return (0); + } /* * Move the remaining characters to the beginning of the * string. @@ -702,6 +737,38 @@ sbuf_putc(struct sbuf *s, int c) } /* + * Append a trailing newline to a non-empty sbuf, if one is not already + * present. Handles sbufs with drain functions correctly. + */ +int +sbuf_nl_terminate(struct sbuf *s) +{ + + assert_sbuf_integrity(s); + assert_sbuf_state(s, 0); + + /* + * If the s_buf isn't empty, the last byte is simply s_buf[s_len - 1]. + * + * If the s_buf is empty because a drain function drained it, we + * remember if the last byte was a \n with the SBUF_DRAINATEOL flag in + * sbuf_drain(). + * + * In either case, we only append a \n if the previous character was + * something else. + */ + if (s->s_len == 0) { + if (!SBUF_ISDRAINATEOL(s)) + sbuf_put_byte(s, '\n'); + } else if (s->s_buf[s->s_len - 1] != '\n') + sbuf_put_byte(s, '\n'); + + if (s->s_error != 0) + return (-1); + return (0); +} + +/* * Trim whitespace characters from end of an sbuf. */ int diff --git a/freebsd/sys/kern/subr_sleepqueue.c b/freebsd/sys/kern/subr_sleepqueue.c index 57681cce..9665c02f 100644 --- a/freebsd/sys/kern/subr_sleepqueue.c +++ b/freebsd/sys/kern/subr_sleepqueue.c @@ -5,7 +5,6 @@ * * Copyright (c) 2004 John Baldwin <jhb@FreeBSD.org> * Copyright (c) 2015 embedded brains GmbH <rtems@embedded-brains.de> - * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -132,7 +131,7 @@ CTASSERT(powerof2(SC_TABLESIZE)); * c - sleep queue chain lock */ struct sleepqueue { - TAILQ_HEAD(, thread) sq_blocked[NR_SLEEPQS]; /* (c) Blocked threads. */ + struct threadqueue sq_blocked[NR_SLEEPQS]; /* (c) Blocked threads. */ u_int sq_blockedcnt[NR_SLEEPQS]; /* (c) N. of blocked threads. */ LIST_ENTRY(sleepqueue) sq_hash; /* (c) Chain and free list. */ LIST_HEAD(, sleepqueue) sq_free; /* (c) Free queues. */ @@ -593,6 +592,19 @@ sleepq_catch_signals(void *wchan, int pri) } else { mtx_unlock(&ps->ps_mtx); } + + /* + * Do not go into sleep if this thread was the + * ptrace(2) attach leader. cursig() consumed + * SIGSTOP from PT_ATTACH, but we usually act + * on the signal by interrupting sleep, and + * should do that here as well. + */ + if ((td->td_dbgflags & TDB_FSTP) != 0) { + if (ret == 0) + ret = EINTR; + td->td_dbgflags &= ~TDB_FSTP; + } } /* * Lock the per-process spinlock prior to dropping the PROC_LOCK @@ -1127,13 +1139,15 @@ sleepq_init(void *mem, int size, int flags) } /* - * Find the highest priority thread sleeping on a wait channel and resume it. + * Find thread sleeping on a wait channel and resume it. */ int sleepq_signal(void *wchan, int flags, int pri, int queue) { + struct sleepqueue_chain *sc; struct sleepqueue *sq; #ifndef __rtems__ + struct threadqueue *head; struct thread *td, *besttd; #else /* __rtems__ */ struct thread *besttd; @@ -1150,16 +1164,33 @@ sleepq_signal(void *wchan, int flags, int pri, int queue) ("%s: mismatch between sleep/wakeup and cv_*", __func__)); #ifndef __rtems__ - /* - * Find the highest priority thread on the queue. If there is a - * tie, use the thread that first appears in the queue as it has - * been sleeping the longest since threads are always added to - * the tail of sleep queues. - */ - besttd = TAILQ_FIRST(&sq->sq_blocked[queue]); - TAILQ_FOREACH(td, &sq->sq_blocked[queue], td_slpq) { - if (td->td_priority < besttd->td_priority) + head = &sq->sq_blocked[queue]; + if (flags & SLEEPQ_UNFAIR) { + /* + * Find the most recently sleeping thread, but try to + * skip threads still in process of context switch to + * avoid spinning on the thread lock. + */ + sc = SC_LOOKUP(wchan); + besttd = TAILQ_LAST_FAST(head, thread, td_slpq); + while (besttd->td_lock != &sc->sc_lock) { + td = TAILQ_PREV_FAST(besttd, head, thread, td_slpq); + if (td == NULL) + break; besttd = td; + } + } else { + /* + * Find the highest priority thread on the queue. If there + * is a tie, use the thread that first appears in the queue + * as it has been sleeping the longest since threads are + * always added to the tail of sleep queues. + */ + besttd = td = TAILQ_FIRST(head); + while ((td = TAILQ_NEXT(td, td_slpq)) != NULL) { + if (td->td_priority < besttd->td_priority) + besttd = td; + } } #else /* __rtems__ */ besttd = TAILQ_FIRST(&sq->sq_blocked[queue]); diff --git a/freebsd/sys/kern/subr_taskqueue.c b/freebsd/sys/kern/subr_taskqueue.c index 39d9f939..67e62fc8 100644 --- a/freebsd/sys/kern/subr_taskqueue.c +++ b/freebsd/sys/kern/subr_taskqueue.c @@ -841,7 +841,7 @@ taskqueue_thread_enqueue(void *context) tqp = context; tq = *tqp; - wakeup_one(tq); + wakeup_any(tq); } TASKQUEUE_DEFINE(swi, taskqueue_swi_enqueue, NULL, diff --git a/freebsd/sys/kern/sys_generic.c b/freebsd/sys/kern/sys_generic.c index cc208d6e..1bc4fa6b 100644 --- a/freebsd/sys/kern/sys_generic.c +++ b/freebsd/sys/kern/sys_generic.c @@ -772,7 +772,11 @@ kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data) fp = NULL; /* fhold() was not called yet */ goto out; } - fhold(fp); + if (!fhold(fp)) { + error = EBADF; + fp = NULL; + goto out; + } if (locked == LA_SLOCKED) { FILEDESC_SUNLOCK(fdp); locked = LA_UNLOCKED; diff --git a/freebsd/sys/kern/sys_pipe.c b/freebsd/sys/kern/sys_pipe.c index 050d63a4..d9b502f0 100755 --- a/freebsd/sys/kern/sys_pipe.c +++ b/freebsd/sys/kern/sys_pipe.c @@ -177,7 +177,6 @@ struct fileops pipeops = { }; #else /* __rtems__ */ #define PIPE_NODIRECT -#define PRIBIO (0) static int rtems_bsd_pipe_open(rtems_libio_t *iop, const char *path, int oflag, mode_t mode); @@ -433,9 +432,7 @@ void pipe_dtor(struct pipe *dpipe) { struct pipe *peer; - ino_t ino; - ino = dpipe->pipe_ino; peer = (dpipe->pipe_state & PIPE_NAMED) != 0 ? dpipe->pipe_peer : NULL; funsetown(&dpipe->pipe_sigio); pipeclose(dpipe); @@ -802,11 +799,9 @@ pipe_read(struct file *fp, struct uio *uio, struct ucred *active_cred, /* * Direct copy, bypassing a kernel buffer. */ - } else if ((size = rpipe->pipe_map.cnt) && - (rpipe->pipe_state & PIPE_DIRECTW)) { + } else if ((size = rpipe->pipe_map.cnt) != 0) { if (size > uio->uio_resid) size = (u_int) uio->uio_resid; - PIPE_UNLOCK(rpipe); error = uiomove_fromphys(rpipe->pipe_map.ms, rpipe->pipe_map.pos, size, uio); @@ -817,7 +812,7 @@ pipe_read(struct file *fp, struct uio *uio, struct ucred *active_cred, rpipe->pipe_map.pos += size; rpipe->pipe_map.cnt -= size; if (rpipe->pipe_map.cnt == 0) { - rpipe->pipe_state &= ~(PIPE_DIRECTW|PIPE_WANTW); + rpipe->pipe_state &= ~PIPE_WANTW; wakeup(rpipe); } #endif @@ -984,32 +979,33 @@ pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio) u_int size; int i; - PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED); - KASSERT(wpipe->pipe_state & PIPE_DIRECTW, - ("Clone attempt on non-direct write pipe!")); + PIPE_LOCK_ASSERT(wpipe, MA_OWNED); + KASSERT((wpipe->pipe_state & PIPE_DIRECTW) == 0, + ("%s: PIPE_DIRECTW set on %p", __func__, wpipe)); + KASSERT(wpipe->pipe_map.cnt == 0, + ("%s: pipe map for %p contains residual data", __func__, wpipe)); if (uio->uio_iov->iov_len > wpipe->pipe_buffer.size) size = wpipe->pipe_buffer.size; else size = uio->uio_iov->iov_len; - if ((i = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, + wpipe->pipe_state |= PIPE_DIRECTW; + PIPE_UNLOCK(wpipe); + i = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, (vm_offset_t)uio->uio_iov->iov_base, size, VM_PROT_READ, - wpipe->pipe_map.ms, PIPENPAGES)) < 0) + wpipe->pipe_map.ms, PIPENPAGES); + PIPE_LOCK(wpipe); + if (i < 0) { + wpipe->pipe_state &= ~PIPE_DIRECTW; return (EFAULT); + } -/* - * set up the control block - */ wpipe->pipe_map.npages = i; wpipe->pipe_map.pos = ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK; wpipe->pipe_map.cnt = size; -/* - * and update the uio data - */ - uio->uio_iov->iov_len -= size; uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + size; if (uio->uio_iov->iov_len == 0) @@ -1020,13 +1016,19 @@ pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio) } /* - * unmap and unwire the process buffer + * Unwire the process buffer. */ static void pipe_destroy_write_buffer(struct pipe *wpipe) { PIPE_LOCK_ASSERT(wpipe, MA_OWNED); + KASSERT((wpipe->pipe_state & PIPE_DIRECTW) != 0, + ("%s: PIPE_DIRECTW not set on %p", __func__, wpipe)); + KASSERT(wpipe->pipe_map.cnt == 0, + ("%s: pipe map for %p contains residual data", __func__, wpipe)); + + wpipe->pipe_state &= ~PIPE_DIRECTW; vm_page_unhold_pages(wpipe->pipe_map.ms, wpipe->pipe_map.npages); wpipe->pipe_map.npages = 0; } @@ -1045,13 +1047,16 @@ pipe_clone_write_buffer(struct pipe *wpipe) int pos; PIPE_LOCK_ASSERT(wpipe, MA_OWNED); + KASSERT((wpipe->pipe_state & PIPE_DIRECTW) != 0, + ("%s: PIPE_DIRECTW not set on %p", __func__, wpipe)); + size = wpipe->pipe_map.cnt; pos = wpipe->pipe_map.pos; + wpipe->pipe_map.cnt = 0; wpipe->pipe_buffer.in = size; wpipe->pipe_buffer.out = 0; wpipe->pipe_buffer.cnt = size; - wpipe->pipe_state &= ~PIPE_DIRECTW; PIPE_UNLOCK(wpipe); iov.iov_base = wpipe->pipe_buffer.buffer; @@ -1090,7 +1095,7 @@ retry: pipeunlock(wpipe); goto error1; } - while (wpipe->pipe_state & PIPE_DIRECTW) { + if (wpipe->pipe_state & PIPE_DIRECTW) { if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); @@ -1105,7 +1110,6 @@ retry: else goto retry; } - wpipe->pipe_map.cnt = 0; /* transfer not ready yet */ if (wpipe->pipe_buffer.cnt > 0) { if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; @@ -1122,20 +1126,15 @@ retry: goto retry; } - wpipe->pipe_state |= PIPE_DIRECTW; - - PIPE_UNLOCK(wpipe); error = pipe_build_write_buffer(wpipe, uio); - PIPE_LOCK(wpipe); if (error) { - wpipe->pipe_state &= ~PIPE_DIRECTW; pipeunlock(wpipe); goto error1; } - error = 0; - while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) { + while (wpipe->pipe_map.cnt != 0) { if (wpipe->pipe_state & PIPE_EOF) { + wpipe->pipe_map.cnt = 0; pipe_destroy_write_buffer(wpipe); pipeselwakeup(wpipe); pipeunlock(wpipe); @@ -1152,20 +1151,19 @@ retry: error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, "pipdwt", 0); pipelock(wpipe, 0); + if (error != 0) + break; } if (wpipe->pipe_state & PIPE_EOF) error = EPIPE; - if (wpipe->pipe_state & PIPE_DIRECTW) { - /* - * this bit of trickery substitutes a kernel buffer for - * the process that might be going away. - */ + if (error == EINTR || error == ERESTART) pipe_clone_write_buffer(wpipe); - } else { + else pipe_destroy_write_buffer(wpipe); - } pipeunlock(wpipe); + KASSERT((wpipe->pipe_state & PIPE_DIRECTW) == 0, + ("pipe %p leaked PIPE_DIRECTW", wpipe)); return (error); error1: @@ -1290,7 +1288,7 @@ pipe_write(struct file *fp, struct uio *uio, struct ucred *active_cred, * pipe buffer. We break out if a signal occurs or the * reader goes away. */ - if (wpipe->pipe_state & PIPE_DIRECTW) { + if (wpipe->pipe_map.cnt != 0) { if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); @@ -1586,7 +1584,7 @@ pipe_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred, PIPE_UNLOCK(mpipe); return (0); } - if (mpipe->pipe_state & PIPE_DIRECTW) + if (mpipe->pipe_map.cnt != 0) *(int *)data = mpipe->pipe_map.cnt; else *(int *)data = mpipe->pipe_buffer.cnt; @@ -1663,8 +1661,7 @@ pipe_poll(struct file *fp, int events, struct ucred *active_cred, #else /* __rtems__ */ if (rtems_bsd_libio_flags_to_fflag(fp->f_io.flags) & FREAD && events & (POLLIN | POLLRDNORM)) #endif /* __rtems__ */ - if ((rpipe->pipe_state & PIPE_DIRECTW) || - (rpipe->pipe_buffer.cnt > 0)) + if (rpipe->pipe_map.cnt > 0 || rpipe->pipe_buffer.cnt > 0) revents |= events & (POLLIN | POLLRDNORM); #ifndef __rtems__ @@ -1674,7 +1671,7 @@ pipe_poll(struct file *fp, int events, struct ucred *active_cred, #endif /* __rtems__ */ if (wpipe->pipe_present != PIPE_ACTIVE || (wpipe->pipe_state & PIPE_EOF) || - (((wpipe->pipe_state & PIPE_DIRECTW) == 0) && + ((wpipe->pipe_state & PIPE_DIRECTW) == 0 && ((wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF || wpipe->pipe_buffer.size == 0))) revents |= events & (POLLOUT | POLLWRNORM); @@ -1683,7 +1680,7 @@ pipe_poll(struct file *fp, int events, struct ucred *active_cred, (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM | POLLRDBAND); #ifndef __rtems__ if (rpipe->pipe_state & PIPE_NAMED && fp->f_flag & FREAD && levents && - fp->f_seqcount == rpipe->pipe_wgen) + fp->f_pipegen == rpipe->pipe_wgen) #else /* __rtems__ */ if (rpipe->pipe_state & PIPE_NAMED && rtems_bsd_libio_flags_to_fflag(fp->f_io.flags) & FREAD && levents) #endif /* __rtems__ */ @@ -1792,7 +1789,7 @@ pipe_stat(struct pipe *pipe, struct stat *ub) #endif /* __rtems__ */ ub->st_mode = S_IFIFO; ub->st_blksize = PAGE_SIZE; - if (pipe->pipe_state & PIPE_DIRECTW) + if (pipe->pipe_map.cnt != 0) ub->st_size = pipe->pipe_map.cnt; else ub->st_size = pipe->pipe_buffer.cnt; @@ -2081,7 +2078,7 @@ filt_piperead(struct knote *kn, long hint) PIPE_LOCK_ASSERT(rpipe, MA_OWNED); kn->kn_data = rpipe->pipe_buffer.cnt; - if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW)) + if (kn->kn_data == 0) kn->kn_data = rpipe->pipe_map.cnt; if ((rpipe->pipe_state & PIPE_EOF) || @@ -2099,15 +2096,19 @@ static int filt_pipewrite(struct knote *kn, long hint) { struct pipe *wpipe; - + + /* + * If this end of the pipe is closed, the knote was removed from the + * knlist and the list lock (i.e., the pipe lock) is therefore not held. + */ wpipe = kn->kn_hook; - PIPE_LOCK_ASSERT(wpipe, MA_OWNED); if (wpipe->pipe_present != PIPE_ACTIVE || (wpipe->pipe_state & PIPE_EOF)) { kn->kn_data = 0; kn->kn_flags |= EV_EOF; return (1); } + PIPE_LOCK_ASSERT(wpipe, MA_OWNED); kn->kn_data = (wpipe->pipe_buffer.size > 0) ? (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) : PIPE_BUF; if (wpipe->pipe_state & PIPE_DIRECTW) diff --git a/freebsd/sys/kern/tty.c b/freebsd/sys/kern/tty.c index 5d9c8a57..ee46a44f 100644 --- a/freebsd/sys/kern/tty.c +++ b/freebsd/sys/kern/tty.c @@ -95,7 +95,7 @@ static const char *dev_console_filename; FLUSHO|NOKERNINFO|NOFLSH) #define TTYSUP_CFLAG (CIGNORE|CSIZE|CSTOPB|CREAD|PARENB|PARODD|\ HUPCL|CLOCAL|CCTS_OFLOW|CRTS_IFLOW|CDTR_IFLOW|\ - CDSR_OFLOW|CCAR_OFLOW) + CDSR_OFLOW|CCAR_OFLOW|CNO_RTSDTR) #define TTY_CALLOUT(tp,d) (dev2unit(d) & TTYUNIT_CALLOUT) @@ -336,7 +336,8 @@ ttydev_open(struct cdev *dev, int oflags, int devtype __unused, if (TTY_CALLOUT(tp, dev) || dev == dev_console) tp->t_termios.c_cflag |= CLOCAL; - ttydevsw_modem(tp, SER_DTR|SER_RTS, 0); + if ((tp->t_termios.c_cflag & CNO_RTSDTR) == 0) + ttydevsw_modem(tp, SER_DTR|SER_RTS, 0); error = ttydevsw_open(tp); if (error != 0) @@ -1147,6 +1148,9 @@ tty_rel_free(struct tty *tp) return; } + /* Stop asynchronous I/O. */ + funsetown(&tp->t_sigio); + /* TTY can be deallocated. */ dev = tp->t_dev; tp->t_dev = NULL; diff --git a/freebsd/sys/kern/uipc_mbuf.c b/freebsd/sys/kern/uipc_mbuf.c index 185d14a0..2f1768da 100644 --- a/freebsd/sys/kern/uipc_mbuf.c +++ b/freebsd/sys/kern/uipc_mbuf.c @@ -51,7 +51,11 @@ __FBSDID("$FreeBSD$"); #include <sys/domain.h> #include <sys/protosw.h> #include <sys/uio.h> +#include <sys/vmmeter.h> #include <sys/sdt.h> +#include <vm/vm.h> +#include <vm/vm_pageout.h> +#include <vm/vm_page.h> SDT_PROBE_DEFINE5_XLATE(sdt, , , m__init, "struct mbuf *", "mbufinfo_t *", @@ -204,7 +208,7 @@ mb_dupcl(struct mbuf *n, struct mbuf *m) else bcopy(&m->m_ext, &n->m_ext, m_ext_copylen); n->m_flags |= M_EXT; - n->m_flags |= m->m_flags & M_RDONLY; + n->m_flags |= m->m_flags & (M_RDONLY | M_NOMAP); /* See if this is the mbuf that holds the embedded refcount. */ if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) { @@ -248,7 +252,8 @@ m_demote(struct mbuf *m0, int all, int flags) __func__, m, m0)); if (m->m_flags & M_PKTHDR) m_demote_pkthdr(m); - m->m_flags = m->m_flags & (M_EXT | M_RDONLY | M_NOFREE | flags); + m->m_flags = m->m_flags & (M_EXT | M_RDONLY | M_NOFREE | + M_NOMAP | flags); } } @@ -343,6 +348,9 @@ m_pkthdr_init(struct mbuf *m, int how) #endif m->m_data = m->m_pktdat; bzero(&m->m_pkthdr, sizeof(m->m_pkthdr)); +#ifdef NUMA + m->m_pkthdr.numa_domain = M_NODOM; +#endif #ifdef MAC /* If the label init fails, fail the alloc */ error = mac_mbuf_init(m, how); @@ -375,12 +383,17 @@ m_move_pkthdr(struct mbuf *to, struct mbuf *from) if (to->m_flags & M_PKTHDR) m_tag_delete_chain(to, NULL); #endif - to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT); + to->m_flags = (from->m_flags & M_COPYFLAGS) | + (to->m_flags & (M_EXT | M_NOMAP)); if ((to->m_flags & M_EXT) == 0) to->m_data = to->m_pktdat; to->m_pkthdr = from->m_pkthdr; /* especially tags */ SLIST_INIT(&from->m_pkthdr.tags); /* purge tags from src */ from->m_flags &= ~M_PKTHDR; + if (from->m_pkthdr.csum_flags & CSUM_SND_TAG) { + from->m_pkthdr.csum_flags &= ~CSUM_SND_TAG; + from->m_pkthdr.snd_tag = NULL; + } } /* @@ -409,10 +422,13 @@ m_dup_pkthdr(struct mbuf *to, const struct mbuf *from, int how) if (to->m_flags & M_PKTHDR) m_tag_delete_chain(to, NULL); #endif - to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT); + to->m_flags = (from->m_flags & M_COPYFLAGS) | + (to->m_flags & (M_EXT | M_NOMAP)); if ((to->m_flags & M_EXT) == 0) to->m_data = to->m_pktdat; to->m_pkthdr = from->m_pkthdr; + if (from->m_pkthdr.csum_flags & CSUM_SND_TAG) + m_snd_tag_ref(from->m_pkthdr.snd_tag); SLIST_INIT(&to->m_pkthdr.tags); return (m_tag_copy_chain(to, from, how)); } @@ -572,6 +588,32 @@ nospace: return (NULL); } +#ifndef __rtems__ +static void +m_copyfromunmapped(const struct mbuf *m, int off, int len, caddr_t cp) +{ + struct iovec iov; + struct uio uio; + int error; + + KASSERT(off >= 0, ("m_copyfromunmapped: negative off %d", off)); + KASSERT(len >= 0, ("m_copyfromunmapped: negative len %d", len)); + KASSERT(off < m->m_len, + ("m_copyfromunmapped: len exceeds mbuf length")); + iov.iov_base = cp; + iov.iov_len = len; + uio.uio_resid = len; + uio.uio_iov = &iov; + uio.uio_segflg = UIO_SYSSPACE; + uio.uio_iovcnt = 1; + uio.uio_offset = 0; + uio.uio_rw = UIO_READ; + error = m_unmappedtouio(m, off, &uio, len); + KASSERT(error == 0, ("m_unmappedtouio failed: off %d, len %d", off, + len)); +} +#endif /* __rtems__ */ + /* * Copy data from an mbuf chain starting "off" bytes from the beginning, * continuing for "len" bytes, into the indicated buffer. @@ -593,7 +635,12 @@ m_copydata(const struct mbuf *m, int off, int len, caddr_t cp) while (len > 0) { KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain")); count = min(m->m_len - off, len); - bcopy(mtod(m, caddr_t) + off, cp, count); +#ifndef __rtems__ + if ((m->m_flags & M_NOMAP) != 0) + m_copyfromunmapped(m, off, count, cp); + else +#endif /* __rtems__ */ + bcopy(mtod(m, caddr_t) + off, cp, count); len -= count; cp += count; off = 0; @@ -688,6 +735,7 @@ m_cat(struct mbuf *m, struct mbuf *n) m = m->m_next; while (n) { if (!M_WRITABLE(m) || + (n->m_flags & M_NOMAP) != 0 || M_TRAILINGSPACE(m) < n->m_len) { /* just join the two chains */ m->m_next = n; @@ -805,6 +853,9 @@ m_pullup(struct mbuf *n, int len) int count; int space; + KASSERT((n->m_flags & M_NOMAP) == 0, + ("%s: unmapped mbuf %p", __func__, n)); + /* * If first mbuf has no cluster, and has room for len bytes * without shifting current data, pullup into it, @@ -923,7 +974,12 @@ m_split(struct mbuf *m0, int len0, int wait) return (NULL); n->m_next = m->m_next; m->m_next = NULL; - n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif; + if (m0->m_pkthdr.csum_flags & CSUM_SND_TAG) { + n->m_pkthdr.snd_tag = + m_snd_tag_ref(m0->m_pkthdr.snd_tag); + n->m_pkthdr.csum_flags |= CSUM_SND_TAG; + } else + n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif; n->m_pkthdr.len = m0->m_pkthdr.len - len0; m0->m_pkthdr.len = len0; return (n); @@ -931,7 +987,12 @@ m_split(struct mbuf *m0, int len0, int wait) n = m_gethdr(wait, m0->m_type); if (n == NULL) return (NULL); - n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif; + if (m0->m_pkthdr.csum_flags & CSUM_SND_TAG) { + n->m_pkthdr.snd_tag = + m_snd_tag_ref(m0->m_pkthdr.snd_tag); + n->m_pkthdr.csum_flags |= CSUM_SND_TAG; + } else + n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif; n->m_pkthdr.len = m0->m_pkthdr.len - len0; m0->m_pkthdr.len = len0; if (m->m_flags & M_EXT) @@ -1348,6 +1409,41 @@ nospace: } /* + * Return the number of fragments an mbuf will use. This is usually + * used as a proxy for the number of scatter/gather elements needed by + * a DMA engine to access an mbuf. In general mapped mbufs are + * assumed to be backed by physically contiguous buffers that only + * need a single fragment. Unmapped mbufs, on the other hand, can + * span disjoint physical pages. + */ +static int +frags_per_mbuf(struct mbuf *m) +{ + struct mbuf_ext_pgs *ext_pgs; + int frags; + + if ((m->m_flags & M_NOMAP) == 0) + return (1); + + /* + * The header and trailer are counted as a single fragment + * each when present. + * + * XXX: This overestimates the number of fragments by assuming + * all the backing physical pages are disjoint. + */ + ext_pgs = m->m_ext.ext_pgs; + frags = 0; + if (ext_pgs->hdr_len != 0) + frags++; + frags += ext_pgs->npgs; + if (ext_pgs->trail_len != 0) + frags++; + + return (frags); +} + +/* * Defragment an mbuf chain, returning at most maxfrags separate * mbufs+clusters. If this is not possible NULL is returned and * the original mbuf chain is left in its present (potentially @@ -1367,7 +1463,7 @@ m_collapse(struct mbuf *m0, int how, int maxfrags) */ curfrags = 0; for (m = m0; m != NULL; m = m->m_next) - curfrags++; + curfrags += frags_per_mbuf(m); /* * First, try to collapse mbufs. Note that we always collapse * towards the front so we don't need to deal with moving the @@ -1382,12 +1478,13 @@ again: break; if (M_WRITABLE(m) && n->m_len < M_TRAILINGSPACE(m)) { - bcopy(mtod(n, void *), mtod(m, char *) + m->m_len, - n->m_len); + m_copydata(n, 0, n->m_len, + mtod(m, char *) + m->m_len); m->m_len += n->m_len; m->m_next = n->m_next; + curfrags -= frags_per_mbuf(n); m_free(n); - if (--curfrags <= maxfrags) + if (curfrags <= maxfrags) return m0; } else m = n; @@ -1404,15 +1501,18 @@ again: m = m_getcl(how, MT_DATA, 0); if (m == NULL) goto bad; - bcopy(mtod(n, void *), mtod(m, void *), n->m_len); - bcopy(mtod(n2, void *), mtod(m, char *) + n->m_len, - n2->m_len); + m_copydata(n, 0, n->m_len, mtod(m, char *)); + m_copydata(n2, 0, n2->m_len, + mtod(m, char *) + n->m_len); m->m_len = n->m_len + n2->m_len; m->m_next = n2->m_next; *prev = m; + curfrags += 1; /* For the new cluster */ + curfrags -= frags_per_mbuf(n); + curfrags -= frags_per_mbuf(n2); m_free(n); m_free(n2); - if (--curfrags <= maxfrags) /* +1 cl -2 mbufs */ + if (curfrags <= maxfrags) return m0; /* * Still not there, try the normal collapse @@ -1512,6 +1612,100 @@ nospace: #endif +#ifndef __rtems__ +/* + * Free pages from mbuf_ext_pgs, assuming they were allocated via + * vm_page_alloc() and aren't associated with any object. Complement + * to allocator from m_uiotombuf_nomap(). + */ +void +mb_free_mext_pgs(struct mbuf *m) +{ + struct mbuf_ext_pgs *ext_pgs; + vm_page_t pg; + + MBUF_EXT_PGS_ASSERT(m); + ext_pgs = m->m_ext.ext_pgs; + for (int i = 0; i < ext_pgs->npgs; i++) { + pg = PHYS_TO_VM_PAGE(ext_pgs->pa[i]); + vm_page_unwire_noq(pg); + vm_page_free(pg); + } +} + +static struct mbuf * +m_uiotombuf_nomap(struct uio *uio, int how, int len, int maxseg, int flags) +{ + struct mbuf *m, *mb, *prev; + struct mbuf_ext_pgs *pgs; + vm_page_t pg_array[MBUF_PEXT_MAX_PGS]; + int error, length, i, needed; + ssize_t total; + int pflags = malloc2vm_flags(how) | VM_ALLOC_NOOBJ | VM_ALLOC_NODUMP | + VM_ALLOC_WIRED; + + /* + * len can be zero or an arbitrary large value bound by + * the total data supplied by the uio. + */ + if (len > 0) + total = MIN(uio->uio_resid, len); + else + total = uio->uio_resid; + + if (maxseg == 0) + maxseg = MBUF_PEXT_MAX_PGS * PAGE_SIZE; + + /* + * Allocate the pages + */ + m = NULL; + while (total > 0) { + mb = mb_alloc_ext_pgs(how, (flags & M_PKTHDR), + mb_free_mext_pgs); + if (mb == NULL) + goto failed; + if (m == NULL) + m = mb; + else + prev->m_next = mb; + prev = mb; + pgs = mb->m_ext.ext_pgs; + needed = length = MIN(maxseg, total); + for (i = 0; needed > 0; i++, needed -= PAGE_SIZE) { +retry_page: + pg_array[i] = vm_page_alloc(NULL, 0, pflags); + if (pg_array[i] == NULL) { + if (how & M_NOWAIT) { + goto failed; + } else { + vm_wait(NULL); + goto retry_page; + } + } + pg_array[i]->flags &= ~PG_ZERO; + pgs->pa[i] = VM_PAGE_TO_PHYS(pg_array[i]); + pgs->npgs++; + } + pgs->last_pg_len = length - PAGE_SIZE * (pgs->npgs - 1); + MBUF_EXT_PGS_ASSERT_SANITY(pgs); + total -= length; + error = uiomove_fromphys(pg_array, 0, length, uio); + if (error != 0) + goto failed; + mb->m_len = length; + mb->m_ext.ext_size += PAGE_SIZE * pgs->npgs; + if (flags & M_PKTHDR) + m->m_pkthdr.len += length; + } + return (m); + +failed: + m_freem(m); + return (NULL); +} +#endif /* __rtems__ */ + /* * Copy the contents of uio into a properly sized mbuf chain. */ @@ -1523,6 +1717,11 @@ m_uiotombuf(struct uio *uio, int how, int len, int align, int flags) ssize_t total; int progress = 0; +#ifndef __rtems__ + if (flags & M_NOMAP) + return (m_uiotombuf_nomap(uio, how, len, align, flags)); +#endif /* __rtems__ */ + /* * len can be zero or an arbitrary large value bound by * the total data supplied by the uio. @@ -1569,6 +1768,62 @@ m_uiotombuf(struct uio *uio, int how, int len, int align, int flags) } /* + * Copy data from an unmapped mbuf into a uio limited by len if set. + */ +int +m_unmappedtouio(const struct mbuf *m, int m_off, struct uio *uio, int len) +{ + struct mbuf_ext_pgs *ext_pgs; + vm_page_t pg; + int error, i, off, pglen, pgoff, seglen, segoff; + + MBUF_EXT_PGS_ASSERT(m); + ext_pgs = m->m_ext.ext_pgs; + error = 0; + + /* Skip over any data removed from the front. */ + off = mtod(m, vm_offset_t); + + off += m_off; + if (ext_pgs->hdr_len != 0) { + if (off >= ext_pgs->hdr_len) { + off -= ext_pgs->hdr_len; + } else { + seglen = ext_pgs->hdr_len - off; + segoff = off; + seglen = min(seglen, len); + off = 0; + len -= seglen; + error = uiomove(&ext_pgs->hdr[segoff], seglen, uio); + } + } + pgoff = ext_pgs->first_pg_off; + for (i = 0; i < ext_pgs->npgs && error == 0 && len > 0; i++) { + pglen = mbuf_ext_pg_len(ext_pgs, i, pgoff); + if (off >= pglen) { + off -= pglen; + pgoff = 0; + continue; + } + seglen = pglen - off; + segoff = pgoff + off; + off = 0; + seglen = min(seglen, len); + len -= seglen; + pg = PHYS_TO_VM_PAGE(ext_pgs->pa[i]); + error = uiomove_fromphys(&pg, segoff, seglen, uio); + pgoff = 0; + }; + if (len != 0 && error == 0) { + KASSERT((off + len) <= ext_pgs->trail_len, + ("off + len > trail (%d + %d > %d, m_off = %d)", off, len, + ext_pgs->trail_len, m_off)); + error = uiomove(&ext_pgs->trail[off], len, uio); + } + return (error); +} + +/* * Copy an mbuf chain into a uio limited by len if set. */ int @@ -1586,7 +1841,12 @@ m_mbuftouio(struct uio *uio, const struct mbuf *m, int len) for (; m != NULL; m = m->m_next) { length = min(m->m_len, total - progress); - error = uiomove(mtod(m, void *), length, uio); +#ifndef __rtems__ + if ((m->m_flags & M_NOMAP) != 0) + error = m_unmappedtouio(m, 0, uio, length); + else +#endif /* __rtems__ */ + error = uiomove(mtod(m, void *), length, uio); if (error) return (error); diff --git a/freebsd/sys/kern/uipc_mbuf2.c b/freebsd/sys/kern/uipc_mbuf2.c index 7dd2840c..6f98b0a2 100644 --- a/freebsd/sys/kern/uipc_mbuf2.c +++ b/freebsd/sys/kern/uipc_mbuf2.c @@ -218,7 +218,7 @@ m_pulldown(struct mbuf *m, int off, int len, int *offp) goto ok; } if ((off == 0 || offp) && M_LEADINGSPACE(n->m_next) >= hlen - && writable) { + && writable && n->m_next->m_len >= tlen) { n->m_next->m_data -= hlen; n->m_next->m_len += hlen; bcopy(mtod(n, caddr_t) + off, mtod(n->m_next, caddr_t), hlen); diff --git a/freebsd/sys/kern/uipc_sockbuf.c b/freebsd/sys/kern/uipc_sockbuf.c index 0830206a..2305b333 100644 --- a/freebsd/sys/kern/uipc_sockbuf.c +++ b/freebsd/sys/kern/uipc_sockbuf.c @@ -36,11 +36,13 @@ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); +#include <rtems/bsd/local/opt_kern_tls.h> #include <rtems/bsd/local/opt_param.h> #include <sys/param.h> #include <sys/aio.h> /* for aio_swake proto */ #include <sys/kernel.h> +#include <sys/ktls.h> #include <sys/lock.h> #include <sys/malloc.h> #include <sys/mbuf.h> @@ -91,28 +93,135 @@ sbm_clrprotoflags(struct mbuf *m, int flags) } /* - * Mark ready "count" mbufs starting with "m". + * Compress M_NOTREADY mbufs after they have been readied by sbready(). + * + * sbcompress() skips M_NOTREADY mbufs since the data is not available to + * be copied at the time of sbcompress(). This function combines small + * mbufs similar to sbcompress() once mbufs are ready. 'm0' is the first + * mbuf sbready() marked ready, and 'end' is the first mbuf still not + * ready. + */ +static void +sbready_compress(struct sockbuf *sb, struct mbuf *m0, struct mbuf *end) +{ + struct mbuf *m, *n; + int ext_size; + + SOCKBUF_LOCK_ASSERT(sb); + + if ((sb->sb_flags & SB_NOCOALESCE) != 0) + return; + + for (m = m0; m != end; m = m->m_next) { + MPASS((m->m_flags & M_NOTREADY) == 0); + + /* Compress small unmapped mbufs into plain mbufs. */ + if ((m->m_flags & M_NOMAP) && m->m_len <= MLEN && + !mbuf_has_tls_session(m)) { + MPASS(m->m_flags & M_EXT); + ext_size = m->m_ext.ext_size; + if (mb_unmapped_compress(m) == 0) { + sb->sb_mbcnt -= ext_size; + sb->sb_ccnt -= 1; + } + } + + /* + * NB: In sbcompress(), 'n' is the last mbuf in the + * socket buffer and 'm' is the new mbuf being copied + * into the trailing space of 'n'. Here, the roles + * are reversed and 'n' is the next mbuf after 'm' + * that is being copied into the trailing space of + * 'm'. + */ + n = m->m_next; + while ((n != NULL) && (n != end) && (m->m_flags & M_EOR) == 0 && + M_WRITABLE(m) && + (m->m_flags & M_NOMAP) == 0 && + !mbuf_has_tls_session(n) && + !mbuf_has_tls_session(m) && + n->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */ + n->m_len <= M_TRAILINGSPACE(m) && + m->m_type == n->m_type) { + KASSERT(sb->sb_lastrecord != n, + ("%s: merging start of record (%p) into previous mbuf (%p)", + __func__, n, m)); + m_copydata(n, 0, n->m_len, mtodo(m, m->m_len)); + m->m_len += n->m_len; + m->m_next = n->m_next; + m->m_flags |= n->m_flags & M_EOR; + if (sb->sb_mbtail == n) + sb->sb_mbtail = m; + + sb->sb_mbcnt -= MSIZE; + sb->sb_mcnt -= 1; + if (n->m_flags & M_EXT) { + sb->sb_mbcnt -= n->m_ext.ext_size; + sb->sb_ccnt -= 1; + } + m_free(n); + n = m->m_next; + } + } + SBLASTRECORDCHK(sb); + SBLASTMBUFCHK(sb); +} + +/* + * Mark ready "count" units of I/O starting with "m". Most mbufs + * count as a single unit of I/O except for EXT_PGS-backed mbufs which + * can be backed by multiple pages. */ int -sbready(struct sockbuf *sb, struct mbuf *m, int count) +sbready(struct sockbuf *sb, struct mbuf *m0, int count) { + struct mbuf *m; u_int blocker; SOCKBUF_LOCK_ASSERT(sb); KASSERT(sb->sb_fnrdy != NULL, ("%s: sb %p NULL fnrdy", __func__, sb)); + KASSERT(count > 0, ("%s: invalid count %d", __func__, count)); + m = m0; blocker = (sb->sb_fnrdy == m) ? M_BLOCKED : 0; - for (int i = 0; i < count; i++, m = m->m_next) { + while (count > 0) { KASSERT(m->m_flags & M_NOTREADY, ("%s: m %p !M_NOTREADY", __func__, m)); +#ifndef __rtems__ + if ((m->m_flags & M_EXT) != 0 && + m->m_ext.ext_type == EXT_PGS) { + if (count < m->m_ext.ext_pgs->nrdy) { + m->m_ext.ext_pgs->nrdy -= count; + count = 0; + break; + } + count -= m->m_ext.ext_pgs->nrdy; + m->m_ext.ext_pgs->nrdy = 0; + } else +#endif /* __rtems__ */ + count--; + m->m_flags &= ~(M_NOTREADY | blocker); if (blocker) sb->sb_acc += m->m_len; + m = m->m_next; + } + + /* + * If the first mbuf is still not fully ready because only + * some of its backing pages were readied, no further progress + * can be made. + */ + if (m0 == m) { + MPASS(m->m_flags & M_NOTREADY); + return (EINPROGRESS); } - if (!blocker) + if (!blocker) { + sbready_compress(sb, m0, m); return (EINPROGRESS); + } /* This one was blocking all the queue. */ for (; m && (m->m_flags & M_NOTREADY) == 0; m = m->m_next) { @@ -123,6 +232,7 @@ sbready(struct sockbuf *sb, struct mbuf *m, int count) } sb->sb_fnrdy = m; + sbready_compress(sb, m0, m); return (0); } @@ -571,6 +681,11 @@ sbdestroy(struct sockbuf *sb, struct socket *so) { sbrelease_internal(sb, so); +#ifdef KERN_TLS + if (sb->sb_tls_info != NULL) + ktls_free(sb->sb_tls_info); + sb->sb_tls_info = NULL; +#endif } /* @@ -734,6 +849,11 @@ sbappendstream_locked(struct sockbuf *sb, struct mbuf *m, int flags) SBLASTMBUFCHK(sb); +#ifdef KERN_TLS + if (sb->sb_tls_info != NULL) + ktls_seq(sb, m); +#endif + /* Remove all packet headers and mbuf tags to get a pure data chain. */ m_demote(m, 1, flags & PRUS_NOTREADY ? M_NOTREADY : 0); @@ -1036,12 +1156,13 @@ sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n) M_WRITABLE(n) && ((sb->sb_flags & SB_NOCOALESCE) == 0) && !(m->m_flags & M_NOTREADY) && - !(n->m_flags & M_NOTREADY) && + !(n->m_flags & (M_NOTREADY | M_NOMAP)) && + !mbuf_has_tls_session(m) && + !mbuf_has_tls_session(n) && m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */ m->m_len <= M_TRAILINGSPACE(n) && n->m_type == m->m_type) { - bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len, - (unsigned)m->m_len); + m_copydata(m, 0, m->m_len, mtodo(n, n->m_len)); n->m_len += m->m_len; sb->sb_ccc += m->m_len; if (sb->sb_fnrdy == NULL) @@ -1052,6 +1173,10 @@ sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n) m = m_free(m); continue; } + if (m->m_len <= MLEN && (m->m_flags & M_NOMAP) && + (m->m_flags & M_NOTREADY) == 0 && + !mbuf_has_tls_session(m)) + (void)mb_unmapped_compress(m); if (n) n->m_next = m; else diff --git a/freebsd/sys/kern/uipc_socket.c b/freebsd/sys/kern/uipc_socket.c index 380c97dd..c01535c4 100644 --- a/freebsd/sys/kern/uipc_socket.c +++ b/freebsd/sys/kern/uipc_socket.c @@ -109,6 +109,7 @@ __FBSDID("$FreeBSD$"); #include <rtems/bsd/local/opt_inet.h> #include <rtems/bsd/local/opt_inet6.h> +#include <rtems/bsd/local/opt_kern_tls.h> #include <rtems/bsd/local/opt_sctp.h> #include <sys/param.h> @@ -125,6 +126,7 @@ __FBSDID("$FreeBSD$"); #include <sys/hhook.h> #include <sys/kernel.h> #include <sys/khelp.h> +#include <sys/ktls.h> #include <sys/event.h> #include <sys/eventhandler.h> #include <sys/poll.h> @@ -143,6 +145,7 @@ __FBSDID("$FreeBSD$"); #include <sys/jail.h> #include <sys/syslog.h> #include <netinet/in.h> +#include <netinet/tcp.h> #include <net/vnet.h> @@ -911,6 +914,8 @@ solisten_wakeup(struct socket *sol) } SOLISTEN_UNLOCK(sol); wakeup_one(&sol->sol_comp); + if ((sol->so_state & SS_ASYNC) && sol->so_sigio != NULL) + pgsigio(&sol->so_sigio, SIGIO, 0); } /* @@ -1067,7 +1072,7 @@ sofree(struct socket *so) * * We used to do a lot of socket buffer and socket locking here, as * well as invoke sorflush() and perform wakeups. The direct call to - * dom_dispose() and sbrelease_internal() are an inlining of what was + * dom_dispose() and sbdestroy() are an inlining of what was * necessary from sorflush(). * * Notice that the socket buffer and kqueue state are torn down @@ -1154,9 +1159,9 @@ drop: so->so_state |= SS_NOFDREF; sorele(so); if (listening) { - struct socket *sp; + struct socket *sp, *tsp; - TAILQ_FOREACH(sp, &lqueue, so_list) { + TAILQ_FOREACH_SAFE(sp, &lqueue, so_list, tsp) { SOCK_LOCK(sp); if (sp->so_count == 0) { SOCK_UNLOCK(sp); @@ -1197,7 +1202,6 @@ soabort(struct socket *so) KASSERT(so->so_count == 0, ("soabort: so_count")); KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF")); KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF")); - KASSERT(so->so_qstate == SQ_NONE, ("soabort: !SQ_NONE")); VNET_SO_ASSERT(so); if (so->so_proto->pr_usrreqs->pru_abort != NULL) @@ -1468,7 +1472,15 @@ sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio, ssize_t resid; int clen = 0, error, dontroute; int atomic = sosendallatonce(so) || top; - + int pru_flag; +#ifdef KERN_TLS + struct ktls_session *tls; + int tls_enq_cnt, tls_pruflag; + uint8_t tls_rtype; + + tls = NULL; + tls_rtype = TLS_RLTYPE_APP; +#endif if (uio != NULL) resid = uio->uio_resid; else @@ -1502,6 +1514,28 @@ sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio, if (error) goto out; +#ifdef KERN_TLS + tls_pruflag = 0; + tls = ktls_hold(so->so_snd.sb_tls_info); + if (tls != NULL) { + if (tls->sw_encrypt != NULL) + tls_pruflag = PRUS_NOTREADY; + + if (control != NULL) { + struct cmsghdr *cm = mtod(control, struct cmsghdr *); + + if (clen >= sizeof(*cm) && + cm->cmsg_type == TLS_SET_RECORD_TYPE) { + tls_rtype = *((uint8_t *)CMSG_DATA(cm)); + clen = 0; + m_freem(control); + control = NULL; + atomic = 1; + } + } + } +#endif + restart: do { SOCKBUF_LOCK(&so->so_snd); @@ -1551,7 +1585,8 @@ restart: } if (space < resid + clen && (atomic || space < so->so_snd.sb_lowat || space < clen)) { - if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) { + if ((so->so_state & SS_NBIO) || + (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) { SOCKBUF_UNLOCK(&so->so_snd); error = EWOULDBLOCK; goto release; @@ -1578,10 +1613,27 @@ restart: * is a workaround to prevent protocol send * methods to panic. */ - top = m_uiotombuf(uio, M_WAITOK, space, - (atomic ? max_hdr : 0), - (atomic ? M_PKTHDR : 0) | - ((flags & MSG_EOR) ? M_EOR : 0)); +#ifdef KERN_TLS + if (tls != NULL) { + top = m_uiotombuf(uio, M_WAITOK, space, + tls->params.max_frame_len, + M_NOMAP | + ((flags & MSG_EOR) ? M_EOR : 0)); + if (top != NULL) { + error = ktls_frame(top, tls, + &tls_enq_cnt, tls_rtype); + if (error) { + m_freem(top); + goto release; + } + } + tls_rtype = TLS_RLTYPE_APP; + } else +#endif + top = m_uiotombuf(uio, M_WAITOK, space, + (atomic ? max_hdr : 0), + (atomic ? M_PKTHDR : 0) | + ((flags & MSG_EOR) ? M_EOR : 0)); if (top == NULL) { error = EFAULT; /* only possible error */ goto release; @@ -1605,8 +1657,8 @@ restart: * this. */ VNET_SO_ASSERT(so); - error = (*so->so_proto->pr_usrreqs->pru_send)(so, - (flags & MSG_OOB) ? PRUS_OOB : + + pru_flag = (flags & MSG_OOB) ? PRUS_OOB : /* * If the user set MSG_EOF, the protocol understands * this flag and nothing left to send then use @@ -1618,13 +1670,37 @@ restart: PRUS_EOF : /* If there is more to send set PRUS_MORETOCOME. */ (flags & MSG_MORETOCOME) || - (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0, - top, addr, control, td); + (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0; + +#ifdef KERN_TLS + pru_flag |= tls_pruflag; +#endif + + error = (*so->so_proto->pr_usrreqs->pru_send)(so, + pru_flag, top, addr, control, td); + if (dontroute) { SOCK_LOCK(so); so->so_options &= ~SO_DONTROUTE; SOCK_UNLOCK(so); } + +#ifdef KERN_TLS + if (tls != NULL && tls->sw_encrypt != NULL) { + /* + * Note that error is intentionally + * ignored. + * + * Like sendfile(), we rely on the + * completion routine (pru_ready()) + * to free the mbufs in the event that + * pru_send() encountered an error and + * did not append them to the sockbuf. + */ + soref(so); + ktls_enqueue(top, so, tls_enq_cnt); + } +#endif clen = 0; control = NULL; top = NULL; @@ -1636,6 +1712,10 @@ restart: release: sbunlock(&so->so_snd); out: +#ifdef KERN_TLS + if (tls != NULL) + ktls_free(tls); +#endif if (top != NULL) m_freem(top); if (control != NULL) @@ -2011,7 +2091,13 @@ dontblock: SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); - error = uiomove(mtod(m, char *) + moff, (int)len, uio); +#ifndef __rtems__ + if ((m->m_flags & M_NOMAP) != 0) + error = m_unmappedtouio(m, moff, uio, (int)len); + else +#endif /* __rtems__ */ + error = uiomove(mtod(m, char *) + moff, + (int)len, uio); SOCKBUF_LOCK(&so->so_rcv); if (error) { /* @@ -2225,7 +2311,7 @@ soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio, /* Prevent other readers from entering the socket. */ error = sblock(sb, SBLOCKWAIT(flags)); if (error) - goto out; + return (error); SOCKBUF_LOCK(sb); /* Easy one, no space to copyout anything. */ @@ -2793,12 +2879,10 @@ sosetopt(struct socket *so, struct sockopt *sopt) CURVNET_SET(so->so_vnet); error = 0; if (sopt->sopt_level != SOL_SOCKET) { - if (so->so_proto->pr_ctloutput != NULL) { + if (so->so_proto->pr_ctloutput != NULL) error = (*so->so_proto->pr_ctloutput)(so, sopt); - CURVNET_RESTORE(); - return (error); - } - error = ENOPROTOOPT; + else + error = ENOPROTOOPT; } else { switch (sopt->sopt_name) { case SO_ACCEPTFILTER: @@ -2811,7 +2895,12 @@ sosetopt(struct socket *so, struct sockopt *sopt) error = sooptcopyin(sopt, &l, sizeof l, sizeof l); if (error) goto bad; - + if (l.l_linger < 0 || + l.l_linger > USHRT_MAX || + l.l_linger > (INT_MAX / hz)) { + error = EDOM; + goto bad; + } SOCK_LOCK(so); so->so_linger = l.l_linger; if (l.l_onoff) @@ -4162,6 +4251,9 @@ void so_linger_set(struct socket *so, int val) { + KASSERT(val >= 0 && val <= USHRT_MAX && val <= (INT_MAX / hz), + ("%s: val %d out of range", __func__, val)); + so->so_linger = val; } diff --git a/freebsd/sys/kern/uipc_syscalls.c b/freebsd/sys/kern/uipc_syscalls.c index 529268a9..39e96abe 100644 --- a/freebsd/sys/kern/uipc_syscalls.c +++ b/freebsd/sys/kern/uipc_syscalls.c @@ -59,6 +59,7 @@ __FBSDID("$FreeBSD$"); #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/syscallsubr.h> +#include <sys/sysent.h> #include <sys/uio.h> #include <sys/un.h> #include <sys/unpcb.h> @@ -451,7 +452,8 @@ accept1(td, s, uname, anamelen, flags) if (error == 0 && uname != NULL) { #ifdef COMPAT_OLDSOCK - if (flags & ACCEPT4_COMPAT) + if (SV_PROC_FLAG(td->td_proc, SV_AOUT) && + (flags & ACCEPT4_COMPAT) != 0) ((struct osockaddr *)name)->sa_family = name->sa_family; #endif @@ -968,7 +970,8 @@ sendit(struct thread *td, int s, struct msghdr *mp, int flags) if (mp->msg_control) { if (mp->msg_controllen < sizeof(struct cmsghdr) #ifdef COMPAT_OLDSOCK - && mp->msg_flags != MSG_COMPAT + && (mp->msg_flags != MSG_COMPAT || + !SV_PROC_FLAG(td->td_proc, SV_AOUT)) #endif ) { error = EINVAL; @@ -979,7 +982,8 @@ sendit(struct thread *td, int s, struct msghdr *mp, int flags) if (error != 0) goto bad; #ifdef COMPAT_OLDSOCK - if (mp->msg_flags == MSG_COMPAT) { + if (mp->msg_flags == MSG_COMPAT && + SV_PROC_FLAG(td->td_proc, SV_AOUT)) { struct cmsghdr *cm; M_PREPEND(control, sizeof(*cm), M_WAITOK); @@ -1120,7 +1124,8 @@ sys_sendto(struct thread *td, struct sendto_args *uap) msg.msg_iovlen = 1; msg.msg_control = 0; #ifdef COMPAT_OLDSOCK - msg.msg_flags = 0; + if (SV_PROC_FLAG(td->td_proc, SV_AOUT)) + msg.msg_flags = 0; #endif aiov.iov_base = __DECONST(void *, uap->buf); aiov.iov_len = uap->len; @@ -1239,7 +1244,8 @@ sys_sendmsg(struct thread *td, struct sendmsg_args *uap) return (error); msg.msg_iov = iov; #ifdef COMPAT_OLDSOCK - msg.msg_flags = 0; + if (SV_PROC_FLAG(td->td_proc, SV_AOUT)) + msg.msg_flags = 0; #endif error = sendit(td, uap->s, &msg, uap->flags); free(iov, M_IOV); @@ -1356,7 +1362,8 @@ kern_recvit(struct thread *td, int s, struct msghdr *mp, enum uio_seg fromseg, /* save sa_len before it is destroyed by MSG_COMPAT */ len = MIN(len, fromsa->sa_len); #ifdef COMPAT_OLDSOCK - if (mp->msg_flags & MSG_COMPAT) + if ((mp->msg_flags & MSG_COMPAT) != 0 && + SV_PROC_FLAG(td->td_proc, SV_AOUT)) ((struct osockaddr *)fromsa)->sa_family = fromsa->sa_family; #endif @@ -1379,7 +1386,8 @@ kern_recvit(struct thread *td, int s, struct msghdr *mp, enum uio_seg fromseg, * If we receive rights, trim the cmsghdr; anything else * is tossed. */ - if (control && mp->msg_flags & MSG_COMPAT) { + if (control && (mp->msg_flags & MSG_COMPAT) != 0 && + SV_PROC_FLAG(td->td_proc, SV_AOUT)) { if (mtod(control, struct cmsghdr *)->cmsg_level != SOL_SOCKET || mtod(control, struct cmsghdr *)->cmsg_type != @@ -1438,7 +1446,8 @@ recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp) if (namelenp != NULL) { error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t)); #ifdef COMPAT_OLDSOCK - if (mp->msg_flags & MSG_COMPAT) + if ((mp->msg_flags & MSG_COMPAT) != 0 && + SV_PROC_FLAG(td->td_proc, SV_AOUT)) error = 0; /* old recvfrom didn't check */ #endif } @@ -1581,7 +1590,8 @@ sys_recvmsg(struct thread *td, struct recvmsg_args *uap) return (error); msg.msg_flags = uap->flags; #ifdef COMPAT_OLDSOCK - msg.msg_flags &= ~MSG_COMPAT; + if (SV_PROC_FLAG(td->td_proc, SV_AOUT)) + msg.msg_flags &= ~MSG_COMPAT; #endif uiov = msg.msg_iov; msg.msg_iov = iov; @@ -1863,7 +1873,7 @@ getsockname1(struct thread *td, struct getsockname_args *uap, int compat) if (len != 0) { #ifdef COMPAT_OLDSOCK - if (compat) + if (compat && SV_PROC_FLAG(td->td_proc, SV_AOUT)) ((struct osockaddr *)sa)->sa_family = sa->sa_family; #endif error = copyout(sa, uap->asa, (u_int)len); @@ -1978,7 +1988,7 @@ getpeername1(struct thread *td, struct getpeername_args *uap, int compat) if (len != 0) { #ifdef COMPAT_OLDSOCK - if (compat) + if (compat && SV_PROC_FLAG(td->td_proc, SV_AOUT)) ((struct osockaddr *)sa)->sa_family = sa->sa_family; #endif error = copyout(sa, uap->asa, (u_int)len); @@ -2083,7 +2093,8 @@ sockargs(struct mbuf **mp, char *buf, socklen_t buflen, int type) if (buflen > MLEN) { #ifdef COMPAT_OLDSOCK - if (type == MT_SONAME && buflen <= 112) + if (type == MT_SONAME && buflen <= 112 && + SV_CURPROC_FLAG(SV_AOUT)) buflen = MLEN; /* unix domain compat. hack */ else #endif @@ -2101,7 +2112,8 @@ sockargs(struct mbuf **mp, char *buf, socklen_t buflen, int type) sa = mtod(m, struct sockaddr *); #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN - if (sa->sa_family == 0 && sa->sa_len < AF_MAX) + if (sa->sa_family == 0 && sa->sa_len < AF_MAX && + SV_CURPROC_FLAG(SV_AOUT)) sa->sa_family = sa->sa_len; #endif sa->sa_len = buflen; @@ -2129,7 +2141,8 @@ getsockaddr(struct sockaddr **namp, const struct sockaddr *uaddr, size_t len) free(sa, M_SONAME); } else { #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN - if (sa->sa_family == 0 && sa->sa_len < AF_MAX) + if (sa->sa_family == 0 && sa->sa_len < AF_MAX && + SV_CURPROC_FLAG(SV_AOUT)) sa->sa_family = sa->sa_len; #endif sa->sa_len = len; @@ -2180,8 +2193,10 @@ m_dispose_extcontrolm(struct mbuf *m) fd = *fds++; error = fget(td, fd, &cap_no_rights, &fp); - if (error == 0) + if (error == 0) { fdclose(td, fp, fd); + fdrop(fp, td); + } } } clen -= datalen; diff --git a/freebsd/sys/kern/uipc_usrreq.c b/freebsd/sys/kern/uipc_usrreq.c index 6b34dcb8..39f28b4b 100644 --- a/freebsd/sys/kern/uipc_usrreq.c +++ b/freebsd/sys/kern/uipc_usrreq.c @@ -1032,7 +1032,7 @@ uipc_listen(struct socket *so, int backlog, struct thread *td) SOCK_LOCK(so); error = solisten_proto_check(so); if (error == 0) { - cru2x(td->td_ucred, &unp->unp_peercred); + cru2xt(td, &unp->unp_peercred); solisten_proto(so, backlog); } SOCK_UNLOCK(so); @@ -1837,7 +1837,7 @@ void unp_copy_peercred(struct thread *td, struct unpcb *client_unp, struct unpcb *server_unp, struct unpcb *listen_unp) { - cru2x(td->td_ucred, &client_unp->unp_peercred); + cru2xt(td, &client_unp->unp_peercred); client_unp->unp_flags |= UNP_HAVEPC; memcpy(&server_unp->unp_peercred, &listen_unp->unp_peercred, @@ -2306,30 +2306,53 @@ unp_init(void) } #ifndef __rtems__ +static void +unp_internalize_cleanup_rights(struct mbuf *control) +{ + struct cmsghdr *cp; + struct mbuf *m; + void *data; + socklen_t datalen; + + for (m = control; m != NULL; m = m->m_next) { + cp = mtod(m, struct cmsghdr *); + if (cp->cmsg_level != SOL_SOCKET || + cp->cmsg_type != SCM_RIGHTS) + continue; + data = CMSG_DATA(cp); + datalen = (caddr_t)cp + cp->cmsg_len - (caddr_t)data; + unp_freerights(data, datalen / sizeof(struct filedesc *)); + } +} + static int unp_internalize(struct mbuf **controlp, struct thread *td) { - struct mbuf *control = *controlp; - struct proc *p = td->td_proc; - struct filedesc *fdesc = p->p_fd; + struct mbuf *control, **initial_controlp; + struct proc *p; + struct filedesc *fdesc; struct bintime *bt; - struct cmsghdr *cm = mtod(control, struct cmsghdr *); + struct cmsghdr *cm; struct cmsgcred *cmcred; struct filedescent *fde, **fdep, *fdev; struct file *fp; struct timeval *tv; struct timespec *ts; - int i, *fdp; void *data; - socklen_t clen = control->m_len, datalen; - int error, oldfds; + socklen_t clen, datalen; + int i, j, error, *fdp, oldfds; u_int newlen; UNP_LINK_UNLOCK_ASSERT(); + p = td->td_proc; + fdesc = p->p_fd; error = 0; + control = *controlp; + clen = control->m_len; *controlp = NULL; - while (cm != NULL) { + initial_controlp = controlp; + for (cm = mtod(control, struct cmsghdr *); cm != NULL;) { if (sizeof(*cm) > clen || cm->cmsg_level != SOL_SOCKET || cm->cmsg_len > clen || cm->cmsg_len < sizeof(*cm)) { error = EINVAL; @@ -2400,6 +2423,19 @@ unp_internalize(struct mbuf **controlp, struct thread *td) goto out; } fdp = data; + for (i = 0; i < oldfds; i++, fdp++) { + if (!fhold(fdesc->fd_ofiles[*fdp].fde_file)) { + fdp = data; + for (j = 0; j < i; j++, fdp++) { + fdrop(fdesc->fd_ofiles[*fdp]. + fde_file, td); + } + FILEDESC_SUNLOCK(fdesc); + error = EBADF; + goto out; + } + } + fdp = data; fdep = (struct filedescent **) CMSG_DATA(mtod(*controlp, struct cmsghdr *)); fdev = malloc(sizeof(*fdev) * oldfds, M_FILECAPS, @@ -2480,6 +2516,8 @@ unp_internalize(struct mbuf **controlp, struct thread *td) } out: + if (error != 0 && initial_controlp != NULL) + unp_internalize_cleanup_rights(*initial_controlp); m_freem(control); return (error); } @@ -2601,7 +2639,6 @@ unp_internalize_fp(struct file *fp) unp->unp_file = fp; unp->unp_msgcount++; } - fhold(fp); unp_rights++; UNP_LINK_WUNLOCK(); } @@ -2762,10 +2799,10 @@ unp_gc(__unused void *arg, int pending) if ((unp->unp_gcflag & UNPGC_DEAD) != 0) { f = unp->unp_file; if (unp->unp_msgcount == 0 || f == NULL || - f->f_count != unp->unp_msgcount) + f->f_count != unp->unp_msgcount || + !fhold(f)) continue; unref[total++] = f; - fhold(f); KASSERT(total <= unp_unreachable, ("unp_gc: incorrect unreachable count.")); } @@ -2942,8 +2979,8 @@ db_print_xucred(int indent, struct xucred *xu) int comma, i; db_print_indent(indent); - db_printf("cr_version: %u cr_uid: %u cr_ngroups: %d\n", - xu->cr_version, xu->cr_uid, xu->cr_ngroups); + db_printf("cr_version: %u cr_uid: %u cr_pid: %d cr_ngroups: %d\n", + xu->cr_version, xu->cr_uid, xu->cr_pid, xu->cr_ngroups); db_print_indent(indent); db_printf("cr_groups: "); comma = 0; diff --git a/freebsd/sys/libkern/crc32.c b/freebsd/sys/libkern/gsb_crc32.c index f1f11e3b..0eba1206 100644 --- a/freebsd/sys/libkern/crc32.c +++ b/freebsd/sys/libkern/gsb_crc32.c @@ -52,6 +52,7 @@ __FBSDID("$FreeBSD$"); #ifdef _KERNEL #include <sys/libkern.h> #include <sys/systm.h> +#include <sys/gsb_crc32.h> #if defined(__amd64__) || defined(__i386__) #include <machine/md_var.h> diff --git a/freebsd/sys/mips/include/machine/cpufunc.h b/freebsd/sys/mips/include/machine/cpufunc.h index 7cfc548c..a91d0cc4 100644 --- a/freebsd/sys/mips/include/machine/cpufunc.h +++ b/freebsd/sys/mips/include/machine/cpufunc.h @@ -371,27 +371,19 @@ get_intr_mask(void) return (mips_rd_status() & MIPS_SR_INT_MASK); } -#if defined(__GNUC__) && !defined(__mips_o32) -#define mips3_ld(a) (*(const volatile uint64_t *)(a)) -#define mips3_sd(a, v) (*(volatile uint64_t *)(a) = (v)) -#else -uint64_t mips3_ld(volatile uint64_t *va); -void mips3_sd(volatile uint64_t *, uint64_t); -#endif /* __GNUC__ */ - #endif /* _KERNEL */ #define readb(va) (*(volatile uint8_t *) (va)) #define readw(va) (*(volatile uint16_t *) (va)) #define readl(va) (*(volatile uint32_t *) (va)) -#if defined(__GNUC__) && !defined(__mips_o32) +#if !defined(__mips_o32) #define readq(a) (*(volatile uint64_t *)(a)) #endif #define writeb(va, d) (*(volatile uint8_t *) (va) = (d)) #define writew(va, d) (*(volatile uint16_t *) (va) = (d)) #define writel(va, d) (*(volatile uint32_t *) (va) = (d)) -#if defined(__GNUC__) && !defined(__mips_o32) +#if !defined(__mips_o32) #define writeq(va, d) (*(volatile uint64_t *) (va) = (d)) #endif diff --git a/freebsd/sys/net/altq/altq_cbq.c b/freebsd/sys/net/altq/altq_cbq.c index 015e35bf..7c99f8a8 100644 --- a/freebsd/sys/net/altq/altq_cbq.c +++ b/freebsd/sys/net/altq/altq_cbq.c @@ -225,12 +225,11 @@ cbq_pfattach(struct pf_altq *a) } int -cbq_add_altq(struct pf_altq *a) +cbq_add_altq(struct ifnet *ifp, struct pf_altq *a) { cbq_state_t *cbqp; - struct ifnet *ifp; - if ((ifp = ifunit(a->ifname)) == NULL) + if (ifp == NULL) return (EINVAL); if (!ALTQ_IS_READY(&ifp->if_snd)) return (ENODEV); diff --git a/freebsd/sys/net/altq/altq_codel.c b/freebsd/sys/net/altq/altq_codel.c index 4a55cdbe..375fc382 100644 --- a/freebsd/sys/net/altq/altq_codel.c +++ b/freebsd/sys/net/altq/altq_codel.c @@ -91,13 +91,12 @@ codel_pfattach(struct pf_altq *a) } int -codel_add_altq(struct pf_altq *a) +codel_add_altq(struct ifnet *ifp, struct pf_altq *a) { struct codel_if *cif; - struct ifnet *ifp; struct codel_opts *opts; - if ((ifp = ifunit(a->ifname)) == NULL) + if (ifp == NULL) return (EINVAL); if (!ALTQ_IS_READY(&ifp->if_snd)) return (ENODEV); diff --git a/freebsd/sys/net/altq/altq_fairq.c b/freebsd/sys/net/altq/altq_fairq.c index a1bc3fdb..5b7646e2 100644 --- a/freebsd/sys/net/altq/altq_fairq.c +++ b/freebsd/sys/net/altq/altq_fairq.c @@ -150,12 +150,11 @@ fairq_pfattach(struct pf_altq *a) } int -fairq_add_altq(struct pf_altq *a) +fairq_add_altq(struct ifnet *ifp, struct pf_altq *a) { struct fairq_if *pif; - struct ifnet *ifp; - if ((ifp = ifunit(a->ifname)) == NULL) + if (ifp == NULL) return (EINVAL); if (!ALTQ_IS_READY(&ifp->if_snd)) return (ENODEV); diff --git a/freebsd/sys/net/altq/altq_hfsc.c b/freebsd/sys/net/altq/altq_hfsc.c index 202915a8..024055e3 100644 --- a/freebsd/sys/net/altq/altq_hfsc.c +++ b/freebsd/sys/net/altq/altq_hfsc.c @@ -161,12 +161,11 @@ hfsc_pfattach(struct pf_altq *a) } int -hfsc_add_altq(struct pf_altq *a) +hfsc_add_altq(struct ifnet *ifp, struct pf_altq *a) { struct hfsc_if *hif; - struct ifnet *ifp; - if ((ifp = ifunit(a->ifname)) == NULL) + if (ifp == NULL) return (EINVAL); if (!ALTQ_IS_READY(&ifp->if_snd)) return (ENODEV); @@ -508,6 +507,7 @@ hfsc_class_create(struct hfsc_if *hif, struct service_curve *rsc, goto err_ret; } } + cl->cl_slot = i; if (flags & HFCF_DEFAULTCLASS) hif->hif_defaultclass = cl; @@ -560,7 +560,7 @@ hfsc_class_create(struct hfsc_if *hif, struct service_curve *rsc, static int hfsc_class_destroy(struct hfsc_class *cl) { - int i, s; + int s; if (cl == NULL) return (0); @@ -591,12 +591,7 @@ hfsc_class_destroy(struct hfsc_class *cl) ASSERT(p != NULL); } - for (i = 0; i < HFSC_MAX_CLASSES; i++) - if (cl->cl_hif->hif_class_tbl[i] == cl) { - cl->cl_hif->hif_class_tbl[i] = NULL; - break; - } - + cl->cl_hif->hif_class_tbl[cl->cl_slot] = NULL; cl->cl_hif->hif_classes--; IFQ_UNLOCK(cl->cl_hif->hif_ifq); splx(s); diff --git a/freebsd/sys/net/altq/altq_hfsc.h b/freebsd/sys/net/altq/altq_hfsc.h index fa4aa811..c43c6671 100644 --- a/freebsd/sys/net/altq/altq_hfsc.h +++ b/freebsd/sys/net/altq/altq_hfsc.h @@ -214,6 +214,7 @@ struct runtime_sc { struct hfsc_class { u_int cl_id; /* class id (just for debug) */ + u_int cl_slot; /* slot in hif class table */ u_int32_t cl_handle; /* class handle */ struct hfsc_if *cl_hif; /* back pointer to struct hfsc_if */ int cl_flags; /* misc flags */ diff --git a/freebsd/sys/net/altq/altq_priq.c b/freebsd/sys/net/altq/altq_priq.c index 5e77aef2..0090d8fa 100644 --- a/freebsd/sys/net/altq/altq_priq.c +++ b/freebsd/sys/net/altq/altq_priq.c @@ -97,12 +97,11 @@ priq_pfattach(struct pf_altq *a) } int -priq_add_altq(struct pf_altq *a) +priq_add_altq(struct ifnet * ifp, struct pf_altq *a) { struct priq_if *pif; - struct ifnet *ifp; - if ((ifp = ifunit(a->ifname)) == NULL) + if (ifp == NULL) return (EINVAL); if (!ALTQ_IS_READY(&ifp->if_snd)) return (ENODEV); diff --git a/freebsd/sys/net/altq/altq_subr.c b/freebsd/sys/net/altq/altq_subr.c index 61aaec59..151bdf10 100644 --- a/freebsd/sys/net/altq/altq_subr.c +++ b/freebsd/sys/net/altq/altq_subr.c @@ -412,11 +412,11 @@ tbr_timeout(arg) { VNET_ITERATOR_DECL(vnet_iter); struct ifnet *ifp; - int active, s; + struct epoch_tracker et; + int active; active = 0; - s = splnet(); - IFNET_RLOCK_NOSLEEP(); + NET_EPOCH_ENTER(et); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); @@ -433,8 +433,7 @@ tbr_timeout(arg) CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); - IFNET_RUNLOCK_NOSLEEP(); - splx(s); + NET_EPOCH_EXIT(et); if (active > 0) CALLOUT_RESET(&tbr_callout, 1, tbr_timeout, (void *)0); else @@ -523,7 +522,7 @@ altq_pfdetach(struct pf_altq *a) * malloc with WAITOK, also it is not yet clear which lock to use. */ int -altq_add(struct pf_altq *a) +altq_add(struct ifnet *ifp, struct pf_altq *a) { int error = 0; @@ -538,27 +537,27 @@ altq_add(struct pf_altq *a) switch (a->scheduler) { #ifdef ALTQ_CBQ case ALTQT_CBQ: - error = cbq_add_altq(a); + error = cbq_add_altq(ifp, a); break; #endif #ifdef ALTQ_PRIQ case ALTQT_PRIQ: - error = priq_add_altq(a); + error = priq_add_altq(ifp, a); break; #endif #ifdef ALTQ_HFSC case ALTQT_HFSC: - error = hfsc_add_altq(a); + error = hfsc_add_altq(ifp, a); break; #endif #ifdef ALTQ_FAIRQ case ALTQT_FAIRQ: - error = fairq_add_altq(a); + error = fairq_add_altq(ifp, a); break; #endif #ifdef ALTQ_CODEL case ALTQT_CODEL: - error = codel_add_altq(a); + error = codel_add_altq(ifp, a); break; #endif default: diff --git a/freebsd/sys/net/altq/altq_var.h b/freebsd/sys/net/altq/altq_var.h index 47326a03..f711e093 100644 --- a/freebsd/sys/net/altq/altq_var.h +++ b/freebsd/sys/net/altq/altq_var.h @@ -199,40 +199,40 @@ int tbr_set(struct ifaltq *, struct tb_profile *); int altq_pfattach(struct pf_altq *); int altq_pfdetach(struct pf_altq *); -int altq_add(struct pf_altq *); +int altq_add(struct ifnet *, struct pf_altq *); int altq_remove(struct pf_altq *); int altq_add_queue(struct pf_altq *); int altq_remove_queue(struct pf_altq *); int altq_getqstats(struct pf_altq *, void *, int *, int); int cbq_pfattach(struct pf_altq *); -int cbq_add_altq(struct pf_altq *); +int cbq_add_altq(struct ifnet *, struct pf_altq *); int cbq_remove_altq(struct pf_altq *); int cbq_add_queue(struct pf_altq *); int cbq_remove_queue(struct pf_altq *); int cbq_getqstats(struct pf_altq *, void *, int *, int); int codel_pfattach(struct pf_altq *); -int codel_add_altq(struct pf_altq *); +int codel_add_altq(struct ifnet *, struct pf_altq *); int codel_remove_altq(struct pf_altq *); int codel_getqstats(struct pf_altq *, void *, int *, int); int priq_pfattach(struct pf_altq *); -int priq_add_altq(struct pf_altq *); +int priq_add_altq(struct ifnet *, struct pf_altq *); int priq_remove_altq(struct pf_altq *); int priq_add_queue(struct pf_altq *); int priq_remove_queue(struct pf_altq *); int priq_getqstats(struct pf_altq *, void *, int *, int); int hfsc_pfattach(struct pf_altq *); -int hfsc_add_altq(struct pf_altq *); +int hfsc_add_altq(struct ifnet *, struct pf_altq *); int hfsc_remove_altq(struct pf_altq *); int hfsc_add_queue(struct pf_altq *); int hfsc_remove_queue(struct pf_altq *); int hfsc_getqstats(struct pf_altq *, void *, int *, int); int fairq_pfattach(struct pf_altq *); -int fairq_add_altq(struct pf_altq *); +int fairq_add_altq(struct ifnet *, struct pf_altq *); int fairq_remove_altq(struct pf_altq *); int fairq_add_queue(struct pf_altq *); int fairq_remove_queue(struct pf_altq *); diff --git a/freebsd/sys/net/bpf.c b/freebsd/sys/net/bpf.c index edee632b..101ac4e0 100644 --- a/freebsd/sys/net/bpf.c +++ b/freebsd/sys/net/bpf.c @@ -5,6 +5,7 @@ * * Copyright (c) 1990, 1991, 1993 * The Regents of the University of California. All rights reserved. + * Copyright (c) 2019 Andrey V. Elsukov <ae@FreeBSD.org> * * This code is derived from the Stanford/CMU enet packet filter, * (net/enet.c) distributed as part of 4.3BSD, and code contributed @@ -45,16 +46,16 @@ __FBSDID("$FreeBSD$"); #include <rtems/bsd/local/opt_ddb.h> #include <rtems/bsd/local/opt_netgraph.h> -#include <sys/types.h> #include <sys/param.h> -#include <sys/lock.h> -#include <sys/rwlock.h> -#include <sys/systm.h> #include <sys/conf.h> +#include <sys/eventhandler.h> #include <sys/fcntl.h> #include <sys/jail.h> +#include <sys/ktr.h> +#include <sys/lock.h> #include <sys/malloc.h> #include <sys/mbuf.h> +#include <sys/mutex.h> #include <sys/time.h> #include <sys/priv.h> #include <sys/proc.h> @@ -64,6 +65,7 @@ __FBSDID("$FreeBSD$"); #include <sys/ttycom.h> #include <sys/uio.h> #include <sys/sysent.h> +#include <sys/systm.h> #include <sys/event.h> #include <sys/file.h> @@ -99,14 +101,16 @@ __FBSDID("$FreeBSD$"); #include <security/mac/mac_framework.h> #ifdef __rtems__ #include <rtems/imfs.h> +#undef devfs_get_cdevpriv #define devfs_get_cdevpriv(x) 0 +#undef devtoname #define devtoname(x) "bpf" #endif /* __rtems__ */ MALLOC_DEFINE(M_BPF, "BPF", "BPF data"); static struct bpf_if_ext dead_bpf_if = { - .bif_dlist = LIST_HEAD_INITIALIZER() + .bif_dlist = CK_LIST_HEAD_INITIALIZER() }; struct bpf_if { @@ -115,19 +119,22 @@ struct bpf_if { struct bpf_if_ext bif_ext; /* public members */ u_int bif_dlt; /* link layer type */ u_int bif_hdrlen; /* length of link header */ + struct bpfd_list bif_wlist; /* writer-only list */ struct ifnet *bif_ifp; /* corresponding interface */ - struct rwlock bif_lock; /* interface lock */ - LIST_HEAD(, bpf_d) bif_wlist; /* writer-only list */ - int bif_flags; /* Interface flags */ struct bpf_if **bif_bpf; /* Pointer to pointer to us */ + volatile u_int bif_refcnt; + struct epoch_context epoch_ctx; }; CTASSERT(offsetof(struct bpf_if, bif_ext) == 0); -#define BPFIF_RLOCK(bif) rw_rlock(&(bif)->bif_lock) -#define BPFIF_RUNLOCK(bif) rw_runlock(&(bif)->bif_lock) -#define BPFIF_WLOCK(bif) rw_wlock(&(bif)->bif_lock) -#define BPFIF_WUNLOCK(bif) rw_wunlock(&(bif)->bif_lock) +struct bpf_program_buffer { + struct epoch_context epoch_ctx; +#ifdef BPF_JITTER + bpf_jit_filter *func; +#endif + void *buffer[0]; +}; #if defined(DEV_BPF) || defined(NETGRAPH_BPF) @@ -187,18 +194,24 @@ struct bpf_dltlist32 { #define BPF_LOCK_ASSERT() sx_assert(&bpf_sx, SA_XLOCKED) /* * bpf_iflist is a list of BPF interface structures, each corresponding to a - * specific DLT. The same network interface might have several BPF interface + * specific DLT. The same network interface might have several BPF interface * structures registered by different layers in the stack (i.e., 802.11 * frames, ethernet frames, etc). */ -static LIST_HEAD(, bpf_if) bpf_iflist, bpf_freelist; +CK_LIST_HEAD(bpf_iflist, bpf_if); +static struct bpf_iflist bpf_iflist; static struct sx bpf_sx; /* bpf global lock */ static int bpf_bpfd_cnt; +static void bpfif_ref(struct bpf_if *); +static void bpfif_rele(struct bpf_if *); + +static void bpfd_ref(struct bpf_d *); +static void bpfd_rele(struct bpf_d *); static void bpf_attachd(struct bpf_d *, struct bpf_if *); static void bpf_detachd(struct bpf_d *); -static void bpf_detachd_locked(struct bpf_d *); -static void bpf_freed(struct bpf_d *); +static void bpf_detachd_locked(struct bpf_d *, bool); +static void bpfd_free(epoch_context_t); static int bpf_movein(struct uio *, int, struct ifnet *, struct mbuf **, struct sockaddr *, int *, struct bpf_d *); static int bpf_setif(struct bpf_d *, struct ifreq *); @@ -261,37 +274,106 @@ static struct filterops bpfread_filtops = { .f_event = filt_bpfread, }; -eventhandler_tag bpf_ifdetach_cookie = NULL; - /* - * LOCKING MODEL USED BY BPF: + * LOCKING MODEL USED BY BPF + * * Locks: - * 1) global lock (BPF_LOCK). Mutex, used to protect interface addition/removal, - * some global counters and every bpf_if reference. - * 2) Interface lock. Rwlock, used to protect list of BPF descriptors and their filters. - * 3) Descriptor lock. Mutex, used to protect BPF buffers and various structure fields - * used by bpf_mtap code. + * 1) global lock (BPF_LOCK). Sx, used to protect some global counters, + * every bpf_iflist changes, serializes ioctl access to bpf descriptors. + * 2) Descriptor lock. Mutex, used to protect BPF buffers and various + * structure fields used by bpf_*tap* code. + * + * Lock order: global lock, then descriptor lock. * - * Lock order: + * There are several possible consumers: * - * Global lock, interface lock, descriptor lock + * 1. The kernel registers interface pointer with bpfattach(). + * Each call allocates new bpf_if structure, references ifnet pointer + * and links bpf_if into bpf_iflist chain. This is protected with global + * lock. * - * We have to acquire interface lock before descriptor main lock due to BPF_MTAP[2] - * working model. In many places (like bpf_detachd) we start with BPF descriptor - * (and we need to at least rlock it to get reliable interface pointer). This - * gives us potential LOR. As a result, we use global lock to protect from bpf_if - * change in every such place. + * 2. An userland application uses ioctl() call to bpf_d descriptor. + * All such call are serialized with global lock. BPF filters can be + * changed, but pointer to old filter will be freed using epoch_call(). + * Thus it should be safe for bpf_tap/bpf_mtap* code to do access to + * filter pointers, even if change will happen during bpf_tap execution. + * Destroying of bpf_d descriptor also is doing using epoch_call(). * - * Changing d->bd_bif is protected by 1) global lock, 2) interface lock and - * 3) descriptor main wlock. - * Reading bd_bif can be protected by any of these locks, typically global lock. + * 3. An userland application can write packets into bpf_d descriptor. + * There we need to be sure, that ifnet won't disappear during bpfwrite(). * - * Changing read/write BPF filter is protected by the same three locks, - * the same applies for reading. + * 4. The kernel invokes bpf_tap/bpf_mtap* functions. The access to + * bif_dlist is protected with net_epoch_preempt section. So, it should + * be safe to make access to bpf_d descriptor inside the section. * - * Sleeping in global lock is not allowed due to bpfdetach() using it. + * 5. The kernel invokes bpfdetach() on interface destroying. All lists + * are modified with global lock held and actual free() is done using + * epoch_call(). */ +static void +bpfif_free(epoch_context_t ctx) +{ + struct bpf_if *bp; + + bp = __containerof(ctx, struct bpf_if, epoch_ctx); + if_rele(bp->bif_ifp); + free(bp, M_BPF); +} + +static void +bpfif_ref(struct bpf_if *bp) +{ + + refcount_acquire(&bp->bif_refcnt); +} + +static void +bpfif_rele(struct bpf_if *bp) +{ + + if (!refcount_release(&bp->bif_refcnt)) + return; + epoch_call(net_epoch_preempt, &bp->epoch_ctx, bpfif_free); +} + +static void +bpfd_ref(struct bpf_d *d) +{ + + refcount_acquire(&d->bd_refcnt); +} + +static void +bpfd_rele(struct bpf_d *d) +{ + + if (!refcount_release(&d->bd_refcnt)) + return; + epoch_call(net_epoch_preempt, &d->epoch_ctx, bpfd_free); +} + +static struct bpf_program_buffer* +bpf_program_buffer_alloc(size_t size, int flags) +{ + + return (malloc(sizeof(struct bpf_program_buffer) + size, + M_BPF, flags)); +} + +static void +bpf_program_buffer_free(epoch_context_t ctx) +{ + struct bpf_program_buffer *ptr; + + ptr = __containerof(ctx, struct bpf_program_buffer, epoch_ctx); +#ifdef BPF_JITTER + if (ptr->func != NULL) + bpf_destroy_jit_filter(ptr->func); +#endif + free(ptr, M_BPF); +} + /* * Wrapper functions for various buffering methods. If the set of buffer * modes expands, we will probably want to introduce a switch data structure @@ -673,7 +755,8 @@ bad: } /* - * Attach file to the bpf interface, i.e. make d listen on bp. + * Attach descriptor to the bpf interface, i.e. make d listen on bp, + * then reset its buffers and counters with reset_d(). */ static void bpf_attachd(struct bpf_d *d, struct bpf_if *bp) @@ -689,7 +772,7 @@ bpf_attachd(struct bpf_d *d, struct bpf_if *bp) op_w = V_bpf_optimize_writers || d->bd_writer; if (d->bd_bif != NULL) - bpf_detachd_locked(d); + bpf_detachd_locked(d, false); /* * Point d at bp, and add d to the interface's list. * Since there are many applications using BPF for @@ -698,26 +781,27 @@ bpf_attachd(struct bpf_d *d, struct bpf_if *bp) * some filter is configured. */ - BPFIF_WLOCK(bp); BPFD_LOCK(d); - + /* + * Hold reference to bpif while descriptor uses this interface. + */ + bpfif_ref(bp); d->bd_bif = bp; - if (op_w != 0) { /* Add to writers-only list */ - LIST_INSERT_HEAD(&bp->bif_wlist, d, bd_next); + CK_LIST_INSERT_HEAD(&bp->bif_wlist, d, bd_next); /* * We decrement bd_writer on every filter set operation. * First BIOCSETF is done by pcap_open_live() to set up - * snap length. After that appliation usually sets its own filter + * snap length. After that appliation usually sets its own + * filter. */ d->bd_writer = 2; } else - LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next); + CK_LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next); + reset_d(d); BPFD_UNLOCK(d); - BPFIF_WUNLOCK(bp); - bpf_bpfd_cnt++; CTR3(KTR_NET, "%s: bpf_attach called by pid %d, adding to %s list", @@ -731,7 +815,8 @@ bpf_attachd(struct bpf_d *d, struct bpf_if *bp) * Check if we need to upgrade our descriptor @d from write-only mode. */ static int -bpf_check_upgrade(u_long cmd, struct bpf_d *d, struct bpf_insn *fcode, int flen) +bpf_check_upgrade(u_long cmd, struct bpf_d *d, struct bpf_insn *fcode, + int flen) { int is_snap, need_upgrade; @@ -751,7 +836,8 @@ bpf_check_upgrade(u_long cmd, struct bpf_d *d, struct bpf_insn *fcode, int flen) * we'd prefer to treat k=0 (deny ALL) case the same way: e.g. * do not consider upgrading immediately */ - if (cmd == BIOCSETF && flen == 1 && fcode[0].code == (BPF_RET | BPF_K)) + if (cmd == BIOCSETF && flen == 1 && + fcode[0].code == (BPF_RET | BPF_K)) is_snap = 1; else is_snap = 0; @@ -789,88 +875,45 @@ bpf_check_upgrade(u_long cmd, struct bpf_d *d, struct bpf_insn *fcode, int flen) } /* - * Add d to the list of active bp filters. - * Requires bpf_attachd() to be called before. - */ -static void -bpf_upgraded(struct bpf_d *d) -{ - struct bpf_if *bp; - - BPF_LOCK_ASSERT(); - - bp = d->bd_bif; - - /* - * Filter can be set several times without specifying interface. - * Mark d as reader and exit. - */ - if (bp == NULL) { - BPFD_LOCK(d); - d->bd_writer = 0; - BPFD_UNLOCK(d); - return; - } - - BPFIF_WLOCK(bp); - BPFD_LOCK(d); - - /* Remove from writers-only list */ - LIST_REMOVE(d, bd_next); - LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next); - /* Mark d as reader */ - d->bd_writer = 0; - - BPFD_UNLOCK(d); - BPFIF_WUNLOCK(bp); - - CTR2(KTR_NET, "%s: upgrade required by pid %d", __func__, d->bd_pid); - - EVENTHANDLER_INVOKE(bpf_track, bp->bif_ifp, bp->bif_dlt, 1); -} - -/* * Detach a file from its interface. */ static void bpf_detachd(struct bpf_d *d) { BPF_LOCK(); - bpf_detachd_locked(d); + bpf_detachd_locked(d, false); BPF_UNLOCK(); } static void -bpf_detachd_locked(struct bpf_d *d) +bpf_detachd_locked(struct bpf_d *d, bool detached_ifp) { - int error; struct bpf_if *bp; struct ifnet *ifp; - - CTR2(KTR_NET, "%s: detach required by pid %d", __func__, d->bd_pid); + int error; BPF_LOCK_ASSERT(); + CTR2(KTR_NET, "%s: detach required by pid %d", __func__, d->bd_pid); /* Check if descriptor is attached */ if ((bp = d->bd_bif) == NULL) return; - BPFIF_WLOCK(bp); BPFD_LOCK(d); - + /* Remove d from the interface's descriptor list. */ + CK_LIST_REMOVE(d, bd_next); /* Save bd_writer value */ error = d->bd_writer; - - /* - * Remove d from the interface's descriptor list. - */ - LIST_REMOVE(d, bd_next); - ifp = bp->bif_ifp; d->bd_bif = NULL; + if (detached_ifp) { + /* + * Notify descriptor as it's detached, so that any + * sleepers wake up and get ENXIO. + */ + bpf_wakeup(d); + } BPFD_UNLOCK(d); - BPFIF_WUNLOCK(bp); - bpf_bpfd_cnt--; /* Call event handler iff d is attached */ @@ -879,9 +922,9 @@ bpf_detachd_locked(struct bpf_d *d) /* * Check if this descriptor had requested promiscuous mode. - * If so, turn it off. + * If so and ifnet is not detached, turn it off. */ - if (d->bd_promisc) { + if (d->bd_promisc && !detached_ifp) { d->bd_promisc = 0; CURVNET_SET(ifp->if_vnet); error = ifpromisc(ifp, 0); @@ -897,6 +940,7 @@ bpf_detachd_locked(struct bpf_d *d) "bpf_detach: ifpromisc failed (%d)\n", error); } } + bpfif_rele(bp); } /* @@ -921,8 +965,7 @@ bpf_dtor(void *data) seldrain(&d->bd_sel); knlist_destroy(&d->bd_sel.si_note); callout_drain(&d->bd_callout); - bpf_freed(d); - free(d, M_BPF); + bpfd_rele(d); } /* @@ -975,6 +1018,7 @@ bpfopen(struct cdev *dev, int flags, int fmt, struct thread *td) d->bd_bufmode = BPF_BUFMODE_BUFFER; d->bd_sig = SIGIO; d->bd_direction = BPF_D_INOUT; + d->bd_refcnt = 1; BPF_PID_REFRESH(d, td); #ifdef MAC mac_bpfdesc_init(d); @@ -1162,7 +1206,8 @@ bpf_timed_out(void *arg) BPFD_LOCK_ASSERT(d); - if (callout_pending(&d->bd_callout) || !callout_active(&d->bd_callout)) + if (callout_pending(&d->bd_callout) || + !callout_active(&d->bd_callout)) return; if (d->bd_state == BPF_WAITING) { d->bd_state = BPF_TIMED_OUT; @@ -1192,49 +1237,73 @@ bpfwrite(struct cdev *dev, struct uio *uio, int ioflag) bpfwrite(struct bpf_d *d, struct uio *uio, int ioflag) #endif /* __rtems__ */ { + struct route ro; + struct sockaddr dst; + struct epoch_tracker et; + struct bpf_if *bp; #ifndef __rtems__ struct bpf_d *d; #endif /* __rtems__ */ struct ifnet *ifp; struct mbuf *m, *mc; - struct sockaddr dst; - struct route ro; int error, hlen; error = devfs_get_cdevpriv((void **)&d); if (error != 0) return (error); + NET_EPOCH_ENTER(et); + BPFD_LOCK(d); BPF_PID_REFRESH_CUR(d); counter_u64_add(d->bd_wcount, 1); - /* XXX: locking required */ - if (d->bd_bif == NULL) { - counter_u64_add(d->bd_wdcount, 1); - return (ENXIO); + if ((bp = d->bd_bif) == NULL) { + error = ENXIO; + goto out_locked; } - ifp = d->bd_bif->bif_ifp; - + ifp = bp->bif_ifp; if ((ifp->if_flags & IFF_UP) == 0) { - counter_u64_add(d->bd_wdcount, 1); - return (ENETDOWN); + error = ENETDOWN; + goto out_locked; } - if (uio->uio_resid == 0) { - counter_u64_add(d->bd_wdcount, 1); - return (0); - } + if (uio->uio_resid == 0) + goto out_locked; bzero(&dst, sizeof(dst)); m = NULL; hlen = 0; - /* XXX: bpf_movein() can sleep */ - error = bpf_movein(uio, (int)d->bd_bif->bif_dlt, ifp, + + /* + * Take extra reference, unlock d and exit from epoch section, + * since bpf_movein() can sleep. + */ + bpfd_ref(d); + NET_EPOCH_EXIT(et); + BPFD_UNLOCK(d); + + error = bpf_movein(uio, (int)bp->bif_dlt, ifp, &m, &dst, &hlen, d); - if (error) { + + if (error != 0) { counter_u64_add(d->bd_wdcount, 1); + bpfd_rele(d); return (error); } + + BPFD_LOCK(d); + /* + * Check that descriptor is still attached to the interface. + * This can happen on bpfdetach(). To avoid access to detached + * ifnet, free mbuf and return ENXIO. + */ + if (d->bd_bif == NULL) { + counter_u64_add(d->bd_wdcount, 1); + BPFD_UNLOCK(d); + bpfd_rele(d); + m_freem(m); + return (ENXIO); + } counter_u64_add(d->bd_wfcount, 1); if (d->bd_hdrcmplt) dst.sa_family = pseudo_AF_HDRCMPLT; @@ -1255,11 +1324,9 @@ bpfwrite(struct bpf_d *d, struct uio *uio, int ioflag) CURVNET_SET(ifp->if_vnet); #ifdef MAC - BPFD_LOCK(d); mac_bpfdesc_create_mbuf(d, m); if (mc != NULL) mac_bpfdesc_create_mbuf(d, mc); - BPFD_UNLOCK(d); #endif bzero(&ro, sizeof(ro)); @@ -1269,6 +1336,9 @@ bpfwrite(struct bpf_d *d, struct uio *uio, int ioflag) ro.ro_flags = RT_HAS_HEADER; } + /* Avoid possible recursion on BPFD_LOCK(). */ + NET_EPOCH_ENTER(et); + BPFD_UNLOCK(d); error = (*ifp->if_output)(ifp, m, &dst, &ro); if (error) counter_u64_add(d->bd_wdcount, 1); @@ -1279,8 +1349,15 @@ bpfwrite(struct bpf_d *d, struct uio *uio, int ioflag) else m_freem(mc); } + NET_EPOCH_EXIT(et); CURVNET_RESTORE(); + bpfd_rele(d); + return (error); +out_locked: + counter_u64_add(d->bd_wdcount, 1); + NET_EPOCH_EXIT(et); + BPFD_UNLOCK(d); return (error); } @@ -1916,16 +1993,11 @@ bpfioctl(struct bpf_d *d, u_long cmd, caddr_t addr, int flags, } /* - * Set d's packet filter program to fp. If this file already has a filter, - * free it and replace it. Returns EINVAL for bogus requests. - * - * Note we need global lock here to serialize bpf_setf() and bpf_setif() calls - * since reading d->bd_bif can't be protected by d or interface lock due to - * lock order. - * - * Additionally, we have to acquire interface write lock due to bpf_mtap() uses - * interface read lock to read all filers. + * Set d's packet filter program to fp. If this file already has a filter, + * free it and replace it. Returns EINVAL for bogus requests. * + * Note we use global lock here to serialize bpf_setf() and bpf_setif() + * calls. */ static int bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_long cmd) @@ -1934,13 +2006,14 @@ bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_long cmd) struct bpf_program fp_swab; struct bpf_program32 *fp32; #endif - struct bpf_insn *fcode, *old; + struct bpf_program_buffer *fcode; + struct bpf_insn *filter; #ifdef BPF_JITTER - bpf_jit_filter *jfunc, *ofunc; + bpf_jit_filter *jfunc; #endif size_t size; u_int flen; - int need_upgrade; + bool track_event; #ifdef COMPAT_FREEBSD32 switch (cmd) { @@ -1949,7 +2022,8 @@ bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_long cmd) case BIOCSETFNR32: fp32 = (struct bpf_program32 *)fp; fp_swab.bf_len = fp32->bf_len; - fp_swab.bf_insns = (struct bpf_insn *)(uintptr_t)fp32->bf_insns; + fp_swab.bf_insns = + (struct bpf_insn *)(uintptr_t)fp32->bf_insns; fp = &fp_swab; switch (cmd) { case BIOCSETF32: @@ -1963,12 +2037,10 @@ bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_long cmd) } #endif - fcode = NULL; + filter = NULL; #ifdef BPF_JITTER - jfunc = ofunc = NULL; + jfunc = NULL; #endif - need_upgrade = 0; - /* * Check new filter validness before acquiring any locks. * Allocate memory for new filter, if needed. @@ -1978,10 +2050,11 @@ bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_long cmd) return (EINVAL); size = flen * sizeof(*fp->bf_insns); if (size > 0) { - /* We're setting up new filter. Copy and check actual data. */ - fcode = malloc(size, M_BPF, M_WAITOK); - if (copyin(fp->bf_insns, fcode, size) != 0 || - !bpf_validate(fcode, flen)) { + /* We're setting up new filter. Copy and check actual data. */ + fcode = bpf_program_buffer_alloc(size, M_WAITOK); + filter = (struct bpf_insn *)fcode->buffer; + if (copyin(fp->bf_insns, filter, size) != 0 || + !bpf_validate(filter, flen)) { free(fcode, M_BPF); return (EINVAL); } @@ -1991,49 +2064,72 @@ bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_long cmd) * Filter is copied inside fcode and is * perfectly valid. */ - jfunc = bpf_jitter(fcode, flen); + jfunc = bpf_jitter(filter, flen); } #endif } - BPF_LOCK(); + track_event = false; + fcode = NULL; - /* - * Set up new filter. - * Protect filter change by interface lock. - * Additionally, we are protected by global lock here. - */ - if (d->bd_bif != NULL) - BPFIF_WLOCK(d->bd_bif); + BPF_LOCK(); BPFD_LOCK(d); + /* Set up new filter. */ if (cmd == BIOCSETWF) { - old = d->bd_wfilter; - d->bd_wfilter = fcode; + if (d->bd_wfilter != NULL) { + fcode = __containerof((void *)d->bd_wfilter, + struct bpf_program_buffer, buffer); +#ifdef BPF_JITTER + fcode->func = NULL; +#endif + } + d->bd_wfilter = filter; } else { - old = d->bd_rfilter; - d->bd_rfilter = fcode; + if (d->bd_rfilter != NULL) { + fcode = __containerof((void *)d->bd_rfilter, + struct bpf_program_buffer, buffer); +#ifdef BPF_JITTER + fcode->func = d->bd_bfilter; +#endif + } + d->bd_rfilter = filter; #ifdef BPF_JITTER - ofunc = d->bd_bfilter; d->bd_bfilter = jfunc; #endif if (cmd == BIOCSETF) reset_d(d); - need_upgrade = bpf_check_upgrade(cmd, d, fcode, flen); + if (bpf_check_upgrade(cmd, d, filter, flen) != 0) { + /* + * Filter can be set several times without + * specifying interface. In this case just mark d + * as reader. + */ + d->bd_writer = 0; + if (d->bd_bif != NULL) { + /* + * Remove descriptor from writers-only list + * and add it to active readers list. + */ + CK_LIST_REMOVE(d, bd_next); + CK_LIST_INSERT_HEAD(&d->bd_bif->bif_dlist, + d, bd_next); + CTR2(KTR_NET, + "%s: upgrade required by pid %d", + __func__, d->bd_pid); + track_event = true; + } + } } BPFD_UNLOCK(d); - if (d->bd_bif != NULL) - BPFIF_WUNLOCK(d->bd_bif); - if (old != NULL) - free(old, M_BPF); -#ifdef BPF_JITTER - if (ofunc != NULL) - bpf_destroy_jit_filter(ofunc); -#endif - /* Move d to active readers list. */ - if (need_upgrade != 0) - bpf_upgraded(d); + if (fcode != NULL) + epoch_call(net_epoch_preempt, &fcode->epoch_ctx, + bpf_program_buffer_free); + + if (track_event) + EVENTHANDLER_INVOKE(bpf_track, + d->bd_bif->bif_ifp, d->bd_bif->bif_dlt, 1); BPF_UNLOCK(); return (0); @@ -2057,15 +2153,6 @@ bpf_setif(struct bpf_d *d, struct ifreq *ifr) return (ENXIO); bp = theywant->if_bpf; - - /* Check if interface is not being detached from BPF */ - BPFIF_RLOCK(bp); - if (bp->bif_flags & BPFIF_FLAG_DYING) { - BPFIF_RUNLOCK(bp); - return (ENXIO); - } - BPFIF_RUNLOCK(bp); - /* * At this point, we expect the buffer is already allocated. If not, * return an error. @@ -2084,9 +2171,11 @@ bpf_setif(struct bpf_d *d, struct ifreq *ifr) } if (bp != d->bd_bif) bpf_attachd(d, bp); - BPFD_LOCK(d); - reset_d(d); - BPFD_UNLOCK(d); + else { + BPFD_LOCK(d); + reset_d(d); + BPFD_UNLOCK(d); + } return (0); } @@ -2253,6 +2342,7 @@ bpf_gettime(struct bintime *bt, int tstype, struct mbuf *m) void bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen) { + struct epoch_tracker et; struct bintime bt; struct bpf_d *d; #ifdef BPF_JITTER @@ -2262,24 +2352,14 @@ bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen) int gottime; gottime = BPF_TSTAMP_NONE; - - BPFIF_RLOCK(bp); - - LIST_FOREACH(d, &bp->bif_dlist, bd_next) { - /* - * We are not using any locks for d here because: - * 1) any filter change is protected by interface - * write lock - * 2) destroying/detaching d is protected by interface - * write lock, too - */ - + NET_EPOCH_ENTER(et); + CK_LIST_FOREACH(d, &bp->bif_dlist, bd_next) { counter_u64_add(d->bd_rcount, 1); /* - * NB: We dont call BPF_CHECK_DIRECTION() here since there is no - * way for the caller to indiciate to us whether this packet - * is inbound or outbound. In the bpf_mtap() routines, we use - * the interface pointers on the mbuf to figure it out. + * NB: We dont call BPF_CHECK_DIRECTION() here since there + * is no way for the caller to indiciate to us whether this + * packet is inbound or outbound. In the bpf_mtap() routines, + * we use the interface pointers on the mbuf to figure it out. */ #ifdef BPF_JITTER bf = bpf_jitter_enable != 0 ? d->bd_bfilter : NULL; @@ -2293,10 +2373,10 @@ bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen) * Filter matches. Let's to acquire write lock. */ BPFD_LOCK(d); - counter_u64_add(d->bd_fcount, 1); if (gottime < bpf_ts_quality(d->bd_tstamp)) - gottime = bpf_gettime(&bt, d->bd_tstamp, NULL); + gottime = bpf_gettime(&bt, d->bd_tstamp, + NULL); #ifdef MAC if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0) #endif @@ -2305,7 +2385,7 @@ bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen) BPFD_UNLOCK(d); } } - BPFIF_RUNLOCK(bp); + NET_EPOCH_EXIT(et); } #define BPF_CHECK_DIRECTION(d, r, i) \ @@ -2319,6 +2399,7 @@ bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen) void bpf_mtap(struct bpf_if *bp, struct mbuf *m) { + struct epoch_tracker et; struct bintime bt; struct bpf_d *d; #ifdef BPF_JITTER @@ -2328,7 +2409,7 @@ bpf_mtap(struct bpf_if *bp, struct mbuf *m) int gottime; /* Skip outgoing duplicate packets. */ - if ((m->m_flags & M_PROMISC) != 0 && m->m_pkthdr.rcvif == NULL) { + if ((m->m_flags & M_PROMISC) != 0 && m_rcvif(m) == NULL) { m->m_flags &= ~M_PROMISC; return; } @@ -2336,17 +2417,17 @@ bpf_mtap(struct bpf_if *bp, struct mbuf *m) pktlen = m_length(m, NULL); gottime = BPF_TSTAMP_NONE; - BPFIF_RLOCK(bp); - - LIST_FOREACH(d, &bp->bif_dlist, bd_next) { - if (BPF_CHECK_DIRECTION(d, m->m_pkthdr.rcvif, bp->bif_ifp)) + NET_EPOCH_ENTER(et); + CK_LIST_FOREACH(d, &bp->bif_dlist, bd_next) { + if (BPF_CHECK_DIRECTION(d, m_rcvif(m), bp->bif_ifp)) continue; counter_u64_add(d->bd_rcount, 1); #ifdef BPF_JITTER bf = bpf_jitter_enable != 0 ? d->bd_bfilter : NULL; /* XXX We cannot handle multiple mbufs. */ if (bf != NULL && m->m_next == NULL) - slen = (*(bf->func))(mtod(m, u_char *), pktlen, pktlen); + slen = (*(bf->func))(mtod(m, u_char *), pktlen, + pktlen); else #endif slen = bpf_filter(d->bd_rfilter, (u_char *)m, pktlen, 0); @@ -2364,7 +2445,7 @@ bpf_mtap(struct bpf_if *bp, struct mbuf *m) BPFD_UNLOCK(d); } } - BPFIF_RUNLOCK(bp); + NET_EPOCH_EXIT(et); } /* @@ -2374,6 +2455,7 @@ bpf_mtap(struct bpf_if *bp, struct mbuf *m) void bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, struct mbuf *m) { + struct epoch_tracker et; struct bintime bt; struct mbuf mb; struct bpf_d *d; @@ -2392,6 +2474,7 @@ bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, struct mbuf *m) * Note that we cut corners here; we only setup what's * absolutely needed--this mbuf should never go anywhere else. */ + mb.m_flags = 0; mb.m_next = m; mb.m_data = data; mb.m_len = dlen; @@ -2399,9 +2482,8 @@ bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, struct mbuf *m) gottime = BPF_TSTAMP_NONE; - BPFIF_RLOCK(bp); - - LIST_FOREACH(d, &bp->bif_dlist, bd_next) { + NET_EPOCH_ENTER(et); + CK_LIST_FOREACH(d, &bp->bif_dlist, bd_next) { if (BPF_CHECK_DIRECTION(d, m->m_pkthdr.rcvif, bp->bif_ifp)) continue; counter_u64_add(d->bd_rcount, 1); @@ -2420,11 +2502,10 @@ bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, struct mbuf *m) BPFD_UNLOCK(d); } } - BPFIF_RUNLOCK(bp); + NET_EPOCH_EXIT(et); } #undef BPF_CHECK_DIRECTION - #undef BPF_TSTAMP_NONE #undef BPF_TSTAMP_FAST #undef BPF_TSTAMP_NORMAL @@ -2514,6 +2595,11 @@ catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen, int tstype; BPFD_LOCK_ASSERT(d); + if (d->bd_bif == NULL) { + /* Descriptor was detached in concurrent thread */ + counter_u64_add(d->bd_dcount, 1); + return; + } /* * Detect whether user space has released a buffer back to us, and if @@ -2643,26 +2729,36 @@ copy: * Called on close. */ static void -bpf_freed(struct bpf_d *d) +bpfd_free(epoch_context_t ctx) { + struct bpf_d *d; + struct bpf_program_buffer *p; /* * We don't need to lock out interrupts since this descriptor has * been detached from its interface and it yet hasn't been marked * free. */ + d = __containerof(ctx, struct bpf_d, epoch_ctx); bpf_free(d); if (d->bd_rfilter != NULL) { - free((caddr_t)d->bd_rfilter, M_BPF); + p = __containerof((void *)d->bd_rfilter, + struct bpf_program_buffer, buffer); #ifdef BPF_JITTER - if (d->bd_bfilter != NULL) - bpf_destroy_jit_filter(d->bd_bfilter); + p->func = d->bd_bfilter; #endif + bpf_program_buffer_free(&p->epoch_ctx); + } + if (d->bd_wfilter != NULL) { + p = __containerof((void *)d->bd_wfilter, + struct bpf_program_buffer, buffer); +#ifdef BPF_JITTER + p->func = NULL; +#endif + bpf_program_buffer_free(&p->epoch_ctx); } - if (d->bd_wfilter != NULL) - free((caddr_t)d->bd_wfilter, M_BPF); - mtx_destroy(&d->bd_lock); + mtx_destroy(&d->bd_lock); counter_u64_free(d->bd_rcount); counter_u64_free(d->bd_dcount); counter_u64_free(d->bd_fcount); @@ -2670,7 +2766,7 @@ bpf_freed(struct bpf_d *d) counter_u64_free(d->bd_wfcount); counter_u64_free(d->bd_wdcount); counter_u64_free(d->bd_zcopy); - + free(d, M_BPF); } /* @@ -2691,29 +2787,33 @@ bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen) * headers are not yet supporrted). */ void -bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp) +bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, + struct bpf_if **driverp) { struct bpf_if *bp; - bp = malloc(sizeof(*bp), M_BPF, M_NOWAIT | M_ZERO); - if (bp == NULL) - panic("bpfattach"); + KASSERT(*driverp == NULL, + ("bpfattach2: driverp already initialized")); - LIST_INIT(&bp->bif_dlist); - LIST_INIT(&bp->bif_wlist); + bp = malloc(sizeof(*bp), M_BPF, M_WAITOK | M_ZERO); + + CK_LIST_INIT(&bp->bif_dlist); + CK_LIST_INIT(&bp->bif_wlist); bp->bif_ifp = ifp; bp->bif_dlt = dlt; - rw_init(&bp->bif_lock, "bpf interface lock"); - KASSERT(*driverp == NULL, ("bpfattach2: driverp already initialized")); + bp->bif_hdrlen = hdrlen; bp->bif_bpf = driverp; + bp->bif_refcnt = 1; *driverp = bp; - + /* + * Reference ifnet pointer, so it won't freed until + * we release it. + */ + if_ref(ifp); BPF_LOCK(); - LIST_INSERT_HEAD(&bpf_iflist, bp, bif_next); + CK_LIST_INSERT_HEAD(&bpf_iflist, bp, bif_next); BPF_UNLOCK(); - bp->bif_hdrlen = hdrlen; - if (bootverbose && IS_DEFAULT_VNET(curvnet)) if_printf(ifp, "bpf attached\n"); } @@ -2752,98 +2852,32 @@ bpf_get_bp_params(struct bpf_if *bp, u_int *bif_dlt, u_int *bif_hdrlen) void bpfdetach(struct ifnet *ifp) { - struct bpf_if *bp, *bp_temp; - struct bpf_d *d; - int ndetached; - - ndetached = 0; + struct bpf_if *bp, *bp_temp; + struct bpf_d *d; BPF_LOCK(); /* Find all bpf_if struct's which reference ifp and detach them. */ - LIST_FOREACH_SAFE(bp, &bpf_iflist, bif_next, bp_temp) { + CK_LIST_FOREACH_SAFE(bp, &bpf_iflist, bif_next, bp_temp) { if (ifp != bp->bif_ifp) continue; - LIST_REMOVE(bp, bif_next); - /* Add to to-be-freed list */ - LIST_INSERT_HEAD(&bpf_freelist, bp, bif_next); - - ndetached++; - /* - * Delay freeing bp till interface is detached - * and all routes through this interface are removed. - * Mark bp as detached to restrict new consumers. - */ - BPFIF_WLOCK(bp); - bp->bif_flags |= BPFIF_FLAG_DYING; + CK_LIST_REMOVE(bp, bif_next); *bp->bif_bpf = (struct bpf_if *)&dead_bpf_if; - BPFIF_WUNLOCK(bp); - CTR4(KTR_NET, "%s: sheduling free for encap %d (%p) for if %p", + CTR4(KTR_NET, + "%s: sheduling free for encap %d (%p) for if %p", __func__, bp->bif_dlt, bp, ifp); - /* Free common descriptors */ - while ((d = LIST_FIRST(&bp->bif_dlist)) != NULL) { - bpf_detachd_locked(d); - BPFD_LOCK(d); - bpf_wakeup(d); - BPFD_UNLOCK(d); + /* Detach common descriptors */ + while ((d = CK_LIST_FIRST(&bp->bif_dlist)) != NULL) { + bpf_detachd_locked(d, true); } - /* Free writer-only descriptors */ - while ((d = LIST_FIRST(&bp->bif_wlist)) != NULL) { - bpf_detachd_locked(d); - BPFD_LOCK(d); - bpf_wakeup(d); - BPFD_UNLOCK(d); + /* Detach writer-only descriptors */ + while ((d = CK_LIST_FIRST(&bp->bif_wlist)) != NULL) { + bpf_detachd_locked(d, true); } - } - BPF_UNLOCK(); - -#ifdef INVARIANTS - if (ndetached == 0) - printf("bpfdetach: %s was not attached\n", ifp->if_xname); -#endif -} - -/* - * Interface departure handler. - * Note departure event does not guarantee interface is going down. - * Interface renaming is currently done via departure/arrival event set. - * - * Departure handled is called after all routes pointing to - * given interface are removed and interface is in down state - * restricting any packets to be sent/received. We assume it is now safe - * to free data allocated by BPF. - */ -static void -bpf_ifdetach(void *arg __unused, struct ifnet *ifp) -{ - struct bpf_if *bp, *bp_temp; - int nmatched = 0; - - /* Ignore ifnet renaming. */ - if (ifp->if_flags & IFF_RENAMING) - return; - - BPF_LOCK(); - /* - * Find matching entries in free list. - * Nothing should be found if bpfdetach() was not called. - */ - LIST_FOREACH_SAFE(bp, &bpf_freelist, bif_next, bp_temp) { - if (ifp != bp->bif_ifp) - continue; - - CTR3(KTR_NET, "%s: freeing BPF instance %p for interface %p", - __func__, bp, ifp); - - LIST_REMOVE(bp, bif_next); - - rw_destroy(&bp->bif_lock); - free(bp, M_BPF); - - nmatched++; + bpfif_rele(bp); } BPF_UNLOCK(); } @@ -2862,9 +2896,8 @@ bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *bfl) BPF_LOCK_ASSERT(); ifp = d->bd_bif->bif_ifp; -again: n1 = 0; - LIST_FOREACH(bp, &bpf_iflist, bif_next) { + CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) { if (bp->bif_ifp == ifp) n1++; } @@ -2874,24 +2907,16 @@ again: } if (n1 > bfl->bfl_len) return (ENOMEM); - BPF_UNLOCK(); + lst = malloc(n1 * sizeof(u_int), M_TEMP, M_WAITOK); n = 0; - BPF_LOCK(); - LIST_FOREACH(bp, &bpf_iflist, bif_next) { + CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) { if (bp->bif_ifp != ifp) continue; - if (n >= n1) { - free(lst, M_TEMP); - goto again; - } - lst[n] = bp->bif_dlt; - n++; + lst[n++] = bp->bif_dlt; } - BPF_UNLOCK(); error = copyout(lst, bfl->bfl_list, sizeof(u_int) * n); free(lst, M_TEMP); - BPF_LOCK(); bfl->bfl_len = n; return (error); } @@ -2907,33 +2932,34 @@ bpf_setdlt(struct bpf_d *d, u_int dlt) struct bpf_if *bp; BPF_LOCK_ASSERT(); + MPASS(d->bd_bif != NULL); + /* + * It is safe to check bd_bif without BPFD_LOCK, it can not be + * changed while we hold global lock. + */ if (d->bd_bif->bif_dlt == dlt) return (0); - ifp = d->bd_bif->bif_ifp; - LIST_FOREACH(bp, &bpf_iflist, bif_next) { + ifp = d->bd_bif->bif_ifp; + CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) { if (bp->bif_ifp == ifp && bp->bif_dlt == dlt) break; } + if (bp == NULL) + return (EINVAL); - if (bp != NULL) { - opromisc = d->bd_promisc; - bpf_attachd(d, bp); - BPFD_LOCK(d); - reset_d(d); - BPFD_UNLOCK(d); - if (opromisc) { - error = ifpromisc(bp->bif_ifp, 1); - if (error) - if_printf(bp->bif_ifp, - "bpf_setdlt: ifpromisc failed (%d)\n", - error); - else - d->bd_promisc = 1; - } + opromisc = d->bd_promisc; + bpf_attachd(d, bp); + if (opromisc) { + error = ifpromisc(bp->bif_ifp, 1); + if (error) + if_printf(bp->bif_ifp, "%s: ifpromisc failed (%d)\n", + __func__, error); + else + d->bd_promisc = 1; } - return (bp == NULL ? EINVAL : 0); + return (0); } #ifdef __rtems__ static struct bpf_d * @@ -2973,7 +2999,7 @@ bpf_imfs_readv(rtems_libio_t *iop, const struct iovec *iov, int iovcnt, ssize_t struct bpf_d *d = bpf_imfs_get_context_by_iop(iop); struct thread *td = rtems_bsd_get_curthread_or_null(); struct uio uio = { - .uio_iov = iov, + .uio_iov = RTEMS_DECONST(struct iovec *, iov), .uio_iovcnt = iovcnt, .uio_offset = 0, .uio_resid = total, @@ -3014,7 +3040,7 @@ bpf_imfs_writev(rtems_libio_t *iop, const struct iovec *iov, int iovcnt, ssize_t struct bpf_d *d = bpf_imfs_get_context_by_iop(iop); struct thread *td = rtems_bsd_get_curthread_or_null(); struct uio uio = { - .uio_iov = iov, + .uio_iov = RTEMS_DECONST(struct iovec *, iov), .uio_iovcnt = iovcnt, .uio_offset = 0, .uio_resid = total, @@ -3042,7 +3068,7 @@ static ssize_t bpf_imfs_write(rtems_libio_t *iop, const void *buffer, size_t count) { struct iovec iov = { - .iov_base = buffer, + .iov_base = RTEMS_DECONST(void *, buffer), .iov_len = count }; @@ -3115,24 +3141,23 @@ bpf_drvinit(void *unused) #endif /* __rtems__ */ sx_init(&bpf_sx, "bpf global lock"); - LIST_INIT(&bpf_iflist); - LIST_INIT(&bpf_freelist); + CK_LIST_INIT(&bpf_iflist); #ifndef __rtems__ dev = make_dev(&bpf_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "bpf"); /* For compatibility */ make_dev_alias(dev, "bpf0"); -#else /* __rtems__ */ - rv = IMFS_make_generic_node("/dev/bpf", mode, &bpf_imfs_control, NULL); - BSD_ASSERT(rv == 0); - rv = symlink("/dev/bpf", "/dev/bpf0"); - BSD_ASSERT(rv == 0); -#endif /* __rtems__ */ /* Register interface departure handler */ bpf_ifdetach_cookie = EVENTHANDLER_REGISTER( ifnet_departure_event, bpf_ifdetach, NULL, EVENTHANDLER_PRI_ANY); +#else /* __rtems__ */ + rv = IMFS_make_generic_node("/dev/bpf", mode, &bpf_imfs_control, NULL); + BSD_ASSERT(rv == 0); + rv = symlink("/dev/bpf", "/dev/bpf0"); + BSD_ASSERT(rv == 0); +#endif /* __rtems__ */ } /* @@ -3147,19 +3172,19 @@ bpf_zero_counters(void) struct bpf_d *bd; BPF_LOCK(); - LIST_FOREACH(bp, &bpf_iflist, bif_next) { - BPFIF_RLOCK(bp); - LIST_FOREACH(bd, &bp->bif_dlist, bd_next) { - BPFD_LOCK(bd); + /* + * We are protected by global lock here, interfaces and + * descriptors can not be deleted while we hold it. + */ + CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) { + CK_LIST_FOREACH(bd, &bp->bif_dlist, bd_next) { counter_u64_zero(bd->bd_rcount); counter_u64_zero(bd->bd_dcount); counter_u64_zero(bd->bd_fcount); counter_u64_zero(bd->bd_wcount); counter_u64_zero(bd->bd_wfcount); counter_u64_zero(bd->bd_zcopy); - BPFD_UNLOCK(bd); } - BPFIF_RUNLOCK(bp); } BPF_UNLOCK(); } @@ -3171,10 +3196,9 @@ static void bpfstats_fill_xbpf(struct xbpf_d *d, struct bpf_d *bd) { + BPF_LOCK_ASSERT(); bzero(d, sizeof(*d)); - BPFD_LOCK_ASSERT(bd); d->bd_structsize = sizeof(*d); - /* XXX: reading should be protected by global lock */ d->bd_immediate = bd->bd_immediate; d->bd_promisc = bd->bd_promisc; d->bd_hdrcmplt = bd->bd_hdrcmplt; @@ -3251,22 +3275,16 @@ bpf_stats_sysctl(SYSCTL_HANDLER_ARGS) return (ENOMEM); } index = 0; - LIST_FOREACH(bp, &bpf_iflist, bif_next) { - BPFIF_RLOCK(bp); + CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) { /* Send writers-only first */ - LIST_FOREACH(bd, &bp->bif_wlist, bd_next) { + CK_LIST_FOREACH(bd, &bp->bif_wlist, bd_next) { xbd = &xbdbuf[index++]; - BPFD_LOCK(bd); bpfstats_fill_xbpf(xbd, bd); - BPFD_UNLOCK(bd); } - LIST_FOREACH(bd, &bp->bif_dlist, bd_next) { + CK_LIST_FOREACH(bd, &bp->bif_dlist, bd_next) { xbd = &xbdbuf[index++]; - BPFD_LOCK(bd); bpfstats_fill_xbpf(xbd, bd); - BPFD_UNLOCK(bd); } - BPFIF_RUNLOCK(bp); } BPF_UNLOCK(); error = SYSCTL_OUT(req, xbdbuf, index * sizeof(*xbd)); @@ -3346,10 +3364,10 @@ bpf_show_bpf_if(struct bpf_if *bpf_if) /* bif_ext.bif_dlist */ BPF_DB_PRINTF("%#x", bif_dlt); BPF_DB_PRINTF("%u", bif_hdrlen); - BPF_DB_PRINTF("%p", bif_ifp); - /* bif_lock */ /* bif_wlist */ - BPF_DB_PRINTF("%#x", bif_flags); + BPF_DB_PRINTF("%p", bif_ifp); + BPF_DB_PRINTF("%p", bif_bpf); + BPF_DB_PRINTF("%u", bif_refcnt); } DB_SHOW_COMMAND(bpf_if, db_show_bpf_if) diff --git a/freebsd/sys/net/bpf.h b/freebsd/sys/net/bpf.h index d8eb7ff4..55b03b54 100644 --- a/freebsd/sys/net/bpf.h +++ b/freebsd/sys/net/bpf.h @@ -42,6 +42,10 @@ #ifndef _NET_BPF_H_ #define _NET_BPF_H_ +#include <sys/_eventhandler.h> +#include <sys/ck.h> +#include <net/dlt.h> + #if defined(__rtems__) && !defined(__FreeBSD__) #define __FreeBSD__ 1 #endif /* defined(__rtems__) && !defined(__FreeBSD__) */ @@ -236,9 +240,6 @@ struct bpf_zbuf_header { u_int _bzh_pad[5]; }; -/* Pull in data-link level type codes. */ -#include <net/dlt.h> - /* * The instruction encodings. * @@ -412,10 +413,11 @@ SYSCTL_DECL(_net_bpf); * bpf_peers_present() calls. */ struct bpf_if; +CK_LIST_HEAD(bpfd_list, bpf_d); struct bpf_if_ext { - LIST_ENTRY(bpf_if) bif_next; /* list of all interfaces */ - LIST_HEAD(, bpf_d) bif_dlist; /* descriptor list */ + CK_LIST_ENTRY(bpf_if) bif_next; /* list of all interfaces */ + struct bpfd_list bif_dlist; /* descriptor list */ }; void bpf_bufheld(struct bpf_d *d); @@ -439,7 +441,7 @@ bpf_peers_present(struct bpf_if *bpf) struct bpf_if_ext *ext; ext = (struct bpf_if_ext *)bpf; - if (!LIST_EMPTY(&ext->bif_dlist)) + if (!CK_LIST_EMPTY(&ext->bif_dlist)) return (1); return (0); } @@ -467,12 +469,10 @@ bpf_peers_present(struct bpf_if *bpf) */ #define BPF_MEMWORDS 16 -#ifdef _SYS_EVENTHANDLER_H_ /* BPF attach/detach events */ struct ifnet; typedef void (*bpf_track_fn)(void *, struct ifnet *, int /* dlt */, int /* 1 =>'s attach */); EVENTHANDLER_DECLARE(bpf_track, bpf_track_fn); -#endif /* _SYS_EVENTHANDLER_H_ */ #endif /* _NET_BPF_H_ */ diff --git a/freebsd/sys/net/bpf_buffer.c b/freebsd/sys/net/bpf_buffer.c index 7a182a61..daa9e267 100644 --- a/freebsd/sys/net/bpf_buffer.c +++ b/freebsd/sys/net/bpf_buffer.c @@ -71,8 +71,10 @@ __FBSDID("$FreeBSD$"); #include <rtems/bsd/local/opt_bpf.h> #include <sys/param.h> +#include <sys/lock.h> #include <sys/malloc.h> #include <sys/mbuf.h> +#include <sys/mutex.h> #include <sys/socket.h> #include <sys/uio.h> #include <sys/kernel.h> @@ -119,19 +121,10 @@ bpf_buffer_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset, void *src, { const struct mbuf *m; u_char *dst; - u_int count; m = (struct mbuf *)src; dst = (u_char *)buf + offset; - while (len > 0) { - if (m == NULL) - panic("bpf_mcopy"); - count = min(m->m_len, len); - bcopy(mtod(m, void *), dst, count); - m = m->m_next; - dst += count; - len -= count; - } + m_copydata(m, 0, len, dst); } /* diff --git a/freebsd/sys/net/bpfdesc.h b/freebsd/sys/net/bpfdesc.h index 2ce9204b..c28a74f9 100644 --- a/freebsd/sys/net/bpfdesc.h +++ b/freebsd/sys/net/bpfdesc.h @@ -43,9 +43,10 @@ #include <sys/callout.h> #include <sys/selinfo.h> -#include <sys/queue.h> +#include <sys/ck.h> #include <sys/conf.h> #include <sys/counter.h> +#include <sys/epoch.h> #include <net/if.h> /* @@ -53,7 +54,7 @@ */ struct zbuf; struct bpf_d { - LIST_ENTRY(bpf_d) bd_next; /* Linked list of descriptors */ + CK_LIST_ENTRY(bpf_d) bd_next; /* Linked list of descriptors */ /* * Buffer slots: two memory buffers store the incoming packets. * The model has three slots. Sbuf is always occupied. @@ -106,6 +107,9 @@ struct bpf_d { counter_u64_t bd_wdcount; /* number of packets dropped during a write */ counter_u64_t bd_zcopy; /* number of zero copy operations */ u_char bd_compat32; /* 32-bit stream on LP64 system */ + + volatile u_int bd_refcnt; + struct epoch_context epoch_ctx; }; /* Values for bd_state */ diff --git a/freebsd/sys/net/bridgestp.c b/freebsd/sys/net/bridgestp.c index 49e772b3..424f4d69 100644 --- a/freebsd/sys/net/bridgestp.c +++ b/freebsd/sys/net/bridgestp.c @@ -2024,6 +2024,7 @@ bstp_same_bridgeid(uint64_t id1, uint64_t id2) void bstp_reinit(struct bstp_state *bs) { + struct epoch_tracker et; struct bstp_port *bp; struct ifnet *ifp, *mif; u_char *e_addr; @@ -2044,7 +2045,7 @@ bstp_reinit(struct bstp_state *bs) * from is part of this bridge, so we can have more than one independent * bridges in the same STP domain. */ - IFNET_RLOCK_NOSLEEP(); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (ifp->if_type != IFT_ETHER) continue; /* Not Ethernet */ @@ -2064,7 +2065,7 @@ bstp_reinit(struct bstp_state *bs) continue; } } - IFNET_RUNLOCK_NOSLEEP(); + NET_EPOCH_EXIT(et); if (mif == NULL) goto disablestp; @@ -2275,4 +2276,7 @@ bstp_destroy(struct bstp_port *bp) taskqueue_drain(taskqueue_swi, &bp->bp_statetask); taskqueue_drain(taskqueue_swi, &bp->bp_rtagetask); taskqueue_drain(taskqueue_swi, &bp->bp_mediatask); + + if (bp->bp_bs->bs_root_port == bp) + bstp_assign_roles(bp->bp_bs); } diff --git a/freebsd/sys/net/ethernet.h b/freebsd/sys/net/ethernet.h index fa75c1df..7ceb9b80 100644 --- a/freebsd/sys/net/ethernet.h +++ b/freebsd/sys/net/ethernet.h @@ -401,6 +401,8 @@ struct ether_vlan_header { #ifdef _KERNEL +#include <sys/_eventhandler.h> + struct ifnet; struct mbuf; struct route; @@ -422,12 +424,11 @@ void ether_vlan_mtap(struct bpf_if *, struct mbuf *, struct mbuf *ether_vlanencap(struct mbuf *, uint16_t); bool ether_8021q_frame(struct mbuf **mp, struct ifnet *ife, struct ifnet *p, uint16_t vid, uint8_t pcp); +void ether_gen_addr(struct ifnet *ifp, struct ether_addr *hwaddr); -#ifdef _SYS_EVENTHANDLER_H_ /* new ethernet interface attached event */ typedef void (*ether_ifattach_event_handler_t)(void *, struct ifnet *); EVENTHANDLER_DECLARE(ether_ifattach_event, ether_ifattach_event_handler_t); -#endif #else /* _KERNEL */ diff --git a/freebsd/sys/net/ieee8023ad_lacp.c b/freebsd/sys/net/ieee8023ad_lacp.c index 9a70d6a1..46076a23 100644 --- a/freebsd/sys/net/ieee8023ad_lacp.c +++ b/freebsd/sys/net/ieee8023ad_lacp.c @@ -34,6 +34,7 @@ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); +#include <rtems/bsd/local/opt_kern_tls.h> #include <rtems/bsd/local/opt_ratelimit.h> #include <sys/param.h> @@ -837,7 +838,9 @@ lacp_select_tx_port(struct lagg_softc *sc, struct mbuf *m) struct lacp_softc *lsc = LACP_SOFTC(sc); struct lacp_portmap *pm; struct lacp_port *lp; + struct lacp_port **map; uint32_t hash; + int count; if (__predict_false(lsc->lsc_suppress_distributing)) { LACP_DPRINTF((NULL, "%s: waiting transit\n", __func__)); @@ -850,13 +853,31 @@ lacp_select_tx_port(struct lagg_softc *sc, struct mbuf *m) return (NULL); } +#ifdef NUMA + if ((sc->sc_opts & LAGG_OPT_USE_NUMA) && + pm->pm_num_dom > 1 && m->m_pkthdr.numa_domain < MAXMEMDOM) { + count = pm->pm_numa[m->m_pkthdr.numa_domain].count; + if (count > 0) { + map = pm->pm_numa[m->m_pkthdr.numa_domain].map; + } else { + /* No ports on this domain; use global hash. */ + map = pm->pm_map; + count = pm->pm_count; + } + } else +#endif + { + map = pm->pm_map; + count = pm->pm_count; + } if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) && M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) hash = m->m_pkthdr.flowid >> sc->flowid_shift; else hash = m_ether_tcpip_hash(sc->sc_flags, m, lsc->lsc_hashkey); - hash %= pm->pm_count; - lp = pm->pm_map[hash]; + + hash %= count; + lp = map[hash]; KASSERT((lp->lp_state & LACP_STATE_DISTRIBUTING) != 0, ("aggregated port is not distributing")); @@ -864,7 +885,7 @@ lacp_select_tx_port(struct lagg_softc *sc, struct mbuf *m) return (lp->lp_lagg); } -#ifdef RATELIMIT +#if defined(RATELIMIT) || defined(KERN_TLS) struct lagg_port * lacp_select_tx_port_by_hash(struct lagg_softc *sc, uint32_t flowid) { @@ -1046,6 +1067,10 @@ lacp_update_portmap(struct lacp_softc *lsc) uint64_t speed; u_int newmap; int i; +#ifdef NUMA + int count; + uint8_t domain; +#endif newmap = lsc->lsc_activemap == 0 ? 1 : 0; p = &lsc->lsc_pmap[newmap]; @@ -1056,9 +1081,25 @@ lacp_update_portmap(struct lacp_softc *lsc) if (la != NULL && la->la_nports > 0) { p->pm_count = la->la_nports; i = 0; - TAILQ_FOREACH(lp, &la->la_ports, lp_dist_q) + TAILQ_FOREACH(lp, &la->la_ports, lp_dist_q) { p->pm_map[i++] = lp; +#ifdef NUMA + domain = lp->lp_ifp->if_numa_domain; + if (domain >= MAXMEMDOM) + continue; + count = p->pm_numa[domain].count; + p->pm_numa[domain].map[count] = lp; + p->pm_numa[domain].count++; +#endif + } KASSERT(i == p->pm_count, ("Invalid port count")); + +#ifdef NUMA + for (i = 0; i < MAXMEMDOM; i++) { + if (p->pm_numa[i].count != 0) + p->pm_num_dom++; + } +#endif speed = lacp_aggregator_bandwidth(la); } sc->sc_ifp->if_baudrate = speed; diff --git a/freebsd/sys/net/ieee8023ad_lacp.h b/freebsd/sys/net/ieee8023ad_lacp.h index 5ae48ceb..b6a0860f 100644 --- a/freebsd/sys/net/ieee8023ad_lacp.h +++ b/freebsd/sys/net/ieee8023ad_lacp.h @@ -197,8 +197,15 @@ enum lacp_mux_state { #define LACP_MAX_PORTS 32 +struct lacp_numa { + int count; + struct lacp_port *map[LACP_MAX_PORTS]; +}; + struct lacp_portmap { int pm_count; + int pm_num_dom; + struct lacp_numa pm_numa[MAXMEMDOM]; struct lacp_port *pm_map[LACP_MAX_PORTS]; }; @@ -286,7 +293,7 @@ struct lacp_softc { struct mbuf *lacp_input(struct lagg_port *, struct mbuf *); struct lagg_port *lacp_select_tx_port(struct lagg_softc *, struct mbuf *); -#ifdef RATELIMIT +#if defined(RATELIMIT) || defined(KERN_TLS) struct lagg_port *lacp_select_tx_port_by_hash(struct lagg_softc *, uint32_t); #endif void lacp_attach(struct lagg_softc *); diff --git a/freebsd/sys/net/ieee_oui.h b/freebsd/sys/net/ieee_oui.h new file mode 100644 index 00000000..068328d8 --- /dev/null +++ b/freebsd/sys/net/ieee_oui.h @@ -0,0 +1,85 @@ +/* - + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 The FreeBSD Foundation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + * + * Author: George V. Neville-Neil + * + */ + +/* Organizationally Unique Identifier assigned by IEEE 14 Nov 2013 */ +#define OUI_FREEBSD_BASE 0x589cfc000000 +#define OUI_FREEBSD(nic) (OUI_FREEBSD_BASE | (nic)) + +/* + * OUIs are most often used to uniquely identify network interfaces + * and occupy the first 3 bytes of both destination and source MAC + * addresses. The following allocations exist so that various + * software systems associated with FreeBSD can have unique IDs in the + * absence of hardware. The use of OUIs for this purpose is not fully + * fleshed out but is now in common use in virtualization technology. + * + * Allocations from this range are expected to be made using COMMON + * SENSE by developers. Do NOT take a large range just because + * they're currently wide open. Take the smallest useful range for + * your system. We have (2^24 - 2) available addresses (see Reserved + * Values below) but that is far from infinite. + * + * In the event of a conflict arbitration of allocation in this file + * is subject to core@ approval. + * + * Applications are differentiated based on the high order bit(s) of + * the remaining three bytes. Our first allocation has all 0s, the + * next allocation has the highest bit set. Allocating in this way + * gives us 254 allocations of 64K addresses. Address blocks can be + * concatenated if necessary. + * + * Reserved Values: 0x000000 and 0xffffff are reserved and MUST NOT BE + * allocated for any reason. + */ + +/* Allocate 20 bits to bhyve */ +#define OUI_FREEBSD_BHYVE_LOW OUI_FREEBSD(0x000001) +#define OUI_FREEBSD_BHYVE_HIGH OUI_FREEBSD(0x0fffff) + +/* + * Allocate 16 bits for a pool to give to various interfaces that need a + * generated address, but don't quite need to slice off a whole section of + * the OUI (e.g. cloned interfaces, one-off NICs of various vendors). + * + * ether_gen_addr should be used to generate an address from this pool. + */ +#define OUI_FREEBSD_GENERATED_MASK 0x10ffff +#define OUI_FREEBSD_GENERATED_LOW OUI_FREEBSD(0x100000) +#define OUI_FREEBSD_GENERATED_HIGH OUI_FREEBSD(OUI_FREEBSD_GENERATED_MASK) + +/* Allocate 16 bits for emulated NVMe devices */ +#define OUI_FREEBSD_NVME_MASK 0x20ffff +#define OUI_FREEBSD_NVME_LOW OUI_FREEBSD(0x200000) +#define OUI_FREEBSD_NVME_HIGH OUI_FREEBSD(OUI_FREEBSD_NVME_MASK) diff --git a/freebsd/sys/net/if.c b/freebsd/sys/net/if.c index 9d233444..c1fd928e 100644 --- a/freebsd/sys/net/if.c +++ b/freebsd/sys/net/if.c @@ -38,9 +38,10 @@ #include <rtems/bsd/local/opt_inet.h> #include <sys/param.h> -#include <sys/types.h> #include <sys/conf.h> +#include <sys/eventhandler.h> #include <sys/malloc.h> +#include <sys/domainset.h> #include <sys/sbuf.h> #include <sys/bus.h> #include <sys/epoch.h> @@ -175,14 +176,14 @@ struct ifmediareq32 { #define SIOCGIFXMEDIA32 _IOC_NEWTYPE(SIOCGIFXMEDIA, struct ifmediareq32) #define _CASE_IOC_IFGROUPREQ_32(cmd) \ - case _IOC_NEWTYPE((cmd), struct ifgroupreq32): + _IOC_NEWTYPE((cmd), struct ifgroupreq32): case #else /* !COMPAT_FREEBSD32 */ #define _CASE_IOC_IFGROUPREQ_32(cmd) #endif /* !COMPAT_FREEBSD32 */ #define CASE_IOC_IFGROUPREQ(cmd) \ _CASE_IOC_IFGROUPREQ_32(cmd) \ - case (cmd) + (cmd) union ifreq_union { struct ifreq ifr; @@ -270,7 +271,6 @@ static void if_route(struct ifnet *, int flag, int fam); static int if_setflag(struct ifnet *, int, int, int *, int); static int if_transmit(struct ifnet *ifp, struct mbuf *m); static void if_unroute(struct ifnet *, int flag, int fam); -static void link_rtrequest(int, struct rtentry *, struct rt_addrinfo *); static int if_delmulti_locked(struct ifnet *, struct ifmultiaddr *, int); static void do_link_state_change(void *, int); static int if_getgroup(struct ifgroupreq *, struct ifnet *); @@ -358,16 +358,17 @@ ifnet_byindex(u_short idx) struct ifnet * ifnet_byindex_ref(u_short idx) { + struct epoch_tracker et; struct ifnet *ifp; - IFNET_RLOCK_NOSLEEP(); + NET_EPOCH_ENTER(et); ifp = ifnet_byindex_locked(idx); if (ifp == NULL || (ifp->if_flags & IFF_DYING)) { - IFNET_RUNLOCK_NOSLEEP(); + NET_EPOCH_EXIT(et); return (NULL); } if_ref(ifp); - IFNET_RUNLOCK_NOSLEEP(); + NET_EPOCH_EXIT(et); return (ifp); } @@ -431,14 +432,15 @@ ifnet_setbyindex(u_short idx, struct ifnet *ifp) struct ifaddr * ifaddr_byindex(u_short idx) { + struct epoch_tracker et; struct ifnet *ifp; struct ifaddr *ifa = NULL; - IFNET_RLOCK_NOSLEEP(); + NET_EPOCH_ENTER(et); ifp = ifnet_byindex_locked(idx); if (ifp != NULL && (ifa = ifp->if_addr) != NULL) ifa_ref(ifa); - IFNET_RUNLOCK_NOSLEEP(); + NET_EPOCH_EXIT(et); return (ifa); } @@ -531,13 +533,23 @@ if_grow(void) * registered for the passed type. */ struct ifnet * -if_alloc(u_char type) +if_alloc_domain(u_char type, int numa_domain) { struct ifnet *ifp; u_short idx; void *old; - ifp = malloc(sizeof(struct ifnet), M_IFNET, M_WAITOK|M_ZERO); +#ifndef __rtems__ + KASSERT(numa_domain <= IF_NODOM, ("numa_domain too large")); + if (numa_domain == IF_NODOM) +#endif /* __rtems__ */ + ifp = malloc(sizeof(struct ifnet), M_IFNET, + M_WAITOK | M_ZERO); +#ifndef __rtems__ + else + ifp = malloc_domainset(sizeof(struct ifnet), M_IFNET, + DOMAINSET_PREF(numa_domain), M_WAITOK | M_ZERO); +#endif /* __rtems__ */ restart: IFNET_WLOCK(); idx = ifindex_alloc(&old); @@ -552,6 +564,9 @@ if_alloc(u_char type) ifp->if_index = idx; ifp->if_type = type; ifp->if_alloctype = type; +#ifndef __rtems__ + ifp->if_numa_domain = numa_domain; +#endif /* __rtems__ */ #ifdef VIMAGE ifp->if_vnet = curvnet; #endif @@ -585,6 +600,22 @@ if_alloc(u_char type) return (ifp); } +struct ifnet * +if_alloc_dev(u_char type, device_t dev) +{ + int numa_domain; + + if (dev == NULL || bus_get_domain(dev, &numa_domain) != 0) + return (if_alloc_domain(type, IF_NODOM)); + return (if_alloc_domain(type, numa_domain)); +} + +struct ifnet * +if_alloc(u_char type) +{ + + return (if_alloc_domain(type, IF_NODOM)); +} /* * Do the actual work of freeing a struct ifnet, and layer 2 common * structure. This call is made when the last reference to an @@ -613,7 +644,14 @@ if_free_internal(struct ifnet *ifp) free(ifp->if_description, M_IFDESCR); free(ifp->if_hw_addr, M_IFADDR); - free(ifp, M_IFNET); +#ifndef __rtems__ + if (ifp->if_numa_domain == IF_NODOM) +#endif /* __rtems__ */ + free(ifp, M_IFNET); +#ifndef __rtems__ + else + free_domain(ifp, M_IFNET); +#endif /* __rtems__ */ } static void @@ -840,7 +878,6 @@ if_attach_internal(struct ifnet *ifp, int vmove, struct if_clone *ifc) sdl->sdl_type = ifp->if_type; ifp->if_addr = ifa; ifa->ifa_ifp = ifp; - ifa->ifa_rtrequest = link_rtrequest; ifa->ifa_addr = (struct sockaddr *)sdl; sdl = (struct sockaddr_dl *)(socksize + (caddr_t)sdl); ifa->ifa_netmask = (struct sockaddr *)sdl; @@ -976,12 +1013,14 @@ if_purgeaddrs(struct ifnet *ifp) struct ifaddr *ifa; while (1) { - NET_EPOCH_ENTER(); + struct epoch_tracker et; + + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_LINK) break; } - NET_EPOCH_EXIT(); + NET_EPOCH_EXIT(et); if (ifa == NULL) break; @@ -1107,6 +1146,15 @@ if_detach_internal(struct ifnet *ifp, int vmove, struct if_clone **ifcp) curvnet->vnet_ifcnt--; #endif epoch_wait_preempt(net_epoch_preempt); + + /* + * Ensure all pending EPOCH(9) callbacks have been executed. This + * fixes issues about late destruction of multicast options + * which lead to leave group calls, which in turn access the + * belonging ifnet structure: + */ + epoch_drain_callbacks(net_epoch_preempt); + /* * In any case (destroy or vmove) detach us from the groups * and remove/wait for pending events on the taskq. @@ -1618,38 +1666,39 @@ ifgr_groups_get(void *ifgrp) static int if_getgroup(struct ifgroupreq *ifgr, struct ifnet *ifp) { + struct epoch_tracker et; int len, error; struct ifg_list *ifgl; struct ifg_req ifgrq, *ifgp; if (ifgr->ifgr_len == 0) { - IF_ADDR_RLOCK(ifp); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) ifgr->ifgr_len += sizeof(struct ifg_req); - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); return (0); } len = ifgr->ifgr_len; ifgp = ifgr_groups_get(ifgr); /* XXX: wire */ - IF_ADDR_RLOCK(ifp); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) { if (len < sizeof(ifgrq)) { - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); return (EINVAL); } bzero(&ifgrq, sizeof ifgrq); strlcpy(ifgrq.ifgrq_group, ifgl->ifgl_group->ifg_group, sizeof(ifgrq.ifgrq_group)); if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) { - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); return (error); } len -= sizeof(ifgrq); ifgp++; } - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); return (0); } @@ -1869,6 +1918,7 @@ static int ifa_maintain_loopback_route(int cmd, const char *otype, struct ifaddr *ifa, struct sockaddr *ia) { + struct epoch_tracker et; int error; struct rt_addrinfo info; struct sockaddr_dl null_sdl; @@ -1879,6 +1929,16 @@ ifa_maintain_loopback_route(int cmd, const char *otype, struct ifaddr *ifa, bzero(&info, sizeof(info)); if (cmd != RTM_DELETE) info.rti_ifp = V_loif; + if (cmd == RTM_ADD) { + /* explicitly specify (loopback) ifa */ + if (info.rti_ifp != NULL) { + NET_EPOCH_ENTER(et); + info.rti_ifa = ifaof_ifpforaddr(ifa->ifa_addr, info.rti_ifp); + if (info.rti_ifa != NULL) + ifa_ref(info.rti_ifa); + NET_EPOCH_EXIT(et); + } + } info.rti_flags = ifa->ifa_flags | RTF_HOST | RTF_STATIC | RTF_PINNED; info.rti_info[RTAX_DST] = ia; info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&null_sdl; @@ -1963,11 +2023,12 @@ done: int ifa_ifwithaddr_check(const struct sockaddr *addr) { + struct epoch_tracker et; int rc; - NET_EPOCH_ENTER(); + NET_EPOCH_ENTER(et); rc = (ifa_ifwithaddr(addr) != NULL); - NET_EPOCH_EXIT(); + NET_EPOCH_EXIT(et); return (rc); } @@ -2057,9 +2118,7 @@ ifa_ifwithnet(const struct sockaddr *addr, int ignore_ptp, int fibnum) /* * Scan though each interface, looking for ones that have addresses - * in this address family and the requested fib. Maintain a reference - * on ifa_maybe once we find one, as we release the IF_ADDR_RLOCK() that - * kept it stable when we move onto the next interface. + * in this address family and the requested fib. */ CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum)) @@ -2188,38 +2247,6 @@ ifa_preferred(struct ifaddr *cur, struct ifaddr *next) ((*carp_master_p)(next) && !(*carp_master_p)(cur)))); } -#include <net/if_llatbl.h> - -/* - * Default action when installing a route with a Link Level gateway. - * Lookup an appropriate real ifa to point to. - * This should be moved to /sys/net/link.c eventually. - */ -static void -link_rtrequest(int cmd, struct rtentry *rt, struct rt_addrinfo *info) -{ - struct ifaddr *ifa, *oifa; - struct sockaddr *dst; - struct ifnet *ifp; - - if (cmd != RTM_ADD || ((ifa = rt->rt_ifa) == NULL) || - ((ifp = ifa->ifa_ifp) == NULL) || ((dst = rt_key(rt)) == NULL)) - return; - NET_EPOCH_ENTER(); - ifa = ifaof_ifpforaddr(dst, ifp); - if (ifa) { - oifa = rt->rt_ifa; - if (oifa != ifa) { - ifa_free(oifa); - ifa_ref(ifa); - } - rt->rt_ifa = ifa; - if (ifa->ifa_rtrequest && ifa->ifa_rtrequest != link_rtrequest) - ifa->ifa_rtrequest(cmd, rt, info); - } - NET_EPOCH_EXIT(); -} - struct sockaddr_dl * link_alloc_sdl(size_t size, int flags) { @@ -2418,9 +2445,10 @@ if_qflush(struct ifnet *ifp) struct ifnet * ifunit_ref(const char *name) { + struct epoch_tracker et; struct ifnet *ifp; - IFNET_RLOCK_NOSLEEP(); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0 && !(ifp->if_flags & IFF_DYING)) @@ -2428,21 +2456,22 @@ ifunit_ref(const char *name) } if (ifp != NULL) if_ref(ifp); - IFNET_RUNLOCK_NOSLEEP(); + NET_EPOCH_EXIT(et); return (ifp); } struct ifnet * ifunit(const char *name) { + struct epoch_tracker et; struct ifnet *ifp; - IFNET_RLOCK_NOSLEEP(); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0) break; } - IFNET_RUNLOCK_NOSLEEP(); + NET_EPOCH_EXIT(et); return (ifp); } @@ -2706,6 +2735,8 @@ ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) if (strlen(new_name) == IFNAMSIZ-1) return (EINVAL); } + if (strcmp(new_name, ifp->if_xname) == 0) + break; if (ifunit(new_name) != NULL) return (EEXIST); @@ -2830,6 +2861,7 @@ ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) return (EINVAL); if (cmd == SIOCADDMULTI) { + struct epoch_tracker et; struct ifmultiaddr *ifma; /* @@ -2839,9 +2871,9 @@ ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) * lose a race while we check if the membership * already exists. */ - IF_ADDR_RLOCK(ifp); + NET_EPOCH_ENTER(et); ifma = if_findmulti(ifp, &ifr->ifr_addr); - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); if (ifma != NULL) error = EADDRINUSE; else @@ -2878,6 +2910,7 @@ ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) case SIOCGIFGENERIC: case SIOCGIFRSSKEY: case SIOCGIFRSSHASH: + case SIOCGIFDOWNREASON: if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); @@ -2895,7 +2928,7 @@ ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) error = if_gethwaddr(ifp, ifr); break; - CASE_IOC_IFGROUPREQ(SIOCAIFGROUP): + case CASE_IOC_IFGROUPREQ(SIOCAIFGROUP): error = priv_check(td, PRIV_NET_ADDIFGROUP); if (error) return (error); @@ -2904,12 +2937,12 @@ ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) return (error); break; - CASE_IOC_IFGROUPREQ(SIOCGIFGROUP): + case CASE_IOC_IFGROUPREQ(SIOCGIFGROUP): if ((error = if_getgroup((struct ifgroupreq *)data, ifp))) return (error); break; - CASE_IOC_IFGROUPREQ(SIOCDIFGROUP): + case CASE_IOC_IFGROUPREQ(SIOCDIFGROUP): error = priv_check(td, PRIV_NET_DELIFGROUP); if (error) return (error); @@ -3080,7 +3113,7 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct thread *td) error = if_clone_list((struct if_clonereq *)data); goto out_noref; - CASE_IOC_IFGROUPREQ(SIOCGIFGMEMB): + case CASE_IOC_IFGROUPREQ(SIOCGIFGMEMB): error = if_getgroupmembers((struct ifgroupreq *)data); goto out_noref; @@ -3280,6 +3313,7 @@ again: IFNET_RLOCK(); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { + struct epoch_tracker et; int addrs; /* @@ -3296,7 +3330,7 @@ again: } addrs = 0; - IF_ADDR_RLOCK(ifp); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct sockaddr *sa = ifa->ifa_addr; @@ -3324,7 +3358,7 @@ again: if (sbuf_error(sb) == 0) valid_len = sbuf_len(sb); } - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); if (addrs == 0) { sbuf_bcat(sb, &ifr, sizeof(ifr)); max_len += sizeof(ifr); @@ -3631,15 +3665,16 @@ if_delmulti(struct ifnet *ifp, struct sockaddr *sa) struct ifmultiaddr *ifma; int lastref; #ifdef INVARIANTS + struct epoch_tracker et; struct ifnet *oifp; - IFNET_RLOCK_NOSLEEP(); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(oifp, &V_ifnet, if_link) if (ifp == oifp) break; if (ifp != oifp) ifp = NULL; - IFNET_RUNLOCK_NOSLEEP(); + NET_EPOCH_EXIT(et); KASSERT(ifp != NULL, ("%s: ifnet went away", __func__)); #endif @@ -3705,15 +3740,16 @@ if_delmulti_ifma_flags(struct ifmultiaddr *ifma, int flags) if (ifp == NULL) { printf("%s: ifma_ifp seems to be detached\n", __func__); } else { + struct epoch_tracker et; struct ifnet *oifp; - IFNET_RLOCK_NOSLEEP(); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(oifp, &V_ifnet, if_link) if (ifp == oifp) break; if (ifp != oifp) ifp = NULL; - IFNET_RUNLOCK_NOSLEEP(); + NET_EPOCH_EXIT(et); } #endif /* @@ -3837,10 +3873,11 @@ if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len) struct sockaddr_dl *sdl; struct ifaddr *ifa; struct ifreq ifr; + struct epoch_tracker et; int rc; rc = 0; - NET_EPOCH_ENTER(); + NET_EPOCH_ENTER(et); ifa = ifp->if_addr; if (ifa == NULL) { rc = EINVAL; @@ -3874,7 +3911,7 @@ if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len) * to re-init it in order to reprogram its * address filter. */ - NET_EPOCH_EXIT(); + NET_EPOCH_EXIT(et); if ((ifp->if_flags & IFF_UP) != 0) { if (ifp->if_ioctl) { ifp->if_flags &= ~IFF_UP; @@ -3890,7 +3927,7 @@ if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len) EVENTHANDLER_INVOKE(iflladdr_event, ifp); return (0); out: - NET_EPOCH_EXIT(); + NET_EPOCH_EXIT(et); return (rc); } @@ -4305,6 +4342,8 @@ if_getsoftc(if_t ifp) void if_setrcvif(struct mbuf *m, if_t ifp) { + + MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0); m->m_pkthdr.rcvif = (struct ifnet *)ifp; } diff --git a/freebsd/sys/net/if_arp.h b/freebsd/sys/net/if_arp.h index 070dbafe..f4c3bec2 100644 --- a/freebsd/sys/net/if_arp.h +++ b/freebsd/sys/net/if_arp.h @@ -105,8 +105,9 @@ struct arpstat { uint64_t rxrequests; /* # of ARP requests received by this host. */ uint64_t rxreplies; /* # of ARP replies received by this host. */ uint64_t received; /* # of ARP packets received by this host. */ + uint64_t txerrors; /* # of ARP requests failed to send. */ - uint64_t arp_spares[4]; /* For either the upper or lower half. */ + uint64_t arp_spares[3]; /* For either the upper or lower half. */ /* Abnormal event and error counting: */ uint64_t dropped; /* # of packets dropped waiting for a reply. */ uint64_t timeouts; /* # of times with entries removed */ diff --git a/freebsd/sys/net/if_bridge.c b/freebsd/sys/net/if_bridge.c index aa56be48..18e0e7bf 100644 --- a/freebsd/sys/net/if_bridge.c +++ b/freebsd/sys/net/if_bridge.c @@ -228,7 +228,7 @@ struct bridge_softc { struct bstp_state sc_stp; /* STP state */ uint32_t sc_brtexceeded; /* # of cache drops */ struct ifnet *sc_ifaddr; /* member mac copied from */ - u_char sc_defaddr[6]; /* Default MAC address */ + struct ether_addr sc_defaddr; /* Default MAC address */ }; VNET_DEFINE_STATIC(struct mtx, bridge_list_mtx); @@ -237,7 +237,8 @@ static eventhandler_tag bridge_detach_cookie; int bridge_rtable_prune_period = BRIDGE_RTABLE_PRUNE_PERIOD; -uma_zone_t bridge_rtnode_zone; +VNET_DEFINE_STATIC(uma_zone_t, bridge_rtnode_zone); +#define V_bridge_rtnode_zone VNET(bridge_rtnode_zone) static int bridge_clone_create(struct if_clone *, int, caddr_t); static void bridge_clone_destroy(struct ifnet *); @@ -529,6 +530,9 @@ static void vnet_bridge_init(const void *unused __unused) { + V_bridge_rtnode_zone = uma_zcreate("bridge_rtnode", + sizeof(struct bridge_rtnode), NULL, NULL, NULL, NULL, + UMA_ALIGN_PTR, 0); BRIDGE_LIST_LOCK_INIT(); LIST_INIT(&V_bridge_list); V_bridge_cloner = if_clone_simple(bridge_name, @@ -544,6 +548,7 @@ vnet_bridge_uninit(const void *unused __unused) if_clone_detach(V_bridge_cloner); V_bridge_cloner = NULL; BRIDGE_LIST_LOCK_DESTROY(); + uma_zdestroy(V_bridge_rtnode_zone); } VNET_SYSUNINIT(vnet_bridge_uninit, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_bridge_uninit, NULL); @@ -554,9 +559,6 @@ bridge_modevent(module_t mod, int type, void *data) switch (type) { case MOD_LOAD: - bridge_rtnode_zone = uma_zcreate("bridge_rtnode", - sizeof(struct bridge_rtnode), NULL, NULL, NULL, NULL, - UMA_ALIGN_PTR, 0); bridge_dn_p = bridge_dummynet; bridge_detach_cookie = EVENTHANDLER_REGISTER( ifnet_departure_event, bridge_ifdetach, NULL, @@ -565,7 +567,6 @@ bridge_modevent(module_t mod, int type, void *data) case MOD_UNLOAD: EVENTHANDLER_DEREGISTER(ifnet_departure_event, bridge_detach_cookie); - uma_zdestroy(bridge_rtnode_zone); bridge_dn_p = NULL; break; default: @@ -672,16 +673,14 @@ bridge_clone_create(struct if_clone *ifc, int unit, caddr_t params) getcredhostid(curthread->td_ucred, &hostid); do { if (fb || hostid == 0) { - arc4rand(sc->sc_defaddr, ETHER_ADDR_LEN, 1); - sc->sc_defaddr[0] &= ~1;/* clear multicast bit */ - sc->sc_defaddr[0] |= 2; /* set the LAA bit */ + ether_gen_addr(ifp, &sc->sc_defaddr); } else { - sc->sc_defaddr[0] = 0x2; - sc->sc_defaddr[1] = (hostid >> 24) & 0xff; - sc->sc_defaddr[2] = (hostid >> 16) & 0xff; - sc->sc_defaddr[3] = (hostid >> 8 ) & 0xff; - sc->sc_defaddr[4] = hostid & 0xff; - sc->sc_defaddr[5] = ifp->if_dunit & 0xff; + sc->sc_defaddr.octet[0] = 0x2; + sc->sc_defaddr.octet[1] = (hostid >> 24) & 0xff; + sc->sc_defaddr.octet[2] = (hostid >> 16) & 0xff; + sc->sc_defaddr.octet[3] = (hostid >> 8 ) & 0xff; + sc->sc_defaddr.octet[4] = hostid & 0xff; + sc->sc_defaddr.octet[5] = ifp->if_dunit & 0xff; } fb = 1; @@ -689,7 +688,7 @@ bridge_clone_create(struct if_clone *ifc, int unit, caddr_t params) BRIDGE_LIST_LOCK(); LIST_FOREACH(sc2, &V_bridge_list, sc_list) { bifp = sc2->sc_ifp; - if (memcmp(sc->sc_defaddr, + if (memcmp(sc->sc_defaddr.octet, IF_LLADDR(bifp), ETHER_ADDR_LEN) == 0) { retry = 1; break; @@ -699,7 +698,7 @@ bridge_clone_create(struct if_clone *ifc, int unit, caddr_t params) } while (retry == 1); bstp_attach(&sc->sc_stp, &bridge_ops); - ether_ifattach(ifp, sc->sc_defaddr); + ether_ifattach(ifp, sc->sc_defaddr.octet); /* Now undo some of the damage... */ ifp->if_baudrate = 0; ifp->if_type = IFT_BRIDGE; @@ -734,6 +733,9 @@ bridge_clone_destroy(struct ifnet *ifp) bridge_delete_span(sc, bif); } + /* Tear down the routing table. */ + bridge_rtable_fini(sc); + BRIDGE_UNLOCK(sc); callout_drain(&sc->sc_brcallout); @@ -746,9 +748,6 @@ bridge_clone_destroy(struct ifnet *ifp) ether_ifdetach(ifp); if_free(ifp); - /* Tear down the routing table. */ - bridge_rtable_fini(sc); - BRIDGE_LOCK_DESTROY(sc); free(sc, M_DEVBUF); } @@ -927,7 +926,7 @@ bridge_set_ifcap(struct bridge_softc *sc, struct bridge_iflist *bif, int set) { struct ifnet *ifp = bif->bif_ifp; struct ifreq ifr; - int error; + int error, mask, stuck; BRIDGE_UNLOCK_ASSERT(sc); @@ -940,10 +939,12 @@ bridge_set_ifcap(struct bridge_softc *sc, struct bridge_iflist *bif, int set) if_printf(sc->sc_ifp, "error setting capabilities on %s: %d\n", ifp->if_xname, error); - if ((ifp->if_capenable & ~set) != 0) + mask = BRIDGE_IFCAPS_MASK | BRIDGE_IFCAPS_STRIP; + stuck = ifp->if_capenable & mask & ~set; + if (stuck != 0) if_printf(sc->sc_ifp, "can't disable some capabilities on %s: 0x%x\n", - ifp->if_xname, ifp->if_capenable & ~set); + ifp->if_xname, stuck); } } @@ -1018,7 +1019,7 @@ bridge_delete_member(struct bridge_softc *sc, struct bridge_iflist *bif, */ if (V_bridge_inherit_mac && sc->sc_ifaddr == ifs) { if (LIST_EMPTY(&sc->sc_iflist)) { - bcopy(sc->sc_defaddr, + bcopy(&sc->sc_defaddr, IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN); sc->sc_ifaddr = NULL; } else { @@ -1189,7 +1190,7 @@ bridge_ioctl_add(struct bridge_softc *sc, void *arg) * the default randomly generated one. */ if (V_bridge_inherit_mac && LIST_EMPTY(&sc->sc_iflist) && - !memcmp(IF_LLADDR(sc->sc_ifp), sc->sc_defaddr, ETHER_ADDR_LEN)) { + !memcmp(IF_LLADDR(sc->sc_ifp), sc->sc_defaddr.octet, ETHER_ADDR_LEN)) { bcopy(IF_LLADDR(ifs), IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN); sc->sc_ifaddr = ifs; EVENTHANDLER_INVOKE(iflladdr_event, sc->sc_ifp); @@ -1972,9 +1973,9 @@ bridge_dummynet(struct mbuf *m, struct ifnet *ifp) return; } - if (PFIL_HOOKED(&V_inet_pfil_hook) + if (PFIL_HOOKED_OUT(V_inet_pfil_head) #ifdef INET6 - || PFIL_HOOKED(&V_inet6_pfil_hook) + || PFIL_HOOKED_OUT(V_inet6_pfil_head) #endif ) { if (bridge_pfil(&m, sc->sc_ifp, ifp, PFIL_OUT) != 0) @@ -2001,7 +2002,7 @@ bridge_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *sa, struct rtentry *rt) { struct ether_header *eh; - struct ifnet *dst_if; + struct ifnet *bifp, *dst_if; struct bridge_softc *sc; uint16_t vlan; @@ -2016,13 +2017,14 @@ bridge_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *sa, vlan = VLANTAGOF(m); BRIDGE_LOCK(sc); + bifp = sc->sc_ifp; /* * If bridge is down, but the original output interface is up, * go ahead and send out that interface. Otherwise, the packet * is dropped below. */ - if ((sc->sc_ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { + if ((bifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { dst_if = ifp; goto sendunicast; } @@ -2035,6 +2037,9 @@ bridge_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *sa, dst_if = NULL; else dst_if = bridge_rtlookup(sc, eh->ether_dhost, vlan); + /* Tap any traffic not passing back out the originating interface */ + if (dst_if != ifp) + ETHER_BPF_MTAP(bifp, m); if (dst_if == NULL) { struct bridge_iflist *bif; struct mbuf *mc; @@ -2072,7 +2077,7 @@ bridge_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *sa, } else { mc = m_copypacket(m, M_NOWAIT); if (mc == NULL) { - if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1); + if_inc_counter(bifp, IFCOUNTER_OERRORS, 1); continue; } } @@ -2232,9 +2237,9 @@ bridge_forward(struct bridge_softc *sc, struct bridge_iflist *sbif, ETHER_BPF_MTAP(ifp, m); /* run the packet filter */ - if (PFIL_HOOKED(&V_inet_pfil_hook) + if (PFIL_HOOKED_IN(V_inet_pfil_head) #ifdef INET6 - || PFIL_HOOKED(&V_inet6_pfil_hook) + || PFIL_HOOKED_IN(V_inet6_pfil_head) #endif ) { BRIDGE_UNLOCK(sc); @@ -2272,9 +2277,9 @@ bridge_forward(struct bridge_softc *sc, struct bridge_iflist *sbif, BRIDGE_UNLOCK(sc); - if (PFIL_HOOKED(&V_inet_pfil_hook) + if (PFIL_HOOKED_OUT(V_inet_pfil_head) #ifdef INET6 - || PFIL_HOOKED(&V_inet6_pfil_hook) + || PFIL_HOOKED_OUT(V_inet6_pfil_head) #endif ) { if (bridge_pfil(&m, ifp, dst_if, PFIL_OUT) != 0) @@ -2411,7 +2416,7 @@ bridge_input(struct ifnet *ifp, struct mbuf *m) #ifdef INET6 # define OR_PFIL_HOOKED_INET6 \ - || PFIL_HOOKED(&V_inet6_pfil_hook) + || PFIL_HOOKED_IN(V_inet6_pfil_head) #else # define OR_PFIL_HOOKED_INET6 #endif @@ -2423,22 +2428,6 @@ bridge_input(struct ifnet *ifp, struct mbuf *m) if (memcmp(IF_LLADDR((iface)), eh->ether_dhost, ETHER_ADDR_LEN) == 0 \ OR_CARP_CHECK_WE_ARE_DST((iface)) \ ) { \ - if ((iface)->if_type == IFT_BRIDGE) { \ - ETHER_BPF_MTAP(iface, m); \ - if_inc_counter(iface, IFCOUNTER_IPACKETS, 1); \ - if_inc_counter(iface, IFCOUNTER_IBYTES, m->m_pkthdr.len); \ - /* Filter on the physical interface. */ \ - if (V_pfil_local_phys && \ - (PFIL_HOOKED(&V_inet_pfil_hook) \ - OR_PFIL_HOOKED_INET6)) { \ - if (bridge_pfil(&m, NULL, ifp, \ - PFIL_IN) != 0 || m == NULL) { \ - BRIDGE_UNLOCK(sc); \ - return (NULL); \ - } \ - eh = mtod(m, struct ether_header *); \ - } \ - } \ if (bif->bif_flags & IFBIF_LEARNING) { \ error = bridge_rtupdate(sc, eh->ether_shost, \ vlan, bif, 0, IFBAF_DYNAMIC); \ @@ -2449,6 +2438,26 @@ bridge_input(struct ifnet *ifp, struct mbuf *m) } \ } \ m->m_pkthdr.rcvif = iface; \ + if ((iface) == ifp) { \ + /* Skip bridge processing... src == dest */ \ + BRIDGE_UNLOCK(sc); \ + return (m); \ + } \ + /* It's passing over or to the bridge, locally. */ \ + ETHER_BPF_MTAP(bifp, m); \ + if_inc_counter(bifp, IFCOUNTER_IPACKETS, 1); \ + if_inc_counter(bifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); \ + /* Filter on the physical interface. */ \ + if (V_pfil_local_phys && (PFIL_HOOKED_IN(V_inet_pfil_head) \ + OR_PFIL_HOOKED_INET6)) { \ + if (bridge_pfil(&m, NULL, ifp, \ + PFIL_IN) != 0 || m == NULL) { \ + BRIDGE_UNLOCK(sc); \ + return (NULL); \ + } \ + } \ + if ((iface) != bifp) \ + ETHER_BPF_MTAP(iface, m); \ BRIDGE_UNLOCK(sc); \ return (m); \ } \ @@ -2519,9 +2528,9 @@ bridge_broadcast(struct bridge_softc *sc, struct ifnet *src_if, } /* Filter on the bridge interface before broadcasting */ - if (runfilt && (PFIL_HOOKED(&V_inet_pfil_hook) + if (runfilt && (PFIL_HOOKED_OUT(V_inet_pfil_head) #ifdef INET6 - || PFIL_HOOKED(&V_inet6_pfil_hook) + || PFIL_HOOKED_OUT(V_inet6_pfil_head) #endif )) { if (bridge_pfil(&m, sc->sc_ifp, NULL, PFIL_OUT) != 0) @@ -2566,9 +2575,9 @@ bridge_broadcast(struct bridge_softc *sc, struct ifnet *src_if, * pointer so we do not redundantly filter on the bridge for * each interface we broadcast on. */ - if (runfilt && (PFIL_HOOKED(&V_inet_pfil_hook) + if (runfilt && (PFIL_HOOKED_OUT(V_inet_pfil_head) #ifdef INET6 - || PFIL_HOOKED(&V_inet6_pfil_hook) + || PFIL_HOOKED_OUT(V_inet6_pfil_head) #endif )) { if (used == 0) { @@ -2671,7 +2680,7 @@ bridge_rtupdate(struct bridge_softc *sc, const uint8_t *dst, uint16_t vlan, * initialize the expiration time and Ethernet * address. */ - brt = uma_zalloc(bridge_rtnode_zone, M_NOWAIT | M_ZERO); + brt = uma_zalloc(V_bridge_rtnode_zone, M_NOWAIT | M_ZERO); if (brt == NULL) return (ENOMEM); @@ -2684,7 +2693,7 @@ bridge_rtupdate(struct bridge_softc *sc, const uint8_t *dst, uint16_t vlan, brt->brt_vlan = vlan; if ((error = bridge_rtnode_insert(sc, brt)) != 0) { - uma_zfree(bridge_rtnode_zone, brt); + uma_zfree(V_bridge_rtnode_zone, brt); return (error); } brt->brt_dst = bif; @@ -2768,11 +2777,14 @@ bridge_timer(void *arg) BRIDGE_LOCK_ASSERT(sc); + /* Destruction of rtnodes requires a proper vnet context */ + CURVNET_SET(sc->sc_ifp->if_vnet); bridge_rtage(sc); if (sc->sc_ifp->if_drv_flags & IFF_DRV_RUNNING) callout_reset(&sc->sc_brcallout, bridge_rtable_prune_period * hz, bridge_timer, sc); + CURVNET_RESTORE(); } /* @@ -3030,7 +3042,7 @@ bridge_rtnode_destroy(struct bridge_softc *sc, struct bridge_rtnode *brt) LIST_REMOVE(brt, brt_list); sc->sc_brtcnt--; brt->brt_dst->bif_addrcnt--; - uma_zfree(bridge_rtnode_zone, brt); + uma_zfree(V_bridge_rtnode_zone, brt); } /* @@ -3044,6 +3056,7 @@ bridge_rtable_expire(struct ifnet *ifp, int age) struct bridge_softc *sc = ifp->if_bridge; struct bridge_rtnode *brt; + CURVNET_SET(ifp->if_vnet); BRIDGE_LOCK(sc); /* @@ -3062,6 +3075,7 @@ bridge_rtable_expire(struct ifnet *ifp, int age) } } BRIDGE_UNLOCK(sc); + CURVNET_RESTORE(); } /* @@ -3103,6 +3117,7 @@ bridge_pfil(struct mbuf **mp, struct ifnet *bifp, struct ifnet *ifp, int dir) struct ip *ip; struct llc llc1; u_int16_t ether_type; + pfil_return_t rv; snap = 0; error = -1; /* Default error if not error == 0 */ @@ -3174,14 +3189,14 @@ bridge_pfil(struct mbuf **mp, struct ifnet *bifp, struct ifnet *ifp, int dir) } /* Run the packet through pfil before stripping link headers */ - if (PFIL_HOOKED(&V_link_pfil_hook) && V_pfil_ipfw != 0 && - dir == PFIL_OUT && ifp != NULL) { - - error = pfil_run_hooks(&V_link_pfil_hook, mp, ifp, dir, 0, - NULL); - - if (*mp == NULL || error != 0) /* packet consumed by filter */ - return (error); + if (PFIL_HOOKED_OUT(V_link_pfil_head) && V_pfil_ipfw != 0 && + dir == PFIL_OUT && ifp != NULL) { + switch (pfil_run_hooks(V_link_pfil_head, mp, ifp, dir, NULL)) { + case PFIL_DROPPED: + return (EPERM); + case PFIL_CONSUMED: + return (0); + } } /* Strip off the Ethernet header and keep a copy. */ @@ -3219,6 +3234,7 @@ bridge_pfil(struct mbuf **mp, struct ifnet *bifp, struct ifnet *ifp, int dir) /* * Run the packet through pfil */ + rv = PFIL_PASS; switch (ether_type) { case ETHERTYPE_IP: /* @@ -3228,25 +3244,19 @@ bridge_pfil(struct mbuf **mp, struct ifnet *bifp, struct ifnet *ifp, int dir) * Keep the order: * in_if -> bridge_if -> out_if */ - if (V_pfil_bridge && dir == PFIL_OUT && bifp != NULL) - error = pfil_run_hooks(&V_inet_pfil_hook, mp, bifp, - dir, 0, NULL); - - if (*mp == NULL || error != 0) /* filter may consume */ + if (V_pfil_bridge && dir == PFIL_OUT && bifp != NULL && (rv = + pfil_run_hooks(V_inet_pfil_head, mp, bifp, dir, NULL)) != + PFIL_PASS) break; - if (V_pfil_member && ifp != NULL) - error = pfil_run_hooks(&V_inet_pfil_hook, mp, ifp, - dir, 0, NULL); - - if (*mp == NULL || error != 0) /* filter may consume */ + if (V_pfil_member && ifp != NULL && (rv = + pfil_run_hooks(V_inet_pfil_head, mp, ifp, dir, NULL)) != + PFIL_PASS) break; - if (V_pfil_bridge && dir == PFIL_IN && bifp != NULL) - error = pfil_run_hooks(&V_inet_pfil_hook, mp, bifp, - dir, 0, NULL); - - if (*mp == NULL || error != 0) /* filter may consume */ + if (V_pfil_bridge && dir == PFIL_IN && bifp != NULL && (rv = + pfil_run_hooks(V_inet_pfil_head, mp, bifp, dir, NULL)) != + PFIL_PASS) break; /* check if we need to fragment the packet */ @@ -3282,35 +3292,33 @@ bridge_pfil(struct mbuf **mp, struct ifnet *bifp, struct ifnet *ifp, int dir) break; #ifdef INET6 case ETHERTYPE_IPV6: - if (V_pfil_bridge && dir == PFIL_OUT && bifp != NULL) - error = pfil_run_hooks(&V_inet6_pfil_hook, mp, bifp, - dir, 0, NULL); - - if (*mp == NULL || error != 0) /* filter may consume */ + if (V_pfil_bridge && dir == PFIL_OUT && bifp != NULL && (rv = + pfil_run_hooks(V_inet6_pfil_head, mp, bifp, dir, NULL)) != + PFIL_PASS) break; - if (V_pfil_member && ifp != NULL) - error = pfil_run_hooks(&V_inet6_pfil_hook, mp, ifp, - dir, 0, NULL); - - if (*mp == NULL || error != 0) /* filter may consume */ + if (V_pfil_member && ifp != NULL && (rv = + pfil_run_hooks(V_inet6_pfil_head, mp, ifp, dir, NULL)) != + PFIL_PASS) break; - if (V_pfil_bridge && dir == PFIL_IN && bifp != NULL) - error = pfil_run_hooks(&V_inet6_pfil_hook, mp, bifp, - dir, 0, NULL); + if (V_pfil_bridge && dir == PFIL_IN && bifp != NULL && (rv = + pfil_run_hooks(V_inet6_pfil_head, mp, bifp, dir, NULL)) != + PFIL_PASS) + break; break; #endif + } + + switch (rv) { + case PFIL_CONSUMED: + return (0); + case PFIL_DROPPED: + return (EPERM); default: - error = 0; break; } - if (*mp == NULL) - return (error); - if (error != 0) - goto bad; - error = -1; /* diff --git a/freebsd/sys/net/if_clone.h b/freebsd/sys/net/if_clone.h index 5dceacf6..30d604f3 100644 --- a/freebsd/sys/net/if_clone.h +++ b/freebsd/sys/net/if_clone.h @@ -37,6 +37,8 @@ #ifdef _KERNEL +#include <sys/_eventhandler.h> + #define IFC_NOGROUP 0x1 struct if_clone; @@ -65,11 +67,9 @@ const char *ifc_name(struct if_clone *); void ifc_flags_set(struct if_clone *, int flags); int ifc_flags_get(struct if_clone *); -#ifdef _SYS_EVENTHANDLER_H_ /* Interface clone event. */ typedef void (*if_clone_event_handler_t)(void *, struct if_clone *); EVENTHANDLER_DECLARE(if_clone_event, if_clone_event_handler_t); -#endif /* The below interfaces used only by net/if.c. */ void vnet_if_clone_init(void); diff --git a/freebsd/sys/net/if_dead.c b/freebsd/sys/net/if_dead.c index 552be13f..ff73ceaf 100644 --- a/freebsd/sys/net/if_dead.c +++ b/freebsd/sys/net/if_dead.c @@ -128,6 +128,23 @@ ifdead_snd_tag_free(struct m_snd_tag *pmt) { } +static void +ifdead_ratelimit_query(struct ifnet *ifp __unused, + struct if_ratelimit_query_results *q) +{ + /* + * This guy does not support + * this interface. Not sure + * why we would specify a + * flag on the interface + * that says we do. + */ + q->rate_table = NULL; + q->flags = RT_NOSUPPORT; + q->max_flows = 0; + q->number_of_rates = 0; +} + void if_dead(struct ifnet *ifp) { @@ -144,4 +161,5 @@ if_dead(struct ifnet *ifp) ifp->if_snd_tag_modify = ifdead_snd_tag_modify; ifp->if_snd_tag_query = ifdead_snd_tag_query; ifp->if_snd_tag_free = ifdead_snd_tag_free; + ifp->if_ratelimit_query = ifdead_ratelimit_query; } diff --git a/freebsd/sys/net/if_enc.c b/freebsd/sys/net/if_enc.c index ebfbf5cb..9e7fcc53 100644 --- a/freebsd/sys/net/if_enc.c +++ b/freebsd/sys/net/if_enc.c @@ -287,24 +287,24 @@ enc_hhook(int32_t hhook_type, int32_t hhook_id, void *udata, void *ctx_data, switch (hhook_id) { #ifdef INET case AF_INET: - ph = &V_inet_pfil_hook; + ph = V_inet_pfil_head; break; #endif #ifdef INET6 case AF_INET6: - ph = &V_inet6_pfil_hook; + ph = V_inet6_pfil_head; break; #endif default: ph = NULL; } - if (ph == NULL || !PFIL_HOOKED(ph)) + if (ph == NULL || (pdir == PFIL_OUT && !PFIL_HOOKED_OUT(ph)) || + (pdir == PFIL_IN && !PFIL_HOOKED_IN(ph))) return (0); /* Make a packet looks like it was received on enc(4) */ rcvif = (*ctx->mp)->m_pkthdr.rcvif; (*ctx->mp)->m_pkthdr.rcvif = ifp; - if (pfil_run_hooks(ph, ctx->mp, ifp, pdir, 0, ctx->inp) != 0 || - *ctx->mp == NULL) { + if (pfil_run_hooks(ph, ctx->mp, ifp, pdir, ctx->inp) != PFIL_PASS) { *ctx->mp = NULL; /* consumed by filter */ return (EACCES); } diff --git a/freebsd/sys/net/if_ethersubr.c b/freebsd/sys/net/if_ethersubr.c index 96ed309a..6c5c2ccb 100644 --- a/freebsd/sys/net/if_ethersubr.c +++ b/freebsd/sys/net/if_ethersubr.c @@ -44,11 +44,13 @@ #include <sys/systm.h> #include <sys/bus.h> #include <sys/eventhandler.h> +#include <sys/jail.h> #include <sys/kernel.h> #include <sys/lock.h> #include <sys/malloc.h> #include <sys/module.h> #include <sys/mbuf.h> +#include <sys/proc.h> #include <sys/priv.h> #include <sys/random.h> #include <sys/socket.h> @@ -56,6 +58,7 @@ #include <sys/sysctl.h> #include <sys/uuid.h> +#include <net/ieee_oui.h> #include <net/if.h> #include <net/if_var.h> #include <net/if_arp.h> @@ -87,12 +90,14 @@ #endif #include <security/mac/mac_framework.h> +#include <crypto/sha1.h> + #ifdef CTASSERT CTASSERT(sizeof (struct ether_header) == ETHER_ADDR_LEN * 2 + 2); CTASSERT(sizeof (struct ether_addr) == ETHER_ADDR_LEN); #endif -VNET_DEFINE(struct pfil_head, link_pfil_hook); /* Packet filter hooks */ +VNET_DEFINE(pfil_head_t, link_pfil_head); /* Packet filter hooks */ /* netgraph node hooks for ng_ether(4) */ void (*ng_ether_input_p)(struct ifnet *ifp, struct mbuf **mp); @@ -459,7 +464,6 @@ ether_set_pcp(struct mbuf **mp, struct ifnet *ifp, uint8_t pcp) int ether_output_frame(struct ifnet *ifp, struct mbuf *m) { - int error; uint8_t pcp; pcp = ifp->if_pcp; @@ -467,27 +471,27 @@ ether_output_frame(struct ifnet *ifp, struct mbuf *m) !ether_set_pcp(&m, ifp, pcp)) return (0); - if (PFIL_HOOKED(&V_link_pfil_hook)) { - error = pfil_run_hooks(&V_link_pfil_hook, &m, ifp, - PFIL_OUT, 0, NULL); - if (error != 0) + if (PFIL_HOOKED_OUT(V_link_pfil_head)) + switch (pfil_run_hooks(V_link_pfil_head, &m, ifp, PFIL_OUT, + NULL)) { + case PFIL_DROPPED: return (EACCES); - - if (m == NULL) + case PFIL_CONSUMED: return (0); - } + } #ifdef EXPERIMENTAL #if defined(INET6) && defined(INET) /* draft-ietf-6man-ipv6only-flag */ - /* Catch ETHERTYPE_IP, and ETHERTYPE_ARP if we are v6-only. */ - if ((ND_IFINFO(ifp)->flags & ND6_IFF_IPV6_ONLY) != 0) { + /* Catch ETHERTYPE_IP, and ETHERTYPE_[REV]ARP if we are v6-only. */ + if ((ND_IFINFO(ifp)->flags & ND6_IFF_IPV6_ONLY_MASK) != 0) { struct ether_header *eh; eh = mtod(m, struct ether_header *); switch (ntohs(eh->ether_type)) { case ETHERTYPE_IP: case ETHERTYPE_ARP: + case ETHERTYPE_REVARP: m_freem(m); return (EAFNOSUPPORT); /* NOTREACHED */ @@ -538,6 +542,25 @@ ether_input_internal(struct ifnet *ifp, struct mbuf *m) etype = ntohs(eh->ether_type); random_harvest_queue_ether(m, sizeof(*m)); +#ifdef EXPERIMENTAL +#if defined(INET6) && defined(INET) + /* draft-ietf-6man-ipv6only-flag */ + /* Catch ETHERTYPE_IP, and ETHERTYPE_[REV]ARP if we are v6-only. */ + if ((ND_IFINFO(ifp)->flags & ND6_IFF_IPV6_ONLY_MASK) != 0) { + + switch (etype) { + case ETHERTYPE_IP: + case ETHERTYPE_ARP: + case ETHERTYPE_REVARP: + m_freem(m); + return; + /* NOTREACHED */ + break; + }; + } +#endif +#endif + CURVNET_SET_QUIET(ifp->if_vnet); if (ETHER_IS_MULTICAST(eh->ether_dhost)) { @@ -739,14 +762,14 @@ SYSINIT(ether, SI_SUB_INIT_IF, SI_ORDER_ANY, ether_init, NULL); static void vnet_ether_init(__unused void *arg) { - int i; + struct pfil_head_args args; + + args.pa_version = PFIL_VERSION; + args.pa_flags = PFIL_IN | PFIL_OUT; + args.pa_type = PFIL_TYPE_ETHERNET; + args.pa_headname = PFIL_ETHER_NAME; + V_link_pfil_head = pfil_head_register(&args); - /* Initialize packet filter hooks. */ - V_link_pfil_hook.ph_type = PFIL_TYPE_AF; - V_link_pfil_hook.ph_af = AF_LINK; - if ((i = pfil_head_register(&V_link_pfil_hook)) != 0) - printf("%s: WARNING: unable to register pfil link hook, " - "error %d\n", __func__, i); #ifdef VIMAGE netisr_register_vnet(ðer_nh); #endif @@ -758,11 +781,8 @@ VNET_SYSINIT(vnet_ether_init, SI_SUB_PROTO_IF, SI_ORDER_ANY, static void vnet_ether_pfil_destroy(__unused void *arg) { - int i; - if ((i = pfil_head_unregister(&V_link_pfil_hook)) != 0) - printf("%s: WARNING: unable to unregister pfil link hook, " - "error %d\n", __func__, i); + pfil_head_unregister(V_link_pfil_head); } VNET_SYSUNINIT(vnet_ether_pfil_uninit, SI_SUB_PROTO_PFIL, SI_ORDER_ANY, vnet_ether_pfil_destroy, NULL); @@ -798,6 +818,7 @@ ether_input(struct ifnet *ifp, struct mbuf *m) * We will rely on rcvif being set properly in the deferred context, * so assert it is correct here. */ + MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0); KASSERT(m->m_pkthdr.rcvif == ifp, ("%s: ifnet mismatch m %p " "rcvif %p ifp %p", __func__, m, m->m_pkthdr.rcvif, ifp)); CURVNET_SET_QUIET(ifp->if_vnet); @@ -820,10 +841,8 @@ ether_demux(struct ifnet *ifp, struct mbuf *m) KASSERT(ifp != NULL, ("%s: NULL interface pointer", __func__)); /* Do not grab PROMISC frames in case we are re-entered. */ - if (PFIL_HOOKED(&V_link_pfil_hook) && !(m->m_flags & M_PROMISC)) { - i = pfil_run_hooks(&V_link_pfil_hook, &m, ifp, PFIL_IN, 0, - NULL); - + if (PFIL_HOOKED_IN(V_link_pfil_head) && !(m->m_flags & M_PROMISC)) { + i = pfil_run_hooks(V_link_pfil_head, &m, ifp, PFIL_IN, NULL); if (i != 0 || m == NULL) return; } @@ -1390,5 +1409,38 @@ ether_8021q_frame(struct mbuf **mp, struct ifnet *ife, struct ifnet *p, return (true); } +/* + * Allocate an address from the FreeBSD Foundation OUI. This uses a + * cryptographic hash function on the containing jail's UUID and the interface + * name to attempt to provide a unique but stable address. Pseudo-interfaces + * which require a MAC address should use this function to allocate + * non-locally-administered addresses. + */ +void +ether_gen_addr(struct ifnet *ifp, struct ether_addr *hwaddr) +{ +#define ETHER_GEN_ADDR_BUFSIZ HOSTUUIDLEN + IFNAMSIZ + 2 + SHA1_CTX ctx; + char buf[ETHER_GEN_ADDR_BUFSIZ]; + char uuid[HOSTUUIDLEN + 1]; + uint64_t addr; + int i, sz; + char digest[SHA1_RESULTLEN]; + + getcredhostuuid(curthread->td_ucred, uuid, sizeof(uuid)); + sz = snprintf(buf, ETHER_GEN_ADDR_BUFSIZ, "%s-%s", uuid, ifp->if_xname); + SHA1Init(&ctx); + SHA1Update(&ctx, buf, sz); + SHA1Final(digest, &ctx); + + addr = ((digest[0] << 16) | (digest[1] << 8) | digest[2]) & + OUI_FREEBSD_GENERATED_MASK; + addr = OUI_FREEBSD(addr); + for (i = 0; i < ETHER_ADDR_LEN; ++i) { + hwaddr->octet[i] = addr >> ((ETHER_ADDR_LEN - i - 1) * 8) & + 0xFF; + } +} + DECLARE_MODULE(ether, ether_mod, SI_SUB_INIT_IF, SI_ORDER_ANY); MODULE_VERSION(ether, 1); diff --git a/freebsd/sys/net/if_gre.c b/freebsd/sys/net/if_gre.c index 4fbc105e..5aeb8266 100644 --- a/freebsd/sys/net/if_gre.c +++ b/freebsd/sys/net/if_gre.c @@ -41,6 +41,7 @@ __FBSDID("$FreeBSD$"); #include <rtems/bsd/local/opt_inet.h> #include <rtems/bsd/local/opt_inet6.h> +#include <rtems/bsd/local/opt_rss.h> #include <sys/param.h> #include <sys/kernel.h> @@ -51,6 +52,7 @@ __FBSDID("$FreeBSD$"); #include <sys/priv.h> #include <sys/proc.h> #include <sys/socket.h> +#include <sys/socketvar.h> #include <sys/sockio.h> #include <sys/sx.h> #include <sys/sysctl.h> @@ -67,19 +69,27 @@ __FBSDID("$FreeBSD$"); #include <net/route.h> #include <netinet/in.h> +#include <netinet/in_pcb.h> #ifdef INET #include <netinet/in_var.h> #include <netinet/ip.h> #include <netinet/ip_var.h> +#ifdef RSS +#include <netinet/in_rss.h> +#endif #endif #ifdef INET6 #include <netinet/ip6.h> #include <netinet6/in6_var.h> #include <netinet6/ip6_var.h> +#ifdef RSS +#include <netinet6/in6_rss.h> +#endif #endif #include <netinet/ip_encap.h> +#include <netinet/udp.h> #include <net/bpf.h> #include <net/if_gre.h> @@ -153,6 +163,7 @@ vnet_gre_uninit(const void *unused __unused) #ifdef INET6 in6_gre_uninit(); #endif + /* XXX: epoch_call drain */ } VNET_SYSUNINIT(vnet_gre_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_gre_uninit, NULL); @@ -272,6 +283,7 @@ gre_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) break; case GRESKEY: case GRESOPTS: + case GRESPORT: if ((error = priv_check(curthread, PRIV_NET_GRE)) != 0) break; if ((error = copyin(ifr_data_get_ptr(ifr), &opt, @@ -287,23 +299,45 @@ gre_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) } if (sc->gre_options == opt) break; + } else if (cmd == GRESPORT) { + if (opt != 0 && (opt < V_ipport_hifirstauto || + opt > V_ipport_hilastauto)) { + error = EINVAL; + break; + } + if (sc->gre_port == opt) + break; + if ((sc->gre_options & GRE_UDPENCAP) == 0) { + /* + * UDP encapsulation is not enabled, thus + * there is no need to reattach softc. + */ + sc->gre_port = opt; + break; + } } switch (sc->gre_family) { #ifdef INET case AF_INET: - in_gre_setopts(sc, cmd, opt); + error = in_gre_setopts(sc, cmd, opt); break; #endif #ifdef INET6 case AF_INET6: - in6_gre_setopts(sc, cmd, opt); + error = in6_gre_setopts(sc, cmd, opt); break; #endif default: + /* + * Tunnel is not yet configured. + * We can just change any parameters. + */ if (cmd == GRESKEY) sc->gre_key = opt; - else + if (cmd == GRESOPTS) sc->gre_options = opt; + if (cmd == GRESPORT) + sc->gre_port = opt; break; } /* @@ -319,6 +353,10 @@ gre_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) error = copyout(&sc->gre_options, ifr_data_get_ptr(ifr), sizeof(sc->gre_options)); break; + case GREGPORT: + error = copyout(&sc->gre_port, ifr_data_get_ptr(ifr), + sizeof(sc->gre_port)); + break; default: error = EINVAL; break; @@ -343,6 +381,7 @@ end: static void gre_delete_tunnel(struct gre_softc *sc) { + struct gre_socket *gs; sx_assert(&gre_ioctl_sx, SA_XLOCKED); if (sc->gre_family != 0) { @@ -352,6 +391,16 @@ gre_delete_tunnel(struct gre_softc *sc) free(sc->gre_hdr, M_GRE); sc->gre_family = 0; } + /* + * If this Tunnel was the last one that could use UDP socket, + * we should unlink socket from hash table and close it. + */ + if ((gs = sc->gre_so) != NULL && CK_LIST_EMPTY(&gs->list)) { + CK_LIST_REMOVE(gs, chain); + soclose(gs->so); + epoch_call(net_epoch_preempt, &gs->epoch_ctx, gre_sofree); + sc->gre_so = NULL; + } GRE2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING; if_link_state_change(GRE2IFP(sc), LINK_STATE_DOWN); } @@ -378,7 +427,38 @@ gre_hashdestroy(struct gre_list *hash) } void -gre_updatehdr(struct gre_softc *sc, struct grehdr *gh) +gre_sofree(epoch_context_t ctx) +{ + struct gre_socket *gs; + + gs = __containerof(ctx, struct gre_socket, epoch_ctx); + free(gs, M_GRE); +} + +static __inline uint16_t +gre_cksum_add(uint16_t sum, uint16_t a) +{ + uint16_t res; + + res = sum + a; + return (res + (res < a)); +} + +void +gre_update_udphdr(struct gre_softc *sc, struct udphdr *udp, uint16_t csum) +{ + + sx_assert(&gre_ioctl_sx, SA_XLOCKED); + MPASS(sc->gre_options & GRE_UDPENCAP); + + udp->uh_dport = htons(GRE_UDPPORT); + udp->uh_sport = htons(sc->gre_port); + udp->uh_sum = csum; + udp->uh_ulen = 0; +} + +void +gre_update_hdr(struct gre_softc *sc, struct grehdr *gh) { uint32_t *opts; uint16_t flags; @@ -545,6 +625,52 @@ gre_setseqn(struct grehdr *gh, uint32_t seq) *opts = htonl(seq); } +static uint32_t +gre_flowid(struct gre_softc *sc, struct mbuf *m, uint32_t af) +{ + uint32_t flowid; + + if ((sc->gre_options & GRE_UDPENCAP) == 0 || sc->gre_port != 0) + return (0); +#ifndef RSS + switch (af) { +#ifdef INET + case AF_INET: + flowid = mtod(m, struct ip *)->ip_src.s_addr ^ + mtod(m, struct ip *)->ip_dst.s_addr; + break; +#endif +#ifdef INET6 + case AF_INET6: + flowid = mtod(m, struct ip6_hdr *)->ip6_src.s6_addr32[3] ^ + mtod(m, struct ip6_hdr *)->ip6_dst.s6_addr32[3]; + break; +#endif + default: + flowid = 0; + } +#else /* RSS */ + switch (af) { +#ifdef INET + case AF_INET: + flowid = rss_hash_ip4_2tuple(mtod(m, struct ip *)->ip_src, + mtod(m, struct ip *)->ip_dst); + break; +#endif +#ifdef INET6 + case AF_INET6: + flowid = rss_hash_ip6_2tuple( + &mtod(m, struct ip6_hdr *)->ip6_src, + &mtod(m, struct ip6_hdr *)->ip6_dst); + break; +#endif + default: + flowid = 0; + } +#endif + return (flowid); +} + #define MTAG_GRE 1307983903 static int gre_transmit(struct ifnet *ifp, struct mbuf *m) @@ -552,7 +678,8 @@ gre_transmit(struct ifnet *ifp, struct mbuf *m) GRE_RLOCK_TRACKER; struct gre_softc *sc; struct grehdr *gh; - uint32_t af; + struct udphdr *uh; + uint32_t af, flowid; int error, len; uint16_t proto; @@ -579,6 +706,7 @@ gre_transmit(struct ifnet *ifp, struct mbuf *m) af = m->m_pkthdr.csum_data; BPF_MTAP2(ifp, &af, sizeof(af), m); m->m_flags &= ~(M_BCAST|M_MCAST); + flowid = gre_flowid(sc, m, af); M_SETFIB(m, sc->gre_fibnum); M_PREPEND(m, sc->gre_hlen, M_NOWAIT); if (m == NULL) { @@ -620,6 +748,19 @@ gre_transmit(struct ifnet *ifp, struct mbuf *m) error = ENETDOWN; goto drop; } + if (sc->gre_options & GRE_UDPENCAP) { + uh = (struct udphdr *)mtodo(m, len); + uh->uh_sport |= htons(V_ipport_hifirstauto) | + (flowid >> 16) | (flowid & 0xFFFF); + uh->uh_sport = htons(ntohs(uh->uh_sport) % + V_ipport_hilastauto); + uh->uh_ulen = htons(m->m_pkthdr.len - len); + uh->uh_sum = gre_cksum_add(uh->uh_sum, + htons(m->m_pkthdr.len - len + IPPROTO_UDP)); + m->m_pkthdr.csum_flags = sc->gre_csumflags; + m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); + len += sizeof(struct udphdr); + } gh = (struct grehdr *)mtodo(m, len); gh->gre_proto = proto; if (sc->gre_options & GRE_ENABLE_SEQ) @@ -637,7 +778,7 @@ gre_transmit(struct ifnet *ifp, struct mbuf *m) #endif #ifdef INET6 case AF_INET6: - error = in6_gre_output(m, af, sc->gre_hlen); + error = in6_gre_output(m, af, sc->gre_hlen, flowid); break; #endif default: diff --git a/freebsd/sys/net/if_gre.h b/freebsd/sys/net/if_gre.h index 4b93321a..de3c5979 100644 --- a/freebsd/sys/net/if_gre.h +++ b/freebsd/sys/net/if_gre.h @@ -53,14 +53,35 @@ struct greip { struct ip gi_ip; struct grehdr gi_gre; } __packed; -#endif + +struct greudp { + struct ip gi_ip; + struct udphdr gi_udp; + struct grehdr gi_gre; +} __packed; +#endif /* INET */ #ifdef INET6 struct greip6 { struct ip6_hdr gi6_ip6; struct grehdr gi6_gre; } __packed; -#endif + +struct greudp6 { + struct ip6_hdr gi6_ip6; + struct udphdr gi6_udp; + struct grehdr gi6_gre; +} __packed; +#endif /* INET6 */ + +CK_LIST_HEAD(gre_list, gre_softc); +CK_LIST_HEAD(gre_sockets, gre_socket); +struct gre_socket { + struct socket *so; + struct gre_list list; + CK_LIST_ENTRY(gre_socket) chain; + struct epoch_context epoch_ctx; +}; struct gre_softc { struct ifnet *gre_ifp; @@ -69,22 +90,26 @@ struct gre_softc { uint32_t gre_oseq; uint32_t gre_key; uint32_t gre_options; + uint32_t gre_csumflags; + uint32_t gre_port; u_int gre_fibnum; u_int gre_hlen; /* header size */ union { void *hdr; #ifdef INET - struct greip *gihdr; + struct greip *iphdr; + struct greudp *udphdr; #endif #ifdef INET6 - struct greip6 *gi6hdr; + struct greip6 *ip6hdr; + struct greudp6 *udp6hdr; #endif } gre_uhdr; + struct gre_socket *gre_so; CK_LIST_ENTRY(gre_softc) chain; CK_LIST_ENTRY(gre_softc) srchash; }; -CK_LIST_HEAD(gre_list, gre_softc); MALLOC_DECLARE(M_GRE); #ifndef GRE_HASH_SIZE @@ -98,28 +123,35 @@ MALLOC_DECLARE(M_GRE); #define GRE_WAIT() epoch_wait_preempt(net_epoch_preempt) #define gre_hdr gre_uhdr.hdr -#define gre_gihdr gre_uhdr.gihdr -#define gre_gi6hdr gre_uhdr.gi6hdr -#define gre_oip gre_gihdr->gi_ip -#define gre_oip6 gre_gi6hdr->gi6_ip6 +#define gre_iphdr gre_uhdr.iphdr +#define gre_ip6hdr gre_uhdr.ip6hdr +#define gre_udphdr gre_uhdr.udphdr +#define gre_udp6hdr gre_uhdr.udp6hdr + +#define gre_oip gre_iphdr->gi_ip +#define gre_udp gre_udphdr->gi_udp +#define gre_oip6 gre_ip6hdr->gi6_ip6 +#define gre_udp6 gre_udp6hdr->gi6_udp struct gre_list *gre_hashinit(void); void gre_hashdestroy(struct gre_list *); int gre_input(struct mbuf *, int, int, void *); -void gre_updatehdr(struct gre_softc *, struct grehdr *); +void gre_update_hdr(struct gre_softc *, struct grehdr *); +void gre_update_udphdr(struct gre_softc *, struct udphdr *, uint16_t); +void gre_sofree(epoch_context_t); void in_gre_init(void); void in_gre_uninit(void); -void in_gre_setopts(struct gre_softc *, u_long, uint32_t); +int in_gre_setopts(struct gre_softc *, u_long, uint32_t); int in_gre_ioctl(struct gre_softc *, u_long, caddr_t); int in_gre_output(struct mbuf *, int, int); void in6_gre_init(void); void in6_gre_uninit(void); -void in6_gre_setopts(struct gre_softc *, u_long, uint32_t); +int in6_gre_setopts(struct gre_softc *, u_long, uint32_t); int in6_gre_ioctl(struct gre_softc *, u_long, caddr_t); -int in6_gre_output(struct mbuf *, int, int); +int in6_gre_output(struct mbuf *, int, int, uint32_t); /* * CISCO uses special type for GRE tunnel created as part of WCCP * connection, while in fact those packets are just IPv4 encapsulated @@ -139,9 +171,15 @@ int in6_gre_output(struct mbuf *, int, int); #define GRESKEY _IOW('i', 108, struct ifreq) #define GREGOPTS _IOWR('i', 109, struct ifreq) #define GRESOPTS _IOW('i', 110, struct ifreq) +#define GREGPORT _IOWR('i', 111, struct ifreq) +#define GRESPORT _IOW('i', 112, struct ifreq) + +/* GRE-in-UDP encapsulation destination port as defined in RFC8086 */ +#define GRE_UDPPORT 4754 #define GRE_ENABLE_CSUM 0x0001 #define GRE_ENABLE_SEQ 0x0002 -#define GRE_OPTMASK (GRE_ENABLE_CSUM|GRE_ENABLE_SEQ) +#define GRE_UDPENCAP 0x0004 +#define GRE_OPTMASK (GRE_ENABLE_CSUM|GRE_ENABLE_SEQ|GRE_UDPENCAP) #endif /* _NET_IF_GRE_H_ */ diff --git a/freebsd/sys/net/if_lagg.c b/freebsd/sys/net/if_lagg.c index 85099115..b82313eb 100644 --- a/freebsd/sys/net/if_lagg.c +++ b/freebsd/sys/net/if_lagg.c @@ -25,6 +25,7 @@ __FBSDID("$FreeBSD$"); #include <rtems/bsd/local/opt_inet.h> #include <rtems/bsd/local/opt_inet6.h> +#include <rtems/bsd/local/opt_kern_tls.h> #include <rtems/bsd/local/opt_ratelimit.h> #include <sys/param.h> @@ -97,6 +98,11 @@ static struct { {0, NULL} }; +struct lagg_snd_tag { + struct m_snd_tag com; + struct m_snd_tag *tag; +}; + VNET_DEFINE(SLIST_HEAD(__trhead, lagg_softc), lagg_list); /* list of laggs */ #define V_lagg_list VNET(lagg_list) VNET_DEFINE_STATIC(struct mtx, lagg_list_mtx); @@ -113,6 +119,7 @@ static void lagg_clone_destroy(struct ifnet *); VNET_DEFINE_STATIC(struct if_clone *, lagg_cloner); #define V_lagg_cloner VNET(lagg_cloner) static const char laggname[] = "lagg"; +static MALLOC_DEFINE(M_LAGG, laggname, "802.3AD Link Aggregation Interface"); static void lagg_capabilities(struct lagg_softc *); static int lagg_port_create(struct lagg_softc *, struct ifnet *); @@ -131,10 +138,17 @@ static void lagg_port2req(struct lagg_port *, struct lagg_reqport *); static void lagg_init(void *); static void lagg_stop(struct lagg_softc *); static int lagg_ioctl(struct ifnet *, u_long, caddr_t); -#ifdef RATELIMIT +#if defined(KERN_TLS) || defined(RATELIMIT) static int lagg_snd_tag_alloc(struct ifnet *, union if_snd_tag_alloc_params *, struct m_snd_tag **); +static int lagg_snd_tag_modify(struct m_snd_tag *, + union if_snd_tag_modify_params *); +static int lagg_snd_tag_query(struct m_snd_tag *, + union if_snd_tag_query_params *); +static void lagg_snd_tag_free(struct m_snd_tag *); +static void lagg_ratelimit_query(struct ifnet *, + struct if_ratelimit_query_results *); #endif static int lagg_setmulti(struct lagg_port *); static int lagg_clrmulti(struct lagg_port *); @@ -264,6 +278,13 @@ SYSCTL_INT(_net_link_lagg, OID_AUTO, default_use_flowid, CTLFLAG_RWTUN, &VNET_NAME(def_use_flowid), 0, "Default setting for using flow id for load sharing"); +/* Default value for using numa */ +VNET_DEFINE_STATIC(int, def_use_numa) = 1; +#define V_def_use_numa VNET(def_use_numa) +SYSCTL_INT(_net_link_lagg, OID_AUTO, default_use_numa, CTLFLAG_RWTUN, + &VNET_NAME(def_use_numa), 0, + "Use numa to steer flows"); + /* Default value for flowid shift */ VNET_DEFINE_STATIC(int, def_flowid_shift) = 16; #define V_def_flowid_shift VNET(def_flowid_shift) @@ -480,10 +501,10 @@ lagg_clone_create(struct if_clone *ifc, int unit, caddr_t params) struct ifnet *ifp; static const u_char eaddr[6]; /* 00:00:00:00:00:00 */ - sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO); + sc = malloc(sizeof(*sc), M_LAGG, M_WAITOK|M_ZERO); ifp = sc->sc_ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { - free(sc, M_DEVBUF); + free(sc, M_LAGG); return (ENOSPC); } LAGG_SX_INIT(sc); @@ -491,6 +512,8 @@ lagg_clone_create(struct if_clone *ifc, int unit, caddr_t params) LAGG_XLOCK(sc); if (V_def_use_flowid) sc->sc_opts |= LAGG_OPT_USE_FLOWID; + if (V_def_use_numa) + sc->sc_opts |= LAGG_OPT_USE_NUMA; sc->flowid_shift = V_def_flowid_shift; /* Hash all layers by default */ @@ -514,12 +537,14 @@ lagg_clone_create(struct if_clone *ifc, int unit, caddr_t params) ifp->if_ioctl = lagg_ioctl; ifp->if_get_counter = lagg_get_counter; ifp->if_flags = IFF_SIMPLEX | IFF_BROADCAST | IFF_MULTICAST; -#ifdef RATELIMIT +#if defined(KERN_TLS) || defined(RATELIMIT) ifp->if_snd_tag_alloc = lagg_snd_tag_alloc; - ifp->if_capenable = ifp->if_capabilities = IFCAP_HWSTATS | IFCAP_TXRTLMT; -#else - ifp->if_capenable = ifp->if_capabilities = IFCAP_HWSTATS; + ifp->if_snd_tag_modify = lagg_snd_tag_modify; + ifp->if_snd_tag_query = lagg_snd_tag_query; + ifp->if_snd_tag_free = lagg_snd_tag_free; + ifp->if_ratelimit_query = lagg_ratelimit_query; #endif + ifp->if_capenable = ifp->if_capabilities = IFCAP_HWSTATS; /* * Attach as an ordinary ethernet device, children will be attached @@ -572,7 +597,7 @@ lagg_clone_destroy(struct ifnet *ifp) LAGG_LIST_UNLOCK(); LAGG_SX_DESTROY(sc); - free(sc, M_DEVBUF); + free(sc, M_LAGG); } static void @@ -686,7 +711,7 @@ lagg_port_create(struct lagg_softc *sc, struct ifnet *ifp) ifr.ifr_mtu = oldmtu; } - lp = malloc(sizeof(struct lagg_port), M_DEVBUF, M_WAITOK|M_ZERO); + lp = malloc(sizeof(struct lagg_port), M_LAGG, M_WAITOK|M_ZERO); lp->lp_softc = sc; /* Check if port is a stacked lagg */ @@ -694,7 +719,7 @@ lagg_port_create(struct lagg_softc *sc, struct ifnet *ifp) SLIST_FOREACH(sc_ptr, &V_lagg_list, sc_entries) { if (ifp == sc_ptr->sc_ifp) { LAGG_LIST_UNLOCK(); - free(lp, M_DEVBUF); + free(lp, M_LAGG); if (oldmtu != -1) (*ifp->if_ioctl)(ifp, SIOCSIFMTU, (caddr_t)&ifr); @@ -705,7 +730,7 @@ lagg_port_create(struct lagg_softc *sc, struct ifnet *ifp) if (lagg_port_checkstacking(sc_ptr) >= LAGG_MAX_STACKING) { LAGG_LIST_UNLOCK(); - free(lp, M_DEVBUF); + free(lp, M_LAGG); if (oldmtu != -1) (*ifp->if_ioctl)(ifp, SIOCSIFMTU, (caddr_t)&ifr); @@ -753,7 +778,6 @@ lagg_port_create(struct lagg_softc *sc, struct ifnet *ifp) * is predictable and `ifconfig laggN create ...` command * will lead to the same result each time. */ - LAGG_RLOCK(); CK_SLIST_FOREACH(tlp, &sc->sc_ports, lp_entries) { if (tlp->lp_ifp->if_index < ifp->if_index && ( CK_SLIST_NEXT(tlp, lp_entries) == NULL || @@ -761,7 +785,6 @@ lagg_port_create(struct lagg_softc *sc, struct ifnet *ifp) ifp->if_index)) break; } - LAGG_RUNLOCK(); if (tlp != NULL) CK_SLIST_INSERT_AFTER(tlp, lp, lp_entries); else @@ -816,7 +839,7 @@ lagg_port_destroy_cb(epoch_context_t ec) ifp = lp->lp_ifp; if_rele(ifp); - free(lp, M_DEVBUF); + free(lp, M_LAGG); } static int @@ -1250,6 +1273,8 @@ lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) switch (ro->ro_opts) { case LAGG_OPT_USE_FLOWID: case -LAGG_OPT_USE_FLOWID: + case LAGG_OPT_USE_NUMA: + case -LAGG_OPT_USE_NUMA: case LAGG_OPT_FLOWIDSHIFT: valid = 1; lacp = 0; @@ -1528,49 +1553,142 @@ lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) return (error); } -#ifdef RATELIMIT -static int -lagg_snd_tag_alloc(struct ifnet *ifp, - union if_snd_tag_alloc_params *params, - struct m_snd_tag **ppmt) +#if defined(KERN_TLS) || defined(RATELIMIT) +static inline struct lagg_snd_tag * +mst_to_lst(struct m_snd_tag *mst) { - struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; + + return (__containerof(mst, struct lagg_snd_tag, com)); +} + +/* + * Look up the port used by a specific flow. This only works for lagg + * protocols with deterministic port mappings (e.g. not roundrobin). + * In addition protocols which use a hash to map flows to ports must + * be configured to use the mbuf flowid rather than hashing packet + * contents. + */ +static struct lagg_port * +lookup_snd_tag_port(struct ifnet *ifp, uint32_t flowid, uint32_t flowtype) +{ + struct lagg_softc *sc; struct lagg_port *lp; struct lagg_lb *lb; uint32_t p; + sc = ifp->if_softc; + switch (sc->sc_proto) { case LAGG_PROTO_FAILOVER: - lp = lagg_link_active(sc, sc->sc_primary); - break; + return (lagg_link_active(sc, sc->sc_primary)); case LAGG_PROTO_LOADBALANCE: if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) == 0 || - params->hdr.flowtype == M_HASHTYPE_NONE) - return (EOPNOTSUPP); - p = params->hdr.flowid >> sc->flowid_shift; + flowtype == M_HASHTYPE_NONE) + return (NULL); + p = flowid >> sc->flowid_shift; p %= sc->sc_count; lb = (struct lagg_lb *)sc->sc_psc; lp = lb->lb_ports[p]; - lp = lagg_link_active(sc, lp); - break; + return (lagg_link_active(sc, lp)); case LAGG_PROTO_LACP: if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) == 0 || - params->hdr.flowtype == M_HASHTYPE_NONE) - return (EOPNOTSUPP); - lp = lacp_select_tx_port_by_hash(sc, params->hdr.flowid); - break; + flowtype == M_HASHTYPE_NONE) + return (NULL); + return (lacp_select_tx_port_by_hash(sc, flowid)); default: - return (EOPNOTSUPP); + return (NULL); } - if (lp == NULL) +} + +static int +lagg_snd_tag_alloc(struct ifnet *ifp, + union if_snd_tag_alloc_params *params, + struct m_snd_tag **ppmt) +{ + struct lagg_snd_tag *lst; + struct lagg_softc *sc; + struct lagg_port *lp; + struct ifnet *lp_ifp; + int error; + + sc = ifp->if_softc; + + LAGG_RLOCK(); + lp = lookup_snd_tag_port(ifp, params->hdr.flowid, params->hdr.flowtype); + if (lp == NULL) { + LAGG_RUNLOCK(); return (EOPNOTSUPP); - ifp = lp->lp_ifp; - if (ifp == NULL || ifp->if_snd_tag_alloc == NULL || - (ifp->if_capenable & IFCAP_TXRTLMT) == 0) + } + if (lp->lp_ifp == NULL || lp->lp_ifp->if_snd_tag_alloc == NULL) { + LAGG_RUNLOCK(); return (EOPNOTSUPP); + } + lp_ifp = lp->lp_ifp; + if_ref(lp_ifp); + LAGG_RUNLOCK(); + + lst = malloc(sizeof(*lst), M_LAGG, M_NOWAIT); + if (lst == NULL) { + if_rele(lp_ifp); + return (ENOMEM); + } + + error = lp_ifp->if_snd_tag_alloc(lp_ifp, params, &lst->tag); + if_rele(lp_ifp); + if (error) { + free(lst, M_LAGG); + return (error); + } + + m_snd_tag_init(&lst->com, ifp); - /* forward allocation request */ - return (ifp->if_snd_tag_alloc(ifp, params, ppmt)); + *ppmt = &lst->com; + return (0); +} + +static int +lagg_snd_tag_modify(struct m_snd_tag *mst, + union if_snd_tag_modify_params *params) +{ + struct lagg_snd_tag *lst; + + lst = mst_to_lst(mst); + return (lst->tag->ifp->if_snd_tag_modify(lst->tag, params)); +} + +static int +lagg_snd_tag_query(struct m_snd_tag *mst, + union if_snd_tag_query_params *params) +{ + struct lagg_snd_tag *lst; + + lst = mst_to_lst(mst); + return (lst->tag->ifp->if_snd_tag_query(lst->tag, params)); +} + +static void +lagg_snd_tag_free(struct m_snd_tag *mst) +{ + struct lagg_snd_tag *lst; + + lst = mst_to_lst(mst); + m_snd_tag_rele(lst->tag); + free(lst, M_LAGG); +} + +static void +lagg_ratelimit_query(struct ifnet *ifp __unused, struct if_ratelimit_query_results *q) +{ + /* + * For lagg, we have an indirect + * interface. The caller needs to + * get a ratelimit tag on the actual + * interface the flow will go on. + */ + q->rate_table = NULL; + q->flags = RT_IS_INDIRECT; + q->max_flows = 0; + q->number_of_rates = 0; } #endif @@ -1588,7 +1706,7 @@ lagg_setmulti(struct lagg_port *lp) CK_STAILQ_FOREACH(ifma, &scifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; - mc = malloc(sizeof(struct lagg_mc), M_DEVBUF, M_NOWAIT); + mc = malloc(sizeof(struct lagg_mc), M_LAGG, M_NOWAIT); if (mc == NULL) { IF_ADDR_WUNLOCK(scifp); return (ENOMEM); @@ -1619,7 +1737,7 @@ lagg_clrmulti(struct lagg_port *lp) SLIST_REMOVE(&lp->lp_mc_head, mc, lagg_mc, mc_entries); if (mc->mc_ifma && lp->lp_detaching == 0) if_delmulti_ifma(mc->mc_ifma); - free(mc, M_DEVBUF); + free(mc, M_LAGG); } return (0); } @@ -1696,6 +1814,10 @@ lagg_transmit(struct ifnet *ifp, struct mbuf *m) struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; int error; +#if defined(KERN_TLS) || defined(RATELIMIT) + if (m->m_pkthdr.csum_flags & CSUM_SND_TAG) + MPASS(m->m_pkthdr.snd_tag->ifp == ifp); +#endif LAGG_RLOCK(); /* We need a Tx algorithm and at least one port */ if (sc->sc_proto == LAGG_PROTO_NONE || sc->sc_count == 0) { @@ -1848,12 +1970,20 @@ struct lagg_port * lagg_link_active(struct lagg_softc *sc, struct lagg_port *lp) { struct lagg_port *lp_next, *rval = NULL; - struct epoch_tracker net_et; /* * Search a port which reports an active link state. */ +#ifdef INVARIANTS + /* + * This is called with either LAGG_RLOCK() held or + * LAGG_XLOCK(sc) held. + */ + if (!in_epoch(net_epoch_preempt)) + LAGG_XLOCK_ASSERT(sc); +#endif + if (lp == NULL) goto search; if (LAGG_PORTACTIVE(lp)) { @@ -1866,15 +1996,12 @@ lagg_link_active(struct lagg_softc *sc, struct lagg_port *lp) goto found; } - search: - epoch_enter_preempt(net_epoch_preempt, &net_et); +search: CK_SLIST_FOREACH(lp_next, &sc->sc_ports, lp_entries) { if (LAGG_PORTACTIVE(lp_next)) { - epoch_exit_preempt(net_epoch_preempt, &net_et); return (lp_next); } } - epoch_exit_preempt(net_epoch_preempt, &net_et); found: return (rval); } @@ -1883,6 +2010,21 @@ int lagg_enqueue(struct ifnet *ifp, struct mbuf *m) { +#if defined(KERN_TLS) || defined(RATELIMIT) + if (m->m_pkthdr.csum_flags & CSUM_SND_TAG) { + struct lagg_snd_tag *lst; + struct m_snd_tag *mst; + + mst = m->m_pkthdr.snd_tag; + lst = mst_to_lst(mst); + if (lst->tag->ifp != ifp) { + m_freem(m); + return (EAGAIN); + } + m->m_pkthdr.snd_tag = m_snd_tag_ref(lst->tag); + m_snd_tag_rele(mst); + } +#endif return (ifp->if_transmit)(ifp, m); } @@ -1956,7 +2098,7 @@ lagg_bcast_start(struct lagg_softc *sc, struct mbuf *m) struct lagg_port *lp, *last = NULL; struct mbuf *m0; - LAGG_RLOCK(); + LAGG_RLOCK_ASSERT(); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (!LAGG_PORTACTIVE(lp)) continue; @@ -1977,7 +2119,6 @@ lagg_bcast_start(struct lagg_softc *sc, struct mbuf *m) } last = lp; } - LAGG_RUNLOCK(); if (last == NULL) { m_freem(m); @@ -2063,7 +2204,7 @@ lagg_lb_attach(struct lagg_softc *sc) struct lagg_lb *lb; LAGG_XLOCK_ASSERT(sc); - lb = malloc(sizeof(struct lagg_lb), M_DEVBUF, M_WAITOK | M_ZERO); + lb = malloc(sizeof(struct lagg_lb), M_LAGG, M_WAITOK | M_ZERO); lb->lb_key = m_ether_tcpip_hash_init(); sc->sc_psc = lb; @@ -2078,7 +2219,7 @@ lagg_lb_detach(struct lagg_softc *sc) lb = (struct lagg_lb *)sc->sc_psc; if (lb != NULL) - free(lb, M_DEVBUF); + free(lb, M_LAGG); } static int @@ -2090,7 +2231,7 @@ lagg_lb_porttable(struct lagg_softc *sc, struct lagg_port *lp) rv = 0; bzero(&lb->lb_ports, sizeof(lb->lb_ports)); - LAGG_RLOCK(); + LAGG_XLOCK_ASSERT(sc); CK_SLIST_FOREACH(lp_next, &sc->sc_ports, lp_entries) { if (lp_next == lp) continue; @@ -2103,7 +2244,6 @@ lagg_lb_porttable(struct lagg_softc *sc, struct lagg_port *lp) sc->sc_ifname, lp_next->lp_ifp->if_xname, i); lb->lb_ports[i++] = lp_next; } - LAGG_RUNLOCK(); return (rv); } diff --git a/freebsd/sys/net/if_lagg.h b/freebsd/sys/net/if_lagg.h index f1e2d8f4..2c566c0d 100644 --- a/freebsd/sys/net/if_lagg.h +++ b/freebsd/sys/net/if_lagg.h @@ -143,6 +143,7 @@ struct lagg_reqopts { #define LAGG_OPT_USE_FLOWID 0x01 /* enable use of flowid */ /* Pseudo flags which are used in ro_opts but not stored into sc_opts. */ #define LAGG_OPT_FLOWIDSHIFT 0x02 /* set flowid shift */ +#define LAGG_OPT_USE_NUMA 0x04 /* enable use of numa */ #define LAGG_OPT_FLOWIDSHIFT_MASK 0x1f /* flowid is uint32_t */ #define LAGG_OPT_LACP_STRICT 0x10 /* LACP strict mode */ #define LAGG_OPT_LACP_TXTEST 0x20 /* LACP debug: txtest */ @@ -158,8 +159,9 @@ struct lagg_reqopts { #define SIOCGLAGGOPTS _IOWR('i', 152, struct lagg_reqopts) #define SIOCSLAGGOPTS _IOW('i', 153, struct lagg_reqopts) -#define LAGG_OPT_BITS "\020\001USE_FLOWID\005LACP_STRICT" \ - "\006LACP_TXTEST\007LACP_RXTEST" +#define LAGG_OPT_BITS "\020\001USE_FLOWID\003USE_NUMA" \ + "\005LACP_STRICT\006LACP_TXTEST" \ + "\007LACP_RXTEST" #ifdef _KERNEL diff --git a/freebsd/sys/net/if_llatbl.c b/freebsd/sys/net/if_llatbl.c index b220d7aa..e79b9ba9 100644 --- a/freebsd/sys/net/if_llatbl.c +++ b/freebsd/sys/net/if_llatbl.c @@ -37,6 +37,7 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/systm.h> +#include <sys/eventhandler.h> #include <sys/malloc.h> #include <sys/mbuf.h> #include <sys/syslog.h> @@ -92,6 +93,7 @@ static int htable_foreach_lle(struct lltable *llt, llt_foreach_cb_t *f, static int lltable_dump_af(struct lltable *llt, struct sysctl_req *wr) { + struct epoch_tracker et; int error; LLTABLE_LIST_LOCK_ASSERT(); @@ -100,10 +102,10 @@ lltable_dump_af(struct lltable *llt, struct sysctl_req *wr) return (0); error = 0; - IF_AFDATA_RLOCK(llt->llt_ifp); + NET_EPOCH_ENTER(et); error = lltable_foreach_lle(llt, (llt_foreach_cb_t *)llt->llt_dump_entry, wr); - IF_AFDATA_RUNLOCK(llt->llt_ifp); + NET_EPOCH_EXIT(et); return (error); } @@ -455,11 +457,12 @@ struct llentry * llentry_alloc(struct ifnet *ifp, struct lltable *lt, struct sockaddr_storage *dst) { + struct epoch_tracker et; struct llentry *la, *la_tmp; - IF_AFDATA_RLOCK(ifp); + NET_EPOCH_ENTER(et); la = lla_lookup(lt, LLE_EXCLUSIVE, (struct sockaddr *)dst); - IF_AFDATA_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); if (la != NULL) { LLE_ADDREF(la); diff --git a/freebsd/sys/net/if_llatbl.h b/freebsd/sys/net/if_llatbl.h index 74301284..7bf57bdb 100644 --- a/freebsd/sys/net/if_llatbl.h +++ b/freebsd/sys/net/if_llatbl.h @@ -32,6 +32,7 @@ __FBSDID("$FreeBSD$"); #ifndef _NET_IF_LLATBL_H_ #define _NET_IF_LLATBL_H_ +#include <sys/_eventhandler.h> #include <sys/_rwlock.h> #include <netinet/in.h> #include <sys/epoch.h> @@ -267,7 +268,6 @@ llentry_mark_used(struct llentry *lle) int lla_rt_output(struct rt_msghdr *, struct rt_addrinfo *); -#include <sys/eventhandler.h> enum { LLENTRY_RESOLVED, LLENTRY_TIMEDOUT, diff --git a/freebsd/sys/net/if_spppsubr.c b/freebsd/sys/net/if_spppsubr.c index f5b78dec..9aec7cd1 100644 --- a/freebsd/sys/net/if_spppsubr.c +++ b/freebsd/sys/net/if_spppsubr.c @@ -1062,15 +1062,13 @@ sppp_detach(struct ifnet *ifp) KASSERT(mtx_initialized(&sp->mtx), ("sppp mutex is not initialized")); /* Stop keepalive handler. */ - if (!callout_drain(&sp->keepalive_callout)) - callout_stop(&sp->keepalive_callout); + callout_drain(&sp->keepalive_callout); for (i = 0; i < IDX_COUNT; i++) { - if (!callout_drain(&sp->ch[i])) - callout_stop(&sp->ch[i]); + callout_drain(&sp->ch[i]); } - if (!callout_drain(&sp->pap_my_to_ch)) - callout_stop(&sp->pap_my_to_ch); + callout_drain(&sp->pap_my_to_ch); + mtx_destroy(&sp->pp_cpq.ifq_mtx); mtx_destroy(&sp->pp_fastq.ifq_mtx); mtx_destroy(&sp->mtx); @@ -4339,16 +4337,12 @@ sppp_chap_tld(struct sppp *sp) static void sppp_chap_scr(struct sppp *sp) { - u_long *ch, seed; + u_long *ch; u_char clen; /* Compute random challenge. */ ch = (u_long *)sp->myauth.challenge; - read_random(&seed, sizeof seed); - ch[0] = seed ^ random(); - ch[1] = seed ^ random(); - ch[2] = seed ^ random(); - ch[3] = seed ^ random(); + arc4random_buf(ch, 4 * sizeof(*ch)); clen = AUTHKEYLEN; sp->confid[IDX_CHAP] = ++sp->pp_seq[IDX_CHAP]; @@ -4809,7 +4803,7 @@ sppp_keepalive(void *dummy) sppp_cisco_send (sp, CISCO_KEEPALIVE_REQ, ++sp->pp_seq[IDX_LCP], sp->pp_rseq[IDX_LCP]); else if (sp->pp_phase >= PHASE_AUTHENTICATE) { - long nmagic = htonl (sp->lcp.magic); + uint32_t nmagic = htonl(sp->lcp.magic); sp->lcp.echoid = ++sp->pp_seq[IDX_LCP]; sppp_cp_send (sp, PPP_LCP, ECHO_REQ, sp->lcp.echoid, 4, &nmagic); diff --git a/freebsd/sys/net/if_stf.c b/freebsd/sys/net/if_stf.c index 3ba9f8c0..7185fb8d 100644 --- a/freebsd/sys/net/if_stf.c +++ b/freebsd/sys/net/if_stf.c @@ -730,6 +730,7 @@ stf_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) } ifp->if_flags |= IFF_UP; + ifp->if_drv_flags |= IFF_DRV_RUNNING; break; case SIOCADDMULTI: diff --git a/freebsd/sys/net/if_tap.c b/freebsd/sys/net/if_tap.c deleted file mode 100644 index dbf3e599..00000000 --- a/freebsd/sys/net/if_tap.c +++ /dev/null @@ -1,1133 +0,0 @@ -#include <machine/rtems-bsd-kernel-space.h> - -/*- - * SPDX-License-Identifier: BSD-2-Clause-FreeBSD - * - * Copyright (C) 1999-2000 by Maksim Yevmenkin <m_evmenkin@yahoo.com> - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * BASED ON: - * ------------------------------------------------------------------------- - * - * Copyright (c) 1988, Julian Onions <jpo@cs.nott.ac.uk> - * Nottingham University 1987. - */ - -/* - * $FreeBSD$ - * $Id: if_tap.c,v 0.21 2000/07/23 21:46:02 max Exp $ - */ - -#include <rtems/bsd/local/opt_inet.h> - -#include <sys/param.h> -#include <sys/conf.h> -#include <sys/fcntl.h> -#include <sys/filio.h> -#include <sys/jail.h> -#include <sys/kernel.h> -#include <sys/malloc.h> -#include <sys/mbuf.h> -#include <sys/module.h> -#include <sys/poll.h> -#include <sys/priv.h> -#include <sys/proc.h> -#include <sys/selinfo.h> -#include <sys/signalvar.h> -#include <sys/socket.h> -#include <sys/sockio.h> -#include <sys/sysctl.h> -#include <sys/systm.h> -#include <sys/ttycom.h> -#include <sys/uio.h> -#include <sys/queue.h> - -#include <net/bpf.h> -#include <net/ethernet.h> -#include <net/if.h> -#include <net/if_var.h> -#include <net/if_clone.h> -#include <net/if_dl.h> -#include <net/if_media.h> -#include <net/if_types.h> -#include <net/route.h> -#include <net/vnet.h> - -#include <netinet/in.h> - -#include <net/if_tapvar.h> -#include <net/if_tap.h> - -#define CDEV_NAME "tap" -#define TAPDEBUG if (tapdebug) printf - -static const char tapname[] = "tap"; -static const char vmnetname[] = "vmnet"; -#define TAPMAXUNIT 0x7fff -#define VMNET_DEV_MASK CLONE_FLAG0 - -/* module */ -static int tapmodevent(module_t, int, void *); - -/* device */ -static void tapclone(void *, struct ucred *, char *, int, - struct cdev **); -static void tapcreate(struct cdev *); - -/* network interface */ -static void tapifstart(struct ifnet *); -static int tapifioctl(struct ifnet *, u_long, caddr_t); -static void tapifinit(void *); - -static int tap_clone_create(struct if_clone *, int, caddr_t); -static void tap_clone_destroy(struct ifnet *); -static struct if_clone *tap_cloner; -static int vmnet_clone_create(struct if_clone *, int, caddr_t); -static void vmnet_clone_destroy(struct ifnet *); -static struct if_clone *vmnet_cloner; - -/* character device */ -static d_open_t tapopen; -static d_close_t tapclose; -static d_read_t tapread; -static d_write_t tapwrite; -static d_ioctl_t tapioctl; -static d_poll_t tappoll; -static d_kqfilter_t tapkqfilter; - -/* kqueue(2) */ -static int tapkqread(struct knote *, long); -static int tapkqwrite(struct knote *, long); -static void tapkqdetach(struct knote *); - -static struct filterops tap_read_filterops = { - .f_isfd = 1, - .f_attach = NULL, - .f_detach = tapkqdetach, - .f_event = tapkqread, -}; - -static struct filterops tap_write_filterops = { - .f_isfd = 1, - .f_attach = NULL, - .f_detach = tapkqdetach, - .f_event = tapkqwrite, -}; - -static struct cdevsw tap_cdevsw = { - .d_version = D_VERSION, - .d_flags = D_NEEDMINOR, - .d_open = tapopen, - .d_close = tapclose, - .d_read = tapread, - .d_write = tapwrite, - .d_ioctl = tapioctl, - .d_poll = tappoll, - .d_name = CDEV_NAME, - .d_kqfilter = tapkqfilter, -}; - -/* - * All global variables in if_tap.c are locked with tapmtx, with the - * exception of tapdebug, which is accessed unlocked; tapclones is - * static at runtime. - */ -static struct mtx tapmtx; -static int tapdebug = 0; /* debug flag */ -static int tapuopen = 0; /* allow user open() */ -static int tapuponopen = 0; /* IFF_UP on open() */ -static int tapdclone = 1; /* enable devfs cloning */ -static SLIST_HEAD(, tap_softc) taphead; /* first device */ -static struct clonedevs *tapclones; - -MALLOC_DECLARE(M_TAP); -MALLOC_DEFINE(M_TAP, CDEV_NAME, "Ethernet tunnel interface"); -SYSCTL_INT(_debug, OID_AUTO, if_tap_debug, CTLFLAG_RW, &tapdebug, 0, ""); - -SYSCTL_DECL(_net_link); -static SYSCTL_NODE(_net_link, OID_AUTO, tap, CTLFLAG_RW, 0, - "Ethernet tunnel software network interface"); -SYSCTL_INT(_net_link_tap, OID_AUTO, user_open, CTLFLAG_RW, &tapuopen, 0, - "Allow user to open /dev/tap (based on node permissions)"); -SYSCTL_INT(_net_link_tap, OID_AUTO, up_on_open, CTLFLAG_RW, &tapuponopen, 0, - "Bring interface up when /dev/tap is opened"); -SYSCTL_INT(_net_link_tap, OID_AUTO, devfs_cloning, CTLFLAG_RWTUN, &tapdclone, 0, - "Enable legacy devfs interface creation"); -SYSCTL_INT(_net_link_tap, OID_AUTO, debug, CTLFLAG_RW, &tapdebug, 0, ""); - -DEV_MODULE(if_tap, tapmodevent, NULL); - -static int -tap_clone_create(struct if_clone *ifc, int unit, caddr_t params) -{ - struct cdev *dev; - int i; - - /* Find any existing device, or allocate new unit number. */ - i = clone_create(&tapclones, &tap_cdevsw, &unit, &dev, 0); - if (i) { - dev = make_dev(&tap_cdevsw, unit, UID_ROOT, GID_WHEEL, 0600, - "%s%d", tapname, unit); - } - - tapcreate(dev); - return (0); -} - -/* vmnet devices are tap devices in disguise */ -static int -vmnet_clone_create(struct if_clone *ifc, int unit, caddr_t params) -{ - struct cdev *dev; - int i; - - /* Find any existing device, or allocate new unit number. */ - i = clone_create(&tapclones, &tap_cdevsw, &unit, &dev, VMNET_DEV_MASK); - if (i) { - dev = make_dev(&tap_cdevsw, unit | VMNET_DEV_MASK, UID_ROOT, - GID_WHEEL, 0600, "%s%d", vmnetname, unit); - } - - tapcreate(dev); - return (0); -} - -static void -tap_destroy(struct tap_softc *tp) -{ - struct ifnet *ifp = tp->tap_ifp; - - CURVNET_SET(ifp->if_vnet); - destroy_dev(tp->tap_dev); - seldrain(&tp->tap_rsel); - knlist_clear(&tp->tap_rsel.si_note, 0); - knlist_destroy(&tp->tap_rsel.si_note); - ether_ifdetach(ifp); - if_free(ifp); - - mtx_destroy(&tp->tap_mtx); - free(tp, M_TAP); - CURVNET_RESTORE(); -} - -static void -tap_clone_destroy(struct ifnet *ifp) -{ - struct tap_softc *tp = ifp->if_softc; - - mtx_lock(&tapmtx); - SLIST_REMOVE(&taphead, tp, tap_softc, tap_next); - mtx_unlock(&tapmtx); - tap_destroy(tp); -} - -/* vmnet devices are tap devices in disguise */ -static void -vmnet_clone_destroy(struct ifnet *ifp) -{ - tap_clone_destroy(ifp); -} - -/* - * tapmodevent - * - * module event handler - */ -static int -tapmodevent(module_t mod, int type, void *data) -{ - static eventhandler_tag eh_tag = NULL; - struct tap_softc *tp = NULL; - struct ifnet *ifp = NULL; - - switch (type) { - case MOD_LOAD: - - /* intitialize device */ - - mtx_init(&tapmtx, "tapmtx", NULL, MTX_DEF); - SLIST_INIT(&taphead); - - clone_setup(&tapclones); - eh_tag = EVENTHANDLER_REGISTER(dev_clone, tapclone, 0, 1000); - if (eh_tag == NULL) { - clone_cleanup(&tapclones); - mtx_destroy(&tapmtx); - return (ENOMEM); - } - tap_cloner = if_clone_simple(tapname, tap_clone_create, - tap_clone_destroy, 0); - vmnet_cloner = if_clone_simple(vmnetname, vmnet_clone_create, - vmnet_clone_destroy, 0); - return (0); - - case MOD_UNLOAD: - /* - * The EBUSY algorithm here can't quite atomically - * guarantee that this is race-free since we have to - * release the tap mtx to deregister the clone handler. - */ - mtx_lock(&tapmtx); - SLIST_FOREACH(tp, &taphead, tap_next) { - mtx_lock(&tp->tap_mtx); - if (tp->tap_flags & TAP_OPEN) { - mtx_unlock(&tp->tap_mtx); - mtx_unlock(&tapmtx); - return (EBUSY); - } - mtx_unlock(&tp->tap_mtx); - } - mtx_unlock(&tapmtx); - - EVENTHANDLER_DEREGISTER(dev_clone, eh_tag); - if_clone_detach(tap_cloner); - if_clone_detach(vmnet_cloner); - drain_dev_clone_events(); - - mtx_lock(&tapmtx); - while ((tp = SLIST_FIRST(&taphead)) != NULL) { - SLIST_REMOVE_HEAD(&taphead, tap_next); - mtx_unlock(&tapmtx); - - ifp = tp->tap_ifp; - - TAPDEBUG("detaching %s\n", ifp->if_xname); - - tap_destroy(tp); - mtx_lock(&tapmtx); - } - mtx_unlock(&tapmtx); - clone_cleanup(&tapclones); - - mtx_destroy(&tapmtx); - - break; - - default: - return (EOPNOTSUPP); - } - - return (0); -} /* tapmodevent */ - - -/* - * DEVFS handler - * - * We need to support two kind of devices - tap and vmnet - */ -static void -tapclone(void *arg, struct ucred *cred, char *name, int namelen, struct cdev **dev) -{ - char devname[SPECNAMELEN + 1]; - int i, unit, append_unit; - int extra; - - if (*dev != NULL) - return; - - if (!tapdclone || - (!tapuopen && priv_check_cred(cred, PRIV_NET_IFCREATE) != 0)) - return; - - unit = 0; - append_unit = 0; - extra = 0; - - /* We're interested in only tap/vmnet devices. */ - if (strcmp(name, tapname) == 0) { - unit = -1; - } else if (strcmp(name, vmnetname) == 0) { - unit = -1; - extra = VMNET_DEV_MASK; - } else if (dev_stdclone(name, NULL, tapname, &unit) != 1) { - if (dev_stdclone(name, NULL, vmnetname, &unit) != 1) { - return; - } else { - extra = VMNET_DEV_MASK; - } - } - - if (unit == -1) - append_unit = 1; - - CURVNET_SET(CRED_TO_VNET(cred)); - /* find any existing device, or allocate new unit number */ - i = clone_create(&tapclones, &tap_cdevsw, &unit, dev, extra); - if (i) { - if (append_unit) { - /* - * We were passed 'tun' or 'tap', with no unit specified - * so we'll need to append it now. - */ - namelen = snprintf(devname, sizeof(devname), "%s%d", name, - unit); - name = devname; - } - - *dev = make_dev_credf(MAKEDEV_REF, &tap_cdevsw, unit | extra, - cred, UID_ROOT, GID_WHEEL, 0600, "%s", name); - } - - if_clone_create(name, namelen, NULL); - CURVNET_RESTORE(); -} /* tapclone */ - - -/* - * tapcreate - * - * to create interface - */ -static void -tapcreate(struct cdev *dev) -{ - struct ifnet *ifp = NULL; - struct tap_softc *tp = NULL; - unsigned short macaddr_hi; - uint32_t macaddr_mid; - int unit; - const char *name = NULL; - u_char eaddr[6]; - - /* allocate driver storage and create device */ - tp = malloc(sizeof(*tp), M_TAP, M_WAITOK | M_ZERO); - mtx_init(&tp->tap_mtx, "tap_mtx", NULL, MTX_DEF); - mtx_lock(&tapmtx); - SLIST_INSERT_HEAD(&taphead, tp, tap_next); - mtx_unlock(&tapmtx); - - unit = dev2unit(dev); - - /* select device: tap or vmnet */ - if (unit & VMNET_DEV_MASK) { - name = vmnetname; - tp->tap_flags |= TAP_VMNET; - } else - name = tapname; - - unit &= TAPMAXUNIT; - - TAPDEBUG("tapcreate(%s%d). minor = %#x\n", name, unit, dev2unit(dev)); - - /* generate fake MAC address: 00 bd xx xx xx unit_no */ - macaddr_hi = htons(0x00bd); - macaddr_mid = (uint32_t) ticks; - bcopy(&macaddr_hi, eaddr, sizeof(short)); - bcopy(&macaddr_mid, &eaddr[2], sizeof(uint32_t)); - eaddr[5] = (u_char)unit; - - /* fill the rest and attach interface */ - ifp = tp->tap_ifp = if_alloc(IFT_ETHER); - if (ifp == NULL) - panic("%s%d: can not if_alloc()", name, unit); - ifp->if_softc = tp; - if_initname(ifp, name, unit); - ifp->if_init = tapifinit; - ifp->if_start = tapifstart; - ifp->if_ioctl = tapifioctl; - ifp->if_mtu = ETHERMTU; - ifp->if_flags = (IFF_BROADCAST|IFF_SIMPLEX|IFF_MULTICAST); - IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen); - ifp->if_capabilities |= IFCAP_LINKSTATE; - ifp->if_capenable |= IFCAP_LINKSTATE; - - dev->si_drv1 = tp; - tp->tap_dev = dev; - - ether_ifattach(ifp, eaddr); - - mtx_lock(&tp->tap_mtx); - tp->tap_flags |= TAP_INITED; - mtx_unlock(&tp->tap_mtx); - - knlist_init_mtx(&tp->tap_rsel.si_note, &tp->tap_mtx); - - TAPDEBUG("interface %s is created. minor = %#x\n", - ifp->if_xname, dev2unit(dev)); -} /* tapcreate */ - - -/* - * tapopen - * - * to open tunnel. must be superuser - */ -static int -tapopen(struct cdev *dev, int flag, int mode, struct thread *td) -{ - struct tap_softc *tp = NULL; - struct ifnet *ifp = NULL; - int error; - - if (tapuopen == 0) { - error = priv_check(td, PRIV_NET_TAP); - if (error) - return (error); - } - - if ((dev2unit(dev) & CLONE_UNITMASK) > TAPMAXUNIT) - return (ENXIO); - - tp = dev->si_drv1; - - mtx_lock(&tp->tap_mtx); - if (tp->tap_flags & TAP_OPEN) { - mtx_unlock(&tp->tap_mtx); - return (EBUSY); - } - - bcopy(IF_LLADDR(tp->tap_ifp), tp->ether_addr, sizeof(tp->ether_addr)); -#ifndef __rtems__ - tp->tap_pid = td->td_proc->p_pid; -#else /* __rtems__ */ - tp->tap_pid = BSD_DEFAULT_PID; -#endif /* __rtems__ */ - tp->tap_flags |= TAP_OPEN; - ifp = tp->tap_ifp; - - ifp->if_drv_flags |= IFF_DRV_RUNNING; - ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; - if (tapuponopen) - ifp->if_flags |= IFF_UP; - if_link_state_change(ifp, LINK_STATE_UP); - mtx_unlock(&tp->tap_mtx); - - TAPDEBUG("%s is open. minor = %#x\n", ifp->if_xname, dev2unit(dev)); - - return (0); -} /* tapopen */ - - -/* - * tapclose - * - * close the device - mark i/f down & delete routing info - */ -static int -tapclose(struct cdev *dev, int foo, int bar, struct thread *td) -{ - struct ifaddr *ifa; - struct tap_softc *tp = dev->si_drv1; - struct ifnet *ifp = tp->tap_ifp; - - /* junk all pending output */ - mtx_lock(&tp->tap_mtx); - CURVNET_SET(ifp->if_vnet); - IF_DRAIN(&ifp->if_snd); - - /* - * Do not bring the interface down, and do not anything with - * interface, if we are in VMnet mode. Just close the device. - */ - if (((tp->tap_flags & TAP_VMNET) == 0) && - (ifp->if_flags & (IFF_UP | IFF_LINK0)) == IFF_UP) { - mtx_unlock(&tp->tap_mtx); - if_down(ifp); - mtx_lock(&tp->tap_mtx); - if (ifp->if_drv_flags & IFF_DRV_RUNNING) { - ifp->if_drv_flags &= ~IFF_DRV_RUNNING; - mtx_unlock(&tp->tap_mtx); - CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { - rtinit(ifa, (int)RTM_DELETE, 0); - } - if_purgeaddrs(ifp); - mtx_lock(&tp->tap_mtx); - } - } - - if_link_state_change(ifp, LINK_STATE_DOWN); - CURVNET_RESTORE(); - - funsetown(&tp->tap_sigio); - selwakeuppri(&tp->tap_rsel, PZERO+1); - KNOTE_LOCKED(&tp->tap_rsel.si_note, 0); - - tp->tap_flags &= ~TAP_OPEN; - tp->tap_pid = 0; - mtx_unlock(&tp->tap_mtx); - - TAPDEBUG("%s is closed. minor = %#x\n", - ifp->if_xname, dev2unit(dev)); - - return (0); -} /* tapclose */ - - -/* - * tapifinit - * - * network interface initialization function - */ -static void -tapifinit(void *xtp) -{ - struct tap_softc *tp = (struct tap_softc *)xtp; - struct ifnet *ifp = tp->tap_ifp; - - TAPDEBUG("initializing %s\n", ifp->if_xname); - - mtx_lock(&tp->tap_mtx); - ifp->if_drv_flags |= IFF_DRV_RUNNING; - ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; - mtx_unlock(&tp->tap_mtx); - - /* attempt to start output */ - tapifstart(ifp); -} /* tapifinit */ - - -/* - * tapifioctl - * - * Process an ioctl request on network interface - */ -static int -tapifioctl(struct ifnet *ifp, u_long cmd, caddr_t data) -{ - struct tap_softc *tp = ifp->if_softc; - struct ifreq *ifr = (struct ifreq *)data; - struct ifstat *ifs = NULL; - struct ifmediareq *ifmr = NULL; - int dummy, error = 0; - - switch (cmd) { - case SIOCSIFFLAGS: /* XXX -- just like vmnet does */ - case SIOCADDMULTI: - case SIOCDELMULTI: - break; - - case SIOCGIFMEDIA: - ifmr = (struct ifmediareq *)data; - dummy = ifmr->ifm_count; - ifmr->ifm_count = 1; - ifmr->ifm_status = IFM_AVALID; - ifmr->ifm_active = IFM_ETHER; - if (tp->tap_flags & TAP_OPEN) - ifmr->ifm_status |= IFM_ACTIVE; - ifmr->ifm_current = ifmr->ifm_active; - if (dummy >= 1) { - int media = IFM_ETHER; - error = copyout(&media, ifmr->ifm_ulist, - sizeof(int)); - } - break; - - case SIOCSIFMTU: - ifp->if_mtu = ifr->ifr_mtu; - break; - - case SIOCGIFSTATUS: - ifs = (struct ifstat *)data; - mtx_lock(&tp->tap_mtx); - if (tp->tap_pid != 0) - snprintf(ifs->ascii, sizeof(ifs->ascii), - "\tOpened by PID %d\n", tp->tap_pid); - else - ifs->ascii[0] = '\0'; - mtx_unlock(&tp->tap_mtx); - break; - - default: - error = ether_ioctl(ifp, cmd, data); - break; - } - - return (error); -} /* tapifioctl */ - - -/* - * tapifstart - * - * queue packets from higher level ready to put out - */ -static void -tapifstart(struct ifnet *ifp) -{ - struct tap_softc *tp = ifp->if_softc; - - TAPDEBUG("%s starting\n", ifp->if_xname); - - /* - * do not junk pending output if we are in VMnet mode. - * XXX: can this do any harm because of queue overflow? - */ - - mtx_lock(&tp->tap_mtx); - if (((tp->tap_flags & TAP_VMNET) == 0) && - ((tp->tap_flags & TAP_READY) != TAP_READY)) { - struct mbuf *m; - - /* Unlocked read. */ - TAPDEBUG("%s not ready, tap_flags = 0x%x\n", ifp->if_xname, - tp->tap_flags); - - for (;;) { - IF_DEQUEUE(&ifp->if_snd, m); - if (m != NULL) { - m_freem(m); - if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); - } else - break; - } - mtx_unlock(&tp->tap_mtx); - - return; - } - - ifp->if_drv_flags |= IFF_DRV_OACTIVE; - - if (!IFQ_IS_EMPTY(&ifp->if_snd)) { - if (tp->tap_flags & TAP_RWAIT) { - tp->tap_flags &= ~TAP_RWAIT; - wakeup(tp); - } - - if ((tp->tap_flags & TAP_ASYNC) && (tp->tap_sigio != NULL)) { - mtx_unlock(&tp->tap_mtx); - pgsigio(&tp->tap_sigio, SIGIO, 0); - mtx_lock(&tp->tap_mtx); - } - - selwakeuppri(&tp->tap_rsel, PZERO+1); - KNOTE_LOCKED(&tp->tap_rsel.si_note, 0); - if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); /* obytes are counted in ether_output */ - } - - ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; - mtx_unlock(&tp->tap_mtx); -} /* tapifstart */ - - -/* - * tapioctl - * - * the cdevsw interface is now pretty minimal - */ -static int -tapioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, struct thread *td) -{ - struct ifreq ifr; - struct tap_softc *tp = dev->si_drv1; - struct ifnet *ifp = tp->tap_ifp; - struct tapinfo *tapp = NULL; - int f; - int error; -#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \ - defined(COMPAT_FREEBSD4) - int ival; -#endif - - switch (cmd) { - case TAPSIFINFO: - tapp = (struct tapinfo *)data; - if (ifp->if_type != tapp->type) - return (EPROTOTYPE); - mtx_lock(&tp->tap_mtx); - if (ifp->if_mtu != tapp->mtu) { - strlcpy(ifr.ifr_name, if_name(ifp), IFNAMSIZ); - ifr.ifr_mtu = tapp->mtu; - CURVNET_SET(ifp->if_vnet); - error = ifhwioctl(SIOCSIFMTU, ifp, - (caddr_t)&ifr, td); - CURVNET_RESTORE(); - if (error) { - mtx_unlock(&tp->tap_mtx); - return (error); - } - } - ifp->if_baudrate = tapp->baudrate; - mtx_unlock(&tp->tap_mtx); - break; - - case TAPGIFINFO: - tapp = (struct tapinfo *)data; - mtx_lock(&tp->tap_mtx); - tapp->mtu = ifp->if_mtu; - tapp->type = ifp->if_type; - tapp->baudrate = ifp->if_baudrate; - mtx_unlock(&tp->tap_mtx); - break; - - case TAPSDEBUG: - tapdebug = *(int *)data; - break; - - case TAPGDEBUG: - *(int *)data = tapdebug; - break; - - case TAPGIFNAME: { - struct ifreq *ifr = (struct ifreq *) data; - - strlcpy(ifr->ifr_name, ifp->if_xname, IFNAMSIZ); - } break; - - case FIONBIO: - break; - - case FIOASYNC: - mtx_lock(&tp->tap_mtx); - if (*(int *)data) - tp->tap_flags |= TAP_ASYNC; - else - tp->tap_flags &= ~TAP_ASYNC; - mtx_unlock(&tp->tap_mtx); - break; - - case FIONREAD: - if (!IFQ_IS_EMPTY(&ifp->if_snd)) { - struct mbuf *mb; - - IFQ_LOCK(&ifp->if_snd); - IFQ_POLL_NOLOCK(&ifp->if_snd, mb); - for (*(int *)data = 0; mb != NULL; - mb = mb->m_next) - *(int *)data += mb->m_len; - IFQ_UNLOCK(&ifp->if_snd); - } else - *(int *)data = 0; - break; - - case FIOSETOWN: - return (fsetown(*(int *)data, &tp->tap_sigio)); - - case FIOGETOWN: - *(int *)data = fgetown(&tp->tap_sigio); - return (0); - - /* this is deprecated, FIOSETOWN should be used instead */ - case TIOCSPGRP: - return (fsetown(-(*(int *)data), &tp->tap_sigio)); - - /* this is deprecated, FIOGETOWN should be used instead */ - case TIOCGPGRP: - *(int *)data = -fgetown(&tp->tap_sigio); - return (0); - - /* VMware/VMnet port ioctl's */ - -#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \ - defined(COMPAT_FREEBSD4) - case _IO('V', 0): - ival = IOCPARM_IVAL(data); - data = (caddr_t)&ival; - /* FALLTHROUGH */ -#endif - case VMIO_SIOCSIFFLAGS: /* VMware/VMnet SIOCSIFFLAGS */ - f = *(int *)data; - f &= 0x0fff; - f &= ~IFF_CANTCHANGE; - f |= IFF_UP; - - mtx_lock(&tp->tap_mtx); - ifp->if_flags = f | (ifp->if_flags & IFF_CANTCHANGE); - mtx_unlock(&tp->tap_mtx); - break; - - case SIOCGIFADDR: /* get MAC address of the remote side */ - mtx_lock(&tp->tap_mtx); - bcopy(tp->ether_addr, data, sizeof(tp->ether_addr)); - mtx_unlock(&tp->tap_mtx); - break; - - case SIOCSIFADDR: /* set MAC address of the remote side */ - mtx_lock(&tp->tap_mtx); - bcopy(data, tp->ether_addr, sizeof(tp->ether_addr)); - mtx_unlock(&tp->tap_mtx); - break; - - default: - return (ENOTTY); - } - return (0); -} /* tapioctl */ - - -/* - * tapread - * - * the cdevsw read interface - reads a packet at a time, or at - * least as much of a packet as can be read - */ -static int -tapread(struct cdev *dev, struct uio *uio, int flag) -{ - struct tap_softc *tp = dev->si_drv1; - struct ifnet *ifp = tp->tap_ifp; - struct mbuf *m = NULL; - int error = 0, len; - - TAPDEBUG("%s reading, minor = %#x\n", ifp->if_xname, dev2unit(dev)); - - mtx_lock(&tp->tap_mtx); - if ((tp->tap_flags & TAP_READY) != TAP_READY) { - mtx_unlock(&tp->tap_mtx); - - /* Unlocked read. */ - TAPDEBUG("%s not ready. minor = %#x, tap_flags = 0x%x\n", - ifp->if_xname, dev2unit(dev), tp->tap_flags); - - return (EHOSTDOWN); - } - - tp->tap_flags &= ~TAP_RWAIT; - - /* sleep until we get a packet */ - do { - IF_DEQUEUE(&ifp->if_snd, m); - - if (m == NULL) { - if (flag & O_NONBLOCK) { - mtx_unlock(&tp->tap_mtx); - return (EWOULDBLOCK); - } - - tp->tap_flags |= TAP_RWAIT; - error = mtx_sleep(tp, &tp->tap_mtx, PCATCH | (PZERO + 1), - "taprd", 0); - if (error) { - mtx_unlock(&tp->tap_mtx); - return (error); - } - } - } while (m == NULL); - mtx_unlock(&tp->tap_mtx); - - /* feed packet to bpf */ - BPF_MTAP(ifp, m); - - /* xfer packet to user space */ - while ((m != NULL) && (uio->uio_resid > 0) && (error == 0)) { - len = min(uio->uio_resid, m->m_len); - if (len == 0) - break; - - error = uiomove(mtod(m, void *), len, uio); - m = m_free(m); - } - - if (m != NULL) { - TAPDEBUG("%s dropping mbuf, minor = %#x\n", ifp->if_xname, - dev2unit(dev)); - m_freem(m); - } - - return (error); -} /* tapread */ - - -/* - * tapwrite - * - * the cdevsw write interface - an atomic write is a packet - or else! - */ -static int -tapwrite(struct cdev *dev, struct uio *uio, int flag) -{ - struct ether_header *eh; - struct tap_softc *tp = dev->si_drv1; - struct ifnet *ifp = tp->tap_ifp; - struct mbuf *m; - - TAPDEBUG("%s writing, minor = %#x\n", - ifp->if_xname, dev2unit(dev)); - - if (uio->uio_resid == 0) - return (0); - - if ((uio->uio_resid < 0) || (uio->uio_resid > TAPMRU)) { - TAPDEBUG("%s invalid packet len = %zd, minor = %#x\n", - ifp->if_xname, uio->uio_resid, dev2unit(dev)); - - return (EIO); - } - - if ((m = m_uiotombuf(uio, M_NOWAIT, 0, ETHER_ALIGN, - M_PKTHDR)) == NULL) { - if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); - return (ENOBUFS); - } - - m->m_pkthdr.rcvif = ifp; - - /* - * Only pass a unicast frame to ether_input(), if it would actually - * have been received by non-virtual hardware. - */ - if (m->m_len < sizeof(struct ether_header)) { - m_freem(m); - return (0); - } - eh = mtod(m, struct ether_header *); - - if (eh && (ifp->if_flags & IFF_PROMISC) == 0 && - !ETHER_IS_MULTICAST(eh->ether_dhost) && - bcmp(eh->ether_dhost, IF_LLADDR(ifp), ETHER_ADDR_LEN) != 0) { - m_freem(m); - return (0); - } - - /* Pass packet up to parent. */ - CURVNET_SET(ifp->if_vnet); - (*ifp->if_input)(ifp, m); - CURVNET_RESTORE(); - if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); /* ibytes are counted in parent */ - - return (0); -} /* tapwrite */ - - -/* - * tappoll - * - * the poll interface, this is only useful on reads - * really. the write detect always returns true, write never blocks - * anyway, it either accepts the packet or drops it - */ -static int -tappoll(struct cdev *dev, int events, struct thread *td) -{ - struct tap_softc *tp = dev->si_drv1; - struct ifnet *ifp = tp->tap_ifp; - int revents = 0; - - TAPDEBUG("%s polling, minor = %#x\n", - ifp->if_xname, dev2unit(dev)); - - if (events & (POLLIN | POLLRDNORM)) { - IFQ_LOCK(&ifp->if_snd); - if (!IFQ_IS_EMPTY(&ifp->if_snd)) { - TAPDEBUG("%s have data in queue. len = %d, " \ - "minor = %#x\n", ifp->if_xname, - ifp->if_snd.ifq_len, dev2unit(dev)); - - revents |= (events & (POLLIN | POLLRDNORM)); - } else { - TAPDEBUG("%s waiting for data, minor = %#x\n", - ifp->if_xname, dev2unit(dev)); - - selrecord(td, &tp->tap_rsel); - } - IFQ_UNLOCK(&ifp->if_snd); - } - - if (events & (POLLOUT | POLLWRNORM)) - revents |= (events & (POLLOUT | POLLWRNORM)); - - return (revents); -} /* tappoll */ - - -/* - * tap_kqfilter - * - * support for kevent() system call - */ -static int -tapkqfilter(struct cdev *dev, struct knote *kn) -{ - struct tap_softc *tp = dev->si_drv1; - struct ifnet *ifp = tp->tap_ifp; - - switch (kn->kn_filter) { - case EVFILT_READ: - TAPDEBUG("%s kqfilter: EVFILT_READ, minor = %#x\n", - ifp->if_xname, dev2unit(dev)); - kn->kn_fop = &tap_read_filterops; - break; - - case EVFILT_WRITE: - TAPDEBUG("%s kqfilter: EVFILT_WRITE, minor = %#x\n", - ifp->if_xname, dev2unit(dev)); - kn->kn_fop = &tap_write_filterops; - break; - - default: - TAPDEBUG("%s kqfilter: invalid filter, minor = %#x\n", - ifp->if_xname, dev2unit(dev)); - return (EINVAL); - /* NOT REACHED */ - } - - kn->kn_hook = tp; - knlist_add(&tp->tap_rsel.si_note, kn, 0); - - return (0); -} /* tapkqfilter */ - - -/* - * tap_kqread - * - * Return true if there is data in the interface queue - */ -static int -tapkqread(struct knote *kn, long hint) -{ - int ret; - struct tap_softc *tp = kn->kn_hook; - struct cdev *dev = tp->tap_dev; - struct ifnet *ifp = tp->tap_ifp; - - if ((kn->kn_data = ifp->if_snd.ifq_len) > 0) { - TAPDEBUG("%s have data in queue. len = %d, minor = %#x\n", - ifp->if_xname, ifp->if_snd.ifq_len, dev2unit(dev)); - ret = 1; - } else { - TAPDEBUG("%s waiting for data, minor = %#x\n", - ifp->if_xname, dev2unit(dev)); - ret = 0; - } - - return (ret); -} /* tapkqread */ - - -/* - * tap_kqwrite - * - * Always can write. Return the MTU in kn->data - */ -static int -tapkqwrite(struct knote *kn, long hint) -{ - struct tap_softc *tp = kn->kn_hook; - struct ifnet *ifp = tp->tap_ifp; - - kn->kn_data = ifp->if_mtu; - - return (1); -} /* tapkqwrite */ - - -static void -tapkqdetach(struct knote *kn) -{ - struct tap_softc *tp = kn->kn_hook; - - knlist_remove(&tp->tap_rsel.si_note, kn, 0); -} /* tapkqdetach */ - diff --git a/freebsd/sys/net/if_tap.h b/freebsd/sys/net/if_tap.h index 34f44b38..9718cee4 100644 --- a/freebsd/sys/net/if_tap.h +++ b/freebsd/sys/net/if_tap.h @@ -40,24 +40,22 @@ #ifndef _NET_IF_TAP_H_ #define _NET_IF_TAP_H_ -/* refer to if_tapvar.h for the softc stuff */ +#include <net/if_tun.h> /* maximum receive packet size (hard limit) */ #define TAPMRU 16384 -struct tapinfo { - int baudrate; /* linespeed */ - short mtu; /* maximum transmission unit */ - u_char type; /* ethernet, tokenring, etc. */ - u_char dummy; /* place holder */ -}; +#define tapinfo tuninfo -/* ioctl's for get/set debug */ -#define TAPSDEBUG _IOW('t', 90, int) -#define TAPGDEBUG _IOR('t', 89, int) -#define TAPSIFINFO _IOW('t', 91, struct tapinfo) -#define TAPGIFINFO _IOR('t', 92, struct tapinfo) -#define TAPGIFNAME _IOR('t', 93, struct ifreq) +/* + * ioctl's for get/set debug; these are aliases of TUN* ioctls, see net/if_tun.h + * for details. + */ +#define TAPSDEBUG TUNSDEBUG +#define TAPGDEBUG TUNGDEBUG +#define TAPSIFINFO TUNSIFINFO +#define TAPGIFINFO TUNGIFINFO +#define TAPGIFNAME TUNGIFNAME /* VMware ioctl's */ #define VMIO_SIOCSIFFLAGS _IOWINT('V', 0) diff --git a/freebsd/sys/net/if_tapvar.h b/freebsd/sys/net/if_tapvar.h deleted file mode 100644 index f5cf9f3e..00000000 --- a/freebsd/sys/net/if_tapvar.h +++ /dev/null @@ -1,71 +0,0 @@ -/*- - * SPDX-License-Identifier: BSD-2-Clause-FreeBSD - * - * Copyright (C) 1999-2000 by Maksim Yevmenkin <m_evmenkin@yahoo.com> - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * BASED ON: - * ------------------------------------------------------------------------- - * - * Copyright (c) 1998 Brian Somers <brian@Awfulhak.org> - * All rights reserved. - * - * Copyright (c) 1988, Julian Onions <jpo@cs.nott.ac.uk> - * Nottingham University 1987. - */ - -/* - * $FreeBSD$ - * $Id: if_tapvar.h,v 0.6 2000/07/11 02:16:08 max Exp $ - */ - -#ifndef _NET_IF_TAPVAR_H_ -#define _NET_IF_TAPVAR_H_ - -/* - * tap_mtx locks tap_flags, tap_pid. tap_next locked with global tapmtx. - * Other fields locked by owning subsystems. - */ -struct tap_softc { - struct ifnet *tap_ifp; - u_short tap_flags; /* misc flags */ -#define TAP_OPEN (1 << 0) -#define TAP_INITED (1 << 1) -#define TAP_RWAIT (1 << 2) -#define TAP_ASYNC (1 << 3) -#define TAP_READY (TAP_OPEN|TAP_INITED) -#define TAP_VMNET (1 << 4) - - u_int8_t ether_addr[ETHER_ADDR_LEN]; /* ether addr of the remote side */ - - pid_t tap_pid; /* PID of process to open */ - struct sigio *tap_sigio; /* information for async I/O */ - struct selinfo tap_rsel; /* read select */ - - SLIST_ENTRY(tap_softc) tap_next; /* next device in chain */ - struct cdev *tap_dev; - struct mtx tap_mtx; /* per-softc mutex */ -}; - -#endif /* !_NET_IF_TAPVAR_H_ */ diff --git a/freebsd/sys/net/if_tun.c b/freebsd/sys/net/if_tun.c deleted file mode 100644 index 44441773..00000000 --- a/freebsd/sys/net/if_tun.c +++ /dev/null @@ -1,1055 +0,0 @@ -#include <machine/rtems-bsd-kernel-space.h> - -/* $NetBSD: if_tun.c,v 1.14 1994/06/29 06:36:25 cgd Exp $ */ - -/*- - * Copyright (c) 1988, Julian Onions <jpo@cs.nott.ac.uk> - * Nottingham University 1987. - * - * This source may be freely distributed, however I would be interested - * in any changes that are made. - * - * This driver takes packets off the IP i/f and hands them up to a - * user process to have its wicked way with. This driver has it's - * roots in a similar driver written by Phil Cockcroft (formerly) at - * UCL. This driver is based much more on read/write/poll mode of - * operation though. - * - * $FreeBSD$ - */ - -#include <rtems/bsd/local/opt_inet.h> -#include <rtems/bsd/local/opt_inet6.h> - -#include <sys/param.h> -#include <sys/priv.h> -#include <sys/proc.h> -#include <sys/systm.h> -#include <sys/jail.h> -#include <sys/mbuf.h> -#include <sys/module.h> -#include <sys/socket.h> -#include <sys/fcntl.h> -#include <sys/filio.h> -#include <sys/sockio.h> -#include <sys/ttycom.h> -#include <sys/poll.h> -#include <sys/selinfo.h> -#include <sys/signalvar.h> -#include <sys/filedesc.h> -#include <sys/kernel.h> -#include <sys/sysctl.h> -#include <sys/conf.h> -#include <sys/uio.h> -#include <sys/malloc.h> -#include <sys/random.h> - -#include <net/if.h> -#include <net/if_var.h> -#include <net/if_clone.h> -#include <net/if_types.h> -#include <net/netisr.h> -#include <net/route.h> -#include <net/vnet.h> -#ifdef INET -#include <netinet/in.h> -#endif -#include <net/bpf.h> -#include <net/if_tun.h> - -#include <sys/queue.h> -#include <sys/condvar.h> - -#include <security/mac/mac_framework.h> - -/* - * tun_list is protected by global tunmtx. Other mutable fields are - * protected by tun->tun_mtx, or by their owning subsystem. tun_dev is - * static for the duration of a tunnel interface. - */ -struct tun_softc { - TAILQ_ENTRY(tun_softc) tun_list; - struct cdev *tun_dev; - u_short tun_flags; /* misc flags */ -#define TUN_OPEN 0x0001 -#define TUN_INITED 0x0002 -#define TUN_RCOLL 0x0004 -#define TUN_IASET 0x0008 -#define TUN_DSTADDR 0x0010 -#define TUN_LMODE 0x0020 -#define TUN_RWAIT 0x0040 -#define TUN_ASYNC 0x0080 -#define TUN_IFHEAD 0x0100 - -#define TUN_READY (TUN_OPEN | TUN_INITED) - - /* - * XXXRW: tun_pid is used to exclusively lock /dev/tun. Is this - * actually needed? Can we just return EBUSY if already open? - * Problem is that this involved inherent races when a tun device - * is handed off from one process to another, as opposed to just - * being slightly stale informationally. - */ - pid_t tun_pid; /* owning pid */ - struct ifnet *tun_ifp; /* the interface */ - struct sigio *tun_sigio; /* information for async I/O */ - struct selinfo tun_rsel; /* read select */ - struct mtx tun_mtx; /* protect mutable softc fields */ - struct cv tun_cv; /* protect against ref'd dev destroy */ -}; -#define TUN2IFP(sc) ((sc)->tun_ifp) - -#define TUNDEBUG if (tundebug) if_printf - -/* - * All mutable global variables in if_tun are locked using tunmtx, with - * the exception of tundebug, which is used unlocked, and tunclones, - * which is static after setup. - */ -static struct mtx tunmtx; -static const char tunname[] = "tun"; -static MALLOC_DEFINE(M_TUN, tunname, "Tunnel Interface"); -static int tundebug = 0; -static int tundclone = 1; -static struct clonedevs *tunclones; -static TAILQ_HEAD(,tun_softc) tunhead = TAILQ_HEAD_INITIALIZER(tunhead); -SYSCTL_INT(_debug, OID_AUTO, if_tun_debug, CTLFLAG_RW, &tundebug, 0, ""); - -SYSCTL_DECL(_net_link); -static SYSCTL_NODE(_net_link, OID_AUTO, tun, CTLFLAG_RW, 0, - "IP tunnel software network interface."); -SYSCTL_INT(_net_link_tun, OID_AUTO, devfs_cloning, CTLFLAG_RWTUN, &tundclone, 0, - "Enable legacy devfs interface creation."); - -static void tunclone(void *arg, struct ucred *cred, char *name, - int namelen, struct cdev **dev); -static void tuncreate(const char *name, struct cdev *dev); -static int tunifioctl(struct ifnet *, u_long, caddr_t); -static void tuninit(struct ifnet *); -static int tunmodevent(module_t, int, void *); -static int tunoutput(struct ifnet *, struct mbuf *, - const struct sockaddr *, struct route *ro); -static void tunstart(struct ifnet *); - -static int tun_clone_create(struct if_clone *, int, caddr_t); -static void tun_clone_destroy(struct ifnet *); -static struct if_clone *tun_cloner; - -static d_open_t tunopen; -static d_close_t tunclose; -static d_read_t tunread; -static d_write_t tunwrite; -static d_ioctl_t tunioctl; -static d_poll_t tunpoll; -static d_kqfilter_t tunkqfilter; - -static int tunkqread(struct knote *, long); -static int tunkqwrite(struct knote *, long); -static void tunkqdetach(struct knote *); - -static struct filterops tun_read_filterops = { - .f_isfd = 1, - .f_attach = NULL, - .f_detach = tunkqdetach, - .f_event = tunkqread, -}; - -static struct filterops tun_write_filterops = { - .f_isfd = 1, - .f_attach = NULL, - .f_detach = tunkqdetach, - .f_event = tunkqwrite, -}; - -static struct cdevsw tun_cdevsw = { - .d_version = D_VERSION, - .d_flags = D_NEEDMINOR, - .d_open = tunopen, - .d_close = tunclose, - .d_read = tunread, - .d_write = tunwrite, - .d_ioctl = tunioctl, - .d_poll = tunpoll, - .d_kqfilter = tunkqfilter, - .d_name = tunname, -}; - -static int -tun_clone_create(struct if_clone *ifc, int unit, caddr_t params) -{ - struct cdev *dev; - int i; - - /* find any existing device, or allocate new unit number */ - i = clone_create(&tunclones, &tun_cdevsw, &unit, &dev, 0); - if (i) { - /* No preexisting struct cdev *, create one */ - dev = make_dev(&tun_cdevsw, unit, - UID_UUCP, GID_DIALER, 0600, "%s%d", tunname, unit); - } - tuncreate(tunname, dev); - - return (0); -} - -static void -tunclone(void *arg, struct ucred *cred, char *name, int namelen, - struct cdev **dev) -{ - char devname[SPECNAMELEN + 1]; - int u, i, append_unit; - - if (*dev != NULL) - return; - - /* - * If tun cloning is enabled, only the superuser can create an - * interface. - */ - if (!tundclone || priv_check_cred(cred, PRIV_NET_IFCREATE) != 0) - return; - - if (strcmp(name, tunname) == 0) { - u = -1; - } else if (dev_stdclone(name, NULL, tunname, &u) != 1) - return; /* Don't recognise the name */ - if (u != -1 && u > IF_MAXUNIT) - return; /* Unit number too high */ - - if (u == -1) - append_unit = 1; - else - append_unit = 0; - - CURVNET_SET(CRED_TO_VNET(cred)); - /* find any existing device, or allocate new unit number */ - i = clone_create(&tunclones, &tun_cdevsw, &u, dev, 0); - if (i) { - if (append_unit) { - namelen = snprintf(devname, sizeof(devname), "%s%d", - name, u); - name = devname; - } - /* No preexisting struct cdev *, create one */ - *dev = make_dev_credf(MAKEDEV_REF, &tun_cdevsw, u, cred, - UID_UUCP, GID_DIALER, 0600, "%s", name); - } - - if_clone_create(name, namelen, NULL); - CURVNET_RESTORE(); -} - -static void -tun_destroy(struct tun_softc *tp) -{ - struct cdev *dev; - - mtx_lock(&tp->tun_mtx); - if ((tp->tun_flags & TUN_OPEN) != 0) - cv_wait_unlock(&tp->tun_cv, &tp->tun_mtx); - else - mtx_unlock(&tp->tun_mtx); - - CURVNET_SET(TUN2IFP(tp)->if_vnet); - dev = tp->tun_dev; - bpfdetach(TUN2IFP(tp)); - if_detach(TUN2IFP(tp)); - if_free(TUN2IFP(tp)); - destroy_dev(dev); - seldrain(&tp->tun_rsel); - knlist_clear(&tp->tun_rsel.si_note, 0); - knlist_destroy(&tp->tun_rsel.si_note); - mtx_destroy(&tp->tun_mtx); - cv_destroy(&tp->tun_cv); - free(tp, M_TUN); - CURVNET_RESTORE(); -} - -static void -tun_clone_destroy(struct ifnet *ifp) -{ - struct tun_softc *tp = ifp->if_softc; - - mtx_lock(&tunmtx); - TAILQ_REMOVE(&tunhead, tp, tun_list); - mtx_unlock(&tunmtx); - tun_destroy(tp); -} - -static int -tunmodevent(module_t mod, int type, void *data) -{ - static eventhandler_tag tag; - struct tun_softc *tp; - - switch (type) { - case MOD_LOAD: - mtx_init(&tunmtx, "tunmtx", NULL, MTX_DEF); - clone_setup(&tunclones); - tag = EVENTHANDLER_REGISTER(dev_clone, tunclone, 0, 1000); - if (tag == NULL) - return (ENOMEM); - tun_cloner = if_clone_simple(tunname, tun_clone_create, - tun_clone_destroy, 0); - break; - case MOD_UNLOAD: - if_clone_detach(tun_cloner); - EVENTHANDLER_DEREGISTER(dev_clone, tag); - drain_dev_clone_events(); - - mtx_lock(&tunmtx); - while ((tp = TAILQ_FIRST(&tunhead)) != NULL) { - TAILQ_REMOVE(&tunhead, tp, tun_list); - mtx_unlock(&tunmtx); - tun_destroy(tp); - mtx_lock(&tunmtx); - } - mtx_unlock(&tunmtx); - clone_cleanup(&tunclones); - mtx_destroy(&tunmtx); - break; - default: - return EOPNOTSUPP; - } - return 0; -} - -static moduledata_t tun_mod = { - "if_tun", - tunmodevent, - 0 -}; - -DECLARE_MODULE(if_tun, tun_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); -MODULE_VERSION(if_tun, 1); - -static void -tunstart(struct ifnet *ifp) -{ - struct tun_softc *tp = ifp->if_softc; - struct mbuf *m; - - TUNDEBUG(ifp,"%s starting\n", ifp->if_xname); - if (ALTQ_IS_ENABLED(&ifp->if_snd)) { - IFQ_LOCK(&ifp->if_snd); - IFQ_POLL_NOLOCK(&ifp->if_snd, m); - if (m == NULL) { - IFQ_UNLOCK(&ifp->if_snd); - return; - } - IFQ_UNLOCK(&ifp->if_snd); - } - - mtx_lock(&tp->tun_mtx); - if (tp->tun_flags & TUN_RWAIT) { - tp->tun_flags &= ~TUN_RWAIT; - wakeup(tp); - } - selwakeuppri(&tp->tun_rsel, PZERO + 1); - KNOTE_LOCKED(&tp->tun_rsel.si_note, 0); - if (tp->tun_flags & TUN_ASYNC && tp->tun_sigio) { - mtx_unlock(&tp->tun_mtx); - pgsigio(&tp->tun_sigio, SIGIO, 0); - } else - mtx_unlock(&tp->tun_mtx); -} - -/* XXX: should return an error code so it can fail. */ -static void -tuncreate(const char *name, struct cdev *dev) -{ - struct tun_softc *sc; - struct ifnet *ifp; - - sc = malloc(sizeof(*sc), M_TUN, M_WAITOK | M_ZERO); - mtx_init(&sc->tun_mtx, "tun_mtx", NULL, MTX_DEF); - cv_init(&sc->tun_cv, "tun_condvar"); - sc->tun_flags = TUN_INITED; - sc->tun_dev = dev; - mtx_lock(&tunmtx); - TAILQ_INSERT_TAIL(&tunhead, sc, tun_list); - mtx_unlock(&tunmtx); - - ifp = sc->tun_ifp = if_alloc(IFT_PPP); - if (ifp == NULL) - panic("%s%d: failed to if_alloc() interface.\n", - name, dev2unit(dev)); - if_initname(ifp, name, dev2unit(dev)); - ifp->if_mtu = TUNMTU; - ifp->if_ioctl = tunifioctl; - ifp->if_output = tunoutput; - ifp->if_start = tunstart; - ifp->if_flags = IFF_POINTOPOINT | IFF_MULTICAST; - ifp->if_softc = sc; - IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen); - ifp->if_snd.ifq_drv_maxlen = 0; - IFQ_SET_READY(&ifp->if_snd); - knlist_init_mtx(&sc->tun_rsel.si_note, &sc->tun_mtx); - ifp->if_capabilities |= IFCAP_LINKSTATE; - ifp->if_capenable |= IFCAP_LINKSTATE; - - if_attach(ifp); - bpfattach(ifp, DLT_NULL, sizeof(u_int32_t)); - dev->si_drv1 = sc; - TUNDEBUG(ifp, "interface %s is created, minor = %#x\n", - ifp->if_xname, dev2unit(dev)); -} - -static int -tunopen(struct cdev *dev, int flag, int mode, struct thread *td) -{ - struct ifnet *ifp; - struct tun_softc *tp; - - /* - * XXXRW: Non-atomic test and set of dev->si_drv1 requires - * synchronization. - */ - tp = dev->si_drv1; - if (!tp) { - tuncreate(tunname, dev); - tp = dev->si_drv1; - } - - /* - * XXXRW: This use of tun_pid is subject to error due to the - * fact that a reference to the tunnel can live beyond the - * death of the process that created it. Can we replace this - * with a simple busy flag? - */ - mtx_lock(&tp->tun_mtx); -#ifndef __rtems__ - if (tp->tun_pid != 0 && tp->tun_pid != td->td_proc->p_pid) { -#else /* __rtems__ */ - if (tp->tun_pid != 0 && tp->tun_pid != BSD_DEFAULT_PID) { -#endif /* __rtems__ */ - mtx_unlock(&tp->tun_mtx); - return (EBUSY); - } -#ifndef __rtems__ - tp->tun_pid = td->td_proc->p_pid; -#else /* __rtems__ */ - tp->tun_pid = BSD_DEFAULT_PID; -#endif /* __rtems__ */ - - tp->tun_flags |= TUN_OPEN; - ifp = TUN2IFP(tp); - if_link_state_change(ifp, LINK_STATE_UP); - TUNDEBUG(ifp, "open\n"); - mtx_unlock(&tp->tun_mtx); - - return (0); -} - -/* - * tunclose - close the device - mark i/f down & delete - * routing info - */ -static int -tunclose(struct cdev *dev, int foo, int bar, struct thread *td) -{ - struct tun_softc *tp; - struct ifnet *ifp; - - tp = dev->si_drv1; - ifp = TUN2IFP(tp); - - mtx_lock(&tp->tun_mtx); - tp->tun_flags &= ~TUN_OPEN; - tp->tun_pid = 0; - - /* - * junk all pending output - */ - CURVNET_SET(ifp->if_vnet); - IFQ_PURGE(&ifp->if_snd); - - if (ifp->if_flags & IFF_UP) { - mtx_unlock(&tp->tun_mtx); - if_down(ifp); - mtx_lock(&tp->tun_mtx); - } - - /* Delete all addresses and routes which reference this interface. */ - if (ifp->if_drv_flags & IFF_DRV_RUNNING) { - struct ifaddr *ifa; - - ifp->if_drv_flags &= ~IFF_DRV_RUNNING; - mtx_unlock(&tp->tun_mtx); - CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { - /* deal w/IPv4 PtP destination; unlocked read */ - if (ifa->ifa_addr->sa_family == AF_INET) { - rtinit(ifa, (int)RTM_DELETE, - tp->tun_flags & TUN_DSTADDR ? RTF_HOST : 0); - } else { - rtinit(ifa, (int)RTM_DELETE, 0); - } - } - if_purgeaddrs(ifp); - mtx_lock(&tp->tun_mtx); - } - if_link_state_change(ifp, LINK_STATE_DOWN); - CURVNET_RESTORE(); - - funsetown(&tp->tun_sigio); - selwakeuppri(&tp->tun_rsel, PZERO + 1); - KNOTE_LOCKED(&tp->tun_rsel.si_note, 0); - TUNDEBUG (ifp, "closed\n"); - - cv_broadcast(&tp->tun_cv); - mtx_unlock(&tp->tun_mtx); - return (0); -} - -static void -tuninit(struct ifnet *ifp) -{ - struct tun_softc *tp = ifp->if_softc; -#ifdef INET - struct ifaddr *ifa; -#endif - - TUNDEBUG(ifp, "tuninit\n"); - - mtx_lock(&tp->tun_mtx); - ifp->if_flags |= IFF_UP; - ifp->if_drv_flags |= IFF_DRV_RUNNING; - getmicrotime(&ifp->if_lastchange); - -#ifdef INET - if_addr_rlock(ifp); - CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { - if (ifa->ifa_addr->sa_family == AF_INET) { - struct sockaddr_in *si; - - si = (struct sockaddr_in *)ifa->ifa_addr; - if (si->sin_addr.s_addr) - tp->tun_flags |= TUN_IASET; - - si = (struct sockaddr_in *)ifa->ifa_dstaddr; - if (si && si->sin_addr.s_addr) - tp->tun_flags |= TUN_DSTADDR; - } - } - if_addr_runlock(ifp); -#endif - mtx_unlock(&tp->tun_mtx); -} - -/* - * Process an ioctl request. - */ -static int -tunifioctl(struct ifnet *ifp, u_long cmd, caddr_t data) -{ - struct ifreq *ifr = (struct ifreq *)data; - struct tun_softc *tp = ifp->if_softc; - struct ifstat *ifs; - int error = 0; - - switch(cmd) { - case SIOCGIFSTATUS: - ifs = (struct ifstat *)data; - mtx_lock(&tp->tun_mtx); - if (tp->tun_pid) - snprintf(ifs->ascii, sizeof(ifs->ascii), - "\tOpened by PID %d\n", tp->tun_pid); - else - ifs->ascii[0] = '\0'; - mtx_unlock(&tp->tun_mtx); - break; - case SIOCSIFADDR: - tuninit(ifp); - TUNDEBUG(ifp, "address set\n"); - break; - case SIOCSIFMTU: - ifp->if_mtu = ifr->ifr_mtu; - TUNDEBUG(ifp, "mtu set\n"); - break; - case SIOCSIFFLAGS: - case SIOCADDMULTI: - case SIOCDELMULTI: - break; - default: - error = EINVAL; - } - return (error); -} - -/* - * tunoutput - queue packets from higher level ready to put out. - */ -static int -tunoutput(struct ifnet *ifp, struct mbuf *m0, const struct sockaddr *dst, - struct route *ro) -{ - struct tun_softc *tp = ifp->if_softc; - u_short cached_tun_flags; - int error; - u_int32_t af; - - TUNDEBUG (ifp, "tunoutput\n"); - -#ifdef MAC - error = mac_ifnet_check_transmit(ifp, m0); - if (error) { - m_freem(m0); - return (error); - } -#endif - - /* Could be unlocked read? */ - mtx_lock(&tp->tun_mtx); - cached_tun_flags = tp->tun_flags; - mtx_unlock(&tp->tun_mtx); - if ((cached_tun_flags & TUN_READY) != TUN_READY) { - TUNDEBUG (ifp, "not ready 0%o\n", tp->tun_flags); - m_freem (m0); - return (EHOSTDOWN); - } - - if ((ifp->if_flags & IFF_UP) != IFF_UP) { - m_freem (m0); - return (EHOSTDOWN); - } - - /* BPF writes need to be handled specially. */ - if (dst->sa_family == AF_UNSPEC) - bcopy(dst->sa_data, &af, sizeof(af)); - else - af = dst->sa_family; - - if (bpf_peers_present(ifp->if_bpf)) - bpf_mtap2(ifp->if_bpf, &af, sizeof(af), m0); - - /* prepend sockaddr? this may abort if the mbuf allocation fails */ - if (cached_tun_flags & TUN_LMODE) { - /* allocate space for sockaddr */ - M_PREPEND(m0, dst->sa_len, M_NOWAIT); - - /* if allocation failed drop packet */ - if (m0 == NULL) { - if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); - if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); - return (ENOBUFS); - } else { - bcopy(dst, m0->m_data, dst->sa_len); - } - } - - if (cached_tun_flags & TUN_IFHEAD) { - /* Prepend the address family */ - M_PREPEND(m0, 4, M_NOWAIT); - - /* if allocation failed drop packet */ - if (m0 == NULL) { - if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); - if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); - return (ENOBUFS); - } else - *(u_int32_t *)m0->m_data = htonl(af); - } else { -#ifdef INET - if (af != AF_INET) -#endif - { - m_freem(m0); - return (EAFNOSUPPORT); - } - } - - error = (ifp->if_transmit)(ifp, m0); - if (error) - return (ENOBUFS); - if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); - return (0); -} - -/* - * the cdevsw interface is now pretty minimal. - */ -static int -tunioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, - struct thread *td) -{ - struct ifreq ifr; - struct tun_softc *tp = dev->si_drv1; - struct tuninfo *tunp; - int error; - - switch (cmd) { - case TUNSIFINFO: - tunp = (struct tuninfo *)data; - if (TUN2IFP(tp)->if_type != tunp->type) - return (EPROTOTYPE); - mtx_lock(&tp->tun_mtx); - if (TUN2IFP(tp)->if_mtu != tunp->mtu) { - strlcpy(ifr.ifr_name, if_name(TUN2IFP(tp)), IFNAMSIZ); - ifr.ifr_mtu = tunp->mtu; - CURVNET_SET(TUN2IFP(tp)->if_vnet); - error = ifhwioctl(SIOCSIFMTU, TUN2IFP(tp), - (caddr_t)&ifr, td); - CURVNET_RESTORE(); - if (error) { - mtx_unlock(&tp->tun_mtx); - return (error); - } - } - TUN2IFP(tp)->if_baudrate = tunp->baudrate; - mtx_unlock(&tp->tun_mtx); - break; - case TUNGIFINFO: - tunp = (struct tuninfo *)data; - mtx_lock(&tp->tun_mtx); - tunp->mtu = TUN2IFP(tp)->if_mtu; - tunp->type = TUN2IFP(tp)->if_type; - tunp->baudrate = TUN2IFP(tp)->if_baudrate; - mtx_unlock(&tp->tun_mtx); - break; - case TUNSDEBUG: - tundebug = *(int *)data; - break; - case TUNGDEBUG: - *(int *)data = tundebug; - break; - case TUNSLMODE: - mtx_lock(&tp->tun_mtx); - if (*(int *)data) { - tp->tun_flags |= TUN_LMODE; - tp->tun_flags &= ~TUN_IFHEAD; - } else - tp->tun_flags &= ~TUN_LMODE; - mtx_unlock(&tp->tun_mtx); - break; - case TUNSIFHEAD: - mtx_lock(&tp->tun_mtx); - if (*(int *)data) { - tp->tun_flags |= TUN_IFHEAD; - tp->tun_flags &= ~TUN_LMODE; - } else - tp->tun_flags &= ~TUN_IFHEAD; - mtx_unlock(&tp->tun_mtx); - break; - case TUNGIFHEAD: - mtx_lock(&tp->tun_mtx); - *(int *)data = (tp->tun_flags & TUN_IFHEAD) ? 1 : 0; - mtx_unlock(&tp->tun_mtx); - break; - case TUNSIFMODE: - /* deny this if UP */ - if (TUN2IFP(tp)->if_flags & IFF_UP) - return(EBUSY); - - switch (*(int *)data & ~IFF_MULTICAST) { - case IFF_POINTOPOINT: - case IFF_BROADCAST: - mtx_lock(&tp->tun_mtx); - TUN2IFP(tp)->if_flags &= - ~(IFF_BROADCAST|IFF_POINTOPOINT|IFF_MULTICAST); - TUN2IFP(tp)->if_flags |= *(int *)data; - mtx_unlock(&tp->tun_mtx); - break; - default: - return(EINVAL); - } - break; - case TUNSIFPID: - mtx_lock(&tp->tun_mtx); -#ifndef __rtems__ - tp->tun_pid = curthread->td_proc->p_pid; -#else /* __rtems__ */ - tp->tun_pid = BSD_DEFAULT_PID; -#endif /* __rtems__ */ - mtx_unlock(&tp->tun_mtx); - break; - case FIONBIO: - break; - case FIOASYNC: - mtx_lock(&tp->tun_mtx); - if (*(int *)data) - tp->tun_flags |= TUN_ASYNC; - else - tp->tun_flags &= ~TUN_ASYNC; - mtx_unlock(&tp->tun_mtx); - break; - case FIONREAD: - if (!IFQ_IS_EMPTY(&TUN2IFP(tp)->if_snd)) { - struct mbuf *mb; - IFQ_LOCK(&TUN2IFP(tp)->if_snd); - IFQ_POLL_NOLOCK(&TUN2IFP(tp)->if_snd, mb); - for (*(int *)data = 0; mb != NULL; mb = mb->m_next) - *(int *)data += mb->m_len; - IFQ_UNLOCK(&TUN2IFP(tp)->if_snd); - } else - *(int *)data = 0; - break; - case FIOSETOWN: - return (fsetown(*(int *)data, &tp->tun_sigio)); - - case FIOGETOWN: - *(int *)data = fgetown(&tp->tun_sigio); - return (0); - - /* This is deprecated, FIOSETOWN should be used instead. */ - case TIOCSPGRP: - return (fsetown(-(*(int *)data), &tp->tun_sigio)); - - /* This is deprecated, FIOGETOWN should be used instead. */ - case TIOCGPGRP: - *(int *)data = -fgetown(&tp->tun_sigio); - return (0); - - default: - return (ENOTTY); - } - return (0); -} - -/* - * The cdevsw read interface - reads a packet at a time, or at - * least as much of a packet as can be read. - */ -static int -tunread(struct cdev *dev, struct uio *uio, int flag) -{ - struct tun_softc *tp = dev->si_drv1; - struct ifnet *ifp = TUN2IFP(tp); - struct mbuf *m; - int error=0, len; - - TUNDEBUG (ifp, "read\n"); - mtx_lock(&tp->tun_mtx); - if ((tp->tun_flags & TUN_READY) != TUN_READY) { - mtx_unlock(&tp->tun_mtx); - TUNDEBUG (ifp, "not ready 0%o\n", tp->tun_flags); - return (EHOSTDOWN); - } - - tp->tun_flags &= ~TUN_RWAIT; - - do { - IFQ_DEQUEUE(&ifp->if_snd, m); - if (m == NULL) { - if (flag & O_NONBLOCK) { - mtx_unlock(&tp->tun_mtx); - return (EWOULDBLOCK); - } - tp->tun_flags |= TUN_RWAIT; - error = mtx_sleep(tp, &tp->tun_mtx, PCATCH | (PZERO + 1), - "tunread", 0); - if (error != 0) { - mtx_unlock(&tp->tun_mtx); - return (error); - } - } - } while (m == NULL); - mtx_unlock(&tp->tun_mtx); - - while (m && uio->uio_resid > 0 && error == 0) { - len = min(uio->uio_resid, m->m_len); - if (len != 0) - error = uiomove(mtod(m, void *), len, uio); - m = m_free(m); - } - - if (m) { - TUNDEBUG(ifp, "Dropping mbuf\n"); - m_freem(m); - } - return (error); -} - -/* - * the cdevsw write interface - an atomic write is a packet - or else! - */ -static int -tunwrite(struct cdev *dev, struct uio *uio, int flag) -{ - struct tun_softc *tp = dev->si_drv1; - struct ifnet *ifp = TUN2IFP(tp); - struct mbuf *m; - uint32_t family, mru; - int isr; - - TUNDEBUG(ifp, "tunwrite\n"); - - if ((ifp->if_flags & IFF_UP) != IFF_UP) - /* ignore silently */ - return (0); - - if (uio->uio_resid == 0) - return (0); - - mru = TUNMRU; - if (tp->tun_flags & TUN_IFHEAD) - mru += sizeof(family); - if (uio->uio_resid < 0 || uio->uio_resid > mru) { - TUNDEBUG(ifp, "len=%zd!\n", uio->uio_resid); - return (EIO); - } - - if ((m = m_uiotombuf(uio, M_NOWAIT, 0, 0, M_PKTHDR)) == NULL) { - if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); - return (ENOBUFS); - } - - m->m_pkthdr.rcvif = ifp; -#ifdef MAC - mac_ifnet_create_mbuf(ifp, m); -#endif - - /* Could be unlocked read? */ - mtx_lock(&tp->tun_mtx); - if (tp->tun_flags & TUN_IFHEAD) { - mtx_unlock(&tp->tun_mtx); - if (m->m_len < sizeof(family) && - (m = m_pullup(m, sizeof(family))) == NULL) - return (ENOBUFS); - family = ntohl(*mtod(m, u_int32_t *)); - m_adj(m, sizeof(family)); - } else { - mtx_unlock(&tp->tun_mtx); - family = AF_INET; - } - - BPF_MTAP2(ifp, &family, sizeof(family), m); - - switch (family) { -#ifdef INET - case AF_INET: - isr = NETISR_IP; - break; -#endif -#ifdef INET6 - case AF_INET6: - isr = NETISR_IPV6; - break; -#endif - default: - m_freem(m); - return (EAFNOSUPPORT); - } - random_harvest_queue(m, sizeof(*m), RANDOM_NET_TUN); - if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); - if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); - CURVNET_SET(ifp->if_vnet); - M_SETFIB(m, ifp->if_fib); - netisr_dispatch(isr, m); - CURVNET_RESTORE(); - return (0); -} - -/* - * tunpoll - the poll interface, this is only useful on reads - * really. The write detect always returns true, write never blocks - * anyway, it either accepts the packet or drops it. - */ -static int -tunpoll(struct cdev *dev, int events, struct thread *td) -{ - struct tun_softc *tp = dev->si_drv1; - struct ifnet *ifp = TUN2IFP(tp); - int revents = 0; - struct mbuf *m; - - TUNDEBUG(ifp, "tunpoll\n"); - - if (events & (POLLIN | POLLRDNORM)) { - IFQ_LOCK(&ifp->if_snd); - IFQ_POLL_NOLOCK(&ifp->if_snd, m); - if (m != NULL) { - TUNDEBUG(ifp, "tunpoll q=%d\n", ifp->if_snd.ifq_len); - revents |= events & (POLLIN | POLLRDNORM); - } else { - TUNDEBUG(ifp, "tunpoll waiting\n"); - selrecord(td, &tp->tun_rsel); - } - IFQ_UNLOCK(&ifp->if_snd); - } - if (events & (POLLOUT | POLLWRNORM)) - revents |= events & (POLLOUT | POLLWRNORM); - - return (revents); -} - -/* - * tunkqfilter - support for the kevent() system call. - */ -static int -tunkqfilter(struct cdev *dev, struct knote *kn) -{ - struct tun_softc *tp = dev->si_drv1; - struct ifnet *ifp = TUN2IFP(tp); - - switch(kn->kn_filter) { - case EVFILT_READ: - TUNDEBUG(ifp, "%s kqfilter: EVFILT_READ, minor = %#x\n", - ifp->if_xname, dev2unit(dev)); - kn->kn_fop = &tun_read_filterops; - break; - - case EVFILT_WRITE: - TUNDEBUG(ifp, "%s kqfilter: EVFILT_WRITE, minor = %#x\n", - ifp->if_xname, dev2unit(dev)); - kn->kn_fop = &tun_write_filterops; - break; - - default: - TUNDEBUG(ifp, "%s kqfilter: invalid filter, minor = %#x\n", - ifp->if_xname, dev2unit(dev)); - return(EINVAL); - } - - kn->kn_hook = tp; - knlist_add(&tp->tun_rsel.si_note, kn, 0); - - return (0); -} - -/* - * Return true of there is data in the interface queue. - */ -static int -tunkqread(struct knote *kn, long hint) -{ - int ret; - struct tun_softc *tp = kn->kn_hook; - struct cdev *dev = tp->tun_dev; - struct ifnet *ifp = TUN2IFP(tp); - - if ((kn->kn_data = ifp->if_snd.ifq_len) > 0) { - TUNDEBUG(ifp, - "%s have data in the queue. Len = %d, minor = %#x\n", - ifp->if_xname, ifp->if_snd.ifq_len, dev2unit(dev)); - ret = 1; - } else { - TUNDEBUG(ifp, - "%s waiting for data, minor = %#x\n", ifp->if_xname, - dev2unit(dev)); - ret = 0; - } - - return (ret); -} - -/* - * Always can write, always return MTU in kn->data. - */ -static int -tunkqwrite(struct knote *kn, long hint) -{ - struct tun_softc *tp = kn->kn_hook; - struct ifnet *ifp = TUN2IFP(tp); - - kn->kn_data = ifp->if_mtu; - - return (1); -} - -static void -tunkqdetach(struct knote *kn) -{ - struct tun_softc *tp = kn->kn_hook; - - knlist_remove(&tp->tun_rsel.si_note, kn, 0); -} diff --git a/freebsd/sys/net/if_tun.h b/freebsd/sys/net/if_tun.h index 1ea375f7..a44c87bd 100644 --- a/freebsd/sys/net/if_tun.h +++ b/freebsd/sys/net/if_tun.h @@ -40,6 +40,7 @@ struct tuninfo { #define TUNSIFINFO _IOW('t', 91, struct tuninfo) #define TUNGIFINFO _IOR('t', 92, struct tuninfo) #define TUNSLMODE _IOW('t', 93, int) +#define TUNGIFNAME _IOR('t', 93, struct ifreq) #define TUNSIFMODE _IOW('t', 94, int) #define TUNSIFPID _IO('t', 95) #define TUNSIFHEAD _IOW('t', 96, int) diff --git a/freebsd/sys/net/if_tuntap.c b/freebsd/sys/net/if_tuntap.c new file mode 100644 index 00000000..3516d82b --- /dev/null +++ b/freebsd/sys/net/if_tuntap.c @@ -0,0 +1,1734 @@ +#include <machine/rtems-bsd-kernel-space.h> + +/* $NetBSD: if_tun.c,v 1.14 1994/06/29 06:36:25 cgd Exp $ */ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (C) 1999-2000 by Maksim Yevmenkin <m_evmenkin@yahoo.com> + * All rights reserved. + * Copyright (c) 2019 Kyle Evans <kevans@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * BASED ON: + * ------------------------------------------------------------------------- + * + * Copyright (c) 1988, Julian Onions <jpo@cs.nott.ac.uk> + * Nottingham University 1987. + * + * This source may be freely distributed, however I would be interested + * in any changes that are made. + * + * This driver takes packets off the IP i/f and hands them up to a + * user process to have its wicked way with. This driver has it's + * roots in a similar driver written by Phil Cockcroft (formerly) at + * UCL. This driver is based much more on read/write/poll mode of + * operation though. + * + * $FreeBSD$ + */ + +#include <rtems/bsd/local/opt_inet.h> +#include <rtems/bsd/local/opt_inet6.h> + +#include <sys/param.h> +#include <sys/lock.h> +#include <sys/priv.h> +#include <sys/proc.h> +#include <sys/systm.h> +#include <sys/jail.h> +#include <sys/mbuf.h> +#include <sys/module.h> +#include <sys/socket.h> +#include <sys/eventhandler.h> +#include <sys/fcntl.h> +#include <sys/filio.h> +#include <sys/sockio.h> +#include <sys/sx.h> +#include <sys/ttycom.h> +#include <sys/poll.h> +#include <sys/selinfo.h> +#include <sys/signalvar.h> +#include <sys/filedesc.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/conf.h> +#include <sys/uio.h> +#include <sys/malloc.h> +#include <sys/random.h> +#include <sys/ctype.h> + +#include <net/ethernet.h> +#include <net/if.h> +#include <net/if_var.h> +#include <net/if_clone.h> +#include <net/if_dl.h> +#include <net/if_media.h> +#include <net/if_types.h> +#include <net/netisr.h> +#include <net/route.h> +#include <net/vnet.h> +#ifdef INET +#include <netinet/in.h> +#endif +#include <net/bpf.h> +#include <net/if_tap.h> +#include <net/if_tun.h> + +#include <sys/queue.h> +#include <sys/condvar.h> +#include <security/mac/mac_framework.h> + +struct tuntap_driver; + +/* + * tun_list is protected by global tunmtx. Other mutable fields are + * protected by tun->tun_mtx, or by their owning subsystem. tun_dev is + * static for the duration of a tunnel interface. + */ +struct tuntap_softc { + TAILQ_ENTRY(tuntap_softc) tun_list; + struct cdev *tun_dev; + u_short tun_flags; /* misc flags */ +#define TUN_OPEN 0x0001 +#define TUN_INITED 0x0002 +#define TUN_IASET 0x0008 +#define TUN_DSTADDR 0x0010 +#define TUN_LMODE 0x0020 +#define TUN_RWAIT 0x0040 +#define TUN_ASYNC 0x0080 +#define TUN_IFHEAD 0x0100 +#define TUN_DYING 0x0200 +#define TUN_L2 0x0400 +#define TUN_VMNET 0x0800 + +#define TUN_DRIVER_IDENT_MASK (TUN_L2 | TUN_VMNET) +#define TUN_READY (TUN_OPEN | TUN_INITED) + +#ifndef __rtems__ + pid_t tun_pid; /* owning pid */ +#endif /* __rtems__ */ + struct ifnet *tun_ifp; /* the interface */ + struct sigio *tun_sigio; /* async I/O info */ + struct tuntap_driver *tun_drv; /* appropriate driver */ + struct selinfo tun_rsel; /* read select */ + struct mtx tun_mtx; /* softc field mutex */ + struct cv tun_cv; /* for ref'd dev destroy */ + struct ether_addr tun_ether; /* remote address */ +}; +#define TUN2IFP(sc) ((sc)->tun_ifp) + +#define TUNDEBUG if (tundebug) if_printf + +#define TUN_LOCK(tp) mtx_lock(&(tp)->tun_mtx) +#define TUN_UNLOCK(tp) mtx_unlock(&(tp)->tun_mtx) + +#define TUN_VMIO_FLAG_MASK 0x0fff + +/* + * All mutable global variables in if_tun are locked using tunmtx, with + * the exception of tundebug, which is used unlocked, and the drivers' *clones, + * which are static after setup. + */ +static struct mtx tunmtx; +static eventhandler_tag tag; +static const char tunname[] = "tun"; +static const char tapname[] = "tap"; +static const char vmnetname[] = "vmnet"; +static MALLOC_DEFINE(M_TUN, tunname, "Tunnel Interface"); +static int tundebug = 0; +static int tundclone = 1; +static int tap_allow_uopen = 0; /* allow user open() */ +static int tapuponopen = 0; /* IFF_UP on open() */ +static int tapdclone = 1; /* enable devfs cloning */ + +static TAILQ_HEAD(,tuntap_softc) tunhead = TAILQ_HEAD_INITIALIZER(tunhead); +SYSCTL_INT(_debug, OID_AUTO, if_tun_debug, CTLFLAG_RW, &tundebug, 0, ""); + +static struct sx tun_ioctl_sx; +SX_SYSINIT(tun_ioctl_sx, &tun_ioctl_sx, "tun_ioctl"); + +SYSCTL_DECL(_net_link); +/* tun */ +static SYSCTL_NODE(_net_link, OID_AUTO, tun, CTLFLAG_RW, 0, + "IP tunnel software network interface"); +SYSCTL_INT(_net_link_tun, OID_AUTO, devfs_cloning, CTLFLAG_RWTUN, &tundclone, 0, + "Enable legacy devfs interface creation"); + +/* tap */ +static SYSCTL_NODE(_net_link, OID_AUTO, tap, CTLFLAG_RW, 0, + "Ethernet tunnel software network interface"); +SYSCTL_INT(_net_link_tap, OID_AUTO, user_open, CTLFLAG_RW, &tap_allow_uopen, 0, + "Allow user to open /dev/tap (based on node permissions)"); +SYSCTL_INT(_net_link_tap, OID_AUTO, up_on_open, CTLFLAG_RW, &tapuponopen, 0, + "Bring interface up when /dev/tap is opened"); +SYSCTL_INT(_net_link_tap, OID_AUTO, devfs_cloning, CTLFLAG_RWTUN, &tapdclone, 0, + "Enable legacy devfs interface creation"); +SYSCTL_INT(_net_link_tap, OID_AUTO, debug, CTLFLAG_RW, &tundebug, 0, ""); + +static int tuntap_name2info(const char *name, int *unit, int *flags); +static void tunclone(void *arg, struct ucred *cred, char *name, + int namelen, struct cdev **dev); +static void tuncreate(struct cdev *dev, struct tuntap_driver *); +static int tunifioctl(struct ifnet *, u_long, caddr_t); +static void tuninit(struct ifnet *); +static void tunifinit(void *xtp); +static int tuntapmodevent(module_t, int, void *); +static int tunoutput(struct ifnet *, struct mbuf *, + const struct sockaddr *, struct route *ro); +static void tunstart(struct ifnet *); +static void tunstart_l2(struct ifnet *); + +static int tun_clone_match(struct if_clone *ifc, const char *name); +static int tap_clone_match(struct if_clone *ifc, const char *name); +static int vmnet_clone_match(struct if_clone *ifc, const char *name); +static int tun_clone_create(struct if_clone *, char *, size_t, caddr_t); +static int tun_clone_destroy(struct if_clone *, struct ifnet *); + +static d_open_t tunopen; +static d_close_t tunclose; +static d_read_t tunread; +static d_write_t tunwrite; +static d_ioctl_t tunioctl; +static d_poll_t tunpoll; +static d_kqfilter_t tunkqfilter; + +static int tunkqread(struct knote *, long); +static int tunkqwrite(struct knote *, long); +static void tunkqdetach(struct knote *); + +static struct filterops tun_read_filterops = { + .f_isfd = 1, + .f_attach = NULL, + .f_detach = tunkqdetach, + .f_event = tunkqread, +}; + +static struct filterops tun_write_filterops = { + .f_isfd = 1, + .f_attach = NULL, + .f_detach = tunkqdetach, + .f_event = tunkqwrite, +}; + +static struct tuntap_driver { + struct cdevsw cdevsw; + int ident_flags; + struct unrhdr *unrhdr; + struct clonedevs *clones; + ifc_match_t *clone_match_fn; + ifc_create_t *clone_create_fn; + ifc_destroy_t *clone_destroy_fn; +} tuntap_drivers[] = { + { + .ident_flags = 0, + .cdevsw = { + .d_version = D_VERSION, + .d_flags = D_NEEDMINOR, + .d_open = tunopen, + .d_close = tunclose, + .d_read = tunread, + .d_write = tunwrite, + .d_ioctl = tunioctl, + .d_poll = tunpoll, + .d_kqfilter = tunkqfilter, + .d_name = tunname, + }, + .clone_match_fn = tun_clone_match, + .clone_create_fn = tun_clone_create, + .clone_destroy_fn = tun_clone_destroy, + }, + { + .ident_flags = TUN_L2, + .cdevsw = { + .d_version = D_VERSION, + .d_flags = D_NEEDMINOR, + .d_open = tunopen, + .d_close = tunclose, + .d_read = tunread, + .d_write = tunwrite, + .d_ioctl = tunioctl, + .d_poll = tunpoll, + .d_kqfilter = tunkqfilter, + .d_name = tapname, + }, + .clone_match_fn = tap_clone_match, + .clone_create_fn = tun_clone_create, + .clone_destroy_fn = tun_clone_destroy, + }, + { + .ident_flags = TUN_L2 | TUN_VMNET, + .cdevsw = { + .d_version = D_VERSION, + .d_flags = D_NEEDMINOR, + .d_open = tunopen, + .d_close = tunclose, + .d_read = tunread, + .d_write = tunwrite, + .d_ioctl = tunioctl, + .d_poll = tunpoll, + .d_kqfilter = tunkqfilter, + .d_name = vmnetname, + }, + .clone_match_fn = vmnet_clone_match, + .clone_create_fn = tun_clone_create, + .clone_destroy_fn = tun_clone_destroy, + }, +}; + +struct tuntap_driver_cloner { + SLIST_ENTRY(tuntap_driver_cloner) link; + struct tuntap_driver *drv; + struct if_clone *cloner; +}; + +VNET_DEFINE_STATIC(SLIST_HEAD(, tuntap_driver_cloner), tuntap_driver_cloners) = + SLIST_HEAD_INITIALIZER(tuntap_driver_cloners); + +#define V_tuntap_driver_cloners VNET(tuntap_driver_cloners) + +/* + * Sets unit and/or flags given the device name. Must be called with correct + * vnet context. + */ +static int +tuntap_name2info(const char *name, int *outunit, int *outflags) +{ + struct tuntap_driver *drv; + struct tuntap_driver_cloner *drvc; + char *dname; + int flags, unit; + bool found; + + if (name == NULL) + return (EINVAL); + + /* + * Needed for dev_stdclone, but dev_stdclone will not modify, it just + * wants to be able to pass back a char * through the second param. We + * will always set that as NULL here, so we'll fake it. + */ + dname = __DECONST(char *, name); + found = false; + + KASSERT(!SLIST_EMPTY(&V_tuntap_driver_cloners), + ("tuntap_driver_cloners failed to initialize")); + SLIST_FOREACH(drvc, &V_tuntap_driver_cloners, link) { + KASSERT(drvc->drv != NULL, + ("tuntap_driver_cloners entry not properly initialized")); + drv = drvc->drv; + + if (strcmp(name, drv->cdevsw.d_name) == 0) { + found = true; + unit = -1; + flags = drv->ident_flags; + break; + } + + if (dev_stdclone(dname, NULL, drv->cdevsw.d_name, &unit) == 1) { + found = true; + flags = drv->ident_flags; + break; + } + } + + if (!found) + return (ENXIO); + + if (outunit != NULL) + *outunit = unit; + if (outflags != NULL) + *outflags = flags; + return (0); +} + +/* + * Get driver information from a set of flags specified. Masks the identifying + * part of the flags and compares it against all of the available + * tuntap_drivers. Must be called with correct vnet context. + */ +static struct tuntap_driver * +tuntap_driver_from_flags(int tun_flags) +{ + struct tuntap_driver *drv; + struct tuntap_driver_cloner *drvc; + + KASSERT(!SLIST_EMPTY(&V_tuntap_driver_cloners), + ("tuntap_driver_cloners failed to initialize")); + SLIST_FOREACH(drvc, &V_tuntap_driver_cloners, link) { + KASSERT(drvc->drv != NULL, + ("tuntap_driver_cloners entry not properly initialized")); + drv = drvc->drv; + if ((tun_flags & TUN_DRIVER_IDENT_MASK) == drv->ident_flags) + return (drv); + } + + return (NULL); +} + + + +static int +tun_clone_match(struct if_clone *ifc, const char *name) +{ + int tunflags; + + if (tuntap_name2info(name, NULL, &tunflags) == 0) { + if ((tunflags & TUN_L2) == 0) + return (1); + } + + return (0); +} + +static int +tap_clone_match(struct if_clone *ifc, const char *name) +{ + int tunflags; + + if (tuntap_name2info(name, NULL, &tunflags) == 0) { + if ((tunflags & (TUN_L2 | TUN_VMNET)) == TUN_L2) + return (1); + } + + return (0); +} + +static int +vmnet_clone_match(struct if_clone *ifc, const char *name) +{ + int tunflags; + + if (tuntap_name2info(name, NULL, &tunflags) == 0) { + if ((tunflags & TUN_VMNET) != 0) + return (1); + } + + return (0); +} + +static int +tun_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) +{ + struct tuntap_driver *drv; + struct cdev *dev; + int err, i, tunflags, unit; + + tunflags = 0; + /* The name here tells us exactly what we're creating */ + err = tuntap_name2info(name, &unit, &tunflags); + if (err != 0) + return (err); + + drv = tuntap_driver_from_flags(tunflags); + if (drv == NULL) + return (ENXIO); + + if (unit != -1) { + /* If this unit number is still available that's okay. */ + if (alloc_unr_specific(drv->unrhdr, unit) == -1) + return (EEXIST); + } else { + unit = alloc_unr(drv->unrhdr); + } + + snprintf(name, IFNAMSIZ, "%s%d", drv->cdevsw.d_name, unit); + + /* find any existing device, or allocate new unit number */ + i = clone_create(&drv->clones, &drv->cdevsw, &unit, &dev, 0); + if (i) { + /* No preexisting struct cdev *, create one */ + dev = make_dev(&drv->cdevsw, unit, UID_UUCP, GID_DIALER, 0600, + "%s%d", drv->cdevsw.d_name, unit); + } + + tuncreate(dev, drv); + + return (0); +} + +static void +tunclone(void *arg, struct ucred *cred, char *name, int namelen, + struct cdev **dev) +{ + char devname[SPECNAMELEN + 1]; + struct tuntap_driver *drv; + int append_unit, i, u, tunflags; + bool mayclone; + + if (*dev != NULL) + return; + + tunflags = 0; + CURVNET_SET(CRED_TO_VNET(cred)); + if (tuntap_name2info(name, &u, &tunflags) != 0) + goto out; /* Not recognized */ + + if (u != -1 && u > IF_MAXUNIT) + goto out; /* Unit number too high */ + + mayclone = priv_check_cred(cred, PRIV_NET_IFCREATE) == 0; + if ((tunflags & TUN_L2) != 0) { + /* tap/vmnet allow user open with a sysctl */ + mayclone = (mayclone || tap_allow_uopen) && tapdclone; + } else { + mayclone = mayclone && tundclone; + } + + /* + * If tun cloning is enabled, only the superuser can create an + * interface. + */ + if (!mayclone) + goto out; + + if (u == -1) + append_unit = 1; + else + append_unit = 0; + + drv = tuntap_driver_from_flags(tunflags); + if (drv == NULL) + goto out; + + /* find any existing device, or allocate new unit number */ + i = clone_create(&drv->clones, &drv->cdevsw, &u, dev, 0); + if (i) { + if (append_unit) { + namelen = snprintf(devname, sizeof(devname), "%s%d", + name, u); + name = devname; + } + /* No preexisting struct cdev *, create one */ + *dev = make_dev_credf(MAKEDEV_REF, &drv->cdevsw, u, cred, + UID_UUCP, GID_DIALER, 0600, "%s", name); + } + + if_clone_create(name, namelen, NULL); +out: + CURVNET_RESTORE(); +} + +static void +tun_destroy(struct tuntap_softc *tp) +{ + + TUN_LOCK(tp); + tp->tun_flags |= TUN_DYING; + if ((tp->tun_flags & TUN_OPEN) != 0) + cv_wait_unlock(&tp->tun_cv, &tp->tun_mtx); + else + TUN_UNLOCK(tp); + + CURVNET_SET(TUN2IFP(tp)->if_vnet); + + destroy_dev(tp->tun_dev); + seldrain(&tp->tun_rsel); + knlist_clear(&tp->tun_rsel.si_note, 0); + knlist_destroy(&tp->tun_rsel.si_note); + if ((tp->tun_flags & TUN_L2) != 0) { + ether_ifdetach(TUN2IFP(tp)); + } else { + bpfdetach(TUN2IFP(tp)); + if_detach(TUN2IFP(tp)); + } + sx_xlock(&tun_ioctl_sx); + TUN2IFP(tp)->if_softc = NULL; + sx_xunlock(&tun_ioctl_sx); + free_unr(tp->tun_drv->unrhdr, TUN2IFP(tp)->if_dunit); + if_free(TUN2IFP(tp)); + mtx_destroy(&tp->tun_mtx); + cv_destroy(&tp->tun_cv); + free(tp, M_TUN); + CURVNET_RESTORE(); +} + +static int +tun_clone_destroy(struct if_clone *ifc __unused, struct ifnet *ifp) +{ + struct tuntap_softc *tp = ifp->if_softc; + + mtx_lock(&tunmtx); + TAILQ_REMOVE(&tunhead, tp, tun_list); + mtx_unlock(&tunmtx); + tun_destroy(tp); + + return (0); +} + +static void +vnet_tun_init(const void *unused __unused) +{ + struct tuntap_driver *drv; + struct tuntap_driver_cloner *drvc; + int i; + + for (i = 0; i < nitems(tuntap_drivers); ++i) { + drv = &tuntap_drivers[i]; + drvc = malloc(sizeof(*drvc), M_TUN, M_WAITOK | M_ZERO); + + drvc->drv = drv; + drvc->cloner = if_clone_advanced(drv->cdevsw.d_name, 0, + drv->clone_match_fn, drv->clone_create_fn, + drv->clone_destroy_fn); + SLIST_INSERT_HEAD(&V_tuntap_driver_cloners, drvc, link); + }; +} +VNET_SYSINIT(vnet_tun_init, SI_SUB_PROTO_IF, SI_ORDER_ANY, + vnet_tun_init, NULL); + +#ifndef __rtems__ +static void +vnet_tun_uninit(const void *unused __unused) +{ + struct tuntap_driver_cloner *drvc; + + while (!SLIST_EMPTY(&V_tuntap_driver_cloners)) { + drvc = SLIST_FIRST(&V_tuntap_driver_cloners); + SLIST_REMOVE_HEAD(&V_tuntap_driver_cloners, link); + + if_clone_detach(drvc->cloner); + free(drvc, M_TUN); + } +} +VNET_SYSUNINIT(vnet_tun_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY, + vnet_tun_uninit, NULL); + +static void +tun_uninit(const void *unused __unused) +{ + struct tuntap_driver *drv; + struct tuntap_softc *tp; + int i; + + EVENTHANDLER_DEREGISTER(dev_clone, tag); + drain_dev_clone_events(); + + mtx_lock(&tunmtx); + while ((tp = TAILQ_FIRST(&tunhead)) != NULL) { + TAILQ_REMOVE(&tunhead, tp, tun_list); + mtx_unlock(&tunmtx); + tun_destroy(tp); + mtx_lock(&tunmtx); + } + mtx_unlock(&tunmtx); + for (i = 0; i < nitems(tuntap_drivers); ++i) { + drv = &tuntap_drivers[i]; + delete_unrhdr(drv->unrhdr); + clone_cleanup(&drv->clones); + } + mtx_destroy(&tunmtx); +} +SYSUNINIT(tun_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY, tun_uninit, NULL); +#endif /* __rtems__ */ + +static int +tuntapmodevent(module_t mod, int type, void *data) +{ + struct tuntap_driver *drv; + int i; + + switch (type) { + case MOD_LOAD: + mtx_init(&tunmtx, "tunmtx", NULL, MTX_DEF); + for (i = 0; i < nitems(tuntap_drivers); ++i) { + drv = &tuntap_drivers[i]; + clone_setup(&drv->clones); + drv->unrhdr = new_unrhdr(0, IF_MAXUNIT, &tunmtx); + } + tag = EVENTHANDLER_REGISTER(dev_clone, tunclone, 0, 1000); + if (tag == NULL) + return (ENOMEM); + break; + case MOD_UNLOAD: + /* See tun_uninit, so it's done after the vnet_sysuninit() */ + break; + default: + return EOPNOTSUPP; + } + return 0; +} + +static moduledata_t tuntap_mod = { + "if_tuntap", + tuntapmodevent, + 0 +}; + +DECLARE_MODULE(if_tuntap, tuntap_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); +MODULE_VERSION(if_tuntap, 1); +MODULE_VERSION(if_tun, 1); +MODULE_VERSION(if_tap, 1); + +static void +tunstart(struct ifnet *ifp) +{ + struct tuntap_softc *tp = ifp->if_softc; + struct mbuf *m; + + TUNDEBUG(ifp, "starting\n"); + if (ALTQ_IS_ENABLED(&ifp->if_snd)) { + IFQ_LOCK(&ifp->if_snd); + IFQ_POLL_NOLOCK(&ifp->if_snd, m); + if (m == NULL) { + IFQ_UNLOCK(&ifp->if_snd); + return; + } + IFQ_UNLOCK(&ifp->if_snd); + } + + TUN_LOCK(tp); + if (tp->tun_flags & TUN_RWAIT) { + tp->tun_flags &= ~TUN_RWAIT; + wakeup(tp); + } + selwakeuppri(&tp->tun_rsel, PZERO + 1); + KNOTE_LOCKED(&tp->tun_rsel.si_note, 0); + if (tp->tun_flags & TUN_ASYNC && tp->tun_sigio) { + TUN_UNLOCK(tp); + pgsigio(&tp->tun_sigio, SIGIO, 0); + } else + TUN_UNLOCK(tp); +} + +/* + * tunstart_l2 + * + * queue packets from higher level ready to put out + */ +static void +tunstart_l2(struct ifnet *ifp) +{ + struct tuntap_softc *tp = ifp->if_softc; + + TUNDEBUG(ifp, "starting\n"); + + /* + * do not junk pending output if we are in VMnet mode. + * XXX: can this do any harm because of queue overflow? + */ + + TUN_LOCK(tp); + if (((tp->tun_flags & TUN_VMNET) == 0) && + ((tp->tun_flags & TUN_READY) != TUN_READY)) { + struct mbuf *m; + + /* Unlocked read. */ + TUNDEBUG(ifp, "not ready, tun_flags = 0x%x\n", tp->tun_flags); + + for (;;) { + IF_DEQUEUE(&ifp->if_snd, m); + if (m != NULL) { + m_freem(m); + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); + } else + break; + } + TUN_UNLOCK(tp); + + return; + } + + ifp->if_drv_flags |= IFF_DRV_OACTIVE; + + if (!IFQ_IS_EMPTY(&ifp->if_snd)) { + if (tp->tun_flags & TUN_RWAIT) { + tp->tun_flags &= ~TUN_RWAIT; + wakeup(tp); + } + + if ((tp->tun_flags & TUN_ASYNC) && (tp->tun_sigio != NULL)) { + TUN_UNLOCK(tp); + pgsigio(&tp->tun_sigio, SIGIO, 0); + TUN_LOCK(tp); + } + + selwakeuppri(&tp->tun_rsel, PZERO+1); + KNOTE_LOCKED(&tp->tun_rsel.si_note, 0); + if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); /* obytes are counted in ether_output */ + } + + ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; + TUN_UNLOCK(tp); +} /* tunstart_l2 */ + + +/* XXX: should return an error code so it can fail. */ +static void +tuncreate(struct cdev *dev, struct tuntap_driver *drv) +{ + struct tuntap_softc *sc; + struct ifnet *ifp; + struct ether_addr eaddr; + int iflags; + u_char type; + + sc = malloc(sizeof(*sc), M_TUN, M_WAITOK | M_ZERO); + mtx_init(&sc->tun_mtx, "tun_mtx", NULL, MTX_DEF); + cv_init(&sc->tun_cv, "tun_condvar"); + sc->tun_flags = drv->ident_flags; + sc->tun_dev = dev; + sc->tun_drv = drv; + mtx_lock(&tunmtx); + TAILQ_INSERT_TAIL(&tunhead, sc, tun_list); + mtx_unlock(&tunmtx); + + iflags = IFF_MULTICAST; + if ((sc->tun_flags & TUN_L2) != 0) { + type = IFT_ETHER; + iflags |= IFF_BROADCAST | IFF_SIMPLEX; + } else { + type = IFT_PPP; + iflags |= IFF_POINTOPOINT; + } + ifp = sc->tun_ifp = if_alloc(type); + if (ifp == NULL) + panic("%s%d: failed to if_alloc() interface.\n", + drv->cdevsw.d_name, dev2unit(dev)); + ifp->if_softc = sc; + if_initname(ifp, drv->cdevsw.d_name, dev2unit(dev)); + ifp->if_ioctl = tunifioctl; + ifp->if_flags = iflags; + IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen); + knlist_init_mtx(&sc->tun_rsel.si_note, &sc->tun_mtx); + ifp->if_capabilities |= IFCAP_LINKSTATE; + ifp->if_capenable |= IFCAP_LINKSTATE; + + if ((sc->tun_flags & TUN_L2) != 0) { + ifp->if_mtu = ETHERMTU; + ifp->if_init = tunifinit; + ifp->if_start = tunstart_l2; + + ether_gen_addr(ifp, &eaddr); + ether_ifattach(ifp, eaddr.octet); + } else { + ifp->if_mtu = TUNMTU; + ifp->if_start = tunstart; + ifp->if_output = tunoutput; + + ifp->if_snd.ifq_drv_maxlen = 0; + IFQ_SET_READY(&ifp->if_snd); + + if_attach(ifp); + bpfattach(ifp, DLT_NULL, sizeof(u_int32_t)); + } + dev->si_drv1 = sc; + + TUN_LOCK(sc); + sc->tun_flags |= TUN_INITED; + TUN_UNLOCK(sc); + + TUNDEBUG(ifp, "interface %s is created, minor = %#x\n", + ifp->if_xname, dev2unit(dev)); +} + +static int +tunopen(struct cdev *dev, int flag, int mode, struct thread *td) +{ + struct ifnet *ifp; + struct tuntap_driver *drv; + struct tuntap_softc *tp; + int error, tunflags; + + tunflags = 0; + CURVNET_SET(TD_TO_VNET(td)); + error = tuntap_name2info(dev->si_name, NULL, &tunflags); + if (error != 0) { + CURVNET_RESTORE(); + return (error); /* Shouldn't happen */ + } + + if ((tunflags & TUN_L2) != 0) { + /* Restrict? */ + if (tap_allow_uopen == 0) { + error = priv_check(td, PRIV_NET_TAP); + if (error != 0) { + CURVNET_RESTORE(); + return (error); + } + } + } + + /* + * XXXRW: Non-atomic test and set of dev->si_drv1 requires + * synchronization. + */ + tp = dev->si_drv1; + if (!tp) { + drv = tuntap_driver_from_flags(tunflags); + if (drv == NULL) { + CURVNET_RESTORE(); + return (ENXIO); + } + tuncreate(dev, drv); + tp = dev->si_drv1; + } + + TUN_LOCK(tp); + if ((tp->tun_flags & (TUN_OPEN | TUN_DYING)) != 0) { + TUN_UNLOCK(tp); + CURVNET_RESTORE(); + return (EBUSY); + } + + ifp = TUN2IFP(tp); + + if ((tp->tun_flags & TUN_L2) != 0) { + bcopy(IF_LLADDR(ifp), tp->tun_ether.octet, + sizeof(tp->tun_ether.octet)); + + ifp->if_drv_flags |= IFF_DRV_RUNNING; + ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; + + if (tapuponopen) + ifp->if_flags |= IFF_UP; + } + +#ifndef __rtems__ + tp->tun_pid = td->td_proc->p_pid; +#endif /* __rtems__ */ + tp->tun_flags |= TUN_OPEN; + + if_link_state_change(ifp, LINK_STATE_UP); + TUNDEBUG(ifp, "open\n"); + TUN_UNLOCK(tp); + CURVNET_RESTORE(); + return (0); +} + +/* + * tunclose - close the device - mark i/f down & delete + * routing info + */ +static int +tunclose(struct cdev *dev, int foo, int bar, struct thread *td) +{ + struct tuntap_softc *tp; + struct ifnet *ifp; + bool l2tun; + + tp = dev->si_drv1; + ifp = TUN2IFP(tp); + + TUN_LOCK(tp); +#ifndef __rtems__ + /* + * Simply close the device if this isn't the controlling process. This + * may happen if, for instance, the tunnel has been handed off to + * another process. The original controller should be able to close it + * without putting us into an inconsistent state. + */ + if (td->td_proc->p_pid != tp->tun_pid) { + TUN_UNLOCK(tp); + return (0); + } +#endif /* __rtems__ */ + + /* + * junk all pending output + */ + CURVNET_SET(ifp->if_vnet); + + l2tun = false; + if ((tp->tun_flags & TUN_L2) != 0) { + l2tun = true; + IF_DRAIN(&ifp->if_snd); + } else { + IFQ_PURGE(&ifp->if_snd); + } + + /* For vmnet, we won't do most of the address/route bits */ + if ((tp->tun_flags & TUN_VMNET) != 0 || + (l2tun && (ifp->if_flags & IFF_LINK0) != 0)) + goto out; + + if (ifp->if_flags & IFF_UP) { + TUN_UNLOCK(tp); + if_down(ifp); + TUN_LOCK(tp); + } + + /* Delete all addresses and routes which reference this interface. */ + if (ifp->if_drv_flags & IFF_DRV_RUNNING) { + struct ifaddr *ifa; + + ifp->if_drv_flags &= ~IFF_DRV_RUNNING; + TUN_UNLOCK(tp); + CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + /* deal w/IPv4 PtP destination; unlocked read */ + if (!l2tun && ifa->ifa_addr->sa_family == AF_INET) { + rtinit(ifa, (int)RTM_DELETE, + tp->tun_flags & TUN_DSTADDR ? RTF_HOST : 0); + } else { + rtinit(ifa, (int)RTM_DELETE, 0); + } + } + if_purgeaddrs(ifp); + TUN_LOCK(tp); + } + +out: + if_link_state_change(ifp, LINK_STATE_DOWN); + CURVNET_RESTORE(); + + funsetown(&tp->tun_sigio); + selwakeuppri(&tp->tun_rsel, PZERO + 1); + KNOTE_LOCKED(&tp->tun_rsel.si_note, 0); + TUNDEBUG (ifp, "closed\n"); + tp->tun_flags &= ~TUN_OPEN; +#ifndef __rtems__ + tp->tun_pid = 0; +#endif /* __rtems__ */ + + cv_broadcast(&tp->tun_cv); + TUN_UNLOCK(tp); + return (0); +} + +static void +tuninit(struct ifnet *ifp) +{ + struct tuntap_softc *tp = ifp->if_softc; +#ifdef INET + struct ifaddr *ifa; +#endif + + TUNDEBUG(ifp, "tuninit\n"); + + TUN_LOCK(tp); + ifp->if_drv_flags |= IFF_DRV_RUNNING; + if ((tp->tun_flags & TUN_L2) == 0) { + ifp->if_flags |= IFF_UP; + getmicrotime(&ifp->if_lastchange); +#ifdef INET + if_addr_rlock(ifp); + CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + if (ifa->ifa_addr->sa_family == AF_INET) { + struct sockaddr_in *si; + + si = (struct sockaddr_in *)ifa->ifa_addr; + if (si->sin_addr.s_addr) + tp->tun_flags |= TUN_IASET; + + si = (struct sockaddr_in *)ifa->ifa_dstaddr; + if (si && si->sin_addr.s_addr) + tp->tun_flags |= TUN_DSTADDR; + } + } + if_addr_runlock(ifp); +#endif + TUN_UNLOCK(tp); + } else { + ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; + TUN_UNLOCK(tp); + /* attempt to start output */ + tunstart_l2(ifp); + } + +} + +/* + * Used only for l2 tunnel. + */ +static void +tunifinit(void *xtp) +{ + struct tuntap_softc *tp; + + tp = (struct tuntap_softc *)xtp; + tuninit(tp->tun_ifp); +} + +/* + * Process an ioctl request. + */ +static int +tunifioctl(struct ifnet *ifp, u_long cmd, caddr_t data) +{ + struct ifreq *ifr = (struct ifreq *)data; + struct tuntap_softc *tp; + struct ifstat *ifs; + struct ifmediareq *ifmr; + int dummy, error = 0; + bool l2tun; + + ifmr = NULL; + sx_xlock(&tun_ioctl_sx); + tp = ifp->if_softc; + if (tp == NULL) { + error = ENXIO; + goto bad; + } + l2tun = (tp->tun_flags & TUN_L2) != 0; + switch(cmd) { + case SIOCGIFSTATUS: + ifs = (struct ifstat *)data; + TUN_LOCK(tp); +#ifndef __rtems__ + if (tp->tun_pid) + snprintf(ifs->ascii, sizeof(ifs->ascii), + "\tOpened by PID %d\n", tp->tun_pid); + else +#endif /* __rtems__ */ + ifs->ascii[0] = '\0'; + TUN_UNLOCK(tp); + break; + case SIOCSIFADDR: + if (l2tun) + error = ether_ioctl(ifp, cmd, data); + else + tuninit(ifp); + if (error == 0) + TUNDEBUG(ifp, "address set\n"); + break; + case SIOCSIFMTU: + ifp->if_mtu = ifr->ifr_mtu; + TUNDEBUG(ifp, "mtu set\n"); + break; + case SIOCSIFFLAGS: + case SIOCADDMULTI: + case SIOCDELMULTI: + break; + case SIOCGIFMEDIA: + if (!l2tun) { + error = EINVAL; + break; + } + + ifmr = (struct ifmediareq *)data; + dummy = ifmr->ifm_count; + ifmr->ifm_count = 1; + ifmr->ifm_status = IFM_AVALID; + ifmr->ifm_active = IFM_ETHER; + if (tp->tun_flags & TUN_OPEN) + ifmr->ifm_status |= IFM_ACTIVE; + ifmr->ifm_current = ifmr->ifm_active; + if (dummy >= 1) { + int media = IFM_ETHER; + error = copyout(&media, ifmr->ifm_ulist, sizeof(int)); + } + break; + default: + if (l2tun) { + error = ether_ioctl(ifp, cmd, data); + } else { + error = EINVAL; + } + } +bad: + sx_xunlock(&tun_ioctl_sx); + return (error); +} + +/* + * tunoutput - queue packets from higher level ready to put out. + */ +static int +tunoutput(struct ifnet *ifp, struct mbuf *m0, const struct sockaddr *dst, + struct route *ro) +{ + struct tuntap_softc *tp = ifp->if_softc; + u_short cached_tun_flags; + int error; + u_int32_t af; + + TUNDEBUG (ifp, "tunoutput\n"); + +#ifdef MAC + error = mac_ifnet_check_transmit(ifp, m0); + if (error) { + m_freem(m0); + return (error); + } +#endif + + /* Could be unlocked read? */ + TUN_LOCK(tp); + cached_tun_flags = tp->tun_flags; + TUN_UNLOCK(tp); + if ((cached_tun_flags & TUN_READY) != TUN_READY) { + TUNDEBUG (ifp, "not ready 0%o\n", tp->tun_flags); + m_freem (m0); + return (EHOSTDOWN); + } + + if ((ifp->if_flags & IFF_UP) != IFF_UP) { + m_freem (m0); + return (EHOSTDOWN); + } + + /* BPF writes need to be handled specially. */ + if (dst->sa_family == AF_UNSPEC) + bcopy(dst->sa_data, &af, sizeof(af)); + else + af = dst->sa_family; + + if (bpf_peers_present(ifp->if_bpf)) + bpf_mtap2(ifp->if_bpf, &af, sizeof(af), m0); + + /* prepend sockaddr? this may abort if the mbuf allocation fails */ + if (cached_tun_flags & TUN_LMODE) { + /* allocate space for sockaddr */ + M_PREPEND(m0, dst->sa_len, M_NOWAIT); + + /* if allocation failed drop packet */ + if (m0 == NULL) { + if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); + return (ENOBUFS); + } else { + bcopy(dst, m0->m_data, dst->sa_len); + } + } + + if (cached_tun_flags & TUN_IFHEAD) { + /* Prepend the address family */ + M_PREPEND(m0, 4, M_NOWAIT); + + /* if allocation failed drop packet */ + if (m0 == NULL) { + if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); + return (ENOBUFS); + } else + *(u_int32_t *)m0->m_data = htonl(af); + } else { +#ifdef INET + if (af != AF_INET) +#endif + { + m_freem(m0); + return (EAFNOSUPPORT); + } + } + + error = (ifp->if_transmit)(ifp, m0); + if (error) + return (ENOBUFS); + if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); + return (0); +} + +/* + * the cdevsw interface is now pretty minimal. + */ +static int +tunioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, + struct thread *td) +{ + struct ifreq ifr, *ifrp; + struct tuntap_softc *tp = dev->si_drv1; + struct tuninfo *tunp; + int error, iflags; +#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \ + defined(COMPAT_FREEBSD4) + int ival; +#endif + bool l2tun; + + l2tun = (tp->tun_flags & TUN_L2) != 0; + if (l2tun) { + /* tap specific ioctls */ + switch(cmd) { + /* VMware/VMnet port ioctl's */ +#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \ + defined(COMPAT_FREEBSD4) + case _IO('V', 0): + ival = IOCPARM_IVAL(data); + data = (caddr_t)&ival; + /* FALLTHROUGH */ +#endif + case VMIO_SIOCSIFFLAGS: /* VMware/VMnet SIOCSIFFLAGS */ + iflags = *(int *)data; + iflags &= TUN_VMIO_FLAG_MASK; + iflags &= ~IFF_CANTCHANGE; + iflags |= IFF_UP; + + TUN_LOCK(tp); + TUN2IFP(tp)->if_flags = iflags | + (TUN2IFP(tp)->if_flags & IFF_CANTCHANGE); + TUN_UNLOCK(tp); + + return (0); + case SIOCGIFADDR: /* get MAC address of the remote side */ + TUN_LOCK(tp); + bcopy(&tp->tun_ether.octet, data, + sizeof(tp->tun_ether.octet)); + TUN_UNLOCK(tp); + + return (0); + case SIOCSIFADDR: /* set MAC address of the remote side */ + TUN_LOCK(tp); + bcopy(data, &tp->tun_ether.octet, + sizeof(tp->tun_ether.octet)); + TUN_UNLOCK(tp); + + return (0); + } + + /* Fall through to the common ioctls if unhandled */ + } else { + switch (cmd) { + case TUNSLMODE: + TUN_LOCK(tp); + if (*(int *)data) { + tp->tun_flags |= TUN_LMODE; + tp->tun_flags &= ~TUN_IFHEAD; + } else + tp->tun_flags &= ~TUN_LMODE; + TUN_UNLOCK(tp); + + return (0); + case TUNSIFHEAD: + TUN_LOCK(tp); + if (*(int *)data) { + tp->tun_flags |= TUN_IFHEAD; + tp->tun_flags &= ~TUN_LMODE; + } else + tp->tun_flags &= ~TUN_IFHEAD; + TUN_UNLOCK(tp); + + return (0); + case TUNGIFHEAD: + TUN_LOCK(tp); + *(int *)data = (tp->tun_flags & TUN_IFHEAD) ? 1 : 0; + TUN_UNLOCK(tp); + + return (0); + case TUNSIFMODE: + /* deny this if UP */ + if (TUN2IFP(tp)->if_flags & IFF_UP) + return (EBUSY); + + switch (*(int *)data & ~IFF_MULTICAST) { + case IFF_POINTOPOINT: + case IFF_BROADCAST: + TUN_LOCK(tp); + TUN2IFP(tp)->if_flags &= + ~(IFF_BROADCAST|IFF_POINTOPOINT|IFF_MULTICAST); + TUN2IFP(tp)->if_flags |= *(int *)data; + TUN_UNLOCK(tp); + + break; + default: + return (EINVAL); + } + + return (0); + case TUNSIFPID: +#ifndef __rtems__ + TUN_LOCK(tp); + tp->tun_pid = curthread->td_proc->p_pid; + TUN_UNLOCK(tp); +#endif /* __rtems__ */ + + return (0); + } + /* Fall through to the common ioctls if unhandled */ + } + + switch (cmd) { + case TUNGIFNAME: + ifrp = (struct ifreq *)data; + strlcpy(ifrp->ifr_name, TUN2IFP(tp)->if_xname, IFNAMSIZ); + + return (0); + case TUNSIFINFO: + tunp = (struct tuninfo *)data; + if (TUN2IFP(tp)->if_type != tunp->type) + return (EPROTOTYPE); + TUN_LOCK(tp); + if (TUN2IFP(tp)->if_mtu != tunp->mtu) { + strlcpy(ifr.ifr_name, if_name(TUN2IFP(tp)), IFNAMSIZ); + ifr.ifr_mtu = tunp->mtu; + CURVNET_SET(TUN2IFP(tp)->if_vnet); + error = ifhwioctl(SIOCSIFMTU, TUN2IFP(tp), + (caddr_t)&ifr, td); + CURVNET_RESTORE(); + if (error) { + TUN_UNLOCK(tp); + return (error); + } + } + TUN2IFP(tp)->if_baudrate = tunp->baudrate; + TUN_UNLOCK(tp); + break; + case TUNGIFINFO: + tunp = (struct tuninfo *)data; + TUN_LOCK(tp); + tunp->mtu = TUN2IFP(tp)->if_mtu; + tunp->type = TUN2IFP(tp)->if_type; + tunp->baudrate = TUN2IFP(tp)->if_baudrate; + TUN_UNLOCK(tp); + break; + case TUNSDEBUG: + tundebug = *(int *)data; + break; + case TUNGDEBUG: + *(int *)data = tundebug; + break; + case FIONBIO: + break; + case FIOASYNC: + TUN_LOCK(tp); + if (*(int *)data) + tp->tun_flags |= TUN_ASYNC; + else + tp->tun_flags &= ~TUN_ASYNC; + TUN_UNLOCK(tp); + break; + case FIONREAD: + if (!IFQ_IS_EMPTY(&TUN2IFP(tp)->if_snd)) { + struct mbuf *mb; + IFQ_LOCK(&TUN2IFP(tp)->if_snd); + IFQ_POLL_NOLOCK(&TUN2IFP(tp)->if_snd, mb); + for (*(int *)data = 0; mb != NULL; mb = mb->m_next) + *(int *)data += mb->m_len; + IFQ_UNLOCK(&TUN2IFP(tp)->if_snd); + } else + *(int *)data = 0; + break; + case FIOSETOWN: + return (fsetown(*(int *)data, &tp->tun_sigio)); + + case FIOGETOWN: + *(int *)data = fgetown(&tp->tun_sigio); + return (0); + + /* This is deprecated, FIOSETOWN should be used instead. */ + case TIOCSPGRP: + return (fsetown(-(*(int *)data), &tp->tun_sigio)); + + /* This is deprecated, FIOGETOWN should be used instead. */ + case TIOCGPGRP: + *(int *)data = -fgetown(&tp->tun_sigio); + return (0); + + default: + return (ENOTTY); + } + return (0); +} + +/* + * The cdevsw read interface - reads a packet at a time, or at + * least as much of a packet as can be read. + */ +static int +tunread(struct cdev *dev, struct uio *uio, int flag) +{ + struct tuntap_softc *tp = dev->si_drv1; + struct ifnet *ifp = TUN2IFP(tp); + struct mbuf *m; + int error=0, len; + + TUNDEBUG (ifp, "read\n"); + TUN_LOCK(tp); + if ((tp->tun_flags & TUN_READY) != TUN_READY) { + TUN_UNLOCK(tp); + TUNDEBUG (ifp, "not ready 0%o\n", tp->tun_flags); + return (EHOSTDOWN); + } + + tp->tun_flags &= ~TUN_RWAIT; + + for (;;) { + IFQ_DEQUEUE(&ifp->if_snd, m); + if (m != NULL) + break; + if (flag & O_NONBLOCK) { + TUN_UNLOCK(tp); + return (EWOULDBLOCK); + } + tp->tun_flags |= TUN_RWAIT; + error = mtx_sleep(tp, &tp->tun_mtx, PCATCH | (PZERO + 1), + "tunread", 0); + if (error != 0) { + TUN_UNLOCK(tp); + return (error); + } + } + TUN_UNLOCK(tp); + + if ((tp->tun_flags & TUN_L2) != 0) + BPF_MTAP(ifp, m); + + while (m && uio->uio_resid > 0 && error == 0) { + len = min(uio->uio_resid, m->m_len); + if (len != 0) + error = uiomove(mtod(m, void *), len, uio); + m = m_free(m); + } + + if (m) { + TUNDEBUG(ifp, "Dropping mbuf\n"); + m_freem(m); + } + return (error); +} + +static int +tunwrite_l2(struct tuntap_softc *tp, struct mbuf *m) +{ + struct ether_header *eh; + struct ifnet *ifp; + + ifp = TUN2IFP(tp); + + /* + * Only pass a unicast frame to ether_input(), if it would + * actually have been received by non-virtual hardware. + */ + if (m->m_len < sizeof(struct ether_header)) { + m_freem(m); + return (0); + } + + eh = mtod(m, struct ether_header *); + + if (eh && (ifp->if_flags & IFF_PROMISC) == 0 && + !ETHER_IS_MULTICAST(eh->ether_dhost) && + bcmp(eh->ether_dhost, IF_LLADDR(ifp), ETHER_ADDR_LEN) != 0) { + m_freem(m); + return (0); + } + + /* Pass packet up to parent. */ + CURVNET_SET(ifp->if_vnet); + (*ifp->if_input)(ifp, m); + CURVNET_RESTORE(); + /* ibytes are counted in parent */ + if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); + return (0); +} + +static int +tunwrite_l3(struct tuntap_softc *tp, struct mbuf *m) +{ + struct ifnet *ifp; + int family, isr; + + ifp = TUN2IFP(tp); + /* Could be unlocked read? */ + TUN_LOCK(tp); + if (tp->tun_flags & TUN_IFHEAD) { + TUN_UNLOCK(tp); + if (m->m_len < sizeof(family) && + (m = m_pullup(m, sizeof(family))) == NULL) + return (ENOBUFS); + family = ntohl(*mtod(m, u_int32_t *)); + m_adj(m, sizeof(family)); + } else { + TUN_UNLOCK(tp); + family = AF_INET; + } + + BPF_MTAP2(ifp, &family, sizeof(family), m); + + switch (family) { +#ifdef INET + case AF_INET: + isr = NETISR_IP; + break; +#endif +#ifdef INET6 + case AF_INET6: + isr = NETISR_IPV6; + break; +#endif + default: + m_freem(m); + return (EAFNOSUPPORT); + } + random_harvest_queue(m, sizeof(*m), RANDOM_NET_TUN); + if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); + if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); + CURVNET_SET(ifp->if_vnet); + M_SETFIB(m, ifp->if_fib); + netisr_dispatch(isr, m); + CURVNET_RESTORE(); + return (0); +} + +/* + * the cdevsw write interface - an atomic write is a packet - or else! + */ +static int +tunwrite(struct cdev *dev, struct uio *uio, int flag) +{ + struct tuntap_softc *tp; + struct ifnet *ifp; + struct mbuf *m; + uint32_t mru; + int align; + bool l2tun; + + tp = dev->si_drv1; + ifp = TUN2IFP(tp); + TUNDEBUG(ifp, "tunwrite\n"); + if ((ifp->if_flags & IFF_UP) != IFF_UP) + /* ignore silently */ + return (0); + + if (uio->uio_resid == 0) + return (0); + + l2tun = (tp->tun_flags & TUN_L2) != 0; + align = 0; + mru = l2tun ? TAPMRU : TUNMRU; + if (l2tun) + align = ETHER_ALIGN; + else if ((tp->tun_flags & TUN_IFHEAD) != 0) + mru += sizeof(uint32_t); /* family */ + if (uio->uio_resid < 0 || uio->uio_resid > mru) { + TUNDEBUG(ifp, "len=%zd!\n", uio->uio_resid); + return (EIO); + } + + if ((m = m_uiotombuf(uio, M_NOWAIT, 0, align, M_PKTHDR)) == NULL) { + if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); + return (ENOBUFS); + } + + m->m_pkthdr.rcvif = ifp; +#ifdef MAC + mac_ifnet_create_mbuf(ifp, m); +#endif + + if (l2tun) + return (tunwrite_l2(tp, m)); + + return (tunwrite_l3(tp, m)); +} + +/* + * tunpoll - the poll interface, this is only useful on reads + * really. The write detect always returns true, write never blocks + * anyway, it either accepts the packet or drops it. + */ +static int +tunpoll(struct cdev *dev, int events, struct thread *td) +{ + struct tuntap_softc *tp = dev->si_drv1; + struct ifnet *ifp = TUN2IFP(tp); + int revents = 0; + + TUNDEBUG(ifp, "tunpoll\n"); + + if (events & (POLLIN | POLLRDNORM)) { + IFQ_LOCK(&ifp->if_snd); + if (!IFQ_IS_EMPTY(&ifp->if_snd)) { + TUNDEBUG(ifp, "tunpoll q=%d\n", ifp->if_snd.ifq_len); + revents |= events & (POLLIN | POLLRDNORM); + } else { + TUNDEBUG(ifp, "tunpoll waiting\n"); + selrecord(td, &tp->tun_rsel); + } + IFQ_UNLOCK(&ifp->if_snd); + } + revents |= events & (POLLOUT | POLLWRNORM); + + return (revents); +} + +/* + * tunkqfilter - support for the kevent() system call. + */ +static int +tunkqfilter(struct cdev *dev, struct knote *kn) +{ + struct tuntap_softc *tp = dev->si_drv1; + struct ifnet *ifp = TUN2IFP(tp); + + switch(kn->kn_filter) { + case EVFILT_READ: + TUNDEBUG(ifp, "%s kqfilter: EVFILT_READ, minor = %#x\n", + ifp->if_xname, dev2unit(dev)); + kn->kn_fop = &tun_read_filterops; + break; + + case EVFILT_WRITE: + TUNDEBUG(ifp, "%s kqfilter: EVFILT_WRITE, minor = %#x\n", + ifp->if_xname, dev2unit(dev)); + kn->kn_fop = &tun_write_filterops; + break; + + default: + TUNDEBUG(ifp, "%s kqfilter: invalid filter, minor = %#x\n", + ifp->if_xname, dev2unit(dev)); + return(EINVAL); + } + + kn->kn_hook = tp; + knlist_add(&tp->tun_rsel.si_note, kn, 0); + + return (0); +} + +/* + * Return true of there is data in the interface queue. + */ +static int +tunkqread(struct knote *kn, long hint) +{ + int ret; + struct tuntap_softc *tp = kn->kn_hook; + struct cdev *dev = tp->tun_dev; + struct ifnet *ifp = TUN2IFP(tp); + + if ((kn->kn_data = ifp->if_snd.ifq_len) > 0) { + TUNDEBUG(ifp, + "%s have data in the queue. Len = %d, minor = %#x\n", + ifp->if_xname, ifp->if_snd.ifq_len, dev2unit(dev)); + ret = 1; + } else { + TUNDEBUG(ifp, + "%s waiting for data, minor = %#x\n", ifp->if_xname, + dev2unit(dev)); + ret = 0; + } + + return (ret); +} + +/* + * Always can write, always return MTU in kn->data. + */ +static int +tunkqwrite(struct knote *kn, long hint) +{ + struct tuntap_softc *tp = kn->kn_hook; + struct ifnet *ifp = TUN2IFP(tp); + + kn->kn_data = ifp->if_mtu; + + return (1); +} + +static void +tunkqdetach(struct knote *kn) +{ + struct tuntap_softc *tp = kn->kn_hook; + + knlist_remove(&tp->tun_rsel.si_note, kn, 0); +} diff --git a/freebsd/sys/net/if_var.h b/freebsd/sys/net/if_var.h index d23928e5..700296fa 100644 --- a/freebsd/sys/net/if_var.h +++ b/freebsd/sys/net/if_var.h @@ -73,6 +73,7 @@ struct netmap_adapter; struct netdump_methods; #ifdef _KERNEL +#include <sys/_eventhandler.h> #include <sys/mbuf.h> /* ifqueue only? */ #include <sys/buf_ring.h> #include <net/vnet.h> @@ -95,8 +96,9 @@ CK_STAILQ_HEAD(ifmultihead, ifmultiaddr); CK_STAILQ_HEAD(ifgrouphead, ifg_group); #ifdef _KERNEL -VNET_DECLARE(struct pfil_head, link_pfil_hook); /* packet filter hooks */ -#define V_link_pfil_hook VNET(link_pfil_hook) +VNET_DECLARE(struct pfil_head *, link_pfil_head); +#define V_link_pfil_head VNET(link_pfil_head) +#define PFIL_ETHER_NAME "ethernet" #define HHOOK_IPSEC_INET 0 #define HHOOK_IPSEC_INET6 1 @@ -193,11 +195,13 @@ struct if_encap_req { * m_snd_tag" comes from the network driver and it is free to allocate * as much additional space as it wants for its own use. */ +struct ktls_session; struct m_snd_tag; #define IF_SND_TAG_TYPE_RATE_LIMIT 0 #define IF_SND_TAG_TYPE_UNLIMITED 1 -#define IF_SND_TAG_TYPE_MAX 2 +#define IF_SND_TAG_TYPE_TLS 2 +#define IF_SND_TAG_TYPE_MAX 3 struct if_snd_tag_alloc_header { uint32_t type; /* send tag type, see IF_SND_TAG_XXX */ @@ -208,6 +212,14 @@ struct if_snd_tag_alloc_header { struct if_snd_tag_alloc_rate_limit { struct if_snd_tag_alloc_header hdr; uint64_t max_rate; /* in bytes/s */ + uint32_t flags; /* M_NOWAIT or M_WAITOK */ + uint32_t reserved; /* alignment */ +}; + +struct if_snd_tag_alloc_tls { + struct if_snd_tag_alloc_header hdr; + struct inpcb *inp; + const struct ktls_session *tls; }; struct if_snd_tag_rate_limit_params { @@ -215,13 +227,14 @@ struct if_snd_tag_rate_limit_params { uint32_t queue_level; /* 0 (empty) .. 65535 (full) */ #define IF_SND_QUEUE_LEVEL_MIN 0 #define IF_SND_QUEUE_LEVEL_MAX 65535 - uint32_t reserved; /* padding */ + uint32_t flags; /* M_NOWAIT or M_WAITOK */ }; union if_snd_tag_alloc_params { struct if_snd_tag_alloc_header hdr; struct if_snd_tag_alloc_rate_limit rate_limit; struct if_snd_tag_alloc_rate_limit unlimited; + struct if_snd_tag_alloc_tls tls; }; union if_snd_tag_modify_params { @@ -234,11 +247,37 @@ union if_snd_tag_query_params { struct if_snd_tag_rate_limit_params unlimited; }; +/* Query return flags */ +#define RT_NOSUPPORT 0x00000000 /* Not supported */ +#define RT_IS_INDIRECT 0x00000001 /* + * Interface like a lagg, select + * the actual interface for + * capabilities. + */ +#define RT_IS_SELECTABLE 0x00000002 /* + * No rate table, you select + * rates and the first + * number_of_rates are created. + */ +#define RT_IS_FIXED_TABLE 0x00000004 /* A fixed table is attached */ +#define RT_IS_UNUSABLE 0x00000008 /* It is not usable for this */ + +struct if_ratelimit_query_results { + const uint64_t *rate_table; /* Pointer to table if present */ + uint32_t flags; /* Flags indicating results */ + uint32_t max_flows; /* Max flows using, 0=unlimited */ + uint32_t number_of_rates; /* How many unique rates can be created */ + uint32_t min_segment_burst; /* The amount the adapter bursts at each send */ +}; + typedef int (if_snd_tag_alloc_t)(struct ifnet *, union if_snd_tag_alloc_params *, struct m_snd_tag **); typedef int (if_snd_tag_modify_t)(struct m_snd_tag *, union if_snd_tag_modify_params *); typedef int (if_snd_tag_query_t)(struct m_snd_tag *, union if_snd_tag_query_params *); typedef void (if_snd_tag_free_t)(struct m_snd_tag *); +typedef void (if_ratelimit_query_t)(struct ifnet *, + struct if_ratelimit_query_results *); + /* * Structure defining a network interface. @@ -250,7 +289,9 @@ struct ifnet { CK_STAILQ_HEAD(, ifg_list) if_groups; /* linked list of groups per if (CK_) */ /* protected by if_addr_lock */ u_char if_alloctype; /* if_type at time of allocation */ - +#ifndef __rtems__ + uint8_t if_numa_domain; /* NUMA domain of device */ +#endif /* __rtems__ */ /* Driver and protocol specific information that remains stable. */ void *if_softc; /* pointer to driver state */ void *if_llsoftc; /* link layer softc */ @@ -379,6 +420,7 @@ struct ifnet { if_snd_tag_modify_t *if_snd_tag_modify; if_snd_tag_query_t *if_snd_tag_query; if_snd_tag_free_t *if_snd_tag_free; + if_ratelimit_query_t *if_ratelimit_query; /* Ethernet PCP */ uint8_t if_pcp; @@ -416,24 +458,21 @@ struct rtems_ifinputreq { /* for compatibility with other BSDs */ #define if_name(ifp) ((ifp)->if_xname) +#define IF_NODOM 255 /* * Locks for address lists on the network interface. */ #define IF_ADDR_LOCK_INIT(if) mtx_init(&(if)->if_addr_lock, "if_addr_lock", NULL, MTX_DEF) #define IF_ADDR_LOCK_DESTROY(if) mtx_destroy(&(if)->if_addr_lock) -#define IF_ADDR_RLOCK(if) struct epoch_tracker if_addr_et; epoch_enter_preempt(net_epoch_preempt, &if_addr_et); -#define IF_ADDR_RUNLOCK(if) epoch_exit_preempt(net_epoch_preempt, &if_addr_et); #define IF_ADDR_WLOCK(if) mtx_lock(&(if)->if_addr_lock) #define IF_ADDR_WUNLOCK(if) mtx_unlock(&(if)->if_addr_lock) #define IF_ADDR_LOCK_ASSERT(if) MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(if)->if_addr_lock)) #define IF_ADDR_WLOCK_ASSERT(if) mtx_assert(&(if)->if_addr_lock, MA_OWNED) -#define NET_EPOCH_ENTER() struct epoch_tracker nep_et; epoch_enter_preempt(net_epoch_preempt, &nep_et) -#define NET_EPOCH_ENTER_ET(et) epoch_enter_preempt(net_epoch_preempt, &(et)) -#define NET_EPOCH_EXIT() epoch_exit_preempt(net_epoch_preempt, &nep_et) -#define NET_EPOCH_EXIT_ET(et) epoch_exit_preempt(net_epoch_preempt, &(et)) -#define NET_EPOCH_WAIT() epoch_wait_preempt(net_epoch_preempt) - +#define NET_EPOCH_ENTER(et) epoch_enter_preempt(net_epoch_preempt, &(et)) +#define NET_EPOCH_EXIT(et) epoch_exit_preempt(net_epoch_preempt, &(et)) +#define NET_EPOCH_WAIT() epoch_wait_preempt(net_epoch_preempt) +#define NET_EPOCH_ASSERT() MPASS(in_epoch(net_epoch_preempt)) /* * Function variations on locking macros intended to be used by loadable @@ -446,7 +485,6 @@ void if_maddr_rlock(if_t ifp); /* if_multiaddrs */ void if_maddr_runlock(if_t ifp); /* if_multiaddrs */ #ifdef _KERNEL -#ifdef _SYS_EVENTHANDLER_H_ /* interface link layer address change event */ typedef void (*iflladdr_event_handler_t)(void *, struct ifnet *); EVENTHANDLER_DECLARE(iflladdr_event, iflladdr_event_handler_t); @@ -474,7 +512,6 @@ EVENTHANDLER_DECLARE(ifnet_link_event, ifnet_link_event_handler_t); typedef void (*ifnet_event_fn)(void *, struct ifnet *ifp, int event); EVENTHANDLER_DECLARE(ifnet_event, ifnet_event_fn); -#endif /* _SYS_EVENTHANDLER_H_ */ /* * interface groups @@ -513,16 +550,13 @@ EVENTHANDLER_DECLARE(group_change_event, group_change_event_handler_t); mtx_init(&(ifp)->if_afdata_lock, "if_afdata", NULL, MTX_DEF) #define IF_AFDATA_WLOCK(ifp) mtx_lock(&(ifp)->if_afdata_lock) -#define IF_AFDATA_RLOCK(ifp) struct epoch_tracker if_afdata_et; epoch_enter_preempt(net_epoch_preempt, &if_afdata_et) #define IF_AFDATA_WUNLOCK(ifp) mtx_unlock(&(ifp)->if_afdata_lock) -#define IF_AFDATA_RUNLOCK(ifp) epoch_exit_preempt(net_epoch_preempt, &if_afdata_et) #define IF_AFDATA_LOCK(ifp) IF_AFDATA_WLOCK(ifp) #define IF_AFDATA_UNLOCK(ifp) IF_AFDATA_WUNLOCK(ifp) #define IF_AFDATA_TRYLOCK(ifp) mtx_trylock(&(ifp)->if_afdata_lock) #define IF_AFDATA_DESTROY(ifp) mtx_destroy(&(ifp)->if_afdata_lock) #define IF_AFDATA_LOCK_ASSERT(ifp) MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(ifp)->if_afdata_lock)) -#define IF_AFDATA_RLOCK_ASSERT(ifp) MPASS(in_epoch(net_epoch_preempt)); #define IF_AFDATA_WLOCK_ASSERT(ifp) mtx_assert(&(ifp)->if_afdata_lock, MA_OWNED) #define IF_AFDATA_UNLOCK_ASSERT(ifp) mtx_assert(&(ifp)->if_afdata_lock, MA_NOTOWNED) @@ -606,16 +640,13 @@ extern struct sx ifnet_sxlock; * write, but also whether it was acquired with sleep support or not. */ #define IFNET_RLOCK_ASSERT() sx_assert(&ifnet_sxlock, SA_SLOCKED) -#define IFNET_RLOCK_NOSLEEP_ASSERT() MPASS(in_epoch(net_epoch_preempt)) #define IFNET_WLOCK_ASSERT() do { \ sx_assert(&ifnet_sxlock, SA_XLOCKED); \ rw_assert(&ifnet_rwlock, RA_WLOCKED); \ } while (0) #define IFNET_RLOCK() sx_slock(&ifnet_sxlock) -#define IFNET_RLOCK_NOSLEEP() struct epoch_tracker ifnet_rlock_et; epoch_enter_preempt(net_epoch_preempt, &ifnet_rlock_et) #define IFNET_RUNLOCK() sx_sunlock(&ifnet_sxlock) -#define IFNET_RUNLOCK_NOSLEEP() epoch_exit_preempt(net_epoch_preempt, &ifnet_rlock_et) /* * Look up an ifnet given its index; the _ref variant also acquires a @@ -654,6 +685,8 @@ int if_delgroup(struct ifnet *, const char *); int if_addmulti(struct ifnet *, struct sockaddr *, struct ifmultiaddr **); int if_allmulti(struct ifnet *, int); struct ifnet* if_alloc(u_char); +struct ifnet* if_alloc_dev(u_char, device_t dev); +struct ifnet* if_alloc_domain(u_char, int numa_domain); void if_attach(struct ifnet *); void if_dead(struct ifnet *); int if_delmulti(struct ifnet *, struct sockaddr *); diff --git a/freebsd/sys/net/if_vlan.c b/freebsd/sys/net/if_vlan.c index 893bb2cf..10a8a3bf 100644 --- a/freebsd/sys/net/if_vlan.c +++ b/freebsd/sys/net/if_vlan.c @@ -48,6 +48,7 @@ __FBSDID("$FreeBSD$"); #include <rtems/bsd/local/opt_inet.h> +#include <rtems/bsd/local/opt_kern_tls.h> #include <rtems/bsd/local/opt_vlan.h> #include <rtems/bsd/local/opt_ratelimit.h> @@ -105,6 +106,20 @@ struct ifvlantrunk { int refcnt; }; +#if defined(KERN_TLS) || defined(RATELIMIT) +struct vlan_snd_tag { + struct m_snd_tag com; + struct m_snd_tag *tag; +}; + +static inline struct vlan_snd_tag * +mst_to_vst(struct m_snd_tag *mst) +{ + + return (__containerof(mst, struct vlan_snd_tag, com)); +} +#endif + /* * This macro provides a facility to iterate over every vlan on a trunk with * the assumption that none will be added/removed during iteration. @@ -158,7 +173,7 @@ struct vlan_mc_entry { struct epoch_context mc_epoch_ctx; }; -struct ifvlan { +struct ifvlan { struct ifvlantrunk *ifv_trunk; struct ifnet *ifv_ifp; #define TRUNK(ifv) ((ifv)->ifv_trunk) @@ -166,28 +181,19 @@ struct ifvlan { void *ifv_cookie; int ifv_pflags; /* special flags we have set on parent */ int ifv_capenable; - struct ifv_linkmib { - int ifvm_encaplen; /* encapsulation length */ - int ifvm_mtufudge; /* MTU fudged by this much */ - int ifvm_mintu; /* min transmission unit */ - uint16_t ifvm_proto; /* encapsulation ethertype */ - uint16_t ifvm_tag; /* tag to apply on packets leaving if */ - uint16_t ifvm_vid; /* VLAN ID */ - uint8_t ifvm_pcp; /* Priority Code Point (PCP). */ - } ifv_mib; + int ifv_encaplen; /* encapsulation length */ + int ifv_mtufudge; /* MTU fudged by this much */ + int ifv_mintu; /* min transmission unit */ + uint16_t ifv_proto; /* encapsulation ethertype */ + uint16_t ifv_tag; /* tag to apply on packets leaving if */ + uint16_t ifv_vid; /* VLAN ID */ + uint8_t ifv_pcp; /* Priority Code Point (PCP). */ struct task lladdr_task; CK_SLIST_HEAD(, vlan_mc_entry) vlan_mc_listhead; #ifndef VLAN_ARRAY CK_SLIST_ENTRY(ifvlan) ifv_list; #endif }; -#define ifv_proto ifv_mib.ifvm_proto -#define ifv_tag ifv_mib.ifvm_tag -#define ifv_vid ifv_mib.ifvm_vid -#define ifv_pcp ifv_mib.ifvm_pcp -#define ifv_encaplen ifv_mib.ifvm_encaplen -#define ifv_mtufudge ifv_mib.ifvm_mtufudge -#define ifv_mintu ifv_mib.ifvm_mintu /* Special flags we should propagate to parent. */ static struct { @@ -235,10 +241,6 @@ static struct sx _VLAN_SX_ID; #define VLAN_LOCKING_DESTROY() \ sx_destroy(&_VLAN_SX_ID) -#define VLAN_RLOCK() NET_EPOCH_ENTER(); -#define VLAN_RUNLOCK() NET_EPOCH_EXIT(); -#define VLAN_RLOCK_ASSERT() MPASS(in_epoch(net_epoch_preempt)) - #define VLAN_SLOCK() sx_slock(&_VLAN_SX_ID) #define VLAN_SUNLOCK() sx_sunlock(&_VLAN_SX_ID) #define VLAN_XLOCK() sx_xlock(&_VLAN_SX_ID) @@ -254,11 +256,8 @@ static struct sx _VLAN_SX_ID; */ #define TRUNK_LOCK_INIT(trunk) mtx_init(&(trunk)->lock, vlanname, NULL, MTX_DEF) #define TRUNK_LOCK_DESTROY(trunk) mtx_destroy(&(trunk)->lock) -#define TRUNK_RLOCK(trunk) NET_EPOCH_ENTER() #define TRUNK_WLOCK(trunk) mtx_lock(&(trunk)->lock) -#define TRUNK_RUNLOCK(trunk) NET_EPOCH_EXIT(); #define TRUNK_WUNLOCK(trunk) mtx_unlock(&(trunk)->lock) -#define TRUNK_RLOCK_ASSERT(trunk) MPASS(in_epoch(net_epoch_preempt)) #define TRUNK_LOCK_ASSERT(trunk) MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(trunk)->lock)) #define TRUNK_WLOCK_ASSERT(trunk) mtx_assert(&(trunk)->lock, MA_OWNED); @@ -282,9 +281,14 @@ static void trunk_destroy(struct ifvlantrunk *trunk); static void vlan_init(void *foo); static void vlan_input(struct ifnet *ifp, struct mbuf *m); static int vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t addr); -#ifdef RATELIMIT +#if defined(KERN_TLS) || defined(RATELIMIT) static int vlan_snd_tag_alloc(struct ifnet *, union if_snd_tag_alloc_params *, struct m_snd_tag **); +static int vlan_snd_tag_modify(struct m_snd_tag *, + union if_snd_tag_modify_params *); +static int vlan_snd_tag_query(struct m_snd_tag *, + union if_snd_tag_query_params *); +static void vlan_snd_tag_free(struct m_snd_tag *); #endif static void vlan_qflush(struct ifnet *ifp); static int vlan_setflag(struct ifnet *ifp, int flag, int status, @@ -292,6 +296,8 @@ static int vlan_setflag(struct ifnet *ifp, int flag, int status, static int vlan_setflags(struct ifnet *ifp, int status); static int vlan_setmulti(struct ifnet *ifp); static int vlan_transmit(struct ifnet *ifp, struct mbuf *m); +static int vlan_output(struct ifnet *ifp, struct mbuf *m, + const struct sockaddr *dst, struct route *ro); static void vlan_unconfig(struct ifnet *ifp); static void vlan_unconfig_locked(struct ifnet *ifp, int departing); static int vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t tag); @@ -474,7 +480,7 @@ vlan_gethash(struct ifvlantrunk *trunk, uint16_t vid) { struct ifvlan *ifv; - TRUNK_RLOCK_ASSERT(trunk); + NET_EPOCH_ASSERT(); CK_SLIST_FOREACH(ifv, &trunk->hash[HASH(vid, trunk->hmask)], ifv_list) if (ifv->ifv_vid == vid) @@ -619,16 +625,17 @@ vlan_setmulti(struct ifnet *ifp) static void vlan_iflladdr(void *arg __unused, struct ifnet *ifp) { + struct epoch_tracker et; struct ifvlan *ifv; struct ifnet *ifv_ifp; struct ifvlantrunk *trunk; struct sockaddr_dl *sdl; - /* Need the rmlock since this is run on taskqueue_swi. */ - VLAN_RLOCK(); + /* Need the epoch since this is run on taskqueue_swi. */ + NET_EPOCH_ENTER(et); trunk = ifp->if_vlantrunk; if (trunk == NULL) { - VLAN_RUNLOCK(); + NET_EPOCH_EXIT(et); return; } @@ -654,7 +661,7 @@ vlan_iflladdr(void *arg __unused, struct ifnet *ifp) taskqueue_enqueue(taskqueue_thread, &ifv->lladdr_task); } TRUNK_WUNLOCK(trunk); - VLAN_RUNLOCK(); + NET_EPOCH_EXIT(et); } /* @@ -700,17 +707,18 @@ vlan_ifdetach(void *arg __unused, struct ifnet *ifp) static struct ifnet * vlan_trunkdev(struct ifnet *ifp) { + struct epoch_tracker et; struct ifvlan *ifv; if (ifp->if_type != IFT_L2VLAN) return (NULL); - VLAN_RLOCK(); + NET_EPOCH_ENTER(et); ifv = ifp->if_softc; ifp = NULL; if (ifv->ifv_trunk) ifp = PARENT(ifv); - VLAN_RUNLOCK(); + NET_EPOCH_EXIT(et); return (ifp); } @@ -782,20 +790,21 @@ vlan_setcookie(struct ifnet *ifp, void *cookie) static struct ifnet * vlan_devat(struct ifnet *ifp, uint16_t vid) { + struct epoch_tracker et; struct ifvlantrunk *trunk; struct ifvlan *ifv; - VLAN_RLOCK(); + NET_EPOCH_ENTER(et); trunk = ifp->if_vlantrunk; if (trunk == NULL) { - VLAN_RUNLOCK(); + NET_EPOCH_EXIT(et); return (NULL); } ifp = NULL; ifv = vlan_gethash(trunk, vid); if (ifv) ifp = ifv->ifv_ifp; - VLAN_RUNLOCK(); + NET_EPOCH_EXIT(et); return (ifp); } @@ -1055,17 +1064,16 @@ vlan_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) strlcpy(ifp->if_xname, name, IFNAMSIZ); ifp->if_dname = vlanname; ifp->if_dunit = unit; - /* NB: flags are not set here */ - ifp->if_linkmib = &ifv->ifv_mib; - ifp->if_linkmiblen = sizeof(ifv->ifv_mib); - /* NB: mtu is not set here */ ifp->if_init = vlan_init; ifp->if_transmit = vlan_transmit; ifp->if_qflush = vlan_qflush; ifp->if_ioctl = vlan_ioctl; -#ifdef RATELIMIT +#if defined(KERN_TLS) || defined(RATELIMIT) ifp->if_snd_tag_alloc = vlan_snd_tag_alloc; + ifp->if_snd_tag_modify = vlan_snd_tag_modify; + ifp->if_snd_tag_query = vlan_snd_tag_query; + ifp->if_snd_tag_free = vlan_snd_tag_free; #endif ifp->if_flags = VLAN_IFFLAGS; ether_ifattach(ifp, eaddr); @@ -1135,15 +1143,16 @@ vlan_init(void *foo __unused) static int vlan_transmit(struct ifnet *ifp, struct mbuf *m) { + struct epoch_tracker et; struct ifvlan *ifv; struct ifnet *p; int error, len, mcast; - VLAN_RLOCK(); + NET_EPOCH_ENTER(et); ifv = ifp->if_softc; if (TRUNK(ifv) == NULL) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); - VLAN_RUNLOCK(); + NET_EPOCH_EXIT(et); m_freem(m); return (ENETDOWN); } @@ -1153,20 +1162,40 @@ vlan_transmit(struct ifnet *ifp, struct mbuf *m) BPF_MTAP(ifp, m); +#if defined(KERN_TLS) || defined(RATELIMIT) + if (m->m_pkthdr.csum_flags & CSUM_SND_TAG) { + struct vlan_snd_tag *vst; + struct m_snd_tag *mst; + + MPASS(m->m_pkthdr.snd_tag->ifp == ifp); + mst = m->m_pkthdr.snd_tag; + vst = mst_to_vst(mst); + if (vst->tag->ifp != p) { + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); + NET_EPOCH_EXIT(et); + m_freem(m); + return (EAGAIN); + } + + m->m_pkthdr.snd_tag = m_snd_tag_ref(vst->tag); + m_snd_tag_rele(mst); + } +#endif + /* * Do not run parent's if_transmit() if the parent is not up, * or parent's driver will cause a system crash. */ if (!UP_AND_RUNNING(p)) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); - VLAN_RUNLOCK(); + NET_EPOCH_EXIT(et); m_freem(m); return (ENETDOWN); } if (!ether_8021q_frame(&m, ifp, p, ifv->ifv_vid, ifv->ifv_pcp)) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); - VLAN_RUNLOCK(); + NET_EPOCH_EXIT(et); return (0); } @@ -1180,10 +1209,31 @@ vlan_transmit(struct ifnet *ifp, struct mbuf *m) if_inc_counter(ifp, IFCOUNTER_OMCASTS, mcast); } else if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); - VLAN_RUNLOCK(); + NET_EPOCH_EXIT(et); return (error); } +static int +vlan_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, + struct route *ro) +{ + struct epoch_tracker et; + struct ifvlan *ifv; + struct ifnet *p; + + NET_EPOCH_ENTER(et); + ifv = ifp->if_softc; + if (TRUNK(ifv) == NULL) { + NET_EPOCH_EXIT(et); + m_freem(m); + return (ENETDOWN); + } + p = PARENT(ifv); + NET_EPOCH_EXIT(et); + return p->if_output(ifp, m, dst, ro); +} + + /* * The ifp->if_qflush entry point for vlan(4) is a no-op. */ @@ -1195,15 +1245,16 @@ vlan_qflush(struct ifnet *ifp __unused) static void vlan_input(struct ifnet *ifp, struct mbuf *m) { + struct epoch_tracker et; struct ifvlantrunk *trunk; struct ifvlan *ifv; struct m_tag *mtag; uint16_t vid, tag; - VLAN_RLOCK(); + NET_EPOCH_ENTER(et); trunk = ifp->if_vlantrunk; if (trunk == NULL) { - VLAN_RUNLOCK(); + NET_EPOCH_EXIT(et); m_freem(m); return; } @@ -1226,7 +1277,7 @@ vlan_input(struct ifnet *ifp, struct mbuf *m) if (m->m_len < sizeof(*evl) && (m = m_pullup(m, sizeof(*evl))) == NULL) { if_printf(ifp, "cannot pullup VLAN header\n"); - VLAN_RUNLOCK(); + NET_EPOCH_EXIT(et); return; } evl = mtod(m, struct ether_vlan_header *); @@ -1249,7 +1300,7 @@ vlan_input(struct ifnet *ifp, struct mbuf *m) __func__, ifp->if_xname, ifp->if_type); #endif if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); - VLAN_RUNLOCK(); + NET_EPOCH_EXIT(et); m_freem(m); return; } @@ -1259,7 +1310,7 @@ vlan_input(struct ifnet *ifp, struct mbuf *m) ifv = vlan_gethash(trunk, vid); if (ifv == NULL || !UP_AND_RUNNING(ifv->ifv_ifp)) { - VLAN_RUNLOCK(); + NET_EPOCH_EXIT(et); if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); m_freem(m); return; @@ -1279,7 +1330,7 @@ vlan_input(struct ifnet *ifp, struct mbuf *m) sizeof(uint8_t), M_NOWAIT); if (mtag == NULL) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); - VLAN_RUNLOCK(); + NET_EPOCH_EXIT(et); m_freem(m); return; } @@ -1290,7 +1341,7 @@ vlan_input(struct ifnet *ifp, struct mbuf *m) m->m_pkthdr.rcvif = ifv->ifv_ifp; if_inc_counter(ifv->ifv_ifp, IFCOUNTER_IPACKETS, 1); - VLAN_RUNLOCK(); + NET_EPOCH_EXIT(et); /* Pass it back through the parent's input routine. */ (*ifv->ifv_ifp->if_input)(ifv->ifv_ifp, m); @@ -1316,6 +1367,7 @@ vlan_lladdr_fn(void *arg, int pending __unused) static int vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t vid) { + struct epoch_tracker et; struct ifvlantrunk *trunk; struct ifnet *ifp; int error = 0; @@ -1397,7 +1449,6 @@ vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t vid) */ ifp->if_mtu = p->if_mtu - ifv->ifv_mtufudge; ifp->if_baudrate = p->if_baudrate; - ifp->if_output = p->if_output; ifp->if_input = p->if_input; ifp->if_resolvemulti = p->if_resolvemulti; ifp->if_addrlen = p->if_addrlen; @@ -1405,6 +1456,12 @@ vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t vid) ifp->if_pcp = ifv->ifv_pcp; /* + * We wrap the parent's if_output using vlan_output to ensure that it + * can't become stale. + */ + ifp->if_output = vlan_output; + + /* * Copy only a selected subset of flags from the parent. * Other flags are none of our business. */ @@ -1415,9 +1472,9 @@ vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t vid) ifp->if_link_state = p->if_link_state; - TRUNK_RLOCK(TRUNK(ifv)); + NET_EPOCH_ENTER(et); vlan_capabilities(ifv); - TRUNK_RUNLOCK(TRUNK(ifv)); + NET_EPOCH_EXIT(et); /* * Set up our interface address to reflect the underlying @@ -1589,14 +1646,15 @@ vlan_setflags(struct ifnet *ifp, int status) static void vlan_link_state(struct ifnet *ifp) { + struct epoch_tracker et; struct ifvlantrunk *trunk; struct ifvlan *ifv; /* Called from a taskqueue_swi task, so we cannot sleep. */ - VLAN_RLOCK(); + NET_EPOCH_ENTER(et); trunk = ifp->if_vlantrunk; if (trunk == NULL) { - VLAN_RUNLOCK(); + NET_EPOCH_EXIT(et); return; } @@ -1607,7 +1665,7 @@ vlan_link_state(struct ifnet *ifp) trunk->parent->if_link_state); } TRUNK_WUNLOCK(trunk); - VLAN_RUNLOCK(); + NET_EPOCH_EXIT(et); } static void @@ -1620,7 +1678,7 @@ vlan_capabilities(struct ifvlan *ifv) u_long hwa = 0; VLAN_SXLOCK_ASSERT(); - TRUNK_RLOCK_ASSERT(TRUNK(ifv)); + NET_EPOCH_ASSERT(); p = PARENT(ifv); ifp = ifv->ifv_ifp; @@ -1704,6 +1762,30 @@ vlan_capabilities(struct ifvlan *ifv) ena |= (mena & IFCAP_TXRTLMT); #endif + /* + * If the parent interface supports unmapped mbufs, so does + * the VLAN interface. Note that this should be fine even for + * interfaces that don't support hardware tagging as headers + * are prepended in normal mbufs to unmapped mbufs holding + * payload data. + */ + cap |= (p->if_capabilities & IFCAP_NOMAP); + ena |= (mena & IFCAP_NOMAP); + + /* + * If the parent interface can offload encryption and segmentation + * of TLS records over TCP, propagate it's capability to the VLAN + * interface. + * + * All TLS drivers in the tree today can deal with VLANs. If + * this ever changes, then a new IFCAP_VLAN_TXTLS can be + * defined. + */ + if (p->if_capabilities & IFCAP_TXTLS) + cap |= p->if_capabilities & IFCAP_TXTLS; + if (p->if_capenable & IFCAP_TXTLS) + ena |= mena & IFCAP_TXTLS; + ifp->if_capabilities = cap; ifp->if_capenable = ena; ifp->if_hwassist = hwa; @@ -1712,6 +1794,7 @@ vlan_capabilities(struct ifvlan *ifv) static void vlan_trunk_capabilities(struct ifnet *ifp) { + struct epoch_tracker et; struct ifvlantrunk *trunk; struct ifvlan *ifv; @@ -1721,11 +1804,11 @@ vlan_trunk_capabilities(struct ifnet *ifp) VLAN_SUNLOCK(); return; } - TRUNK_RLOCK(trunk); + NET_EPOCH_ENTER(et); VLAN_FOREACH(ifv, trunk) { vlan_capabilities(ifv); } - TRUNK_RUNLOCK(trunk); + NET_EPOCH_EXIT(et); VLAN_SUNLOCK(); } @@ -1917,9 +2000,11 @@ vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) ifv->ifv_capenable = ifr->ifr_reqcap; trunk = TRUNK(ifv); if (trunk != NULL) { - TRUNK_RLOCK(trunk); + struct epoch_tracker et; + + NET_EPOCH_ENTER(et); vlan_capabilities(ifv); - TRUNK_RUNLOCK(trunk); + NET_EPOCH_EXIT(et); } VLAN_SUNLOCK(); break; @@ -1932,18 +2017,77 @@ vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) return (error); } -#ifdef RATELIMIT +#if defined(KERN_TLS) || defined(RATELIMIT) static int vlan_snd_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params, struct m_snd_tag **ppmt) { + struct epoch_tracker et; + struct vlan_snd_tag *vst; + struct ifvlan *ifv; + struct ifnet *parent; + int error; - /* get trunk device */ - ifp = vlan_trunkdev(ifp); - if (ifp == NULL || (ifp->if_capenable & IFCAP_TXRTLMT) == 0) + NET_EPOCH_ENTER(et); + ifv = ifp->if_softc; + if (ifv->ifv_trunk != NULL) + parent = PARENT(ifv); + else + parent = NULL; + if (parent == NULL || parent->if_snd_tag_alloc == NULL) { + NET_EPOCH_EXIT(et); return (EOPNOTSUPP); - /* forward allocation request */ - return (ifp->if_snd_tag_alloc(ifp, params, ppmt)); + } + if_ref(parent); + NET_EPOCH_EXIT(et); + + vst = malloc(sizeof(*vst), M_VLAN, M_NOWAIT); + if (vst == NULL) { + if_rele(parent); + return (ENOMEM); + } + + error = parent->if_snd_tag_alloc(parent, params, &vst->tag); + if_rele(parent); + if (error) { + free(vst, M_VLAN); + return (error); + } + + m_snd_tag_init(&vst->com, ifp); + + *ppmt = &vst->com; + return (0); +} + +static int +vlan_snd_tag_modify(struct m_snd_tag *mst, + union if_snd_tag_modify_params *params) +{ + struct vlan_snd_tag *vst; + + vst = mst_to_vst(mst); + return (vst->tag->ifp->if_snd_tag_modify(vst->tag, params)); +} + +static int +vlan_snd_tag_query(struct m_snd_tag *mst, + union if_snd_tag_query_params *params) +{ + struct vlan_snd_tag *vst; + + vst = mst_to_vst(mst); + return (vst->tag->ifp->if_snd_tag_query(vst->tag, params)); +} + +static void +vlan_snd_tag_free(struct m_snd_tag *mst) +{ + struct vlan_snd_tag *vst; + + vst = mst_to_vst(mst); + m_snd_tag_rele(vst->tag); + free(vst, M_VLAN); } #endif diff --git a/freebsd/sys/net/if_vlan_var.h b/freebsd/sys/net/if_vlan_var.h index 0b66ec0a..28b0fa73 100644 --- a/freebsd/sys/net/if_vlan_var.h +++ b/freebsd/sys/net/if_vlan_var.h @@ -150,13 +150,13 @@ extern int (*vlan_pcp_p)(struct ifnet *, uint16_t *); extern int (*vlan_setcookie_p)(struct ifnet *, void *); extern void *(*vlan_cookie_p)(struct ifnet *); -#ifdef _SYS_EVENTHANDLER_H_ +#include <sys/_eventhandler.h> + /* VLAN state change events */ typedef void (*vlan_config_fn)(void *, struct ifnet *, uint16_t); typedef void (*vlan_unconfig_fn)(void *, struct ifnet *, uint16_t); EVENTHANDLER_DECLARE(vlan_config, vlan_config_fn); EVENTHANDLER_DECLARE(vlan_unconfig, vlan_unconfig_fn); -#endif /* _SYS_EVENTHANDLER_H_ */ #endif /* _KERNEL */ diff --git a/freebsd/sys/net/iflib.h b/freebsd/sys/net/iflib.h index 8c2be41b..cda00c4c 100644 --- a/freebsd/sys/net/iflib.h +++ b/freebsd/sys/net/iflib.h @@ -69,6 +69,9 @@ typedef struct if_rxd_frag { uint16_t irf_len; } *if_rxd_frag_t; +/* bnxt supports 64 with hardware LRO enabled */ +#define IFLIB_MAX_RX_SEGS 64 + typedef struct if_rxd_info { /* set by iflib */ uint16_t iri_qsidx; /* qset index */ @@ -76,7 +79,7 @@ typedef struct if_rxd_info { /* XXX redundant with the new irf_len field */ uint16_t iri_len; /* packet length */ qidx_t iri_cidx; /* consumer index of cq */ - struct ifnet *iri_ifp; /* some drivers >1 interface per softc */ + if_t iri_ifp; /* driver may have >1 iface per softc */ /* updated by driver */ if_rxd_frag_t iri_frags; @@ -129,12 +132,12 @@ typedef struct if_pkt_info { uint8_t ipi_mflags; /* packet mbuf flags */ uint32_t ipi_tcp_seq; /* tcp seqno */ - uint32_t ipi_tcp_sum; /* tcp csum */ + uint32_t __spare0__; } *if_pkt_info_t; typedef struct if_irq { struct resource *ii_res; - int ii_rid; + int __spare0__; void *ii_tag; } *if_irq_t; @@ -163,7 +166,7 @@ typedef struct pci_vendor_info { uint32_t pvi_subdevice_id; uint32_t pvi_rev_id; uint32_t pvi_class_mask; - caddr_t pvi_name; + const char *pvi_name; } pci_vendor_info_t; #define PVID(vendor, devid, name) {vendor, devid, 0, 0, 0, 0, name} @@ -191,9 +194,8 @@ typedef struct if_softc_ctx { int isc_vectors; int isc_nrxqsets; int isc_ntxqsets; - uint8_t isc_min_tx_latency; /* disable doorbell update batching */ - uint8_t isc_rx_mvec_enable; /* generate mvecs on rx */ - uint32_t isc_txrx_budget_bytes_max; + uint16_t __spare0__; + uint32_t __spare1__; int isc_msix_bar; /* can be model specific - initialize in attach_pre */ int isc_tx_nsegments; /* can be model specific - initialize in attach_pre */ int isc_ntxd[8]; @@ -215,16 +217,23 @@ typedef struct if_softc_ctx { int isc_rss_table_mask; int isc_nrxqsets_max; int isc_ntxqsets_max; - uint32_t isc_tx_qdepth; + uint32_t __spare2__; iflib_intr_mode_t isc_intr; uint16_t isc_max_frame_size; /* set at init time by driver */ uint16_t isc_min_frame_size; /* set at init time by driver, only used if IFLIB_NEED_ETHER_PAD is set. */ uint32_t isc_pause_frames; /* set by driver for iflib_timer to detect */ - pci_vendor_info_t isc_vendor_info; /* set by iflib prior to attach_pre */ + uint32_t __spare3__; + uint32_t __spare4__; + uint32_t __spare5__; + uint32_t __spare6__; + uint32_t __spare7__; + uint32_t __spare8__; + caddr_t __spare9__; int isc_disable_msix; if_txrx_t isc_txrx; + struct ifmedia *isc_media; } *if_softc_ctx_t; /* @@ -244,8 +253,8 @@ struct if_shared_ctx { int isc_admin_intrcnt; /* # of admin/link interrupts */ /* fields necessary for probe */ - pci_vendor_info_t *isc_vendor_info; - char *isc_driver_version; + const pci_vendor_info_t *isc_vendor_info; + const char *isc_driver_version; /* optional function to transform the read values to match the table*/ void (*isc_parse_devinfo) (uint16_t *device_id, uint16_t *subvendor_id, uint16_t *subdevice_id, uint16_t *rev_id); @@ -260,7 +269,7 @@ struct if_shared_ctx { int isc_nfl __aligned(CACHE_LINE_SIZE); int isc_ntxqs; /* # of tx queues per tx qset - usually 1 */ int isc_nrxqs; /* # of rx queues per rx qset - intel 1, chelsio 2, broadcom 3 */ - int isc_rx_process_limit; + int __spare0__; int isc_tx_reclaim_thresh; int isc_flags; const char *isc_name; @@ -284,11 +293,6 @@ typedef enum { IFLIB_INTR_IOV, } iflib_intr_type_t; -#ifndef ETH_ADDR_LEN -#define ETH_ADDR_LEN 6 -#endif - - /* * Interface has a separate command queue for RX */ @@ -358,7 +362,10 @@ typedef enum { * Interface needs admin task to ignore interface up/down status */ #define IFLIB_ADMIN_ALWAYS_RUN 0x10000 - +/* + * Driver will pass the media + */ +#define IFLIB_DRIVER_MEDIA 0x20000 /* * field accessors @@ -378,6 +385,8 @@ void iflib_set_mac(if_ctx_t ctx, uint8_t mac[ETHER_ADDR_LEN]); void iflib_request_reset(if_ctx_t ctx); uint8_t iflib_in_detach(if_ctx_t ctx); +uint32_t iflib_get_rx_mbuf_sz(if_ctx_t ctx); + /* * If the driver can plug cleanly in to newbus use these */ @@ -388,6 +397,12 @@ int iflib_device_suspend(device_t); int iflib_device_resume(device_t); int iflib_device_shutdown(device_t); +/* + * Use this instead of iflib_device_probe if the driver should report + * BUS_PROBE_VENDOR instead of BUS_PROBE_DEFAULT. (For example, an out-of-tree + * driver based on iflib). + */ +int iflib_device_probe_vendor(device_t); int iflib_device_iov_init(device_t, uint16_t, const nvlist_t *); void iflib_device_iov_uninit(device_t); @@ -400,8 +415,6 @@ int iflib_device_iov_add_vf(device_t, uint16_t, const nvlist_t *); int iflib_device_register(device_t dev, void *softc, if_shared_ctx_t sctx, if_ctx_t *ctxp); int iflib_device_deregister(if_ctx_t); - - int iflib_irq_alloc(if_ctx_t, if_irq_t, int, driver_filter_t, void *filter_arg, driver_intr_t, void *arg, const char *name); int iflib_irq_alloc_generic(if_ctx_t ctx, if_irq_t irq, int rid, iflib_intr_type_t type, driver_filter_t *filter, @@ -410,33 +423,28 @@ void iflib_softirq_alloc_generic(if_ctx_t ctx, if_irq_t irq, iflib_intr_type_t t void iflib_irq_free(if_ctx_t ctx, if_irq_t irq); -void iflib_io_tqg_attach(struct grouptask *gt, void *uniq, int cpu, char *name); +void iflib_io_tqg_attach(struct grouptask *gt, void *uniq, int cpu, + const char *name); void iflib_config_gtask_init(void *ctx, struct grouptask *gtask, gtask_fn_t *fn, const char *name); - void iflib_config_gtask_deinit(struct grouptask *gtask); - - void iflib_tx_intr_deferred(if_ctx_t ctx, int txqid); void iflib_rx_intr_deferred(if_ctx_t ctx, int rxqid); void iflib_admin_intr_deferred(if_ctx_t ctx); void iflib_iov_intr_deferred(if_ctx_t ctx); - void iflib_link_state_change(if_ctx_t ctx, int linkstate, uint64_t baudrate); int iflib_dma_alloc(if_ctx_t ctx, int size, iflib_dma_info_t dma, int mapflags); +int iflib_dma_alloc_align(if_ctx_t ctx, int size, int align, iflib_dma_info_t dma, int mapflags); void iflib_dma_free(iflib_dma_info_t dma); - int iflib_dma_alloc_multi(if_ctx_t ctx, int *sizes, iflib_dma_info_t *dmalist, int mapflags, int count); void iflib_dma_free_multi(iflib_dma_info_t *dmalist, int count); - struct sx *iflib_ctx_lock_get(if_ctx_t); -struct mtx *iflib_qset_lock_get(if_ctx_t, uint16_t); void iflib_led_create(if_ctx_t ctx); @@ -448,4 +456,5 @@ void iflib_add_int_delay_sysctl(if_ctx_t, const char *, const char *, */ if_pseudo_t iflib_clone_register(if_shared_ctx_t); void iflib_clone_deregister(if_pseudo_t); + #endif /* __IFLIB_H_ */ diff --git a/freebsd/sys/net/netisr.c b/freebsd/sys/net/netisr.c index a3da964b..0f7c4800 100644 --- a/freebsd/sys/net/netisr.c +++ b/freebsd/sys/net/netisr.c @@ -868,6 +868,7 @@ netisr_select_cpuid(struct netisr_proto *npp, u_int dispatch_policy, ("%s: invalid policy %u for %s", __func__, npp->np_policy, npp->np_name)); + MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0); ifp = m->m_pkthdr.rcvif; if (ifp != NULL) *cpuidp = nws_array[(ifp->if_index + source) % nws_count]; diff --git a/freebsd/sys/net/pfil.c b/freebsd/sys/net/pfil.c index 65af515f..1dea915d 100644 --- a/freebsd/sys/net/pfil.c +++ b/freebsd/sys/net/pfil.c @@ -6,6 +6,7 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * + * Copyright (c) 2019 Gleb Smirnoff <glebius@FreeBSD.org> * Copyright (c) 1996 Matthew R. Green * All rights reserved. * @@ -34,445 +35,650 @@ */ #include <sys/param.h> +#include <sys/conf.h> #include <sys/kernel.h> +#include <sys/epoch.h> #include <sys/errno.h> #include <sys/lock.h> #include <sys/malloc.h> -#include <sys/rmlock.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/systm.h> -#include <sys/condvar.h> #include <sys/lock.h> #include <sys/mutex.h> #include <sys/proc.h> #include <sys/queue.h> +#include <sys/ucred.h> +#include <sys/jail.h> #include <net/if.h> #include <net/if_var.h> #include <net/pfil.h> -static struct mtx pfil_global_lock; - -MTX_SYSINIT(pfil_heads_lock, &pfil_global_lock, "pfil_head_list lock", - MTX_DEF); - -static struct packet_filter_hook *pfil_chain_get(int, struct pfil_head *); -static int pfil_chain_add(pfil_chain_t *, struct packet_filter_hook *, int); -static int pfil_chain_remove(pfil_chain_t *, void *, void *); -static int pfil_add_hook_priv(void *, void *, int, struct pfil_head *, bool); +static MALLOC_DEFINE(M_PFIL, "pfil", "pfil(9) packet filter hooks"); + +static int pfil_ioctl(struct cdev *, u_long, caddr_t, int, struct thread *); +static struct cdevsw pfil_cdevsw = { + .d_ioctl = pfil_ioctl, + .d_name = PFILDEV, + .d_version = D_VERSION, +}; +static struct cdev *pfil_dev; + +static struct mtx pfil_lock; +MTX_SYSINIT(pfil_mtxinit, &pfil_lock, "pfil(9) lock", MTX_DEF); +#define PFIL_LOCK() mtx_lock(&pfil_lock) +#define PFIL_UNLOCK() mtx_unlock(&pfil_lock) +#define PFIL_LOCK_ASSERT() mtx_assert(&pfil_lock, MA_OWNED) + +#define PFIL_EPOCH net_epoch_preempt +#define PFIL_EPOCH_ENTER(et) epoch_enter_preempt(net_epoch_preempt, &(et)) +#define PFIL_EPOCH_EXIT(et) epoch_exit_preempt(net_epoch_preempt, &(et)) + +struct pfil_hook { + pfil_func_t hook_func; + void *hook_ruleset; + int hook_flags; + int hook_links; + enum pfil_types hook_type; + const char *hook_modname; + const char *hook_rulname; + LIST_ENTRY(pfil_hook) hook_list; +}; + +struct pfil_link { + CK_STAILQ_ENTRY(pfil_link) link_chain; + pfil_func_t link_func; + void *link_ruleset; + int link_flags; + struct pfil_hook *link_hook; + struct epoch_context link_epoch_ctx; +}; + +typedef CK_STAILQ_HEAD(pfil_chain, pfil_link) pfil_chain_t; +struct pfil_head { + int head_nhooksin; + int head_nhooksout; + pfil_chain_t head_in; + pfil_chain_t head_out; + int head_flags; + enum pfil_types head_type; + LIST_ENTRY(pfil_head) head_list; + const char *head_name; +}; LIST_HEAD(pfilheadhead, pfil_head); -VNET_DEFINE(struct pfilheadhead, pfil_head_list); +VNET_DEFINE_STATIC(struct pfilheadhead, pfil_head_list) = + LIST_HEAD_INITIALIZER(pfil_head_list); #define V_pfil_head_list VNET(pfil_head_list) -VNET_DEFINE(struct rmlock, pfil_lock); - -#define PFIL_LOCK_INIT_REAL(l, t) \ - rm_init_flags(l, "PFil " t " rmlock", RM_RECURSE) -#define PFIL_LOCK_DESTROY_REAL(l) \ - rm_destroy(l) -#define PFIL_LOCK_INIT(p) do { \ - if ((p)->flags & PFIL_FLAG_PRIVATE_LOCK) { \ - PFIL_LOCK_INIT_REAL(&(p)->ph_lock, "private"); \ - (p)->ph_plock = &(p)->ph_lock; \ - } else \ - (p)->ph_plock = &V_pfil_lock; \ -} while (0) -#define PFIL_LOCK_DESTROY(p) do { \ - if ((p)->flags & PFIL_FLAG_PRIVATE_LOCK) \ - PFIL_LOCK_DESTROY_REAL((p)->ph_plock); \ -} while (0) - -#define PFIL_TRY_RLOCK(p, t) rm_try_rlock((p)->ph_plock, (t)) -#define PFIL_RLOCK(p, t) rm_rlock((p)->ph_plock, (t)) -#define PFIL_WLOCK(p) rm_wlock((p)->ph_plock) -#define PFIL_RUNLOCK(p, t) rm_runlock((p)->ph_plock, (t)) -#define PFIL_WUNLOCK(p) rm_wunlock((p)->ph_plock) -#define PFIL_WOWNED(p) rm_wowned((p)->ph_plock) - -#define PFIL_HEADLIST_LOCK() mtx_lock(&pfil_global_lock) -#define PFIL_HEADLIST_UNLOCK() mtx_unlock(&pfil_global_lock) -/* - * pfil_run_hooks() runs the specified packet filter hook chain. - */ +LIST_HEAD(pfilhookhead, pfil_hook); +VNET_DEFINE_STATIC(struct pfilhookhead, pfil_hook_list) = + LIST_HEAD_INITIALIZER(pfil_hook_list); +#define V_pfil_hook_list VNET(pfil_hook_list) + +static struct pfil_link *pfil_link_remove(pfil_chain_t *, pfil_hook_t ); +static void pfil_link_free(epoch_context_t); + int -pfil_run_hooks(struct pfil_head *ph, struct mbuf **mp, struct ifnet *ifp, - int dir, int flags, struct inpcb *inp) +pfil_realloc(pfil_packet_t *p, int flags, struct ifnet *ifp) { - struct rm_priotracker rmpt; - struct packet_filter_hook *pfh; - struct mbuf *m = *mp; - int rv = 0; - - PFIL_RLOCK(ph, &rmpt); - KASSERT(ph->ph_nhooks >= 0, ("Pfil hook count dropped < 0")); - for (pfh = pfil_chain_get(dir, ph); pfh != NULL; - pfh = TAILQ_NEXT(pfh, pfil_chain)) { - if (pfh->pfil_func_flags != NULL) { - rv = (*pfh->pfil_func_flags)(pfh->pfil_arg, &m, ifp, - dir, flags, inp); - if (rv != 0 || m == NULL) - break; - } - if (pfh->pfil_func != NULL) { - rv = (*pfh->pfil_func)(pfh->pfil_arg, &m, ifp, dir, - inp); - if (rv != 0 || m == NULL) - break; - } - } - PFIL_RUNLOCK(ph, &rmpt); - *mp = m; - return (rv); + struct mbuf *m; + + MPASS(flags & PFIL_MEMPTR); + + if ((m = m_devget(p->mem, PFIL_LENGTH(flags), 0, ifp, NULL)) == NULL) + return (ENOMEM); + *p = pfil_packet_align(*p); + *p->m = m; + + return (0); } -static struct packet_filter_hook * -pfil_chain_get(int dir, struct pfil_head *ph) +static __noinline int +pfil_fake_mbuf(pfil_func_t func, pfil_packet_t *p, struct ifnet *ifp, int flags, + void *ruleset, struct inpcb *inp) { + struct mbuf m, *mp; + pfil_return_t rv; + + (void)m_init(&m, M_NOWAIT, MT_DATA, M_NOFREE | M_PKTHDR); + m_extadd(&m, p->mem, PFIL_LENGTH(flags), NULL, NULL, NULL, 0, + EXT_RXRING); + m.m_len = m.m_pkthdr.len = PFIL_LENGTH(flags); + mp = &m; + flags &= ~(PFIL_MEMPTR | PFIL_LENMASK); + + rv = func(&mp, ifp, flags, ruleset, inp); + if (rv == PFIL_PASS && mp != &m) { + /* + * Firewalls that need pfil_fake_mbuf() most likely don't + * know they need return PFIL_REALLOCED. + */ + rv = PFIL_REALLOCED; + *p = pfil_packet_align(*p); + *p->m = mp; + } - if (dir == PFIL_IN) - return (TAILQ_FIRST(&ph->ph_in)); - else if (dir == PFIL_OUT) - return (TAILQ_FIRST(&ph->ph_out)); - else - return (NULL); + return (rv); } -#ifndef __rtems__ /* - * pfil_try_rlock() acquires rm reader lock for specified head - * if this is immediately possible. + * pfil_run_hooks() runs the specified packet filter hook chain. */ int -pfil_try_rlock(struct pfil_head *ph, struct rm_priotracker *tracker) +pfil_run_hooks(struct pfil_head *head, pfil_packet_t p, struct ifnet *ifp, + int flags, struct inpcb *inp) { - - return (PFIL_TRY_RLOCK(ph, tracker)); + struct epoch_tracker et; + pfil_chain_t *pch; + struct pfil_link *link; + pfil_return_t rv; + bool realloc = false; + + if (PFIL_DIR(flags) == PFIL_IN) + pch = &head->head_in; + else if (__predict_true(PFIL_DIR(flags) == PFIL_OUT)) + pch = &head->head_out; + else + panic("%s: bogus flags %d", __func__, flags); + + rv = PFIL_PASS; + PFIL_EPOCH_ENTER(et); + CK_STAILQ_FOREACH(link, pch, link_chain) { + if ((flags & PFIL_MEMPTR) && !(link->link_flags & PFIL_MEMPTR)) + rv = pfil_fake_mbuf(link->link_func, &p, ifp, flags, + link->link_ruleset, inp); + else + rv = (*link->link_func)(p, ifp, flags, + link->link_ruleset, inp); + if (rv == PFIL_DROPPED || rv == PFIL_CONSUMED) + break; + else if (rv == PFIL_REALLOCED) { + flags &= ~(PFIL_MEMPTR | PFIL_LENMASK); + realloc = true; + } + } + PFIL_EPOCH_EXIT(et); + if (realloc && rv == PFIL_PASS) + rv = PFIL_REALLOCED; + return (rv); } -#endif /* __rtems__ */ /* - * pfil_rlock() acquires rm reader lock for specified head. + * pfil_head_register() registers a pfil_head with the packet filter hook + * mechanism. */ -void -pfil_rlock(struct pfil_head *ph, struct rm_priotracker *tracker) +pfil_head_t +pfil_head_register(struct pfil_head_args *pa) { + struct pfil_head *head, *list; - PFIL_RLOCK(ph, tracker); -} + MPASS(pa->pa_version == PFIL_VERSION); -/* - * pfil_runlock() releases reader lock for specified head. - */ -void -pfil_runlock(struct pfil_head *ph, struct rm_priotracker *tracker) -{ + head = malloc(sizeof(struct pfil_head), M_PFIL, M_WAITOK); + + head->head_nhooksin = head->head_nhooksout = 0; + head->head_flags = pa->pa_flags; + head->head_type = pa->pa_type; + head->head_name = pa->pa_headname; + CK_STAILQ_INIT(&head->head_in); + CK_STAILQ_INIT(&head->head_out); + + PFIL_LOCK(); + LIST_FOREACH(list, &V_pfil_head_list, head_list) + if (strcmp(pa->pa_headname, list->head_name) == 0) { + printf("pfil: duplicate head \"%s\"\n", + pa->pa_headname); + } + LIST_INSERT_HEAD(&V_pfil_head_list, head, head_list); + PFIL_UNLOCK(); - PFIL_RUNLOCK(ph, tracker); + return (head); } /* - * pfil_wlock() acquires writer lock for specified head. + * pfil_head_unregister() removes a pfil_head from the packet filter hook + * mechanism. The producer of the hook promises that all outstanding + * invocations of the hook have completed before it unregisters the hook. */ void -pfil_wlock(struct pfil_head *ph) +pfil_head_unregister(pfil_head_t ph) { + struct pfil_link *link, *next; + + PFIL_LOCK(); + LIST_REMOVE(ph, head_list); - PFIL_WLOCK(ph); + CK_STAILQ_FOREACH_SAFE(link, &ph->head_in, link_chain, next) { + link->link_hook->hook_links--; + free(link, M_PFIL); + } + CK_STAILQ_FOREACH_SAFE(link, &ph->head_out, link_chain, next) { + link->link_hook->hook_links--; + free(link, M_PFIL); + } + PFIL_UNLOCK(); } -/* - * pfil_wunlock() releases writer lock for specified head. - */ -void -pfil_wunlock(struct pfil_head *ph) +pfil_hook_t +pfil_add_hook(struct pfil_hook_args *pa) { + struct pfil_hook *hook, *list; + + MPASS(pa->pa_version == PFIL_VERSION); + + hook = malloc(sizeof(struct pfil_hook), M_PFIL, M_WAITOK | M_ZERO); + hook->hook_func = pa->pa_func; + hook->hook_ruleset = pa->pa_ruleset; + hook->hook_flags = pa->pa_flags; + hook->hook_type = pa->pa_type; + hook->hook_modname = pa->pa_modname; + hook->hook_rulname = pa->pa_rulname; + + PFIL_LOCK(); + LIST_FOREACH(list, &V_pfil_hook_list, hook_list) + if (strcmp(pa->pa_modname, list->hook_modname) == 0 && + strcmp(pa->pa_rulname, list->hook_rulname) == 0) { + printf("pfil: duplicate hook \"%s:%s\"\n", + pa->pa_modname, pa->pa_rulname); + } + LIST_INSERT_HEAD(&V_pfil_hook_list, hook, hook_list); + PFIL_UNLOCK(); - PFIL_WUNLOCK(ph); + return (hook); } -/* - * pfil_wowned() returns a non-zero value if the current thread owns - * an exclusive lock. - */ -int -pfil_wowned(struct pfil_head *ph) +static int +pfil_unlink(struct pfil_link_args *pa, pfil_head_t head, pfil_hook_t hook) { + struct pfil_link *in, *out; + + PFIL_LOCK_ASSERT(); - return (PFIL_WOWNED(ph)); + if (pa->pa_flags & PFIL_IN) { + in = pfil_link_remove(&head->head_in, hook); + if (in != NULL) { + head->head_nhooksin--; + hook->hook_links--; + } + } else + in = NULL; + if (pa->pa_flags & PFIL_OUT) { + out = pfil_link_remove(&head->head_out, hook); + if (out != NULL) { + head->head_nhooksout--; + hook->hook_links--; + } + } else + out = NULL; + PFIL_UNLOCK(); + + if (in != NULL) + epoch_call(PFIL_EPOCH, &in->link_epoch_ctx, pfil_link_free); + if (out != NULL) + epoch_call(PFIL_EPOCH, &out->link_epoch_ctx, pfil_link_free); + + if (in == NULL && out == NULL) + return (ENOENT); + else + return (0); } -/* - * pfil_head_register() registers a pfil_head with the packet filter hook - * mechanism. - */ int -pfil_head_register(struct pfil_head *ph) +pfil_link(struct pfil_link_args *pa) { - struct pfil_head *lph; - - PFIL_HEADLIST_LOCK(); - LIST_FOREACH(lph, &V_pfil_head_list, ph_list) { - if (ph->ph_type == lph->ph_type && - ph->ph_un.phu_val == lph->ph_un.phu_val) { - PFIL_HEADLIST_UNLOCK(); - return (EEXIST); - } + struct pfil_link *in, *out, *link; + struct pfil_head *head; + struct pfil_hook *hook; + int error; + + MPASS(pa->pa_version == PFIL_VERSION); + + if ((pa->pa_flags & (PFIL_IN | PFIL_UNLINK)) == PFIL_IN) + in = malloc(sizeof(*in), M_PFIL, M_WAITOK | M_ZERO); + else + in = NULL; + if ((pa->pa_flags & (PFIL_OUT | PFIL_UNLINK)) == PFIL_OUT) + out = malloc(sizeof(*out), M_PFIL, M_WAITOK | M_ZERO); + else + out = NULL; + + PFIL_LOCK(); + if (pa->pa_flags & PFIL_HEADPTR) + head = pa->pa_head; + else + LIST_FOREACH(head, &V_pfil_head_list, head_list) + if (strcmp(pa->pa_headname, head->head_name) == 0) + break; + if (pa->pa_flags & PFIL_HOOKPTR) + hook = pa->pa_hook; + else + LIST_FOREACH(hook, &V_pfil_hook_list, hook_list) + if (strcmp(pa->pa_modname, hook->hook_modname) == 0 && + strcmp(pa->pa_rulname, hook->hook_rulname) == 0) + break; + if (head == NULL || hook == NULL) { + error = ENOENT; + goto fail; + } + + if (pa->pa_flags & PFIL_UNLINK) + return (pfil_unlink(pa, head, hook)); + + if (head->head_type != hook->hook_type || + ((hook->hook_flags & pa->pa_flags) & ~head->head_flags)) { + error = EINVAL; + goto fail; + } + + if (pa->pa_flags & PFIL_IN) + CK_STAILQ_FOREACH(link, &head->head_in, link_chain) + if (link->link_hook == hook) { + error = EEXIST; + goto fail; + } + if (pa->pa_flags & PFIL_OUT) + CK_STAILQ_FOREACH(link, &head->head_out, link_chain) + if (link->link_hook == hook) { + error = EEXIST; + goto fail; + } + + if (pa->pa_flags & PFIL_IN) { + in->link_hook = hook; + in->link_func = hook->hook_func; + in->link_flags = hook->hook_flags; + in->link_ruleset = hook->hook_ruleset; + if (pa->pa_flags & PFIL_APPEND) + CK_STAILQ_INSERT_TAIL(&head->head_in, in, link_chain); + else + CK_STAILQ_INSERT_HEAD(&head->head_in, in, link_chain); + hook->hook_links++; + head->head_nhooksin++; + } + if (pa->pa_flags & PFIL_OUT) { + out->link_hook = hook; + out->link_func = hook->hook_func; + out->link_flags = hook->hook_flags; + out->link_ruleset = hook->hook_ruleset; + if (pa->pa_flags & PFIL_APPEND) + CK_STAILQ_INSERT_HEAD(&head->head_out, out, link_chain); + else + CK_STAILQ_INSERT_TAIL(&head->head_out, out, link_chain); + hook->hook_links++; + head->head_nhooksout++; } - PFIL_LOCK_INIT(ph); - ph->ph_nhooks = 0; - TAILQ_INIT(&ph->ph_in); - TAILQ_INIT(&ph->ph_out); - LIST_INSERT_HEAD(&V_pfil_head_list, ph, ph_list); - PFIL_HEADLIST_UNLOCK(); + PFIL_UNLOCK(); + return (0); + +fail: + PFIL_UNLOCK(); + free(in, M_PFIL); + free(out, M_PFIL); + return (error); +} + +static void +pfil_link_free(epoch_context_t ctx) +{ + struct pfil_link *link; + + link = __containerof(ctx, struct pfil_link, link_epoch_ctx); + free(link, M_PFIL); } /* - * pfil_head_unregister() removes a pfil_head from the packet filter hook - * mechanism. The producer of the hook promises that all outstanding - * invocations of the hook have completed before it unregisters the hook. + * pfil_remove_hook removes a filter from all filtering points. */ -int -pfil_head_unregister(struct pfil_head *ph) +void +pfil_remove_hook(pfil_hook_t hook) { - struct packet_filter_hook *pfh, *pfnext; - - PFIL_HEADLIST_LOCK(); - LIST_REMOVE(ph, ph_list); - PFIL_HEADLIST_UNLOCK(); - TAILQ_FOREACH_SAFE(pfh, &ph->ph_in, pfil_chain, pfnext) - free(pfh, M_IFADDR); - TAILQ_FOREACH_SAFE(pfh, &ph->ph_out, pfil_chain, pfnext) - free(pfh, M_IFADDR); - PFIL_LOCK_DESTROY(ph); - return (0); + struct pfil_head *head; + struct pfil_link *in, *out; + + PFIL_LOCK(); + LIST_FOREACH(head, &V_pfil_head_list, head_list) { +retry: + in = pfil_link_remove(&head->head_in, hook); + if (in != NULL) { + head->head_nhooksin--; + hook->hook_links--; + epoch_call(PFIL_EPOCH, &in->link_epoch_ctx, + pfil_link_free); + } + out = pfil_link_remove(&head->head_out, hook); + if (out != NULL) { + head->head_nhooksout--; + hook->hook_links--; + epoch_call(PFIL_EPOCH, &out->link_epoch_ctx, + pfil_link_free); + } + if (in != NULL || out != NULL) + /* What if some stupid admin put same filter twice? */ + goto retry; + } + LIST_REMOVE(hook, hook_list); + PFIL_UNLOCK(); + MPASS(hook->hook_links == 0); + free(hook, M_PFIL); } /* - * pfil_head_get() returns the pfil_head for a given key/dlt. + * Internal: Remove a pfil hook from a hook chain. */ -struct pfil_head * -pfil_head_get(int type, u_long val) +static struct pfil_link * +pfil_link_remove(pfil_chain_t *chain, pfil_hook_t hook) { - struct pfil_head *ph; + struct pfil_link *link; - PFIL_HEADLIST_LOCK(); - LIST_FOREACH(ph, &V_pfil_head_list, ph_list) - if (ph->ph_type == type && ph->ph_un.phu_val == val) - break; - PFIL_HEADLIST_UNLOCK(); - return (ph); + PFIL_LOCK_ASSERT(); + + CK_STAILQ_FOREACH(link, chain, link_chain) + if (link->link_hook == hook) { + CK_STAILQ_REMOVE(chain, link, pfil_link, link_chain); + return (link); + } + + return (NULL); } -/* - * pfil_add_hook_flags() adds a function to the packet filter hook. the - * flags are: - * PFIL_IN call me on incoming packets - * PFIL_OUT call me on outgoing packets - * PFIL_ALL call me on all of the above - * PFIL_WAITOK OK to call malloc with M_WAITOK. - */ -int -pfil_add_hook_flags(pfil_func_flags_t func, void *arg, int flags, - struct pfil_head *ph) +static void +pfil_init(const void *unused __unused) { - return (pfil_add_hook_priv(func, arg, flags, ph, true)); + struct make_dev_args args; + int error; + + make_dev_args_init(&args); + args.mda_flags = MAKEDEV_WAITOK | MAKEDEV_CHECKNAME; + args.mda_devsw = &pfil_cdevsw; + args.mda_uid = UID_ROOT; + args.mda_gid = GID_WHEEL; + args.mda_mode = 0600; + error = make_dev_s(&args, &pfil_dev, PFILDEV); + KASSERT(error == 0, ("%s: failed to create dev: %d", __func__, error)); } +/* + * Make sure the pfil bits are first before any possible subsystem which + * might piggyback on the SI_SUB_PROTO_PFIL. + */ +SYSINIT(pfil_init, SI_SUB_PROTO_PFIL, SI_ORDER_FIRST, pfil_init, NULL); /* - * pfil_add_hook() adds a function to the packet filter hook. the - * flags are: - * PFIL_IN call me on incoming packets - * PFIL_OUT call me on outgoing packets - * PFIL_ALL call me on all of the above - * PFIL_WAITOK OK to call malloc with M_WAITOK. + * User control interface. */ -int -pfil_add_hook(pfil_func_t func, void *arg, int flags, struct pfil_head *ph) +static int pfilioc_listheads(struct pfilioc_list *); +static int pfilioc_listhooks(struct pfilioc_list *); +static int pfilioc_link(struct pfilioc_link *); + +static int +pfil_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, + struct thread *td) { - return (pfil_add_hook_priv(func, arg, flags, ph, false)); + int error; + + CURVNET_SET(TD_TO_VNET(td)); + error = 0; + switch (cmd) { + case PFILIOC_LISTHEADS: + error = pfilioc_listheads((struct pfilioc_list *)addr); + break; + case PFILIOC_LISTHOOKS: + error = pfilioc_listhooks((struct pfilioc_list *)addr); + break; + case PFILIOC_LINK: + error = pfilioc_link((struct pfilioc_link *)addr); + break; + default: + error = EINVAL; + break; + } + CURVNET_RESTORE(); + return (error); } static int -pfil_add_hook_priv(void *func, void *arg, int flags, - struct pfil_head *ph, bool hasflags) +pfilioc_listheads(struct pfilioc_list *req) { - struct packet_filter_hook *pfh1 = NULL; - struct packet_filter_hook *pfh2 = NULL; - int err; - - if (flags & PFIL_IN) { - pfh1 = (struct packet_filter_hook *)malloc(sizeof(*pfh1), - M_IFADDR, (flags & PFIL_WAITOK) ? M_WAITOK : M_NOWAIT); - if (pfh1 == NULL) { - err = ENOMEM; - goto error; - } - } - if (flags & PFIL_OUT) { - pfh2 = (struct packet_filter_hook *)malloc(sizeof(*pfh1), - M_IFADDR, (flags & PFIL_WAITOK) ? M_WAITOK : M_NOWAIT); - if (pfh2 == NULL) { - err = ENOMEM; - goto error; - } + struct pfil_head *head; + struct pfil_link *link; + struct pfilioc_head *iohead; + struct pfilioc_hook *iohook; + u_int nheads, nhooks, hd, hk; + int error; + + PFIL_LOCK(); +restart: + nheads = nhooks = 0; + LIST_FOREACH(head, &V_pfil_head_list, head_list) { + nheads++; + nhooks += head->head_nhooksin + head->head_nhooksout; } - PFIL_WLOCK(ph); - if (flags & PFIL_IN) { - pfh1->pfil_func_flags = hasflags ? func : NULL; - pfh1->pfil_func = hasflags ? NULL : func; - pfh1->pfil_arg = arg; - err = pfil_chain_add(&ph->ph_in, pfh1, flags & ~PFIL_OUT); - if (err) - goto locked_error; - ph->ph_nhooks++; + PFIL_UNLOCK(); + + if (req->pio_nheads < nheads || req->pio_nhooks < nhooks) { + req->pio_nheads = nheads; + req->pio_nhooks = nhooks; + return (0); } - if (flags & PFIL_OUT) { - pfh2->pfil_func_flags = hasflags ? func : NULL; - pfh2->pfil_func = hasflags ? NULL : func; - pfh2->pfil_arg = arg; - err = pfil_chain_add(&ph->ph_out, pfh2, flags & ~PFIL_IN); - if (err) { - if (flags & PFIL_IN) - pfil_chain_remove(&ph->ph_in, func, arg); - goto locked_error; + + iohead = malloc(sizeof(*iohead) * nheads, M_TEMP, M_WAITOK); + iohook = malloc(sizeof(*iohook) * nhooks, M_TEMP, M_WAITOK); + + hd = hk = 0; + PFIL_LOCK(); + LIST_FOREACH(head, &V_pfil_head_list, head_list) { + if (hd + 1 > nheads || + hk + head->head_nhooksin + head->head_nhooksout > nhooks) { + /* Configuration changed during malloc(). */ + free(iohead, M_TEMP); + free(iohook, M_TEMP); + goto restart; + } + strlcpy(iohead[hd].pio_name, head->head_name, + sizeof(iohead[0].pio_name)); + iohead[hd].pio_nhooksin = head->head_nhooksin; + iohead[hd].pio_nhooksout = head->head_nhooksout; + iohead[hd].pio_type = head->head_type; + CK_STAILQ_FOREACH(link, &head->head_in, link_chain) { + strlcpy(iohook[hk].pio_module, + link->link_hook->hook_modname, + sizeof(iohook[0].pio_module)); + strlcpy(iohook[hk].pio_ruleset, + link->link_hook->hook_rulname, + sizeof(iohook[0].pio_ruleset)); + hk++; } - ph->ph_nhooks++; + CK_STAILQ_FOREACH(link, &head->head_out, link_chain) { + strlcpy(iohook[hk].pio_module, + link->link_hook->hook_modname, + sizeof(iohook[0].pio_module)); + strlcpy(iohook[hk].pio_ruleset, + link->link_hook->hook_rulname, + sizeof(iohook[0].pio_ruleset)); + hk++; + } + hd++; } - PFIL_WUNLOCK(ph); - return (0); -locked_error: - PFIL_WUNLOCK(ph); -error: - if (pfh1 != NULL) - free(pfh1, M_IFADDR); - if (pfh2 != NULL) - free(pfh2, M_IFADDR); - return (err); -} + PFIL_UNLOCK(); -/* - * pfil_remove_hook_flags removes a specific function from the packet filter hook - * chain. - */ -int -pfil_remove_hook_flags(pfil_func_flags_t func, void *arg, int flags, - struct pfil_head *ph) -{ - return (pfil_remove_hook((pfil_func_t)func, arg, flags, ph)); -} + error = copyout(iohead, req->pio_heads, + sizeof(*iohead) * min(hd, req->pio_nheads)); + if (error == 0) + error = copyout(iohook, req->pio_hooks, + sizeof(*iohook) * min(req->pio_nhooks, hk)); -/* - * pfil_remove_hook removes a specific function from the packet filter hook - * chain. - */ -int -pfil_remove_hook(pfil_func_t func, void *arg, int flags, struct pfil_head *ph) -{ - int err = 0; + req->pio_nheads = hd; + req->pio_nhooks = hk; - PFIL_WLOCK(ph); - if (flags & PFIL_IN) { - err = pfil_chain_remove(&ph->ph_in, func, arg); - if (err == 0) - ph->ph_nhooks--; - } - if ((err == 0) && (flags & PFIL_OUT)) { - err = pfil_chain_remove(&ph->ph_out, func, arg); - if (err == 0) - ph->ph_nhooks--; - } - PFIL_WUNLOCK(ph); - return (err); -} + free(iohead, M_TEMP); + free(iohook, M_TEMP); -/* - * Internal: Add a new pfil hook into a hook chain. - */ -static int -pfil_chain_add(pfil_chain_t *chain, struct packet_filter_hook *pfh1, int flags) -{ - struct packet_filter_hook *pfh; - - /* - * First make sure the hook is not already there. - */ - TAILQ_FOREACH(pfh, chain, pfil_chain) - if (((pfh->pfil_func != NULL && pfh->pfil_func == pfh1->pfil_func) || - (pfh->pfil_func_flags != NULL && - pfh->pfil_func_flags == pfh1->pfil_func_flags)) && - pfh->pfil_arg == pfh1->pfil_arg) - return (EEXIST); - - /* - * Insert the input list in reverse order of the output list so that - * the same path is followed in or out of the kernel. - */ - if (flags & PFIL_IN) - TAILQ_INSERT_HEAD(chain, pfh1, pfil_chain); - else - TAILQ_INSERT_TAIL(chain, pfh1, pfil_chain); - return (0); + return (error); } -/* - * Internal: Remove a pfil hook from a hook chain. - */ static int -pfil_chain_remove(pfil_chain_t *chain, void *func, void *arg) +pfilioc_listhooks(struct pfilioc_list *req) { - struct packet_filter_hook *pfh; - - TAILQ_FOREACH(pfh, chain, pfil_chain) - if ((pfh->pfil_func == func || pfh->pfil_func_flags == func) && - pfh->pfil_arg == arg) { - TAILQ_REMOVE(chain, pfh, pfil_chain); - free(pfh, M_IFADDR); - return (0); + struct pfil_hook *hook; + struct pfilioc_hook *iohook; + u_int nhooks, hk; + int error; + + PFIL_LOCK(); +restart: + nhooks = 0; + LIST_FOREACH(hook, &V_pfil_hook_list, hook_list) + nhooks++; + PFIL_UNLOCK(); + + if (req->pio_nhooks < nhooks) { + req->pio_nhooks = nhooks; + return (0); + } + + iohook = malloc(sizeof(*iohook) * nhooks, M_TEMP, M_WAITOK); + + hk = 0; + PFIL_LOCK(); + LIST_FOREACH(hook, &V_pfil_hook_list, hook_list) { + if (hk + 1 > nhooks) { + /* Configuration changed during malloc(). */ + free(iohook, M_TEMP); + goto restart; } - return (ENOENT); -} + strlcpy(iohook[hk].pio_module, hook->hook_modname, + sizeof(iohook[0].pio_module)); + strlcpy(iohook[hk].pio_ruleset, hook->hook_rulname, + sizeof(iohook[0].pio_ruleset)); + iohook[hk].pio_type = hook->hook_type; + iohook[hk].pio_flags = hook->hook_flags; + hk++; + } + PFIL_UNLOCK(); -/* - * Stuff that must be initialized for every instance (including the first of - * course). - */ -static void -vnet_pfil_init(const void *unused __unused) -{ + error = copyout(iohook, req->pio_hooks, + sizeof(*iohook) * min(req->pio_nhooks, hk)); + req->pio_nhooks = hk; + free(iohook, M_TEMP); - LIST_INIT(&V_pfil_head_list); - PFIL_LOCK_INIT_REAL(&V_pfil_lock, "shared"); + return (error); } -/* - * Called for the removal of each instance. - */ -static void -vnet_pfil_uninit(const void *unused __unused) +static int +pfilioc_link(struct pfilioc_link *req) { + struct pfil_link_args args; - KASSERT(LIST_EMPTY(&V_pfil_head_list), - ("%s: pfil_head_list %p not empty", __func__, &V_pfil_head_list)); - PFIL_LOCK_DESTROY_REAL(&V_pfil_lock); -} + if (req->pio_flags & ~(PFIL_IN | PFIL_OUT | PFIL_UNLINK | PFIL_APPEND)) + return (EINVAL); -/* - * Starting up. - * - * VNET_SYSINIT is called for each existing vnet and each new vnet. - * Make sure the pfil bits are first before any possible subsystem which - * might piggyback on the SI_SUB_PROTO_PFIL. - */ -VNET_SYSINIT(vnet_pfil_init, SI_SUB_PROTO_PFIL, SI_ORDER_FIRST, - vnet_pfil_init, NULL); - -/* - * Closing up shop. These are done in REVERSE ORDER. Not called on reboot. - * - * VNET_SYSUNINIT is called for each exiting vnet as it exits. - */ -VNET_SYSUNINIT(vnet_pfil_uninit, SI_SUB_PROTO_PFIL, SI_ORDER_FIRST, - vnet_pfil_uninit, NULL); + args.pa_version = PFIL_VERSION; + args.pa_flags = req->pio_flags; + args.pa_headname = req->pio_name; + args.pa_modname = req->pio_module; + args.pa_rulname = req->pio_ruleset; + + return (pfil_link(&args)); +} diff --git a/freebsd/sys/net/pfil.h b/freebsd/sys/net/pfil.h index 8fdaf5a6..da045b30 100644 --- a/freebsd/sys/net/pfil.h +++ b/freebsd/sys/net/pfil.h @@ -4,6 +4,7 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * + * Copyright (c) 2019 Gleb Smirnoff <glebius@FreeBSD.org> * Copyright (c) 1996 Matthew R. Green * All rights reserved. * @@ -34,98 +35,180 @@ #ifndef _NET_PFIL_H_ #define _NET_PFIL_H_ -#include <sys/systm.h> -#include <sys/queue.h> -#include <sys/_lock.h> -#include <sys/_mutex.h> -#include <sys/lock.h> -#include <sys/rmlock.h> -#include <net/vnet.h> +#include <sys/ioccom.h> +enum pfil_types { + PFIL_TYPE_IP4, + PFIL_TYPE_IP6, + PFIL_TYPE_ETHERNET, +}; + +#define MAXPFILNAME 64 + +struct pfilioc_head { + char pio_name[MAXPFILNAME]; + int pio_nhooksin; + int pio_nhooksout; + enum pfil_types pio_type; +}; + +struct pfilioc_hook { + char pio_module[MAXPFILNAME]; + char pio_ruleset[MAXPFILNAME]; + int pio_flags; + enum pfil_types pio_type; +}; + +struct pfilioc_list { + u_int pio_nheads; + u_int pio_nhooks; + struct pfilioc_head *pio_heads; + struct pfilioc_hook *pio_hooks; +}; + +struct pfilioc_link { + char pio_name[MAXPFILNAME]; + char pio_module[MAXPFILNAME]; + char pio_ruleset[MAXPFILNAME]; + int pio_flags; +}; + +#define PFILDEV "pfil" +#define PFILIOC_LISTHEADS _IOWR('P', 1, struct pfilioc_list) +#define PFILIOC_LISTHOOKS _IOWR('P', 2, struct pfilioc_list) +#define PFILIOC_LINK _IOW('P', 3, struct pfilioc_link) + +#define PFIL_IN 0x00010000 +#define PFIL_OUT 0x00020000 +#define PFIL_FWD 0x00040000 +#define PFIL_DIR(f) ((f) & (PFIL_IN|PFIL_OUT)) +#define PFIL_MEMPTR 0x00080000 +#define PFIL_HEADPTR 0x00100000 +#define PFIL_HOOKPTR 0x00200000 +#define PFIL_APPEND 0x00400000 +#define PFIL_UNLINK 0x00800000 +#define PFIL_LENMASK 0x0000ffff +#define PFIL_LENGTH(f) ((f) & PFIL_LENMASK) + +#ifdef _KERNEL struct mbuf; struct ifnet; struct inpcb; -typedef int (*pfil_func_t)(void *, struct mbuf **, struct ifnet *, int, - struct inpcb *); -typedef int (*pfil_func_flags_t)(void *, struct mbuf **, struct ifnet *, - int, int, struct inpcb *); +typedef union { + struct mbuf **m; + void *mem; + uintptr_t __ui; +} pfil_packet_t __attribute__((__transparent_union__)); + +static inline pfil_packet_t +pfil_packet_align(pfil_packet_t p) +{ + + return ((pfil_packet_t ) (((uintptr_t)(p).mem + + (_Alignof(void *) - 1)) & - _Alignof(void *))); +} + +static inline struct mbuf * +pfil_mem2mbuf(void *v) +{ + + return (*(struct mbuf **) (((uintptr_t)(v) + + (_Alignof(void *) - 1)) & - _Alignof(void *))); +} + +typedef enum { + PFIL_PASS = 0, + PFIL_DROPPED, + PFIL_CONSUMED, + PFIL_REALLOCED, +} pfil_return_t; + +typedef pfil_return_t (*pfil_func_t)(pfil_packet_t, struct ifnet *, int, + void *, struct inpcb *); +/* + * A pfil head is created by a packet intercept point. + * + * A pfil hook is created by a packet filter. + * + * Hooks are chained on heads. Historically some hooking happens + * automatically, e.g. ipfw(4), pf(4) and ipfilter(4) would register + * theirselves on IPv4 and IPv6 input/output. + */ + +typedef struct pfil_hook * pfil_hook_t; +typedef struct pfil_head * pfil_head_t; /* - * The packet filter hooks are designed for anything to call them to - * possibly intercept the packet. Multiple filter hooks are chained - * together and after each other in the specified order. + * Give us a chance to modify pfil_xxx_args structures in future. */ -struct packet_filter_hook { - TAILQ_ENTRY(packet_filter_hook) pfil_chain; - pfil_func_t pfil_func; - pfil_func_flags_t pfil_func_flags; - void *pfil_arg; +#define PFIL_VERSION 1 + +/* Argument structure used by packet filters to register themselves. */ +struct pfil_hook_args { + int pa_version; + int pa_flags; + enum pfil_types pa_type; + pfil_func_t pa_func; + void *pa_ruleset; + const char *pa_modname; + const char *pa_rulname; }; -#define PFIL_IN 0x00000001 -#define PFIL_OUT 0x00000002 -#define PFIL_WAITOK 0x00000004 -#define PFIL_FWD 0x00000008 -#define PFIL_ALL (PFIL_IN|PFIL_OUT) +/* Public functions for pfil hook management by packet filters. */ +pfil_hook_t pfil_add_hook(struct pfil_hook_args *); +void pfil_remove_hook(pfil_hook_t); -typedef TAILQ_HEAD(pfil_chain, packet_filter_hook) pfil_chain_t; +/* Argument structure used by ioctl() and packet filters to set filters. */ +struct pfil_link_args { + int pa_version; + int pa_flags; + union { + const char *pa_headname; + pfil_head_t pa_head; + }; + union { + struct { + const char *pa_modname; + const char *pa_rulname; + }; + pfil_hook_t pa_hook; + }; +}; -#define PFIL_TYPE_AF 1 /* key is AF_* type */ -#define PFIL_TYPE_IFNET 2 /* key is ifnet pointer */ +/* Public function to configure filter chains. Used by ioctl() and filters. */ +int pfil_link(struct pfil_link_args *); -#define PFIL_FLAG_PRIVATE_LOCK 0x01 /* Personal lock instead of global */ +/* Argument structure used by inspection points to register themselves. */ +struct pfil_head_args { + int pa_version; + int pa_flags; + enum pfil_types pa_type; + const char *pa_headname; +}; +/* Public functions for pfil head management by inspection points. */ +pfil_head_t pfil_head_register(struct pfil_head_args *); +void pfil_head_unregister(pfil_head_t); + +/* Public functions to run the packet inspection by inspection points. */ +int pfil_run_hooks(struct pfil_head *, pfil_packet_t, struct ifnet *, int, + struct inpcb *inp); /* - * A pfil head is created by each protocol or packet intercept point. - * For packet is then run through the hook chain for inspection. + * Minimally exposed structure to avoid function call in case of absence + * of any filters by protocols and macros to do the check. */ -struct pfil_head { - pfil_chain_t ph_in; - pfil_chain_t ph_out; - int ph_type; - int ph_nhooks; -#if defined( __linux__ ) || defined( _WIN32 ) - rwlock_t ph_mtx; -#else - struct rmlock *ph_plock; /* Pointer to the used lock */ - struct rmlock ph_lock; /* Private lock storage */ - int flags; -#endif - union { - u_long phu_val; - void *phu_ptr; - } ph_un; -#define ph_af ph_un.phu_val -#define ph_ifnet ph_un.phu_ptr - LIST_ENTRY(pfil_head) ph_list; +struct _pfil_head { + int head_nhooksin; + int head_nhooksout; }; +#define PFIL_HOOKED_IN(p) (((struct _pfil_head *)(p))->head_nhooksin > 0) +#define PFIL_HOOKED_OUT(p) (((struct _pfil_head *)(p))->head_nhooksout > 0) -VNET_DECLARE(struct rmlock, pfil_lock); -#define V_pfil_lock VNET(pfil_lock) - -/* Public functions for pfil hook management by packet filters. */ -struct pfil_head *pfil_head_get(int, u_long); -int pfil_add_hook_flags(pfil_func_flags_t, void *, int, struct pfil_head *); -int pfil_add_hook(pfil_func_t, void *, int, struct pfil_head *); -int pfil_remove_hook_flags(pfil_func_flags_t, void *, int, struct pfil_head *); -int pfil_remove_hook(pfil_func_t, void *, int, struct pfil_head *); -#define PFIL_HOOKED(p) ((p)->ph_nhooks > 0) - -/* Public functions to run the packet inspection by protocols. */ -int pfil_run_hooks(struct pfil_head *, struct mbuf **, struct ifnet *, int, - int, struct inpcb *inp); - -/* Public functions for pfil head management by protocols. */ -int pfil_head_register(struct pfil_head *); -int pfil_head_unregister(struct pfil_head *); - -/* Public pfil locking functions for self managed locks by packet filters. */ -int pfil_try_rlock(struct pfil_head *, struct rm_priotracker *); -void pfil_rlock(struct pfil_head *, struct rm_priotracker *); -void pfil_runlock(struct pfil_head *, struct rm_priotracker *); -void pfil_wlock(struct pfil_head *); -void pfil_wunlock(struct pfil_head *); -int pfil_wowned(struct pfil_head *ph); +/* + * Alloc mbuf to be used instead of memory pointer. + */ +int pfil_realloc(pfil_packet_t *, int, struct ifnet *); +#endif /* _KERNEL */ #endif /* _NET_PFIL_H_ */ diff --git a/freebsd/sys/net/pfvar.h b/freebsd/sys/net/pfvar.h index 2924c06d..bfa7e773 100644 --- a/freebsd/sys/net/pfvar.h +++ b/freebsd/sys/net/pfvar.h @@ -41,6 +41,7 @@ #include <sys/cpuset.h> #include <sys/malloc.h> #include <sys/refcount.h> +#include <sys/sysctl.h> #include <sys/lock.h> #include <sys/rmlock.h> #include <sys/tree.h> @@ -95,6 +96,9 @@ struct pf_addr_wrap { #ifdef _KERNEL +SYSCTL_DECL(_net_pf); +MALLOC_DECLARE(M_PFHASH); + struct pfi_dynaddr { TAILQ_ENTRY(pfi_dynaddr) entry; struct pf_addr pfid_addr4; @@ -1017,6 +1021,17 @@ struct pfr_tstats { int pfrts_cnt; int pfrts_refcnt[PFR_REFCNT_MAX]; }; + +struct pfr_ktstats { + struct pfr_table pfrts_t; + counter_u64_t pfrkts_packets[PFR_DIR_MAX][PFR_OP_TABLE_MAX]; + counter_u64_t pfrkts_bytes[PFR_DIR_MAX][PFR_OP_TABLE_MAX]; + counter_u64_t pfrkts_match; + counter_u64_t pfrkts_nomatch; + long pfrkts_tzero; + int pfrkts_cnt; + int pfrkts_refcnt[PFR_REFCNT_MAX]; +}; #define pfrts_name pfrts_t.pfrt_name #define pfrts_flags pfrts_t.pfrt_flags @@ -1030,8 +1045,9 @@ union sockaddr_union { #endif /* _SOCKADDR_UNION_DEFINED */ struct pfr_kcounters { - u_int64_t pfrkc_packets[PFR_DIR_MAX][PFR_OP_ADDR_MAX]; - u_int64_t pfrkc_bytes[PFR_DIR_MAX][PFR_OP_ADDR_MAX]; + counter_u64_t pfrkc_packets[PFR_DIR_MAX][PFR_OP_ADDR_MAX]; + counter_u64_t pfrkc_bytes[PFR_DIR_MAX][PFR_OP_ADDR_MAX]; + long pfrkc_tzero; }; SLIST_HEAD(pfr_kentryworkq, pfr_kentry); @@ -1039,8 +1055,7 @@ struct pfr_kentry { struct radix_node pfrke_node[2]; union sockaddr_union pfrke_sa; SLIST_ENTRY(pfr_kentry) pfrke_workq; - struct pfr_kcounters *pfrke_counters; - long pfrke_tzero; + struct pfr_kcounters pfrke_counters; u_int8_t pfrke_af; u_int8_t pfrke_net; u_int8_t pfrke_not; @@ -1050,7 +1065,7 @@ struct pfr_kentry { SLIST_HEAD(pfr_ktableworkq, pfr_ktable); RB_HEAD(pfr_ktablehead, pfr_ktable); struct pfr_ktable { - struct pfr_tstats pfrkt_ts; + struct pfr_ktstats pfrkt_kts; RB_ENTRY(pfr_ktable) pfrkt_tree; SLIST_ENTRY(pfr_ktable) pfrkt_workq; struct radix_node_head *pfrkt_ip4; @@ -1061,18 +1076,18 @@ struct pfr_ktable { long pfrkt_larg; int pfrkt_nflags; }; -#define pfrkt_t pfrkt_ts.pfrts_t +#define pfrkt_t pfrkt_kts.pfrts_t #define pfrkt_name pfrkt_t.pfrt_name #define pfrkt_anchor pfrkt_t.pfrt_anchor #define pfrkt_ruleset pfrkt_t.pfrt_ruleset #define pfrkt_flags pfrkt_t.pfrt_flags -#define pfrkt_cnt pfrkt_ts.pfrts_cnt -#define pfrkt_refcnt pfrkt_ts.pfrts_refcnt -#define pfrkt_packets pfrkt_ts.pfrts_packets -#define pfrkt_bytes pfrkt_ts.pfrts_bytes -#define pfrkt_match pfrkt_ts.pfrts_match -#define pfrkt_nomatch pfrkt_ts.pfrts_nomatch -#define pfrkt_tzero pfrkt_ts.pfrts_tzero +#define pfrkt_cnt pfrkt_kts.pfrkts_cnt +#define pfrkt_refcnt pfrkt_kts.pfrkts_refcnt +#define pfrkt_packets pfrkt_kts.pfrkts_packets +#define pfrkt_bytes pfrkt_kts.pfrkts_bytes +#define pfrkt_match pfrkt_kts.pfrkts_match +#define pfrkt_nomatch pfrkt_kts.pfrkts_nomatch +#define pfrkt_tzero pfrkt_kts.pfrkts_tzero /* keep synced with pfi_kif, used in RB_FIND */ struct pfi_kif_cmp { @@ -1601,7 +1616,7 @@ VNET_DECLARE(uint64_t, pf_stateid[MAXCPU]); #define V_pf_stateid VNET(pf_stateid) TAILQ_HEAD(pf_altqqueue, pf_altq); -VNET_DECLARE(struct pf_altqqueue, pf_altqs[2]); +VNET_DECLARE(struct pf_altqqueue, pf_altqs[4]); #define V_pf_altqs VNET(pf_altqs) VNET_DECLARE(struct pf_palist, pf_pabuf); #define V_pf_pabuf VNET(pf_pabuf) @@ -1616,8 +1631,12 @@ VNET_DECLARE(u_int32_t, ticket_pabuf); #define V_ticket_pabuf VNET(ticket_pabuf) VNET_DECLARE(struct pf_altqqueue *, pf_altqs_active); #define V_pf_altqs_active VNET(pf_altqs_active) +VNET_DECLARE(struct pf_altqqueue *, pf_altq_ifs_active); +#define V_pf_altq_ifs_active VNET(pf_altq_ifs_active) VNET_DECLARE(struct pf_altqqueue *, pf_altqs_inactive); #define V_pf_altqs_inactive VNET(pf_altqs_inactive) +VNET_DECLARE(struct pf_altqqueue *, pf_altq_ifs_inactive); +#define V_pf_altq_ifs_inactive VNET(pf_altq_ifs_inactive) VNET_DECLARE(struct pf_rulequeue, pf_unlinked_rules); #define V_pf_unlinked_rules VNET(pf_unlinked_rules) diff --git a/freebsd/sys/net/route.c b/freebsd/sys/net/route.c index 3cd909c1..36f3bf41 100644 --- a/freebsd/sys/net/route.c +++ b/freebsd/sys/net/route.c @@ -625,11 +625,12 @@ rtredirect_fib(struct sockaddr *dst, int error = 0; short *stat = NULL; struct rt_addrinfo info; + struct epoch_tracker et; struct ifaddr *ifa; struct rib_head *rnh; ifa = NULL; - NET_EPOCH_ENTER(); + NET_EPOCH_ENTER(et); rnh = rt_tables_get_rnh(fibnum, dst->sa_family); if (rnh == NULL) { error = EAFNOSUPPORT; @@ -724,7 +725,7 @@ done: if (rt) RTFREE_LOCKED(rt); out: - NET_EPOCH_EXIT(); + NET_EPOCH_EXIT(et); if (error) V_rtstat.rts_badredirect++; else if (stat != NULL) @@ -1307,11 +1308,14 @@ rt_notifydelete(struct rtentry *rt, struct rt_addrinfo *info) /* * Look up rt_addrinfo for a specific fib. Note that if rti_ifa is defined, * it will be referenced so the caller must free it. + * + * Assume basic consistency checks are executed by callers: + * RTAX_DST exists, if RTF_GATEWAY is set, RTAX_GATEWAY exists as well. */ int rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum) { - struct ifaddr *ifa; + struct epoch_tracker et; int needref, error; /* @@ -1320,22 +1324,55 @@ rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum) */ error = 0; needref = (info->rti_ifa == NULL); - NET_EPOCH_ENTER(); + NET_EPOCH_ENTER(et); + + /* If we have interface specified by the ifindex in the address, use it */ if (info->rti_ifp == NULL && ifpaddr != NULL && - ifpaddr->sa_family == AF_LINK && - (ifa = ifa_ifwithnet(ifpaddr, 0, fibnum)) != NULL) { - info->rti_ifp = ifa->ifa_ifp; + ifpaddr->sa_family == AF_LINK) { + const struct sockaddr_dl *sdl = (const struct sockaddr_dl *)ifpaddr; + if (sdl->sdl_index != 0) + info->rti_ifp = ifnet_byindex_locked(sdl->sdl_index); } + /* + * If we have source address specified, try to find it + * TODO: avoid enumerating all ifas on all interfaces. + */ if (info->rti_ifa == NULL && ifaaddr != NULL) info->rti_ifa = ifa_ifwithaddr(ifaaddr); if (info->rti_ifa == NULL) { struct sockaddr *sa; - sa = ifaaddr != NULL ? ifaaddr : - (gateway != NULL ? gateway : dst); - if (sa != NULL && info->rti_ifp != NULL) + /* + * Most common use case for the userland-supplied routes. + * + * Choose sockaddr to select ifa. + * -- if ifp is set -- + * Order of preference: + * 1) IFA address + * 2) gateway address + * Note: for interface routes link-level gateway address + * is specified to indicate the interface index without + * specifying RTF_GATEWAY. In this case, ignore gateway + * Note: gateway AF may be different from dst AF. In this case, + * ignore gateway + * 3) final destination. + * 4) if all of these fails, try to get at least link-level ifa. + * -- else -- + * try to lookup gateway or dst in the routing table to get ifa + */ + if (info->rti_info[RTAX_IFA] != NULL) + sa = info->rti_info[RTAX_IFA]; + else if ((info->rti_flags & RTF_GATEWAY) != 0 && + gateway->sa_family == dst->sa_family) + sa = gateway; + else + sa = dst; + if (info->rti_ifp != NULL) { info->rti_ifa = ifaof_ifpforaddr(sa, info->rti_ifp); - else if (dst != NULL && gateway != NULL) + /* Case 4 */ + if (info->rti_ifa == NULL && gateway != NULL) + info->rti_ifa = ifaof_ifpforaddr(gateway, info->rti_ifp); + } else if (dst != NULL && gateway != NULL) info->rti_ifa = ifa_ifwithroute(flags, dst, gateway, fibnum); else if (sa != NULL) @@ -1348,7 +1385,7 @@ rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum) ifa_ref(info->rti_ifa); } else error = ENETUNREACH; - NET_EPOCH_EXIT(); + NET_EPOCH_EXIT(et); return (error); } @@ -1585,6 +1622,8 @@ rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt, switch (req) { case RTM_DELETE: if (netmask) { + if (dst->sa_len > sizeof(mdst)) + return (EINVAL); rt_maskedcopy(dst, (struct sockaddr *)&mdst, netmask); dst = (struct sockaddr *)&mdst; } @@ -1990,7 +2029,7 @@ rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum) char tempbuf[_SOCKADDR_TMPSIZE]; int didwork = 0; int a_failure = 0; - static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK}; + struct sockaddr_dl *sdl = NULL; struct rib_head *rnh; if (flags & RTF_HOST) { @@ -2045,7 +2084,14 @@ rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum) rt_maskedcopy(dst, (struct sockaddr *)tempbuf, netmask); dst = (struct sockaddr *)tempbuf; } - } + } else if (cmd == RTM_ADD) { + sdl = (struct sockaddr_dl *)tempbuf; + bzero(sdl, sizeof(struct sockaddr_dl)); + sdl->sdl_family = AF_LINK; + sdl->sdl_len = sizeof(struct sockaddr_dl); + sdl->sdl_type = ifa->ifa_ifp->if_type; + sdl->sdl_index = ifa->ifa_ifp->if_index; + } /* * Now go through all the requested tables (fibs) and do the * requested action. Realistically, this will either be fib 0 @@ -2108,8 +2154,7 @@ rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum) * doing this for compatibility reasons */ if (cmd == RTM_ADD) - info.rti_info[RTAX_GATEWAY] = - (struct sockaddr *)&null_sdl; + info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)sdl; else info.rti_info[RTAX_GATEWAY] = ifa->ifa_addr; info.rti_info[RTAX_NETMASK] = netmask; @@ -2136,15 +2181,6 @@ rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum) rt->rt_ifa = ifa; } #endif - /* - * doing this for compatibility reasons - */ - if (cmd == RTM_ADD) { - ((struct sockaddr_dl *)rt->rt_gateway)->sdl_type = - rt->rt_ifp->if_type; - ((struct sockaddr_dl *)rt->rt_gateway)->sdl_index = - rt->rt_ifp->if_index; - } RT_ADDREF(rt); RT_UNLOCK(rt); rt_newaddrmsg_fib(cmd, ifa, error, rt, fibnum); diff --git a/freebsd/sys/net/route.h b/freebsd/sys/net/route.h index c4333838..bdeb9869 100644 --- a/freebsd/sys/net/route.h +++ b/freebsd/sys/net/route.h @@ -210,6 +210,7 @@ struct rtentry { #define NHF_DEFAULT 0x0080 /* Default route */ #define NHF_BROADCAST 0x0100 /* RTF_BROADCAST */ #define NHF_GATEWAY 0x0200 /* RTF_GATEWAY */ +#define NHF_HOST 0x0400 /* RTF_HOST */ /* Nexthop request flags */ #define NHR_IFAIF 0x01 /* Return ifa_ifp interface */ diff --git a/freebsd/sys/net/route_var.h b/freebsd/sys/net/route_var.h index 9d0d1931..db3db4e3 100644 --- a/freebsd/sys/net/route_var.h +++ b/freebsd/sys/net/route_var.h @@ -67,6 +67,7 @@ fib_rte_to_nh_flags(int rt_flags) uint16_t res; res = (rt_flags & RTF_REJECT) ? NHF_REJECT : 0; + res |= (rt_flags & RTF_HOST) ? NHF_HOST : 0; res |= (rt_flags & RTF_BLACKHOLE) ? NHF_BLACKHOLE : 0; res |= (rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) ? NHF_REDIRECT : 0; res |= (rt_flags & RTF_BROADCAST) ? NHF_BROADCAST : 0; diff --git a/freebsd/sys/net/rtsock.c b/freebsd/sys/net/rtsock.c index e1b87095..6c457a9d 100644 --- a/freebsd/sys/net/rtsock.c +++ b/freebsd/sys/net/rtsock.c @@ -33,6 +33,7 @@ * @(#)rtsock.c 8.7 (Berkeley) 10/12/95 * $FreeBSD$ */ +#include <rtems/bsd/local/opt_ddb.h> #include <rtems/bsd/local/opt_mpath.h> #include <rtems/bsd/local/opt_inet.h> #include <rtems/bsd/local/opt_inet6.h> @@ -55,6 +56,11 @@ #include <sys/sysctl.h> #include <sys/systm.h> +#ifdef DDB +#include <ddb/ddb.h> +#include <ddb/db_lex.h> +#endif + #include <net/if.h> #include <net/if_var.h> #include <net/if_dl.h> @@ -448,6 +454,9 @@ static int rtm_get_jailed(struct rt_addrinfo *info, struct ifnet *ifp, struct rtentry *rt, union sockaddr_union *saun, struct ucred *cred) { +#if defined(INET) || defined(INET6) + struct epoch_tracker et; +#endif /* First, see if the returned address is part of the jail. */ if (prison_if(cred, rt->rt_ifa->ifa_addr) == 0) { @@ -468,7 +477,7 @@ rtm_get_jailed(struct rt_addrinfo *info, struct ifnet *ifp, * Try to find an address on the given outgoing interface * that belongs to the jail. */ - IF_ADDR_RLOCK(ifp); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct sockaddr *sa; sa = ifa->ifa_addr; @@ -480,7 +489,7 @@ rtm_get_jailed(struct rt_addrinfo *info, struct ifnet *ifp, break; } } - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); if (!found) { /* * As a last resort return the 'default' jail address. @@ -510,7 +519,7 @@ rtm_get_jailed(struct rt_addrinfo *info, struct ifnet *ifp, * Try to find an address on the given outgoing interface * that belongs to the jail. */ - IF_ADDR_RLOCK(ifp); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct sockaddr *sa; sa = ifa->ifa_addr; @@ -523,7 +532,7 @@ rtm_get_jailed(struct rt_addrinfo *info, struct ifnet *ifp, break; } } - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); if (!found) { /* * As a last resort return the 'default' jail address. @@ -627,6 +636,8 @@ route_output(struct mbuf *m, struct socket *so, ...) if (rt_xaddrs((caddr_t)(rtm + 1), len + (caddr_t)rtm, &info)) senderr(EINVAL); + if (rtm->rtm_flags & RTF_RNH_LOCKED) + senderr(EINVAL); info.rti_flags = rtm->rtm_flags; if (info.rti_info[RTAX_DST] == NULL || info.rti_info[RTAX_DST]->sa_family >= AF_MAX || @@ -798,16 +809,17 @@ route_output(struct mbuf *m, struct socket *so, ...) if (rt->rt_ifp != NULL && rt->rt_ifp->if_type == IFT_PROPVIRTUAL) { + struct epoch_tracker et; struct ifaddr *ifa; - NET_EPOCH_ENTER(); + NET_EPOCH_ENTER(et); ifa = ifa_ifwithnet(info.rti_info[RTAX_DST], 1, RT_ALL_FIBS); if (ifa != NULL) rt_maskedcopy(ifa->ifa_addr, &laddr, ifa->ifa_netmask); - NET_EPOCH_EXIT(); + NET_EPOCH_EXIT(et); } else rt_maskedcopy(rt->rt_ifa->ifa_addr, &laddr, @@ -1571,7 +1583,7 @@ sysctl_dumpentry(struct radix_node *rn, void *vw) struct rt_addrinfo info; struct sockaddr_storage ss; - IFNET_RLOCK_NOSLEEP_ASSERT(); + NET_EPOCH_ASSERT(); if (w->w_op == NET_RT_FLAGS && !(rt->rt_flags & w->w_arg)) return 0; @@ -1765,7 +1777,7 @@ sysctl_iflist(int af, struct walkarg *w) bzero((caddr_t)&info, sizeof(info)); bzero(&ifd, sizeof(ifd)); - NET_EPOCH_ENTER_ET(et); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (w->w_arg && w->w_arg != ifp->if_index) continue; @@ -1815,7 +1827,7 @@ sysctl_iflist(int af, struct walkarg *w) info.rti_info[RTAX_BRD] = NULL; } done: - NET_EPOCH_EXIT_ET(et); + NET_EPOCH_EXIT(et); return (error); } @@ -1823,6 +1835,7 @@ static int sysctl_ifmalist(int af, struct walkarg *w) { struct rt_addrinfo info; + struct epoch_tracker et; struct ifaddr *ifa; struct ifmultiaddr *ifma; struct ifnet *ifp; @@ -1831,13 +1844,12 @@ sysctl_ifmalist(int af, struct walkarg *w) error = 0; bzero((caddr_t)&info, sizeof(info)); - IFNET_RLOCK_NOSLEEP(); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (w->w_arg && w->w_arg != ifp->if_index) continue; ifa = ifp->if_addr; info.rti_info[RTAX_IFP] = ifa ? ifa->ifa_addr : NULL; - IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (af && af != ifma->ifma_addr->sa_family) continue; @@ -1864,11 +1876,10 @@ sysctl_ifmalist(int af, struct walkarg *w) break; } } - IF_ADDR_RUNLOCK(ifp); if (error != 0) break; } - IFNET_RUNLOCK_NOSLEEP(); + NET_EPOCH_EXIT(et); return (error); } @@ -1955,11 +1966,13 @@ sysctl_rtsock(SYSCTL_HANDLER_ARGS) for (error = 0; error == 0 && i <= lim; i++) { rnh = rt_tables_get_rnh(fib, i); if (rnh != NULL) { + struct epoch_tracker et; + RIB_RLOCK(rnh); - IFNET_RLOCK_NOSLEEP(); + NET_EPOCH_ENTER(et); error = rnh->rnh_walktree(&rnh->head, sysctl_dumpentry, &w); - IFNET_RUNLOCK_NOSLEEP(); + NET_EPOCH_EXIT(et); RIB_RUNLOCK(rnh); } else if (af != 0) error = EAFNOSUPPORT; @@ -2008,3 +2021,408 @@ static struct domain routedomain = { }; VNET_DOMAIN_SET(route); + +#ifdef DDB +/* + * Unfortunately, RTF_ values are expressed as raw masks rather than powers of + * 2, so we cannot use them as nice C99 initializer indices below. + */ +static const char * const rtf_flag_strings[] = { + "UP", + "GATEWAY", + "HOST", + "REJECT", + "DYNAMIC", + "MODIFIED", + "DONE", + "UNUSED_0x80", + "UNUSED_0x100", + "XRESOLVE", + "LLDATA", + "STATIC", + "BLACKHOLE", + "UNUSED_0x2000", + "PROTO2", + "PROTO1", + "UNUSED_0x10000", + "UNUSED_0x20000", + "PROTO3", + "FIXEDMTU", + "PINNED", + "LOCAL", + "BROADCAST", + "MULTICAST", + /* Big gap. */ + [28] = "STICKY", + [30] = "RNH_LOCKED", + [31] = "GWFLAG_COMPAT", +}; + +static const char * __pure +rt_flag_name(unsigned idx) +{ + if (idx >= nitems(rtf_flag_strings)) + return ("INVALID_FLAG"); + if (rtf_flag_strings[idx] == NULL) + return ("UNKNOWN"); + return (rtf_flag_strings[idx]); +} + +static void +rt_dumpaddr_ddb(const char *name, const struct sockaddr *sa) +{ + char buf[INET6_ADDRSTRLEN], *res; + + res = NULL; + if (sa == NULL) + res = "NULL"; + else if (sa->sa_family == AF_INET) { + res = inet_ntop(AF_INET, + &((const struct sockaddr_in *)sa)->sin_addr, + buf, sizeof(buf)); + } else if (sa->sa_family == AF_INET6) { + res = inet_ntop(AF_INET6, + &((const struct sockaddr_in6 *)sa)->sin6_addr, + buf, sizeof(buf)); + } else if (sa->sa_family == AF_LINK) { + res = "on link"; + } + + if (res != NULL) { + db_printf("%s <%s> ", name, res); + return; + } + + db_printf("%s <af:%d> ", name, sa->sa_family); +} + +static int +rt_dumpentry_ddb(struct radix_node *rn, void *arg __unused) +{ + struct sockaddr_storage ss; + struct rtentry *rt; + int flags, idx; + + /* If RNTORT is important, put it in a header. */ + rt = (void *)rn; + + rt_dumpaddr_ddb("dst", rt_key(rt)); + rt_dumpaddr_ddb("gateway", rt->rt_gateway); + rt_dumpaddr_ddb("netmask", rtsock_fix_netmask(rt_key(rt), rt_mask(rt), + &ss)); + if (rt->rt_ifp != NULL && (rt->rt_ifp->if_flags & IFF_DYING) == 0) { + rt_dumpaddr_ddb("ifp", rt->rt_ifp->if_addr->ifa_addr); + rt_dumpaddr_ddb("ifa", rt->rt_ifa->ifa_addr); + } + + db_printf("flags "); + flags = rt->rt_flags; + if (flags == 0) + db_printf("none"); + + while ((idx = ffs(flags)) > 0) { + idx--; + + if (flags != rt->rt_flags) + db_printf(","); + db_printf("%s", rt_flag_name(idx)); + + flags &= ~(1ul << idx); + } + + db_printf("\n"); + return (0); +} + +DB_SHOW_COMMAND(routetable, db_show_routetable_cmd) +{ + struct rib_head *rnh; + int error, i, lim; + + if (have_addr) + i = lim = addr; + else { + i = 1; + lim = AF_MAX; + } + + for (; i <= lim; i++) { + rnh = rt_tables_get_rnh(0, i); + if (rnh == NULL) { + if (have_addr) { + db_printf("%s: AF %d not supported?\n", + __func__, i); + break; + } + continue; + } + + if (!have_addr && i > 1) + db_printf("\n"); + + db_printf("Route table for AF %d%s%s%s:\n", i, + (i == AF_INET || i == AF_INET6) ? " (" : "", + (i == AF_INET) ? "INET" : (i == AF_INET6) ? "INET6" : "", + (i == AF_INET || i == AF_INET6) ? ")" : ""); + + error = rnh->rnh_walktree(&rnh->head, rt_dumpentry_ddb, NULL); + if (error != 0) + db_printf("%s: walktree(%d): %d\n", __func__, i, + error); + } +} + +_DB_FUNC(_show, route, db_show_route_cmd, db_show_table, CS_OWN, NULL) +{ + char buf[INET6_ADDRSTRLEN], *bp; + const void *dst_addrp; + struct sockaddr *dstp; + struct rtentry *rt; + union { + struct sockaddr_in dest_sin; + struct sockaddr_in6 dest_sin6; + } u; + uint16_t hextets[8]; + unsigned i, tets; + int t, af, exp, tokflags; + + /* + * Undecoded address family. No double-colon expansion seen yet. + */ + af = -1; + exp = -1; + /* Assume INET6 to start; we can work back if guess was wrong. */ + tokflags = DRT_WSPACE | DRT_HEX | DRT_HEXADECIMAL; + + /* + * db_command has lexed 'show route' for us. + */ + t = db_read_token_flags(tokflags); + if (t == tWSPACE) + t = db_read_token_flags(tokflags); + + /* + * tEOL: Just 'show route' isn't a valid mode. + * tMINUS: It's either '-h' or some invalid option. Regardless, usage. + */ + if (t == tEOL || t == tMINUS) + goto usage; + + db_unread_token(t); + + tets = nitems(hextets); + + /* + * Each loop iteration, we expect to read one octet (v4) or hextet + * (v6), followed by an appropriate field separator ('.' or ':' or + * '::'). + * + * At the start of each loop, we're looking for a number (octet or + * hextet). + * + * INET6 addresses have a special case where they may begin with '::'. + */ + for (i = 0; i < tets; i++) { + t = db_read_token_flags(tokflags); + + if (t == tCOLONCOLON) { + /* INET6 with leading '::' or invalid. */ + if (i != 0) { + db_printf("Parse error: unexpected extra " + "colons.\n"); + goto exit; + } + + af = AF_INET6; + exp = i; + hextets[i] = 0; + continue; + } else if (t == tNUMBER) { + /* + * Lexer separates out '-' as tMINUS, but make the + * assumption explicit here. + */ + MPASS(db_tok_number >= 0); + + if (af == AF_INET && db_tok_number > UINT8_MAX) { + db_printf("Not a valid v4 octet: %ld\n", + (long)db_tok_number); + goto exit; + } + hextets[i] = db_tok_number; + } else if (t == tEOL) { + /* + * We can only detect the end of an IPv6 address in + * compact representation with EOL. + */ + if (af != AF_INET6 || exp < 0) { + db_printf("Parse failed. Got unexpected EOF " + "when the address is not a compact-" + "representation IPv6 address.\n"); + goto exit; + } + break; + } else { + db_printf("Parse failed. Unexpected token %d.\n", t); + goto exit; + } + + /* Next, look for a separator, if appropriate. */ + if (i == tets - 1) + continue; + + t = db_read_token_flags(tokflags); + if (af < 0) { + if (t == tCOLON) { + af = AF_INET6; + continue; + } + if (t == tCOLONCOLON) { + af = AF_INET6; + i++; + hextets[i] = 0; + exp = i; + continue; + } + if (t == tDOT) { + unsigned hn, dn; + + af = AF_INET; + /* Need to fixup the first parsed number. */ + if (hextets[0] > 0x255 || + (hextets[0] & 0xf0) > 0x90 || + (hextets[0] & 0xf) > 9) { + db_printf("Not a valid v4 octet: %x\n", + hextets[0]); + goto exit; + } + + hn = hextets[0]; + dn = (hn >> 8) * 100 + + ((hn >> 4) & 0xf) * 10 + + (hn & 0xf); + + hextets[0] = dn; + + /* Switch to decimal for remaining octets. */ + tokflags &= ~DRT_RADIX_MASK; + tokflags |= DRT_DECIMAL; + + tets = 4; + continue; + } + + db_printf("Parse error. Unexpected token %d.\n", t); + goto exit; + } else if (af == AF_INET) { + if (t == tDOT) + continue; + db_printf("Expected '.' (%d) between octets but got " + "(%d).\n", tDOT, t); + goto exit; + + } else if (af == AF_INET6) { + if (t == tCOLON) + continue; + if (t == tCOLONCOLON) { + if (exp < 0) { + i++; + hextets[i] = 0; + exp = i; + continue; + } + db_printf("Got bogus second '::' in v6 " + "address.\n"); + goto exit; + } + if (t == tEOL) { + /* + * Handle in the earlier part of the loop + * because we need to handle trailing :: too. + */ + db_unread_token(t); + continue; + } + + db_printf("Expected ':' (%d) or '::' (%d) between " + "hextets but got (%d).\n", tCOLON, tCOLONCOLON, t); + goto exit; + } + } + + /* Check for trailing garbage. */ + if (i == tets) { + t = db_read_token_flags(tokflags); + if (t != tEOL) { + db_printf("Got unexpected garbage after address " + "(%d).\n", t); + goto exit; + } + } + + /* + * Need to expand compact INET6 addresses. + * + * Technically '::' for a single ':0:' is MUST NOT but just in case, + * don't bother expanding that form (exp >= 0 && i == tets case). + */ + if (af == AF_INET6 && exp >= 0 && i < tets) { + if (exp + 1 < i) { + memmove(&hextets[exp + 1 + (nitems(hextets) - i)], + &hextets[exp + 1], + (i - (exp + 1)) * sizeof(hextets[0])); + } + memset(&hextets[exp + 1], 0, (nitems(hextets) - i) * + sizeof(hextets[0])); + } + + memset(&u, 0, sizeof(u)); + if (af == AF_INET) { + u.dest_sin.sin_family = AF_INET; + u.dest_sin.sin_len = sizeof(u.dest_sin); + u.dest_sin.sin_addr.s_addr = htonl( + ((uint32_t)hextets[0] << 24) | + ((uint32_t)hextets[1] << 16) | + ((uint32_t)hextets[2] << 8) | + (uint32_t)hextets[3]); + dstp = (void *)&u.dest_sin; + dst_addrp = &u.dest_sin.sin_addr; + } else if (af == AF_INET6) { + u.dest_sin6.sin6_family = AF_INET6; + u.dest_sin6.sin6_len = sizeof(u.dest_sin6); + for (i = 0; i < nitems(hextets); i++) + u.dest_sin6.sin6_addr.s6_addr16[i] = htons(hextets[i]); + dstp = (void *)&u.dest_sin6; + dst_addrp = &u.dest_sin6.sin6_addr; + } else { + MPASS(false); + /* UNREACHABLE */ + /* Appease Clang false positive: */ + dstp = NULL; + } + + bp = inet_ntop(af, dst_addrp, buf, sizeof(buf)); + if (bp != NULL) + db_printf("Looking up route to destination '%s'\n", bp); + + CURVNET_SET(vnet0); + rt = rtalloc1(dstp, 0, RTF_RNH_LOCKED); + CURVNET_RESTORE(); + + if (rt == NULL) { + db_printf("Could not get route for that server.\n"); + return; + } + + rt_dumpentry_ddb((void *)rt, NULL); + RTFREE_LOCKED(rt); + + return; +usage: + db_printf("Usage: 'show route <address>'\n" + " Currently accepts only dotted-decimal INET or colon-separated\n" + " hextet INET6 addresses.\n"); +exit: + db_skip_to_eol(); +} +#endif diff --git a/freebsd/sys/net/sff8472.h b/freebsd/sys/net/sff8472.h index d38fcfc0..9fa465a1 100644 --- a/freebsd/sys/net/sff8472.h +++ b/freebsd/sys/net/sff8472.h @@ -379,7 +379,7 @@ enum { /* * Table 3.2 Identifier values. - * Identifier constants has taken from SFF-8024 rev 4.2 table 4.1 + * Identifier constants has taken from SFF-8024 rev 4.6 table 4.1 * (as referenced by table 3.2 footer) * */ enum { @@ -396,10 +396,10 @@ enum { SFF_8024_ID_X2 = 0xA, /* X2 */ SFF_8024_ID_DWDM_SFP = 0xB, /* DWDM-SFP */ SFF_8024_ID_QSFP = 0xC, /* QSFP */ - SFF_8024_ID_QSFPPLUS = 0xD, /* QSFP+ */ + SFF_8024_ID_QSFPPLUS = 0xD, /* QSFP+ or later */ SFF_8024_ID_CXP = 0xE, /* CXP */ - SFF_8024_ID_HD4X = 0xF, /* Shielded Mini Multilane HD 4X */ - SFF_8024_ID_HD8X = 0x10, /* Shielded Mini Multilane HD 8X */ + SFF_8024_ID_HD4X = 0xF, /* Shielded Mini Multilane HD 4X */ + SFF_8024_ID_HD8X = 0x10, /* Shielded Mini Multilane HD 8X */ SFF_8024_ID_QSFP28 = 0x11, /* QSFP28 or later */ SFF_8024_ID_CXP2 = 0x12, /* CXP2 (aka CXP28) */ SFF_8024_ID_CDFP = 0x13, /* CDFP (Style 1/Style 2) */ @@ -408,34 +408,49 @@ enum { SFF_8024_ID_CDFP3 = 0x16, /* CDFP (Style3) */ SFF_8024_ID_MICROQSFP = 0x17, /* microQSFP */ SFF_8024_ID_QSFP_DD = 0x18, /* QSFP-DD 8X Pluggable Transceiver */ - SFF_8024_ID_LAST = SFF_8024_ID_QSFP_DD - }; - -static const char *sff_8024_id[SFF_8024_ID_LAST + 1] = {"Unknown", - "GBIC", - "SFF", - "SFP/SFP+/SFP28", - "XBI", - "Xenpak", - "XFP", - "XFF", - "XFP-E", - "XPAK", - "X2", - "DWDM-SFP/SFP+", - "QSFP", - "QSFP+", - "CXP", - "HD4X", - "HD8X", - "QSFP28", - "CXP2", - "CDFP", - "SMM4", - "SMM8", - "CDFP3", - "microQSFP", - "QSFP-DD"}; + SFF_8024_ID_OSFP8X = 0x19, /* OSFP 8X Pluggable Transceiver */ + SFF_8024_ID_SFP_DD = 0x1A, /* SFP-DD 2X Pluggable Transceiver */ + SFF_8024_ID_DSFP = 0x1B, /* DSFP Dual SFF Pluggable Transceiver */ + SFF_8024_ID_X4ML = 0x1C, /* x4 MiniLink/OcuLink */ + SFF_8024_ID_X8ML = 0x1D, /* x8 MiniLink */ + SFF_8024_ID_QSFP_CMIS = 0x1E, /* QSFP+ or later w/ Common Management + Interface Specification */ + SFF_8024_ID_LAST = SFF_8024_ID_QSFP_CMIS +}; + +static const char *sff_8024_id[SFF_8024_ID_LAST + 1] = { + "Unknown", + "GBIC", + "SFF", + "SFP/SFP+/SFP28", + "XBI", + "Xenpak", + "XFP", + "XFF", + "XFP-E", + "XPAK", + "X2", + "DWDM-SFP/SFP+", + "QSFP", + "QSFP+", + "CXP", + "HD4X", + "HD8X", + "QSFP28", + "CXP2", + "CDFP", + "SMM4", + "SMM8", + "CDFP3", + "microQSFP", + "QSFP-DD", + "QSFP8X", + "SFP-DD", + "DSFP", + "x4MiniLink/OcuLink", + "x8MiniLink", + "QSFP+(CIMS)" +}; /* Keep compatibility with old definitions */ #define SFF_8472_ID_UNKNOWN SFF_8024_ID_UNKNOWN diff --git a/freebsd/sys/net/vnet.h b/freebsd/sys/net/vnet.h index b4168750..a8c9887e 100644 --- a/freebsd/sys/net/vnet.h +++ b/freebsd/sys/net/vnet.h @@ -273,7 +273,8 @@ extern struct sx vnet_sxlock; /* struct _hack is to stop this from being used with static data */ #define VNET_DEFINE(t, n) \ struct _hack; t VNET_NAME(n) __section(VNET_SETNAME) __used -#if defined(KLD_MODULE) && (defined(__aarch64__) || defined(__riscv)) +#if defined(KLD_MODULE) && (defined(__aarch64__) || defined(__riscv) \ + || defined(__powerpc64__)) /* * As with DPCPU_DEFINE_STATIC we are unable to mark this data as static * in modules on some architectures. diff --git a/freebsd/sys/net80211/ieee80211.c b/freebsd/sys/net80211/ieee80211.c index a81b5343..f003c769 100644 --- a/freebsd/sys/net80211/ieee80211.c +++ b/freebsd/sys/net80211/ieee80211.c @@ -407,8 +407,10 @@ ieee80211_ifdetach(struct ieee80211com *ic) * The VAP is responsible for setting and clearing * the VIMAGE context. */ - while ((vap = TAILQ_FIRST(&ic->ic_vaps)) != NULL) + while ((vap = TAILQ_FIRST(&ic->ic_vaps)) != NULL) { + ieee80211_com_vdetach(vap); ieee80211_vap_destroy(vap); + } ieee80211_waitfor_parent(ic); ieee80211_sysctl_detach(ic); @@ -1386,6 +1388,8 @@ getflags(const uint8_t bands[], uint32_t flags[], int ht40, int vht80) /* * Add one 20 MHz channel into specified channel list. + * You MUST NOT mix bands when calling this. It will not add 5ghz + * channels if you have any B/G/N band bit set. */ /* XXX VHT */ int @@ -1632,6 +1636,17 @@ ieee80211_add_channel_list_2ghz(struct ieee80211_channel chans[], int maxchans, } int +ieee80211_add_channels_default_2ghz(struct ieee80211_channel chans[], + int maxchans, int *nchans, const uint8_t bands[], int ht40) +{ + const uint8_t default_chan_list[] = + { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }; + + return (ieee80211_add_channel_list_2ghz(chans, maxchans, nchans, + default_chan_list, nitems(default_chan_list), bands, ht40)); +} + +int ieee80211_add_channel_list_5ghz(struct ieee80211_channel chans[], int maxchans, int *nchans, const uint8_t ieee[], int nieee, const uint8_t bands[], int ht40) diff --git a/freebsd/sys/net80211/ieee80211.h b/freebsd/sys/net80211/ieee80211.h index db46b8f1..61389169 100644 --- a/freebsd/sys/net80211/ieee80211.h +++ b/freebsd/sys/net80211/ieee80211.h @@ -951,9 +951,11 @@ enum { IEEE80211_ELEMID_ERP = 42, IEEE80211_ELEMID_HTCAP = 45, IEEE80211_ELEMID_QOS = 46, + IEEE80211_ELEMID_RESERVED_47 = 47, IEEE80211_ELEMID_RSN = 48, IEEE80211_ELEMID_XRATES = 50, IEEE80211_ELEMID_APCHANREP = 51, + IEEE80211_ELEMID_MOBILITY_DOMAIN = 54, IEEE80211_ELEMID_HTINFO = 61, IEEE80211_ELEMID_SECCHAN_OFFSET = 62, IEEE80211_ELEMID_RRM_ENACAPS = 70, diff --git a/freebsd/sys/net80211/ieee80211_adhoc.c b/freebsd/sys/net80211/ieee80211_adhoc.c index fdd02e46..3f3b6d1f 100644 --- a/freebsd/sys/net80211/ieee80211_adhoc.c +++ b/freebsd/sys/net80211/ieee80211_adhoc.c @@ -524,11 +524,9 @@ adhoc_input(struct ieee80211_node *ni, struct mbuf *m, /* * Save QoS bits for use below--before we strip the header. */ - if (subtype == IEEE80211_FC0_SUBTYPE_QOS) { - qos = (dir == IEEE80211_FC1_DIR_DSTODS) ? - ((struct ieee80211_qosframe_addr4 *)wh)->i_qos[0] : - ((struct ieee80211_qosframe *)wh)->i_qos[0]; - } else + if (subtype == IEEE80211_FC0_SUBTYPE_QOS) + qos = ieee80211_getqos(wh)[0]; + else qos = 0; /* diff --git a/freebsd/sys/net80211/ieee80211_amrr.c b/freebsd/sys/net80211/ieee80211_amrr.c index 42e9ac1a..a827f470 100644 --- a/freebsd/sys/net80211/ieee80211_amrr.c +++ b/freebsd/sys/net80211/ieee80211_amrr.c @@ -104,12 +104,13 @@ static void amrr_setinterval(const struct ieee80211vap *vap, int msecs) { struct ieee80211_amrr *amrr = vap->iv_rs; - int t; + + if (!amrr) + return; if (msecs < 100) msecs = 100; - t = msecs_to_ticks(msecs); - amrr->amrr_interval = (t < 1) ? 1 : t; + amrr->amrr_interval = msecs_to_ticks(msecs); } static void @@ -168,6 +169,12 @@ amrr_node_init(struct ieee80211_node *ni) struct ieee80211_amrr_node *amn; uint8_t rate; + if (!amrr) { + if_printf(vap->iv_ifp, "ratectl structure was not allocated, " + "per-node structure allocation skipped\n"); + return; + } + if (ni->ni_rctls == NULL) { ni->ni_rctls = amn = IEEE80211_MALLOC(sizeof(struct ieee80211_amrr_node), M_80211_RATECTL, IEEE80211_M_NOWAIT | IEEE80211_M_ZERO); @@ -329,10 +336,19 @@ static int amrr_rate(struct ieee80211_node *ni, void *arg __unused, uint32_t iarg __unused) { struct ieee80211_amrr_node *amn = ni->ni_rctls; - struct ieee80211_amrr *amrr = amn->amn_amrr; + struct ieee80211_amrr *amrr; const struct ieee80211_rateset *rs = NULL; int rix; + /* XXX should return -1 here, but drivers may not expect this... */ + if (!amn) + { + ni->ni_txrate = ni->ni_rates.rs_rates[0]; + return 0; + } + + amrr = amn->amn_amrr; + /* 11n or not? Pick the right rateset */ if (amrr_node_is_11n(ni)) { /* XXX ew */ @@ -371,6 +387,9 @@ amrr_tx_complete(const struct ieee80211_node *ni, struct ieee80211_amrr_node *amn = ni->ni_rctls; int retries; + if (!amn) + return; + retries = 0; if (status->flags & IEEE80211_RATECTL_STATUS_LONG_RETRY) retries = status->long_retries; @@ -388,6 +407,9 @@ amrr_tx_update_cb(void *arg, struct ieee80211_node *ni) struct ieee80211_amrr_node *amn = ni->ni_rctls; int txcnt, success, retrycnt; + if (!amn) + return; + txcnt = stats->nframes; success = stats->nsuccess; retrycnt = 0; @@ -422,9 +444,12 @@ amrr_sysctl_interval(SYSCTL_HANDLER_ARGS) { struct ieee80211vap *vap = arg1; struct ieee80211_amrr *amrr = vap->iv_rs; - int msecs = ticks_to_msecs(amrr->amrr_interval); - int error; + int msecs, error; + + if (!amrr) + return ENOMEM; + msecs = ticks_to_msecs(amrr->amrr_interval); error = sysctl_handle_int(oidp, &msecs, 0, req); if (error || !req->newptr) return error; @@ -438,6 +463,9 @@ amrr_sysctlattach(struct ieee80211vap *vap, { struct ieee80211_amrr *amrr = vap->iv_rs; + if (!amrr) + return; + SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(tree), OID_AUTO, "amrr_rate_interval", CTLTYPE_INT | CTLFLAG_RW, vap, 0, amrr_sysctl_interval, "I", "amrr operation interval (ms)"); @@ -459,6 +487,9 @@ amrr_node_stats(struct ieee80211_node *ni, struct sbuf *s) /* XXX TODO: check locking? */ + if (!amn) + return; + /* XXX TODO: this should be a method */ if (amrr_node_is_11n(ni)) { rs = (struct ieee80211_rateset *) &ni->ni_htrates; diff --git a/freebsd/sys/net80211/ieee80211_crypto.c b/freebsd/sys/net80211/ieee80211_crypto.c index 2e28538c..264e3f17 100644 --- a/freebsd/sys/net80211/ieee80211_crypto.c +++ b/freebsd/sys/net80211/ieee80211_crypto.c @@ -664,14 +664,15 @@ ieee80211_crypto_decap(struct ieee80211_node *ni, struct mbuf *m, int hdrlen, k = &ni->ni_ucastkey; /* - * Insure crypto header is contiguous for all decap work. + * Insure crypto header is contiguous and long enough for all + * decap work. */ cip = k->wk_cipher; - if (m->m_len < hdrlen + cip->ic_header && - (m = m_pullup(m, hdrlen + cip->ic_header)) == NULL) { + if (m->m_len < hdrlen + cip->ic_header) { IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_CRYPTO, wh->i_addr2, - "unable to pullup %s header", cip->ic_name); - vap->iv_stats.is_rx_wepfail++; /* XXX */ + "frame is too short (%d < %u) for crypto decap", + cip->ic_name, m->m_len, hdrlen + cip->ic_header); + vap->iv_stats.is_rx_tooshort++; *key = NULL; return (0); } diff --git a/freebsd/sys/net80211/ieee80211_dfs.c b/freebsd/sys/net80211/ieee80211_dfs.c index 2c454516..119c11a7 100644 --- a/freebsd/sys/net80211/ieee80211_dfs.c +++ b/freebsd/sys/net80211/ieee80211_dfs.c @@ -158,8 +158,7 @@ cac_timeout(void *arg) /* XXX clobbers any existing desired channel */ /* NB: dfs->newchan may be NULL, that's ok */ vap->iv_des_chan = dfs->newchan; - /* XXX recursive lock need ieee80211_new_state_locked */ - ieee80211_new_state(vap, IEEE80211_S_SCAN, 0); + ieee80211_new_state_locked(vap, IEEE80211_S_SCAN, 0); } else { if_printf(vap->iv_ifp, "CAC timer on channel %u (%u MHz) expired; " diff --git a/freebsd/sys/net80211/ieee80211_freebsd.c b/freebsd/sys/net80211/ieee80211_freebsd.c index 00430f77..f8d5d0f3 100644 --- a/freebsd/sys/net80211/ieee80211_freebsd.c +++ b/freebsd/sys/net80211/ieee80211_freebsd.c @@ -70,8 +70,6 @@ SYSCTL_INT(_net_wlan, OID_AUTO, debug, CTLFLAG_RW, &ieee80211_debug, 0, "debugging printfs"); #endif -static MALLOC_DEFINE(M_80211_COM, "80211com", "802.11 com state"); - static const char wlanname[] = "wlan"; static struct if_clone *wlan_cloner; @@ -138,13 +136,12 @@ int ieee80211_sysctl_msecs_ticks(SYSCTL_HANDLER_ARGS) { int msecs = ticks_to_msecs(*(int *)arg1); - int error, t; + int error; error = sysctl_handle_int(oidp, &msecs, 0, req); if (error || !req->newptr) return error; - t = msecs_to_ticks(msecs); - *(int *)arg1 = (t < 1) ? 1 : t; + *(int *)arg1 = msecs_to_ticks(msecs); return 0; } @@ -309,6 +306,52 @@ ieee80211_sysctl_vdetach(struct ieee80211vap *vap) } } +#define MS(_v, _f) (((_v) & _f##_M) >> _f##_S) +int +ieee80211_com_vincref(struct ieee80211vap *vap) +{ + uint32_t ostate; + + ostate = atomic_fetchadd_32(&vap->iv_com_state, IEEE80211_COM_REF_ADD); + + if (ostate & IEEE80211_COM_DETACHED) { + atomic_subtract_32(&vap->iv_com_state, IEEE80211_COM_REF_ADD); + return (ENETDOWN); + } + + if (MS(ostate, IEEE80211_COM_REF) == IEEE80211_COM_REF_MAX) { + atomic_subtract_32(&vap->iv_com_state, IEEE80211_COM_REF_ADD); + return (EOVERFLOW); + } + + return (0); +} + +void +ieee80211_com_vdecref(struct ieee80211vap *vap) +{ + uint32_t ostate; + + ostate = atomic_fetchadd_32(&vap->iv_com_state, -IEEE80211_COM_REF_ADD); + + KASSERT(MS(ostate, IEEE80211_COM_REF) != 0, + ("com reference counter underflow")); + + (void) ostate; +} + +void +ieee80211_com_vdetach(struct ieee80211vap *vap) +{ + int sleep_time; + + sleep_time = msecs_to_ticks(250); + atomic_set_32(&vap->iv_com_state, IEEE80211_COM_DETACHED); + while (MS(atomic_load_32(&vap->iv_com_state), IEEE80211_COM_REF) != 0) + pause("comref", sleep_time); +} +#undef MS + int ieee80211_node_dectestref(struct ieee80211_node *ni) { diff --git a/freebsd/sys/net80211/ieee80211_freebsd.h b/freebsd/sys/net80211/ieee80211_freebsd.h index 8395eb00..4e06b76a 100644 --- a/freebsd/sys/net80211/ieee80211_freebsd.h +++ b/freebsd/sys/net80211/ieee80211_freebsd.h @@ -38,6 +38,7 @@ #include <sys/rwlock.h> #include <sys/sysctl.h> #include <sys/taskqueue.h> +#include <sys/time.h> /* * Common state locking definitions. @@ -224,6 +225,11 @@ typedef struct mtx ieee80211_rt_lock_t; */ #include <machine/atomic.h> +struct ieee80211vap; +int ieee80211_com_vincref(struct ieee80211vap *); +void ieee80211_com_vdecref(struct ieee80211vap *); +void ieee80211_com_vdetach(struct ieee80211vap *); + #define ieee80211_node_initref(_ni) \ do { ((_ni)->ni_refcnt = 1); } while (0) #define ieee80211_node_incref(_ni) \ @@ -235,7 +241,6 @@ int ieee80211_node_dectestref(struct ieee80211_node *ni); #define ieee80211_node_refcnt(_ni) (_ni)->ni_refcnt struct ifqueue; -struct ieee80211vap; void ieee80211_drain_ifq(struct ifqueue *); void ieee80211_flush_ifq(struct ifqueue *, struct ieee80211vap *); @@ -245,9 +250,8 @@ void ieee80211_vap_destroy(struct ieee80211vap *); (((_ifp)->if_flags & IFF_UP) && \ ((_ifp)->if_drv_flags & IFF_DRV_RUNNING)) -/* XXX TODO: cap these at 1, as hz may not be 1000 */ -#define msecs_to_ticks(ms) (((ms)*hz)/1000) -#define ticks_to_msecs(t) (1000*(t) / hz) +#define msecs_to_ticks(ms) MSEC_2_TICKS(ms) +#define ticks_to_msecs(t) TICKS_2_MSEC(t) #define ticks_to_secs(t) ((t) / hz) #define ieee80211_time_after(a,b) ((long)(b) - (long)(a) < 0) diff --git a/freebsd/sys/net80211/ieee80211_hostap.c b/freebsd/sys/net80211/ieee80211_hostap.c index 0cc43748..2671547f 100644 --- a/freebsd/sys/net80211/ieee80211_hostap.c +++ b/freebsd/sys/net80211/ieee80211_hostap.c @@ -710,11 +710,9 @@ hostap_input(struct ieee80211_node *ni, struct mbuf *m, /* * Save QoS bits for use below--before we strip the header. */ - if (subtype == IEEE80211_FC0_SUBTYPE_QOS) { - qos = (dir == IEEE80211_FC1_DIR_DSTODS) ? - ((struct ieee80211_qosframe_addr4 *)wh)->i_qos[0] : - ((struct ieee80211_qosframe *)wh)->i_qos[0]; - } else + if (subtype == IEEE80211_FC0_SUBTYPE_QOS) + qos = ieee80211_getqos(wh)[0]; + else qos = 0; /* diff --git a/freebsd/sys/net80211/ieee80211_ht.c b/freebsd/sys/net80211/ieee80211_ht.c index c6a3a200..da294488 100644 --- a/freebsd/sys/net80211/ieee80211_ht.c +++ b/freebsd/sys/net80211/ieee80211_ht.c @@ -888,10 +888,7 @@ ieee80211_ampdu_reorder(struct ieee80211_node *ni, struct mbuf *m, if (IEEE80211_IS_MULTICAST(wh->i_addr1)) return PROCESS; - if (IEEE80211_IS_DSTODS(wh)) - tid = ((struct ieee80211_qosframe_addr4 *)wh)->i_qos[0]; - else - tid = wh->i_qos[0]; + tid = ieee80211_getqos(wh)[0]; tid &= IEEE80211_QOS_TID; rap = &ni->ni_rx_ampdu[tid]; if ((rap->rxa_flags & IEEE80211_AGGR_XCHGPEND) == 0) { @@ -1732,7 +1729,7 @@ ieee80211_ht_updateparams(struct ieee80211_node *ni, const struct ieee80211_ie_htinfo *htinfo; ieee80211_parse_htcap(ni, htcapie); - if (vap->iv_htcaps & IEEE80211_HTCAP_SMPS) + if (vap->iv_htcaps & IEEE80211_HTC_SMPS) htcap_update_mimo_ps(ni); htcap_update_shortgi(ni); htcap_update_ldpc(ni); @@ -1885,7 +1882,7 @@ ieee80211_ht_updatehtcap(struct ieee80211_node *ni, const uint8_t *htcapie) struct ieee80211vap *vap = ni->ni_vap; ieee80211_parse_htcap(ni, htcapie); - if (vap->iv_htcaps & IEEE80211_HTCAP_SMPS) + if (vap->iv_htcaps & IEEE80211_HTC_SMPS) htcap_update_mimo_ps(ni); htcap_update_shortgi(ni); htcap_update_ldpc(ni); diff --git a/freebsd/sys/net80211/ieee80211_hwmp.c b/freebsd/sys/net80211/ieee80211_hwmp.c index b8950c5e..681ff7b5 100644 --- a/freebsd/sys/net80211/ieee80211_hwmp.c +++ b/freebsd/sys/net80211/ieee80211_hwmp.c @@ -2017,6 +2017,7 @@ done: */ IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_HWMP, dest, "%s", "queue frame until path found"); + MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0); m->m_pkthdr.rcvif = (void *)(uintptr_t) ieee80211_mac_hash(ic, dest); /* XXX age chosen randomly */ diff --git a/freebsd/sys/net80211/ieee80211_ioctl.c b/freebsd/sys/net80211/ieee80211_ioctl.c index 52712514..0396e8be 100644 --- a/freebsd/sys/net80211/ieee80211_ioctl.c +++ b/freebsd/sys/net80211/ieee80211_ioctl.c @@ -2206,18 +2206,6 @@ ieee80211_ioctl_setregdomain(struct ieee80211vap *vap, } static int -ieee80211_ioctl_setroam(struct ieee80211vap *vap, - const struct ieee80211req *ireq) -{ - if (ireq->i_len != sizeof(vap->iv_roamparms)) - return EINVAL; - /* XXX validate params */ - /* XXX? ENETRESET to push to device? */ - return copyin(ireq->i_data, vap->iv_roamparms, - sizeof(vap->iv_roamparms)); -} - -static int checkrate(const struct ieee80211_rateset *rs, int rate) { int i; @@ -2247,6 +2235,73 @@ checkmcs(const struct ieee80211_htrateset *rs, int mcs) } static int +ieee80211_ioctl_setroam(struct ieee80211vap *vap, + const struct ieee80211req *ireq) +{ + struct ieee80211com *ic = vap->iv_ic; + struct ieee80211_roamparams_req *parms; + struct ieee80211_roamparam *src, *dst; + const struct ieee80211_htrateset *rs_ht; + const struct ieee80211_rateset *rs; + int changed, error, mode, is11n, nmodes; + + if (ireq->i_len != sizeof(vap->iv_roamparms)) + return EINVAL; + + parms = IEEE80211_MALLOC(sizeof(*parms), M_TEMP, + IEEE80211_M_NOWAIT | IEEE80211_M_ZERO); + if (parms == NULL) + return ENOMEM; + + error = copyin(ireq->i_data, parms, ireq->i_len); + if (error != 0) + goto fail; + + changed = 0; + nmodes = IEEE80211_MODE_MAX; + + /* validate parameters and check if anything changed */ + for (mode = IEEE80211_MODE_11A; mode < nmodes; mode++) { + if (isclr(ic->ic_modecaps, mode)) + continue; + src = &parms->params[mode]; + dst = &vap->iv_roamparms[mode]; + rs = &ic->ic_sup_rates[mode]; /* NB: 11n maps to legacy */ + rs_ht = &ic->ic_sup_htrates; + is11n = (mode == IEEE80211_MODE_11NA || + mode == IEEE80211_MODE_11NG); + /* XXX TODO: 11ac */ + if (src->rate != dst->rate) { + if (!checkrate(rs, src->rate) && + (!is11n || !checkmcs(rs_ht, src->rate))) { + error = EINVAL; + goto fail; + } + changed++; + } + if (src->rssi != dst->rssi) + changed++; + } + if (changed) { + /* + * Copy new parameters in place and notify the + * driver so it can push state to the device. + */ + /* XXX locking? */ + for (mode = IEEE80211_MODE_11A; mode < nmodes; mode++) { + if (isset(ic->ic_modecaps, mode)) + vap->iv_roamparms[mode] = parms->params[mode]; + } + + if (vap->iv_roaming == IEEE80211_ROAMING_DEVICE) + error = ERESTART; + } + +fail: IEEE80211_FREE(parms, M_TEMP); + return error; +} + +static int ieee80211_ioctl_settxparams(struct ieee80211vap *vap, const struct ieee80211req *ireq) { @@ -2517,20 +2572,12 @@ ieee80211_scanreq(struct ieee80211vap *vap, struct ieee80211_scan_req *sr) sr->sr_duration > IEEE80211_IOC_SCAN_DURATION_MAX) return EINVAL; sr->sr_duration = msecs_to_ticks(sr->sr_duration); - if (sr->sr_duration < 1) - sr->sr_duration = 1; } /* convert min/max channel dwell */ - if (sr->sr_mindwell != 0) { + if (sr->sr_mindwell != 0) sr->sr_mindwell = msecs_to_ticks(sr->sr_mindwell); - if (sr->sr_mindwell < 1) - sr->sr_mindwell = 1; - } - if (sr->sr_maxdwell != 0) { + if (sr->sr_maxdwell != 0) sr->sr_maxdwell = msecs_to_ticks(sr->sr_maxdwell); - if (sr->sr_maxdwell < 1) - sr->sr_maxdwell = 1; - } /* NB: silently reduce ssid count to what is supported */ if (sr->sr_nssid > IEEE80211_SCAN_MAX_SSID) sr->sr_nssid = IEEE80211_SCAN_MAX_SSID; @@ -3482,10 +3529,14 @@ ieee80211_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct ieee80211vap *vap = ifp->if_softc; struct ieee80211com *ic = vap->iv_ic; - int error = 0, wait = 0; + int error = 0, wait = 0, ic_used; struct ifreq *ifr; struct ifaddr *ifa; /* XXX */ + ic_used = (cmd != SIOCSIFMTU && cmd != SIOCG80211STATS); + if (ic_used && (error = ieee80211_com_vincref(vap)) != 0) + return (error); + switch (cmd) { case SIOCSIFFLAGS: IEEE80211_LOCK(ic); @@ -3539,9 +3590,13 @@ ieee80211_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) /* * Check if the MAC address was changed * via SIOCSIFLLADDR ioctl. + * + * NB: device may be detached during initialization; + * use if_ioctl for existence check. */ if_addr_rlock(ifp); - if ((ifp->if_flags & IFF_UP) == 0 && + if (ifp->if_ioctl == ieee80211_ioctl && + (ifp->if_flags & IFF_UP) == 0 && !IEEE80211_ADDR_EQ(vap->iv_myaddr, IF_LLADDR(ifp))) IEEE80211_ADDR_COPY(vap->iv_myaddr, IF_LLADDR(ifp)); @@ -3618,5 +3673,9 @@ ieee80211_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) error = ether_ioctl(ifp, cmd, data); break; } + + if (ic_used) + ieee80211_com_vdecref(vap); + return (error); } diff --git a/freebsd/sys/net80211/ieee80211_mesh.c b/freebsd/sys/net80211/ieee80211_mesh.c index f747c214..5a83b630 100644 --- a/freebsd/sys/net80211/ieee80211_mesh.c +++ b/freebsd/sys/net80211/ieee80211_mesh.c @@ -1227,6 +1227,7 @@ mesh_forward(struct ieee80211vap *vap, struct mbuf *m, M_WME_SETAC(mcopy, WME_AC_BE); /* XXX do we know m_nextpkt is NULL? */ + MPASS((mcopy->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0); mcopy->m_pkthdr.rcvif = (void *) ni; /* @@ -1657,12 +1658,7 @@ mesh_input(struct ieee80211_node *ni, struct mbuf *m, * in the Mesh Control field and a 3 address qos frame * is used. */ - if (IEEE80211_IS_DSTODS(wh)) - *(uint16_t *)qos = *(uint16_t *) - ((struct ieee80211_qosframe_addr4 *)wh)->i_qos; - else - *(uint16_t *)qos = *(uint16_t *) - ((struct ieee80211_qosframe *)wh)->i_qos; + *(uint16_t *)qos = *(uint16_t *)ieee80211_getqos(wh); /* * NB: The mesh STA sets the Mesh Control Present diff --git a/freebsd/sys/net80211/ieee80211_output.c b/freebsd/sys/net80211/ieee80211_output.c index 06fca965..57ea67d3 100644 --- a/freebsd/sys/net80211/ieee80211_output.c +++ b/freebsd/sys/net80211/ieee80211_output.c @@ -165,6 +165,7 @@ ieee80211_vap_pkt_send_dest(struct ieee80211vap *vap, struct mbuf *m, * uses any existing value for rcvif to identify the * interface it (might have been) received on. */ + MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0); m->m_pkthdr.rcvif = (void *)ni; mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1: 0; @@ -530,6 +531,7 @@ ieee80211_raw_output(struct ieee80211vap *vap, struct ieee80211_node *ni, * that the mbuf has the same node value that * it would if it were going via the normal path. */ + MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0); m->m_pkthdr.rcvif = (void *)ni; /* @@ -606,6 +608,97 @@ ieee80211_validate_frame(struct mbuf *m, return (0); } +static int +ieee80211_validate_rate(struct ieee80211_node *ni, uint8_t rate) +{ + struct ieee80211com *ic = ni->ni_ic; + + if (IEEE80211_IS_HT_RATE(rate)) { + if ((ic->ic_htcaps & IEEE80211_HTC_HT) == 0) + return (EINVAL); + + rate = IEEE80211_RV(rate); + if (rate <= 31) { + if (rate > ic->ic_txstream * 8 - 1) + return (EINVAL); + + return (0); + } + + if (rate == 32) { + if ((ic->ic_htcaps & IEEE80211_HTC_TXMCS32) == 0) + return (EINVAL); + + return (0); + } + + if ((ic->ic_htcaps & IEEE80211_HTC_TXUNEQUAL) == 0) + return (EINVAL); + + switch (ic->ic_txstream) { + case 0: + case 1: + return (EINVAL); + case 2: + if (rate > 38) + return (EINVAL); + + return (0); + case 3: + if (rate > 52) + return (EINVAL); + + return (0); + case 4: + default: + if (rate > 76) + return (EINVAL); + + return (0); + } + } + + if (!ieee80211_isratevalid(ic->ic_rt, rate)) + return (EINVAL); + + return (0); +} + +static int +ieee80211_sanitize_rates(struct ieee80211_node *ni, struct mbuf *m, + const struct ieee80211_bpf_params *params) +{ + int error; + + if (!params) + return (0); /* nothing to do */ + + /* NB: most drivers assume that ibp_rate0 is set (!= 0). */ + if (params->ibp_rate0 != 0) { + error = ieee80211_validate_rate(ni, params->ibp_rate0); + if (error != 0) + return (error); + } else { + /* XXX pre-setup some default (e.g., mgmt / mcast) rate */ + /* XXX __DECONST? */ + (void) m; + } + + if (params->ibp_rate1 != 0 && + (error = ieee80211_validate_rate(ni, params->ibp_rate1)) != 0) + return (error); + + if (params->ibp_rate2 != 0 && + (error = ieee80211_validate_rate(ni, params->ibp_rate2)) != 0) + return (error); + + if (params->ibp_rate3 != 0 && + (error = ieee80211_validate_rate(ni, params->ibp_rate3)) != 0) + return (error); + + return (0); +} + /* * 802.11 output routine. This is (currently) used only to * connect bpf write calls to the 802.11 layer for injecting @@ -720,6 +813,10 @@ ieee80211_output(struct ifnet *ifp, struct mbuf *m, } else M_WME_SETAC(m, WME_AC_BE); + error = ieee80211_sanitize_rates(ni, m, params); + if (error != 0) + senderr(error); + IEEE80211_NODE_STAT(ni, tx_data); if (IEEE80211_IS_MULTICAST(wh->i_addr1)) { IEEE80211_NODE_STAT(ni, tx_mcast); @@ -1700,7 +1797,6 @@ ieee80211_encap(struct ieee80211vap *vap, struct ieee80211_node *ni, * capability; this may also change when we pull * aggregation up into net80211 */ - seqno = ni->ni_txseqs[tid]++; *(uint16_t *)wh->i_seq = htole16(seqno << IEEE80211_SEQ_SEQ_SHIFT); M_SEQNO_SET(m, seqno); @@ -1856,14 +1952,8 @@ ieee80211_fragment(struct ieee80211vap *vap, struct mbuf *m0, whf = mtod(m, struct ieee80211_frame *); memcpy(whf, wh, hdrsize); #ifdef IEEE80211_SUPPORT_MESH - if (vap->iv_opmode == IEEE80211_M_MBSS) { - if (IEEE80211_IS_DSTODS(wh)) - ((struct ieee80211_qosframe_addr4 *) - whf)->i_qos[1] &= ~IEEE80211_QOS_MC; - else - ((struct ieee80211_qosframe *) - whf)->i_qos[1] &= ~IEEE80211_QOS_MC; - } + if (vap->iv_opmode == IEEE80211_M_MBSS) + ieee80211_getqos(wh)[1] &= ~IEEE80211_QOS_MC; #endif *(uint16_t *)&whf->i_seq[0] |= htole16( (fragno & IEEE80211_SEQ_FRAG_MASK) << diff --git a/freebsd/sys/net80211/ieee80211_proto.c b/freebsd/sys/net80211/ieee80211_proto.c index 129e11e2..c62d0875 100644 --- a/freebsd/sys/net80211/ieee80211_proto.c +++ b/freebsd/sys/net80211/ieee80211_proto.c @@ -349,6 +349,9 @@ ieee80211_proto_vattach(struct ieee80211vap *vap) * driver and/or user applications. */ for (i = IEEE80211_MODE_11A; i < IEEE80211_MODE_MAX; i++) { + if (isclr(ic->ic_modecaps, i)) + continue; + const struct ieee80211_rateset *rs = &ic->ic_sup_rates[i]; vap->iv_txparms[i].ucastrate = IEEE80211_FIXED_RATE_NONE; diff --git a/freebsd/sys/net80211/ieee80211_proto.h b/freebsd/sys/net80211/ieee80211_proto.h index c1637c57..717de30d 100644 --- a/freebsd/sys/net80211/ieee80211_proto.h +++ b/freebsd/sys/net80211/ieee80211_proto.h @@ -303,6 +303,22 @@ void ieee80211_wme_ic_getparams(struct ieee80211com *ic, int ieee80211_wme_vap_ac_is_noack(struct ieee80211vap *vap, int ac); /* + * Return pointer to the QoS field from a Qos frame. + */ +static __inline uint8_t * +ieee80211_getqos(void *data) +{ + struct ieee80211_frame *wh = data; + + KASSERT(IEEE80211_QOS_HAS_SEQ(wh), ("QoS field is absent!")); + + if (IEEE80211_IS_DSTODS(wh)) + return (((struct ieee80211_qosframe_addr4 *)wh)->i_qos); + else + return (((struct ieee80211_qosframe *)wh)->i_qos); +} + +/* * Return the WME TID from a QoS frame. If no TID * is present return the index for the "non-QoS" entry. */ diff --git a/freebsd/sys/net80211/ieee80211_rssadapt.c b/freebsd/sys/net80211/ieee80211_rssadapt.c index f05af298..52f81a99 100644 --- a/freebsd/sys/net80211/ieee80211_rssadapt.c +++ b/freebsd/sys/net80211/ieee80211_rssadapt.c @@ -119,12 +119,13 @@ static void rssadapt_setinterval(const struct ieee80211vap *vap, int msecs) { struct ieee80211_rssadapt *rs = vap->iv_rs; - int t; + + if (!rs) + return; if (msecs < 100) msecs = 100; - t = msecs_to_ticks(msecs); - rs->interval = (t < 1) ? 1 : t; + rs->interval = msecs_to_ticks(msecs); } static void @@ -179,6 +180,12 @@ rssadapt_node_init(struct ieee80211_node *ni) struct ieee80211_rssadapt *rsa = vap->iv_rs; const struct ieee80211_rateset *rs = &ni->ni_rates; + if (!rsa) { + if_printf(vap->iv_ifp, "ratectl structure was not allocated, " + "per-node structure allocation skipped\n"); + return; + } + if (ni->ni_rctls == NULL) { ni->ni_rctls = ra = IEEE80211_MALLOC(sizeof(struct ieee80211_rssadapt_node), @@ -233,10 +240,18 @@ rssadapt_rate(struct ieee80211_node *ni, void *arg __unused, uint32_t iarg) { struct ieee80211_rssadapt_node *ra = ni->ni_rctls; u_int pktlen = iarg; - const struct ieee80211_rateset *rs = &ra->ra_rates; + const struct ieee80211_rateset *rs; uint16_t (*thrs)[IEEE80211_RATE_SIZE]; int rix, rssi; + /* XXX should return -1 here, but drivers may not expect this... */ + if (!ra) + { + ni->ni_txrate = ni->ni_rates.rs_rates[0]; + return 0; + } + + rs = &ra->ra_rates; if ((ticks - ra->ra_ticks) > ra->ra_rs->interval) { rssadapt_updatestats(ra); ra->ra_ticks = ticks; @@ -322,6 +337,9 @@ rssadapt_tx_complete(const struct ieee80211_node *ni, struct ieee80211_rssadapt_node *ra = ni->ni_rctls; int pktlen, rssi; + if (!ra) + return; + if ((status->flags & (IEEE80211_RATECTL_STATUS_PKTLEN|IEEE80211_RATECTL_STATUS_RSSI)) != (IEEE80211_RATECTL_STATUS_PKTLEN|IEEE80211_RATECTL_STATUS_RSSI)) @@ -346,9 +364,12 @@ rssadapt_sysctl_interval(SYSCTL_HANDLER_ARGS) { struct ieee80211vap *vap = arg1; struct ieee80211_rssadapt *rs = vap->iv_rs; - int msecs = ticks_to_msecs(rs->interval); - int error; + int msecs, error; + + if (!rs) + return ENOMEM; + msecs = ticks_to_msecs(rs->interval); error = sysctl_handle_int(oidp, &msecs, 0, req); if (error || !req->newptr) return error; diff --git a/freebsd/sys/net80211/ieee80211_scan.c b/freebsd/sys/net80211/ieee80211_scan.c index 9b58ff98..c6f03fec 100644 --- a/freebsd/sys/net80211/ieee80211_scan.c +++ b/freebsd/sys/net80211/ieee80211_scan.c @@ -132,13 +132,21 @@ void ieee80211_scan_vattach(struct ieee80211vap *vap) { struct ieee80211com *ic = vap->iv_ic; + int m; vap->iv_bgscanidle = (IEEE80211_BGSCAN_IDLE_DEFAULT*1000)/hz; vap->iv_bgscanintvl = IEEE80211_BGSCAN_INTVAL_DEFAULT*hz; vap->iv_scanvalid = IEEE80211_SCAN_VALID_DEFAULT*hz; vap->iv_roaming = IEEE80211_ROAMING_AUTO; - memcpy(vap->iv_roamparms, defroam, sizeof(defroam)); + + memset(vap->iv_roamparms, 0, sizeof(vap->iv_roamparms)); + for (m = IEEE80211_MODE_AUTO + 1; m < IEEE80211_MODE_MAX; m++) { + if (isclr(ic->ic_modecaps, m)) + continue; + + memcpy(&vap->iv_roamparms[m], &defroam[m], sizeof(defroam[m])); + } ic->ic_scan_methods->sc_vattach(vap); } @@ -296,7 +304,7 @@ ieee80211_scan_dump(struct ieee80211_scan_state *ss) if_printf(vap->iv_ifp, "scan set "); ieee80211_scan_dump_channels(ss); - printf(" dwell min %lums max %lums\n", + printf(" dwell min %ums max %ums\n", ticks_to_msecs(ss->ss_mindwell), ticks_to_msecs(ss->ss_maxdwell)); } #endif /* IEEE80211_DEBUG */ diff --git a/freebsd/sys/net80211/ieee80211_scan_sta.c b/freebsd/sys/net80211/ieee80211_scan_sta.c index a7a1fc29..8dda8499 100644 --- a/freebsd/sys/net80211/ieee80211_scan_sta.c +++ b/freebsd/sys/net80211/ieee80211_scan_sta.c @@ -474,6 +474,8 @@ static const u_int chanflags[IEEE80211_MODE_MAX] = { /* check legacy */ [IEEE80211_MODE_11NA] = IEEE80211_CHAN_A, [IEEE80211_MODE_11NG] = IEEE80211_CHAN_G, + [IEEE80211_MODE_VHT_5GHZ] = IEEE80211_CHAN_A, + [IEEE80211_MODE_VHT_2GHZ] = IEEE80211_CHAN_G, }; static void @@ -496,12 +498,15 @@ add_channels(struct ieee80211vap *vap, if (c == NULL || isexcluded(vap, c)) continue; if (mode == IEEE80211_MODE_AUTO) { + KASSERT(IEEE80211_IS_CHAN_B(c), + ("%s: wrong channel for 'auto' mode %u / %u\n", + __func__, c->ic_freq, c->ic_flags)); + /* * XXX special-case 11b/g channels so we select * the g channel if both are present. */ - if (IEEE80211_IS_CHAN_B(c) && - (cg = find11gchannel(ic, i, c->ic_freq)) != NULL) + if ((cg = find11gchannel(ic, i, c->ic_freq)) != NULL) c = cg; } ss->ss_chans[ss->ss_last++] = c; @@ -620,32 +625,48 @@ makescanlist(struct ieee80211_scan_state *ss, struct ieee80211vap *vap, */ for (scan = table; scan->list != NULL; scan++) { mode = scan->mode; - if (vap->iv_des_mode != IEEE80211_MODE_AUTO) { + + switch (mode) { + case IEEE80211_MODE_11B: + if (vap->iv_des_mode == IEEE80211_MODE_11B) + break; + /* - * If a desired mode was specified, scan only - * channels that satisfy that constraint. + * The scan table marks 2.4Ghz channels as b + * so if the desired mode is 11g / 11ng / 11acg, + * then use the 11b channel list but upgrade the mode. + * + * NB: 11b -> AUTO lets add_channels upgrade an + * 11b channel to 11g if available. */ - if (vap->iv_des_mode != mode) { - /* - * The scan table marks 2.4Ghz channels as b - * so if the desired mode is 11g, then use - * the 11b channel list but upgrade the mode. - */ - if (vap->iv_des_mode == IEEE80211_MODE_11G) { - if (mode == IEEE80211_MODE_11G) /* Skip the G check */ - continue; - else if (mode == IEEE80211_MODE_11B) - mode = IEEE80211_MODE_11G; /* upgrade */ - } + if (vap->iv_des_mode == IEEE80211_MODE_AUTO || + vap->iv_des_mode == IEEE80211_MODE_11G || + vap->iv_des_mode == IEEE80211_MODE_11NG || + vap->iv_des_mode == IEEE80211_MODE_VHT_2GHZ) { + mode = vap->iv_des_mode; + break; } - } else { + + continue; + case IEEE80211_MODE_11A: + /* Use 11a channel list for 11na / 11ac modes */ + if (vap->iv_des_mode == IEEE80211_MODE_11NA || + vap->iv_des_mode == IEEE80211_MODE_VHT_5GHZ) { + mode = vap->iv_des_mode; + break; + } + + /* FALLTHROUGH */ + default: /* - * This lets add_channels upgrade an 11b channel - * to 11g if available. + * If a desired mode was specified, scan only + * channels that satisfy that constraint. */ - if (mode == IEEE80211_MODE_11B) - mode = IEEE80211_MODE_AUTO; + if (vap->iv_des_mode != IEEE80211_MODE_AUTO && + vap->iv_des_mode != mode) + continue; } + #ifdef IEEE80211_F_XR /* XR does not operate on turbo channels */ if ((vap->iv_flags & IEEE80211_F_XR) && @@ -1335,12 +1356,14 @@ sta_roam_check(struct ieee80211_scan_state *ss, struct ieee80211vap *vap) mode = ieee80211_chan2mode(ic->ic_bsschan); roamRate = vap->iv_roamparms[mode].rate; roamRssi = vap->iv_roamparms[mode].rssi; + KASSERT(roamRate != 0 && roamRssi != 0, ("iv_roamparms are not" + "initialized for %s mode!", ieee80211_phymode_name[mode])); + ucastRate = vap->iv_txparms[mode].ucastrate; /* NB: the most up to date rssi is in the node, not the scan cache */ curRssi = ic->ic_node_getrssi(ni); if (ucastRate == IEEE80211_FIXED_RATE_NONE) { curRate = ni->ni_txrate; - roamRate &= IEEE80211_RATE_VAL; IEEE80211_DPRINTF(vap, IEEE80211_MSG_ROAM, "%s: currssi %d currate %u roamrssi %d roamrate %u\n", __func__, curRssi, curRate, roamRssi, roamRate); diff --git a/freebsd/sys/net80211/ieee80211_sta.c b/freebsd/sys/net80211/ieee80211_sta.c index 1993f566..3dc9ee16 100644 --- a/freebsd/sys/net80211/ieee80211_sta.c +++ b/freebsd/sys/net80211/ieee80211_sta.c @@ -788,11 +788,9 @@ sta_input(struct ieee80211_node *ni, struct mbuf *m, /* * Save QoS bits for use below--before we strip the header. */ - if (subtype == IEEE80211_FC0_SUBTYPE_QOS) { - qos = (dir == IEEE80211_FC1_DIR_DSTODS) ? - ((struct ieee80211_qosframe_addr4 *)wh)->i_qos[0] : - ((struct ieee80211_qosframe *)wh)->i_qos[0]; - } else + if (subtype == IEEE80211_FC0_SUBTYPE_QOS) + qos = ieee80211_getqos(wh)[0]; + else qos = 0; /* diff --git a/freebsd/sys/net80211/ieee80211_tdma.c b/freebsd/sys/net80211/ieee80211_tdma.c index b18803fc..c675af9b 100644 --- a/freebsd/sys/net80211/ieee80211_tdma.c +++ b/freebsd/sys/net80211/ieee80211_tdma.c @@ -129,6 +129,9 @@ static int tdma_process_params(struct ieee80211_node *ni, static void settxparms(struct ieee80211vap *vap, enum ieee80211_phymode mode, int rate) { + if (isclr(vap->iv_ic->ic_modecaps, mode)) + return; + vap->iv_txparms[mode].ucastrate = rate; vap->iv_txparms[mode].mcastrate = rate; } diff --git a/freebsd/sys/net80211/ieee80211_var.h b/freebsd/sys/net80211/ieee80211_var.h index ee17c806..24ffbe10 100644 --- a/freebsd/sys/net80211/ieee80211_var.h +++ b/freebsd/sys/net80211/ieee80211_var.h @@ -400,6 +400,7 @@ struct ieee80211vap { uint32_t iv_caps; /* capabilities */ uint32_t iv_htcaps; /* HT capabilities */ uint32_t iv_htextcaps; /* HT extended capabilities */ + uint32_t iv_com_state; /* com usage / detached flag */ enum ieee80211_opmode iv_opmode; /* operation mode */ enum ieee80211_state iv_state; /* state machine state */ enum ieee80211_state iv_nstate; /* pending state */ @@ -685,6 +686,12 @@ MALLOC_DECLARE(M_80211_VAP); #define IEEE80211_VFHT_BITS \ "\20\1VHT\2VHT40\3VHT80\4VHT80P80\5VHT160" +#define IEEE80211_COM_DETACHED 0x00000001 /* ieee80211_ifdetach called */ +#define IEEE80211_COM_REF_ADD 0x00000002 /* add / remove reference */ +#define IEEE80211_COM_REF_M 0xfffffffe /* reference counter bits */ +#define IEEE80211_COM_REF_S 1 +#define IEEE80211_COM_REF_MAX (IEEE80211_COM_REF_M >> IEEE80211_COM_REF_S) + int ic_printf(struct ieee80211com *, const char *, ...) __printflike(2, 3); void ieee80211_ifattach(struct ieee80211com *); void ieee80211_ifdetach(struct ieee80211com *); @@ -727,6 +734,8 @@ uint32_t ieee80211_get_channel_center_freq1(const struct ieee80211_channel *); uint32_t ieee80211_get_channel_center_freq2(const struct ieee80211_channel *); int ieee80211_add_channel_list_2ghz(struct ieee80211_channel[], int, int *, const uint8_t[], int, const uint8_t[], int); +int ieee80211_add_channels_default_2ghz(struct ieee80211_channel[], int, + int *, const uint8_t[], int); int ieee80211_add_channel_list_5ghz(struct ieee80211_channel[], int, int *, const uint8_t[], int, const uint8_t[], int); struct ieee80211_channel *ieee80211_find_channel(struct ieee80211com *, diff --git a/freebsd/sys/net80211/ieee80211_wds.c b/freebsd/sys/net80211/ieee80211_wds.c index 1bceacef..3b57c6a3 100644 --- a/freebsd/sys/net80211/ieee80211_wds.c +++ b/freebsd/sys/net80211/ieee80211_wds.c @@ -301,6 +301,7 @@ ieee80211_dwds_mcast(struct ieee80211vap *vap0, struct mbuf *m) continue; } mcopy->m_flags |= M_MCAST; + MPASS((mcopy->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0); mcopy->m_pkthdr.rcvif = (void *) ni; err = ieee80211_parent_xmitpkt(ic, mcopy); @@ -334,6 +335,7 @@ ieee80211_dwds_discover(struct ieee80211_node *ni, struct mbuf *m) * XXX handle overflow? * XXX per/vap beacon interval? */ + MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0); m->m_pkthdr.rcvif = (void *)(uintptr_t) ieee80211_mac_hash(ic, ni->ni_macaddr); (void) ieee80211_ageq_append(&ic->ic_stageq, m, @@ -585,11 +587,9 @@ wds_input(struct ieee80211_node *ni, struct mbuf *m, /* * Save QoS bits for use below--before we strip the header. */ - if (subtype == IEEE80211_FC0_SUBTYPE_QOS) { - qos = (dir == IEEE80211_FC1_DIR_DSTODS) ? - ((struct ieee80211_qosframe_addr4 *)wh)->i_qos[0] : - ((struct ieee80211_qosframe *)wh)->i_qos[0]; - } else + if (subtype == IEEE80211_FC0_SUBTYPE_QOS) + qos = ieee80211_getqos(wh)[0]; + else qos = 0; /* diff --git a/freebsd/sys/net80211/ieee80211_wps.h b/freebsd/sys/net80211/ieee80211_wps.h new file mode 100644 index 00000000..32cc667e --- /dev/null +++ b/freebsd/sys/net80211/ieee80211_wps.h @@ -0,0 +1,149 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2017 J.R. Oldroyd, Open Advisors Limited + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ +#ifndef _NET80211_IEEE80211_WPS_H_ +#define _NET80211_IEEE80211_WPS_H_ + +/* + * 802.11 WPS implementation definitions. + */ + +#define IEEE80211_WPS_ATTR_AP_CHANNEL 0x1001 +#define IEEE80211_WPS_ATTR_ASSOC_STATE 0x1002 +#define IEEE80211_WPS_ATTR_AUTH_TYPE 0x1003 +#define IEEE80211_WPS_ATTR_AUTH_TYPE_FLAGS 0x1004 +#define IEEE80211_WPS_ATTR_AUTHENTICATOR 0x1005 +#define IEEE80211_WPS_ATTR_CONFIG_METHODS 0x1008 +#define IEEE80211_WPS_ATTR_CONFIG_ERROR 0x1009 +#define IEEE80211_WPS_ATTR_CONFIRM_URL4 0x100a +#define IEEE80211_WPS_ATTR_CONFIRM_URL6 0x100b +#define IEEE80211_WPS_ATTR_CONN_TYPE 0x100c +#define IEEE80211_WPS_ATTR_CONN_TYPE_FLAGS 0x100d +#define IEEE80211_WPS_ATTR_CRED 0x100e +#define IEEE80211_WPS_ATTR_ENCR_TYPE 0x100f +#define IEEE80211_WPS_ATTR_ENCR_TYPE_FLAGS 0x1010 +#define IEEE80211_WPS_ATTR_DEV_NAME 0x1011 +#define IEEE80211_WPS_ATTR_DEV_PASSWORD_ID 0x1012 +#define IEEE80211_WPS_ATTR_E_HASH1 0x1014 +#define IEEE80211_WPS_ATTR_E_HASH2 0x1015 +#define IEEE80211_WPS_ATTR_E_SNONCE1 0x1016 +#define IEEE80211_WPS_ATTR_E_SNONCE2 0x1017 +#define IEEE80211_WPS_ATTR_ENCR_SETTINGS 0x1018 +#define IEEE80211_WPS_ATTR_ENROLLEE_NONCE 0x101a +#define IEEE80211_WPS_ATTR_FEATURE_ID 0x101b +#define IEEE80211_WPS_ATTR_IDENTITY 0x101c +#define IEEE80211_WPS_ATTR_IDENTITY_PROOF 0x101d +#define IEEE80211_WPS_ATTR_KEY_WRAP_AUTH 0x101e +#define IEEE80211_WPS_ATTR_KEY_ID 0x101f +#define IEEE80211_WPS_ATTR_MAC_ADDR 0x1020 +#define IEEE80211_WPS_ATTR_MANUFACTURER 0x1021 +#define IEEE80211_WPS_ATTR_MSG_TYPE 0x1022 +#define IEEE80211_WPS_ATTR_MODEL_NAME 0x1023 +#define IEEE80211_WPS_ATTR_MODEL_NUMBER 0x1024 +#define IEEE80211_WPS_ATTR_NETWORK_INDEX 0x1026 +#define IEEE80211_WPS_ATTR_NETWORK_KEY 0x1027 +#define IEEE80211_WPS_ATTR_NETWORK_KEY_INDEX 0x1028 +#define IEEE80211_WPS_ATTR_NEW_DEVICE_NAME 0x1029 +#define IEEE80211_WPS_ATTR_NEW_PASSWORD 0x102a +#define IEEE80211_WPS_ATTR_OOB_DEVICE_PASSWORD 0x102c +#define IEEE80211_WPS_ATTR_OS_VERSION 0x102d +#define IEEE80211_WPS_ATTR_POWER_LEVEL 0x102f +#define IEEE80211_WPS_ATTR_PSK_CURRENT 0x1030 +#define IEEE80211_WPS_ATTR_PSK_MAX 0x1031 +#define IEEE80211_WPS_ATTR_PUBLIC_KEY 0x1032 +#define IEEE80211_WPS_ATTR_RADIO_ENABLE 0x1033 +#define IEEE80211_WPS_ATTR_REBOOT 0x1034 +#define IEEE80211_WPS_ATTR_REGISTRAR_CURRENT 0x1035 +#define IEEE80211_WPS_ATTR_REGISTRAR_ESTBLSHD 0x1036 +#define IEEE80211_WPS_ATTR_REGISTRAR_LIST 0x1037 +#define IEEE80211_WPS_ATTR_REGISTRAR_MAX 0x1038 +#define IEEE80211_WPS_ATTR_REGISTRAR_NONCE 0x1039 +#define IEEE80211_WPS_ATTR_REQUEST_TYPE 0x103a +#define IEEE80211_WPS_ATTR_RESPONSE_TYPE 0x103b +#define IEEE80211_WPS_ATTR_RF_BANDS 0x103c +#define IEEE80211_WPS_ATTR_R_HASH1 0x103d +#define IEEE80211_WPS_ATTR_R_HASH2 0x103e +#define IEEE80211_WPS_ATTR_R_SNONCE1 0x103f +#define IEEE80211_WPS_ATTR_R_SNONCE2 0x1040 +#define IEEE80211_WPS_ATTR_SELECTED_REGISTRAR 0x1041 +#define IEEE80211_WPS_ATTR_SERIAL_NUMBER 0x1042 +#define IEEE80211_WPS_ATTR_WPS_STATE 0x1044 +#define IEEE80211_WPS_ATTR_SSID 0x1045 +#define IEEE80211_WPS_ATTR_TOTAL_NETWORKS 0x1046 +#define IEEE80211_WPS_ATTR_UUID_E 0x1047 +#define IEEE80211_WPS_ATTR_UUID_R 0x1048 +#define IEEE80211_WPS_ATTR_VENDOR_EXT 0x1049 +#define IEEE80211_WPS_ATTR_VERSION 0x104a +#define IEEE80211_WPS_ATTR_X509_CERT_REQ 0x104b +#define IEEE80211_WPS_ATTR_X509_CERT 0x104c +#define IEEE80211_WPS_ATTR_EAP_IDENTITY 0x104d +#define IEEE80211_WPS_ATTR_MSG_COUNTER 0x104e +#define IEEE80211_WPS_ATTR_PUBKEY_HASH 0x104f +#define IEEE80211_WPS_ATTR_REKEY_KEY 0x1050 +#define IEEE80211_WPS_ATTR_KEY_LIFETIME 0x1051 +#define IEEE80211_WPS_ATTR_PERMITTED_CONFIG_METHODS 0x1052 +#define IEEE80211_WPS_ATTR_SELECTED_REGISTRAR_CONFIG_METHODS 0x1053 +#define IEEE80211_WPS_ATTR_PRIMARY_DEV_TYPE 0x1054 +#define IEEE80211_WPS_ATTR_SECONDARY_DEV_TYPE_LIST 0x1055 +#define IEEE80211_WPS_ATTR_PORTABLE_DEV 0x1056 +#define IEEE80211_WPS_ATTR_AP_SETUP_LOCKED 0x1057 +#define IEEE80211_WPS_ATTR_APPLICATION_EXT 0x1058 +#define IEEE80211_WPS_ATTR_EAP_TYPE 0x1059 +#define IEEE80211_WPS_ATTR_IV 0x1060 +#define IEEE80211_WPS_ATTR_KEY_PROVIDED_AUTO 0x1061 +#define IEEE80211_WPS_ATTR_802_1X_ENABLED 0x1062 +#define IEEE80211_WPS_ATTR_AP_SESSION_KEY 0x1063 +#define IEEE80211_WPS_ATTR_WEP_TRANSMIT_KEY 0x1064 +#define IEEE80211_WPS_ATTR_REQUESTED_DEV_TYPE 0x106a +#define IEEE80211_WPS_ATTR_EXTENSIBILITY_TEST 0x10fa /* _NOT_ defined in the spec */ + +/* RF bands bitmask */ +#define IEEE80211_WPS_RF_BAND_24GHZ 0x01 +#define IEEE80211_WPS_RF_BAND_50GHZ 0x02 +#define IEEE80211_WPS_RF_BAND_600GHZ 0x04 + +/* Config methods bitmask */ +#define IEEE80211_WPS_CONFIG_USBA 0x0001 +#define IEEE80211_WPS_CONFIG_ETHERNET 0x0002 +#define IEEE80211_WPS_CONFIG_LABEL 0x0004 +#define IEEE80211_WPS_CONFIG_DISPLAY 0x0008 +#define IEEE80211_WPS_CONFIG_EXT_NFC_TOKEN 0x0010 +#define IEEE80211_WPS_CONFIG_INT_NFC_TOKEN 0x0020 +#define IEEE80211_WPS_CONFIG_NFC_INTERFACE 0x0040 +#define IEEE80211_WPS_CONFIG_PUSHBUTTON 0x0080 +#define IEEE80211_WPS_CONFIG_KEYPAD 0x0100 +#define IEEE80211_WPS_CONFIG_VIRT_PUSHBUTTON 0x0200 +#define IEEE80211_WPS_CONFIG_PHY_PUSHBUTTON 0x0400 +#define IEEE80211_WPS_CONFIG_P2PS 0x1000 +#define IEEE80211_WPS_CONFIG_VIRT_DISPLAY 0x2000 +#define IEEE80211_WPS_CONFIG_PHY_DISPLAY 0x4000 + +/* Wi-Fi Protected Setup state */ +#define IEEE80211_WPS_STATE_NOT_CONFIGURED 0x01 +#define IEEE80211_WPS_STATE_CONFIGURED 0x02 +#endif /* _NET80211_IEEE80211_WPS_H_ */ diff --git a/freebsd/sys/netinet/cc/cc_newreno.c b/freebsd/sys/netinet/cc/cc_newreno.c index b1307c92..e1993664 100644 --- a/freebsd/sys/netinet/cc/cc_newreno.c +++ b/freebsd/sys/netinet/cc/cc_newreno.c @@ -201,7 +201,7 @@ newreno_ack_received(struct cc_var *ccv, uint16_t type) static void newreno_after_idle(struct cc_var *ccv) { - int rw; + uint32_t rw; /* * If we've been idle for more than one retransmit timeout the old @@ -216,11 +216,7 @@ newreno_after_idle(struct cc_var *ccv) * * See RFC5681 Section 4.1. "Restarting Idle Connections". */ - if (V_tcp_do_rfc3390) - rw = min(4 * CCV(ccv, t_maxseg), - max(2 * CCV(ccv, t_maxseg), 4380)); - else - rw = CCV(ccv, t_maxseg) * 2; + rw = tcp_compute_initwnd(tcp_maxseg(ccv->ccvc.tcp)); CCV(ccv, snd_cwnd) = min(rw, CCV(ccv, snd_cwnd)); } @@ -301,7 +297,12 @@ newreno_post_recovery(struct cc_var *ccv) pipe = CCV(ccv, snd_max) - ccv->curack; if (pipe < CCV(ccv, snd_ssthresh)) - CCV(ccv, snd_cwnd) = pipe + CCV(ccv, t_maxseg); + /* + * Ensure that cwnd does not collapse to 1 MSS under + * adverse conditons. Implements RFC6582 + */ + CCV(ccv, snd_cwnd) = max(pipe, CCV(ccv, t_maxseg)) + + CCV(ccv, t_maxseg); else CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh); } diff --git a/freebsd/sys/netinet/if_ether.c b/freebsd/sys/netinet/if_ether.c index e9efd89e..1206691f 100644 --- a/freebsd/sys/netinet/if_ether.c +++ b/freebsd/sys/netinet/if_ether.c @@ -45,6 +45,7 @@ __FBSDID("$FreeBSD$"); #include <rtems/bsd/local/opt_inet.h> #include <sys/param.h> +#include <sys/eventhandler.h> #include <sys/kernel.h> #include <sys/lock.h> #include <sys/queue.h> @@ -163,8 +164,15 @@ SYSCTL_PROC(_net_link_ether_inet, OID_AUTO, garp_rexmit_count, "Number of times to retransmit GARP packets;" " 0 to disable, maximum of 16"); +VNET_DEFINE_STATIC(int, arp_log_level) = LOG_INFO; /* Min. log(9) level. */ +#define V_arp_log_level VNET(arp_log_level) +SYSCTL_INT(_net_link_ether_arp, OID_AUTO, log_level, CTLFLAG_VNET | CTLFLAG_RW, + &VNET_NAME(arp_log_level), 0, + "Minimum log(9) level for recording rate limited arp log messages. " + "The higher will be log more (emerg=0, info=6 (default), debug=7)."); #define ARP_LOG(pri, ...) do { \ - if (ppsratecheck(&arp_lastlog, &arp_curpps, arp_maxpps)) \ + if ((pri) <= V_arp_log_level && \ + ppsratecheck(&arp_lastlog, &arp_curpps, arp_maxpps)) \ log((pri), "arp: " __VA_ARGS__); \ } while (0) @@ -343,8 +351,8 @@ arp_fillheader(struct ifnet *ifp, struct arphdr *ah, int bcast, u_char *buf, * - arp header target ip address * - arp header source ethernet address */ -void -arprequest(struct ifnet *ifp, const struct in_addr *sip, +static int +arprequest_internal(struct ifnet *ifp, const struct in_addr *sip, const struct in_addr *tip, u_char *enaddr) { struct mbuf *m; @@ -361,9 +369,10 @@ arprequest(struct ifnet *ifp, const struct in_addr *sip, * The caller did not supply a source address, try to find * a compatible one among those assigned to this interface. */ + struct epoch_tracker et; struct ifaddr *ifa; - IF_ADDR_RLOCK(ifp); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; @@ -381,17 +390,17 @@ arprequest(struct ifnet *ifp, const struct in_addr *sip, IA_MASKSIN(ifa)->sin_addr.s_addr)) break; /* found it. */ } - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); if (sip == NULL) { printf("%s: cannot find matching address\n", __func__); - return; + return (EADDRNOTAVAIL); } } if (enaddr == NULL) enaddr = carpaddr ? carpaddr : (u_char *)IF_LLADDR(ifp); if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) - return; + return (ENOMEM); m->m_len = sizeof(*ah) + 2 * sizeof(struct in_addr) + 2 * ifp->if_addrlen; m->m_pkthdr.len = m->m_len; @@ -418,7 +427,7 @@ arprequest(struct ifnet *ifp, const struct in_addr *sip, if (error != 0 && error != EAFNOSUPPORT) { ARP_LOG(LOG_ERR, "Failed to calculate ARP header on %s: %d\n", if_name(ifp), error); - return; + return (error); } ro.ro_prepend = linkhdr; @@ -427,10 +436,23 @@ arprequest(struct ifnet *ifp, const struct in_addr *sip, m->m_flags |= M_BCAST; m_clrprotoflags(m); /* Avoid confusing lower layers. */ - (*ifp->if_output)(ifp, m, &sa, &ro); + error = (*ifp->if_output)(ifp, m, &sa, &ro); ARPSTAT_INC(txrequests); + if (error) { + ARPSTAT_INC(txerrors); + ARP_LOG(LOG_DEBUG, "Failed to send ARP packet on %s: %d\n", + if_name(ifp), error); + } + return (error); } +void +arprequest(struct ifnet *ifp, const struct in_addr *sip, + const struct in_addr *tip, u_char *enaddr) +{ + + (void) arprequest_internal(ifp, sip, tip, enaddr); +} /* * Resolve an IP address into an ethernet address - heavy version. @@ -461,9 +483,11 @@ arpresolve_full(struct ifnet *ifp, int is_gw, int flags, struct mbuf *m, *plle = NULL; if ((flags & LLE_CREATE) == 0) { - IF_AFDATA_RLOCK(ifp); + struct epoch_tracker et; + + NET_EPOCH_ENTER(et); la = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst); - IF_AFDATA_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); } if (la == NULL && (ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) == 0) { la = lltable_alloc_entry(LLTABLE(ifp), 0, dst); @@ -556,7 +580,7 @@ arpresolve_full(struct ifnet *ifp, int is_gw, int flags, struct mbuf *m, error = is_gw != 0 ? EHOSTUNREACH : EHOSTDOWN; if (renew) { - int canceled; + int canceled, e; LLE_ADDREF(la); la->la_expire = time_uptime; @@ -566,7 +590,13 @@ arpresolve_full(struct ifnet *ifp, int is_gw, int flags, struct mbuf *m, LLE_REMREF(la); la->la_asked++; LLE_WUNLOCK(la); - arprequest(ifp, NULL, &SIN(dst)->sin_addr, NULL); + e = arprequest_internal(ifp, NULL, &SIN(dst)->sin_addr, NULL); + /* + * Only overwrite 'error' in case of error; in case of success + * the proper return value was already set above. + */ + if (e != 0) + return (e); return (error); } @@ -595,6 +625,7 @@ arpresolve(struct ifnet *ifp, int is_gw, struct mbuf *m, const struct sockaddr *dst, u_char *desten, uint32_t *pflags, struct llentry **plle) { + struct epoch_tracker et; struct llentry *la = NULL; if (pflags != NULL) @@ -616,7 +647,7 @@ arpresolve(struct ifnet *ifp, int is_gw, struct mbuf *m, } } - IF_AFDATA_RLOCK(ifp); + NET_EPOCH_ENTER(et); la = lla_lookup(LLTABLE(ifp), plle ? LLE_EXCLUSIVE : LLE_UNLOCKED, dst); if (la != NULL && (la->r_flags & RLLE_VALID) != 0) { /* Entry found, let's copy lle info */ @@ -630,12 +661,12 @@ arpresolve(struct ifnet *ifp, int is_gw, struct mbuf *m, *plle = la; LLE_WUNLOCK(la); } - IF_AFDATA_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); return (0); } if (plle && la) LLE_WUNLOCK(la); - IF_AFDATA_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); return (arpresolve_full(ifp, is_gw, la == NULL ? LLE_CREATE : 0, m, dst, desten, pflags, plle)); @@ -780,6 +811,7 @@ in_arpinput(struct mbuf *m) int lladdr_off; int error; char addrbuf[INET_ADDRSTRLEN]; + struct epoch_tracker et; sin.sin_len = sizeof(struct sockaddr_in); sin.sin_family = AF_INET; @@ -872,17 +904,17 @@ in_arpinput(struct mbuf *m) * No match, use the first inet address on the receive interface * as a dummy address for the rest of the function. */ - IF_ADDR_RLOCK(ifp); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_INET && (ifa->ifa_carp == NULL || (*carp_iamatch_p)(ifa, &enaddr))) { ia = ifatoia(ifa); ifa_ref(ifa); - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); goto match; } - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); /* * If bridging, fall back to using any inet address. @@ -939,9 +971,9 @@ match: sin.sin_family = AF_INET; sin.sin_addr = isaddr; dst = (struct sockaddr *)&sin; - IF_AFDATA_RLOCK(ifp); + NET_EPOCH_ENTER(et); la = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst); - IF_AFDATA_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); if (la != NULL) arp_check_update_lle(ah, isaddr, ifp, bridged, la); else if (itaddr.s_addr == myaddr.s_addr) { @@ -1019,9 +1051,9 @@ reply: struct llentry *lle = NULL; sin.sin_addr = itaddr; - IF_AFDATA_RLOCK(ifp); + NET_EPOCH_ENTER(et); lle = lla_lookup(LLTABLE(ifp), 0, (struct sockaddr *)&sin); - IF_AFDATA_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); if ((lle != NULL) && (lle->la_flags & LLE_PUB)) { (void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln); @@ -1332,6 +1364,8 @@ garp_rexmit(void *arg) return; } + CURVNET_SET(ia->ia_ifa.ifa_ifp->if_vnet); + /* * Drop lock while the ARP request is generated. */ @@ -1359,6 +1393,8 @@ garp_rexmit(void *arg) ifa_free(&ia->ia_ifa); } } + + CURVNET_RESTORE(); } /* diff --git a/freebsd/sys/netinet/igmp.c b/freebsd/sys/netinet/igmp.c index 970a01a0..7fac6a70 100644 --- a/freebsd/sys/netinet/igmp.c +++ b/freebsd/sys/netinet/igmp.c @@ -694,6 +694,7 @@ static int igmp_input_v1_query(struct ifnet *ifp, const struct ip *ip, const struct igmp *igmp) { + struct epoch_tracker et; struct ifmultiaddr *ifma; struct igmp_ifsoftc *igi; struct in_multi *inm; @@ -735,7 +736,7 @@ igmp_input_v1_query(struct ifnet *ifp, const struct ip *ip, * for the interface on which the query arrived, * except those which are already running. */ - IF_ADDR_RLOCK(ifp); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) @@ -763,7 +764,7 @@ igmp_input_v1_query(struct ifnet *ifp, const struct ip *ip, break; } } - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); out_locked: IGMP_UNLOCK(); @@ -779,6 +780,7 @@ static int igmp_input_v2_query(struct ifnet *ifp, const struct ip *ip, const struct igmp *igmp) { + struct epoch_tracker et; struct ifmultiaddr *ifma; struct igmp_ifsoftc *igi; struct in_multi *inm; @@ -836,7 +838,7 @@ igmp_input_v2_query(struct ifnet *ifp, const struct ip *ip, */ CTR2(KTR_IGMPV3, "process v2 general query on ifp %p(%s)", ifp, ifp->if_xname); - IF_ADDR_RLOCK(ifp); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) @@ -844,7 +846,7 @@ igmp_input_v2_query(struct ifnet *ifp, const struct ip *ip, inm = (struct in_multi *)ifma->ifma_protospec; igmp_v2_update_group(inm, timer); } - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); } else { /* * Group-specific IGMPv2 query, we need only @@ -1220,11 +1222,13 @@ igmp_input_v1_report(struct ifnet *ifp, /*const*/ struct ip *ip, * Replace 0.0.0.0 with the subnet address if told to do so. */ if (V_igmp_recvifkludge && in_nullhost(ip->ip_src)) { - NET_EPOCH_ENTER(); + struct epoch_tracker et; + + NET_EPOCH_ENTER(et); IFP_TO_IA(ifp, ia, &in_ifa_tracker); if (ia != NULL) ip->ip_src.s_addr = htonl(ia->ia_subnet); - NET_EPOCH_EXIT(); + NET_EPOCH_EXIT(et); } CTR3(KTR_IGMPV3, "process v1 report 0x%08x on ifp %p(%s)", @@ -1309,6 +1313,7 @@ igmp_input_v2_report(struct ifnet *ifp, /*const*/ struct ip *ip, /*const*/ struct igmp *igmp) { struct rm_priotracker in_ifa_tracker; + struct epoch_tracker et; struct in_ifaddr *ia; struct in_multi *inm; @@ -1317,23 +1322,23 @@ igmp_input_v2_report(struct ifnet *ifp, /*const*/ struct ip *ip, * leave requires knowing that we are the only member of a * group. */ - NET_EPOCH_ENTER(); + NET_EPOCH_ENTER(et); IFP_TO_IA(ifp, ia, &in_ifa_tracker); if (ia != NULL && in_hosteq(ip->ip_src, IA_SIN(ia)->sin_addr)) { - NET_EPOCH_EXIT(); + NET_EPOCH_EXIT(et); return (0); } IGMPSTAT_INC(igps_rcv_reports); if (ifp->if_flags & IFF_LOOPBACK) { - NET_EPOCH_EXIT(); + NET_EPOCH_EXIT(et); return (0); } if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr)) || !in_hosteq(igmp->igmp_group, ip->ip_dst)) { - NET_EPOCH_EXIT(); + NET_EPOCH_EXIT(et); IGMPSTAT_INC(igps_rcv_badreports); return (EINVAL); } @@ -1349,7 +1354,7 @@ igmp_input_v2_report(struct ifnet *ifp, /*const*/ struct ip *ip, if (ia != NULL) ip->ip_src.s_addr = htonl(ia->ia_subnet); } - NET_EPOCH_EXIT(); + NET_EPOCH_EXIT(et); CTR3(KTR_IGMPV3, "process v2 report 0x%08x on ifp %p(%s)", ntohl(igmp->igmp_group.s_addr), ifp, ifp->if_xname); @@ -1991,6 +1996,7 @@ igmp_v3_cancel_link_timers(struct igmp_ifsoftc *igi) struct ifnet *ifp; struct in_multi *inm; struct in_multi_head inm_free_tmp; + struct epoch_tracker et; CTR3(KTR_IGMPV3, "%s: cancel v3 timers on ifp %p(%s)", __func__, igi->igi_ifp, igi->igi_ifp->if_xname); @@ -2011,7 +2017,7 @@ igmp_v3_cancel_link_timers(struct igmp_ifsoftc *igi) * for all memberships scoped to this link. */ ifp = igi->igi_ifp; - IF_ADDR_RLOCK(ifp); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) @@ -2056,7 +2062,7 @@ igmp_v3_cancel_link_timers(struct igmp_ifsoftc *igi) inm->inm_timer = 0; mbufq_drain(&inm->inm_scq); } - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); inm_release_list_deferred(&inm_free_tmp); } @@ -3299,6 +3305,7 @@ igmp_v3_merge_state_changes(struct in_multi *inm, struct mbufq *scq) static void igmp_v3_dispatch_general_query(struct igmp_ifsoftc *igi) { + struct epoch_tracker et; struct ifmultiaddr *ifma; struct ifnet *ifp; struct in_multi *inm; @@ -3321,7 +3328,7 @@ igmp_v3_dispatch_general_query(struct igmp_ifsoftc *igi) ifp = igi->igi_ifp; - IF_ADDR_RLOCK(ifp); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) @@ -3352,7 +3359,7 @@ igmp_v3_dispatch_general_query(struct igmp_ifsoftc *igi) break; } } - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); send: loop = (igi->igi_flags & IGIF_LOOPBACK) ? 1 : 0; @@ -3524,13 +3531,14 @@ igmp_v3_encap_report(struct ifnet *ifp, struct mbuf *m) ip->ip_src.s_addr = INADDR_ANY; if (m->m_flags & M_IGMP_LOOP) { + struct epoch_tracker et; struct in_ifaddr *ia; - NET_EPOCH_ENTER(); + NET_EPOCH_ENTER(et); IFP_TO_IA(ifp, ia, &in_ifa_tracker); if (ia != NULL) ip->ip_src = ia->ia_addr.sin_addr; - NET_EPOCH_EXIT(); + NET_EPOCH_EXIT(et); } ip->ip_dst.s_addr = htonl(INADDR_ALLRPTS_GROUP); diff --git a/freebsd/sys/netinet/in.c b/freebsd/sys/netinet/in.c index db1ebda0..c3f172f3 100644 --- a/freebsd/sys/netinet/in.c +++ b/freebsd/sys/netinet/in.c @@ -141,20 +141,21 @@ in_localip(struct in_addr in) int in_ifhasaddr(struct ifnet *ifp, struct in_addr in) { + struct epoch_tracker et; struct ifaddr *ifa; struct in_ifaddr *ia; - IF_ADDR_RLOCK(ifp); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = (struct in_ifaddr *)ifa; if (ia->ia_addr.sin_addr.s_addr == in.s_addr) { - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); return (1); } } - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); return (0); } @@ -192,15 +193,10 @@ int in_canforward(struct in_addr in) { u_long i = ntohl(in.s_addr); - u_long net; - if (IN_EXPERIMENTAL(i) || IN_MULTICAST(i) || IN_LINKLOCAL(i)) + if (IN_EXPERIMENTAL(i) || IN_MULTICAST(i) || IN_LINKLOCAL(i) || + IN_ZERONET(i) || IN_LOOPBACK(i)) return (0); - if (IN_CLASSA(i)) { - net = i & IN_CLASSA_NET; - if (net == 0 || net == (IN_LOOPBACKNET << IN_CLASSA_NSHIFT)) - return (0); - } return (1); } @@ -230,6 +226,7 @@ in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, { struct ifreq *ifr = (struct ifreq *)data; struct sockaddr_in *addr = (struct sockaddr_in *)&ifr->ifr_addr; + struct epoch_tracker et; struct ifaddr *ifa; struct in_ifaddr *ia; int error; @@ -281,7 +278,7 @@ in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, * address was specified, find that one instead of the * first one on the interface, if possible. */ - IF_ADDR_RLOCK(ifp); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; @@ -299,7 +296,7 @@ in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, } if (ifa == NULL) { - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); return (EADDRNOTAVAIL); } @@ -330,7 +327,7 @@ in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, break; } - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); return (error); } @@ -344,6 +341,7 @@ in_aifaddr_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) const struct sockaddr_in *mask = &ifra->ifra_mask; const struct sockaddr_in *dstaddr = &ifra->ifra_dstaddr; const int vhid = (cmd == SIOCAIFADDR) ? ifra->ifra_vhid : 0; + struct epoch_tracker et; struct ifaddr *ifa; struct in_ifaddr *ia; bool iaIsFirst; @@ -380,7 +378,7 @@ in_aifaddr_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) */ iaIsFirst = true; ia = NULL; - IF_ADDR_RLOCK(ifp); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct in_ifaddr *it; @@ -393,7 +391,7 @@ in_aifaddr_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) prison_check_ip4(td->td_ucred, &addr->sin_addr) == 0) ia = it; } - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); if (ia != NULL) (void )in_difaddr_ioctl(cmd, data, ifp, td); @@ -923,7 +921,7 @@ in_ifscrub_all(void) IFNET_RLOCK(); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { /* Cannot lock here - lock recursion. */ - /* IF_ADDR_RLOCK(ifp); */ + /* NET_EPOCH_ENTER(et); */ CK_STAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, nifa) { if (ifa->ifa_addr->sa_family != AF_INET) continue; @@ -939,7 +937,7 @@ in_ifscrub_all(void) (void)in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr, ifp, NULL); } - /* IF_ADDR_RUNLOCK(ifp); */ + /* NET_EPOCH_EXIT(et); */ in_purgemaddrs(ifp); igmp_domifdetach(ifp); } @@ -971,6 +969,7 @@ in_ifaddr_broadcast(struct in_addr in, struct in_ifaddr *ia) int in_broadcast(struct in_addr in, struct ifnet *ifp) { + struct epoch_tracker et; struct ifaddr *ifa; int found; @@ -984,14 +983,14 @@ in_broadcast(struct in_addr in, struct ifnet *ifp) * Look through the list of addresses for a match * with a broadcast address. */ - IF_ADDR_RLOCK(ifp); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_INET && in_ifaddr_broadcast(in, (struct in_ifaddr *)ifa)) { found = 1; break; } - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); return (found); } @@ -1382,15 +1381,13 @@ in_lltable_lookup(struct lltable *llt, u_int flags, const struct sockaddr *l3add IF_AFDATA_LOCK_ASSERT(llt->llt_ifp); KASSERT(l3addr->sa_family == AF_INET, ("sin_family %d", l3addr->sa_family)); - lle = in_lltable_find_dst(llt, sin->sin_addr); + KASSERT((flags & (LLE_UNLOCKED | LLE_EXCLUSIVE)) != + (LLE_UNLOCKED | LLE_EXCLUSIVE), + ("wrong lle request flags: %#x", flags)); + lle = in_lltable_find_dst(llt, sin->sin_addr); if (lle == NULL) return (NULL); - - KASSERT((flags & (LLE_UNLOCKED|LLE_EXCLUSIVE)) != - (LLE_UNLOCKED|LLE_EXCLUSIVE),("wrong lle request flags: 0x%X", - flags)); - if (flags & LLE_UNLOCKED) return (lle); @@ -1399,6 +1396,17 @@ in_lltable_lookup(struct lltable *llt, u_int flags, const struct sockaddr *l3add else LLE_RLOCK(lle); + /* + * If the afdata lock is not held, the LLE may have been unlinked while + * we were blocked on the LLE lock. Check for this case. + */ + if (__predict_false((lle->la_flags & LLE_LINKED) == 0)) { + if (flags & LLE_EXCLUSIVE) + LLE_WUNLOCK(lle); + else + LLE_RUNLOCK(lle); + return (NULL); + } return (lle); } diff --git a/freebsd/sys/netinet/in_fib.c b/freebsd/sys/netinet/in_fib.c index f61909ea..63141326 100644 --- a/freebsd/sys/netinet/in_fib.c +++ b/freebsd/sys/netinet/in_fib.c @@ -98,7 +98,6 @@ fib4_rte_to_nh_extended(struct rtentry *rte, struct in_addr dst, uint32_t flags, struct nhop4_extended *pnh4) { struct sockaddr_in *gw; - struct in_ifaddr *ia; if ((flags & NHR_IFAIF) != 0) pnh4->nh_ifp = rte->rt_ifa->ifa_ifp; @@ -115,10 +114,8 @@ fib4_rte_to_nh_extended(struct rtentry *rte, struct in_addr dst, gw = (struct sockaddr_in *)rt_key(rte); if (gw->sin_addr.s_addr == 0) pnh4->nh_flags |= NHF_DEFAULT; - /* XXX: Set RTF_BROADCAST if GW address is broadcast */ - - ia = ifatoia(rte->rt_ifa); - pnh4->nh_src = IA_SIN(ia)->sin_addr; + pnh4->nh_ia = ifatoia(rte->rt_ifa); + pnh4->nh_src = IA_SIN(pnh4->nh_ia)->sin_addr; } /* diff --git a/freebsd/sys/netinet/in_fib.h b/freebsd/sys/netinet/in_fib.h index fa72fd76..f0b4d159 100644 --- a/freebsd/sys/netinet/in_fib.h +++ b/freebsd/sys/netinet/in_fib.h @@ -43,12 +43,13 @@ struct nhop4_basic { /* Extended nexthop info used for control protocols */ struct nhop4_extended { struct ifnet *nh_ifp; /* Logical egress interface */ + struct in_ifaddr *nh_ia; /* Associated address */ uint16_t nh_mtu; /* nexthop mtu */ uint16_t nh_flags; /* nhop flags */ uint8_t spare[4]; struct in_addr nh_addr; /* GW/DST IPv4 address */ struct in_addr nh_src; /* default source IPv4 address */ - uint64_t spare2[2]; + uint64_t spare2; }; int fib4_lookup_nh_basic(uint32_t fibnum, struct in_addr dst, uint32_t flags, diff --git a/freebsd/sys/netinet/in_mcast.c b/freebsd/sys/netinet/in_mcast.c index 3b1d57f8..3fc4aa01 100644 --- a/freebsd/sys/netinet/in_mcast.c +++ b/freebsd/sys/netinet/in_mcast.c @@ -96,7 +96,9 @@ static MALLOC_DEFINE(M_IPMSOURCE, "ip_msource", /* * Locking: - * - Lock order is: Giant, INP_WLOCK, IN_MULTI_LIST_LOCK, IGMP_LOCK, IF_ADDR_LOCK. + * + * - Lock order is: Giant, IN_MULTI_LOCK, INP_WLOCK, + * IN_MULTI_LIST_LOCK, IGMP_LOCK, IF_ADDR_LOCK. * - The IF_ADDR_LOCK is implicitly taken by inm_lookup() earlier, however * it can be taken by code in net/if.c also. * - ip_moptions and in_mfilter are covered by the INP_WLOCK. @@ -146,12 +148,11 @@ static int imf_prune(struct in_mfilter *, const struct sockaddr_in *); static void imf_purge(struct in_mfilter *); static void imf_rollback(struct in_mfilter *); static void imf_reap(struct in_mfilter *); -static int imo_grow(struct ip_moptions *); -static size_t imo_match_group(const struct ip_moptions *, +static struct in_mfilter * + imo_match_group(const struct ip_moptions *, const struct ifnet *, const struct sockaddr *); static struct in_msource * - imo_match_source(const struct ip_moptions *, const size_t, - const struct sockaddr *); + imo_match_source(struct in_mfilter *, const struct sockaddr *); static void ims_merge(struct ip_msource *ims, const struct in_msource *lims, const int rollback); static int in_getmulti(struct ifnet *, const struct in_addr *, @@ -335,6 +336,26 @@ imf_init(struct in_mfilter *imf, const int st0, const int st1) imf->imf_st[1] = st1; } +struct in_mfilter * +ip_mfilter_alloc(const int mflags, const int st0, const int st1) +{ + struct in_mfilter *imf; + + imf = malloc(sizeof(*imf), M_INMFILTER, mflags); + if (imf != NULL) + imf_init(imf, st0, st1); + + return (imf); +} + +void +ip_mfilter_free(struct in_mfilter *imf) +{ + + imf_purge(imf); + free(imf, M_INMFILTER); +} + /* * Function for looking up an in_multi record for an IPv4 multicast address * on a given interface. ifp must be valid. If no record found, return NULL. @@ -369,100 +390,42 @@ inm_lookup_locked(struct ifnet *ifp, const struct in_addr ina) struct in_multi * inm_lookup(struct ifnet *ifp, const struct in_addr ina) { + struct epoch_tracker et; struct in_multi *inm; IN_MULTI_LIST_LOCK_ASSERT(); - IF_ADDR_RLOCK(ifp); + NET_EPOCH_ENTER(et); inm = inm_lookup_locked(ifp, ina); - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); return (inm); } /* - * Resize the ip_moptions vector to the next power-of-two minus 1. - * May be called with locks held; do not sleep. - */ -static int -imo_grow(struct ip_moptions *imo) -{ - struct in_multi **nmships; - struct in_multi **omships; - struct in_mfilter *nmfilters; - struct in_mfilter *omfilters; - size_t idx; - size_t newmax; - size_t oldmax; - - nmships = NULL; - nmfilters = NULL; - omships = imo->imo_membership; - omfilters = imo->imo_mfilters; - oldmax = imo->imo_max_memberships; - newmax = ((oldmax + 1) * 2) - 1; - - if (newmax <= IP_MAX_MEMBERSHIPS) { - nmships = (struct in_multi **)realloc(omships, - sizeof(struct in_multi *) * newmax, M_IPMOPTS, M_NOWAIT); - nmfilters = (struct in_mfilter *)realloc(omfilters, - sizeof(struct in_mfilter) * newmax, M_INMFILTER, M_NOWAIT); - if (nmships != NULL && nmfilters != NULL) { - /* Initialize newly allocated source filter heads. */ - for (idx = oldmax; idx < newmax; idx++) { - imf_init(&nmfilters[idx], MCAST_UNDEFINED, - MCAST_EXCLUDE); - } - imo->imo_max_memberships = newmax; - imo->imo_membership = nmships; - imo->imo_mfilters = nmfilters; - } - } - - if (nmships == NULL || nmfilters == NULL) { - if (nmships != NULL) - free(nmships, M_IPMOPTS); - if (nmfilters != NULL) - free(nmfilters, M_INMFILTER); - return (ETOOMANYREFS); - } - - return (0); -} - -/* * Find an IPv4 multicast group entry for this ip_moptions instance * which matches the specified group, and optionally an interface. * Return its index into the array, or -1 if not found. */ -static size_t +static struct in_mfilter * imo_match_group(const struct ip_moptions *imo, const struct ifnet *ifp, const struct sockaddr *group) { const struct sockaddr_in *gsin; - struct in_multi **pinm; - int idx; - int nmships; + struct in_mfilter *imf; + struct in_multi *inm; gsin = (const struct sockaddr_in *)group; - /* The imo_membership array may be lazy allocated. */ - if (imo->imo_membership == NULL || imo->imo_num_memberships == 0) - return (-1); - - nmships = imo->imo_num_memberships; - pinm = &imo->imo_membership[0]; - for (idx = 0; idx < nmships; idx++, pinm++) { - if (*pinm == NULL) + IP_MFILTER_FOREACH(imf, &imo->imo_head) { + inm = imf->imf_inm; + if (inm == NULL) continue; - if ((ifp == NULL || ((*pinm)->inm_ifp == ifp)) && - in_hosteq((*pinm)->inm_addr, gsin->sin_addr)) { + if ((ifp == NULL || (inm->inm_ifp == ifp)) && + in_hosteq(inm->inm_addr, gsin->sin_addr)) { break; } } - if (idx >= nmships) - idx = -1; - - return (idx); + return (imf); } /* @@ -473,22 +436,13 @@ imo_match_group(const struct ip_moptions *imo, const struct ifnet *ifp, * it exists, which may not be the desired behaviour. */ static struct in_msource * -imo_match_source(const struct ip_moptions *imo, const size_t gidx, - const struct sockaddr *src) +imo_match_source(struct in_mfilter *imf, const struct sockaddr *src) { struct ip_msource find; - struct in_mfilter *imf; struct ip_msource *ims; const sockunion_t *psa; KASSERT(src->sa_family == AF_INET, ("%s: !AF_INET", __func__)); - KASSERT(gidx != -1 && gidx < imo->imo_num_memberships, - ("%s: invalid index %d\n", __func__, (int)gidx)); - - /* The imo_mfilters array may be lazy allocated. */ - if (imo->imo_mfilters == NULL) - return (NULL); - imf = &imo->imo_mfilters[gidx]; /* Source trees are keyed in host byte order. */ psa = (const sockunion_t *)src; @@ -508,14 +462,14 @@ int imo_multi_filter(const struct ip_moptions *imo, const struct ifnet *ifp, const struct sockaddr *group, const struct sockaddr *src) { - size_t gidx; + struct in_mfilter *imf; struct in_msource *ims; int mode; KASSERT(ifp != NULL, ("%s: null ifp", __func__)); - gidx = imo_match_group(imo, ifp, group); - if (gidx == -1) + imf = imo_match_group(imo, ifp, group); + if (imf == NULL) return (MCAST_NOTGMEMBER); /* @@ -527,8 +481,8 @@ imo_multi_filter(const struct ip_moptions *imo, const struct ifnet *ifp, * NOTE: We are comparing group state here at IGMP t1 (now) * with socket-layer t0 (since last downcall). */ - mode = imo->imo_mfilters[gidx].imf_st[1]; - ims = imo_match_source(imo, gidx, src); + mode = imf->imf_st[1]; + ims = imo_match_source(imf, src); if ((ims == NULL && mode == MCAST_INCLUDE) || (ims != NULL && ims->imsl_st[0] != mode)) @@ -1453,7 +1407,6 @@ inp_block_unblock_source(struct inpcb *inp, struct sockopt *sopt) struct ip_moptions *imo; struct in_msource *ims; struct in_multi *inm; - size_t idx; uint16_t fmode; int error, doblock; @@ -1532,20 +1485,18 @@ inp_block_unblock_source(struct inpcb *inp, struct sockopt *sopt) if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); + IN_MULTI_LOCK(); + /* * Check if we are actually a member of this group. */ imo = inp_findmoptions(inp); - idx = imo_match_group(imo, ifp, &gsa->sa); - if (idx == -1 || imo->imo_mfilters == NULL) { + imf = imo_match_group(imo, ifp, &gsa->sa); + if (imf == NULL) { error = EADDRNOTAVAIL; goto out_inp_locked; } - - KASSERT(imo->imo_mfilters != NULL, - ("%s: imo_mfilters not allocated", __func__)); - imf = &imo->imo_mfilters[idx]; - inm = imo->imo_membership[idx]; + inm = imf->imf_inm; /* * Attempting to use the delta-based API on an @@ -1563,7 +1514,7 @@ inp_block_unblock_source(struct inpcb *inp, struct sockopt *sopt) * Asked to unblock, but nothing to unblock. * If adding a new block entry, allocate it. */ - ims = imo_match_source(imo, idx, &ssa->sa); + ims = imo_match_source(imf, &ssa->sa); if ((ims != NULL && doblock) || (ims == NULL && !doblock)) { CTR3(KTR_IGMPV3, "%s: source 0x%08x %spresent", __func__, ntohl(ssa->sin.sin_addr.s_addr), doblock ? "" : "not "); @@ -1594,14 +1545,13 @@ inp_block_unblock_source(struct inpcb *inp, struct sockopt *sopt) /* * Begin state merge transaction at IGMP layer. */ - IN_MULTI_LOCK(); CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); IN_MULTI_LIST_LOCK(); error = inm_merge(inm, imf); if (error) { CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); IN_MULTI_LIST_UNLOCK(); - goto out_in_multi_locked; + goto out_imf_rollback; } CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); @@ -1610,9 +1560,6 @@ inp_block_unblock_source(struct inpcb *inp, struct sockopt *sopt) if (error) CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); -out_in_multi_locked: - - IN_MULTI_UNLOCK(); out_imf_rollback: if (error) imf_rollback(imf); @@ -1623,6 +1570,7 @@ out_imf_rollback: out_inp_locked: INP_WUNLOCK(inp); + IN_MULTI_UNLOCK(); return (error); } @@ -1637,9 +1585,6 @@ static struct ip_moptions * inp_findmoptions(struct inpcb *inp) { struct ip_moptions *imo; - struct in_multi **immp; - struct in_mfilter *imfp; - size_t idx; INP_WLOCK(inp); if (inp->inp_moptions != NULL) @@ -1648,29 +1593,16 @@ inp_findmoptions(struct inpcb *inp) INP_WUNLOCK(inp); imo = malloc(sizeof(*imo), M_IPMOPTS, M_WAITOK); - immp = malloc(sizeof(*immp) * IP_MIN_MEMBERSHIPS, M_IPMOPTS, - M_WAITOK | M_ZERO); - imfp = malloc(sizeof(struct in_mfilter) * IP_MIN_MEMBERSHIPS, - M_INMFILTER, M_WAITOK); imo->imo_multicast_ifp = NULL; imo->imo_multicast_addr.s_addr = INADDR_ANY; imo->imo_multicast_vif = -1; imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; imo->imo_multicast_loop = in_mcast_loop; - imo->imo_num_memberships = 0; - imo->imo_max_memberships = IP_MIN_MEMBERSHIPS; - imo->imo_membership = immp; - - /* Initialize per-group source filters. */ - for (idx = 0; idx < IP_MIN_MEMBERSHIPS; idx++) - imf_init(&imfp[idx], MCAST_UNDEFINED, MCAST_EXCLUDE); - imo->imo_mfilters = imfp; + STAILQ_INIT(&imo->imo_head); INP_WLOCK(inp); if (inp->inp_moptions != NULL) { - free(imfp, M_INMFILTER); - free(immp, M_IPMOPTS); free(imo, M_IPMOPTS); return (inp->inp_moptions); } @@ -1681,32 +1613,25 @@ inp_findmoptions(struct inpcb *inp) static void inp_gcmoptions(struct ip_moptions *imo) { - struct in_mfilter *imf; + struct in_mfilter *imf; struct in_multi *inm; struct ifnet *ifp; - size_t idx, nmships; - - nmships = imo->imo_num_memberships; - for (idx = 0; idx < nmships; ++idx) { - imf = imo->imo_mfilters ? &imo->imo_mfilters[idx] : NULL; - if (imf) - imf_leave(imf); - inm = imo->imo_membership[idx]; - ifp = inm->inm_ifp; - if (ifp != NULL) { - CURVNET_SET(ifp->if_vnet); - (void)in_leavegroup(inm, imf); - CURVNET_RESTORE(); - } else { - (void)in_leavegroup(inm, imf); + + while ((imf = ip_mfilter_first(&imo->imo_head)) != NULL) { + ip_mfilter_remove(&imo->imo_head, imf); + + imf_leave(imf); + if ((inm = imf->imf_inm) != NULL) { + if ((ifp = inm->inm_ifp) != NULL) { + CURVNET_SET(ifp->if_vnet); + (void)in_leavegroup(inm, imf); + CURVNET_RESTORE(); + } else { + (void)in_leavegroup(inm, imf); + } } - if (imf) - imf_purge(imf); + ip_mfilter_free(imf); } - - if (imo->imo_mfilters) - free(imo->imo_mfilters, M_INMFILTER); - free(imo->imo_membership, M_IPMOPTS); free(imo, M_IPMOPTS); } @@ -1742,7 +1667,7 @@ inp_get_source_filters(struct inpcb *inp, struct sockopt *sopt) struct sockaddr_storage *ptss; struct sockaddr_storage *tss; int error; - size_t idx, nsrcs, ncsrcs; + size_t nsrcs, ncsrcs; INP_WLOCK_ASSERT(inp); @@ -1769,12 +1694,11 @@ inp_get_source_filters(struct inpcb *inp, struct sockopt *sopt) * Lookup group on the socket. */ gsa = (sockunion_t *)&msfr.msfr_group; - idx = imo_match_group(imo, ifp, &gsa->sa); - if (idx == -1 || imo->imo_mfilters == NULL) { + imf = imo_match_group(imo, ifp, &gsa->sa); + if (imf == NULL) { INP_WUNLOCK(inp); return (EADDRNOTAVAIL); } - imf = &imo->imo_mfilters[idx]; /* * Ignore memberships which are in limbo. @@ -1889,13 +1813,15 @@ inp_getmoptions(struct inpcb *inp, struct sockopt *sopt) if (!in_nullhost(imo->imo_multicast_addr)) { mreqn.imr_address = imo->imo_multicast_addr; } else if (ifp != NULL) { + struct epoch_tracker et; + mreqn.imr_ifindex = ifp->if_index; - NET_EPOCH_ENTER(); + NET_EPOCH_ENTER(et); IFP_TO_IA(ifp, ia, &in_ifa_tracker); if (ia != NULL) mreqn.imr_address = IA_SIN(ia)->sin_addr; - NET_EPOCH_EXIT(); + NET_EPOCH_EXIT(et); } } INP_WUNLOCK(inp); @@ -2032,14 +1958,11 @@ inp_join_group(struct inpcb *inp, struct sockopt *sopt) struct ip_moptions *imo; struct in_multi *inm; struct in_msource *lims; - size_t idx; int error, is_new; ifp = NULL; - imf = NULL; lims = NULL; error = 0; - is_new = 0; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; @@ -2048,41 +1971,50 @@ inp_join_group(struct inpcb *inp, struct sockopt *sopt) ssa->ss.ss_family = AF_UNSPEC; switch (sopt->sopt_name) { - case IP_ADD_MEMBERSHIP: - case IP_ADD_SOURCE_MEMBERSHIP: { - struct ip_mreq_source mreqs; + case IP_ADD_MEMBERSHIP: { + struct ip_mreqn mreqn; - if (sopt->sopt_name == IP_ADD_MEMBERSHIP) { - error = sooptcopyin(sopt, &mreqs, - sizeof(struct ip_mreq), - sizeof(struct ip_mreq)); - /* - * Do argument switcharoo from ip_mreq into - * ip_mreq_source to avoid using two instances. - */ - mreqs.imr_interface = mreqs.imr_sourceaddr; - mreqs.imr_sourceaddr.s_addr = INADDR_ANY; - } else if (sopt->sopt_name == IP_ADD_SOURCE_MEMBERSHIP) { - error = sooptcopyin(sopt, &mreqs, - sizeof(struct ip_mreq_source), - sizeof(struct ip_mreq_source)); - } + if (sopt->sopt_valsize == sizeof(struct ip_mreqn)) + error = sooptcopyin(sopt, &mreqn, + sizeof(struct ip_mreqn), sizeof(struct ip_mreqn)); + else + error = sooptcopyin(sopt, &mreqn, + sizeof(struct ip_mreq), sizeof(struct ip_mreq)); if (error) return (error); gsa->sin.sin_family = AF_INET; gsa->sin.sin_len = sizeof(struct sockaddr_in); - gsa->sin.sin_addr = mreqs.imr_multiaddr; + gsa->sin.sin_addr = mreqn.imr_multiaddr; + if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) + return (EINVAL); - if (sopt->sopt_name == IP_ADD_SOURCE_MEMBERSHIP) { - ssa->sin.sin_family = AF_INET; - ssa->sin.sin_len = sizeof(struct sockaddr_in); - ssa->sin.sin_addr = mreqs.imr_sourceaddr; - } + if (sopt->sopt_valsize == sizeof(struct ip_mreqn) && + mreqn.imr_ifindex != 0) + ifp = ifnet_byindex(mreqn.imr_ifindex); + else + ifp = inp_lookup_mcast_ifp(inp, &gsa->sin, + mreqn.imr_address); + break; + } + case IP_ADD_SOURCE_MEMBERSHIP: { + struct ip_mreq_source mreqs; + + error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq_source), + sizeof(struct ip_mreq_source)); + if (error) + return (error); + gsa->sin.sin_family = ssa->sin.sin_family = AF_INET; + gsa->sin.sin_len = ssa->sin.sin_len = + sizeof(struct sockaddr_in); + + gsa->sin.sin_addr = mreqs.imr_multiaddr; if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); + ssa->sin.sin_addr = mreqs.imr_sourceaddr; + ifp = inp_lookup_mcast_ifp(inp, &gsa->sin, mreqs.imr_interface); CTR3(KTR_IGMPV3, "%s: imr_interface = 0x%08x, ifp = %p", @@ -2138,13 +2070,25 @@ inp_join_group(struct inpcb *inp, struct sockopt *sopt) if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) return (EADDRNOTAVAIL); + IN_MULTI_LOCK(); + + /* + * Find the membership in the membership list. + */ imo = inp_findmoptions(inp); - idx = imo_match_group(imo, ifp, &gsa->sa); - if (idx == -1) { + imf = imo_match_group(imo, ifp, &gsa->sa); + if (imf == NULL) { is_new = 1; + inm = NULL; + + if (ip_mfilter_count(&imo->imo_head) >= IP_MAX_MEMBERSHIPS) { + error = ENOMEM; + goto out_inp_locked; + } } else { - inm = imo->imo_membership[idx]; - imf = &imo->imo_mfilters[idx]; + is_new = 0; + inm = imf->imf_inm; + if (ssa->ss.ss_family != AF_UNSPEC) { /* * MCAST_JOIN_SOURCE_GROUP on an exclusive membership @@ -2171,7 +2115,7 @@ inp_join_group(struct inpcb *inp, struct sockopt *sopt) * full-state SSM API with the delta-based API, * which is discouraged in the relevant RFCs. */ - lims = imo_match_source(imo, idx, &ssa->sa); + lims = imo_match_source(imf, &ssa->sa); if (lims != NULL /*&& lims->imsl_st[1] == MCAST_INCLUDE*/) { error = EADDRNOTAVAIL; @@ -2204,27 +2148,6 @@ inp_join_group(struct inpcb *inp, struct sockopt *sopt) */ INP_WLOCK_ASSERT(inp); - if (is_new) { - if (imo->imo_num_memberships == imo->imo_max_memberships) { - error = imo_grow(imo); - if (error) - goto out_inp_locked; - } - /* - * Allocate the new slot upfront so we can deal with - * grafting the new source filter in same code path - * as for join-source on existing membership. - */ - idx = imo->imo_num_memberships; - imo->imo_membership[idx] = NULL; - imo->imo_num_memberships++; - KASSERT(imo->imo_mfilters != NULL, - ("%s: imf_mfilters vector was not allocated", __func__)); - imf = &imo->imo_mfilters[idx]; - KASSERT(RB_EMPTY(&imf->imf_sources), - ("%s: imf_sources not empty", __func__)); - } - /* * Graft new source into filter list for this inpcb's * membership of the group. The in_multi may not have @@ -2240,7 +2163,11 @@ inp_join_group(struct inpcb *inp, struct sockopt *sopt) /* Membership starts in IN mode */ if (is_new) { CTR1(KTR_IGMPV3, "%s: new join w/source", __func__); - imf_init(imf, MCAST_UNDEFINED, MCAST_INCLUDE); + imf = ip_mfilter_alloc(M_NOWAIT, MCAST_UNDEFINED, MCAST_INCLUDE); + if (imf == NULL) { + error = ENOMEM; + goto out_inp_locked; + } } else { CTR2(KTR_IGMPV3, "%s: %s source", __func__, "allow"); } @@ -2249,34 +2176,41 @@ inp_join_group(struct inpcb *inp, struct sockopt *sopt) CTR1(KTR_IGMPV3, "%s: merge imf state failed", __func__); error = ENOMEM; - goto out_imo_free; + goto out_inp_locked; } } else { /* No address specified; Membership starts in EX mode */ if (is_new) { CTR1(KTR_IGMPV3, "%s: new join w/o source", __func__); - imf_init(imf, MCAST_UNDEFINED, MCAST_EXCLUDE); + imf = ip_mfilter_alloc(M_NOWAIT, MCAST_UNDEFINED, MCAST_EXCLUDE); + if (imf == NULL) { + error = ENOMEM; + goto out_inp_locked; + } } } /* * Begin state merge transaction at IGMP layer. */ - in_pcbref(inp); - INP_WUNLOCK(inp); - IN_MULTI_LOCK(); - if (is_new) { + in_pcbref(inp); + INP_WUNLOCK(inp); + error = in_joingroup_locked(ifp, &gsa->sin.sin_addr, imf, - &inm); + &imf->imf_inm); + + INP_WLOCK(inp); + if (in_pcbrele_wlocked(inp)) { + error = ENXIO; + goto out_inp_unlocked; + } if (error) { CTR1(KTR_IGMPV3, "%s: in_joingroup_locked failed", __func__); - IN_MULTI_LIST_UNLOCK(); - goto out_imo_free; + goto out_inp_locked; } - inm_acquire(inm); - imo->imo_membership[idx] = inm; + inm_acquire(imf->imf_inm); } else { CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); IN_MULTI_LIST_LOCK(); @@ -2285,7 +2219,9 @@ inp_join_group(struct inpcb *inp, struct sockopt *sopt) CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); IN_MULTI_LIST_UNLOCK(); - goto out_in_multi_locked; + imf_rollback(imf); + imf_reap(imf); + goto out_inp_locked; } CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); error = igmp_change_state(inm); @@ -2293,40 +2229,30 @@ inp_join_group(struct inpcb *inp, struct sockopt *sopt) if (error) { CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); - goto out_in_multi_locked; + imf_rollback(imf); + imf_reap(imf); + goto out_inp_locked; } } + if (is_new) + ip_mfilter_insert(&imo->imo_head, imf); -out_in_multi_locked: + imf_commit(imf); + imf = NULL; +out_inp_locked: + INP_WUNLOCK(inp); +out_inp_unlocked: IN_MULTI_UNLOCK(); - INP_WLOCK(inp); - if (in_pcbrele_wlocked(inp)) - return (ENXIO); - if (error) { - imf_rollback(imf); - if (is_new) - imf_purge(imf); - else - imf_reap(imf); - } else { - imf_commit(imf); - } -out_imo_free: - if (error && is_new) { - inm = imo->imo_membership[idx]; - if (inm != NULL) { + if (is_new && imf) { + if (imf->imf_inm != NULL) { IN_MULTI_LIST_LOCK(); - inm_release_deferred(inm); + inm_release_deferred(imf->imf_inm); IN_MULTI_LIST_UNLOCK(); } - imo->imo_membership[idx] = NULL; - --imo->imo_num_memberships; + ip_mfilter_free(imf); } - -out_inp_locked: - INP_WUNLOCK(inp); return (error); } @@ -2345,12 +2271,12 @@ inp_leave_group(struct inpcb *inp, struct sockopt *sopt) struct ip_moptions *imo; struct in_msource *ims; struct in_multi *inm; - size_t idx; - int error, is_final; + int error; + bool is_final; ifp = NULL; error = 0; - is_final = 1; + is_final = true; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; @@ -2450,20 +2376,21 @@ inp_leave_group(struct inpcb *inp, struct sockopt *sopt) if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); + IN_MULTI_LOCK(); + /* - * Find the membership in the membership array. + * Find the membership in the membership list. */ imo = inp_findmoptions(inp); - idx = imo_match_group(imo, ifp, &gsa->sa); - if (idx == -1) { + imf = imo_match_group(imo, ifp, &gsa->sa); + if (imf == NULL) { error = EADDRNOTAVAIL; goto out_inp_locked; } - inm = imo->imo_membership[idx]; - imf = &imo->imo_mfilters[idx]; + inm = imf->imf_inm; if (ssa->ss.ss_family != AF_UNSPEC) - is_final = 0; + is_final = false; /* * Begin state merge transaction at socket layer. @@ -2475,13 +2402,14 @@ inp_leave_group(struct inpcb *inp, struct sockopt *sopt) * MCAST_LEAVE_SOURCE_GROUP is only valid for inclusive memberships. */ if (is_final) { + ip_mfilter_remove(&imo->imo_head, imf); imf_leave(imf); } else { if (imf->imf_st[0] == MCAST_EXCLUDE) { error = EADDRNOTAVAIL; goto out_inp_locked; } - ims = imo_match_source(imo, idx, &ssa->sa); + ims = imo_match_source(imf, &ssa->sa); if (ims == NULL) { CTR3(KTR_IGMPV3, "%s: source 0x%08x %spresent", __func__, ntohl(ssa->sin.sin_addr.s_addr), "not "); @@ -2500,17 +2428,7 @@ inp_leave_group(struct inpcb *inp, struct sockopt *sopt) /* * Begin state merge transaction at IGMP layer. */ - in_pcbref(inp); - INP_WUNLOCK(inp); - IN_MULTI_LOCK(); - - if (is_final) { - /* - * Give up the multicast address record to which - * the membership points. - */ - (void)in_leavegroup_locked(inm, imf); - } else { + if (!is_final) { CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); IN_MULTI_LIST_LOCK(); error = inm_merge(inm, imf); @@ -2518,7 +2436,9 @@ inp_leave_group(struct inpcb *inp, struct sockopt *sopt) CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); IN_MULTI_LIST_UNLOCK(); - goto out_in_multi_locked; + imf_rollback(imf); + imf_reap(imf); + goto out_inp_locked; } CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); @@ -2527,34 +2447,27 @@ inp_leave_group(struct inpcb *inp, struct sockopt *sopt) if (error) { CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); + imf_rollback(imf); + imf_reap(imf); + goto out_inp_locked; } } - -out_in_multi_locked: - - IN_MULTI_UNLOCK(); - INP_WLOCK(inp); - if (in_pcbrele_wlocked(inp)) - return (ENXIO); - - if (error) - imf_rollback(imf); - else - imf_commit(imf); - + imf_commit(imf); imf_reap(imf); - if (is_final) { - /* Remove the gap in the membership and filter array. */ - for (++idx; idx < imo->imo_num_memberships; ++idx) { - imo->imo_membership[idx-1] = imo->imo_membership[idx]; - imo->imo_mfilters[idx-1] = imo->imo_mfilters[idx]; - } - imo->imo_num_memberships--; - } - out_inp_locked: INP_WUNLOCK(inp); + + if (is_final && imf) { + /* + * Give up the multicast address record to which + * the membership points. + */ + (void) in_leavegroup_locked(imf->imf_inm, imf); + ip_mfilter_free(imf); + } + + IN_MULTI_UNLOCK(); return (error); } @@ -2644,7 +2557,6 @@ inp_set_source_filters(struct inpcb *inp, struct sockopt *sopt) struct in_mfilter *imf; struct ip_moptions *imo; struct in_multi *inm; - size_t idx; int error; error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq), @@ -2676,18 +2588,19 @@ inp_set_source_filters(struct inpcb *inp, struct sockopt *sopt) if (ifp == NULL) return (EADDRNOTAVAIL); + IN_MULTI_LOCK(); + /* * Take the INP write lock. * Check if this socket is a member of this group. */ imo = inp_findmoptions(inp); - idx = imo_match_group(imo, ifp, &gsa->sa); - if (idx == -1 || imo->imo_mfilters == NULL) { + imf = imo_match_group(imo, ifp, &gsa->sa); + if (imf == NULL) { error = EADDRNOTAVAIL; goto out_inp_locked; } - inm = imo->imo_membership[idx]; - imf = &imo->imo_mfilters[idx]; + inm = imf->imf_inm; /* * Begin state merge transaction at socket layer. @@ -2764,7 +2677,6 @@ inp_set_source_filters(struct inpcb *inp, struct sockopt *sopt) goto out_imf_rollback; INP_WLOCK_ASSERT(inp); - IN_MULTI_LOCK(); /* * Begin state merge transaction at IGMP layer. @@ -2775,7 +2687,7 @@ inp_set_source_filters(struct inpcb *inp, struct sockopt *sopt) if (error) { CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); IN_MULTI_LIST_UNLOCK(); - goto out_in_multi_locked; + goto out_imf_rollback; } CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); @@ -2784,10 +2696,6 @@ inp_set_source_filters(struct inpcb *inp, struct sockopt *sopt) if (error) CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); -out_in_multi_locked: - - IN_MULTI_UNLOCK(); - out_imf_rollback: if (error) imf_rollback(imf); @@ -2798,6 +2706,7 @@ out_imf_rollback: out_inp_locked: INP_WUNLOCK(inp); + IN_MULTI_UNLOCK(); return (error); } @@ -2968,6 +2877,7 @@ static int sysctl_ip_mcast_filters(SYSCTL_HANDLER_ARGS) { struct in_addr src, group; + struct epoch_tracker et; struct ifnet *ifp; struct ifmultiaddr *ifma; struct in_multi *inm; @@ -3014,7 +2924,7 @@ sysctl_ip_mcast_filters(SYSCTL_HANDLER_ARGS) IN_MULTI_LIST_LOCK(); - IF_ADDR_RLOCK(ifp); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) @@ -3043,7 +2953,7 @@ sysctl_ip_mcast_filters(SYSCTL_HANDLER_ARGS) break; } } - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); IN_MULTI_LIST_UNLOCK(); @@ -3052,7 +2962,14 @@ sysctl_ip_mcast_filters(SYSCTL_HANDLER_ARGS) #if defined(KTR) && (KTR_COMPILE & KTR_IGMPV3) -static const char *inm_modestrs[] = { "un", "in", "ex" }; +static const char *inm_modestrs[] = { + [MCAST_UNDEFINED] = "un", + [MCAST_INCLUDE] = "in", + [MCAST_EXCLUDE] = "ex", +}; +_Static_assert(MCAST_UNDEFINED == 0 && + MCAST_EXCLUDE + 1 == nitems(inm_modestrs), + "inm_modestrs: no longer matches #defines"); static const char * inm_mode_str(const int mode) @@ -3064,16 +2981,20 @@ inm_mode_str(const int mode) } static const char *inm_statestrs[] = { - "not-member", - "silent", - "idle", - "lazy", - "sleeping", - "awakening", - "query-pending", - "sg-query-pending", - "leaving" + [IGMP_NOT_MEMBER] = "not-member", + [IGMP_SILENT_MEMBER] = "silent", + [IGMP_REPORTING_MEMBER] = "reporting", + [IGMP_IDLE_MEMBER] = "idle", + [IGMP_LAZY_MEMBER] = "lazy", + [IGMP_SLEEPING_MEMBER] = "sleeping", + [IGMP_AWAKENING_MEMBER] = "awakening", + [IGMP_G_QUERY_PENDING_MEMBER] = "query-pending", + [IGMP_SG_QUERY_PENDING_MEMBER] = "sg-query-pending", + [IGMP_LEAVING_MEMBER] = "leaving", }; +_Static_assert(IGMP_NOT_MEMBER == 0 && + IGMP_LEAVING_MEMBER + 1 == nitems(inm_statestrs), + "inm_statetrs: no longer matches #defines"); static const char * inm_state_str(const int state) diff --git a/freebsd/sys/netinet/in_pcb.c b/freebsd/sys/netinet/in_pcb.c index c00593e5..6147c566 100644 --- a/freebsd/sys/netinet/in_pcb.c +++ b/freebsd/sys/netinet/in_pcb.c @@ -92,6 +92,9 @@ __FBSDID("$FreeBSD$"); #if defined(INET) || defined(INET6) #include <netinet/in.h> #include <netinet/in_pcb.h> +#ifdef INET +#include <netinet/in_var.h> +#endif #include <netinet/ip_var.h> #include <netinet/tcp_var.h> #ifdef TCPHPTS @@ -99,16 +102,13 @@ __FBSDID("$FreeBSD$"); #endif #include <netinet/udp.h> #include <netinet/udp_var.h> -#endif -#ifdef INET -#include <netinet/in_var.h> -#endif #ifdef INET6 #include <netinet/ip6.h> #include <netinet6/in6_pcb.h> #include <netinet6/in6_var.h> #include <netinet6/ip6_var.h> #endif /* INET6 */ +#endif #include <netipsec/ipsec_support.h> @@ -216,6 +216,22 @@ SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtime, &VNET_NAME(ipport_randomtime), 0, "Minimum time to keep sequental port " "allocation before switching to a random one"); + +#ifdef RATELIMIT +counter_u64_t rate_limit_active; +counter_u64_t rate_limit_alloc_fail; +counter_u64_t rate_limit_set_ok; + +static SYSCTL_NODE(_net_inet_ip, OID_AUTO, rl, CTLFLAG_RD, 0, + "IP Rate Limiting"); +SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, active, CTLFLAG_RD, + &rate_limit_active, "Active rate limited connections"); +SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, alloc_fail, CTLFLAG_RD, + &rate_limit_alloc_fail, "Rate limited connection failures"); +SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, set_ok, CTLFLAG_RD, + &rate_limit_set_ok, "Rate limited setting succeeded"); +#endif /* RATELIMIT */ + #endif /* INET */ /* @@ -516,6 +532,9 @@ in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) if (inp == NULL) return (ENOBUFS); bzero(&inp->inp_start_zero, inp_zero_size); +#ifdef NUMA + inp->inp_numa_domain = M_NODOM; +#endif inp->inp_pcbinfo = pcbinfo; inp->inp_socket = so; inp->inp_cred = crhold(so->so_cred); @@ -1024,6 +1043,7 @@ in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr, struct sockaddr *sa; struct sockaddr_in *sin; struct route sro; + struct epoch_tracker et; int error; KASSERT(laddr != NULL, ("%s: laddr NULL", __func__)); @@ -1059,7 +1079,7 @@ in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr, * network and try to find a corresponding interface to take * the source address from. */ - NET_EPOCH_ENTER(); + NET_EPOCH_ENTER(et); if (sro.ro_rt == NULL || sro.ro_rt->rt_ifp == NULL) { struct in_ifaddr *ia; struct ifnet *ifp; @@ -1223,7 +1243,7 @@ in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr, } done: - NET_EPOCH_EXIT(); + NET_EPOCH_EXIT(et); if (sro.ro_rt != NULL) RTFREE(sro.ro_rt); return (error); @@ -1576,6 +1596,7 @@ in_pcbfree_deferred(epoch_context_t ctx) inp = __containerof(ctx, struct inpcb, inp_epoch_ctx); INP_WLOCK(inp); + CURVNET_SET(inp->inp_vnet); #ifdef INET struct ip_moptions *imo = inp->inp_moptions; inp->inp_moptions = NULL; @@ -1608,6 +1629,7 @@ in_pcbfree_deferred(epoch_context_t ctx) #ifdef INET inp_freemoptions(imo); #endif + CURVNET_RESTORE(); } /* @@ -1785,8 +1807,9 @@ void in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) { struct inpcb *inp; + struct in_multi *inm; + struct in_mfilter *imf; struct ip_moptions *imo; - int i, gap; INP_INFO_WLOCK(pcbinfo); CK_LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) { @@ -1807,17 +1830,18 @@ in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) * * XXX This can all be deferred to an epoch_call */ - for (i = 0, gap = 0; i < imo->imo_num_memberships; - i++) { - if (imo->imo_membership[i]->inm_ifp == ifp) { - IN_MULTI_LOCK_ASSERT(); - in_leavegroup_locked(imo->imo_membership[i], NULL); - gap++; - } else if (gap != 0) - imo->imo_membership[i - gap] = - imo->imo_membership[i]; +restart: + IP_MFILTER_FOREACH(imf, &imo->imo_head) { + if ((inm = imf->imf_inm) == NULL) + continue; + if (inm->inm_ifp != ifp) + continue; + ip_mfilter_remove(&imo->imo_head, imf); + IN_MULTI_LOCK_ASSERT(); + in_leavegroup_locked(inm, NULL); + ip_mfilter_free(imf); + goto restart; } - imo->imo_num_memberships -= gap; } INP_WUNLOCK(inp); } @@ -3174,6 +3198,7 @@ in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate) { union if_snd_tag_modify_params params = { .rate_limit.max_rate = max_pacing_rate, + .rate_limit.flags = M_NOWAIT, }; struct m_snd_tag *mst; struct ifnet *ifp; @@ -3260,7 +3285,8 @@ in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_txqueue_level) */ int in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp, - uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate) + uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate, struct m_snd_tag **st) + { union if_snd_tag_alloc_params params = { .rate_limit.hdr.type = (max_pacing_rate == -1U) ? @@ -3268,12 +3294,13 @@ in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp, .rate_limit.hdr.flowid = flowid, .rate_limit.hdr.flowtype = flowtype, .rate_limit.max_rate = max_pacing_rate, + .rate_limit.flags = M_NOWAIT, }; int error; INP_WLOCK_ASSERT(inp); - if (inp->inp_snd_tag != NULL) + if (*st != NULL) return (EINVAL); if (ifp->if_snd_tag_alloc == NULL) { @@ -3281,16 +3308,37 @@ in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp, } else { error = ifp->if_snd_tag_alloc(ifp, ¶ms, &inp->inp_snd_tag); - /* - * At success increment the refcount on - * the send tag's network interface: - */ - if (error == 0) - if_ref(inp->inp_snd_tag->ifp); +#ifdef INET + if (error == 0) { + counter_u64_add(rate_limit_set_ok, 1); + counter_u64_add(rate_limit_active, 1); + } else + counter_u64_add(rate_limit_alloc_fail, 1); +#endif } return (error); } +void +in_pcbdetach_tag(struct ifnet *ifp, struct m_snd_tag *mst) +{ + if (ifp == NULL) + return; + + /* + * If the device was detached while we still had reference(s) + * on the ifp, we assume if_snd_tag_free() was replaced with + * stubs. + */ + ifp->if_snd_tag_free(mst); + + /* release reference count on network interface */ + if_rele(ifp); +#ifdef INET + counter_u64_add(rate_limit_active, -1); +#endif +} + /* * Free an existing TX rate limit tag based on the "inp->inp_snd_tag", * if any: @@ -3299,7 +3347,6 @@ void in_pcbdetach_txrtlmt(struct inpcb *inp) { struct m_snd_tag *mst; - struct ifnet *ifp; INP_WLOCK_ASSERT(inp); @@ -3309,19 +3356,57 @@ in_pcbdetach_txrtlmt(struct inpcb *inp) if (mst == NULL) return; - ifp = mst->ifp; - if (ifp == NULL) - return; + m_snd_tag_rele(mst); +} + +int +in_pcboutput_txrtlmt_locked(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb, uint32_t max_pacing_rate) +{ + int error; /* - * If the device was detached while we still had reference(s) - * on the ifp, we assume if_snd_tag_free() was replaced with - * stubs. + * If the existing send tag is for the wrong interface due to + * a route change, first drop the existing tag. Set the + * CHANGED flag so that we will keep trying to allocate a new + * tag if we fail to allocate one this time. */ - ifp->if_snd_tag_free(mst); + if (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp) { + in_pcbdetach_txrtlmt(inp); + inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; + } - /* release reference count on network interface */ - if_rele(ifp); + /* + * NOTE: When attaching to a network interface a reference is + * made to ensure the network interface doesn't go away until + * all ratelimit connections are gone. The network interface + * pointers compared below represent valid network interfaces, + * except when comparing towards NULL. + */ + if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) { + error = 0; + } else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) { + if (inp->inp_snd_tag != NULL) + in_pcbdetach_txrtlmt(inp); + error = 0; + } else if (inp->inp_snd_tag == NULL) { + /* + * In order to utilize packet pacing with RSS, we need + * to wait until there is a valid RSS hash before we + * can proceed: + */ + if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) { + error = EAGAIN; + } else { + error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb), + mb->m_pkthdr.flowid, max_pacing_rate, &inp->inp_snd_tag); + } + } else { + error = in_pcbmodify_txrtlmt(inp, max_pacing_rate); + } + if (error == 0 || error == EOPNOTSUPP) + inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED; + + return (error); } /* @@ -3366,36 +3451,8 @@ in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb) */ max_pacing_rate = socket->so_max_pacing_rate; - /* - * NOTE: When attaching to a network interface a reference is - * made to ensure the network interface doesn't go away until - * all ratelimit connections are gone. The network interface - * pointers compared below represent valid network interfaces, - * except when comparing towards NULL. - */ - if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) { - error = 0; - } else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) { - if (inp->inp_snd_tag != NULL) - in_pcbdetach_txrtlmt(inp); - error = 0; - } else if (inp->inp_snd_tag == NULL) { - /* - * In order to utilize packet pacing with RSS, we need - * to wait until there is a valid RSS hash before we - * can proceed: - */ - if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) { - error = EAGAIN; - } else { - error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb), - mb->m_pkthdr.flowid, max_pacing_rate); - } - } else { - error = in_pcbmodify_txrtlmt(inp, max_pacing_rate); - } - if (error == 0 || error == EOPNOTSUPP) - inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED; + error = in_pcboutput_txrtlmt_locked(inp, ifp, mb, max_pacing_rate); + if (did_upgrade) INP_DOWNGRADE(inp); } @@ -3406,16 +3463,11 @@ in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb) void in_pcboutput_eagain(struct inpcb *inp) { - struct socket *socket; bool did_upgrade; if (inp == NULL) return; - socket = inp->inp_socket; - if (socket == NULL) - return; - if (inp->inp_snd_tag == NULL) return; @@ -3442,4 +3494,16 @@ in_pcboutput_eagain(struct inpcb *inp) if (did_upgrade) INP_DOWNGRADE(inp); } + +#ifdef INET +static void +rl_init(void *st) +{ + rate_limit_active = counter_u64_alloc(M_WAITOK); + rate_limit_alloc_fail = counter_u64_alloc(M_WAITOK); + rate_limit_set_ok = counter_u64_alloc(M_WAITOK); +} + +SYSINIT(rl, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, rl_init, NULL); +#endif #endif /* RATELIMIT */ diff --git a/freebsd/sys/netinet/in_pcb.h b/freebsd/sys/netinet/in_pcb.h index ecbd7a22..7d15b24b 100644 --- a/freebsd/sys/netinet/in_pcb.h +++ b/freebsd/sys/netinet/in_pcb.h @@ -57,9 +57,6 @@ #endif #include <sys/ck.h> -#define in6pcb inpcb /* for KAME src sync over BSD*'s */ -#define in6p_sp inp_sp /* for KAME src sync over BSD*'s */ - /* * struct inpcb is the common protocol control block structure used in most * IP transport protocols. @@ -272,7 +269,7 @@ struct inpcb { inp_hpts_calls :1, /* (i) from output hpts */ inp_input_calls :1, /* (i) from input hpts */ inp_spare_bits2 : 4; - uint8_t inp_spare_byte; /* Compiler hole */ + uint8_t inp_numa_domain; /* numa domain */ void *inp_ppcb; /* (i) pointer to per-protocol pcb */ struct socket *inp_socket; /* (i) back pointer to socket */ uint32_t inp_hptsslot; /* Hpts wheel slot this tcb is Lock(i&b) */ @@ -342,7 +339,6 @@ struct inpcb { #define in6p_faddr inp_inc.inc6_faddr #define in6p_laddr inp_inc.inc6_laddr #define in6p_zoneid inp_inc.inc6_zoneid -#define in6p_flowinfo inp_flow #define inp_vnet inp_pcbinfo->ipi_vnet @@ -632,12 +628,12 @@ int inp_so_options(const struct inpcb *inp); #define INP_INFO_LOCK_INIT(ipi, d) \ mtx_init(&(ipi)->ipi_lock, (d), NULL, MTX_DEF| MTX_RECURSE) #define INP_INFO_LOCK_DESTROY(ipi) mtx_destroy(&(ipi)->ipi_lock) -#define INP_INFO_RLOCK_ET(ipi, et) NET_EPOCH_ENTER_ET((et)) +#define INP_INFO_RLOCK_ET(ipi, et) NET_EPOCH_ENTER((et)) #define INP_INFO_WLOCK(ipi) mtx_lock(&(ipi)->ipi_lock) #define INP_INFO_TRY_WLOCK(ipi) mtx_trylock(&(ipi)->ipi_lock) #define INP_INFO_WLOCKED(ipi) mtx_owned(&(ipi)->ipi_lock) -#define INP_INFO_RUNLOCK_ET(ipi, et) NET_EPOCH_EXIT_ET((et)) -#define INP_INFO_RUNLOCK_TP(ipi, tp) NET_EPOCH_EXIT_ET(*(tp)->t_inpcb->inp_et) +#define INP_INFO_RUNLOCK_ET(ipi, et) NET_EPOCH_EXIT((et)) +#define INP_INFO_RUNLOCK_TP(ipi, tp) NET_EPOCH_EXIT(*(tp)->t_inpcb->inp_et) #define INP_INFO_WUNLOCK(ipi) mtx_unlock(&(ipi)->ipi_lock) #define INP_INFO_LOCK_ASSERT(ipi) MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(ipi)->ipi_lock)) #define INP_INFO_RLOCK_ASSERT(ipi) MPASS(in_epoch(net_epoch_preempt)) @@ -670,8 +666,8 @@ int inp_so_options(const struct inpcb *inp); #define INP_HASH_RLOCK(ipi) struct epoch_tracker inp_hash_et; epoch_enter_preempt(net_epoch_preempt, &inp_hash_et) #define INP_HASH_RLOCK_ET(ipi, et) epoch_enter_preempt(net_epoch_preempt, &(et)) #define INP_HASH_WLOCK(ipi) mtx_lock(&(ipi)->ipi_hash_lock) -#define INP_HASH_RUNLOCK(ipi) NET_EPOCH_EXIT_ET(inp_hash_et) -#define INP_HASH_RUNLOCK_ET(ipi, et) NET_EPOCH_EXIT_ET((et)) +#define INP_HASH_RUNLOCK(ipi) NET_EPOCH_EXIT(inp_hash_et) +#define INP_HASH_RUNLOCK_ET(ipi, et) NET_EPOCH_EXIT((et)) #define INP_HASH_WUNLOCK(ipi) mtx_unlock(&(ipi)->ipi_hash_lock) #define INP_HASH_LOCK_ASSERT(ipi) MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(ipi)->ipi_hash_lock)) #define INP_HASH_WLOCK_ASSERT(ipi) mtx_assert(&(ipi)->ipi_hash_lock, MA_OWNED); @@ -759,7 +755,9 @@ int inp_so_options(const struct inpcb *inp); #define INP_ORIGDSTADDR 0x00000800 /* receive IP dst address/port */ #define INP_CANNOT_DO_ECN 0x00001000 /* The stack does not do ECN */ #define INP_REUSEPORT_LB 0x00002000 /* SO_REUSEPORT_LB option is set */ - +#define INP_SUPPORTS_MBUFQ 0x00004000 /* Supports the mbuf queue method of LRO */ +#define INP_MBUF_QUEUE_READY 0x00008000 /* The transport is pacing, inputs can be queued */ +#define INP_DONT_SACK_QUEUE 0x00010000 /* If a sack arrives do not wake me */ /* * Flags passed to in_pcblookup*() functions. */ @@ -771,7 +769,6 @@ int inp_so_options(const struct inpcb *inp); INPLOOKUP_WLOCKPCB) #define sotoinpcb(so) ((struct inpcb *)(so)->so_pcb) -#define sotoin6pcb(so) sotoinpcb(so) /* for KAME src sync over BSD*'s */ #define INP_SOCKAF(so) so->so_proto->pr_domain->dom_family @@ -881,8 +878,13 @@ struct sockaddr * in_sockaddr(in_port_t port, struct in_addr *addr); void in_pcbsosetlabel(struct socket *so); #ifdef RATELIMIT -int in_pcbattach_txrtlmt(struct inpcb *, struct ifnet *, uint32_t, uint32_t, uint32_t); +int +in_pcboutput_txrtlmt_locked(struct inpcb *, struct ifnet *, + struct mbuf *, uint32_t); +int in_pcbattach_txrtlmt(struct inpcb *, struct ifnet *, uint32_t, uint32_t, + uint32_t, struct m_snd_tag **); void in_pcbdetach_txrtlmt(struct inpcb *); +void in_pcbdetach_tag(struct ifnet *ifp, struct m_snd_tag *mst); int in_pcbmodify_txrtlmt(struct inpcb *, uint32_t); int in_pcbquery_txrtlmt(struct inpcb *, uint32_t *); int in_pcbquery_txrlevel(struct inpcb *, uint32_t *); diff --git a/freebsd/sys/netinet/in_var.h b/freebsd/sys/netinet/in_var.h index 5b7a464b..50112481 100644 --- a/freebsd/sys/netinet/in_var.h +++ b/freebsd/sys/netinet/in_var.h @@ -232,9 +232,61 @@ struct in_mfilter { struct ip_msource_tree imf_sources; /* source list for (S,G) */ u_long imf_nsrc; /* # of source entries */ uint8_t imf_st[2]; /* state before/at commit */ + struct in_multi *imf_inm; /* associated multicast address */ + STAILQ_ENTRY(in_mfilter) imf_entry; /* list entry */ }; /* + * Helper types and functions for IPv4 multicast filters. + */ +STAILQ_HEAD(ip_mfilter_head, in_mfilter); + +struct in_mfilter *ip_mfilter_alloc(int mflags, int st0, int st1); +void ip_mfilter_free(struct in_mfilter *); + +static inline void +ip_mfilter_init(struct ip_mfilter_head *head) +{ + + STAILQ_INIT(head); +} + +static inline struct in_mfilter * +ip_mfilter_first(const struct ip_mfilter_head *head) +{ + + return (STAILQ_FIRST(head)); +} + +static inline void +ip_mfilter_insert(struct ip_mfilter_head *head, struct in_mfilter *imf) +{ + + STAILQ_INSERT_TAIL(head, imf, imf_entry); +} + +static inline void +ip_mfilter_remove(struct ip_mfilter_head *head, struct in_mfilter *imf) +{ + + STAILQ_REMOVE(head, imf, in_mfilter, imf_entry); +} + +#define IP_MFILTER_FOREACH(imf, head) \ + STAILQ_FOREACH(imf, head, imf_entry) + +static inline size_t +ip_mfilter_count(struct ip_mfilter_head *head) +{ + struct in_mfilter *imf; + size_t num = 0; + + STAILQ_FOREACH(imf, head, imf_entry) + num++; + return (num); +} + +/* * IPv4 group descriptor. * * For every entry on an ifnet's if_multiaddrs list which represents diff --git a/freebsd/sys/netinet/ip_carp.c b/freebsd/sys/netinet/ip_carp.c index 1fb208a2..03ce7bbe 100644 --- a/freebsd/sys/netinet/ip_carp.c +++ b/freebsd/sys/netinet/ip_carp.c @@ -646,6 +646,7 @@ carp_input_c(struct mbuf *m, struct carp_header *ch, sa_family_t af) struct carp_softc *sc; uint64_t tmp_counter; struct timeval sc_tv, ch_tv; + struct epoch_tracker et; int error; /* @@ -659,7 +660,7 @@ carp_input_c(struct mbuf *m, struct carp_header *ch, sa_family_t af) * (these should never happen, and as noted above, we may * miss real loops; this is just a double-check). */ - IF_ADDR_RLOCK(ifp); + NET_EPOCH_ENTER(et); error = 0; match = NULL; IFNET_FOREACH_IFA(ifp, ifa) { @@ -673,7 +674,7 @@ carp_input_c(struct mbuf *m, struct carp_header *ch, sa_family_t af) ifa = error ? NULL : match; if (ifa != NULL) ifa_ref(ifa); - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); if (ifa == NULL) { if (error == ELOOP) { @@ -881,18 +882,19 @@ carp_send_ad_error(struct carp_softc *sc, int error) static struct ifaddr * carp_best_ifa(int af, struct ifnet *ifp) { + struct epoch_tracker et; struct ifaddr *ifa, *best; if (af >= AF_MAX) return (NULL); best = NULL; - IF_ADDR_RLOCK(ifp); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family == af && (best == NULL || ifa_preferred(best, ifa))) best = ifa; } - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); if (best != NULL) ifa_ref(best); return (best); @@ -1169,10 +1171,11 @@ carp_send_na(struct carp_softc *sc) struct ifaddr * carp_iamatch6(struct ifnet *ifp, struct in6_addr *taddr) { + struct epoch_tracker et; struct ifaddr *ifa; ifa = NULL; - IF_ADDR_RLOCK(ifp); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; @@ -1184,7 +1187,7 @@ carp_iamatch6(struct ifnet *ifp, struct in6_addr *taddr) ifa_ref(ifa); break; } - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); return (ifa); } @@ -1192,16 +1195,17 @@ carp_iamatch6(struct ifnet *ifp, struct in6_addr *taddr) caddr_t carp_macmatch6(struct ifnet *ifp, struct mbuf *m, const struct in6_addr *taddr) { + struct epoch_tracker et; struct ifaddr *ifa; - IF_ADDR_RLOCK(ifp); + NET_EPOCH_ENTER(et); IFNET_FOREACH_IFA(ifp, ifa) if (ifa->ifa_addr->sa_family == AF_INET6 && IN6_ARE_ADDR_EQUAL(taddr, IFA_IN6(ifa))) { struct carp_softc *sc = ifa->ifa_carp; struct m_tag *mtag; - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); mtag = m_tag_get(PACKET_TAG_CARP, sizeof(struct carp_softc *), M_NOWAIT); @@ -1214,7 +1218,7 @@ carp_macmatch6(struct ifnet *ifp, struct mbuf *m, const struct in6_addr *taddr) return (LLADDR(&sc->sc_addr)); } - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); return (NULL); } @@ -1369,25 +1373,24 @@ carp_multicast_setup(struct carp_if *cif, sa_family_t sa) case AF_INET: { struct ip_moptions *imo = &cif->cif_imo; + struct in_mfilter *imf; struct in_addr addr; - if (imo->imo_membership) + if (ip_mfilter_first(&imo->imo_head) != NULL) return (0); - imo->imo_membership = (struct in_multi **)malloc( - (sizeof(struct in_multi *) * IP_MIN_MEMBERSHIPS), M_CARP, - M_WAITOK); - imo->imo_mfilters = NULL; - imo->imo_max_memberships = IP_MIN_MEMBERSHIPS; + imf = ip_mfilter_alloc(M_WAITOK, 0, 0); + ip_mfilter_init(&imo->imo_head); imo->imo_multicast_vif = -1; addr.s_addr = htonl(INADDR_CARP_GROUP); if ((error = in_joingroup(ifp, &addr, NULL, - &imo->imo_membership[0])) != 0) { - free(imo->imo_membership, M_CARP); + &imf->imf_inm)) != 0) { + ip_mfilter_free(imf); break; } - imo->imo_num_memberships++; + + ip_mfilter_insert(&imo->imo_head, imf); imo->imo_multicast_ifp = ifp; imo->imo_multicast_ttl = CARP_DFLTTL; imo->imo_multicast_loop = 0; @@ -1398,17 +1401,16 @@ carp_multicast_setup(struct carp_if *cif, sa_family_t sa) case AF_INET6: { struct ip6_moptions *im6o = &cif->cif_im6o; + struct in6_mfilter *im6f[2]; struct in6_addr in6; - struct in6_multi *in6m; - if (im6o->im6o_membership) + if (ip6_mfilter_first(&im6o->im6o_head)) return (0); - im6o->im6o_membership = (struct in6_multi **)malloc( - (sizeof(struct in6_multi *) * IPV6_MIN_MEMBERSHIPS), M_CARP, - M_ZERO | M_WAITOK); - im6o->im6o_mfilters = NULL; - im6o->im6o_max_memberships = IPV6_MIN_MEMBERSHIPS; + im6f[0] = ip6_mfilter_alloc(M_WAITOK, 0, 0); + im6f[1] = ip6_mfilter_alloc(M_WAITOK, 0, 0); + + ip6_mfilter_init(&im6o->im6o_head); im6o->im6o_multicast_hlim = CARP_DFLTTL; im6o->im6o_multicast_ifp = ifp; @@ -1417,17 +1419,15 @@ carp_multicast_setup(struct carp_if *cif, sa_family_t sa) in6.s6_addr16[0] = htons(0xff02); in6.s6_addr8[15] = 0x12; if ((error = in6_setscope(&in6, ifp, NULL)) != 0) { - free(im6o->im6o_membership, M_CARP); + ip6_mfilter_free(im6f[0]); + ip6_mfilter_free(im6f[1]); break; } - in6m = NULL; - if ((error = in6_joingroup(ifp, &in6, NULL, &in6m, 0)) != 0) { - free(im6o->im6o_membership, M_CARP); + if ((error = in6_joingroup(ifp, &in6, NULL, &im6f[0]->im6f_in6m, 0)) != 0) { + ip6_mfilter_free(im6f[0]); + ip6_mfilter_free(im6f[1]); break; } - in6m_acquire(in6m); - im6o->im6o_membership[0] = in6m; - im6o->im6o_num_memberships++; /* Join solicited multicast address. */ bzero(&in6, sizeof(in6)); @@ -1436,20 +1436,21 @@ carp_multicast_setup(struct carp_if *cif, sa_family_t sa) in6.s6_addr32[2] = htonl(1); in6.s6_addr32[3] = 0; in6.s6_addr8[12] = 0xff; + if ((error = in6_setscope(&in6, ifp, NULL)) != 0) { - in6_leavegroup(im6o->im6o_membership[0], NULL); - free(im6o->im6o_membership, M_CARP); + ip6_mfilter_free(im6f[0]); + ip6_mfilter_free(im6f[1]); break; } - in6m = NULL; - if ((error = in6_joingroup(ifp, &in6, NULL, &in6m, 0)) != 0) { - in6_leavegroup(im6o->im6o_membership[0], NULL); - free(im6o->im6o_membership, M_CARP); + + if ((error = in6_joingroup(ifp, &in6, NULL, &im6f[1]->im6f_in6m, 0)) != 0) { + in6_leavegroup(im6f[0]->im6f_in6m, NULL); + ip6_mfilter_free(im6f[0]); + ip6_mfilter_free(im6f[1]); break; } - in6m_acquire(in6m); - im6o->im6o_membership[1] = in6m; - im6o->im6o_num_memberships++; + ip6_mfilter_insert(&im6o->im6o_head, im6f[0]); + ip6_mfilter_insert(&im6o->im6o_head, im6f[1]); break; } #endif @@ -1464,35 +1465,38 @@ carp_multicast_setup(struct carp_if *cif, sa_family_t sa) static void carp_multicast_cleanup(struct carp_if *cif, sa_family_t sa) { - +#ifdef INET + struct ip_moptions *imo = &cif->cif_imo; + struct in_mfilter *imf; +#endif +#ifdef INET6 + struct ip6_moptions *im6o = &cif->cif_im6o; + struct in6_mfilter *im6f; +#endif sx_assert(&carp_sx, SA_XLOCKED); switch (sa) { #ifdef INET case AF_INET: - if (cif->cif_naddrs == 0) { - struct ip_moptions *imo = &cif->cif_imo; - - in_leavegroup(imo->imo_membership[0], NULL); - KASSERT(imo->imo_mfilters == NULL, - ("%s: imo_mfilters != NULL", __func__)); - free(imo->imo_membership, M_CARP); - imo->imo_membership = NULL; + if (cif->cif_naddrs != 0) + break; + while ((imf = ip_mfilter_first(&imo->imo_head)) != NULL) { + ip_mfilter_remove(&imo->imo_head, imf); + in_leavegroup(imf->imf_inm, NULL); + ip_mfilter_free(imf); } break; #endif #ifdef INET6 case AF_INET6: - if (cif->cif_naddrs6 == 0) { - struct ip6_moptions *im6o = &cif->cif_im6o; - - in6_leavegroup(im6o->im6o_membership[0], NULL); - in6_leavegroup(im6o->im6o_membership[1], NULL); - KASSERT(im6o->im6o_mfilters == NULL, - ("%s: im6o_mfilters != NULL", __func__)); - free(im6o->im6o_membership, M_CARP); - im6o->im6o_membership = NULL; + if (cif->cif_naddrs6 != 0) + break; + + while ((im6f = ip6_mfilter_first(&im6o->im6o_head)) != NULL) { + ip6_mfilter_remove(&im6o->im6o_head, im6f); + in6_leavegroup(im6f->im6f_in6m, NULL); + ip6_mfilter_free(im6f); } break; #endif @@ -2178,21 +2182,6 @@ static struct protosw in6_carp_protosw = { }; #endif -#ifdef VIMAGE -#if defined(__i386__) -/* - * XXX This is a hack to work around an absolute relocation outside - * set_vnet by one (on the stop symbol) for carpstats. Add a dummy variable - * to the end of the file in the hope that the linker will just keep the - * order (as it seems to do at the moment). It is understood to be fragile. - * See PR 230857 for a longer discussion of the problem and the referenced - * review for possible alternate solutions. Each is a hack; we just need - * the least intrusive one for the next release. - */ -VNET_DEFINE(char, carp_zzz) = 0xde; -#endif -#endif - static void carp_mod_cleanup(void) { diff --git a/freebsd/sys/netinet/ip_divert.c b/freebsd/sys/netinet/ip_divert.c index 6a99645e..a7f602b2 100644 --- a/freebsd/sys/netinet/ip_divert.c +++ b/freebsd/sys/netinet/ip_divert.c @@ -186,7 +186,7 @@ div_input(struct mbuf **mp, int *offp, int proto) * then pass them along with mbuf chain. */ static void -divert_packet(struct mbuf *m, int incoming) +divert_packet(struct mbuf *m, bool incoming) { struct ip *ip; struct inpcb *inp; @@ -467,19 +467,20 @@ div_output(struct socket *so, struct mbuf *m, struct sockaddr_in *sin, * Clear the port and the ifname to make sure * there are no distractions for ifa_ifwithaddr. */ + struct epoch_tracker et; struct ifaddr *ifa; bzero(sin->sin_zero, sizeof(sin->sin_zero)); sin->sin_port = 0; - NET_EPOCH_ENTER(); + NET_EPOCH_ENTER(et); ifa = ifa_ifwithaddr((struct sockaddr *) sin); if (ifa == NULL) { error = EADDRNOTAVAIL; - NET_EPOCH_EXIT(); + NET_EPOCH_EXIT(et); goto cantsend; } m->m_pkthdr.rcvif = ifa->ifa_ifp; - NET_EPOCH_EXIT(); + NET_EPOCH_EXIT(et); } #ifdef MAC mac_socket_create_mbuf(so, m); diff --git a/freebsd/sys/netinet/ip_fastfwd.c b/freebsd/sys/netinet/ip_fastfwd.c index 05deb4d8..1d55c3d2 100644 --- a/freebsd/sys/netinet/ip_fastfwd.c +++ b/freebsd/sys/netinet/ip_fastfwd.c @@ -92,11 +92,11 @@ __FBSDID("$FreeBSD$"); #include <sys/socket.h> #include <sys/sysctl.h> -#include <net/pfil.h> #include <net/if.h> #include <net/if_types.h> #include <net/if_var.h> #include <net/if_dl.h> +#include <net/pfil.h> #include <net/route.h> #include <net/vnet.h> @@ -230,12 +230,11 @@ ip_tryforward(struct mbuf *m) /* * Run through list of ipfilter hooks for input packets */ - if (!PFIL_HOOKED(&V_inet_pfil_hook)) + if (!PFIL_HOOKED_IN(V_inet_pfil_head)) goto passin; - if (pfil_run_hooks( - &V_inet_pfil_hook, &m, m->m_pkthdr.rcvif, PFIL_IN, 0, NULL) || - m == NULL) + if (pfil_run_hooks(V_inet_pfil_head, &m, m->m_pkthdr.rcvif, PFIL_IN, + NULL) != PFIL_PASS) goto drop; M_ASSERTVALID(m); @@ -323,13 +322,12 @@ passin: /* * Step 5: outgoing firewall packet processing */ - if (!PFIL_HOOKED(&V_inet_pfil_hook)) + if (!PFIL_HOOKED_OUT(V_inet_pfil_head)) goto passout; - if (pfil_run_hooks(&V_inet_pfil_hook, &m, nh.nh_ifp, PFIL_OUT, PFIL_FWD, - NULL) || m == NULL) { + if (pfil_run_hooks(V_inet_pfil_head, &m, nh.nh_ifp, + PFIL_OUT | PFIL_FWD, NULL) != PFIL_PASS) goto drop; - } M_ASSERTVALID(m); M_ASSERTPKTHDR(m); diff --git a/freebsd/sys/netinet/ip_fw.h b/freebsd/sys/netinet/ip_fw.h index 41351215..7a01c82b 100644 --- a/freebsd/sys/netinet/ip_fw.h +++ b/freebsd/sys/netinet/ip_fw.h @@ -134,6 +134,13 @@ typedef struct _ip_fw3_opheader { #define IP_FW_NPTV6_STATS 154 /* Get NPTv6 instance statistics */ #define IP_FW_NPTV6_RESET_STATS 155 /* Reset NPTv6 instance statistics */ +#define IP_FW_NAT64CLAT_CREATE 160 /* Create clat NAT64 instance */ +#define IP_FW_NAT64CLAT_DESTROY 161 /* Destroy clat NAT64 instance */ +#define IP_FW_NAT64CLAT_CONFIG 162 /* Modify clat NAT64 instance */ +#define IP_FW_NAT64CLAT_LIST 163 /* List clat NAT64 instances */ +#define IP_FW_NAT64CLAT_STATS 164 /* Get NAT64CLAT instance statistics */ +#define IP_FW_NAT64CLAT_RESET_STATS 165 /* Reset NAT64CLAT instance statistics */ + /* * The kernel representation of ipfw rules is made of a list of * 'instructions' (for all practical purposes equivalent to BPF @@ -286,6 +293,7 @@ enum ipfw_opcodes { /* arguments (4 byte each) */ O_EXTERNAL_DATA, /* variable length data */ O_SKIP_ACTION, /* none */ + O_TCPMSS, /* arg1=MSS value */ O_LAST_OPCODE /* not an opcode! */ }; diff --git a/freebsd/sys/netinet/ip_gre.c b/freebsd/sys/netinet/ip_gre.c index 1758bfff..560478d6 100644 --- a/freebsd/sys/netinet/ip_gre.c +++ b/freebsd/sys/netinet/ip_gre.c @@ -46,6 +46,7 @@ __FBSDID("$FreeBSD$"); #include <sys/jail.h> #include <sys/systm.h> #include <sys/socket.h> +#include <sys/socketvar.h> #include <sys/sockio.h> #include <sys/mbuf.h> #include <sys/errno.h> @@ -60,15 +61,19 @@ __FBSDID("$FreeBSD$"); #include <netinet/in.h> #include <netinet/in_var.h> +#include <netinet/in_pcb.h> #include <netinet/ip.h> #include <netinet/ip_encap.h> #include <netinet/ip_var.h> +#include <netinet/udp.h> +#include <netinet/udp_var.h> #ifdef INET6 #include <netinet/ip6.h> #endif #include <net/if_gre.h> +#include <machine/in_cksum.h> #define GRE_TTL 30 VNET_DEFINE(int, ip_gre_ttl) = GRE_TTL; @@ -76,14 +81,22 @@ VNET_DEFINE(int, ip_gre_ttl) = GRE_TTL; SYSCTL_INT(_net_inet_ip, OID_AUTO, grettl, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_gre_ttl), 0, "Default TTL value for encapsulated packets"); +struct in_gre_socket { + struct gre_socket base; + in_addr_t addr; +}; +VNET_DEFINE_STATIC(struct gre_sockets *, ipv4_sockets) = NULL; VNET_DEFINE_STATIC(struct gre_list *, ipv4_hashtbl) = NULL; VNET_DEFINE_STATIC(struct gre_list *, ipv4_srchashtbl) = NULL; +#define V_ipv4_sockets VNET(ipv4_sockets) #define V_ipv4_hashtbl VNET(ipv4_hashtbl) #define V_ipv4_srchashtbl VNET(ipv4_srchashtbl) #define GRE_HASH(src, dst) (V_ipv4_hashtbl[\ in_gre_hashval((src), (dst)) & (GRE_HASH_SIZE - 1)]) #define GRE_SRCHASH(src) (V_ipv4_srchashtbl[\ fnv_32_buf(&(src), sizeof(src), FNV1_32_INIT) & (GRE_HASH_SIZE - 1)]) +#define GRE_SOCKHASH(src) (V_ipv4_sockets[\ + fnv_32_buf(&(src), sizeof(src), FNV1_32_INIT) & (GRE_HASH_SIZE - 1)]) #define GRE_HASH_SC(sc) GRE_HASH((sc)->gre_oip.ip_src.s_addr,\ (sc)->gre_oip.ip_dst.s_addr) @@ -96,17 +109,43 @@ in_gre_hashval(in_addr_t src, in_addr_t dst) return (fnv_32_buf(&dst, sizeof(dst), ret)); } +static struct gre_socket* +in_gre_lookup_socket(in_addr_t addr) +{ + struct gre_socket *gs; + struct in_gre_socket *s; + + CK_LIST_FOREACH(gs, &GRE_SOCKHASH(addr), chain) { + s = __containerof(gs, struct in_gre_socket, base); + if (s->addr == addr) + break; + } + return (gs); +} + static int -in_gre_checkdup(const struct gre_softc *sc, in_addr_t src, in_addr_t dst) +in_gre_checkdup(const struct gre_softc *sc, in_addr_t src, in_addr_t dst, + uint32_t opts) { + struct gre_list *head; struct gre_softc *tmp; + struct gre_socket *gs; if (sc->gre_family == AF_INET && sc->gre_oip.ip_src.s_addr == src && - sc->gre_oip.ip_dst.s_addr == dst) + sc->gre_oip.ip_dst.s_addr == dst && + (sc->gre_options & GRE_UDPENCAP) == (opts & GRE_UDPENCAP)) return (EEXIST); - CK_LIST_FOREACH(tmp, &GRE_HASH(src, dst), chain) { + if (opts & GRE_UDPENCAP) { + gs = in_gre_lookup_socket(src); + if (gs == NULL) + return (0); + head = &gs->list; + } else + head = &GRE_HASH(src, dst); + + CK_LIST_FOREACH(tmp, head, chain) { if (tmp == sc) continue; if (tmp->gre_oip.ip_src.s_addr == src && @@ -183,35 +222,228 @@ in_gre_srcaddr(void *arg __unused, const struct sockaddr *sa, } static void +in_gre_udp_input(struct mbuf *m, int off, struct inpcb *inp, + const struct sockaddr *sa, void *ctx) +{ + struct epoch_tracker et; + struct gre_socket *gs; + struct gre_softc *sc; + in_addr_t dst; + + NET_EPOCH_ENTER(et); + /* + * udp_append() holds reference to inp, it is safe to check + * inp_flags2 without INP_RLOCK(). + * If socket was closed before we have entered NET_EPOCH section, + * INP_FREED flag should be set. Otherwise it should be safe to + * make access to ctx data, because gre_so will be freed by + * gre_sofree() via epoch_call(). + */ + if (__predict_false(inp->inp_flags2 & INP_FREED)) { + NET_EPOCH_EXIT(et); + m_freem(m); + return; + } + + gs = (struct gre_socket *)ctx; + dst = ((const struct sockaddr_in *)sa)->sin_addr.s_addr; + CK_LIST_FOREACH(sc, &gs->list, chain) { + if (sc->gre_oip.ip_dst.s_addr == dst) + break; + } + if (sc != NULL && (GRE2IFP(sc)->if_flags & IFF_UP) != 0){ + gre_input(m, off + sizeof(struct udphdr), IPPROTO_UDP, sc); + NET_EPOCH_EXIT(et); + return; + } + m_freem(m); + NET_EPOCH_EXIT(et); +} + +static int +in_gre_setup_socket(struct gre_softc *sc) +{ + struct sockopt sopt; + struct sockaddr_in sin; + struct in_gre_socket *s; + struct gre_socket *gs; + in_addr_t addr; + int error, value; + + /* + * NOTE: we are protected with gre_ioctl_sx lock. + * + * First check that socket is already configured. + * If so, check that source addres was not changed. + * If address is different, check that there are no other tunnels + * and close socket. + */ + addr = sc->gre_oip.ip_src.s_addr; + gs = sc->gre_so; + if (gs != NULL) { + s = __containerof(gs, struct in_gre_socket, base); + if (s->addr != addr) { + if (CK_LIST_EMPTY(&gs->list)) { + CK_LIST_REMOVE(gs, chain); + soclose(gs->so); + epoch_call(net_epoch_preempt, &gs->epoch_ctx, + gre_sofree); + } + gs = sc->gre_so = NULL; + } + } + + if (gs == NULL) { + /* + * Check that socket for given address is already + * configured. + */ + gs = in_gre_lookup_socket(addr); + if (gs == NULL) { + s = malloc(sizeof(*s), M_GRE, M_WAITOK | M_ZERO); + s->addr = addr; + gs = &s->base; + + error = socreate(sc->gre_family, &gs->so, + SOCK_DGRAM, IPPROTO_UDP, curthread->td_ucred, + curthread); + if (error != 0) { + if_printf(GRE2IFP(sc), + "cannot create socket: %d\n", error); + free(s, M_GRE); + return (error); + } + + error = udp_set_kernel_tunneling(gs->so, + in_gre_udp_input, NULL, gs); + if (error != 0) { + if_printf(GRE2IFP(sc), + "cannot set UDP tunneling: %d\n", error); + goto fail; + } + + memset(&sopt, 0, sizeof(sopt)); + sopt.sopt_dir = SOPT_SET; + sopt.sopt_level = IPPROTO_IP; + sopt.sopt_name = IP_BINDANY; + sopt.sopt_val = &value; + sopt.sopt_valsize = sizeof(value); + value = 1; + error = sosetopt(gs->so, &sopt); + if (error != 0) { + if_printf(GRE2IFP(sc), + "cannot set IP_BINDANY opt: %d\n", error); + goto fail; + } + + memset(&sin, 0, sizeof(sin)); + sin.sin_family = AF_INET; + sin.sin_len = sizeof(sin); + sin.sin_addr.s_addr = addr; + sin.sin_port = htons(GRE_UDPPORT); + error = sobind(gs->so, (struct sockaddr *)&sin, + curthread); + if (error != 0) { + if_printf(GRE2IFP(sc), + "cannot bind socket: %d\n", error); + goto fail; + } + /* Add socket to the chain */ + CK_LIST_INSERT_HEAD(&GRE_SOCKHASH(addr), gs, chain); + } + } + + /* Add softc to the socket's list */ + CK_LIST_INSERT_HEAD(&gs->list, sc, chain); + sc->gre_so = gs; + return (0); +fail: + soclose(gs->so); + free(s, M_GRE); + return (error); +} + +static int in_gre_attach(struct gre_softc *sc) { + struct grehdr *gh; + int error; - sc->gre_hlen = sizeof(struct greip); + if (sc->gre_options & GRE_UDPENCAP) { + sc->gre_csumflags = CSUM_UDP; + sc->gre_hlen = sizeof(struct greudp); + sc->gre_oip.ip_p = IPPROTO_UDP; + gh = &sc->gre_udphdr->gi_gre; + gre_update_udphdr(sc, &sc->gre_udp, + in_pseudo(sc->gre_oip.ip_src.s_addr, + sc->gre_oip.ip_dst.s_addr, 0)); + } else { + sc->gre_hlen = sizeof(struct greip); + sc->gre_oip.ip_p = IPPROTO_GRE; + gh = &sc->gre_iphdr->gi_gre; + } sc->gre_oip.ip_v = IPVERSION; sc->gre_oip.ip_hl = sizeof(struct ip) >> 2; - sc->gre_oip.ip_p = IPPROTO_GRE; - gre_updatehdr(sc, &sc->gre_gihdr->gi_gre); - CK_LIST_INSERT_HEAD(&GRE_HASH_SC(sc), sc, chain); + gre_update_hdr(sc, gh); + + /* + * If we return error, this means that sc is not linked, + * and caller should reset gre_family and free(sc->gre_hdr). + */ + if (sc->gre_options & GRE_UDPENCAP) { + error = in_gre_setup_socket(sc); + if (error != 0) + return (error); + } else + CK_LIST_INSERT_HEAD(&GRE_HASH_SC(sc), sc, chain); CK_LIST_INSERT_HEAD(&GRE_SRCHASH(sc->gre_oip.ip_src.s_addr), sc, srchash); + + /* Set IFF_DRV_RUNNING if interface is ready */ + in_gre_set_running(sc); + return (0); } -void +int in_gre_setopts(struct gre_softc *sc, u_long cmd, uint32_t value) { - - MPASS(cmd == GRESKEY || cmd == GRESOPTS); + int error; /* NOTE: we are protected with gre_ioctl_sx lock */ + MPASS(cmd == GRESKEY || cmd == GRESOPTS || cmd == GRESPORT); MPASS(sc->gre_family == AF_INET); + + /* + * If we are going to change encapsulation protocol, do check + * for duplicate tunnels. Return EEXIST here to do not confuse + * user. + */ + if (cmd == GRESOPTS && + (sc->gre_options & GRE_UDPENCAP) != (value & GRE_UDPENCAP) && + in_gre_checkdup(sc, sc->gre_oip.ip_src.s_addr, + sc->gre_oip.ip_dst.s_addr, value) == EADDRNOTAVAIL) + return (EEXIST); + CK_LIST_REMOVE(sc, chain); CK_LIST_REMOVE(sc, srchash); GRE_WAIT(); - if (cmd == GRESKEY) + switch (cmd) { + case GRESKEY: sc->gre_key = value; - else + break; + case GRESOPTS: sc->gre_options = value; - in_gre_attach(sc); + break; + case GRESPORT: + sc->gre_port = value; + break; + } + error = in_gre_attach(sc); + if (error != 0) { + sc->gre_family = 0; + free(sc->gre_hdr, M_GRE); + } + return (error); } int @@ -243,9 +475,10 @@ in_gre_ioctl(struct gre_softc *sc, u_long cmd, caddr_t data) if (V_ipv4_hashtbl == NULL) { V_ipv4_hashtbl = gre_hashinit(); V_ipv4_srchashtbl = gre_hashinit(); + V_ipv4_sockets = (struct gre_sockets *)gre_hashinit(); } error = in_gre_checkdup(sc, src->sin_addr.s_addr, - dst->sin_addr.s_addr); + dst->sin_addr.s_addr, sc->gre_options); if (error == EADDRNOTAVAIL) break; if (error == EEXIST) { @@ -253,7 +486,7 @@ in_gre_ioctl(struct gre_softc *sc, u_long cmd, caddr_t data) error = 0; break; } - ip = malloc(sizeof(struct greip) + 3 * sizeof(uint32_t), + ip = malloc(sizeof(struct greudp) + 3 * sizeof(uint32_t), M_GRE, M_WAITOK | M_ZERO); ip->ip_src.s_addr = src->sin_addr.s_addr; ip->ip_dst.s_addr = dst->sin_addr.s_addr; @@ -269,8 +502,11 @@ in_gre_ioctl(struct gre_softc *sc, u_long cmd, caddr_t data) sc->gre_hdr = ip; sc->gre_oseq = 0; sc->gre_iseq = UINT32_MAX; - in_gre_attach(sc); - in_gre_set_running(sc); + error = in_gre_attach(sc); + if (error != 0) { + sc->gre_family = 0; + free(sc->gre_hdr, M_GRE); + } break; case SIOCGIFPSRCADDR: case SIOCGIFPDSTADDR: @@ -356,5 +592,6 @@ in_gre_uninit(void) V_ipv4_hashtbl = NULL; GRE_WAIT(); gre_hashdestroy(V_ipv4_srchashtbl); + gre_hashdestroy((struct gre_list *)V_ipv4_sockets); } } diff --git a/freebsd/sys/netinet/ip_icmp.c b/freebsd/sys/netinet/ip_icmp.c index 7e75f3d9..e86d0afc 100644 --- a/freebsd/sys/netinet/ip_icmp.c +++ b/freebsd/sys/netinet/ip_icmp.c @@ -156,7 +156,7 @@ SYSCTL_INT(_net_inet_icmp, OID_AUTO, bmcastecho, CTLFLAG_VNET | CTLFLAG_RW, VNET_DEFINE_STATIC(int, icmptstamprepl) = 1; #define V_icmptstamprepl VNET(icmptstamprepl) -SYSCTL_INT(_net_inet_icmp, OID_AUTO, tstamprepl, CTLFLAG_RW, +SYSCTL_INT(_net_inet_icmp, OID_AUTO, tstamprepl, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmptstamprepl), 0, "Respond to ICMP Timestamp packets"); @@ -395,6 +395,7 @@ freeit: int icmp_input(struct mbuf **mp, int *offp, int proto) { + struct epoch_tracker et; struct icmp *icp; struct in_ifaddr *ia; struct mbuf *m = *mp; @@ -422,7 +423,7 @@ icmp_input(struct mbuf **mp, int *offp, int proto) inet_ntoa_r(ip->ip_dst, dstbuf), icmplen); } #endif - NET_EPOCH_ENTER(); + NET_EPOCH_ENTER(et); if (icmplen < ICMP_MINLEN) { ICMPSTAT_INC(icps_tooshort); goto freeit; @@ -430,7 +431,7 @@ icmp_input(struct mbuf **mp, int *offp, int proto) i = hlen + min(icmplen, ICMP_ADVLENMIN); if (m->m_len < i && (m = m_pullup(m, i)) == NULL) { ICMPSTAT_INC(icps_tooshort); - NET_EPOCH_EXIT(); + NET_EPOCH_EXIT(et); return (IPPROTO_DONE); } ip = mtod(m, struct ip *); @@ -548,7 +549,7 @@ icmp_input(struct mbuf **mp, int *offp, int proto) if (m->m_len < i && (m = m_pullup(m, i)) == NULL) { /* This should actually not happen */ ICMPSTAT_INC(icps_tooshort); - NET_EPOCH_EXIT(); + NET_EPOCH_EXIT(et); return (IPPROTO_DONE); } ip = mtod(m, struct ip *); @@ -641,7 +642,7 @@ reflect: ICMPSTAT_INC(icps_reflect); ICMPSTAT_INC(icps_outhist[icp->icmp_type]); icmp_reflect(m); - NET_EPOCH_EXIT(); + NET_EPOCH_EXIT(et); return (IPPROTO_DONE); case ICMP_REDIRECT: @@ -718,13 +719,13 @@ reflect: } raw: - NET_EPOCH_EXIT(); + NET_EPOCH_EXIT(et); *mp = m; rip_input(mp, offp, proto); return (IPPROTO_DONE); freeit: - NET_EPOCH_EXIT(); + NET_EPOCH_EXIT(et); m_freem(m); return (IPPROTO_DONE); } @@ -736,6 +737,7 @@ static void icmp_reflect(struct mbuf *m) { struct rm_priotracker in_ifa_tracker; + struct epoch_tracker et; struct ip *ip = mtod(m, struct ip *); struct ifaddr *ifa; struct ifnet *ifp; @@ -779,7 +781,7 @@ icmp_reflect(struct mbuf *m) */ ifp = m->m_pkthdr.rcvif; if (ifp != NULL && ifp->if_flags & IFF_BROADCAST) { - IF_ADDR_RLOCK(ifp); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; @@ -787,11 +789,11 @@ icmp_reflect(struct mbuf *m) if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == t.s_addr) { t = IA_SIN(ia)->sin_addr; - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); goto match; } } - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); } /* * If the packet was transiting through us, use the address of @@ -800,16 +802,16 @@ icmp_reflect(struct mbuf *m) * criteria apply. */ if (V_icmp_rfi && ifp != NULL) { - IF_ADDR_RLOCK(ifp); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = ifatoia(ifa); t = IA_SIN(ia)->sin_addr; - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); goto match; } - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); } /* * If the incoming packet was not addressed directly to us, use @@ -818,16 +820,16 @@ icmp_reflect(struct mbuf *m) * with normal source selection. */ if (V_reply_src[0] != '\0' && (ifp = ifunit(V_reply_src))) { - IF_ADDR_RLOCK(ifp); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = ifatoia(ifa); t = IA_SIN(ia)->sin_addr; - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); goto match; } - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); } /* * If the packet was transiting through us, use the address of diff --git a/freebsd/sys/netinet/ip_input.c b/freebsd/sys/netinet/ip_input.c index 136a774f..4dc4acd0 100644 --- a/freebsd/sys/netinet/ip_input.c +++ b/freebsd/sys/netinet/ip_input.c @@ -59,11 +59,11 @@ __FBSDID("$FreeBSD$"); #include <sys/syslog.h> #include <sys/sysctl.h> -#include <net/pfil.h> #include <net/if.h> #include <net/if_types.h> #include <net/if_var.h> #include <net/if_dl.h> +#include <net/pfil.h> #include <net/route.h> #include <net/netisr.h> #include <net/rss_config.h> @@ -136,7 +136,7 @@ SYSCTL_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_checkinterface), 0, "Verify packet arrives on correct interface"); -VNET_DEFINE(struct pfil_head, inet_pfil_hook); /* Packet filter hooks */ +VNET_DEFINE(pfil_head_t, inet_pfil_head); /* Packet filter hooks */ static struct netisr_handler ip_nh = { .nh_name = "ip", @@ -303,6 +303,7 @@ SYSCTL_PROC(_net_inet_ip, IPCTL_INTRDQDROPS, intr_direct_queue_drops, void ip_init(void) { + struct pfil_head_args args; struct protosw *pr; int i; @@ -313,11 +314,11 @@ ip_init(void) ipreass_init(); /* Initialize packet filter hooks. */ - V_inet_pfil_hook.ph_type = PFIL_TYPE_AF; - V_inet_pfil_hook.ph_af = AF_INET; - if ((i = pfil_head_register(&V_inet_pfil_hook)) != 0) - printf("%s: WARNING: unable to register pfil hook, " - "error %d\n", __func__, i); + args.pa_version = PFIL_VERSION; + args.pa_flags = PFIL_IN | PFIL_OUT; + args.pa_type = PFIL_TYPE_IP4; + args.pa_headname = PFIL_INET_NAME; + V_inet_pfil_head = pfil_head_register(&args); if (hhook_head_register(HHOOK_TYPE_IPSEC_IN, AF_INET, &V_ipsec_hhh_in[HHOOK_IPSEC_INET], @@ -379,10 +380,7 @@ ip_destroy(void *unused __unused) #endif netisr_unregister_vnet(&ip_nh); - if ((error = pfil_head_unregister(&V_inet_pfil_hook)) != 0) - printf("%s: WARNING: unable to unregister pfil hook, " - "error %d\n", __func__, error); - + pfil_head_unregister(V_inet_pfil_head); error = hhook_head_deregister(V_ipsec_hhh_in[HHOOK_IPSEC_INET]); if (error != 0) { printf("%s: WARNING: unable to deregister input helper hook " @@ -503,10 +501,10 @@ ip_input(struct mbuf *m) IP_PROBE(receive, NULL, NULL, ip, m->m_pkthdr.rcvif, ip, NULL); - /* 127/8 must not appear on wire - RFC1122 */ + /* IN_LOOPBACK must not appear on the wire - RFC1122 */ ifp = m->m_pkthdr.rcvif; - if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || - (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { + if (IN_LOOPBACK(ntohl(ip->ip_dst.s_addr)) || + IN_LOOPBACK(ntohl(ip->ip_src.s_addr))) { if ((ifp->if_flags & IFF_LOOPBACK) == 0) { IPSTAT_INC(ips_badaddr); goto bad; @@ -601,11 +599,12 @@ tooshort: */ /* Jump over all PFIL processing if hooks are not active. */ - if (!PFIL_HOOKED(&V_inet_pfil_hook)) + if (!PFIL_HOOKED_IN(V_inet_pfil_head)) goto passin; odst = ip->ip_dst; - if (pfil_run_hooks(&V_inet_pfil_hook, &m, ifp, PFIL_IN, 0, NULL) != 0) + if (pfil_run_hooks(V_inet_pfil_head, &m, ifp, PFIL_IN, NULL) != + PFIL_PASS) return; if (m == NULL) /* consumed by filter */ return; @@ -711,7 +710,9 @@ passin: * into the stack for SIMPLEX interfaces handled by ether_output(). */ if (ifp != NULL && ifp->if_flags & IFF_BROADCAST) { - IF_ADDR_RLOCK(ifp); + struct epoch_tracker et; + + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; @@ -721,7 +722,7 @@ passin: counter_u64_add(ia->ia_ifa.ifa_ipackets, 1); counter_u64_add(ia->ia_ifa.ifa_ibytes, m->m_pkthdr.len); - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); goto ours; } #ifdef BOOTP_COMPAT @@ -729,12 +730,12 @@ passin: counter_u64_add(ia->ia_ifa.ifa_ipackets, 1); counter_u64_add(ia->ia_ifa.ifa_ibytes, m->m_pkthdr.len); - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); goto ours; } #endif } - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); ia = NULL; } /* RFC 3927 2.7: Do not forward datagrams for 169.254.0.0/16. */ @@ -954,6 +955,7 @@ ip_forward(struct mbuf *m, int srcrt) struct sockaddr_in *sin; struct in_addr dest; struct route ro; + struct epoch_tracker et; int error, type = 0, code = 0, mtu = 0; if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) { @@ -982,7 +984,7 @@ ip_forward(struct mbuf *m, int srcrt) #else in_rtalloc_ign(&ro, 0, M_GETFIB(m)); #endif - NET_EPOCH_ENTER(); + NET_EPOCH_ENTER(et); if (ro.ro_rt != NULL) { ia = ifatoia(ro.ro_rt->rt_ifa); } else @@ -1134,7 +1136,7 @@ ip_forward(struct mbuf *m, int srcrt) } icmp_error(mcopy, type, code, dest.s_addr, mtu); out: - NET_EPOCH_EXIT(); + NET_EPOCH_EXIT(et); } #define CHECK_SO_CT(sp, ct) \ diff --git a/freebsd/sys/netinet/ip_mroute.c b/freebsd/sys/netinet/ip_mroute.c index 987549c6..c939196a 100644 --- a/freebsd/sys/netinet/ip_mroute.c +++ b/freebsd/sys/netinet/ip_mroute.c @@ -876,16 +876,18 @@ add_vif(struct vifctl *vifcp) */ ifp = NULL; } else { + struct epoch_tracker et; + sin.sin_addr = vifcp->vifc_lcl_addr; - NET_EPOCH_ENTER(); + NET_EPOCH_ENTER(et); ifa = ifa_ifwithaddr((struct sockaddr *)&sin); if (ifa == NULL) { - NET_EPOCH_EXIT(); + NET_EPOCH_EXIT(et); VIF_UNLOCK(); return EADDRNOTAVAIL; } ifp = ifa->ifa_ifp; - NET_EPOCH_EXIT(); + NET_EPOCH_EXIT(et); } if ((vifcp->vifc_flags & VIFF_TUNNEL) != 0) { @@ -1680,7 +1682,6 @@ static void send_packet(struct vif *vifp, struct mbuf *m) { struct ip_moptions imo; - struct in_multi *imm[2]; int error __unused; VIF_LOCK_ASSERT(); @@ -1689,9 +1690,7 @@ send_packet(struct vif *vifp, struct mbuf *m) imo.imo_multicast_ttl = mtod(m, struct ip *)->ip_ttl - 1; imo.imo_multicast_loop = 1; imo.imo_multicast_vif = -1; - imo.imo_num_memberships = 0; - imo.imo_max_memberships = 2; - imo.imo_membership = &imm[0]; + STAILQ_INIT(&imo.imo_head); /* * Re-entrancy should not be a problem here, because diff --git a/freebsd/sys/netinet/ip_options.c b/freebsd/sys/netinet/ip_options.c index c90077ea..fbbe96d3 100644 --- a/freebsd/sys/netinet/ip_options.c +++ b/freebsd/sys/netinet/ip_options.c @@ -111,6 +111,7 @@ ip_dooptions(struct mbuf *m, int pass) uint32_t ntime; struct nhop4_extended nh_ext; struct sockaddr_in ipaddr = { sizeof(ipaddr), AF_INET }; + struct epoch_tracker et; /* Ignore or reject packets with IP options. */ if (V_ip_doopts == 0) @@ -121,7 +122,7 @@ ip_dooptions(struct mbuf *m, int pass) goto bad_unlocked; } - NET_EPOCH_ENTER(); + NET_EPOCH_ENTER(et); dst = ip->ip_dst; cp = (u_char *)(ip + 1); cnt = (ip->ip_hl << 2) - sizeof (struct ip); @@ -227,7 +228,7 @@ dropit: #endif IPSTAT_INC(ips_cantforward); m_freem(m); - NET_EPOCH_EXIT(); + NET_EPOCH_EXIT(et); return (1); } } @@ -382,14 +383,14 @@ dropit: cp[IPOPT_OFFSET] += sizeof(uint32_t); } } - NET_EPOCH_EXIT(); + NET_EPOCH_EXIT(et); if (forward && V_ipforwarding) { ip_forward(m, 1); return (1); } return (0); bad: - NET_EPOCH_EXIT(); + NET_EPOCH_EXIT(et); bad_unlocked: icmp_error(m, type, code, 0, 0); IPSTAT_INC(ips_badoptions); diff --git a/freebsd/sys/netinet/ip_output.c b/freebsd/sys/netinet/ip_output.c index 477388d7..1a068b87 100644 --- a/freebsd/sys/netinet/ip_output.c +++ b/freebsd/sys/netinet/ip_output.c @@ -37,17 +37,19 @@ __FBSDID("$FreeBSD$"); #include <rtems/bsd/local/opt_inet.h> -#include <rtems/bsd/local/opt_ratelimit.h> #include <rtems/bsd/local/opt_ipsec.h> +#include <rtems/bsd/local/opt_kern_tls.h> #include <rtems/bsd/local/opt_mbuf_stress_test.h> #include <rtems/bsd/local/opt_mpath.h> +#include <rtems/bsd/local/opt_ratelimit.h> #include <rtems/bsd/local/opt_route.h> -#include <rtems/bsd/local/opt_sctp.h> #include <rtems/bsd/local/opt_rss.h> +#include <rtems/bsd/local/opt_sctp.h> #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> +#include <sys/ktls.h> #include <sys/lock.h> #include <sys/malloc.h> #include <sys/mbuf.h> @@ -74,6 +76,7 @@ __FBSDID("$FreeBSD$"); #include <net/vnet.h> #include <netinet/in.h> +#include <netinet/in_fib.h> #include <netinet/in_kdtrace.h> #include <netinet/in_systm.h> #include <netinet/ip.h> @@ -110,24 +113,33 @@ extern int in_mcast_loop; extern struct protosw inetsw[]; static inline int -ip_output_pfil(struct mbuf **mp, struct ifnet *ifp, struct inpcb *inp, - struct sockaddr_in *dst, int *fibnum, int *error) +ip_output_pfil(struct mbuf **mp, struct ifnet *ifp, int flags, + struct inpcb *inp, struct sockaddr_in *dst, int *fibnum, int *error) { struct m_tag *fwd_tag = NULL; struct mbuf *m; struct in_addr odst; struct ip *ip; + int pflags = PFIL_OUT; + + if (flags & IP_FORWARDING) + pflags |= PFIL_FWD; m = *mp; ip = mtod(m, struct ip *); /* Run through list of hooks for output packets. */ odst.s_addr = ip->ip_dst.s_addr; - *error = pfil_run_hooks(&V_inet_pfil_hook, mp, ifp, PFIL_OUT, 0, inp); - m = *mp; - if ((*error) != 0 || m == NULL) + switch (pfil_run_hooks(V_inet_pfil_head, mp, ifp, pflags, inp)) { + case PFIL_DROPPED: + *error = EPERM; + /* FALLTHROUGH */ + case PFIL_CONSUMED: return 1; /* Finished */ - + case PFIL_PASS: + *error = 0; + } + m = *mp; ip = mtod(m, struct ip *); /* See if destination IP address was changed by packet filter. */ @@ -200,6 +212,83 @@ ip_output_pfil(struct mbuf **mp, struct ifnet *ifp, struct inpcb *inp, return 0; } +static int +ip_output_send(struct inpcb *inp, struct ifnet *ifp, struct mbuf *m, + const struct sockaddr_in *gw, struct route *ro) +{ +#ifdef KERN_TLS + struct ktls_session *tls = NULL; +#endif + struct m_snd_tag *mst; + int error; + + MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0); + mst = NULL; + +#ifdef KERN_TLS + /* + * If this is an unencrypted TLS record, save a reference to + * the record. This local reference is used to call + * ktls_output_eagain after the mbuf has been freed (thus + * dropping the mbuf's reference) in if_output. + */ + if (m->m_next != NULL && mbuf_has_tls_session(m->m_next)) { + tls = ktls_hold(m->m_next->m_ext.ext_pgs->tls); + mst = tls->snd_tag; + + /* + * If a TLS session doesn't have a valid tag, it must + * have had an earlier ifp mismatch, so drop this + * packet. + */ + if (mst == NULL) { + error = EAGAIN; + goto done; + } + } +#endif +#ifdef RATELIMIT + if (inp != NULL && mst == NULL) { + if ((inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) != 0 || + (inp->inp_snd_tag != NULL && + inp->inp_snd_tag->ifp != ifp)) + in_pcboutput_txrtlmt(inp, ifp, m); + + if (inp->inp_snd_tag != NULL) + mst = inp->inp_snd_tag; + } +#endif + if (mst != NULL) { + KASSERT(m->m_pkthdr.rcvif == NULL, + ("trying to add a send tag to a forwarded packet")); + if (mst->ifp != ifp) { + error = EAGAIN; + goto done; + } + + /* stamp send tag on mbuf */ + m->m_pkthdr.snd_tag = m_snd_tag_ref(mst); + m->m_pkthdr.csum_flags |= CSUM_SND_TAG; + } + + error = (*ifp->if_output)(ifp, m, (const struct sockaddr *)gw, ro); + +done: + /* Check for route change invalidating send tags. */ +#ifdef KERN_TLS + if (tls != NULL) { + if (error == EAGAIN) + error = ktls_output_eagain(inp, tls); + ktls_free(tls); + } +#endif +#ifdef RATELIMIT + if (error == EAGAIN) + in_pcboutput_eagain(inp); +#endif + return (error); +} + /* * IP output. The packet in mbuf chain m contains a skeletal IP * header (with len, off, ttl, proto, tos, src, dst). @@ -217,23 +306,24 @@ ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, struct ip_moptions *imo, struct inpcb *inp) { struct rm_priotracker in_ifa_tracker; + struct epoch_tracker et; struct ip *ip; struct ifnet *ifp = NULL; /* keep compiler happy */ struct mbuf *m0; int hlen = sizeof (struct ip); int mtu; int error = 0; - struct sockaddr_in *dst; + struct sockaddr_in *dst, sin; const struct sockaddr_in *gw; struct in_ifaddr *ia; + struct in_addr src; int isbroadcast; uint16_t ip_len, ip_off; - struct route iproute; - struct rtentry *rte; /* cache for ro->ro_rt */ uint32_t fibnum; #if defined(IPSEC) || defined(IPSEC_SUPPORT) int no_route_but_check_spd = 0; #endif + M_ASSERTPKTHDR(m); if (inp != NULL) { @@ -243,11 +333,9 @@ ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, m->m_pkthdr.flowid = inp->inp_flowid; M_HASHTYPE_SET(m, inp->inp_flowtype); } - } - - if (ro == NULL) { - ro = &iproute; - bzero(ro, sizeof (*ro)); +#ifdef NUMA + m->m_pkthdr.numa_domain = inp->inp_numa_domain; +#endif } if (opt) { @@ -274,26 +362,28 @@ ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, /* * dst/gw handling: * - * dst can be rewritten but always points to &ro->ro_dst. * gw is readonly but can point either to dst OR rt_gateway, * therefore we need restore gw if we're redoing lookup. */ - gw = dst = (struct sockaddr_in *)&ro->ro_dst; fibnum = (inp != NULL) ? inp->inp_inc.inc_fibnum : M_GETFIB(m); - rte = ro->ro_rt; - if (rte == NULL) { + if (ro != NULL) + dst = (struct sockaddr_in *)&ro->ro_dst; + else + dst = &sin; + if (ro == NULL || ro->ro_rt == NULL) { bzero(dst, sizeof(*dst)); dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr = ip->ip_dst; } - NET_EPOCH_ENTER(); + gw = dst; + NET_EPOCH_ENTER(et); again: /* * Validate route against routing table additions; * a better/more specific route might have been added. */ - if (inp) + if (inp != NULL && ro != NULL && ro->ro_rt != NULL) RT_VALIDATE(ro, &inp->inp_rt_cookie, fibnum); /* * If there is a cached route, @@ -303,15 +393,12 @@ again: * cache with IPv6. * Also check whether routing cache needs invalidation. */ - rte = ro->ro_rt; - if (rte && ((rte->rt_flags & RTF_UP) == 0 || - rte->rt_ifp == NULL || - !RT_LINK_IS_UP(rte->rt_ifp) || - dst->sin_family != AF_INET || - dst->sin_addr.s_addr != ip->ip_dst.s_addr)) { + if (ro != NULL && ro->ro_rt != NULL && + ((ro->ro_rt->rt_flags & RTF_UP) == 0 || + ro->ro_rt->rt_ifp == NULL || !RT_LINK_IS_UP(ro->ro_rt->rt_ifp) || + dst->sin_family != AF_INET || + dst->sin_addr.s_addr != ip->ip_dst.s_addr)) RO_INVALIDATE_CACHE(ro); - rte = NULL; - } ia = NULL; /* * If routing to interface only, short circuit routing lookup. @@ -331,8 +418,10 @@ again: ip->ip_dst.s_addr = INADDR_BROADCAST; dst->sin_addr = ip->ip_dst; ifp = ia->ia_ifp; + mtu = ifp->if_mtu; ip->ip_ttl = 1; isbroadcast = 1; + src = IA_SIN(ia)->sin_addr; } else if (flags & IP_ROUTETOIF) { if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst), M_GETFIB(m)))) == NULL && @@ -343,9 +432,11 @@ again: goto bad; } ifp = ia->ia_ifp; + mtu = ifp->if_mtu; ip->ip_ttl = 1; isbroadcast = ifp->if_flags & IFF_BROADCAST ? in_ifaddr_broadcast(dst->sin_addr, ia) : 0; + src = IA_SIN(ia)->sin_addr; } else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && imo != NULL && imo->imo_multicast_ifp != NULL) { /* @@ -353,15 +444,21 @@ again: * packets if the interface is specified. */ ifp = imo->imo_multicast_ifp; + mtu = ifp->if_mtu; IFP_TO_IA(ifp, ia, &in_ifa_tracker); isbroadcast = 0; /* fool gcc */ - } else { - /* - * We want to do any cloning requested by the link layer, - * as this is probably required in all cases for correct - * operation (as it is for ARP). - */ - if (rte == NULL) { + /* Interface may have no addresses. */ + if (ia != NULL) + src = IA_SIN(ia)->sin_addr; + else + src.s_addr = INADDR_ANY; + } else if (ro != NULL) { + if (ro->ro_rt == NULL) { + /* + * We want to do any cloning requested by the link + * layer, as this is probably required in all cases + * for correct operation (as it is for ARP). + */ #ifdef RADIX_MPATH rtalloc_mpath_fib(ro, ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr), @@ -369,12 +466,47 @@ again: #else in_rtalloc_ign(ro, 0, fibnum); #endif - rte = ro->ro_rt; + if (ro->ro_rt == NULL || + (ro->ro_rt->rt_flags & RTF_UP) == 0 || + ro->ro_rt->rt_ifp == NULL || + !RT_LINK_IS_UP(ro->ro_rt->rt_ifp)) { +#if defined(IPSEC) || defined(IPSEC_SUPPORT) + /* + * There is no route for this packet, but it is + * possible that a matching SPD entry exists. + */ + no_route_but_check_spd = 1; + mtu = 0; /* Silence GCC warning. */ + goto sendit; +#endif + IPSTAT_INC(ips_noroute); + error = EHOSTUNREACH; + goto bad; + } } - if (rte == NULL || - (rte->rt_flags & RTF_UP) == 0 || - rte->rt_ifp == NULL || - !RT_LINK_IS_UP(rte->rt_ifp)) { + ia = ifatoia(ro->ro_rt->rt_ifa); + ifp = ro->ro_rt->rt_ifp; + counter_u64_add(ro->ro_rt->rt_pksent, 1); + rt_update_ro_flags(ro); + if (ro->ro_rt->rt_flags & RTF_GATEWAY) + gw = (struct sockaddr_in *)ro->ro_rt->rt_gateway; + if (ro->ro_rt->rt_flags & RTF_HOST) + isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST); + else if (ifp->if_flags & IFF_BROADCAST) + isbroadcast = in_ifaddr_broadcast(gw->sin_addr, ia); + else + isbroadcast = 0; + if (ro->ro_rt->rt_flags & RTF_HOST) + mtu = ro->ro_rt->rt_mtu; + else + mtu = ifp->if_mtu; + src = IA_SIN(ia)->sin_addr; + } else { + struct nhop4_extended nh; + + bzero(&nh, sizeof(nh)); + if (fib4_lookup_nh_ext(M_GETFIB(m), ip->ip_dst, 0, 0, &nh) != + 0) { #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* * There is no route for this packet, but it is @@ -388,31 +520,29 @@ again: error = EHOSTUNREACH; goto bad; } - ia = ifatoia(rte->rt_ifa); - ifp = rte->rt_ifp; - counter_u64_add(rte->rt_pksent, 1); - rt_update_ro_flags(ro); - if (rte->rt_flags & RTF_GATEWAY) - gw = (struct sockaddr_in *)rte->rt_gateway; - if (rte->rt_flags & RTF_HOST) - isbroadcast = (rte->rt_flags & RTF_BROADCAST); - else if (ifp->if_flags & IFF_BROADCAST) - isbroadcast = in_ifaddr_broadcast(gw->sin_addr, ia); - else - isbroadcast = 0; + ifp = nh.nh_ifp; + mtu = nh.nh_mtu; + /* + * We are rewriting here dst to be gw actually, contradicting + * comment at the beginning of the function. However, in this + * case we are always dealing with on stack dst. + * In case if pfil(9) sends us back to beginning of the + * function, the dst would be rewritten by ip_output_pfil(). + */ + MPASS(dst == &sin); + dst->sin_addr = nh.nh_addr; + ia = nh.nh_ia; + src = nh.nh_src; + isbroadcast = (((nh.nh_flags & (NHF_HOST | NHF_BROADCAST)) == + (NHF_HOST | NHF_BROADCAST)) || + ((ifp->if_flags & IFF_BROADCAST) && + in_ifaddr_broadcast(dst->sin_addr, ia))); } - /* - * Calculate MTU. If we have a route that is up, use that, - * otherwise use the interface's MTU. - */ - if (rte != NULL && (rte->rt_flags & (RTF_UP|RTF_HOST))) - mtu = rte->rt_mtu; - else - mtu = ifp->if_mtu; /* Catch a possible divide by zero later. */ - KASSERT(mtu > 0, ("%s: mtu %d <= 0, rte=%p (rt_flags=0x%08x) ifp=%p", - __func__, mtu, rte, (rte != NULL) ? rte->rt_flags : 0, ifp)); + KASSERT(mtu > 0, ("%s: mtu %d <= 0, ro=%p (rt_flags=0x%08x) ifp=%p", + __func__, mtu, ro, + (ro != NULL && ro->ro_rt != NULL) ? ro->ro_rt->rt_flags : 0, ifp)); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { m->m_flags |= M_MCAST; @@ -448,11 +578,8 @@ again: * If source address not specified yet, use address * of outgoing interface. */ - if (ip->ip_src.s_addr == INADDR_ANY) { - /* Interface may have no addresses. */ - if (ia != NULL) - ip->ip_src = IA_SIN(ia)->sin_addr; - } + if (ip->ip_src.s_addr == INADDR_ANY) + ip->ip_src = src; if ((imo == NULL && in_mcast_loop) || (imo && imo->imo_multicast_loop)) { @@ -515,12 +642,8 @@ again: * If the source address is not specified yet, use the address * of the outoing interface. */ - if (ip->ip_src.s_addr == INADDR_ANY) { - /* Interface may have no addresses. */ - if (ia != NULL) { - ip->ip_src = IA_SIN(ia)->sin_addr; - } - } + if (ip->ip_src.s_addr == INADDR_ANY) + ip->ip_src = src; /* * Look for broadcast address and @@ -569,8 +692,9 @@ sendit: #endif /* IPSEC */ /* Jump over all PFIL processing if hooks are not active. */ - if (PFIL_HOOKED(&V_inet_pfil_hook)) { - switch (ip_output_pfil(&m, ifp, inp, dst, &fibnum, &error)) { + if (PFIL_HOOKED_OUT(V_inet_pfil_head)) { + switch (ip_output_pfil(&m, ifp, flags, inp, dst, &fibnum, + &error)) { case 1: /* Finished */ goto done; @@ -580,9 +704,10 @@ sendit: case -1: /* Need to try again */ /* Reset everything for a new round */ - RO_RTFREE(ro); - ro->ro_prepend = NULL; - rte = NULL; + if (ro != NULL) { + RO_RTFREE(ro); + ro->ro_prepend = NULL; + } gw = dst; ip = mtod(m, struct ip *); goto again; @@ -590,9 +715,9 @@ sendit: } } - /* 127/8 must not appear on wire - RFC1122. */ - if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || - (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { + /* IN_LOOPBACK must not appear on the wire - RFC1122. */ + if (IN_LOOPBACK(ntohl(ip->ip_dst.s_addr)) || + IN_LOOPBACK(ntohl(ip->ip_src.s_addr))) { if ((ifp->if_flags & IFF_LOOPBACK) == 0) { IPSTAT_INC(ips_badaddr); error = EADDRNOTAVAIL; @@ -602,11 +727,30 @@ sendit: m->m_pkthdr.csum_flags |= CSUM_IP; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA & ~ifp->if_hwassist) { + m = mb_unmapped_to_ext(m); + if (m == NULL) { + IPSTAT_INC(ips_odropped); + error = ENOBUFS; + goto bad; + } in_delayed_cksum(m); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; + } else if ((ifp->if_capenable & IFCAP_NOMAP) == 0) { + m = mb_unmapped_to_ext(m); + if (m == NULL) { + IPSTAT_INC(ips_odropped); + error = ENOBUFS; + goto bad; + } } #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP & ~ifp->if_hwassist) { + m = mb_unmapped_to_ext(m); + if (m == NULL) { + IPSTAT_INC(ips_odropped); + error = ENOBUFS; + goto bad; + } sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2)); m->m_pkthdr.csum_flags &= ~CSUM_SCTP; } @@ -649,23 +793,7 @@ sendit: */ m_clrprotoflags(m); IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL); -#ifdef RATELIMIT - if (inp != NULL) { - if (inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) - in_pcboutput_txrtlmt(inp, ifp, m); - /* stamp send tag on mbuf */ - m->m_pkthdr.snd_tag = inp->inp_snd_tag; - } else { - m->m_pkthdr.snd_tag = NULL; - } -#endif - error = (*ifp->if_output)(ifp, m, - (const struct sockaddr *)gw, ro); -#ifdef RATELIMIT - /* check for route change */ - if (error == EAGAIN) - in_pcboutput_eagain(inp); -#endif + error = ip_output_send(inp, ifp, m, gw, ro); goto done; } @@ -701,23 +829,7 @@ sendit: IP_PROBE(send, NULL, NULL, mtod(m, struct ip *), ifp, mtod(m, struct ip *), NULL); -#ifdef RATELIMIT - if (inp != NULL) { - if (inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) - in_pcboutput_txrtlmt(inp, ifp, m); - /* stamp send tag on mbuf */ - m->m_pkthdr.snd_tag = inp->inp_snd_tag; - } else { - m->m_pkthdr.snd_tag = NULL; - } -#endif - error = (*ifp->if_output)(ifp, m, - (const struct sockaddr *)gw, ro); -#ifdef RATELIMIT - /* check for route change */ - if (error == EAGAIN) - in_pcboutput_eagain(inp); -#endif + error = ip_output_send(inp, ifp, m, gw, ro); } else m_freem(m); } @@ -726,16 +838,7 @@ sendit: IPSTAT_INC(ips_fragmented); done: - if (ro == &iproute) - RO_RTFREE(ro); - else if (rte == NULL) - /* - * If the caller supplied a route but somehow the reference - * to it has been released need to prevent the caller - * calling RTFREE on it again. - */ - ro->ro_rt = NULL; - NET_EPOCH_EXIT(); + NET_EPOCH_EXIT(et); return (error); bad: m_freem(m); @@ -783,11 +886,23 @@ ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu, * fragmented packets, then do it here. */ if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { + m0 = mb_unmapped_to_ext(m0); + if (m0 == NULL) { + error = ENOBUFS; + IPSTAT_INC(ips_odropped); + goto done; + } in_delayed_cksum(m0); m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } #ifdef SCTP if (m0->m_pkthdr.csum_flags & CSUM_SCTP) { + m0 = mb_unmapped_to_ext(m0); + if (m0 == NULL) { + error = ENOBUFS; + IPSTAT_INC(ips_odropped); + goto done; + } sctp_delayed_cksum(m0, hlen); m0->m_pkthdr.csum_flags &= ~CSUM_SCTP; } @@ -1264,7 +1379,8 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) if (inp->inp_options) { struct mbuf *options; - options = m_dup(inp->inp_options, M_NOWAIT); + options = m_copym(inp->inp_options, 0, + M_COPYALL, M_NOWAIT); INP_RUNLOCK(inp); if (options != NULL) { error = sooptcopyout(sopt, diff --git a/freebsd/sys/netinet/ip_reass.c b/freebsd/sys/netinet/ip_reass.c index 70a6edae..63353485 100644 --- a/freebsd/sys/netinet/ip_reass.c +++ b/freebsd/sys/netinet/ip_reass.c @@ -41,6 +41,7 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/systm.h> #include <sys/eventhandler.h> +#include <sys/kernel.h> #include <sys/hash.h> #include <sys/mbuf.h> #include <sys/malloc.h> diff --git a/freebsd/sys/netinet/ip_var.h b/freebsd/sys/netinet/ip_var.h index 86615a15..7580a7b4 100644 --- a/freebsd/sys/netinet/ip_var.h +++ b/freebsd/sys/netinet/ip_var.h @@ -82,6 +82,7 @@ struct ipoption { char ipopt_list[MAX_IPOPTLEN]; /* options proper */ }; +#if defined(_NETINET_IN_VAR_H_) && defined(_KERNEL) /* * Structure attached to inpcb.ip_moptions and * passed to ip_output when IP multicast options are in use. @@ -93,12 +94,11 @@ struct ip_moptions { u_long imo_multicast_vif; /* vif num outgoing multicasts */ u_char imo_multicast_ttl; /* TTL for outgoing multicasts */ u_char imo_multicast_loop; /* 1 => hear sends if a member */ - u_short imo_num_memberships; /* no. memberships this socket */ - u_short imo_max_memberships; /* max memberships this socket */ - struct in_multi **imo_membership; /* group memberships */ - struct in_mfilter *imo_mfilters; /* source filters */ - struct epoch_context imo_epoch_ctx; + struct ip_mfilter_head imo_head; /* group membership list */ }; +#else +struct ip_moptions; +#endif struct ipstat { uint64_t ips_total; /* total packets received */ @@ -241,8 +241,9 @@ extern int (*ip_rsvp_vif)(struct socket *, struct sockopt *); extern void (*ip_rsvp_force_done)(struct socket *); extern int (*rsvp_input_p)(struct mbuf **, int *, int); -VNET_DECLARE(struct pfil_head, inet_pfil_hook); /* packet filter hooks */ -#define V_inet_pfil_hook VNET(inet_pfil_hook) +VNET_DECLARE(struct pfil_head *, inet_pfil_head); +#define V_inet_pfil_head VNET(inet_pfil_head) +#define PFIL_INET_NAME "inet" void in_delayed_cksum(struct mbuf *m); @@ -291,13 +292,11 @@ VNET_DECLARE(ip_fw_ctl_ptr_t, ip_fw_ctl_ptr); #define V_ip_fw_ctl_ptr VNET(ip_fw_ctl_ptr) /* Divert hooks. */ -extern void (*ip_divert_ptr)(struct mbuf *m, int incoming); +extern void (*ip_divert_ptr)(struct mbuf *m, bool incoming); /* ng_ipfw hooks -- XXX make it the same as divert and dummynet */ -extern int (*ng_ipfw_input_p)(struct mbuf **, int, - struct ip_fw_args *, int); - +extern int (*ng_ipfw_input_p)(struct mbuf **, struct ip_fw_args *, bool); extern int (*ip_dn_ctl_ptr)(struct sockopt *); -extern int (*ip_dn_io_ptr)(struct mbuf **, int, struct ip_fw_args *); +extern int (*ip_dn_io_ptr)(struct mbuf **, struct ip_fw_args *); #endif /* _KERNEL */ #endif /* !_NETINET_IP_VAR_H_ */ diff --git a/freebsd/sys/netinet/libalias/alias_sctp.c b/freebsd/sys/netinet/libalias/alias_sctp.c index 4f7d4940..ea05cc4d 100644 --- a/freebsd/sys/netinet/libalias/alias_sctp.c +++ b/freebsd/sys/netinet/libalias/alias_sctp.c @@ -77,6 +77,7 @@ #ifdef _KERNEL #include <machine/stdarg.h> #include <sys/param.h> +#include <sys/gsb_crc32.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/module.h> diff --git a/freebsd/sys/netinet/netdump/netdump.h b/freebsd/sys/netinet/netdump/netdump.h index 12a527ee..4e952332 100644 --- a/freebsd/sys/netinet/netdump/netdump.h +++ b/freebsd/sys/netinet/netdump/netdump.h @@ -60,20 +60,20 @@ struct netdump_ack { uint32_t na_seqno; /* Match acks with msgs. */ } __packed; -struct netdump_conf { +struct netdump_conf_freebsd12 { #ifndef __rtems__ - struct diocskerneldump_arg ndc_kda; + struct diocskerneldump_arg_freebsd12 ndc12_kda; #endif /* __rtems__ */ - char ndc_iface[IFNAMSIZ]; - struct in_addr ndc_server; - struct in_addr ndc_client; - struct in_addr ndc_gateway; + char ndc12_iface[IFNAMSIZ]; + struct in_addr ndc12_server; + struct in_addr ndc12_client; + struct in_addr ndc12_gateway; }; -#define _PATH_NETDUMP "/dev/netdump" +#define NETDUMPGCONF_FREEBSD12 _IOR('n', 1, struct netdump_conf_freebsd12) +#define NETDUMPSCONF_FREEBSD12 _IOW('n', 2, struct netdump_conf_freebsd12) -#define NETDUMPGCONF _IOR('n', 1, struct netdump_conf) -#define NETDUMPSCONF _IOW('n', 2, struct netdump_conf) +#define _PATH_NETDUMP "/dev/netdump" #ifdef _KERNEL #ifdef NETDUMP diff --git a/freebsd/sys/netinet/raw_ip.c b/freebsd/sys/netinet/raw_ip.c index 1cac8d12..17217a5c 100644 --- a/freebsd/sys/netinet/raw_ip.c +++ b/freebsd/sys/netinet/raw_ip.c @@ -102,10 +102,9 @@ VNET_DEFINE(ip_fw_chk_ptr_t, ip_fw_chk_ptr) = NULL; VNET_DEFINE(ip_fw_ctl_ptr_t, ip_fw_ctl_ptr) = NULL; int (*ip_dn_ctl_ptr)(struct sockopt *); -int (*ip_dn_io_ptr)(struct mbuf **, int, struct ip_fw_args *); -void (*ip_divert_ptr)(struct mbuf *, int); -int (*ng_ipfw_input_p)(struct mbuf **, int, - struct ip_fw_args *, int); +int (*ip_dn_io_ptr)(struct mbuf **, struct ip_fw_args *); +void (*ip_divert_ptr)(struct mbuf *, bool); +int (*ng_ipfw_input_p)(struct mbuf **, struct ip_fw_args *, bool); #ifdef INET /* @@ -456,6 +455,8 @@ rip_output(struct mbuf *m, struct socket *so, ...) u_long dst; int flags = ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0) | IP_ALLOWBROADCAST; + int cnt, hlen; + u_char opttype, optlen, *cp; va_start(ap, so); dst = va_arg(ap, u_long); @@ -510,26 +511,61 @@ rip_output(struct mbuf *m, struct socket *so, ...) m_freem(m); return(EMSGSIZE); } - INP_RLOCK(inp); ip = mtod(m, struct ip *); - error = prison_check_ip4(inp->inp_cred, &ip->ip_src); - if (error != 0) { - INP_RUNLOCK(inp); - m_freem(m); - return (error); + hlen = ip->ip_hl << 2; + if (m->m_len < hlen) { + m = m_pullup(m, hlen); + if (m == NULL) + return (EINVAL); + ip = mtod(m, struct ip *); } + INP_RLOCK(inp); /* * Don't allow both user specified and setsockopt options, * and don't allow packet length sizes that will crash. */ - if (((ip->ip_hl != (sizeof (*ip) >> 2)) && inp->inp_options) - || (ntohs(ip->ip_len) != m->m_pkthdr.len) - || (ntohs(ip->ip_len) < (ip->ip_hl << 2))) { + if ((hlen < sizeof (*ip)) + || ((hlen > sizeof (*ip)) && inp->inp_options) + || (ntohs(ip->ip_len) != m->m_pkthdr.len)) { INP_RUNLOCK(inp); m_freem(m); return (EINVAL); } + error = prison_check_ip4(inp->inp_cred, &ip->ip_src); + if (error != 0) { + INP_RUNLOCK(inp); + m_freem(m); + return (error); + } + /* + * Don't allow IP options which do not have the required + * structure as specified in section 3.1 of RFC 791 on + * pages 15-23. + */ + cp = (u_char *)(ip + 1); + cnt = hlen - sizeof (struct ip); + for (; cnt > 0; cnt -= optlen, cp += optlen) { + opttype = cp[IPOPT_OPTVAL]; + if (opttype == IPOPT_EOL) + break; + if (opttype == IPOPT_NOP) { + optlen = 1; + continue; + } + if (cnt < IPOPT_OLEN + sizeof(u_char)) { + INP_RUNLOCK(inp); + m_freem(m); + return (EINVAL); + } + optlen = cp[IPOPT_OLEN]; + if (optlen < IPOPT_OLEN + sizeof(u_char) || + optlen > cnt) { + INP_RUNLOCK(inp); + m_freem(m); + return (EINVAL); + } + } /* * This doesn't allow application to specify ID of zero, * but we got this limitation from the beginning of history. diff --git a/freebsd/sys/netinet/sctp.h b/freebsd/sys/netinet/sctp.h index 64fd5442..27e5fd49 100644 --- a/freebsd/sys/netinet/sctp.h +++ b/freebsd/sys/netinet/sctp.h @@ -491,6 +491,7 @@ struct sctp_error_auth_invalid_hmac { * time */ #define SCTP_SAT_NETWORK_BURST_INCR 2 /* how many times to multiply maxburst * in sat */ +#define SCTP_MAX_SENDALL_LIMIT 1024 /* Data Chuck Specific Flags */ #define SCTP_DATA_FRAG_MASK 0x03 @@ -516,6 +517,7 @@ struct sctp_error_auth_invalid_hmac { #define SCTP_PCB_FLAGS_BOUNDALL 0x00000004 #define SCTP_PCB_FLAGS_ACCEPTING 0x00000008 #define SCTP_PCB_FLAGS_UNBOUND 0x00000010 +#define SCTP_PCB_FLAGS_SND_ITERATOR_UP 0x00000020 #define SCTP_PCB_FLAGS_CLOSE_IP 0x00040000 #define SCTP_PCB_FLAGS_WAS_CONNECTED 0x00080000 #define SCTP_PCB_FLAGS_WAS_ABORTED 0x00100000 diff --git a/freebsd/sys/netinet/sctp_asconf.c b/freebsd/sys/netinet/sctp_asconf.c index 2c66f65c..584a9d3b 100644 --- a/freebsd/sys/netinet/sctp_asconf.c +++ b/freebsd/sys/netinet/sctp_asconf.c @@ -705,6 +705,7 @@ sctp_handle_asconf(struct mbuf *m, unsigned int offset, if (param_length <= sizeof(struct sctp_paramhdr)) { SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: param length (%u) too short\n", param_length); sctp_m_freem(m_ack); + return; } /* get the entire parameter */ aph = (struct sctp_asconf_paramhdr *)sctp_m_getptr(m, offset, param_length, aparam_buf); @@ -1368,7 +1369,7 @@ sctp_asconf_queue_add(struct sctp_tcb *stcb, struct sctp_ifa *ifa, if (sctp_asconf_queue_mgmt(stcb, stcb->asoc.asconf_addr_del_pending, SCTP_DEL_IP_ADDRESS) == 0) { - SCTPDBG(SCTP_DEBUG_ASCONF2, "asconf_queue_add: queing pending delete\n"); + SCTPDBG(SCTP_DEBUG_ASCONF2, "asconf_queue_add: queuing pending delete\n"); pending_delete_queued = 1; /* clear out the pending delete info */ stcb->asoc.asconf_del_pending = 0; @@ -1956,12 +1957,10 @@ sctp_addr_mgmt_assoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb, case AF_INET: { struct sockaddr_in *sin; - struct in6pcb *inp6; - inp6 = (struct in6pcb *)&inp->ip_inp.inp; /* invalid if we are a v6 only endpoint */ if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) && - SCTP_IPV6_V6ONLY(inp6)) + SCTP_IPV6_V6ONLY(inp)) return; sin = &ifa->address.sin; @@ -2034,11 +2033,8 @@ sctp_asconf_iterator_ep(struct sctp_inpcb *inp, void *ptr, uint32_t val SCTP_UNU case AF_INET: { /* invalid if we are a v6 only endpoint */ - struct in6pcb *inp6; - - inp6 = (struct in6pcb *)&inp->ip_inp.inp; if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) && - SCTP_IPV6_V6ONLY(inp6)) { + SCTP_IPV6_V6ONLY(inp)) { cnt_invalid++; if (asc->cnt == cnt_invalid) return (1); @@ -2149,13 +2145,11 @@ sctp_asconf_iterator_stcb(struct sctp_inpcb *inp, struct sctp_tcb *stcb, case AF_INET: { /* invalid if we are a v6 only endpoint */ - struct in6pcb *inp6; struct sockaddr_in *sin; - inp6 = (struct in6pcb *)&inp->ip_inp.inp; /* invalid if we are a v6 only endpoint */ if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) && - SCTP_IPV6_V6ONLY(inp6)) + SCTP_IPV6_V6ONLY(inp)) continue; sin = &ifa->address.sin; @@ -2172,7 +2166,7 @@ sctp_asconf_iterator_stcb(struct sctp_inpcb *inp, struct sctp_tcb *stcb, continue; } if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) && - SCTP_IPV6_V6ONLY(inp6)) { + SCTP_IPV6_V6ONLY(inp)) { cnt_invalid++; if (asc->cnt == cnt_invalid) return; diff --git a/freebsd/sys/netinet/sctp_auth.c b/freebsd/sys/netinet/sctp_auth.c index 8301a98f..3555bb87 100644 --- a/freebsd/sys/netinet/sctp_auth.c +++ b/freebsd/sys/netinet/sctp_auth.c @@ -525,7 +525,7 @@ sctp_insert_sharedkey(struct sctp_keyhead *shared_keys, } else if (new_skey->keyid == skey->keyid) { /* replace the existing key */ /* verify this key *can* be replaced */ - if ((skey->deactivated) && (skey->refcount > 1)) { + if ((skey->deactivated) || (skey->refcount > 1)) { SCTPDBG(SCTP_DEBUG_AUTH1, "can't replace shared key id %u\n", new_skey->keyid); diff --git a/freebsd/sys/netinet/sctp_bsd_addr.c b/freebsd/sys/netinet/sctp_bsd_addr.c index 0f0ddd89..962fae37 100644 --- a/freebsd/sys/netinet/sctp_bsd_addr.c +++ b/freebsd/sys/netinet/sctp_bsd_addr.c @@ -210,11 +210,13 @@ sctp_init_ifns_for_vrf(int vrfid) IFNET_RLOCK(); CK_STAILQ_FOREACH(ifn, &MODULE_GLOBAL(ifnet), if_link) { + struct epoch_tracker et; + if (sctp_is_desired_interface_type(ifn) == 0) { /* non desired type */ continue; } - IF_ADDR_RLOCK(ifn); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifn->if_addrhead, ifa_link) { if (ifa->ifa_addr == NULL) { continue; @@ -267,7 +269,7 @@ sctp_init_ifns_for_vrf(int vrfid) sctp_ifa->localifa_flags &= ~SCTP_ADDR_DEFER_USE; } } - IF_ADDR_RUNLOCK(ifn); + NET_EPOCH_EXIT(et); } IFNET_RUNLOCK(); } diff --git a/freebsd/sys/netinet/sctp_constants.h b/freebsd/sys/netinet/sctp_constants.h index d07381d5..7ee7e311 100644 --- a/freebsd/sys/netinet/sctp_constants.h +++ b/freebsd/sys/netinet/sctp_constants.h @@ -983,6 +983,9 @@ __FBSDID("$FreeBSD$"); ((((uint8_t *)&(a)->s_addr)[0] == 169) && \ (((uint8_t *)&(a)->s_addr)[1] == 254)) +/* Maximum size of optval for IPPROTO_SCTP level socket options. */ +#define SCTP_SOCKET_OPTION_LIMIT (64 * 1024) + #if defined(_KERNEL) #define SCTP_GETTIME_TIMEVAL(x) (getmicrouptime(x)) diff --git a/freebsd/sys/netinet/sctp_crc32.c b/freebsd/sys/netinet/sctp_crc32.c index e22abeb6..a387d528 100644 --- a/freebsd/sys/netinet/sctp_crc32.c +++ b/freebsd/sys/netinet/sctp_crc32.c @@ -39,6 +39,7 @@ __FBSDID("$FreeBSD$"); #include <rtems/bsd/local/opt_sctp.h> +#include <sys/gsb_crc32.h> #ifdef SCTP #include <netinet/sctp_os.h> #include <netinet/sctp.h> @@ -133,16 +134,16 @@ sctp_delayed_cksum(struct mbuf *m, uint32_t offset) SCTP_STAT_INCR(sctps_sendswcrc); offset += offsetof(struct sctphdr, checksum); - if (offset + sizeof(uint32_t) > (uint32_t)(m->m_len)) { + if (offset + sizeof(uint32_t) > (uint32_t)(m->m_pkthdr.len)) { #ifdef INVARIANTS - panic("sctp_delayed_cksum(): m->m_len: %d, offset: %u.", - m->m_len, offset); + panic("sctp_delayed_cksum(): m->m_pkthdr.len: %d, offset: %u.", + m->m_pkthdr.len, offset); #else - SCTP_PRINTF("sctp_delayed_cksum(): m->m_len: %d, offset: %u.\n", - m->m_len, offset); + SCTP_PRINTF("sctp_delayed_cksum(): m->m_pkthdr.len: %d, offset: %u.\n", + m->m_pkthdr.len, offset); #endif return; } - *(uint32_t *)(m->m_data + offset) = checksum; + m_copyback(m, (int)offset, (int)sizeof(uint32_t), (caddr_t)&checksum); } #endif diff --git a/freebsd/sys/netinet/sctp_indata.c b/freebsd/sys/netinet/sctp_indata.c index 28e3f5b2..59654ac6 100644 --- a/freebsd/sys/netinet/sctp_indata.c +++ b/freebsd/sys/netinet/sctp_indata.c @@ -917,6 +917,9 @@ restart: break; } } + if (cnt_added && strm->pd_api_started) { + sctp_wakeup_the_read_socket(stcb->sctp_ep, stcb, SCTP_SO_NOT_LOCKED); + } if ((control->length > pd_point) && (strm->pd_api_started == 0)) { strm->pd_api_started = 1; control->pdapi_started = 1; @@ -949,6 +952,15 @@ sctp_inject_old_unordered_data(struct sctp_tcb *stcb, SCTPDBG(SCTP_DEBUG_XXX, "chunk is a first fsn: %u becomes fsn_included\n", chk->rec.data.fsn); + at = TAILQ_FIRST(&control->reasm); + if (at && SCTP_TSN_GT(chk->rec.data.fsn, at->rec.data.fsn)) { + /* + * The first chunk in the reassembly is a smaller + * TSN than this one, even though this has a first, + * it must be from a subsequent msg. + */ + goto place_chunk; + } if (control->first_frag_seen) { /* * In old un-ordered we can reassembly on one @@ -1469,6 +1481,16 @@ sctp_queue_data_for_reasm(struct sctp_tcb *stcb, struct sctp_association *asoc, "The last fsn is now in place fsn: %u\n", chk->rec.data.fsn); control->last_frag_seen = 1; + if (SCTP_TSN_GT(control->top_fsn, chk->rec.data.fsn)) { + SCTPDBG(SCTP_DEBUG_XXX, + "New fsn: %u is not at top_fsn: %u -- abort\n", + chk->rec.data.fsn, + control->top_fsn); + sctp_abort_in_reasm(stcb, control, chk, + abort_flag, + SCTP_FROM_SCTP_INDATA + SCTP_LOC_9); + return; + } } if (asoc->idata_supported || control->first_frag_seen) { /* @@ -1484,7 +1506,7 @@ sctp_queue_data_for_reasm(struct sctp_tcb *stcb, struct sctp_association *asoc, */ sctp_abort_in_reasm(stcb, control, chk, abort_flag, - SCTP_FROM_SCTP_INDATA + SCTP_LOC_9); + SCTP_FROM_SCTP_INDATA + SCTP_LOC_10); return; } } @@ -1496,7 +1518,7 @@ sctp_queue_data_for_reasm(struct sctp_tcb *stcb, struct sctp_association *asoc, chk->rec.data.fsn, control->top_fsn); sctp_abort_in_reasm(stcb, control, chk, abort_flag, - SCTP_FROM_SCTP_INDATA + SCTP_LOC_10); + SCTP_FROM_SCTP_INDATA + SCTP_LOC_11); return; } if (asoc->idata_supported || control->first_frag_seen) { @@ -1517,7 +1539,7 @@ sctp_queue_data_for_reasm(struct sctp_tcb *stcb, struct sctp_association *asoc, chk->rec.data.fsn, control->fsn_included); sctp_abort_in_reasm(stcb, control, chk, abort_flag, - SCTP_FROM_SCTP_INDATA + SCTP_LOC_11); + SCTP_FROM_SCTP_INDATA + SCTP_LOC_12); return; } } @@ -1532,7 +1554,7 @@ sctp_queue_data_for_reasm(struct sctp_tcb *stcb, struct sctp_association *asoc, control->top_fsn); sctp_abort_in_reasm(stcb, control, chk, abort_flag, - SCTP_FROM_SCTP_INDATA + SCTP_LOC_12); + SCTP_FROM_SCTP_INDATA + SCTP_LOC_13); return; } } @@ -1575,7 +1597,7 @@ sctp_queue_data_for_reasm(struct sctp_tcb *stcb, struct sctp_association *asoc, at->rec.data.fsn); sctp_abort_in_reasm(stcb, control, chk, abort_flag, - SCTP_FROM_SCTP_INDATA + SCTP_LOC_13); + SCTP_FROM_SCTP_INDATA + SCTP_LOC_14); return; } } @@ -3088,13 +3110,12 @@ sctp_process_segment_range(struct sctp_tcb *stcb, struct sctp_tmit_chunk **p_tp1 * update RTO too ? */ if (tp1->do_rtt) { - if (*rto_ok) { - tp1->whoTo->RTO = - sctp_calculate_rto(stcb, - &stcb->asoc, - tp1->whoTo, - &tp1->sent_rcv_time, - SCTP_RTT_FROM_DATA); + if (*rto_ok && + sctp_calculate_rto(stcb, + &stcb->asoc, + tp1->whoTo, + &tp1->sent_rcv_time, + SCTP_RTT_FROM_DATA)) { *rto_ok = 0; } if (tp1->whoTo->rto_needed == 0) { @@ -4066,16 +4087,12 @@ sctp_express_handle_sack(struct sctp_tcb *stcb, uint32_t cumack, /* update RTO too? */ if (tp1->do_rtt) { - if (rto_ok) { - tp1->whoTo->RTO = - /* - * sa_ignore - * NO_NULL_CHK - */ - sctp_calculate_rto(stcb, - asoc, tp1->whoTo, - &tp1->sent_rcv_time, - SCTP_RTT_FROM_DATA); + if (rto_ok && + sctp_calculate_rto(stcb, + &stcb->asoc, + tp1->whoTo, + &tp1->sent_rcv_time, + SCTP_RTT_FROM_DATA)) { rto_ok = 0; } if (tp1->whoTo->rto_needed == 0) { @@ -4684,12 +4701,12 @@ hopeless_peer: /* update RTO too? */ if (tp1->do_rtt) { - if (rto_ok) { - tp1->whoTo->RTO = - sctp_calculate_rto(stcb, - asoc, tp1->whoTo, - &tp1->sent_rcv_time, - SCTP_RTT_FROM_DATA); + if (rto_ok && + sctp_calculate_rto(stcb, + &stcb->asoc, + tp1->whoTo, + &tp1->sent_rcv_time, + SCTP_RTT_FROM_DATA)) { rto_ok = 0; } if (tp1->whoTo->rto_needed == 0) { diff --git a/freebsd/sys/netinet/sctp_indata.h b/freebsd/sys/netinet/sctp_indata.h index 59ceac3a..3f3099e8 100644 --- a/freebsd/sys/netinet/sctp_indata.h +++ b/freebsd/sys/netinet/sctp_indata.h @@ -61,7 +61,6 @@ sctp_build_readq_entry(struct sctp_tcb *stcb, (_ctl)->sinfo_ppid = ppid; \ (_ctl)->sinfo_context = context; \ (_ctl)->fsn_included = 0xffffffff; \ - (_ctl)->top_fsn = 0xffffffff; \ (_ctl)->sinfo_tsn = tsn; \ (_ctl)->sinfo_cumtsn = tsn; \ (_ctl)->sinfo_assoc_id = sctp_get_associd((in_it)); \ diff --git a/freebsd/sys/netinet/sctp_input.c b/freebsd/sys/netinet/sctp_input.c index 5386aae4..f380c208 100644 --- a/freebsd/sys/netinet/sctp_input.c +++ b/freebsd/sys/netinet/sctp_input.c @@ -446,22 +446,48 @@ sctp_process_init_ack(struct mbuf *m, int iphlen, int offset, { struct sctp_association *asoc; struct mbuf *op_err; - int retval, abort_flag; - uint32_t initack_limit; + int retval, abort_flag, cookie_found; + int initack_limit; int nat_friendly = 0; /* First verify that we have no illegal param's */ abort_flag = 0; + cookie_found = 0; op_err = sctp_arethere_unrecognized_parameters(m, (offset + sizeof(struct sctp_init_chunk)), - &abort_flag, (struct sctp_chunkhdr *)cp, &nat_friendly); + &abort_flag, (struct sctp_chunkhdr *)cp, + &nat_friendly, &cookie_found); if (abort_flag) { /* Send an abort and notify peer */ sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); *abort_no_unlock = 1; return (-1); } + if (!cookie_found) { + uint16_t len; + + len = (uint16_t)(sizeof(struct sctp_error_missing_param) + sizeof(uint16_t)); + /* We abort with an error of missing mandatory param */ + op_err = sctp_get_mbuf_for_msg(len, 0, M_NOWAIT, 1, MT_DATA); + if (op_err != NULL) { + struct sctp_error_missing_param *cause; + + SCTP_BUF_LEN(op_err) = len; + cause = mtod(op_err, struct sctp_error_missing_param *); + /* Subtract the reserved param */ + cause->cause.code = htons(SCTP_CAUSE_MISSING_PARAM); + cause->cause.length = htons(len); + cause->num_missing_params = htonl(1); + cause->type[0] = htons(SCTP_STATE_COOKIE); + } + sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, + src, dst, sh, op_err, + mflowtype, mflowid, + vrf_id, net->port); + *abort_no_unlock = 1; + return (-3); + } asoc = &stcb->asoc; asoc->peer_supports_nat = (uint8_t)nat_friendly; /* process the peer's parameters in the INIT-ACK */ @@ -524,42 +550,10 @@ sctp_process_init_ack(struct mbuf *m, int iphlen, int offset, asoc->primary_destination, SCTP_FROM_SCTP_INPUT + SCTP_LOC_3); /* calculate the RTO */ - net->RTO = sctp_calculate_rto(stcb, asoc, net, &asoc->time_entered, + sctp_calculate_rto(stcb, asoc, net, &asoc->time_entered, SCTP_RTT_FROM_NON_DATA); - retval = sctp_send_cookie_echo(m, offset, stcb, net); - if (retval < 0) { - /* - * No cookie, we probably should send a op error. But in any - * case if there is no cookie in the INIT-ACK, we can - * abandon the peer, its broke. - */ - if (retval == -3) { - uint16_t len; - - len = (uint16_t)(sizeof(struct sctp_error_missing_param) + sizeof(uint16_t)); - /* We abort with an error of missing mandatory param */ - op_err = sctp_get_mbuf_for_msg(len, 0, M_NOWAIT, 1, MT_DATA); - if (op_err != NULL) { - struct sctp_error_missing_param *cause; - - SCTP_BUF_LEN(op_err) = len; - cause = mtod(op_err, struct sctp_error_missing_param *); - /* Subtract the reserved param */ - cause->cause.code = htons(SCTP_CAUSE_MISSING_PARAM); - cause->cause.length = htons(len); - cause->num_missing_params = htonl(1); - cause->type[0] = htons(SCTP_STATE_COOKIE); - } - sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, - src, dst, sh, op_err, - mflowtype, mflowid, - vrf_id, net->port); - *abort_no_unlock = 1; - } - return (retval); - } - - return (0); + retval = sctp_send_cookie_echo(m, offset, initack_limit, stcb, net); + return (retval); } static void @@ -656,7 +650,7 @@ sctp_handle_heartbeat_ack(struct sctp_heartbeat_chunk *cp, tv.tv_sec = cp->heartbeat.hb_info.time_value_1; tv.tv_usec = cp->heartbeat.hb_info.time_value_2; /* Now lets do a RTO with this */ - r_net->RTO = sctp_calculate_rto(stcb, &stcb->asoc, r_net, &tv, + sctp_calculate_rto(stcb, &stcb->asoc, r_net, &tv, SCTP_RTT_FROM_NON_DATA); if (!(r_net->dest_state & SCTP_ADDR_REACHABLE)) { r_net->dest_state |= SCTP_ADDR_REACHABLE; @@ -711,34 +705,37 @@ static int sctp_handle_nat_colliding_state(struct sctp_tcb *stcb) { /* - * return 0 means we want you to proceed with the abort non-zero - * means no abort processing + * Return 0 means we want you to proceed with the abort non-zero + * means no abort processing. */ + uint32_t new_vtag; struct sctpasochead *head; if ((SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_WAIT) || (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED)) { + new_vtag = sctp_select_a_tag(stcb->sctp_ep, stcb->sctp_ep->sctp_lport, stcb->rport, 1); atomic_add_int(&stcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(stcb); SCTP_INP_INFO_WLOCK(); SCTP_TCB_LOCK(stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); + } else { + return (0); } if (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_WAIT) { /* generate a new vtag and send init */ LIST_REMOVE(stcb, sctp_asocs); - stcb->asoc.my_vtag = sctp_select_a_tag(stcb->sctp_ep, stcb->sctp_ep->sctp_lport, stcb->rport, 1); + stcb->asoc.my_vtag = new_vtag; head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(stcb->asoc.my_vtag, SCTP_BASE_INFO(hashasocmark))]; /* * put it in the bucket in the vtag hash of assoc's for the * system */ LIST_INSERT_HEAD(head, stcb, sctp_asocs); - sctp_send_initiate(stcb->sctp_ep, stcb, SCTP_SO_NOT_LOCKED); SCTP_INP_INFO_WUNLOCK(); + sctp_send_initiate(stcb->sctp_ep, stcb, SCTP_SO_NOT_LOCKED); return (1); - } - if (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED) { + } else { /* * treat like a case where the cookie expired i.e.: - dump * current cookie. - generate a new vtag. - resend init. @@ -748,15 +745,15 @@ sctp_handle_nat_colliding_state(struct sctp_tcb *stcb) SCTP_SET_STATE(stcb, SCTP_STATE_COOKIE_WAIT); sctp_stop_all_cookie_timers(stcb); sctp_toss_old_cookies(stcb, &stcb->asoc); - stcb->asoc.my_vtag = sctp_select_a_tag(stcb->sctp_ep, stcb->sctp_ep->sctp_lport, stcb->rport, 1); + stcb->asoc.my_vtag = new_vtag; head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(stcb->asoc.my_vtag, SCTP_BASE_INFO(hashasocmark))]; /* * put it in the bucket in the vtag hash of assoc's for the * system */ LIST_INSERT_HEAD(head, stcb, sctp_asocs); - sctp_send_initiate(stcb->sctp_ep, stcb, SCTP_SO_NOT_LOCKED); SCTP_INP_INFO_WUNLOCK(); + sctp_send_initiate(stcb->sctp_ep, stcb, SCTP_SO_NOT_LOCKED); return (1); } return (0); @@ -1682,8 +1679,7 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, old.tv_sec = cookie->time_entered.tv_sec; old.tv_usec = cookie->time_entered.tv_usec; net->hb_responded = 1; - net->RTO = sctp_calculate_rto(stcb, asoc, net, - &old, + sctp_calculate_rto(stcb, asoc, net, &old, SCTP_RTT_FROM_NON_DATA); if (stcb->asoc.sctp_autoclose_ticks && @@ -2157,8 +2153,8 @@ sctp_process_cookie_new(struct mbuf *m, int iphlen, int offset, ntohl(initack_cp->init.initiate_tag), vrf_id, ntohs(initack_cp->init.num_outbound_streams), port, - (struct thread *)NULL - ); + (struct thread *)NULL, + SCTP_DONT_INITIALIZE_AUTH_PARAMS); if (stcb == NULL) { struct mbuf *op_err; @@ -2407,8 +2403,7 @@ sctp_process_cookie_new(struct mbuf *m, int iphlen, int offset, /* calculate the RTT and set the encaps port */ old.tv_sec = cookie->time_entered.tv_sec; old.tv_usec = cookie->time_entered.tv_usec; - (*netp)->RTO = sctp_calculate_rto(stcb, asoc, *netp, - &old, SCTP_RTT_FROM_NON_DATA); + sctp_calculate_rto(stcb, asoc, *netp, &old, SCTP_RTT_FROM_NON_DATA); } /* respond with a COOKIE-ACK */ sctp_send_cookie_ack(stcb); @@ -2984,8 +2979,7 @@ sctp_handle_cookie_ack(struct sctp_cookie_ack_chunk *cp SCTP_UNUSED, SCTP_STAT_INCR_COUNTER32(sctps_activeestab); SCTP_STAT_INCR_GAUGE32(sctps_currestab); if (asoc->overall_error_count == 0) { - net->RTO = sctp_calculate_rto(stcb, asoc, net, - &asoc->time_entered, + sctp_calculate_rto(stcb, asoc, net, &asoc->time_entered, SCTP_RTT_FROM_NON_DATA); } (void)SCTP_GETTIME_TIMEVAL(&asoc->time_entered); diff --git a/freebsd/sys/netinet/sctp_os_bsd.h b/freebsd/sys/netinet/sctp_os_bsd.h index abe8e2c9..68528de0 100644 --- a/freebsd/sys/netinet/sctp_os_bsd.h +++ b/freebsd/sys/netinet/sctp_os_bsd.h @@ -97,9 +97,6 @@ __FBSDID("$FreeBSD$"); #include <crypto/sha1.h> #include <crypto/sha2/sha256.h> -#ifndef in6pcb -#define in6pcb inpcb -#endif /* Declare all the malloc names for all the various mallocs */ MALLOC_DECLARE(SCTP_M_MAP); MALLOC_DECLARE(SCTP_M_STRMI); @@ -368,10 +365,10 @@ typedef struct callout sctp_os_timer_t; */ /* get the v6 hop limit */ -#define SCTP_GET_HLIM(inp, ro) in6_selecthlim((struct in6pcb *)&inp->ip_inp.inp, (ro ? (ro->ro_rt ? (ro->ro_rt->rt_ifp) : (NULL)) : (NULL))); +#define SCTP_GET_HLIM(inp, ro) in6_selecthlim(&inp->ip_inp.inp, (ro ? (ro->ro_rt ? (ro->ro_rt->rt_ifp) : (NULL)) : (NULL))); /* is the endpoint v6only? */ -#define SCTP_IPV6_V6ONLY(inp) (((struct inpcb *)inp)->inp_flags & IN6P_IPV6_V6ONLY) +#define SCTP_IPV6_V6ONLY(sctp_inpcb) ((sctp_inpcb)->ip_inp.inp.inp_flags & IN6P_IPV6_V6ONLY) /* is the socket non-blocking? */ #define SCTP_SO_IS_NBIO(so) ((so)->so_state & SS_NBIO) #define SCTP_SET_SO_NBIO(so) ((so)->so_state |= SS_NBIO) @@ -431,7 +428,7 @@ typedef struct rtentry sctp_rtentry_t; m_clrprotoflags(o_pak); \ if (local_stcb && local_stcb->sctp_ep) \ result = ip6_output(o_pak, \ - ((struct in6pcb *)(local_stcb->sctp_ep))->in6p_outputopts, \ + ((struct inpcb *)(local_stcb->sctp_ep))->in6p_outputopts, \ (ro), 0, 0, ifp, NULL); \ else \ result = ip6_output(o_pak, NULL, (ro), 0, 0, ifp, NULL); \ diff --git a/freebsd/sys/netinet/sctp_output.c b/freebsd/sys/netinet/sctp_output.c index b01ec41f..3890ea11 100644 --- a/freebsd/sys/netinet/sctp_output.c +++ b/freebsd/sys/netinet/sctp_output.c @@ -4291,10 +4291,12 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp, if (net->port) { mtu -= sizeof(struct udphdr); } - if ((stcb != NULL) && (stcb->asoc.smallest_mtu > mtu)) { - sctp_mtu_size_reset(inp, &stcb->asoc, mtu); + if (mtu < net->mtu) { + if ((stcb != NULL) && (stcb->asoc.smallest_mtu > mtu)) { + sctp_mtu_size_reset(inp, &stcb->asoc, mtu); + } + net->mtu = mtu; } - net->mtu = mtu; } } else if (ro->ro_rt == NULL) { /* route was freed */ @@ -4336,7 +4338,7 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp, * at the SCTP layer. So use the value from * the IP layer. */ - flowlabel = ntohl(((struct in6pcb *)inp)->in6p_flowinfo); + flowlabel = ntohl(((struct inpcb *)inp)->inp_flow); } flowlabel &= 0x000fffff; len = SCTP_MIN_OVERHEAD; @@ -4391,7 +4393,7 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp, * at the SCTP layer. So use the value from * the IP layer. */ - tos_value = (ntohl(((struct in6pcb *)inp)->in6p_flowinfo) >> 20) & 0xff; + tos_value = (ntohl(((struct inpcb *)inp)->inp_flow) >> 20) & 0xff; } tos_value &= 0xfc; if (ecn_ok) { @@ -4649,10 +4651,12 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp, if (net->port) { mtu -= sizeof(struct udphdr); } - if ((stcb != NULL) && (stcb->asoc.smallest_mtu > mtu)) { - sctp_mtu_size_reset(inp, &stcb->asoc, mtu); + if (mtu < net->mtu) { + if ((stcb != NULL) && (stcb->asoc.smallest_mtu > mtu)) { + sctp_mtu_size_reset(inp, &stcb->asoc, mtu); + } + net->mtu = mtu; } - net->mtu = mtu; } } else if (ifp) { if (ND_IFINFO(ifp)->linkmtu && @@ -4968,7 +4972,10 @@ sctp_send_initiate(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int so_locked struct mbuf * sctp_arethere_unrecognized_parameters(struct mbuf *in_initpkt, - int param_offset, int *abort_processing, struct sctp_chunkhdr *cp, int *nat_friendly) + int param_offset, int *abort_processing, + struct sctp_chunkhdr *cp, + int *nat_friendly, + int *cookie_found) { /* * Given a mbuf containing an INIT or INIT-ACK with the param_offset @@ -4986,17 +4993,20 @@ sctp_arethere_unrecognized_parameters(struct mbuf *in_initpkt, */ struct sctp_paramhdr *phdr, params; - struct mbuf *mat, *op_err; + struct mbuf *mat, *m_tmp, *op_err, *op_err_last; int at, limit, pad_needed; uint16_t ptype, plen, padded_size; - int err_at; *abort_processing = 0; + if (cookie_found != NULL) { + *cookie_found = 0; + } mat = in_initpkt; - err_at = 0; limit = ntohs(cp->chunk_length) - sizeof(struct sctp_init_chunk); at = param_offset; op_err = NULL; + op_err_last = NULL; + pad_needed = 0; SCTPDBG(SCTP_DEBUG_OUTPUT1, "Check for unrecognized param's\n"); phdr = sctp_get_next_param(mat, at, ¶ms, sizeof(params)); while ((phdr != NULL) && ((size_t)limit >= sizeof(struct sctp_paramhdr))) { @@ -5019,12 +5029,17 @@ sctp_arethere_unrecognized_parameters(struct mbuf *in_initpkt, switch (ptype) { /* Param's with variable size */ case SCTP_HEARTBEAT_INFO: - case SCTP_STATE_COOKIE: case SCTP_UNRECOG_PARAM: case SCTP_ERROR_CAUSE_IND: /* ok skip fwd */ at += padded_size; break; + case SCTP_STATE_COOKIE: + if (cookie_found != NULL) { + *cookie_found = 1; + } + at += padded_size; + break; /* Param's with variable size within a range */ case SCTP_CHUNK_LIST: case SCTP_SUPPORTED_CHUNK_EXT: @@ -5113,55 +5128,44 @@ sctp_arethere_unrecognized_parameters(struct mbuf *in_initpkt, break; case SCTP_HOSTNAME_ADDRESS: { - /* We can NOT handle HOST NAME addresses!! */ + /* Hostname parameters are deprecated. */ + struct sctp_gen_error_cause *cause; int l_len; SCTPDBG(SCTP_DEBUG_OUTPUT1, "Can't handle hostname addresses.. abort processing\n"); *abort_processing = 1; - if (op_err == NULL) { - /* Ok need to try to get a mbuf */ + sctp_m_freem(op_err); + op_err = NULL; + op_err_last = NULL; #ifdef INET6 - l_len = SCTP_MIN_OVERHEAD; + l_len = SCTP_MIN_OVERHEAD; #else - l_len = SCTP_MIN_V4_OVERHEAD; + l_len = SCTP_MIN_V4_OVERHEAD; #endif - l_len += sizeof(struct sctp_chunkhdr); - l_len += sizeof(struct sctp_gen_error_cause); - op_err = sctp_get_mbuf_for_msg(l_len, 0, M_NOWAIT, 1, MT_DATA); - if (op_err) { - SCTP_BUF_LEN(op_err) = 0; - /* - * Pre-reserve space for IP, - * SCTP, and chunk header. - */ + l_len += sizeof(struct sctp_chunkhdr); + l_len += sizeof(struct sctp_gen_error_cause); + op_err = sctp_get_mbuf_for_msg(l_len, 0, M_NOWAIT, 1, MT_DATA); + if (op_err != NULL) { + /* + * Pre-reserve space for IP, SCTP, + * and chunk header. + */ #ifdef INET6 - SCTP_BUF_RESV_UF(op_err, sizeof(struct ip6_hdr)); + SCTP_BUF_RESV_UF(op_err, sizeof(struct ip6_hdr)); #else - SCTP_BUF_RESV_UF(op_err, sizeof(struct ip)); -#endif - SCTP_BUF_RESV_UF(op_err, sizeof(struct sctphdr)); - SCTP_BUF_RESV_UF(op_err, sizeof(struct sctp_chunkhdr)); - } - } - if (op_err) { - /* If we have space */ - struct sctp_gen_error_cause cause; - - if (err_at % 4) { - uint32_t cpthis = 0; - - pad_needed = 4 - (err_at % 4); - m_copyback(op_err, err_at, pad_needed, (caddr_t)&cpthis); - err_at += pad_needed; - } - cause.code = htons(SCTP_CAUSE_UNRESOLVABLE_ADDR); - cause.length = htons((uint16_t)(sizeof(struct sctp_gen_error_cause) + plen)); - m_copyback(op_err, err_at, sizeof(struct sctp_gen_error_cause), (caddr_t)&cause); - err_at += sizeof(struct sctp_gen_error_cause); + SCTP_BUF_RESV_UF(op_err, sizeof(struct ip)); +#endif + SCTP_BUF_RESV_UF(op_err, sizeof(struct sctphdr)); + SCTP_BUF_RESV_UF(op_err, sizeof(struct sctp_chunkhdr)); + SCTP_BUF_LEN(op_err) = sizeof(struct sctp_gen_error_cause); + cause = mtod(op_err, struct sctp_gen_error_cause *); + cause->code = htons(SCTP_CAUSE_UNRESOLVABLE_ADDR); + cause->length = htons((uint16_t)(sizeof(struct sctp_gen_error_cause) + plen)); SCTP_BUF_NEXT(op_err) = SCTP_M_COPYM(mat, at, plen, M_NOWAIT); if (SCTP_BUF_NEXT(op_err) == NULL) { sctp_m_freem(op_err); - return (NULL); + op_err = NULL; + op_err_last = NULL; } } return (op_err); @@ -5197,37 +5201,55 @@ sctp_arethere_unrecognized_parameters(struct mbuf *in_initpkt, #endif SCTP_BUF_RESV_UF(op_err, sizeof(struct sctphdr)); SCTP_BUF_RESV_UF(op_err, sizeof(struct sctp_chunkhdr)); + op_err_last = op_err; } } - if (op_err) { + if (op_err != NULL) { /* If we have space */ - struct sctp_paramhdr s; - - if (err_at % 4) { - uint32_t cpthis = 0; + struct sctp_paramhdr *param; - pad_needed = 4 - (err_at % 4); - m_copyback(op_err, err_at, pad_needed, (caddr_t)&cpthis); - err_at += pad_needed; + if (pad_needed > 0) { + op_err_last = sctp_add_pad_tombuf(op_err_last, pad_needed); } - s.param_type = htons(SCTP_UNRECOG_PARAM); - s.param_length = htons((uint16_t)sizeof(struct sctp_paramhdr) + plen); - m_copyback(op_err, err_at, sizeof(struct sctp_paramhdr), (caddr_t)&s); - err_at += sizeof(struct sctp_paramhdr); - SCTP_BUF_NEXT(op_err) = SCTP_M_COPYM(mat, at, plen, M_NOWAIT); - if (SCTP_BUF_NEXT(op_err) == NULL) { + if (op_err_last == NULL) { + sctp_m_freem(op_err); + op_err = NULL; + op_err_last = NULL; + goto more_processing; + } + if (M_TRAILINGSPACE(op_err_last) < (int)sizeof(struct sctp_paramhdr)) { + m_tmp = sctp_get_mbuf_for_msg(sizeof(struct sctp_paramhdr), 0, M_NOWAIT, 1, MT_DATA); + if (m_tmp == NULL) { + sctp_m_freem(op_err); + op_err = NULL; + op_err_last = NULL; + goto more_processing; + } + SCTP_BUF_LEN(m_tmp) = 0; + SCTP_BUF_NEXT(m_tmp) = NULL; + SCTP_BUF_NEXT(op_err_last) = m_tmp; + op_err_last = m_tmp; + } + param = (struct sctp_paramhdr *)(mtod(op_err_last, caddr_t)+SCTP_BUF_LEN(op_err_last)); + param->param_type = htons(SCTP_UNRECOG_PARAM); + param->param_length = htons((uint16_t)sizeof(struct sctp_paramhdr) + plen); + SCTP_BUF_LEN(op_err_last) += sizeof(struct sctp_paramhdr); + SCTP_BUF_NEXT(op_err_last) = SCTP_M_COPYM(mat, at, plen, M_NOWAIT); + if (SCTP_BUF_NEXT(op_err_last) == NULL) { sctp_m_freem(op_err); - /* - * we are out of memory but - * we still need to have a - * look at what to do (the - * system is in trouble - * though). - */ op_err = NULL; + op_err_last = NULL; goto more_processing; + } else { + while (SCTP_BUF_NEXT(op_err_last) != NULL) { + op_err_last = SCTP_BUF_NEXT(op_err_last); + } + } + if (plen % 4 != 0) { + pad_needed = 4 - (plen % 4); + } else { + pad_needed = 0; } - err_at += plen; } } more_processing: @@ -5248,7 +5270,11 @@ sctp_arethere_unrecognized_parameters(struct mbuf *in_initpkt, invalid_size: SCTPDBG(SCTP_DEBUG_OUTPUT1, "abort flag set\n"); *abort_processing = 1; - if ((op_err == NULL) && phdr) { + sctp_m_freem(op_err); + op_err = NULL; + op_err_last = NULL; + if (phdr != NULL) { + struct sctp_paramhdr *param; int l_len; #ifdef INET6 l_len = SCTP_MIN_OVERHEAD; @@ -5267,25 +5293,15 @@ invalid_size: #endif SCTP_BUF_RESV_UF(op_err, sizeof(struct sctphdr)); SCTP_BUF_RESV_UF(op_err, sizeof(struct sctp_chunkhdr)); + SCTP_BUF_LEN(op_err) = 2 * sizeof(struct sctp_paramhdr); + param = mtod(op_err, struct sctp_paramhdr *); + param->param_type = htons(SCTP_CAUSE_PROTOCOL_VIOLATION); + param->param_length = htons(2 * sizeof(struct sctp_paramhdr)); + param++; + param->param_type = htons(ptype); + param->param_length = htons(plen); } } - if ((op_err) && phdr) { - struct sctp_paramhdr s; - - if (err_at % 4) { - uint32_t cpthis = 0; - - pad_needed = 4 - (err_at % 4); - m_copyback(op_err, err_at, pad_needed, (caddr_t)&cpthis); - err_at += pad_needed; - } - s.param_type = htons(SCTP_CAUSE_PROTOCOL_VIOLATION); - s.param_length = htons(sizeof(s) + sizeof(struct sctp_paramhdr)); - m_copyback(op_err, err_at, sizeof(s), (caddr_t)&s); - err_at += sizeof(s); - /* Only copy back the p-hdr that caused the issue */ - m_copyback(op_err, err_at, sizeof(struct sctp_paramhdr), (caddr_t)phdr); - } return (op_err); } @@ -5565,7 +5581,9 @@ sctp_send_initiate_ack(struct sctp_inpcb *inp, struct sctp_tcb *stcb, abort_flag = 0; op_err = sctp_arethere_unrecognized_parameters(init_pkt, (offset + sizeof(struct sctp_init_chunk)), - &abort_flag, (struct sctp_chunkhdr *)init_chk, &nat_friendly); + &abort_flag, + (struct sctp_chunkhdr *)init_chk, + &nat_friendly, NULL); if (abort_flag) { do_a_abort: if (op_err == NULL) { @@ -5584,8 +5602,7 @@ do_a_abort: m = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_DATA); if (m == NULL) { /* No memory, INIT timer will re-attempt. */ - if (op_err) - sctp_m_freem(op_err); + sctp_m_freem(op_err); return; } chunk_len = (uint16_t)sizeof(struct sctp_init_ack_chunk); @@ -5774,8 +5791,11 @@ do_a_abort: net->ro._s_addr = sctp_source_address_selection(inp, stcb, (sctp_route_t *)&net->ro, net, 0, vrf_id); - if (net->ro._s_addr == NULL) + if (net->ro._s_addr == NULL) { + sctp_m_freem(op_err); + sctp_m_freem(m); return; + } net->src_addr_selected = 1; @@ -5804,8 +5824,11 @@ do_a_abort: net->ro._s_addr = sctp_source_address_selection(inp, stcb, (sctp_route_t *)&net->ro, net, 0, vrf_id); - if (net->ro._s_addr == NULL) + if (net->ro._s_addr == NULL) { + sctp_m_freem(op_err); + sctp_m_freem(m); return; + } net->src_addr_selected = 1; } @@ -5876,6 +5899,7 @@ do_a_abort: so = inp->sctp_socket; if (so == NULL) { /* memory problem */ + sctp_m_freem(op_err); sctp_m_freem(m); return; } else { @@ -6802,15 +6826,19 @@ sctp_sendall_completes(void *ptr, uint32_t val SCTP_UNUSED) */ /* now free everything */ + if (ca->inp) { + /* Lets clear the flag to allow others to run. */ + ca->inp->sctp_flags &= ~SCTP_PCB_FLAGS_SND_ITERATOR_UP; + } sctp_m_freem(ca->m); SCTP_FREE(ca, SCTP_M_COPYAL); } static struct mbuf * -sctp_copy_out_all(struct uio *uio, int len) +sctp_copy_out_all(struct uio *uio, ssize_t len) { struct mbuf *ret, *at; - int left, willcpy, cancpy, error; + ssize_t left, willcpy, cancpy, error; ret = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_WAITOK, 1, MT_DATA); if (ret == NULL) { @@ -6825,17 +6853,17 @@ sctp_copy_out_all(struct uio *uio, int len) at = ret; while (left > 0) { /* Align data to the end */ - error = uiomove(mtod(at, caddr_t), willcpy, uio); + error = uiomove(mtod(at, caddr_t), (int)willcpy, uio); if (error) { err_out_now: sctp_m_freem(at); return (NULL); } - SCTP_BUF_LEN(at) = willcpy; + SCTP_BUF_LEN(at) = (int)willcpy; SCTP_BUF_NEXT_PKT(at) = SCTP_BUF_NEXT(at) = 0; left -= willcpy; if (left > 0) { - SCTP_BUF_NEXT(at) = sctp_get_mbuf_for_msg(left, 0, M_WAITOK, 1, MT_DATA); + SCTP_BUF_NEXT(at) = sctp_get_mbuf_for_msg((unsigned int)left, 0, M_WAITOK, 1, MT_DATA); if (SCTP_BUF_NEXT(at) == NULL) { goto err_out_now; } @@ -6855,6 +6883,14 @@ sctp_sendall(struct sctp_inpcb *inp, struct uio *uio, struct mbuf *m, int ret; struct sctp_copy_all *ca; + if (inp->sctp_flags & SCTP_PCB_FLAGS_SND_ITERATOR_UP) { + /* There is another. */ + return (EBUSY); + } + if (uio->uio_resid > SCTP_MAX_SENDALL_LIMIT) { + /* You must be less than the max! */ + return (EMSGSIZE); + } SCTP_MALLOC(ca, struct sctp_copy_all *, sizeof(struct sctp_copy_all), SCTP_M_COPYAL); if (ca == NULL) { @@ -6875,7 +6911,7 @@ sctp_sendall(struct sctp_inpcb *inp, struct uio *uio, struct mbuf *m, ca->sndrcv.sinfo_flags &= ~SCTP_SENDALL; /* get length and mbuf chain */ if (uio) { - ca->sndlen = (int)uio->uio_resid; + ca->sndlen = uio->uio_resid; ca->m = sctp_copy_out_all(uio, ca->sndlen); if (ca->m == NULL) { SCTP_FREE(ca, SCTP_M_COPYAL); @@ -6891,6 +6927,7 @@ sctp_sendall(struct sctp_inpcb *inp, struct uio *uio, struct mbuf *m, ca->sndlen += SCTP_BUF_LEN(mat); } } + inp->sctp_flags |= SCTP_PCB_FLAGS_SND_ITERATOR_UP; ret = sctp_initiate_iterator(NULL, sctp_sendall_iterator, NULL, SCTP_PCB_ANY_FLAGS, SCTP_PCB_ANY_FEATURES, SCTP_ASOC_ANY_STATE, @@ -8979,7 +9016,7 @@ sctp_queue_op_err(struct sctp_tcb *stcb, struct mbuf *op_err) int sctp_send_cookie_echo(struct mbuf *m, - int offset, + int offset, int limit, struct sctp_tcb *stcb, struct sctp_nets *net) { @@ -9005,18 +9042,30 @@ sctp_send_cookie_echo(struct mbuf *m, } ptype = ntohs(phdr->param_type); plen = ntohs(phdr->param_length); + if (plen < sizeof(struct sctp_paramhdr)) { + return (-6); + } if (ptype == SCTP_STATE_COOKIE) { int pad; /* found the cookie */ - if ((pad = (plen % 4))) { - plen += 4 - pad; + if (at + plen > limit) { + return (-7); } cookie = SCTP_M_COPYM(m, at, plen, M_NOWAIT); if (cookie == NULL) { /* No memory */ return (-2); } + if ((pad = (plen % 4)) > 0) { + pad = 4 - pad; + } + if (pad > 0) { + cookie = sctp_pad_lastmbuf(cookie, pad, NULL); + if (cookie == NULL) { + return (-8); + } + } #ifdef SCTP_MBUF_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { sctp_log_mbc(cookie, SCTP_MBUF_ICOPY); @@ -9042,7 +9091,7 @@ sctp_send_cookie_echo(struct mbuf *m, chk->rec.chunk_id.id = SCTP_COOKIE_ECHO; chk->rec.chunk_id.can_take_data = 0; chk->flags = CHUNK_FLAGS_FRAGMENT_OK; - chk->send_size = plen; + chk->send_size = SCTP_SIZE32(plen); chk->sent = SCTP_DATAGRAM_UNSENT; chk->snd_count = 0; chk->asoc = &stcb->asoc; @@ -9068,7 +9117,6 @@ sctp_send_heartbeat_ack(struct sctp_tcb *stcb, struct sctp_chunkhdr *chdr; struct sctp_tmit_chunk *chk; - if (net == NULL) /* must have a net pointer */ return; @@ -9086,13 +9134,8 @@ sctp_send_heartbeat_ack(struct sctp_tcb *stcb, chdr = mtod(outchain, struct sctp_chunkhdr *); chdr->chunk_type = SCTP_HEARTBEAT_ACK; chdr->chunk_flags = 0; - if (chk_length % 4) { - /* need pad */ - uint32_t cpthis = 0; - int padlen; - - padlen = 4 - (chk_length % 4); - m_copyback(outchain, chk_length, padlen, (caddr_t)&cpthis); + if (chk_length % 4 != 0) { + sctp_pad_lastmbuf(outchain, 4 - (chk_length % 4), NULL); } sctp_alloc_a_chunk(stcb, chk); if (chk == NULL) { @@ -12372,7 +12415,7 @@ sctp_copy_it_in(struct sctp_tcb *stcb, struct sctp_sndrcvinfo *srcv, struct uio *uio, struct sctp_nets *net, - int max_send_len, + ssize_t max_send_len, int user_marks_eor, int *error) { @@ -12518,7 +12561,7 @@ sctp_lower_sosend(struct socket *so, struct thread *p ) { - unsigned int sndlen = 0, max_len; + ssize_t sndlen = 0, max_len, local_add_more; int error, len; struct mbuf *top = NULL; int queue_only = 0, queue_only_for_init = 0; @@ -12540,7 +12583,7 @@ sctp_lower_sosend(struct socket *so, int got_all_of_the_send = 0; int hold_tcblock = 0; int non_blocking = 0; - uint32_t local_add_more, local_soresv = 0; + ssize_t local_soresv = 0; uint16_t port; uint16_t sinfo_flags; sctp_assoc_t sinfo_assoc_id; @@ -12570,12 +12613,12 @@ sctp_lower_sosend(struct socket *so, SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); return (EINVAL); } - sndlen = (unsigned int)uio->uio_resid; + sndlen = uio->uio_resid; } else { top = SCTP_HEADER_TO_CHAIN(i_pak); sndlen = SCTP_HEADER_LEN(i_pak); } - SCTPDBG(SCTP_DEBUG_OUTPUT1, "Send called addr:%p send length %d\n", + SCTPDBG(SCTP_DEBUG_OUTPUT1, "Send called addr:%p send length %zu\n", (void *)addr, sndlen); if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) && @@ -12636,6 +12679,12 @@ sctp_lower_sosend(struct socket *so, sinfo_flags = inp->def_send.sinfo_flags; sinfo_assoc_id = inp->def_send.sinfo_assoc_id; } + if (flags & MSG_EOR) { + sinfo_flags |= SCTP_EOR; + } + if (flags & MSG_EOF) { + sinfo_flags |= SCTP_EOF; + } if (sinfo_flags & SCTP_SENDALL) { /* its a sendall */ error = sctp_sendall(inp, uio, top, srcv); @@ -12753,7 +12802,8 @@ sctp_lower_sosend(struct socket *so, stcb = sctp_aloc_assoc(inp, addr, &error, 0, vrf_id, inp->sctp_ep.pre_open_stream_count, inp->sctp_ep.port, - p); + p, + SCTP_INITIALIZE_AUTH_PARAMS); if (stcb == NULL) { /* Error is setup for us in the call */ goto out_unlocked; @@ -12782,9 +12832,6 @@ sctp_lower_sosend(struct socket *so, SCTP_SET_STATE(stcb, SCTP_STATE_COOKIE_WAIT); (void)SCTP_GETTIME_TIMEVAL(&asoc->time_entered); - /* initialize authentication params for the assoc */ - sctp_initialize_auth_params(inp, stcb); - if (control) { if (sctp_process_cmsgs_for_init(stcb, control, &error)) { sctp_free_assoc(inp, stcb, SCTP_PCBFREE_FORCE, @@ -12805,9 +12852,17 @@ sctp_lower_sosend(struct socket *so, } } else asoc = &stcb->asoc; - if (srcv == NULL) + if (srcv == NULL) { srcv = (struct sctp_sndrcvinfo *)&asoc->def_send; - if (srcv->sinfo_flags & SCTP_ADDR_OVER) { + sinfo_flags = srcv->sinfo_flags; + if (flags & MSG_EOR) { + sinfo_flags |= SCTP_EOR; + } + if (flags & MSG_EOF) { + sinfo_flags |= SCTP_EOF; + } + } + if (sinfo_flags & SCTP_ADDR_OVER) { if (addr) net = sctp_findnet(stcb, addr); else @@ -12831,20 +12886,20 @@ sctp_lower_sosend(struct socket *so, free_cnt_applied = 1; if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NO_FRAGMENT)) { - if (sndlen > asoc->smallest_mtu) { + if (sndlen > (ssize_t)asoc->smallest_mtu) { SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EMSGSIZE); error = EMSGSIZE; goto out_unlocked; } } if (SCTP_SO_IS_NBIO(so) - || (flags & MSG_NBIO) + || (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0 ) { non_blocking = 1; } /* would we block? */ if (non_blocking) { - uint32_t amount; + ssize_t amount; if (hold_tcblock == 0) { SCTP_TCB_LOCK(stcb); @@ -12859,13 +12914,13 @@ sctp_lower_sosend(struct socket *so, if ((SCTP_SB_LIMIT_SND(so) < (amount + inqueue_bytes + stcb->asoc.sb_send_resv)) || (stcb->asoc.chunks_on_out_queue >= SCTP_BASE_SYSCTL(sctp_max_chunks_on_queue))) { SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EWOULDBLOCK); - if (sndlen > SCTP_SB_LIMIT_SND(so)) + if (sndlen > (ssize_t)SCTP_SB_LIMIT_SND(so)) error = EMSGSIZE; else error = EWOULDBLOCK; goto out_unlocked; } - stcb->asoc.sb_send_resv += sndlen; + stcb->asoc.sb_send_resv += (uint32_t)sndlen; SCTP_TCB_UNLOCK(stcb); hold_tcblock = 0; } else { @@ -12914,7 +12969,7 @@ sctp_lower_sosend(struct socket *so, (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED) || (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_ACK_SENT) || (asoc->state & SCTP_STATE_SHUTDOWN_PENDING)) { - if (srcv->sinfo_flags & SCTP_ABORT) { + if (sinfo_flags & SCTP_ABORT) { ; } else { SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ECONNRESET); @@ -12929,9 +12984,9 @@ sctp_lower_sosend(struct socket *so, } #endif /* __rtems__ */ /* Are we aborting? */ - if (srcv->sinfo_flags & SCTP_ABORT) { + if (sinfo_flags & SCTP_ABORT) { struct mbuf *mm; - int tot_demand, tot_out = 0, max_out; + ssize_t tot_demand, tot_out = 0, max_out; SCTP_STAT_INCR(sctps_sends_with_abort); if ((SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_WAIT) || @@ -12965,7 +13020,7 @@ sctp_lower_sosend(struct socket *so, error = EMSGSIZE; goto out; } - mm = sctp_get_mbuf_for_msg(tot_demand, 0, M_WAITOK, 1, MT_DATA); + mm = sctp_get_mbuf_for_msg((unsigned int)tot_demand, 0, M_WAITOK, 1, MT_DATA); } if (mm == NULL) { SCTP_LTRACE_ERR_RET(NULL, stcb, net, SCTP_FROM_SCTP_OUTPUT, ENOMEM); @@ -12985,7 +13040,7 @@ sctp_lower_sosend(struct socket *so, ph->param_type = htons(SCTP_CAUSE_USER_INITIATED_ABT); ph->param_length = htons((uint16_t)(sizeof(struct sctp_paramhdr) + tot_out)); ph++; - SCTP_BUF_LEN(mm) = tot_out + sizeof(struct sctp_paramhdr); + SCTP_BUF_LEN(mm) = (int)(tot_out + sizeof(struct sctp_paramhdr)); if (top == NULL) { error = uiomove((caddr_t)ph, (int)tot_out, uio); if (error) { @@ -13026,12 +13081,7 @@ sctp_lower_sosend(struct socket *so, /* Calculate the maximum we can send */ inqueue_bytes = stcb->asoc.total_output_queue_size - (stcb->asoc.chunks_on_out_queue * SCTP_DATA_CHUNK_OVERHEAD(stcb)); if (SCTP_SB_LIMIT_SND(so) > inqueue_bytes) { - if (non_blocking) { - /* we already checked for non-blocking above. */ - max_len = sndlen; - } else { - max_len = SCTP_SB_LIMIT_SND(so) - inqueue_bytes; - } + max_len = SCTP_SB_LIMIT_SND(so) - inqueue_bytes; } else { max_len = 0; } @@ -13048,7 +13098,7 @@ sctp_lower_sosend(struct socket *so, /* Unless E_EOR mode is on, we must make a send FIT in one call. */ if ((user_marks_eor == 0) && - (sndlen > SCTP_SB_LIMIT_SND(stcb->sctp_socket))) { + (sndlen > (ssize_t)SCTP_SB_LIMIT_SND(stcb->sctp_socket))) { /* It will NEVER fit */ SCTP_LTRACE_ERR_RET(NULL, stcb, net, SCTP_FROM_SCTP_OUTPUT, EMSGSIZE); error = EMSGSIZE; @@ -13065,7 +13115,7 @@ sctp_lower_sosend(struct socket *so, } if (user_marks_eor) { - local_add_more = min(SCTP_SB_LIMIT_SND(so), SCTP_BASE_SYSCTL(sctp_add_more_threshold)); + local_add_more = (ssize_t)min(SCTP_SB_LIMIT_SND(so), SCTP_BASE_SYSCTL(sctp_add_more_threshold)); } else { /*- * For non-eeor the whole message must fit in @@ -13078,7 +13128,7 @@ sctp_lower_sosend(struct socket *so, goto skip_preblock; } if (((max_len <= local_add_more) && - (SCTP_SB_LIMIT_SND(so) >= local_add_more)) || + ((ssize_t)SCTP_SB_LIMIT_SND(so) >= local_add_more)) || (max_len == 0) || ((stcb->asoc.chunks_on_out_queue + stcb->asoc.stream_queue_cnt) >= SCTP_BASE_SYSCTL(sctp_max_chunks_on_queue))) { /* No room right now ! */ @@ -13086,7 +13136,7 @@ sctp_lower_sosend(struct socket *so, inqueue_bytes = stcb->asoc.total_output_queue_size - (stcb->asoc.chunks_on_out_queue * SCTP_DATA_CHUNK_OVERHEAD(stcb)); while ((SCTP_SB_LIMIT_SND(so) < (inqueue_bytes + local_add_more)) || ((stcb->asoc.stream_queue_cnt + stcb->asoc.chunks_on_out_queue) >= SCTP_BASE_SYSCTL(sctp_max_chunks_on_queue))) { - SCTPDBG(SCTP_DEBUG_OUTPUT1, "pre_block limit:%u <(inq:%d + %d) || (%d+%d > %d)\n", + SCTPDBG(SCTP_DEBUG_OUTPUT1, "pre_block limit:%u <(inq:%d + %zd) || (%d+%d > %d)\n", (unsigned int)SCTP_SB_LIMIT_SND(so), inqueue_bytes, local_add_more, @@ -13138,7 +13188,7 @@ skip_preblock: * case NOTE: uio will be null when top/mbuf is passed */ if (sndlen == 0) { - if (srcv->sinfo_flags & SCTP_EOF) { + if (sinfo_flags & SCTP_EOF) { got_all_of_the_send = 1; goto dataless_eof; } else { @@ -13187,7 +13237,7 @@ skip_preblock: } sctp_snd_sb_alloc(stcb, sp->length); atomic_add_int(&asoc->stream_queue_cnt, 1); - if (srcv->sinfo_flags & SCTP_UNORDERED) { + if (sinfo_flags & SCTP_UNORDERED) { SCTP_STAT_INCR(sctps_sends_with_unord); } TAILQ_INSERT_TAIL(&strm->outqueue, sp, next); @@ -13219,16 +13269,16 @@ skip_preblock: else max_len = 0; - if ((max_len > SCTP_BASE_SYSCTL(sctp_add_more_threshold)) || + if ((max_len > (ssize_t)SCTP_BASE_SYSCTL(sctp_add_more_threshold)) || (max_len && (SCTP_SB_LIMIT_SND(so) < SCTP_BASE_SYSCTL(sctp_add_more_threshold))) || - (uio->uio_resid && (uio->uio_resid <= (int)max_len))) { + (uio->uio_resid && (uio->uio_resid <= max_len))) { sndout = 0; new_tail = NULL; if (hold_tcblock) { SCTP_TCB_UNLOCK(stcb); hold_tcblock = 0; } - mm = sctp_copy_resume(uio, max_len, user_marks_eor, &error, &sndout, &new_tail); + mm = sctp_copy_resume(uio, (int)max_len, user_marks_eor, &error, &sndout, &new_tail); if ((mm == NULL) || error) { if (mm) { sctp_m_freem(mm); @@ -13262,15 +13312,15 @@ skip_preblock: sctp_snd_sb_alloc(stcb, sndout); atomic_add_int(&sp->length, sndout); len += sndout; - if (srcv->sinfo_flags & SCTP_SACK_IMMEDIATELY) { + if (sinfo_flags & SCTP_SACK_IMMEDIATELY) { sp->sinfo_flags |= SCTP_SACK_IMMEDIATELY; } /* Did we reach EOR? */ if ((uio->uio_resid == 0) && ((user_marks_eor == 0) || - (srcv->sinfo_flags & SCTP_EOF) || - (user_marks_eor && (srcv->sinfo_flags & SCTP_EOR)))) { + (sinfo_flags & SCTP_EOF) || + (user_marks_eor && (sinfo_flags & SCTP_EOR)))) { sp->msg_is_complete = 1; } else { sp->msg_is_complete = 0; @@ -13291,7 +13341,7 @@ skip_preblock: SCTP_TCB_LOCK(stcb); hold_tcblock = 1; } - sctp_prune_prsctp(stcb, asoc, srcv, sndlen); + sctp_prune_prsctp(stcb, asoc, srcv, (int)sndlen); inqueue_bytes = stcb->asoc.total_output_queue_size - (stcb->asoc.chunks_on_out_queue * SCTP_DATA_CHUNK_OVERHEAD(stcb)); if (SCTP_SB_LIMIT_SND(so) > inqueue_bytes) max_len = SCTP_SB_LIMIT_SND(so) - inqueue_bytes; @@ -13388,10 +13438,10 @@ skip_preblock: stcb, SCTP_OUTPUT_FROM_USR_SEND, SCTP_SO_LOCKED); } - if (hold_tcblock == 1) { - SCTP_TCB_UNLOCK(stcb); - hold_tcblock = 0; - } + } + if (hold_tcblock == 1) { + SCTP_TCB_UNLOCK(stcb); + hold_tcblock = 0; } SOCKBUF_LOCK(&so->so_snd); /*- @@ -13413,7 +13463,7 @@ skip_preblock: min(SCTP_BASE_SYSCTL(sctp_add_more_threshold), SCTP_SB_LIMIT_SND(so)))) { if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_BLK_LOGGING_ENABLE) { sctp_log_block(SCTP_BLOCK_LOG_INTO_BLK, - asoc, (size_t)uio->uio_resid); + asoc, uio->uio_resid); } be.error = 0; stcb->block_entry = &be; @@ -13472,7 +13522,7 @@ skip_preblock: /* We send in a 0, since we do NOT have any locks */ error = sctp_msg_append(stcb, net, top, srcv, 0); top = NULL; - if (srcv->sinfo_flags & SCTP_EOF) { + if (sinfo_flags & SCTP_EOF) { /* * This should only happen for Panda for the mbuf * send case, which does NOT yet support EEOR mode. @@ -13487,7 +13537,7 @@ skip_preblock: } dataless_eof: /* EOF thing ? */ - if ((srcv->sinfo_flags & SCTP_EOF) && + if ((sinfo_flags & SCTP_EOF) && (got_all_of_the_send == 1)) { SCTP_STAT_INCR(sctps_sends_with_eof); error = 0; diff --git a/freebsd/sys/netinet/sctp_output.h b/freebsd/sys/netinet/sctp_output.h index 1b3d22d9..6d78cf90 100644 --- a/freebsd/sys/netinet/sctp_output.h +++ b/freebsd/sys/netinet/sctp_output.h @@ -92,11 +92,11 @@ sctp_send_initiate_ack(struct sctp_inpcb *, struct sctp_tcb *, struct mbuf * sctp_arethere_unrecognized_parameters(struct mbuf *, int, int *, - struct sctp_chunkhdr *, int *); + struct sctp_chunkhdr *, int *, int *); void sctp_queue_op_err(struct sctp_tcb *, struct mbuf *); int -sctp_send_cookie_echo(struct mbuf *, int, struct sctp_tcb *, +sctp_send_cookie_echo(struct mbuf *, int, int, struct sctp_tcb *, struct sctp_nets *); void sctp_send_cookie_ack(struct sctp_tcb *); diff --git a/freebsd/sys/netinet/sctp_pcb.c b/freebsd/sys/netinet/sctp_pcb.c index 782e5f1d..45342dc3 100644 --- a/freebsd/sys/netinet/sctp_pcb.c +++ b/freebsd/sys/netinet/sctp_pcb.c @@ -2847,7 +2847,7 @@ sctp_inpcb_bind(struct socket *so, struct sockaddr *addr, struct sockaddr_in *sin; /* IPV6_V6ONLY socket? */ - if (SCTP_IPV6_V6ONLY(ip_inp)) { + if (SCTP_IPV6_V6ONLY(inp)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); return (EINVAL); } @@ -3648,10 +3648,7 @@ sctp_inpcb_free(struct sctp_inpcb *inp, int immediate, int from) #ifdef INET6 if (ip_pcb->inp_vflag & INP_IPV6) { - struct in6pcb *in6p; - - in6p = (struct in6pcb *)inp; - ip6_freepcbopts(in6p->in6p_outputopts); + ip6_freepcbopts(ip_pcb->in6p_outputopts); } #endif /* INET6 */ ip_pcb->inp_vflag = 0; @@ -4161,11 +4158,9 @@ sctp_aloc_a_assoc_id(struct sctp_inpcb *inp, struct sctp_tcb *stcb) struct sctpasochead *head; struct sctp_tcb *lstcb; - SCTP_INP_WLOCK(inp); try_again: if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { /* TSNH */ - SCTP_INP_WUNLOCK(inp); return (0); } /* @@ -4184,8 +4179,7 @@ try_again: head = &inp->sctp_asocidhash[SCTP_PCBHASH_ASOC(id, inp->hashasocidmark)]; LIST_INSERT_HEAD(head, stcb, sctp_tcbasocidhash); stcb->asoc.in_asocid_hash = 1; - SCTP_INP_WUNLOCK(inp); - return id; + return (id); } /* @@ -4197,8 +4191,8 @@ struct sctp_tcb * sctp_aloc_assoc(struct sctp_inpcb *inp, struct sockaddr *firstaddr, int *error, uint32_t override_tag, uint32_t vrf_id, uint16_t o_streams, uint16_t port, - struct thread *p -) + struct thread *p, + int initialize_auth_params) { /* note the p argument is only valid in unbound sockets */ @@ -4348,7 +4342,6 @@ sctp_aloc_assoc(struct sctp_inpcb *inp, struct sockaddr *firstaddr, memset(stcb, 0, sizeof(*stcb)); asoc = &stcb->asoc; - asoc->assoc_id = sctp_aloc_a_assoc_id(inp, stcb); SCTP_TCB_LOCK_INIT(stcb); SCTP_TCB_SEND_LOCK_INIT(stcb); stcb->rport = rport; @@ -4359,7 +4352,6 @@ sctp_aloc_assoc(struct sctp_inpcb *inp, struct sockaddr *firstaddr, /* failed */ SCTP_TCB_LOCK_DESTROY(stcb); SCTP_TCB_SEND_LOCK_DESTROY(stcb); - LIST_REMOVE(stcb, sctp_tcbasocidhash); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asoc), stcb); SCTP_DECR_ASOC_COUNT(); *error = err; @@ -4372,7 +4364,6 @@ sctp_aloc_assoc(struct sctp_inpcb *inp, struct sockaddr *firstaddr, /* inpcb freed while alloc going on */ SCTP_TCB_LOCK_DESTROY(stcb); SCTP_TCB_SEND_LOCK_DESTROY(stcb); - LIST_REMOVE(stcb, sctp_tcbasocidhash); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asoc), stcb); SCTP_INP_WUNLOCK(inp); SCTP_INP_INFO_WUNLOCK(); @@ -4383,6 +4374,7 @@ sctp_aloc_assoc(struct sctp_inpcb *inp, struct sockaddr *firstaddr, } SCTP_TCB_LOCK(stcb); + asoc->assoc_id = sctp_aloc_a_assoc_id(inp, stcb); /* now that my_vtag is set, add it to the hash */ head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(stcb->asoc.my_vtag, SCTP_BASE_INFO(hashasocmark))]; /* put it in the bucket in the vtag hash of assoc's for the system */ @@ -4430,6 +4422,9 @@ sctp_aloc_assoc(struct sctp_inpcb *inp, struct sockaddr *firstaddr, inp->sctp_hashmark)]; LIST_INSERT_HEAD(head, stcb, sctp_tcbhash); } + if (initialize_auth_params == SCTP_INITIALIZE_AUTH_PARAMS) { + sctp_initialize_auth_params(inp, stcb); + } SCTP_INP_WUNLOCK(inp); SCTPDBG(SCTP_DEBUG_PCB1, "Association %p now allocated\n", (void *)stcb); return (stcb); @@ -4918,12 +4913,11 @@ sctp_free_assoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int from_inpcbfre inp->sctp_flags |= SCTP_PCB_FLAGS_WAS_CONNECTED; if (so) { SOCKBUF_LOCK(&so->so_rcv); - if (so->so_rcv.sb_cc == 0) { - so->so_state &= ~(SS_ISCONNECTING | - SS_ISDISCONNECTING | - SS_ISCONFIRMING | - SS_ISCONNECTED); - } + so->so_state &= ~(SS_ISCONNECTING | + SS_ISDISCONNECTING | + SS_ISCONFIRMING | + SS_ISCONNECTED); + so->so_state |= SS_ISDISCONNECTED; socantrcvmore_locked(so); socantsendmore(so); sctp_sowwakeup(inp, so); @@ -4991,6 +4985,7 @@ sctp_free_assoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int from_inpcbfre * in case. */ /* anything on the wheel needs to be removed */ + SCTP_TCB_SEND_LOCK(stcb); for (i = 0; i < asoc->streamoutcnt; i++) { struct sctp_stream_out *outs; @@ -4999,7 +4994,7 @@ sctp_free_assoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int from_inpcbfre TAILQ_FOREACH_SAFE(sp, &outs->outqueue, next, nsp) { atomic_subtract_int(&asoc->stream_queue_cnt, 1); TAILQ_REMOVE(&outs->outqueue, sp, next); - stcb->asoc.ss_functions.sctp_ss_remove_from_stream(stcb, asoc, outs, sp, 0); + stcb->asoc.ss_functions.sctp_ss_remove_from_stream(stcb, asoc, outs, sp, 1); sctp_free_spbufspace(stcb, asoc, sp); if (sp->data) { if (so) { @@ -5021,6 +5016,7 @@ sctp_free_assoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int from_inpcbfre sctp_free_a_strmoq(stcb, sp, SCTP_SO_LOCKED); } } + SCTP_TCB_SEND_UNLOCK(stcb); /* sa_ignore FREED_MEMORY */ TAILQ_FOREACH_SAFE(strrst, &asoc->resetHead, next_resp, nstrrst) { TAILQ_REMOVE(&asoc->resetHead, strrst, next_resp); @@ -5779,7 +5775,7 @@ sctp_startup_mcore_threads(void) #endif void -sctp_pcb_init() +sctp_pcb_init(void) { /* * SCTP initialization for the PCB structures should be called by diff --git a/freebsd/sys/netinet/sctp_pcb.h b/freebsd/sys/netinet/sctp_pcb.h index 5b41ae8a..cbe51c7d 100644 --- a/freebsd/sys/netinet/sctp_pcb.h +++ b/freebsd/sys/netinet/sctp_pcb.h @@ -362,7 +362,7 @@ struct sctp_inpcb { */ union { struct inpcb inp; - char align[(sizeof(struct in6pcb) + SCTP_ALIGNM1) & + char align[(sizeof(struct inpcb) + SCTP_ALIGNM1) & ~SCTP_ALIGNM1]; } ip_inp; @@ -578,9 +578,13 @@ int sctp_is_address_on_local_host(struct sockaddr *addr, uint32_t vrf_id); void sctp_inpcb_free(struct sctp_inpcb *, int, int); +#define SCTP_DONT_INITIALIZE_AUTH_PARAMS 0 +#define SCTP_INITIALIZE_AUTH_PARAMS 1 + struct sctp_tcb * sctp_aloc_assoc(struct sctp_inpcb *, struct sockaddr *, - int *, uint32_t, uint32_t, uint16_t, uint16_t, struct thread *); + int *, uint32_t, uint32_t, uint16_t, uint16_t, struct thread *, + int); int sctp_free_assoc(struct sctp_inpcb *, struct sctp_tcb *, int, int); diff --git a/freebsd/sys/netinet/sctp_structs.h b/freebsd/sys/netinet/sctp_structs.h index c4eafc26..451ae72b 100644 --- a/freebsd/sys/netinet/sctp_structs.h +++ b/freebsd/sys/netinet/sctp_structs.h @@ -166,7 +166,7 @@ struct sctp_copy_all { struct sctp_inpcb *inp; /* ep */ struct mbuf *m; struct sctp_sndrcvinfo sndrcv; - int sndlen; + ssize_t sndlen; int cnt_sent; int cnt_failed; }; diff --git a/freebsd/sys/netinet/sctp_usrreq.c b/freebsd/sys/netinet/sctp_usrreq.c index b519971c..d8fbabc4 100644 --- a/freebsd/sys/netinet/sctp_usrreq.c +++ b/freebsd/sys/netinet/sctp_usrreq.c @@ -632,7 +632,6 @@ connected_type: /* now what about control */ if (control) { if (inp->control) { - SCTP_PRINTF("huh? control set?\n"); sctp_m_freem(inp->control); inp->control = NULL; } @@ -968,9 +967,9 @@ sctp_shutdown(struct socket *so) abort_anyway: op_err = sctp_generate_cause(SCTP_CAUSE_USER_INITIATED_ABT, ""); stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_USRREQ + SCTP_LOC_6; + SCTP_INP_RUNLOCK(inp); sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_LOCKED); - SCTP_INP_RUNLOCK(inp); return (0); } } @@ -1124,22 +1123,25 @@ sctp_fill_up_addresses_vrf(struct sctp_inpcb *inp, } #ifdef INET6 if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) { + if (actual + sizeof(struct sockaddr_in6) > limit) { + return (actual); + } in6_sin_2_v4mapsin6(sin, (struct sockaddr_in6 *)sas); ((struct sockaddr_in6 *)sas)->sin6_port = inp->sctp_lport; sas = (struct sockaddr_storage *)((caddr_t)sas + sizeof(struct sockaddr_in6)); actual += sizeof(struct sockaddr_in6); } else { #endif - memcpy(sas, sin, sizeof(*sin)); + if (actual + sizeof(struct sockaddr_in) > limit) { + return (actual); + } + memcpy(sas, sin, sizeof(struct sockaddr_in)); ((struct sockaddr_in *)sas)->sin_port = inp->sctp_lport; - sas = (struct sockaddr_storage *)((caddr_t)sas + sizeof(*sin)); - actual += sizeof(*sin); + sas = (struct sockaddr_storage *)((caddr_t)sas + sizeof(struct sockaddr_in)); + actual += sizeof(struct sockaddr_in); #ifdef INET6 } #endif - if (actual >= limit) { - return (actual); - } } else { continue; } @@ -1184,13 +1186,13 @@ sctp_fill_up_addresses_vrf(struct sctp_inpcb *inp, (IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr))) { continue; } - memcpy(sas, sin6, sizeof(*sin6)); - ((struct sockaddr_in6 *)sas)->sin6_port = inp->sctp_lport; - sas = (struct sockaddr_storage *)((caddr_t)sas + sizeof(*sin6)); - actual += sizeof(*sin6); - if (actual >= limit) { + if (actual + sizeof(struct sockaddr_in6) > limit) { return (actual); } + memcpy(sas, sin6, sizeof(struct sockaddr_in6)); + ((struct sockaddr_in6 *)sas)->sin6_port = inp->sctp_lport; + sas = (struct sockaddr_storage *)((caddr_t)sas + sizeof(struct sockaddr_in6)); + actual += sizeof(struct sockaddr_in6); } else { continue; } @@ -1204,6 +1206,7 @@ sctp_fill_up_addresses_vrf(struct sctp_inpcb *inp, } } else { struct sctp_laddr *laddr; + size_t sa_len; LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { if (stcb) { @@ -1211,6 +1214,10 @@ sctp_fill_up_addresses_vrf(struct sctp_inpcb *inp, continue; } } + sa_len = laddr->ifa->address.sa.sa_len; + if (actual + sa_len > limit) { + return (actual); + } if (sctp_fill_user_address(sas, &laddr->ifa->address.sa)) continue; switch (laddr->ifa->address.sa.sa_family) { @@ -1228,12 +1235,8 @@ sctp_fill_up_addresses_vrf(struct sctp_inpcb *inp, /* TSNH */ break; } - sas = (struct sockaddr_storage *)((caddr_t)sas + - laddr->ifa->address.sa.sa_len); - actual += laddr->ifa->address.sa.sa_len; - if (actual >= limit) { - return (actual); - } + sas = (struct sockaddr_storage *)((caddr_t)sas + sa_len); + actual += sa_len; } } return (actual); @@ -1351,13 +1354,12 @@ static int sctp_do_connect_x(struct socket *so, struct sctp_inpcb *inp, void *optval, size_t optsize, void *p, int delay) { - int error = 0; + int error; int creat_lock_on = 0; struct sctp_tcb *stcb = NULL; struct sockaddr *sa; unsigned int num_v6 = 0, num_v4 = 0, *totaddrp, totaddr; uint32_t vrf_id; - int bad_addresses = 0; sctp_assoc_t *a_id; SCTPDBG(SCTP_DEBUG_PCB1, "Connectx called\n"); @@ -1396,17 +1398,12 @@ sctp_do_connect_x(struct socket *so, struct sctp_inpcb *inp, void *optval, totaddrp = (unsigned int *)optval; totaddr = *totaddrp; sa = (struct sockaddr *)(totaddrp + 1); - stcb = sctp_connectx_helper_find(inp, sa, &totaddr, &num_v4, &num_v6, &error, (unsigned int)(optsize - sizeof(int)), &bad_addresses); - if ((stcb != NULL) || bad_addresses) { + error = sctp_connectx_helper_find(inp, sa, totaddr, &num_v4, &num_v6, (unsigned int)(optsize - sizeof(int))); + if (error != 0) { /* Already have or am bring up an association */ SCTP_ASOC_CREATE_UNLOCK(inp); creat_lock_on = 0; - if (stcb) - SCTP_TCB_UNLOCK(stcb); - if (bad_addresses == 0) { - SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EALREADY); - error = EALREADY; - } + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); goto out_now; } #ifdef INET6 @@ -1417,10 +1414,7 @@ sctp_do_connect_x(struct socket *so, struct sctp_inpcb *inp, void *optval, } if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) && (num_v4 > 0)) { - struct in6pcb *inp6; - - inp6 = (struct in6pcb *)inp; - if (SCTP_IPV6_V6ONLY(inp6)) { + if (SCTP_IPV6_V6ONLY(inp)) { /* * if IPV6_V6ONLY flag, ignore connections destined * to a v4 addr or v4-mapped addr @@ -1448,8 +1442,8 @@ sctp_do_connect_x(struct socket *so, struct sctp_inpcb *inp, void *optval, stcb = sctp_aloc_assoc(inp, sa, &error, 0, vrf_id, inp->sctp_ep.pre_open_stream_count, inp->sctp_ep.port, - (struct thread *)p - ); + (struct thread *)p, + SCTP_INITIALIZE_AUTH_PARAMS); if (stcb == NULL) { /* Gak! no memory */ goto out_now; @@ -1480,16 +1474,11 @@ sctp_do_connect_x(struct socket *so, struct sctp_inpcb *inp, void *optval, sctp_connectx_helper_add(stcb, sa, (totaddr - 1), &error); /* Fill in the return id */ if (error) { - (void)sctp_free_assoc(inp, stcb, SCTP_PCBFREE_FORCE, - SCTP_FROM_SCTP_USRREQ + SCTP_LOC_7); goto out_now; } a_id = (sctp_assoc_t *)optval; *a_id = sctp_get_associd(stcb); - /* initialize authentication parameters for the assoc */ - sctp_initialize_auth_params(inp, stcb); - if (delay) { /* doing delayed connection */ stcb->asoc.delayed_connection = 1; @@ -2238,8 +2227,8 @@ flags_out: SCTP_FIND_STCB(inp, stcb, saddr->sget_assoc_id); if (stcb) { - left = (*optsize) - sizeof(struct sctp_getaddresses); - *optsize = sizeof(struct sctp_getaddresses); + left = (*optsize) - sizeof(sctp_assoc_t); + *optsize = sizeof(sctp_assoc_t); sas = (struct sockaddr_storage *)&saddr->addr[0]; TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { @@ -2313,7 +2302,7 @@ flags_out: if (stcb) { SCTP_TCB_UNLOCK(stcb); } - *optsize = sizeof(struct sockaddr_storage) + actual; + *optsize = sizeof(sctp_assoc_t) + actual; break; } case SCTP_PEER_ADDR_PARAMS: @@ -2642,42 +2631,47 @@ flags_out: sstat->sstat_instrms = stcb->asoc.streamincnt; sstat->sstat_outstrms = stcb->asoc.streamoutcnt; sstat->sstat_fragmentation_point = sctp_get_frag_point(stcb, &stcb->asoc); - memcpy(&sstat->sstat_primary.spinfo_address, - &stcb->asoc.primary_destination->ro._l_addr, - ((struct sockaddr *)(&stcb->asoc.primary_destination->ro._l_addr))->sa_len); net = stcb->asoc.primary_destination; - ((struct sockaddr_in *)&sstat->sstat_primary.spinfo_address)->sin_port = stcb->rport; - /* - * Again the user can get info from sctp_constants.h - * for what the state of the network is. - */ - if (net->dest_state & SCTP_ADDR_UNCONFIRMED) { - /* It's unconfirmed */ - sstat->sstat_primary.spinfo_state = SCTP_UNCONFIRMED; - } else if (net->dest_state & SCTP_ADDR_REACHABLE) { - /* It's active */ - sstat->sstat_primary.spinfo_state = SCTP_ACTIVE; - } else { - /* It's inactive */ - sstat->sstat_primary.spinfo_state = SCTP_INACTIVE; - } - sstat->sstat_primary.spinfo_cwnd = net->cwnd; - sstat->sstat_primary.spinfo_srtt = net->lastsa >> SCTP_RTT_SHIFT; - sstat->sstat_primary.spinfo_rto = net->RTO; - sstat->sstat_primary.spinfo_mtu = net->mtu; - switch (stcb->asoc.primary_destination->ro._l_addr.sa.sa_family) { + if (net != NULL) { + memcpy(&sstat->sstat_primary.spinfo_address, + &stcb->asoc.primary_destination->ro._l_addr, + ((struct sockaddr *)(&stcb->asoc.primary_destination->ro._l_addr))->sa_len); + ((struct sockaddr_in *)&sstat->sstat_primary.spinfo_address)->sin_port = stcb->rport; + /* + * Again the user can get info from + * sctp_constants.h for what the state of + * the network is. + */ + if (net->dest_state & SCTP_ADDR_UNCONFIRMED) { + /* It's unconfirmed */ + sstat->sstat_primary.spinfo_state = SCTP_UNCONFIRMED; + } else if (net->dest_state & SCTP_ADDR_REACHABLE) { + /* It's active */ + sstat->sstat_primary.spinfo_state = SCTP_ACTIVE; + } else { + /* It's inactive */ + sstat->sstat_primary.spinfo_state = SCTP_INACTIVE; + } + sstat->sstat_primary.spinfo_cwnd = net->cwnd; + sstat->sstat_primary.spinfo_srtt = net->lastsa >> SCTP_RTT_SHIFT; + sstat->sstat_primary.spinfo_rto = net->RTO; + sstat->sstat_primary.spinfo_mtu = net->mtu; + switch (stcb->asoc.primary_destination->ro._l_addr.sa.sa_family) { #if defined(INET) - case AF_INET: - sstat->sstat_primary.spinfo_mtu -= SCTP_MIN_V4_OVERHEAD; - break; + case AF_INET: + sstat->sstat_primary.spinfo_mtu -= SCTP_MIN_V4_OVERHEAD; + break; #endif #if defined(INET6) - case AF_INET6: - sstat->sstat_primary.spinfo_mtu -= SCTP_MIN_OVERHEAD; - break; + case AF_INET6: + sstat->sstat_primary.spinfo_mtu -= SCTP_MIN_OVERHEAD; + break; #endif - default: - break; + default: + break; + } + } else { + memset(&sstat->sstat_primary, 0, sizeof(struct sctp_paddrinfo)); } sstat->sstat_primary.spinfo_assoc_id = sctp_get_associd(stcb); SCTP_TCB_UNLOCK(stcb); @@ -3744,13 +3738,11 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, uint32_t vrf_id; if (optval == NULL) { - SCTP_PRINTF("optval is NULL\n"); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (EINVAL); } inp = (struct sctp_inpcb *)so->so_pcb; if (inp == NULL) { - SCTP_PRINTF("inp is NULL?\n"); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (EINVAL); } @@ -4065,10 +4057,12 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, } SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { + SCTP_TCB_SEND_LOCK(stcb); stcb->asoc.ss_functions.sctp_ss_clear(stcb, &stcb->asoc, 1, 1); stcb->asoc.ss_functions = sctp_ss_functions[av->assoc_value]; stcb->asoc.stream_scheduling_module = av->assoc_value; stcb->asoc.ss_functions.sctp_ss_init(stcb, &stcb->asoc, 1); + SCTP_TCB_SEND_UNLOCK(stcb); SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || @@ -4084,10 +4078,12 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, SCTP_INP_RLOCK(inp); LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { SCTP_TCB_LOCK(stcb); + SCTP_TCB_SEND_LOCK(stcb); stcb->asoc.ss_functions.sctp_ss_clear(stcb, &stcb->asoc, 1, 1); stcb->asoc.ss_functions = sctp_ss_functions[av->assoc_value]; stcb->asoc.stream_scheduling_module = av->assoc_value; stcb->asoc.ss_functions.sctp_ss_init(stcb, &stcb->asoc, 1); + SCTP_TCB_SEND_UNLOCK(stcb); SCTP_TCB_UNLOCK(stcb); } SCTP_INP_RUNLOCK(inp); @@ -4624,6 +4620,12 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, SCTP_TCB_UNLOCK(stcb); break; } + if (SCTP_GET_STATE(stcb) != SCTP_STATE_OPEN) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + SCTP_TCB_UNLOCK(stcb); + break; + } if (sizeof(struct sctp_reset_streams) + strrst->srs_number_streams * sizeof(uint16_t) > optsize) { error = EINVAL; @@ -4656,13 +4658,13 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, } for (i = 0; i < strrst->srs_number_streams; i++) { if ((send_in) && - (strrst->srs_stream_list[i] > stcb->asoc.streamincnt)) { + (strrst->srs_stream_list[i] >= stcb->asoc.streamincnt)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; break; } if ((send_out) && - (strrst->srs_stream_list[i] > stcb->asoc.streamoutcnt)) { + (strrst->srs_stream_list[i] >= stcb->asoc.streamoutcnt)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; break; @@ -4738,6 +4740,12 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, SCTP_TCB_UNLOCK(stcb); break; } + if (SCTP_GET_STATE(stcb) != SCTP_STATE_OPEN) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + SCTP_TCB_UNLOCK(stcb); + break; + } if (stcb->asoc.stream_reset_outstanding) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EALREADY); error = EALREADY; @@ -4808,6 +4816,12 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, SCTP_TCB_UNLOCK(stcb); break; } + if (SCTP_GET_STATE(stcb) != SCTP_STATE_OPEN) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + SCTP_TCB_UNLOCK(stcb); + break; + } if (stcb->asoc.stream_reset_outstanding) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EALREADY); error = EALREADY; @@ -5314,10 +5328,11 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, net->dest_state &= ~SCTP_ADDR_NOHB; } if (paddrp->spp_flags & SPP_HB_DEMAND) { - /* on demand HB */ - sctp_send_hb(stcb, net, SCTP_SO_LOCKED); - sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_SOCKOPT, SCTP_SO_LOCKED); - sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net); + if (SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) { + sctp_send_hb(stcb, net, SCTP_SO_LOCKED); + sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_SOCKOPT, SCTP_SO_LOCKED); + sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net); + } } if ((paddrp->spp_flags & SPP_PMTUD_DISABLE) && (paddrp->spp_pathmtu >= SCTP_SMALLEST_PMTU)) { if (SCTP_OS_TIMER_PENDING(&net->pmtu_timer.timer)) { @@ -6117,6 +6132,10 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, SCTP_INP_RUNLOCK(inp); } } + } else { + if (stcb) { + SCTP_TCB_UNLOCK(stcb); + } } break; } @@ -6211,6 +6230,9 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, SCTP_FIND_STCB(inp, stcb, info->pr_assoc_id); if (info->pr_policy > SCTP_PR_SCTP_MAX) { + if (stcb) { + SCTP_TCB_UNLOCK(stcb); + } SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; break; @@ -6330,6 +6352,9 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, } } if (thlds->spt_pathcpthld != 0xffff) { + if (stcb != NULL) { + SCTP_TCB_UNLOCK(stcb); + } error = EINVAL; SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); break; @@ -6830,6 +6855,10 @@ sctp_ctloutput(struct socket *so, struct sockopt *sopt) return (error); } optsize = sopt->sopt_valsize; + if (optsize > SCTP_SOCKET_OPTION_LIMIT) { + SCTP_LTRACE_ERR_RET(so->so_pcb, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOBUFS); + return (ENOBUFS); + } if (optsize) { SCTP_MALLOC(optval, void *, optsize, SCTP_M_SOCKOPT); if (optval == NULL) { @@ -6886,14 +6915,14 @@ sctp_connect(struct socket *so, struct sockaddr *addr, struct thread *p) #ifdef INET6 case AF_INET6: { - struct sockaddr_in6 *sin6p; + struct sockaddr_in6 *sin6; if (addr->sa_len != sizeof(struct sockaddr_in6)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (EINVAL); } - sin6p = (struct sockaddr_in6 *)addr; - if (p != NULL && (error = prison_remote_ip6(p->td_ucred, &sin6p->sin6_addr)) != 0) { + sin6 = (struct sockaddr_in6 *)addr; + if (p != NULL && (error = prison_remote_ip6(p->td_ucred, &sin6->sin6_addr)) != 0) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); return (error); } @@ -6903,14 +6932,14 @@ sctp_connect(struct socket *so, struct sockaddr *addr, struct thread *p) #ifdef INET case AF_INET: { - struct sockaddr_in *sinp; + struct sockaddr_in *sin; if (addr->sa_len != sizeof(struct sockaddr_in)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (EINVAL); } - sinp = (struct sockaddr_in *)addr; - if (p != NULL && (error = prison_remote_ip4(p->td_ucred, &sinp->sin_addr)) != 0) { + sin = (struct sockaddr_in *)addr; + if (p != NULL && (error = prison_remote_ip4(p->td_ucred, &sin->sin_addr)) != 0) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); return (error); } @@ -6992,7 +7021,8 @@ sctp_connect(struct socket *so, struct sockaddr *addr, struct thread *p) /* We are GOOD to go */ stcb = sctp_aloc_assoc(inp, addr, &error, 0, vrf_id, inp->sctp_ep.pre_open_stream_count, - inp->sctp_ep.port, p); + inp->sctp_ep.port, p, + SCTP_INITIALIZE_AUTH_PARAMS); if (stcb == NULL) { /* Gak! no memory */ goto out_now; @@ -7005,9 +7035,6 @@ sctp_connect(struct socket *so, struct sockaddr *addr, struct thread *p) SCTP_SET_STATE(stcb, SCTP_STATE_COOKIE_WAIT); (void)SCTP_GETTIME_TIMEVAL(&stcb->asoc.time_entered); - /* initialize authentication parameters for the assoc */ - sctp_initialize_auth_params(inp, stcb); - sctp_send_initiate(inp, stcb, SCTP_SO_LOCKED); SCTP_TCB_UNLOCK(stcb); out_now: @@ -7201,28 +7228,56 @@ sctp_accept(struct socket *so, struct sockaddr **addr) SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (ECONNRESET); } - SCTP_INP_RLOCK(inp); + SCTP_INP_WLOCK(inp); if (inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) { - SCTP_INP_RUNLOCK(inp); + SCTP_INP_WUNLOCK(inp); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP); return (EOPNOTSUPP); } if (so->so_state & SS_ISDISCONNECTED) { - SCTP_INP_RUNLOCK(inp); + SCTP_INP_WUNLOCK(inp); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ECONNABORTED); return (ECONNABORTED); } stcb = LIST_FIRST(&inp->sctp_asoc_list); if (stcb == NULL) { - SCTP_INP_RUNLOCK(inp); + SCTP_INP_WUNLOCK(inp); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (ECONNRESET); } SCTP_TCB_LOCK(stcb); - SCTP_INP_RUNLOCK(inp); store = stcb->asoc.primary_destination->ro._l_addr; SCTP_CLEAR_SUBSTATE(stcb, SCTP_STATE_IN_ACCEPT_QUEUE); - SCTP_TCB_UNLOCK(stcb); + /* Wake any delayed sleep action */ + if (inp->sctp_flags & SCTP_PCB_FLAGS_DONT_WAKE) { + inp->sctp_flags &= ~SCTP_PCB_FLAGS_DONT_WAKE; + if (inp->sctp_flags & SCTP_PCB_FLAGS_WAKEOUTPUT) { + inp->sctp_flags &= ~SCTP_PCB_FLAGS_WAKEOUTPUT; + SOCKBUF_LOCK(&inp->sctp_socket->so_snd); + if (sowriteable(inp->sctp_socket)) { + sowwakeup_locked(inp->sctp_socket); + } else { + SOCKBUF_UNLOCK(&inp->sctp_socket->so_snd); + } + } + if (inp->sctp_flags & SCTP_PCB_FLAGS_WAKEINPUT) { + inp->sctp_flags &= ~SCTP_PCB_FLAGS_WAKEINPUT; + SOCKBUF_LOCK(&inp->sctp_socket->so_rcv); + if (soreadable(inp->sctp_socket)) { + sctp_defered_wakeup_cnt++; + sorwakeup_locked(inp->sctp_socket); + } else { + SOCKBUF_UNLOCK(&inp->sctp_socket->so_rcv); + } + } + } + SCTP_INP_WUNLOCK(inp); + if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { + sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, + SCTP_FROM_SCTP_USRREQ + SCTP_LOC_19); + } else { + SCTP_TCB_UNLOCK(stcb); + } switch (store.sa.sa_family) { #ifdef INET case AF_INET: @@ -7264,40 +7319,6 @@ sctp_accept(struct socket *so, struct sockaddr **addr) /* TSNH */ break; } - /* Wake any delayed sleep action */ - if (inp->sctp_flags & SCTP_PCB_FLAGS_DONT_WAKE) { - SCTP_INP_WLOCK(inp); - inp->sctp_flags &= ~SCTP_PCB_FLAGS_DONT_WAKE; - if (inp->sctp_flags & SCTP_PCB_FLAGS_WAKEOUTPUT) { - inp->sctp_flags &= ~SCTP_PCB_FLAGS_WAKEOUTPUT; - SCTP_INP_WUNLOCK(inp); - SOCKBUF_LOCK(&inp->sctp_socket->so_snd); - if (sowriteable(inp->sctp_socket)) { - sowwakeup_locked(inp->sctp_socket); - } else { - SOCKBUF_UNLOCK(&inp->sctp_socket->so_snd); - } - SCTP_INP_WLOCK(inp); - } - if (inp->sctp_flags & SCTP_PCB_FLAGS_WAKEINPUT) { - inp->sctp_flags &= ~SCTP_PCB_FLAGS_WAKEINPUT; - SCTP_INP_WUNLOCK(inp); - SOCKBUF_LOCK(&inp->sctp_socket->so_rcv); - if (soreadable(inp->sctp_socket)) { - sctp_defered_wakeup_cnt++; - sorwakeup_locked(inp->sctp_socket); - } else { - SOCKBUF_UNLOCK(&inp->sctp_socket->so_rcv); - } - SCTP_INP_WLOCK(inp); - } - SCTP_INP_WUNLOCK(inp); - } - if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { - SCTP_TCB_LOCK(stcb); - sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, - SCTP_FROM_SCTP_USRREQ + SCTP_LOC_19); - } return (0); } diff --git a/freebsd/sys/netinet/sctputil.c b/freebsd/sys/netinet/sctputil.c index ddf136ef..e75f158e 100644 --- a/freebsd/sys/netinet/sctputil.c +++ b/freebsd/sys/netinet/sctputil.c @@ -548,7 +548,7 @@ sctp_wakeup_log(struct sctp_tcb *stcb, uint32_t wake_cnt, int from) } void -sctp_log_block(uint8_t from, struct sctp_association *asoc, size_t sendlen) +sctp_log_block(uint8_t from, struct sctp_association *asoc, ssize_t sendlen) { #if defined(SCTP_LOCAL_TRACE_BUF) struct sctp_cwnd_log sctp_clog; @@ -2471,25 +2471,24 @@ sctp_mtu_size_reset(struct sctp_inpcb *inp, /* - * given an association and starting time of the current RTT period return - * RTO in number of msecs net should point to the current network + * Given an association and starting time of the current RTT period, update + * RTO in number of msecs. net should point to the current network. + * Return 1, if an RTO update was performed, return 0 if no update was + * performed due to invalid starting point. */ -uint32_t +int sctp_calculate_rto(struct sctp_tcb *stcb, struct sctp_association *asoc, struct sctp_nets *net, struct timeval *old, int rtt_from_sack) { - /*- - * given an association and the starting time of the current RTT - * period (in value1/value2) return RTO in number of msecs. - */ + struct timeval now; + uint64_t rtt_us; /* RTT in us */ int32_t rtt; /* RTT in ms */ uint32_t new_rto; int first_measure = 0; - struct timeval now; /************************/ /* 1. calculate new RTT */ @@ -2500,10 +2499,19 @@ sctp_calculate_rto(struct sctp_tcb *stcb, } else { (void)SCTP_GETTIME_TIMEVAL(&now); } + if ((old->tv_sec > now.tv_sec) || + ((old->tv_sec == now.tv_sec) && (old->tv_sec > now.tv_sec))) { + /* The starting point is in the future. */ + return (0); + } timevalsub(&now, old); + rtt_us = (uint64_t)1000000 * (uint64_t)now.tv_sec + (uint64_t)now.tv_usec; + if (rtt_us > SCTP_RTO_UPPER_BOUND * 1000) { + /* The RTT is larger than a sane value. */ + return (0); + } /* store the current RTT in us */ - net->rtt = (uint64_t)1000000 * (uint64_t)now.tv_sec + - (uint64_t)now.tv_usec; + net->rtt = rtt_us; /* compute rtt in ms */ rtt = (int32_t)(net->rtt / 1000); if ((asoc->cc_functions.sctp_rtt_calculated) && (rtt_from_sack == SCTP_RTT_FROM_DATA)) { @@ -2535,7 +2543,7 @@ sctp_calculate_rto(struct sctp_tcb *stcb, * Paper "Congestion Avoidance and Control", Annex A. * * (net->lastsa >> SCTP_RTT_SHIFT) is the srtt - * (net->lastsa >> SCTP_RTT_VAR_SHIFT) is the rttvar + * (net->lastsv >> SCTP_RTT_VAR_SHIFT) is the rttvar */ if (net->RTO_measured) { rtt -= (net->lastsa >> SCTP_RTT_SHIFT); @@ -2576,8 +2584,8 @@ sctp_calculate_rto(struct sctp_tcb *stcb, if (new_rto > stcb->asoc.maxrto) { new_rto = stcb->asoc.maxrto; } - /* we are now returning the RTO */ - return (new_rto); + net->RTO = new_rto; + return (1); } /* @@ -3948,7 +3956,7 @@ sctp_report_all_outbound(struct sctp_tcb *stcb, uint16_t error, int holds_lock, TAILQ_FOREACH_SAFE(sp, &outs->outqueue, next, nsp) { atomic_subtract_int(&asoc->stream_queue_cnt, 1); TAILQ_REMOVE(&outs->outqueue, sp, next); - stcb->asoc.ss_functions.sctp_ss_remove_from_stream(stcb, asoc, outs, sp, holds_lock); + stcb->asoc.ss_functions.sctp_ss_remove_from_stream(stcb, asoc, outs, sp, 1); sctp_free_spbufspace(stcb, asoc, sp); if (sp->data) { sctp_ulp_notify(SCTP_NOTIFY_SPECIAL_SP_FAIL, stcb, @@ -3997,7 +4005,7 @@ sctp_abort_notification(struct sctp_tcb *stcb, uint8_t from_peer, uint16_t error return; } /* Tell them we lost the asoc */ - sctp_report_all_outbound(stcb, error, 1, so_locked); + sctp_report_all_outbound(stcb, error, 0, so_locked); if (from_peer) { sctp_ulp_notify(SCTP_NOTIFY_ASSOC_REM_ABORTED, stcb, error, abort, so_locked); } else { @@ -4569,12 +4577,14 @@ sctp_add_to_readq(struct sctp_inpcb *inp, if (inp_read_lock_held == 0) SCTP_INP_READ_LOCK(inp); if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_CANT_READ) { - sctp_free_remote_addr(control->whoFrom); - if (control->data) { - sctp_m_freem(control->data); - control->data = NULL; + if (!control->on_strm_q) { + sctp_free_remote_addr(control->whoFrom); + if (control->data) { + sctp_m_freem(control->data); + control->data = NULL; + } + sctp_free_a_readq(stcb, control); } - sctp_free_a_readq(stcb, control); if (inp_read_lock_held == 0) SCTP_INP_READ_UNLOCK(inp); return; @@ -4619,8 +4629,10 @@ sctp_add_to_readq(struct sctp_inpcb *inp, control->tail_mbuf = prev; } else { /* Everything got collapsed out?? */ - sctp_free_remote_addr(control->whoFrom); - sctp_free_a_readq(stcb, control); + if (!control->on_strm_q) { + sctp_free_remote_addr(control->whoFrom); + sctp_free_a_readq(stcb, control); + } if (inp_read_lock_held == 0) SCTP_INP_READ_UNLOCK(inp); return; @@ -5221,8 +5233,9 @@ sctp_sorecvmsg(struct socket *so, * */ struct sctp_inpcb *inp = NULL; - int my_len = 0; - int cp_len = 0, error = 0; + ssize_t my_len = 0; + ssize_t cp_len = 0; + int error = 0; struct sctp_queued_to_read *control = NULL, *ctl = NULL, *nxt = NULL; struct mbuf *m = NULL; struct sctp_tcb *stcb = NULL; @@ -5231,7 +5244,7 @@ sctp_sorecvmsg(struct socket *so, int out_flags = 0, in_flags = 0; int block_allowed = 1; uint32_t freed_so_far = 0; - uint32_t copied_so_far = 0; + ssize_t copied_so_far = 0; int in_eeor_mode = 0; int no_rcv_needed = 0; uint32_t rwnd_req = 0; @@ -5575,7 +5588,7 @@ found_one: * will go to the sctp_user_rcvd() that will not * lock until it KNOWs it MUST send a WUP-SACK. */ - freed_so_far = stcb->freed_by_sorcv_sincelast; + freed_so_far = (uint32_t)stcb->freed_by_sorcv_sincelast; stcb->freed_by_sorcv_sincelast = 0; } } @@ -5730,8 +5743,8 @@ get_more_data: m = control->data; while (m) { /* Move out all we can */ - cp_len = (int)uio->uio_resid; - my_len = (int)SCTP_BUF_LEN(m); + cp_len = uio->uio_resid; + my_len = SCTP_BUF_LEN(m); if (cp_len > my_len) { /* not enough in this buf */ cp_len = my_len; @@ -5741,7 +5754,7 @@ get_more_data: hold_rlock = 0; } if (cp_len > 0) - error = uiomove(mtod(m, char *), cp_len, uio); + error = uiomove(mtod(m, char *), (int)cp_len, uio); /* re-read */ if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { goto release; @@ -5786,7 +5799,7 @@ get_more_data: control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBRESULT, 0); } copied_so_far += cp_len; - freed_so_far += cp_len; + freed_so_far += (uint32_t)cp_len; freed_so_far += MSIZE; atomic_subtract_int(&control->length, cp_len); control->data = sctp_m_free(m); @@ -5826,9 +5839,9 @@ get_more_data: } if ((in_flags & MSG_PEEK) == 0) { SCTP_BUF_RESV_UF(m, cp_len); - SCTP_BUF_LEN(m) -= cp_len; + SCTP_BUF_LEN(m) -= (int)cp_len; if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) { - sctp_sblog(&so->so_rcv, control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBFREE, cp_len); + sctp_sblog(&so->so_rcv, control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBFREE, (int)cp_len); } atomic_subtract_int(&so->so_rcv.sb_cc, cp_len); if ((control->do_not_ref_stcb == 0) && @@ -5836,7 +5849,7 @@ get_more_data: atomic_subtract_int(&stcb->asoc.sb_cc, cp_len); } copied_so_far += cp_len; - freed_so_far += cp_len; + freed_so_far += (uint32_t)cp_len; freed_so_far += MSIZE; if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) { sctp_sblog(&so->so_rcv, control->do_not_ref_stcb ? NULL : stcb, @@ -5927,7 +5940,7 @@ get_more_data: } if ((uio->uio_resid == 0) || ((in_eeor_mode) && - (copied_so_far >= (uint32_t)max(so->so_rcv.sb_lowat, 1)))) { + (copied_so_far >= max(so->so_rcv.sb_lowat, 1)))) { goto release; } /* @@ -6064,7 +6077,7 @@ wait_some_more: control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBFREE, SCTP_BUF_LEN(m)); } sctp_sbfree(control, stcb, &so->so_rcv, m); - freed_so_far += SCTP_BUF_LEN(m); + freed_so_far += (uint32_t)SCTP_BUF_LEN(m); freed_so_far += MSIZE; if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) { sctp_sblog(&so->so_rcv, @@ -6392,30 +6405,33 @@ out_now: return (added); } -struct sctp_tcb * +int sctp_connectx_helper_find(struct sctp_inpcb *inp, struct sockaddr *addr, - unsigned int *totaddr, - unsigned int *num_v4, unsigned int *num_v6, int *error, - unsigned int limit, int *bad_addr) + unsigned int totaddr, + unsigned int *num_v4, unsigned int *num_v6, + unsigned int limit) { struct sockaddr *sa; - struct sctp_tcb *stcb = NULL; + struct sctp_tcb *stcb; unsigned int incr, at, i; at = 0; sa = addr; - *error = *num_v6 = *num_v4 = 0; + *num_v6 = *num_v4 = 0; /* account and validate addresses */ - for (i = 0; i < *totaddr; i++) { + if (totaddr == 0) { + return (EINVAL); + } + for (i = 0; i < totaddr; i++) { + if (at + sizeof(struct sockaddr) > limit) { + return (EINVAL); + } switch (sa->sa_family) { #ifdef INET case AF_INET: incr = (unsigned int)sizeof(struct sockaddr_in); if (sa->sa_len != incr) { - SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL); - *error = EINVAL; - *bad_addr = 1; - return (NULL); + return (EINVAL); } (*num_v4) += 1; break; @@ -6428,46 +6444,34 @@ sctp_connectx_helper_find(struct sctp_inpcb *inp, struct sockaddr *addr, sin6 = (struct sockaddr_in6 *)sa; if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { /* Must be non-mapped for connectx */ - SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL); - *error = EINVAL; - *bad_addr = 1; - return (NULL); + return (EINVAL); } incr = (unsigned int)sizeof(struct sockaddr_in6); if (sa->sa_len != incr) { - SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL); - *error = EINVAL; - *bad_addr = 1; - return (NULL); + return (EINVAL); } (*num_v6) += 1; break; } #endif default: - *totaddr = i; - incr = 0; - /* we are done */ - break; + return (EINVAL); } - if (i == *totaddr) { - break; + if ((at + incr) > limit) { + return (EINVAL); } SCTP_INP_INCR_REF(inp); stcb = sctp_findassociation_ep_addr(&inp, sa, NULL, NULL, NULL); if (stcb != NULL) { - /* Already have or am bring up an association */ - return (stcb); + SCTP_TCB_UNLOCK(stcb); + return (EALREADY); } else { SCTP_INP_DECR_REF(inp); } - if ((at + incr) > limit) { - *totaddr = i; - break; - } + at += incr; sa = (struct sockaddr *)((caddr_t)sa + incr); } - return ((struct sctp_tcb *)NULL); + return (0); } /* diff --git a/freebsd/sys/netinet/sctputil.h b/freebsd/sys/netinet/sctputil.h index c12fb210..c67c021f 100644 --- a/freebsd/sys/netinet/sctputil.h +++ b/freebsd/sys/netinet/sctputil.h @@ -133,7 +133,7 @@ uint32_t sctp_get_next_mtu(uint32_t); void sctp_timeout_handler(void *); -uint32_t +int sctp_calculate_rto(struct sctp_tcb *, struct sctp_association *, struct sctp_nets *, struct timeval *, int); @@ -211,10 +211,9 @@ int sctp_connectx_helper_add(struct sctp_tcb *stcb, struct sockaddr *addr, int totaddr, int *error); -struct sctp_tcb * -sctp_connectx_helper_find(struct sctp_inpcb *inp, struct sockaddr *addr, - unsigned int *totaddr, unsigned int *num_v4, unsigned int *num_v6, - int *error, unsigned int limit, int *bad_addr); +int +sctp_connectx_helper_find(struct sctp_inpcb *, struct sockaddr *, + unsigned int, unsigned int *, unsigned int *, unsigned int); int sctp_is_there_an_abort_here(struct mbuf *, int, uint32_t *); #ifdef INET6 @@ -367,7 +366,7 @@ void sctp_log_closing(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int16_t loc void sctp_log_lock(struct sctp_inpcb *inp, struct sctp_tcb *stcb, uint8_t from); void sctp_log_maxburst(struct sctp_tcb *stcb, struct sctp_nets *, int, int, uint8_t); -void sctp_log_block(uint8_t, struct sctp_association *, size_t); +void sctp_log_block(uint8_t, struct sctp_association *, ssize_t); void sctp_log_rwnd(uint8_t, uint32_t, uint32_t, uint32_t); void sctp_log_rwnd_set(uint8_t, uint32_t, uint32_t, uint32_t, uint32_t); int sctp_fill_stat_log(void *, size_t *); diff --git a/freebsd/sys/netinet/tcp_hpts.h b/freebsd/sys/netinet/tcp_hpts.h index 04c86769..293daa2c 100644 --- a/freebsd/sys/netinet/tcp_hpts.h +++ b/freebsd/sys/netinet/tcp_hpts.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2016-2018 Netflix Inc. + * Copyright (c) 2016-2018 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -45,112 +45,80 @@ TAILQ_HEAD(hptsh, inpcb); /* Number of useconds in a hpts tick */ #define HPTS_TICKS_PER_USEC 10 -#define HPTS_MS_TO_SLOTS(x) (x * 100) +#define HPTS_MS_TO_SLOTS(x) ((x * 100) + 1) #define HPTS_USEC_TO_SLOTS(x) ((x+9) /10) #define HPTS_USEC_IN_SEC 1000000 #define HPTS_MSEC_IN_SEC 1000 #define HPTS_USEC_IN_MSEC 1000 -#define DEFAULT_HPTS_LOG 3072 - -/* - * Log flags consist of - * 7f 7f 1 1 bits - * p_cpu | p_num | INPUT_ACTIVE | HPTS_ACTIVE - * - * So for example cpu 10, number 10 would with - * input active would show up as: - * p_flags = 0001010 0001010 1 0 - * <or> - * p_flags = 0x142a - */ -#define HPTS_HPTS_ACTIVE 0x01 -#define HPTS_INPUT_ACTIVE 0x02 - -#define HPTSLOG_IMMEDIATE 1 -#define HPTSLOG_INSERT_NORMAL 2 -#define HPTSLOG_INSERT_SLEEPER 3 -#define HPTSLOG_SLEEP_AFTER 4 -#define HPTSLOG_SLEEP_BEFORE 5 -#define HPTSLOG_INSERTED 6 -#define HPTSLOG_WAKEUP_HPTS 7 -#define HPTSLOG_SETTORUN 8 -#define HPTSLOG_HPTSI 9 -#define HPTSLOG_TOLONG 10 -#define HPTSLOG_AWAKENS 11 -#define HPTSLOG_TIMESOUT 12 -#define HPTSLOG_SLEEPSET 13 -#define HPTSLOG_WAKEUP_INPUT 14 -#define HPTSLOG_RESCHEDULE 15 -#define HPTSLOG_AWAKE 16 -#define HPTSLOG_INP_DONE 17 - -struct hpts_log { - struct inpcb *inp; - int32_t event; - uint32_t cts; - int32_t line; - uint32_t ticknow; - uint32_t t_paceslot; - uint32_t t_hptsreq; - uint32_t p_curtick; - uint32_t p_prevtick; - uint32_t slot_req; - uint32_t p_on_queue_cnt; - uint32_t p_nxt_slot; - uint32_t p_cur_slot; - uint32_t p_hpts_sleep_time; - uint16_t p_flags; - uint8_t p_onhpts; - uint8_t p_oninput; - uint8_t is_notempty; -}; struct hpts_diag { - uint32_t p_hpts_active; - uint32_t p_nxt_slot; - uint32_t p_cur_slot; - uint32_t slot_req; - uint32_t inp_hptsslot; - uint32_t slot_now; - uint32_t have_slept; - uint32_t hpts_sleep_time; - uint32_t yet_to_sleep; - uint32_t need_new_to; - int32_t co_ret; - uint8_t p_on_min_sleep; + uint32_t p_hpts_active; /* bbr->flex7 x */ + uint32_t p_nxt_slot; /* bbr->flex1 x */ + uint32_t p_cur_slot; /* bbr->flex2 x */ + uint32_t p_prev_slot; /* bbr->delivered */ + uint32_t p_runningtick; /* bbr->inflight */ + uint32_t slot_req; /* bbr->flex3 x */ + uint32_t inp_hptsslot; /* bbr->flex4 x */ + uint32_t slot_remaining; /* bbr->flex5 x */ + uint32_t have_slept; /* bbr->epoch x */ + uint32_t hpts_sleep_time; /* bbr->applimited x */ + uint32_t yet_to_sleep; /* bbr->lt_epoch x */ + uint32_t need_new_to; /* bbr->flex6 x */ + uint32_t wheel_tick; /* bbr->bw_inuse x */ + uint32_t maxticks; /* bbr->delRate x */ + uint32_t wheel_cts; /* bbr->rttProp x */ + int32_t co_ret; /* bbr->pkts_out x */ + uint32_t p_curtick; /* upper bbr->cur_del_rate */ + uint32_t p_lasttick; /* lower bbr->cur_del_rate */ + uint8_t p_on_min_sleep; /* bbr->flex8 x */ }; +/* Magic flags to tell whats cooking on the pacing wheel */ +#define PACE_TMR_DELACK 0x01 /* Delayed ack timer running */ +#define PACE_TMR_RACK 0x02 /* RACK timer running */ +#define PACE_TMR_TLP 0x04 /* TLP timer running */ +#define PACE_TMR_RXT 0x08 /* Retransmit timer running */ +#define PACE_TMR_PERSIT 0x10 /* Persists timer running */ +#define PACE_TMR_KEEP 0x20 /* Keep alive timer running */ +#define PACE_PKT_OUTPUT 0x40 /* Output Packets being paced */ +#define PACE_TMR_MASK (PACE_TMR_KEEP|PACE_TMR_PERSIT|PACE_TMR_RXT|PACE_TMR_TLP|PACE_TMR_RACK|PACE_TMR_DELACK) + #ifdef _KERNEL /* Each hpts has its own p_mtx which is used for locking */ struct tcp_hpts_entry { /* Cache line 0x00 */ struct mtx p_mtx; /* Mutex for hpts */ - uint32_t p_hpts_active; /* Flag that says hpts is awake */ - uint32_t p_curtick; /* Current tick in 10 us the hpts is at */ - uint32_t p_prevtick; /* Previous tick in 10 us the hpts ran */ + uint16_t p_hpts_active; /* Flag that says hpts is awake */ + uint8_t p_hpts_wake_scheduled; /* Have we scheduled a wakeup? */ + uint8_t p_wheel_complete; /* have we completed the wheel arc walk? */ + uint32_t p_curtick; /* Tick in 10 us the hpts is going to */ + uint32_t p_runningtick; /* Current tick we are at if we are running */ + uint32_t p_prev_slot; /* Previous slot we were on */ uint32_t p_cur_slot; /* Current slot in wheel hpts is draining */ uint32_t p_nxt_slot; /* The next slot outside the current range of * slots that the hpts is running on. */ int32_t p_on_queue_cnt; /* Count on queue in this hpts */ - uint32_t enobuf_cnt; - uint16_t p_log_at; + uint32_t p_lasttick; /* Last tick before the current one */ uint8_t p_direct_wake :1, /* boolean */ - p_log_wrapped :1, /* boolean */ - p_on_min_sleep:1; /* boolean */ - uint8_t p_fill; + p_on_min_sleep:1, /* boolean */ + p_avail:6; + uint8_t p_fill[3]; /* Fill to 32 bits */ /* Cache line 0x40 */ void *p_inp; struct hptsh p_input; /* For the tcp-input runner */ /* Hptsi wheel */ struct hptsh *p_hptss; - struct hpts_log *p_log; - uint32_t p_logsize; int32_t p_on_inqueue_cnt; /* Count on input queue in this hpts */ uint32_t hit_no_enobuf; uint32_t p_dyn_adjust; uint32_t p_hpts_sleep_time; /* Current sleep interval having a max * of 255ms */ + uint32_t overidden_sleep; /* what was overrided by min-sleep for logging */ + uint32_t saved_lasttick; /* for logging */ + uint32_t saved_curtick; /* for logging */ + uint32_t saved_curslot; /* for logging */ + uint32_t saved_prev_slot; /* for logging */ uint32_t p_delayed_by; /* How much were we delayed by */ /* Cache line 0x80 */ struct sysctl_ctx_list hpts_ctx; @@ -236,13 +204,9 @@ tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts int __tcp_queue_to_input_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line); #define tcp_queue_to_input_locked(a, b) __tcp_queue_to_input_locked(a, b, __LINE__); -void -tcp_queue_pkt_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, - int32_t tlen, int32_t drop_hdrlen, uint8_t iptos); int -__tcp_queue_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, - int32_t tlen, int32_t drop_hdrlen, uint8_t iptos, int32_t line); -#define tcp_queue_to_input(a, b, c, d, e, f, g) __tcp_queue_to_input(a, b, c, d, e, f, g, __LINE__) +__tcp_queue_to_input(struct inpcb *inp, int32_t line); +#define tcp_queue_to_input(a) __tcp_queue_to_input(a, __LINE__) uint16_t tcp_hpts_delayedby(struct inpcb *inp); diff --git a/freebsd/sys/netinet/tcp_input.c b/freebsd/sys/netinet/tcp_input.c index d00504dc..4bf12298 100644 --- a/freebsd/sys/netinet/tcp_input.c +++ b/freebsd/sys/netinet/tcp_input.c @@ -214,11 +214,6 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_autorcvbuf), 0, "Enable automatic receive buffer sizing"); -VNET_DEFINE(int, tcp_autorcvbuf_inc) = 16*1024; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_VNET | CTLFLAG_RW, - &VNET_NAME(tcp_autorcvbuf_inc), 0, - "Incrementor step size of automatic receive buffer"); - VNET_DEFINE(int, tcp_autorcvbuf_max) = 2*1024*1024; SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_autorcvbuf_max), 0, @@ -373,31 +368,14 @@ cc_conn_init(struct tcpcb *tp) /* * Set the initial slow-start flight size. * - * RFC5681 Section 3.1 specifies the default conservative values. - * RFC3390 specifies slightly more aggressive values. - * RFC6928 increases it to ten segments. - * Support for user specified value for initial flight size. - * * If a SYN or SYN/ACK was lost and retransmitted, we have to * reduce the initial CWND to one segment as congestion is likely * requiring us to be cautious. */ if (tp->snd_cwnd == 1) tp->snd_cwnd = maxseg; /* SYN(-ACK) lost */ - else if (V_tcp_initcwnd_segments) - tp->snd_cwnd = min(V_tcp_initcwnd_segments * maxseg, - max(2 * maxseg, V_tcp_initcwnd_segments * 1460)); - else if (V_tcp_do_rfc3390) - tp->snd_cwnd = min(4 * maxseg, max(2 * maxseg, 4380)); - else { - /* Per RFC5681 Section 3.1 */ - if (maxseg > 2190) - tp->snd_cwnd = 2 * maxseg; - else if (maxseg > 1095) - tp->snd_cwnd = 3 * maxseg; - else - tp->snd_cwnd = 4 * maxseg; - } + else + tp->snd_cwnd = tcp_compute_initwnd(maxseg); if (CC_ALGO(tp)->conn_init != NULL) CC_ALGO(tp)->conn_init(tp->ccv); @@ -578,6 +556,7 @@ tcp_input(struct mbuf **mp, int *offp, int proto) int optlen = 0; #ifdef INET int len; + uint8_t ipttl; #endif int tlen = 0, off; int drop_hdrlen; @@ -700,6 +679,7 @@ tcp_input(struct mbuf **mp, int *offp, int proto) * Checksum extended TCP header and data. */ len = off0 + tlen; + ipttl = ip->ip_ttl; bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); ipov->ih_len = htons(tlen); th->th_sum = in_cksum(m, len); @@ -708,6 +688,7 @@ tcp_input(struct mbuf **mp, int *offp, int proto) /* Reset TOS bits */ ip->ip_tos = iptos; /* Re-initialization for later version check */ + ip->ip_ttl = ipttl; ip->ip_v = IPVERSION; ip->ip_hl = off0 >> 2; } @@ -1468,13 +1449,16 @@ drop: * The criteria to step up the receive buffer one notch are: * 1. Application has not set receive buffer size with * SO_RCVBUF. Setting SO_RCVBUF clears SB_AUTOSIZE. - * 2. the number of bytes received during the time it takes - * one timestamp to be reflected back to us (the RTT); - * 3. received bytes per RTT is within seven eighth of the - * current socket buffer size; - * 4. receive buffer size has not hit maximal automatic size; + * 2. the number of bytes received during 1/2 of an sRTT + * is at least 3/8 of the current socket buffer size. + * 3. receive buffer size has not hit maximal automatic size; + * + * If all of the criteria are met we increaset the socket buffer + * by a 1/2 (bounded by the max). This allows us to keep ahead + * of slow-start but also makes it so our peer never gets limited + * by our rwnd which we then open up causing a burst. * - * This algorithm does one step per RTT at most and only if + * This algorithm does two steps per RTT at most and only if * we receive a bulk stream w/o packet losses or reorderings. * Shrinking the buffer during idle times is not necessary as * it doesn't consume any memory when idle. @@ -1491,11 +1475,10 @@ tcp_autorcvbuf(struct mbuf *m, struct tcphdr *th, struct socket *so, if (V_tcp_do_autorcvbuf && (so->so_rcv.sb_flags & SB_AUTOSIZE) && tp->t_srtt != 0 && tp->rfbuf_ts != 0 && TCP_TS_TO_TICKS(tcp_ts_getticks() - tp->rfbuf_ts) > - (tp->t_srtt >> TCP_RTT_SHIFT)) { - if (tp->rfbuf_cnt > (so->so_rcv.sb_hiwat / 8 * 7) && + ((tp->t_srtt >> TCP_RTT_SHIFT)/2)) { + if (tp->rfbuf_cnt > ((so->so_rcv.sb_hiwat / 2)/ 4 * 3) && so->so_rcv.sb_hiwat < V_tcp_autorcvbuf_max) { - newsize = min(so->so_rcv.sb_hiwat + - V_tcp_autorcvbuf_inc, V_tcp_autorcvbuf_max); + newsize = min((so->so_rcv.sb_hiwat + (so->so_rcv.sb_hiwat/2)), V_tcp_autorcvbuf_max); } TCP_PROBE6(receive__autoresize, NULL, tp, m, tp, th, newsize); @@ -1505,7 +1488,6 @@ tcp_autorcvbuf(struct mbuf *m, struct tcphdr *th, struct socket *so, } else { tp->rfbuf_cnt += tlen; /* add up */ } - return (newsize); } @@ -2029,7 +2011,8 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, else tp->t_flags |= TF_ACKNOW; - if ((thflags & TH_ECE) && V_tcp_do_ecn) { + if (((thflags & (TH_CWR | TH_ECE)) == TH_ECE) && + V_tcp_do_ecn) { tp->t_flags |= TF_ECN_PERMIT; TCPSTAT_INC(tcps_ecn_shs); } @@ -2279,6 +2262,18 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, TCPSTAT_INC(tcps_rcvpartduppack); TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop); } + /* + * DSACK - add SACK block for dropped range + */ + if (tp->t_flags & TF_SACK_PERMIT) { + tcp_update_sack_list(tp, th->th_seq, + th->th_seq + todrop); + /* + * ACK now, as the next in-sequence segment + * will clear the DSACK block again + */ + tp->t_flags |= TF_ACKNOW; + } drop_hdrlen += todrop; /* drop from the top afterwards */ th->th_seq += todrop; tlen -= todrop; @@ -2403,8 +2398,8 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; - tp->snd_wnd = tiwin; } + tp->snd_wnd = tiwin; /* * Make transitions: * SYN-RECEIVED -> ESTABLISHED @@ -3007,6 +3002,8 @@ dodata: /* XXX */ if ((tlen || (thflags & TH_FIN) || tfo_syn) && TCPS_HAVERCVDFIN(tp->t_state) == 0) { tcp_seq save_start = th->th_seq; + tcp_seq save_rnxt = tp->rcv_nxt; + int save_tlen = tlen; m_adj(m, drop_hdrlen); /* delayed header drop */ /* * Insert segment which includes th into TCP reassembly queue @@ -3046,11 +3043,41 @@ dodata: /* XXX */ * m_adj() doesn't actually frees any mbufs * when trimming from the head. */ - thflags = tcp_reass(tp, th, &save_start, &tlen, m); + tcp_seq temp = save_start; + thflags = tcp_reass(tp, th, &temp, &tlen, m); tp->t_flags |= TF_ACKNOW; } - if (tlen > 0 && (tp->t_flags & TF_SACK_PERMIT)) - tcp_update_sack_list(tp, save_start, save_start + tlen); + if ((tp->t_flags & TF_SACK_PERMIT) && (save_tlen > 0)) { + if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) { + /* + * DSACK actually handled in the fastpath + * above. + */ + tcp_update_sack_list(tp, save_start, + save_start + save_tlen); + } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) { + if ((tp->rcv_numsacks >= 1) && + (tp->sackblks[0].end == save_start)) { + /* + * Partial overlap, recorded at todrop + * above. + */ + tcp_update_sack_list(tp, + tp->sackblks[0].start, + tp->sackblks[0].end); + } else { + tcp_update_dsack_list(tp, save_start, + save_start + save_tlen); + } + } else if (tlen >= save_tlen) { + /* Update of sackblks. */ + tcp_update_dsack_list(tp, save_start, + save_start + save_tlen); + } else if (tlen > 0) { + tcp_update_dsack_list(tp, save_start, + save_start + tlen); + } + } #if 0 /* * Note the amount of data that peer has sent into @@ -3820,3 +3847,30 @@ tcp_compute_pipe(struct tcpcb *tp) tp->sackhint.sack_bytes_rexmit - tp->sackhint.sacked_bytes); } + +uint32_t +tcp_compute_initwnd(uint32_t maxseg) +{ + /* + * Calculate the Initial Window, also used as Restart Window + * + * RFC5681 Section 3.1 specifies the default conservative values. + * RFC3390 specifies slightly more aggressive values. + * RFC6928 increases it to ten segments. + * Support for user specified value for initial flight size. + */ + if (V_tcp_initcwnd_segments) + return min(V_tcp_initcwnd_segments * maxseg, + max(2 * maxseg, V_tcp_initcwnd_segments * 1460)); + else if (V_tcp_do_rfc3390) + return min(4 * maxseg, max(2 * maxseg, 4380)); + else { + /* Per RFC5681 Section 3.1 */ + if (maxseg > 2190) + return (2 * maxseg); + else if (maxseg > 1095) + return (3 * maxseg); + else + return (4 * maxseg); + } +} diff --git a/freebsd/sys/netinet/tcp_log_buf.h b/freebsd/sys/netinet/tcp_log_buf.h index e569395a..e0575a43 100644 --- a/freebsd/sys/netinet/tcp_log_buf.h +++ b/freebsd/sys/netinet/tcp_log_buf.h @@ -1,8 +1,7 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * - * Copyright (c) 2016-2018 - * Netflix Inc. All rights reserved. + * Copyright (c) 2016-2018 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -176,7 +175,7 @@ enum tcp_log_events { TCP_LOG_BAD_RETRAN, /* Detected bad retransmission 5 */ TCP_LOG_PRR, /* Doing PRR 6 */ TCP_LOG_REORDER,/* Detected reorder 7 */ - TCP_LOG_PACER, /* Pacer sending a packet 8 */ + TCP_LOG_HPTS, /* Hpts sending a packet 8 */ BBR_LOG_BBRUPD, /* We updated BBR info 9 */ BBR_LOG_BBRSND, /* We did a slot calculation and sending is done 10 */ BBR_LOG_ACKCLEAR, /* A ack clears all outstanding 11 */ @@ -195,31 +194,38 @@ enum tcp_log_events { BBR_LOG_PERSIST, /* BBR changed to/from a persists 24 */ TCP_LOG_FLOWEND, /* End of a flow 25 */ BBR_LOG_RTO, /* BBR's timeout includes BBR info 26 */ - BBR_LOG_DOSEG_DONE, /* pacer do_segment completes 27 */ - BBR_LOG_EXIT_GAIN, /* pacer do_segment completes 28 */ + BBR_LOG_DOSEG_DONE, /* hpts do_segment completes 27 */ + BBR_LOG_EXIT_GAIN, /* hpts do_segment completes 28 */ BBR_LOG_THRESH_CALC, /* Doing threshold calculation 29 */ BBR_LOG_EXTRACWNDGAIN, /* Removed 30 */ TCP_LOG_USERSEND, /* User level sends data 31 */ - UNUSED_32, /* Unused 32 */ - UNUSED_33, /* Unused 33 */ + BBR_RSM_CLEARED, /* RSM cleared of ACK flags 32 */ + BBR_LOG_STATE_TARGET, /* Log of target at state 33 */ BBR_LOG_TIME_EPOCH, /* A timed based Epoch occured 34 */ BBR_LOG_TO_PROCESS, /* A to was processed 35 */ BBR_LOG_BBRTSO, /* TSO update 36 */ - BBR_LOG_PACERDIAG, /* Pacer diag insert 37 */ + BBR_LOG_HPTSDIAG, /* Hpts diag insert 37 */ BBR_LOG_LOWGAIN, /* Low gain accounting 38 */ BBR_LOG_PROGRESS, /* Progress timer event 39 */ TCP_LOG_SOCKET_OPT, /* A socket option is set 40 */ BBR_LOG_TIMERPREP, /* A BBR var to debug out TLP issues 41 */ BBR_LOG_ENOBUF_JMP, /* We had a enobuf jump 42 */ - BBR_LOG_PACING_CALC, /* calc the pacing time 43 */ + BBR_LOG_HPTSI_CALC, /* calc the hptsi time 43 */ BBR_LOG_RTT_SHRINKS, /* We had a log reduction of rttProp 44 */ BBR_LOG_BW_RED_EV, /* B/W reduction events 45 */ BBR_LOG_REDUCE, /* old bbr log reduce for 4.1 and earlier 46*/ TCP_LOG_RTT, /* A rtt (in useconds) is being sampled and applied to the srtt algo 47 */ BBR_LOG_SETTINGS_CHG, /* Settings changed for loss response 48 */ - BBR_LOG_SRTT_GAIN_EVENT, /* SRTT gaining 49 */ + BBR_LOG_SRTT_GAIN_EVENT, /* SRTT gaining -- now not used 49 */ TCP_LOG_REASS, /* Reassembly buffer logging 50 */ - TCP_LOG_END /* End (keep at end) 51 */ + TCP_HDWR_TLS, /* TCP Hardware TLS logs 51 */ + BBR_LOG_HDWR_PACE, /* TCP Hardware pacing log 52 */ + BBR_LOG_TSTMP_VAL, /* Temp debug timestamp validation 53 */ + TCP_LOG_CONNEND, /* End of connection 54 */ + TCP_LOG_LRO, /* LRO entry 55 */ + TCP_SACK_FILTER_RES, /* Results of SACK Filter 56 */ + TCP_SAD_DETECTION, /* Sack Attack Detection 57 */ + TCP_LOG_END /* End (keep at end) 58 */ }; enum tcp_log_states { @@ -276,8 +282,8 @@ struct tcp_log_dev_log_queue { #ifdef _KERNEL -#define TCP_LOG_BUF_DEFAULT_SESSION_LIMIT 10000 -#define TCP_LOG_BUF_DEFAULT_GLOBAL_LIMIT 1000000 +#define TCP_LOG_BUF_DEFAULT_SESSION_LIMIT 5000 +#define TCP_LOG_BUF_DEFAULT_GLOBAL_LIMIT 5000000 /* * TCP_LOG_EVENT_VERBOSE: The same as TCP_LOG_EVENT, except it always diff --git a/freebsd/sys/netinet/tcp_lro.c b/freebsd/sys/netinet/tcp_lro.c index 7242eb5c..1bc90c9f 100644 --- a/freebsd/sys/netinet/tcp_lro.c +++ b/freebsd/sys/netinet/tcp_lro.c @@ -46,6 +46,8 @@ __FBSDID("$FreeBSD$"); #include <sys/malloc.h> #include <sys/mbuf.h> #include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/sockbuf.h> #include <sys/sysctl.h> #include <net/if.h> @@ -58,11 +60,14 @@ __FBSDID("$FreeBSD$"); #include <netinet/ip6.h> #include <netinet/ip.h> #include <netinet/ip_var.h> +#include <netinet/in_pcb.h> +#include <netinet6/in6_pcb.h> #include <netinet/tcp.h> #include <netinet/tcp_seq.h> #include <netinet/tcp_lro.h> #include <netinet/tcp_var.h> - +#include <netinet/tcp_hpts.h> +#include <netinet/tcp_log_buf.h> #include <netinet6/ip6_var.h> #include <machine/in_cksum.h> @@ -81,10 +86,46 @@ static int tcp_lro_rx2(struct lro_ctrl *lc, struct mbuf *m, SYSCTL_NODE(_net_inet_tcp, OID_AUTO, lro, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "TCP LRO"); +static long tcplro_stacks_wanting_mbufq = 0; +counter_u64_t tcp_inp_lro_direct_queue; +counter_u64_t tcp_inp_lro_wokeup_queue; +counter_u64_t tcp_inp_lro_compressed; +counter_u64_t tcp_inp_lro_single_push; +counter_u64_t tcp_inp_lro_locks_taken; +counter_u64_t tcp_inp_lro_sack_wake; + static unsigned tcp_lro_entries = TCP_LRO_ENTRIES; +static int32_t hold_lock_over_compress = 0; +SYSCTL_INT(_net_inet_tcp_lro, OID_AUTO, hold_lock, CTLFLAG_RW, + &hold_lock_over_compress, 0, + "Do we hold the lock over the compress of mbufs?"); SYSCTL_UINT(_net_inet_tcp_lro, OID_AUTO, entries, CTLFLAG_RDTUN | CTLFLAG_MPSAFE, &tcp_lro_entries, 0, "default number of LRO entries"); +SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, fullqueue, CTLFLAG_RD, + &tcp_inp_lro_direct_queue, "Number of lro's fully queued to transport"); +SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, wokeup, CTLFLAG_RD, + &tcp_inp_lro_wokeup_queue, "Number of lro's where we woke up transport via hpts"); +SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, compressed, CTLFLAG_RD, + &tcp_inp_lro_compressed, "Number of lro's compressed and sent to transport"); +SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, single, CTLFLAG_RD, + &tcp_inp_lro_single_push, "Number of lro's sent with single segment"); +SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, lockcnt, CTLFLAG_RD, + &tcp_inp_lro_locks_taken, "Number of lro's inp_wlocks taken"); +SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, sackwakeups, CTLFLAG_RD, + &tcp_inp_lro_sack_wake, "Number of wakeups caused by sack/fin"); + +void +tcp_lro_reg_mbufq(void) +{ + atomic_fetchadd_long(&tcplro_stacks_wanting_mbufq, 1); +} + +void +tcp_lro_dereg_mbufq(void) +{ + atomic_fetchadd_long(&tcplro_stacks_wanting_mbufq, -1); +} static __inline void tcp_lro_active_insert(struct lro_ctrl *lc, struct lro_head *bucket, @@ -164,6 +205,36 @@ tcp_lro_init_args(struct lro_ctrl *lc, struct ifnet *ifp, return (0); } +static struct tcphdr * +tcp_lro_get_th(struct lro_entry *le, struct mbuf *m) +{ + struct ether_header *eh; + struct tcphdr *th = NULL; +#ifdef INET6 + struct ip6_hdr *ip6 = NULL; /* Keep compiler happy. */ +#endif +#ifdef INET + struct ip *ip4 = NULL; /* Keep compiler happy. */ +#endif + + eh = mtod(m, struct ether_header *); + switch (le->eh_type) { +#ifdef INET6 + case ETHERTYPE_IPV6: + ip6 = (struct ip6_hdr *)(eh + 1); + th = (struct tcphdr *)(ip6 + 1); + break; +#endif +#ifdef INET + case ETHERTYPE_IP: + ip4 = (struct ip *)(eh + 1); + th = (struct tcphdr *)(ip4 + 1); + break; +#endif + } + return (th); +} + void tcp_lro_free(struct lro_ctrl *lc) { @@ -194,7 +265,6 @@ tcp_lro_free(struct lro_ctrl *lc) lc->lro_mbuf_data = NULL; } -#ifdef TCP_LRO_UPDATE_CSUM static uint16_t tcp_lro_csum_th(struct tcphdr *th) { @@ -277,7 +347,6 @@ tcp_lro_rx_csum_fixup(struct lro_entry *le, void *l3hdr, struct tcphdr *th, return (c & 0xffff); } -#endif static void tcp_lro_rx_done(struct lro_ctrl *lc) @@ -299,7 +368,7 @@ tcp_lro_flush_inactive(struct lro_ctrl *lc, const struct timeval *timeout) if (LIST_EMPTY(&lc->lro_active)) return; - getmicrotime(&tv); + getmicrouptime(&tv); timevalsub(&tv, timeout); LIST_FOREACH_SAFE(le, &lc->lro_active, next, le_tmp) { if (timevalcmp(&tv, &le->mtime, >=)) { @@ -309,11 +378,113 @@ tcp_lro_flush_inactive(struct lro_ctrl *lc, const struct timeval *timeout) } } -void -tcp_lro_flush(struct lro_ctrl *lc, struct lro_entry *le) +#ifdef INET6 +static int +tcp_lro_rx_ipv6(struct lro_ctrl *lc, struct mbuf *m, struct ip6_hdr *ip6, + struct tcphdr **th) +{ + + /* XXX-BZ we should check the flow-label. */ + + /* XXX-BZ We do not yet support ext. hdrs. */ + if (ip6->ip6_nxt != IPPROTO_TCP) + return (TCP_LRO_NOT_SUPPORTED); + + /* Find the TCP header. */ + *th = (struct tcphdr *)(ip6 + 1); + + return (0); +} +#endif + +#ifdef INET +static int +tcp_lro_rx_ipv4(struct lro_ctrl *lc, struct mbuf *m, struct ip *ip4, + struct tcphdr **th) { + int csum_flags; + uint16_t csum; + + if (ip4->ip_p != IPPROTO_TCP) + return (TCP_LRO_NOT_SUPPORTED); + + /* Ensure there are no options. */ + if ((ip4->ip_hl << 2) != sizeof (*ip4)) + return (TCP_LRO_CANNOT); + + /* .. and the packet is not fragmented. */ + if (ip4->ip_off & htons(IP_MF|IP_OFFMASK)) + return (TCP_LRO_CANNOT); - if (le->append_cnt > 0) { + /* Legacy IP has a header checksum that needs to be correct. */ + csum_flags = m->m_pkthdr.csum_flags; + if (csum_flags & CSUM_IP_CHECKED) { + if (__predict_false((csum_flags & CSUM_IP_VALID) == 0)) { + lc->lro_bad_csum++; + return (TCP_LRO_CANNOT); + } + } else { + csum = in_cksum_hdr(ip4); + if (__predict_false((csum) != 0)) { + lc->lro_bad_csum++; + return (TCP_LRO_CANNOT); + } + } + /* Find the TCP header (we assured there are no IP options). */ + *th = (struct tcphdr *)(ip4 + 1); + return (0); +} +#endif + +static void +tcp_lro_log(struct tcpcb *tp, struct lro_ctrl *lc, + struct lro_entry *le, struct mbuf *m, int frm, int32_t tcp_data_len, + uint32_t th_seq , uint32_t th_ack, uint16_t th_win) +{ + if (tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + struct timeval tv; + uint32_t cts; + + cts = tcp_get_usecs(&tv); + memset(&log, 0, sizeof(union tcp_log_stackspecific)); + log.u_bbr.flex8 = frm; + log.u_bbr.flex1 = tcp_data_len; + if (m) + log.u_bbr.flex2 = m->m_pkthdr.len; + else + log.u_bbr.flex2 = 0; + log.u_bbr.flex3 = le->append_cnt; + log.u_bbr.flex4 = le->p_len; + log.u_bbr.flex5 = le->m_head->m_pkthdr.len; + log.u_bbr.delRate = le->m_head->m_flags; + log.u_bbr.rttProp = le->m_head->m_pkthdr.rcv_tstmp; + log.u_bbr.flex6 = lc->lro_length_lim; + log.u_bbr.flex7 = lc->lro_ackcnt_lim; + log.u_bbr.inflight = th_seq; + log.u_bbr.timeStamp = cts; + log.u_bbr.epoch = le->next_seq; + log.u_bbr.delivered = th_ack; + log.u_bbr.lt_epoch = le->ack_seq; + log.u_bbr.pacing_gain = th_win; + log.u_bbr.cwnd_gain = le->window; + log.u_bbr.cur_del_rate = (uint64_t)m; + log.u_bbr.bw_inuse = (uint64_t)le->m_head; + log.u_bbr.pkts_out = le->mbuf_cnt; /* Total mbufs added */ + log.u_bbr.applimited = le->ulp_csum; + log.u_bbr.lost = le->mbuf_appended; + TCP_LOG_EVENTP(tp, NULL, + &tp->t_inpcb->inp_socket->so_rcv, + &tp->t_inpcb->inp_socket->so_snd, + TCP_LOG_LRO, 0, + 0, &log, false, &tv); + } +} + +static void +tcp_flush_out_le(struct tcpcb *tp, struct lro_ctrl *lc, struct lro_entry *le, int locked) +{ + if (le->append_cnt > 1) { struct tcphdr *th; uint16_t p_len; @@ -337,13 +508,10 @@ tcp_lro_flush(struct lro_ctrl *lc, struct lro_entry *le) case ETHERTYPE_IP: { struct ip *ip4; -#ifdef TCP_LRO_UPDATE_CSUM uint32_t cl; uint16_t c; -#endif ip4 = le->le_ip4; -#ifdef TCP_LRO_UPDATE_CSUM /* Fix IP header checksum for new length. */ c = ~ip4->ip_sum; cl = c; @@ -353,9 +521,6 @@ tcp_lro_flush(struct lro_ctrl *lc, struct lro_entry *le) cl = (cl >> 16) + (cl & 0xffff); c = cl; ip4->ip_sum = ~c; -#else - ip4->ip_sum = TCP_LRO_INVALID_CSUM; -#endif ip4->ip_len = p_len; th = (struct tcphdr *)(ip4 + 1); le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID | @@ -381,7 +546,6 @@ tcp_lro_flush(struct lro_ctrl *lc, struct lro_entry *le) ts_ptr[1] = htonl(le->tsval); ts_ptr[2] = le->tsecr; } -#ifdef TCP_LRO_UPDATE_CSUM /* Update the TCP header checksum. */ le->ulp_csum += p_len; le->ulp_csum += tcp_lro_csum_th(th); @@ -390,14 +554,431 @@ tcp_lro_flush(struct lro_ctrl *lc, struct lro_entry *le) (le->ulp_csum & 0xffff); th->th_sum = (le->ulp_csum & 0xffff); th->th_sum = ~th->th_sum; -#else - th->th_sum = TCP_LRO_INVALID_CSUM; + if (tp && locked) { + tcp_lro_log(tp, lc, le, NULL, 7, 0, 0, 0, 0); + } + } + /* + * Break any chain, this is not set to NULL on the singleton + * case m_nextpkt points to m_head. Other case set them + * m_nextpkt to NULL in push_and_replace. + */ + le->m_head->m_nextpkt = NULL; + le->m_head->m_pkthdr.lro_nsegs = le->append_cnt; + if (tp && locked) { + tcp_lro_log(tp, lc, le, le->m_head, 8, 0, 0, 0, 0); + } + (*lc->ifp->if_input)(lc->ifp, le->m_head); + lc->lro_queued += le->append_cnt; +} + +static void +tcp_set_le_to_m(struct lro_ctrl *lc, struct lro_entry *le, struct mbuf *m) +{ + struct ether_header *eh; + void *l3hdr = NULL; /* Keep compiler happy. */ + struct tcphdr *th; +#ifdef INET6 + struct ip6_hdr *ip6 = NULL; /* Keep compiler happy. */ +#endif +#ifdef INET + struct ip *ip4 = NULL; /* Keep compiler happy. */ +#endif + uint32_t *ts_ptr; + int error, l, ts_failed = 0; + uint16_t tcp_data_len; + uint16_t csum; + + error = -1; + eh = mtod(m, struct ether_header *); + /* + * We must reset the other pointers since the mbuf + * we were pointing too is about to go away. + */ + switch (le->eh_type) { +#ifdef INET6 + case ETHERTYPE_IPV6: + l3hdr = ip6 = (struct ip6_hdr *)(eh + 1); + error = tcp_lro_rx_ipv6(lc, m, ip6, &th); + le->le_ip6 = ip6; + le->source_ip6 = ip6->ip6_src; + le->dest_ip6 = ip6->ip6_dst; + le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN - sizeof(*ip6); + break; +#endif +#ifdef INET + case ETHERTYPE_IP: + l3hdr = ip4 = (struct ip *)(eh + 1); + error = tcp_lro_rx_ipv4(lc, m, ip4, &th); + le->le_ip4 = ip4; + le->source_ip4 = ip4->ip_src.s_addr; + le->dest_ip4 = ip4->ip_dst.s_addr; + le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN; + break; #endif } + KASSERT(error == 0, ("%s: le=%p tcp_lro_rx_xxx failed\n", + __func__, le)); + ts_ptr = (uint32_t *)(th + 1); + l = (th->th_off << 2); + l -= sizeof(*th); + if (l != 0 && + (__predict_false(l != TCPOLEN_TSTAMP_APPA) || + (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16| + TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))) { + /* We have failed to find a timestamp some other option? */ + ts_failed = 1; + } + if ((l != 0) && (ts_failed == 0)) { + le->timestamp = 1; + le->tsval = ntohl(*(ts_ptr + 1)); + le->tsecr = *(ts_ptr + 2); + } else + le->timestamp = 0; + le->source_port = th->th_sport; + le->dest_port = th->th_dport; + /* Pull out the csum */ + tcp_data_len = m->m_pkthdr.lro_len; + le->next_seq = ntohl(th->th_seq) + tcp_data_len; + le->ack_seq = th->th_ack; + le->window = th->th_win; + csum = th->th_sum; + /* Setup the data pointers */ + le->m_head = m; + le->m_tail = m_last(m); + le->append_cnt = 0; + le->ulp_csum = tcp_lro_rx_csum_fixup(le, l3hdr, th, tcp_data_len, + ~csum); + le->append_cnt++; + th->th_sum = csum; /* Restore checksum on first packet. */ +} - le->m_head->m_pkthdr.lro_nsegs = le->append_cnt + 1; - (*lc->ifp->if_input)(lc->ifp, le->m_head); - lc->lro_queued += le->append_cnt + 1; +static void +tcp_push_and_replace(struct tcpcb *tp, struct lro_ctrl *lc, struct lro_entry *le, struct mbuf *m, int locked) +{ + /* + * Push up the stack the current le and replace + * it with m. + */ + struct mbuf *msave; + + /* Grab off the next and save it */ + msave = le->m_head->m_nextpkt; + le->m_head->m_nextpkt = NULL; + /* Now push out the old le entry */ + tcp_flush_out_le(tp, lc, le, locked); + /* + * Now to replace the data properly in the le + * we have to reset the tcp header and + * other fields. + */ + tcp_set_le_to_m(lc, le, m); + /* Restore the next list */ + m->m_nextpkt = msave; +} + +static void +tcp_lro_condense(struct tcpcb *tp, struct lro_ctrl *lc, struct lro_entry *le, int locked) +{ + /* + * Walk through the mbuf chain we + * have on tap and compress/condense + * as required. + */ + uint32_t *ts_ptr; + struct mbuf *m; + struct tcphdr *th; + uint16_t tcp_data_len, csum_upd; + int l; + + /* + * First we must check the lead (m_head) + * we must make sure that it is *not* + * something that should be sent up + * right away (sack etc). + */ +again: + + m = le->m_head->m_nextpkt; + if (m == NULL) { + /* Just the one left */ + return; + } + th = tcp_lro_get_th(le, le->m_head); + KASSERT(th != NULL, + ("le:%p m:%p th comes back NULL?", le, le->m_head)); + l = (th->th_off << 2); + l -= sizeof(*th); + ts_ptr = (uint32_t *)(th + 1); + if (l != 0 && (__predict_false(l != TCPOLEN_TSTAMP_APPA) || + (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16| + TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))) { + /* + * Its not the timestamp. We can't + * use this guy as the head. + */ + le->m_head->m_nextpkt = m->m_nextpkt; + tcp_push_and_replace(tp, lc, le, m, locked); + goto again; + } + if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0) { + /* + * Make sure that previously seen segements/ACKs are delivered + * before this segment, e.g. FIN. + */ + le->m_head->m_nextpkt = m->m_nextpkt; + tcp_push_and_replace(tp, lc, le, m, locked); + goto again; + } + while((m = le->m_head->m_nextpkt) != NULL) { + /* + * condense m into le, first + * pull m out of the list. + */ + le->m_head->m_nextpkt = m->m_nextpkt; + m->m_nextpkt = NULL; + /* Setup my data */ + tcp_data_len = m->m_pkthdr.lro_len; + th = tcp_lro_get_th(le, m); + KASSERT(th != NULL, + ("le:%p m:%p th comes back NULL?", le, m)); + ts_ptr = (uint32_t *)(th + 1); + l = (th->th_off << 2); + l -= sizeof(*th); + if (tp && locked) { + tcp_lro_log(tp, lc, le, m, 1, 0, 0, 0, 0); + } + if (le->append_cnt >= lc->lro_ackcnt_lim) { + if (tp && locked) { + tcp_lro_log(tp, lc, le, m, 2, 0, 0, 0, 0); + } + tcp_push_and_replace(tp, lc, le, m, locked); + goto again; + } + if (le->p_len > (lc->lro_length_lim - tcp_data_len)) { + /* Flush now if appending will result in overflow. */ + if (tp && locked) { + tcp_lro_log(tp, lc, le, m, 3, tcp_data_len, 0, 0, 0); + } + tcp_push_and_replace(tp, lc, le, m, locked); + goto again; + } + if (l != 0 && (__predict_false(l != TCPOLEN_TSTAMP_APPA) || + (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16| + TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))) { + /* + * Maybe a sack in the new one? We need to + * start all over after flushing the + * current le. We will go up to the beginning + * and flush it (calling the replace again possibly + * or just returning). + */ + tcp_push_and_replace(tp, lc, le, m, locked); + goto again; + } + if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0) { + tcp_push_and_replace(tp, lc, le, m, locked); + goto again; + } + if (l != 0) { + uint32_t tsval = ntohl(*(ts_ptr + 1)); + /* Make sure timestamp values are increasing. */ + if (TSTMP_GT(le->tsval, tsval)) { + tcp_push_and_replace(tp, lc, le, m, locked); + goto again; + } + le->tsval = tsval; + le->tsecr = *(ts_ptr + 2); + } + /* Try to append the new segment. */ + if (__predict_false(ntohl(th->th_seq) != le->next_seq || + (tcp_data_len == 0 && + le->ack_seq == th->th_ack && + le->window == th->th_win))) { + /* Out of order packet or duplicate ACK. */ + if (tp && locked) { + tcp_lro_log(tp, lc, le, m, 4, tcp_data_len, + ntohl(th->th_seq), + th->th_ack, + th->th_win); + } + tcp_push_and_replace(tp, lc, le, m, locked); + goto again; + } + if (tcp_data_len || SEQ_GT(ntohl(th->th_ack), ntohl(le->ack_seq))) { + le->next_seq += tcp_data_len; + le->ack_seq = th->th_ack; + le->window = th->th_win; + } else if (th->th_ack == le->ack_seq) { + le->window = WIN_MAX(le->window, th->th_win); + } + csum_upd = m->m_pkthdr.lro_csum; + le->ulp_csum += csum_upd; + if (tcp_data_len == 0) { + le->append_cnt++; + le->mbuf_cnt--; + if (tp && locked) { + tcp_lro_log(tp, lc, le, m, 5, tcp_data_len, + ntohl(th->th_seq), + th->th_ack, + th->th_win); + } + m_freem(m); + continue; + } + le->append_cnt++; + le->mbuf_appended++; + le->p_len += tcp_data_len; + /* + * Adjust the mbuf so that m_data points to the first byte of + * the ULP payload. Adjust the mbuf to avoid complications and + * append new segment to existing mbuf chain. + */ + m_adj(m, m->m_pkthdr.len - tcp_data_len); + if (tp && locked) { + tcp_lro_log(tp, lc, le, m, 6, tcp_data_len, + ntohl(th->th_seq), + th->th_ack, + th->th_win); + } + m_demote_pkthdr(m); + le->m_tail->m_next = m; + le->m_tail = m_last(m); + } +} + +#ifdef TCPHPTS +static void +tcp_queue_pkts(struct tcpcb *tp, struct lro_entry *le) +{ + if (tp->t_in_pkt == NULL) { + /* Nothing yet there */ + tp->t_in_pkt = le->m_head; + tp->t_tail_pkt = le->m_last_mbuf; + } else { + /* Already some there */ + tp->t_tail_pkt->m_nextpkt = le->m_head; + tp->t_tail_pkt = le->m_last_mbuf; + } + le->m_head = NULL; + le->m_last_mbuf = NULL; +} +#endif + +void +tcp_lro_flush(struct lro_ctrl *lc, struct lro_entry *le) +{ + struct tcpcb *tp = NULL; + int locked = 0; +#ifdef TCPHPTS + struct inpcb *inp = NULL; + int need_wakeup = 0, can_queue = 0; + struct epoch_tracker et; + + /* Now lets lookup the inp first */ + CURVNET_SET(lc->ifp->if_vnet); + if (tcplro_stacks_wanting_mbufq == 0) + goto skip_lookup; + INP_INFO_RLOCK_ET(&V_tcbinfo, et); + switch (le->eh_type) { +#ifdef INET6 + case ETHERTYPE_IPV6: + inp = in6_pcblookup(&V_tcbinfo, &le->source_ip6, + le->source_port, &le->dest_ip6,le->dest_port, + INPLOOKUP_WLOCKPCB, + lc->ifp); + break; +#endif +#ifdef INET + case ETHERTYPE_IP: + inp = in_pcblookup(&V_tcbinfo, le->le_ip4->ip_src, + le->source_port, le->le_ip4->ip_dst, le->dest_port, + INPLOOKUP_WLOCKPCB, + lc->ifp); + break; +#endif + } + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); + if (inp && ((inp->inp_flags & (INP_DROPPED|INP_TIMEWAIT)) || + (inp->inp_flags2 & INP_FREED))) { + /* We don't want this guy */ + INP_WUNLOCK(inp); + inp = NULL; + } + if (inp && (inp->inp_flags2 & INP_SUPPORTS_MBUFQ)) { + /* The transport supports mbuf queuing */ + can_queue = 1; + if (le->need_wakeup || + ((inp->inp_in_input == 0) && + ((inp->inp_flags2 & INP_MBUF_QUEUE_READY) == 0))) { + /* + * Either the transport is off on a keep-alive + * (it has the queue_ready flag clear and its + * not already been woken) or the entry has + * some urgent thing (FIN or possibly SACK blocks). + * This means we need to wake the transport up by + * putting it on the input pacer. + */ + need_wakeup = 1; + if ((inp->inp_flags2 & INP_DONT_SACK_QUEUE) && + (le->need_wakeup != 1)) { + /* + * Prohibited from a sack wakeup. + */ + need_wakeup = 0; + } + } + /* Do we need to be awoken due to lots of data or acks? */ + if ((le->tcp_tot_p_len >= lc->lro_length_lim) || + (le->mbuf_cnt >= lc->lro_ackcnt_lim)) + need_wakeup = 1; + } + if (inp) { + tp = intotcpcb(inp); + locked = 1; + } else + tp = NULL; + if (can_queue) { + counter_u64_add(tcp_inp_lro_direct_queue, 1); + tcp_lro_log(tp, lc, le, NULL, 22, need_wakeup, + inp->inp_flags2, inp->inp_in_input, le->need_wakeup); + tcp_queue_pkts(tp, le); + if (need_wakeup) { + /* + * We must get the guy to wakeup via + * hpts. + */ + counter_u64_add(tcp_inp_lro_wokeup_queue, 1); + if (le->need_wakeup) + counter_u64_add(tcp_inp_lro_sack_wake, 1); + tcp_queue_to_input(inp); + } + } + if (inp && (hold_lock_over_compress == 0)) { + /* Unlock it */ + locked = 0; + tp = NULL; + counter_u64_add(tcp_inp_lro_locks_taken, 1); + INP_WUNLOCK(inp); + } + if (can_queue == 0) { +skip_lookup: +#endif /* TCPHPTS */ + /* Old fashioned lro method */ + if (le->m_head != le->m_last_mbuf) { + counter_u64_add(tcp_inp_lro_compressed, 1); + tcp_lro_condense(tp, lc, le, locked); + } else + counter_u64_add(tcp_inp_lro_single_push, 1); + tcp_flush_out_le(tp, lc, le, locked); +#ifdef TCPHPTS + } + if (inp && locked) { + counter_u64_add(tcp_inp_lro_locks_taken, 1); + INP_WUNLOCK(inp); + } + CURVNET_RESTORE(); +#endif lc->lro_flushed++; bzero(le, sizeof(*le)); LIST_INSERT_HEAD(&lc->lro_free, le, next); @@ -539,65 +1120,12 @@ done: lc->lro_mbuf_count = 0; } -#ifdef INET6 -static int -tcp_lro_rx_ipv6(struct lro_ctrl *lc, struct mbuf *m, struct ip6_hdr *ip6, - struct tcphdr **th) -{ - - /* XXX-BZ we should check the flow-label. */ - - /* XXX-BZ We do not yet support ext. hdrs. */ - if (ip6->ip6_nxt != IPPROTO_TCP) - return (TCP_LRO_NOT_SUPPORTED); - - /* Find the TCP header. */ - *th = (struct tcphdr *)(ip6 + 1); - - return (0); -} -#endif - -#ifdef INET -static int -tcp_lro_rx_ipv4(struct lro_ctrl *lc, struct mbuf *m, struct ip *ip4, - struct tcphdr **th) +static void +lro_set_mtime(struct timeval *tv, struct timespec *ts) { - int csum_flags; - uint16_t csum; - - if (ip4->ip_p != IPPROTO_TCP) - return (TCP_LRO_NOT_SUPPORTED); - - /* Ensure there are no options. */ - if ((ip4->ip_hl << 2) != sizeof (*ip4)) - return (TCP_LRO_CANNOT); - - /* .. and the packet is not fragmented. */ - if (ip4->ip_off & htons(IP_MF|IP_OFFMASK)) - return (TCP_LRO_CANNOT); - - /* Legacy IP has a header checksum that needs to be correct. */ - csum_flags = m->m_pkthdr.csum_flags; - if (csum_flags & CSUM_IP_CHECKED) { - if (__predict_false((csum_flags & CSUM_IP_VALID) == 0)) { - lc->lro_bad_csum++; - return (TCP_LRO_CANNOT); - } - } else { - csum = in_cksum_hdr(ip4); - if (__predict_false((csum) != 0)) { - lc->lro_bad_csum++; - return (TCP_LRO_CANNOT); - } - } - - /* Find the TCP header (we assured there are no IP options). */ - *th = (struct tcphdr *)(ip4 + 1); - - return (0); + tv->tv_sec = ts->tv_sec; + tv->tv_usec = ts->tv_nsec / 1000; } -#endif static int tcp_lro_rx2(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum, int use_hash) @@ -615,12 +1143,17 @@ tcp_lro_rx2(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum, int use_hash) uint32_t *ts_ptr; tcp_seq seq; int error, ip_len, l; - uint16_t eh_type, tcp_data_len; + uint16_t eh_type, tcp_data_len, need_flush; struct lro_head *bucket; - int force_flush = 0; + struct timespec arrv; /* We expect a contiguous header [eh, ip, tcp]. */ - + if ((m->m_flags & (M_TSTMP_LRO|M_TSTMP)) == 0) { + /* If no hardware or arrival stamp on the packet add arrival */ + nanouptime(&arrv); + m->m_pkthdr.rcv_tstmp = (arrv.tv_sec * 1000000000) + arrv.tv_nsec; + m->m_flags |= M_TSTMP_LRO; + } eh = mtod(m, struct ether_header *); eh_type = ntohs(eh->ether_type); switch (eh_type) { @@ -679,49 +1212,35 @@ tcp_lro_rx2(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum, int use_hash) m_adj(m, -l); } - /* * Check TCP header constraints. */ - /* Ensure no bits set besides ACK or PSH. */ - if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0) { - if (th->th_flags & TH_SYN) - return (TCP_LRO_CANNOT); - /* - * Make sure that previously seen segements/ACKs are delivered - * before this segement, e.g. FIN. - */ - force_flush = 1; - } - - /* XXX-BZ We lose a ACK|PUSH flag concatenating multiple segments. */ - /* XXX-BZ Ideally we'd flush on PUSH? */ - - /* - * Check for timestamps. - * Since the only option we handle are timestamps, we only have to - * handle the simple case of aligned timestamps. - */ + if (th->th_flags & TH_SYN) + return (TCP_LRO_CANNOT); + if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0) + need_flush = 1; + else + need_flush = 0; l = (th->th_off << 2); + ts_ptr = (uint32_t *)(th + 1); tcp_data_len -= l; l -= sizeof(*th); - ts_ptr = (uint32_t *)(th + 1); if (l != 0 && (__predict_false(l != TCPOLEN_TSTAMP_APPA) || - (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16| - TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))) { - /* - * Make sure that previously seen segements/ACKs are delivered - * before this segement. + (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16| + TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))) { + /* + * We have an option besides Timestamps, maybe + * it is a sack (most likely) which means we + * will probably need to wake up a sleeper (if + * the guy does queueing). */ - force_flush = 1; + need_flush = 2; } /* If the driver did not pass in the checksum, set it now. */ if (csum == 0x0000) csum = th->th_sum; - seq = ntohl(th->th_seq); - if (!use_hash) { bucket = &lc->lro_hash[0]; } else if (M_HASHTYPE_ISHASH(m)) { @@ -738,13 +1257,13 @@ tcp_lro_rx2(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum, int use_hash) #ifdef INET6 case ETHERTYPE_IPV6: hash = ip6->ip6_src.s6_addr32[0] + - ip6->ip6_dst.s6_addr32[0]; + ip6->ip6_dst.s6_addr32[0]; hash += ip6->ip6_src.s6_addr32[1] + - ip6->ip6_dst.s6_addr32[1]; + ip6->ip6_dst.s6_addr32[1]; hash += ip6->ip6_src.s6_addr32[2] + - ip6->ip6_dst.s6_addr32[2]; + ip6->ip6_dst.s6_addr32[2]; hash += ip6->ip6_src.s6_addr32[3] + - ip6->ip6_dst.s6_addr32[3]; + ip6->ip6_dst.s6_addr32[3]; break; #endif default: @@ -766,9 +1285,9 @@ tcp_lro_rx2(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum, int use_hash) #ifdef INET6 case ETHERTYPE_IPV6: if (bcmp(&le->source_ip6, &ip6->ip6_src, - sizeof(struct in6_addr)) != 0 || + sizeof(struct in6_addr)) != 0 || bcmp(&le->dest_ip6, &ip6->ip6_dst, - sizeof(struct in6_addr)) != 0) + sizeof(struct in6_addr)) != 0) continue; break; #endif @@ -780,108 +1299,34 @@ tcp_lro_rx2(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum, int use_hash) break; #endif } - - if (force_flush) { - /* Timestamps mismatch; this is a FIN, etc */ - tcp_lro_active_remove(le); - tcp_lro_flush(lc, le); - return (TCP_LRO_CANNOT); - } - - /* Flush now if appending will result in overflow. */ - if (le->p_len > (lc->lro_length_lim - tcp_data_len)) { - tcp_lro_active_remove(le); - tcp_lro_flush(lc, le); - break; - } - - /* Try to append the new segment. */ - if (__predict_false(seq != le->next_seq || - (tcp_data_len == 0 && - le->ack_seq == th->th_ack && - le->window == th->th_win))) { - /* Out of order packet or duplicate ACK. */ - tcp_lro_active_remove(le); - tcp_lro_flush(lc, le); - return (TCP_LRO_CANNOT); - } - - if (l != 0) { - uint32_t tsval = ntohl(*(ts_ptr + 1)); - /* Make sure timestamp values are increasing. */ - /* XXX-BZ flip and use TSTMP_GEQ macro for this? */ - if (__predict_false(le->tsval > tsval || - *(ts_ptr + 2) == 0)) - return (TCP_LRO_CANNOT); - le->tsval = tsval; - le->tsecr = *(ts_ptr + 2); - } - if (tcp_data_len || SEQ_GT(ntohl(th->th_ack), ntohl(le->ack_seq))) { - le->next_seq += tcp_data_len; - le->ack_seq = th->th_ack; - le->window = th->th_win; - le->append_cnt++; - } else if (th->th_ack == le->ack_seq) { - le->window = WIN_MAX(le->window, th->th_win); - le->append_cnt++; + if (tcp_data_len || SEQ_GT(ntohl(th->th_ack), ntohl(le->ack_seq)) || + (th->th_ack == le->ack_seq)) { + m->m_pkthdr.lro_len = tcp_data_len; } else { /* no data and old ack */ - le->append_cnt++; - m_freem(m); - return (0); - } -#ifdef TCP_LRO_UPDATE_CSUM - le->ulp_csum += tcp_lro_rx_csum_fixup(le, l3hdr, th, - tcp_data_len, ~csum); -#endif - - if (tcp_data_len == 0) { m_freem(m); - /* - * Flush this LRO entry, if this ACK should not - * be further delayed. - */ - if (le->append_cnt >= lc->lro_ackcnt_lim) { - tcp_lro_active_remove(le); - tcp_lro_flush(lc, le); - } return (0); } - - le->p_len += tcp_data_len; - - /* - * Adjust the mbuf so that m_data points to the first byte of - * the ULP payload. Adjust the mbuf to avoid complications and - * append new segment to existing mbuf chain. - */ - m_adj(m, m->m_pkthdr.len - tcp_data_len); - m_demote_pkthdr(m); - - le->m_tail->m_next = m; - le->m_tail = m_last(m); - - /* - * If a possible next full length packet would cause an - * overflow, pro-actively flush now. - */ - if (le->p_len > (lc->lro_length_lim - lc->ifp->if_mtu)) { - tcp_lro_active_remove(le); - tcp_lro_flush(lc, le); - } else - getmicrotime(&le->mtime); - + if (need_flush) + le->need_wakeup = need_flush; + /* Save of the data only csum */ + m->m_pkthdr.rcvif = lc->ifp; + m->m_pkthdr.lro_csum = tcp_lro_rx_csum_fixup(le, l3hdr, th, + tcp_data_len, ~csum); + th->th_sum = csum; /* Restore checksum */ + /* Save off the tail I am appending too (prev) */ + le->m_prev_last = le->m_last_mbuf; + /* Mark me in the last spot */ + le->m_last_mbuf->m_nextpkt = m; + /* Now set the tail to me */ + le->m_last_mbuf = m; + le->mbuf_cnt++; + m->m_nextpkt = NULL; + /* Add to the total size of data */ + le->tcp_tot_p_len += tcp_data_len; + lro_set_mtime(&le->mtime, &arrv); return (0); } - - if (force_flush) { - /* - * Nothing to flush, but this segment can not be further - * aggregated/delayed. - */ - return (TCP_LRO_CANNOT); - } - /* Try to find an empty slot. */ if (LIST_EMPTY(&lc->lro_free)) return (TCP_LRO_NO_ENTRIES); @@ -890,7 +1335,7 @@ tcp_lro_rx2(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum, int use_hash) le = LIST_FIRST(&lc->lro_free); LIST_REMOVE(le, next); tcp_lro_active_insert(lc, bucket, le); - getmicrotime(&le->mtime); + lro_set_mtime(&le->mtime, &arrv); /* Start filling in details. */ switch (eh_type) { @@ -912,10 +1357,9 @@ tcp_lro_rx2(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum, int use_hash) le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN; break; #endif - } + } le->source_port = th->th_sport; le->dest_port = th->th_dport; - le->next_seq = seq + tcp_data_len; le->ack_seq = th->th_ack; le->window = th->th_win; @@ -924,26 +1368,31 @@ tcp_lro_rx2(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum, int use_hash) le->tsval = ntohl(*(ts_ptr + 1)); le->tsecr = *(ts_ptr + 2); } - -#ifdef TCP_LRO_UPDATE_CSUM - /* - * Do not touch the csum of the first packet. However save the - * "adjusted" checksum of just the source and destination addresses, - * the next header and the TCP payload. The length and TCP header - * parts may change, so we remove those from the saved checksum and - * re-add with final values on tcp_lro_flush() if needed. - */ KASSERT(le->ulp_csum == 0, ("%s: le=%p le->ulp_csum=0x%04x\n", - __func__, le, le->ulp_csum)); + __func__, le, le->ulp_csum)); + le->append_cnt = 0; le->ulp_csum = tcp_lro_rx_csum_fixup(le, l3hdr, th, tcp_data_len, - ~csum); - th->th_sum = csum; /* Restore checksum on first packet. */ -#endif - + ~csum); + le->append_cnt++; + th->th_sum = csum; /* Restore checksum */ le->m_head = m; + m->m_pkthdr.rcvif = lc->ifp; + le->mbuf_cnt = 1; + if (need_flush) + le->need_wakeup = need_flush; + else + le->need_wakeup = 0; le->m_tail = m_last(m); - + le->m_last_mbuf = m; + m->m_nextpkt = NULL; + le->m_prev_last = NULL; + /* + * We keep the total size here for cross checking when we may need + * to flush/wakeup in the MBUF_QUEUE case. + */ + le->tcp_tot_p_len = tcp_data_len; + m->m_pkthdr.lro_len = tcp_data_len; return (0); } @@ -957,6 +1406,8 @@ tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum) void tcp_lro_queue_mbuf(struct lro_ctrl *lc, struct mbuf *mb) { + struct timespec arrv; + /* sanity checks */ if (__predict_false(lc->ifp == NULL || lc->lro_mbuf_data == NULL || lc->lro_mbuf_max == 0)) { @@ -973,7 +1424,15 @@ tcp_lro_queue_mbuf(struct lro_ctrl *lc, struct mbuf *mb) (*lc->ifp->if_input) (lc->ifp, mb); return; } - + /* Arrival Stamp the packet */ + + if ((mb->m_flags & M_TSTMP) == 0) { + /* If no hardware or arrival stamp on the packet add arrival */ + nanouptime(&arrv); + mb->m_pkthdr.rcv_tstmp = ((arrv.tv_sec * 1000000000) + + arrv.tv_nsec); + mb->m_flags |= M_TSTMP_LRO; + } /* create sequence number */ lc->lro_mbuf_data[lc->lro_mbuf_count].seq = (((uint64_t)M_HASHTYPE_GET(mb)) << 56) | diff --git a/freebsd/sys/netinet/tcp_lro.h b/freebsd/sys/netinet/tcp_lro.h index 855f4ee4..1c6d2dd5 100644 --- a/freebsd/sys/netinet/tcp_lro.h +++ b/freebsd/sys/netinet/tcp_lro.h @@ -45,6 +45,8 @@ struct lro_entry { LIST_ENTRY(lro_entry) hash_next; struct mbuf *m_head; struct mbuf *m_tail; + struct mbuf *m_last_mbuf; + struct mbuf *m_prev_last; union { struct ip *ip4; struct ip6_hdr *ip6; @@ -67,10 +69,22 @@ struct lro_entry { uint32_t ack_seq; /* tcp_seq */ uint32_t tsval; uint32_t tsecr; + uint32_t tcp_tot_p_len; /* TCP payload length of chain */ uint16_t window; uint16_t timestamp; /* flag, not a TCP hdr field. */ + uint16_t need_wakeup; + uint16_t mbuf_cnt; /* Count of mbufs collected see note */ + uint16_t mbuf_appended; struct timeval mtime; }; +/* + * Note: The mbuf_cnt field tracks our number of mbufs added to the m_next + * list. Each mbuf counted can have data and of course it will + * have an ack as well (by defintion any inbound tcp segment will + * have an ack value. We use this count to tell us how many ACK's + * are present for our ack-count threshold. If we exceed that or + * the data threshold we will wake up the endpoint. + */ LIST_HEAD(lro_head, lro_entry); #define le_ip4 leip.ip4 @@ -115,6 +129,8 @@ void tcp_lro_flush(struct lro_ctrl *, struct lro_entry *); void tcp_lro_flush_all(struct lro_ctrl *); int tcp_lro_rx(struct lro_ctrl *, struct mbuf *, uint32_t); void tcp_lro_queue_mbuf(struct lro_ctrl *, struct mbuf *); +void tcp_lro_reg_mbufq(void); +void tcp_lro_dereg_mbufq(void); #define TCP_LRO_NO_ENTRIES -2 #define TCP_LRO_CANNOT -1 diff --git a/freebsd/sys/netinet/tcp_offload.c b/freebsd/sys/netinet/tcp_offload.c index f3ab3b50..89b21859 100644 --- a/freebsd/sys/netinet/tcp_offload.c +++ b/freebsd/sys/netinet/tcp_offload.c @@ -35,7 +35,7 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/systm.h> -#include <sys/types.h> +#include <sys/eventhandler.h> #include <sys/mbuf.h> #include <sys/socket.h> #include <sys/socketvar.h> diff --git a/freebsd/sys/netinet/tcp_output.c b/freebsd/sys/netinet/tcp_output.c index b641f04c..ff09fa31 100644 --- a/freebsd/sys/netinet/tcp_output.c +++ b/freebsd/sys/netinet/tcp_output.c @@ -39,6 +39,7 @@ __FBSDID("$FreeBSD$"); #include <rtems/bsd/local/opt_inet.h> #include <rtems/bsd/local/opt_inet6.h> #include <rtems/bsd/local/opt_ipsec.h> +#include <rtems/bsd/local/opt_kern_tls.h> #include <rtems/bsd/local/opt_tcpdebug.h> #include <sys/param.h> @@ -48,6 +49,9 @@ __FBSDID("$FreeBSD$"); #include <sys/hhook.h> #endif #include <sys/kernel.h> +#ifdef KERN_TLS +#include <sys/ktls.h> +#endif #include <sys/lock.h> #include <sys/mbuf.h> #include <sys/mutex.h> @@ -193,7 +197,7 @@ tcp_output(struct tcpcb *tp) uint32_t recwin, sendwin; int off, flags, error = 0; /* Keep compiler happy */ u_int if_hw_tsomaxsegcount = 0; - u_int if_hw_tsomaxsegsize; + u_int if_hw_tsomaxsegsize = 0; struct mbuf *m; struct ip *ip = NULL; #ifdef TCPDEBUG @@ -221,6 +225,11 @@ tcp_output(struct tcpcb *tp) isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; #endif +#ifdef KERN_TLS + const bool hw_tls = (so->so_snd.sb_flags & SB_TLS_IFNET) != 0; +#else + const bool hw_tls = false; +#endif INP_WLOCK_ASSERT(tp->t_inpcb); @@ -658,7 +667,8 @@ after_sack_rexmit: if (adv >= (int32_t)(2 * tp->t_maxseg) && (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) || recwin <= (so->so_rcv.sb_hiwat / 8) || - so->so_rcv.sb_hiwat <= 8 * tp->t_maxseg)) + so->so_rcv.sb_hiwat <= 8 * tp->t_maxseg || + adv >= TCP_MAXWIN << tp->rcv_scale)) goto send; if (2 * adv >= (int32_t)so->so_rcv.sb_hiwat) goto send; @@ -1001,7 +1011,7 @@ send: * to the offset in the socket buffer chain. */ mb = sbsndptr_noadv(&so->so_snd, off, &moff); - if (len <= MHLEN - hdrlen - max_linkhdr) { + if (len <= MHLEN - hdrlen - max_linkhdr && !hw_tls) { m_copydata(mb, moff, len, mtod(m, caddr_t) + hdrlen); if (SEQ_LT(tp->snd_nxt, tp->snd_max)) @@ -1014,7 +1024,7 @@ send: msb = &so->so_snd; m->m_next = tcp_m_copym(mb, moff, &len, if_hw_tsomaxsegcount, - if_hw_tsomaxsegsize, msb); + if_hw_tsomaxsegsize, msb, hw_tls); if (len <= (tp->t_maxseg - optlen)) { /* * Must have ran out of mbufs for the copy @@ -1285,15 +1295,9 @@ send: m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen; } -#if defined(IPSEC) || defined(IPSEC_SUPPORT) - KASSERT(len + hdrlen + ipoptlen - ipsec_optlen == m_length(m, NULL), - ("%s: mbuf chain shorter than expected: %d + %u + %u - %u != %u", - __func__, len, hdrlen, ipoptlen, ipsec_optlen, m_length(m, NULL))); -#else - KASSERT(len + hdrlen + ipoptlen == m_length(m, NULL), - ("%s: mbuf chain shorter than expected: %d + %u + %u != %u", - __func__, len, hdrlen, ipoptlen, m_length(m, NULL))); -#endif + KASSERT(len + hdrlen == m_length(m, NULL), + ("%s: mbuf chain shorter than expected: %d + %u != %u", + __func__, len, hdrlen, m_length(m, NULL))); #ifdef TCP_HHOOK /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */ @@ -1515,7 +1519,13 @@ timer: if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) tp->snd_max = tp->snd_nxt + xlen; } - + if ((error == 0) && + (TCPS_HAVEESTABLISHED(tp->t_state) && + (tp->t_flags & TF_SACK_PERMIT) && + tp->rcv_numsacks > 0)) { + /* Clean up any DSACK's sent */ + tcp_clean_dsack_blocks(tp); + } if (error) { /* Record the error. */ TCP_LOG_EVENT(tp, NULL, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, @@ -1817,8 +1827,12 @@ tcp_addoptions(struct tcpopt *to, u_char *optp) */ struct mbuf * tcp_m_copym(struct mbuf *m, int32_t off0, int32_t *plen, - int32_t seglimit, int32_t segsize, struct sockbuf *sb) + int32_t seglimit, int32_t segsize, struct sockbuf *sb, bool hw_tls) { +#ifdef KERN_TLS + struct ktls_session *tls, *ntls; + struct mbuf *start; +#endif struct mbuf *n, **np; struct mbuf *top; int32_t off = off0; @@ -1850,6 +1864,13 @@ tcp_m_copym(struct mbuf *m, int32_t off0, int32_t *plen, np = ⊤ top = NULL; pkthdrlen = NULL; +#ifdef KERN_TLS + if (m->m_flags & M_NOMAP) + tls = m->m_ext.ext_pgs->tls; + else + tls = NULL; + start = m; +#endif while (len > 0) { if (m == NULL) { KASSERT(len == M_COPYALL, @@ -1859,6 +1880,38 @@ tcp_m_copym(struct mbuf *m, int32_t off0, int32_t *plen, *pkthdrlen = len_cp; break; } +#ifdef KERN_TLS + if (hw_tls) { + if (m->m_flags & M_NOMAP) + ntls = m->m_ext.ext_pgs->tls; + else + ntls = NULL; + + /* + * Avoid mixing TLS records with handshake + * data or TLS records from different + * sessions. + */ + if (tls != ntls) { + MPASS(m != start); + *plen = len_cp; + if (pkthdrlen != NULL) + *pkthdrlen = len_cp; + break; + } + + /* + * Don't end a send in the middle of a TLS + * record if it spans multiple TLS records. + */ + if (tls != NULL && (m != start) && len < m->m_len) { + *plen = len_cp; + if (pkthdrlen != NULL) + *pkthdrlen = len_cp; + break; + } + } +#endif mlen = min(len, m->m_len - off); if (seglimit) { /* diff --git a/freebsd/sys/netinet/tcp_reass.c b/freebsd/sys/netinet/tcp_reass.c index fea3b716..9d4bf3be 100644 --- a/freebsd/sys/netinet/tcp_reass.c +++ b/freebsd/sys/netinet/tcp_reass.c @@ -40,6 +40,10 @@ __FBSDID("$FreeBSD$"); #include <rtems/bsd/local/opt_inet6.h> #include <rtems/bsd/local/opt_tcpdebug.h> +/* For debugging we want counters and BB logging */ +/* #define TCP_REASS_COUNTERS 1 */ +/* #define TCP_REASS_LOGGING 1 */ + #include <sys/param.h> #include <sys/kernel.h> #include <sys/eventhandler.h> @@ -74,8 +78,10 @@ __FBSDID("$FreeBSD$"); #include <netinet/tcp_seq.h> #include <netinet/tcp_timer.h> #include <netinet/tcp_var.h> +#ifdef TCP_REASS_LOGGING #include <netinet/tcp_log_buf.h> #include <netinet/tcp_hpts.h> +#endif #include <netinet6/tcp6_var.h> #include <netinet/tcpip.h> #ifdef TCPDEBUG @@ -94,10 +100,6 @@ __FBSDID("$FreeBSD$"); #define TCP_R_LOG_DUMP 10 #define TCP_R_LOG_TRIM 11 -/* For debugging we want counters and BB logging */ -/* #define TCP_REASS_COUNTERS 1 */ -/* #define TCP_REASS_LOGGING 1 */ - static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0, "TCP Segment Reassembly Queue"); @@ -542,6 +544,10 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, tcp_seq *seq_start, * and should be rewritten (see NetBSD for optimizations). */ + KASSERT(th == NULL || (seq_start != NULL && tlenp != NULL), + ("tcp_reass called with illegal parameter combination " + "(tp=%p, th=%p, seq_start=%p, tlenp=%p, m=%p)", + tp, th, seq_start, tlenp, m)); /* * Call with th==NULL after become established to * force pre-ESTABLISHED data up to user socket. @@ -581,7 +587,8 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, tcp_seq *seq_start, */ lenofoh = tcp_reass_overhead_of_chain(m, &mlast); sb = &tp->t_inpcb->inp_socket->so_rcv; - if ((sb->sb_mbcnt + tp->t_segqmbuflen + lenofoh) > sb->sb_mbmax) { + if ((th->th_seq != tp->rcv_nxt || !TCPS_HAVEESTABLISHED(tp->t_state)) && + (sb->sb_mbcnt + tp->t_segqmbuflen + lenofoh) > sb->sb_mbmax) { /* No room */ TCPSTAT_INC(tcps_rcvreassfull); #ifdef TCP_REASS_COUNTERS @@ -590,6 +597,11 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, tcp_seq *seq_start, #ifdef TCP_REASS_LOGGING tcp_log_reassm(tp, NULL, NULL, th->th_seq, lenofoh, TCP_R_LOG_LIMIT_REACHED, 0); #endif + if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL, NULL))) { + log(LOG_DEBUG, "%s; %s: mbuf count limit reached, " + "segment dropped\n", s, __func__); + free(s, M_TCPLOG); + } m_freem(m); *tlenp = 0; #ifdef TCP_REASS_LOGGING @@ -938,6 +950,20 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, tcp_seq *seq_start, * is understood. */ new_entry: + if (th->th_seq == tp->rcv_nxt && TCPS_HAVEESTABLISHED(tp->t_state)) { + tp->rcv_nxt += *tlenp; + flags = th->th_flags & TH_FIN; + TCPSTAT_INC(tcps_rcvoopack); + TCPSTAT_ADD(tcps_rcvoobyte, *tlenp); + SOCKBUF_LOCK(&so->so_rcv); + if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { + m_freem(m); + } else { + sbappendstream_locked(&so->so_rcv, m, 0); + } + sorwakeup_locked(so); + return (flags); + } if (tcp_new_limits) { if ((tp->t_segqlen > tcp_reass_queue_guard) && (*tlenp < MSIZE)) { @@ -962,9 +988,7 @@ new_entry: return (0); } } else { - - if ((th->th_seq != tp->rcv_nxt || !TCPS_HAVEESTABLISHED(tp->t_state)) && - tp->t_segqlen >= min((so->so_rcv.sb_hiwat / tp->t_maxseg) + 1, + if (tp->t_segqlen >= min((so->so_rcv.sb_hiwat / tp->t_maxseg) + 1, tcp_reass_maxqueuelen)) { TCPSTAT_INC(tcps_rcvreassfull); *tlenp = 0; @@ -1044,12 +1068,20 @@ present: } else { #ifdef TCP_REASS_LOGGING tcp_reass_log_new_in(tp, q->tqe_start, q->tqe_len, q->tqe_m, TCP_R_LOG_READ, q); - tcp_log_reassm(tp, q, NULL, th->th_seq, *tlenp, TCP_R_LOG_READ, 1); + if (th != NULL) { + tcp_log_reassm(tp, q, NULL, th->th_seq, *tlenp, TCP_R_LOG_READ, 1); + } else { + tcp_log_reassm(tp, q, NULL, 0, 0, TCP_R_LOG_READ, 1); + } #endif sbappendstream_locked(&so->so_rcv, q->tqe_m, 0); } #ifdef TCP_REASS_LOGGING - tcp_log_reassm(tp, q, NULL, th->th_seq, *tlenp, TCP_R_LOG_READ, 2); + if (th != NULL) { + tcp_log_reassm(tp, q, NULL, th->th_seq, *tlenp, TCP_R_LOG_READ, 2); + } else { + tcp_log_reassm(tp, q, NULL, 0, 0, TCP_R_LOG_READ, 2); + } #endif KASSERT(tp->t_segqmbuflen >= q->tqe_mbuf_cnt, ("tp:%p seg queue goes negative", tp)); @@ -1065,7 +1097,11 @@ present: tp, &tp->t_segq, tp->t_segqmbuflen); #else #ifdef TCP_REASS_LOGGING - tcp_log_reassm(tp, NULL, NULL, th->th_seq, *tlenp, TCP_R_LOG_ZERO, 0); + if (th != NULL) { + tcp_log_reassm(tp, NULL, NULL, th->th_seq, *tlenp, TCP_R_LOG_ZERO, 0); + } else { + tcp_log_reassm(tp, NULL, NULL, 0, 0, TCP_R_LOG_ZERO, 0); + } #endif tp->t_segqmbuflen = 0; #endif diff --git a/freebsd/sys/netinet/tcp_sack.c b/freebsd/sys/netinet/tcp_sack.c index 91c032c8..6d6198dd 100644 --- a/freebsd/sys/netinet/tcp_sack.c +++ b/freebsd/sys/netinet/tcp_sack.c @@ -151,6 +151,108 @@ SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, globalholes, CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(tcp_sack_globalholes), 0, "Global number of TCP SACK holes currently allocated"); + +/* + * This function will find overlaps with the currently stored sackblocks + * and add any overlap as a dsack block upfront + */ +void +tcp_update_dsack_list(struct tcpcb *tp, tcp_seq rcv_start, tcp_seq rcv_end) +{ + struct sackblk head_blk,mid_blk,saved_blks[MAX_SACK_BLKS]; + int i, j, n, identical; + tcp_seq start, end; + + INP_WLOCK_ASSERT(tp->t_inpcb); + + KASSERT(SEQ_LT(rcv_start, rcv_end), ("rcv_start < rcv_end")); + + if (tp->t_inpcb->inp_socket->so_options & SO_DEBUG) { + log(LOG_DEBUG, "\nDSACK update: %d..%d, rcv_nxt: %u\n", + rcv_start, rcv_end, tp->rcv_nxt); + } + + if (SEQ_LT(rcv_end, tp->rcv_nxt) || + ((rcv_end == tp->rcv_nxt) && + (tp->rcv_numsacks > 0 ) && + (tp->sackblks[0].end == tp->rcv_nxt))) { + saved_blks[0].start = rcv_start; + saved_blks[0].end = rcv_end; + } else { + saved_blks[0].start = saved_blks[0].end = 0; + } + + head_blk.start = head_blk.end = 0; + mid_blk.start = rcv_start; + mid_blk.end = rcv_end; + identical = 0; + + for (i = 0; i < tp->rcv_numsacks; i++) { + start = tp->sackblks[i].start; + end = tp->sackblks[i].end; + if (SEQ_LT(rcv_end, start)) { + /* pkt left to sack blk */ + continue; + } + if (SEQ_GT(rcv_start, end)) { + /* pkt right to sack blk */ + continue; + } + if (SEQ_GT(tp->rcv_nxt, end)) { + if ((SEQ_MAX(rcv_start, start) != SEQ_MIN(rcv_end, end)) && + (SEQ_GT(head_blk.start, SEQ_MAX(rcv_start, start)) || + (head_blk.start == head_blk.end))) { + head_blk.start = SEQ_MAX(rcv_start, start); + head_blk.end = SEQ_MIN(rcv_end, end); + } + continue; + } + if (((head_blk.start == head_blk.end) || + SEQ_LT(start, head_blk.start)) && + (SEQ_GT(end, rcv_start) && + SEQ_LEQ(start, rcv_end))) { + head_blk.start = start; + head_blk.end = end; + } + mid_blk.start = SEQ_MIN(mid_blk.start, start); + mid_blk.end = SEQ_MAX(mid_blk.end, end); + if ((mid_blk.start == start) && + (mid_blk.end == end)) + identical = 1; + } + if (SEQ_LT(head_blk.start, head_blk.end)) { + /* store overlapping range */ + saved_blks[0].start = SEQ_MAX(rcv_start, head_blk.start); + saved_blks[0].end = SEQ_MIN(rcv_end, head_blk.end); + } + n = 1; + /* + * Second, if not ACKed, store the SACK block that + * overlaps with the DSACK block unless it is identical + */ + if ((SEQ_LT(tp->rcv_nxt, mid_blk.end) && + !((mid_blk.start == saved_blks[0].start) && + (mid_blk.end == saved_blks[0].end))) || + identical == 1) { + saved_blks[n].start = mid_blk.start; + saved_blks[n++].end = mid_blk.end; + } + for (j = 0; (j < tp->rcv_numsacks) && (n < MAX_SACK_BLKS); j++) { + if (((SEQ_LT(tp->sackblks[j].end, mid_blk.start) || + SEQ_GT(tp->sackblks[j].start, mid_blk.end)) && + (SEQ_GT(tp->sackblks[j].start, tp->rcv_nxt)))) + saved_blks[n++] = tp->sackblks[j]; + } + j = 0; + for (i = 0; i < n; i++) { + /* we can end up with a stale inital entry */ + if (SEQ_LT(saved_blks[i].start, saved_blks[i].end)) { + tp->sackblks[j++] = saved_blks[i]; + } + } + tp->rcv_numsacks = j; +} + /* * This function is called upon receipt of new valid data (while not in * header prediction mode), and it updates the ordered list of sacks. @@ -170,11 +272,18 @@ tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_start, tcp_seq rcv_end) INP_WLOCK_ASSERT(tp->t_inpcb); /* Check arguments. */ - KASSERT(SEQ_LT(rcv_start, rcv_end), ("rcv_start < rcv_end")); - - /* SACK block for the received segment. */ - head_blk.start = rcv_start; - head_blk.end = rcv_end; + KASSERT(SEQ_LEQ(rcv_start, rcv_end), ("rcv_start <= rcv_end")); + + if ((rcv_start == rcv_end) && + (tp->rcv_numsacks >= 1) && + (rcv_end == tp->sackblks[0].end)) { + /* retaining DSACK block below rcv_nxt (todrop) */ + head_blk = tp->sackblks[0]; + } else { + /* SACK block for the received segment. */ + head_blk.start = rcv_start; + head_blk.end = rcv_end; + } /* * Merge updated SACK blocks into head_blk, and save unchanged SACK @@ -195,12 +304,54 @@ tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_start, tcp_seq rcv_end) * Merge this SACK block into head_blk. This SACK * block itself will be discarded. */ - if (SEQ_GT(head_blk.start, start)) + /* + * |-| + * |---| merge + * + * |-| + * |---| merge + * + * |-----| + * |-| DSACK smaller + * + * |-| + * |-----| DSACK smaller + */ + if (head_blk.start == end) head_blk.start = start; - if (SEQ_LT(head_blk.end, end)) + else if (head_blk.end == start) head_blk.end = end; + else { + if (SEQ_LT(head_blk.start, start)) { + tcp_seq temp = start; + start = head_blk.start; + head_blk.start = temp; + } + if (SEQ_GT(head_blk.end, end)) { + tcp_seq temp = end; + end = head_blk.end; + head_blk.end = temp; + } + if ((head_blk.start != start) || + (head_blk.end != end)) { + if ((num_saved >= 1) && + SEQ_GEQ(saved_blks[num_saved-1].start, start) && + SEQ_LEQ(saved_blks[num_saved-1].end, end)) + num_saved--; + saved_blks[num_saved].start = start; + saved_blks[num_saved].end = end; + num_saved++; + } + } } else { /* + * This block supercedes the prior block + */ + if ((num_saved >= 1) && + SEQ_GEQ(saved_blks[num_saved-1].start, start) && + SEQ_LEQ(saved_blks[num_saved-1].end, end)) + num_saved--; + /* * Save this SACK block. */ saved_blks[num_saved].start = start; @@ -213,7 +364,7 @@ tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_start, tcp_seq rcv_end) * Update SACK list in tp->sackblks[]. */ num_head = 0; - if (SEQ_GT(head_blk.start, tp->rcv_nxt)) { + if (SEQ_LT(rcv_start, rcv_end)) { /* * The received data segment is an out-of-order segment. Put * head_blk at the top of SACK list. @@ -227,6 +378,10 @@ tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_start, tcp_seq rcv_end) if (num_saved >= MAX_SACK_BLKS) num_saved--; } + if ((rcv_start == rcv_end) && + (rcv_start == tp->sackblks[0].end)) { + num_head = 1; + } if (num_saved > 0) { /* * Copy the saved SACK blocks back. @@ -239,6 +394,45 @@ tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_start, tcp_seq rcv_end) tp->rcv_numsacks = num_head + num_saved; } +void +tcp_clean_dsack_blocks(struct tcpcb *tp) +{ + struct sackblk saved_blks[MAX_SACK_BLKS]; + int num_saved, i; + + INP_WLOCK_ASSERT(tp->t_inpcb); + /* + * Clean up any DSACK blocks that + * are in our queue of sack blocks. + * + */ + num_saved = 0; + for (i = 0; i < tp->rcv_numsacks; i++) { + tcp_seq start = tp->sackblks[i].start; + tcp_seq end = tp->sackblks[i].end; + if (SEQ_GEQ(start, end) || SEQ_LEQ(start, tp->rcv_nxt)) { + /* + * Discard this D-SACK block. + */ + continue; + } + /* + * Save this SACK block. + */ + saved_blks[num_saved].start = start; + saved_blks[num_saved].end = end; + num_saved++; + } + if (num_saved > 0) { + /* + * Copy the saved SACK blocks back. + */ + bcopy(saved_blks, &tp->sackblks[0], + sizeof(struct sackblk) * num_saved); + } + tp->rcv_numsacks = num_saved; +} + /* * Delete all receiver-side SACK information. */ diff --git a/freebsd/sys/netinet/tcp_subr.c b/freebsd/sys/netinet/tcp_subr.c index 52d2ca2a..223f33f7 100644 --- a/freebsd/sys/netinet/tcp_subr.c +++ b/freebsd/sys/netinet/tcp_subr.c @@ -43,6 +43,7 @@ __FBSDID("$FreeBSD$"); #include <rtems/bsd/local/opt_inet.h> #include <rtems/bsd/local/opt_inet6.h> #include <rtems/bsd/local/opt_ipsec.h> +#include <rtems/bsd/local/opt_kern_tls.h> #include <rtems/bsd/local/opt_tcpdebug.h> #include <sys/param.h> @@ -56,6 +57,9 @@ __FBSDID("$FreeBSD$"); #ifdef TCP_HHOOK #include <sys/khelp.h> #endif +#ifdef KERN_TLS +#include <sys/ktls.h> +#endif #include <sys/sysctl.h> #include <sys/jail.h> #include <sys/malloc.h> @@ -201,6 +205,11 @@ SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_rfc1323), 0, "Enable rfc1323 (high performance TCP) extensions"); +VNET_DEFINE(int, tcp_ts_offset_per_conn) = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, ts_offset_per_conn, CTLFLAG_VNET | CTLFLAG_RW, + &VNET_NAME(tcp_ts_offset_per_conn), 0, + "Initialize TCP timestamps per connection instead of per host pair"); + static int tcp_log_debug = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_debug, CTLFLAG_RW, &tcp_log_debug, 0, "Log errors caused by incoming TCP segments"); @@ -263,21 +272,10 @@ static struct tcp_function_block tcp_def_funcblk = { .tfb_tcp_fb_fini = tcp_default_fb_fini, }; -int t_functions_inited = 0; static int tcp_fb_cnt = 0; struct tcp_funchead t_functions; static struct tcp_function_block *tcp_func_set_ptr = &tcp_def_funcblk; -static void -init_tcp_functions(void) -{ - if (t_functions_inited == 0) { - TAILQ_INIT(&t_functions); - rw_init_flags(&tcp_function_lock, "tcp_func_lock" , 0); - t_functions_inited = 1; - } -} - static struct tcp_function_block * find_tcp_functions_locked(struct tcp_function_set *fs) { @@ -565,13 +563,10 @@ sysctl_net_inet_list_func_info(SYSCTL_HANDLER_ARGS) bzero(&tfi, sizeof(tfi)); tfi.tfi_refcnt = f->tf_fb->tfb_refcnt; tfi.tfi_id = f->tf_fb->tfb_id; - (void)strncpy(tfi.tfi_alias, f->tf_name, - TCP_FUNCTION_NAME_LEN_MAX); - tfi.tfi_alias[TCP_FUNCTION_NAME_LEN_MAX - 1] = '\0'; - (void)strncpy(tfi.tfi_name, - f->tf_fb->tfb_tcp_block_name, - TCP_FUNCTION_NAME_LEN_MAX); - tfi.tfi_name[TCP_FUNCTION_NAME_LEN_MAX - 1] = '\0'; + (void)strlcpy(tfi.tfi_alias, f->tf_name, + sizeof(tfi.tfi_alias)); + (void)strlcpy(tfi.tfi_name, + f->tf_fb->tfb_tcp_block_name, sizeof(tfi.tfi_name)); error = SYSCTL_OUT(req, &tfi, sizeof(tfi)); /* * Don't stop on error, as that is the @@ -787,10 +782,9 @@ register_tcp_functions_as_names(struct tcp_function_block *blk, int wait, KASSERT(names != NULL && *num_names > 0, ("%s: Called with 0-length name list", __func__)); KASSERT(names != NULL, ("%s: Called with NULL name list", __func__)); + KASSERT(rw_initialized(&tcp_function_lock), + ("%s: called too early", __func__)); - if (t_functions_inited == 0) { - init_tcp_functions(); - } if ((blk->tfb_tcp_output == NULL) || (blk->tfb_tcp_do_segment == NULL) || (blk->tfb_tcp_ctloutput == NULL) || @@ -819,8 +813,12 @@ register_tcp_functions_as_names(struct tcp_function_block *blk, int wait, } } + if (blk->tfb_flags & TCP_FUNC_BEING_REMOVED) { + *num_names = 0; + return (EINVAL); + } + refcount_init(&blk->tfb_refcnt, 0); - blk->tfb_flags = 0; blk->tfb_id = atomic_fetchadd_int(&next_tcp_stack_id, 1); for (i = 0; i < *num_names; i++) { n = malloc(sizeof(struct tcp_function), M_TCPFUNCTIONS, wait); @@ -830,9 +828,8 @@ register_tcp_functions_as_names(struct tcp_function_block *blk, int wait, } n->tf_fb = blk; - (void)strncpy(fs.function_set_name, names[i], - TCP_FUNCTION_NAME_LEN_MAX); - fs.function_set_name[TCP_FUNCTION_NAME_LEN_MAX - 1] = '\0'; + (void)strlcpy(fs.function_set_name, names[i], + sizeof(fs.function_set_name)); rw_wlock(&tcp_function_lock); if (find_tcp_functions_locked(&fs) != NULL) { /* Duplicate name space not allowed */ @@ -841,8 +838,7 @@ register_tcp_functions_as_names(struct tcp_function_block *blk, int wait, error = EALREADY; goto cleanup; } - (void)strncpy(n->tf_name, names[i], TCP_FUNCTION_NAME_LEN_MAX); - n->tf_name[TCP_FUNCTION_NAME_LEN_MAX - 1] = '\0'; + (void)strlcpy(n->tf_name, names[i], sizeof(n->tf_name)); TAILQ_INSERT_TAIL(&t_functions, n, tf_next); tcp_fb_cnt++; rw_wunlock(&tcp_function_lock); @@ -929,8 +925,8 @@ deregister_tcp_functions(struct tcp_function_block *blk, bool quiesce, bool force) { struct tcp_function *f; - - if (strcmp(blk->tfb_tcp_block_name, "default") == 0) { + + if (blk == &tcp_def_funcblk) { /* You can't un-register the default */ return (EPERM); } @@ -1088,6 +1084,9 @@ tcp_init(void) tcp_keepintvl = TCPTV_KEEPINTVL; tcp_maxpersistidle = TCPTV_KEEP_IDLE; tcp_msl = TCPTV_MSL; + tcp_rexmit_initial = TCPTV_RTOBASE; + if (tcp_rexmit_initial < 1) + tcp_rexmit_initial = 1; tcp_rexmit_min = TCPTV_MIN; if (tcp_rexmit_min < 1) tcp_rexmit_min = 1; @@ -1096,8 +1095,10 @@ tcp_init(void) tcp_rexmit_slop = TCPTV_CPU_VAR; tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT; tcp_tcbhashsize = hashsize; + /* Setup the tcp function block list */ - init_tcp_functions(); + TAILQ_INIT(&t_functions); + rw_init(&tcp_function_lock, "tcp_func_lock"); register_tcp_functions(&tcp_def_funcblk, M_WAITOK); #ifdef TCP_BLACKBOX /* Initialize the TCP logging data. */ @@ -1130,6 +1131,13 @@ tcp_init(void) SHUTDOWN_PRI_DEFAULT); EVENTHANDLER_REGISTER(maxsockets_change, tcp_zone_change, NULL, EVENTHANDLER_PRI_ANY); + + tcp_inp_lro_direct_queue = counter_u64_alloc(M_WAITOK); + tcp_inp_lro_wokeup_queue = counter_u64_alloc(M_WAITOK); + tcp_inp_lro_compressed = counter_u64_alloc(M_WAITOK); + tcp_inp_lro_single_push = counter_u64_alloc(M_WAITOK); + tcp_inp_lro_locks_taken = counter_u64_alloc(M_WAITOK); + tcp_inp_lro_sack_wake = counter_u64_alloc(M_WAITOK); #ifdef TCPPCAP tcp_pcap_init(); #endif @@ -1666,9 +1674,9 @@ tcp_newtcpcb(struct inpcb *inp) * reasonable initial retransmit time. */ tp->t_srtt = TCPTV_SRTTBASE; - tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4; + tp->t_rttvar = ((tcp_rexmit_initial - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4; tp->t_rttmin = tcp_rexmit_min; - tp->t_rxtcur = TCPTV_RTOBASE; + tp->t_rxtcur = tcp_rexmit_initial; tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->t_rcvtime = ticks; @@ -2648,7 +2656,17 @@ tcp_keyed_hash(struct in_conninfo *inc, u_char *key, u_int len) uint32_t tcp_new_ts_offset(struct in_conninfo *inc) { - return (tcp_keyed_hash(inc, V_ts_offset_secret, + struct in_conninfo inc_store, *local_inc; + + if (!V_tcp_ts_offset_per_conn) { + memcpy(&inc_store, inc, sizeof(struct in_conninfo)); + inc_store.inc_lport = 0; + inc_store.inc_fport = 0; + local_inc = &inc_store; + } else { + local_inc = inc; + } + return (tcp_keyed_hash(local_inc, V_ts_offset_secret, sizeof(V_ts_offset_secret))); } @@ -3075,6 +3093,120 @@ SYSCTL_PROC(_net_inet_tcp, TCPCTL_DROP, drop, CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP, NULL, 0, sysctl_drop, "", "Drop TCP connection"); +#ifdef KERN_TLS +static int +sysctl_switch_tls(SYSCTL_HANDLER_ARGS) +{ + /* addrs[0] is a foreign socket, addrs[1] is a local one. */ + struct sockaddr_storage addrs[2]; + struct inpcb *inp; + struct sockaddr_in *fin, *lin; + struct epoch_tracker et; +#ifdef INET6 + struct sockaddr_in6 *fin6, *lin6; +#endif + int error; + + inp = NULL; + fin = lin = NULL; +#ifdef INET6 + fin6 = lin6 = NULL; +#endif + error = 0; + + if (req->oldptr != NULL || req->oldlen != 0) + return (EINVAL); + if (req->newptr == NULL) + return (EPERM); + if (req->newlen < sizeof(addrs)) + return (ENOMEM); + error = SYSCTL_IN(req, &addrs, sizeof(addrs)); + if (error) + return (error); + + switch (addrs[0].ss_family) { +#ifdef INET6 + case AF_INET6: + fin6 = (struct sockaddr_in6 *)&addrs[0]; + lin6 = (struct sockaddr_in6 *)&addrs[1]; + if (fin6->sin6_len != sizeof(struct sockaddr_in6) || + lin6->sin6_len != sizeof(struct sockaddr_in6)) + return (EINVAL); + if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) { + if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr)) + return (EINVAL); + in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]); + in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]); + fin = (struct sockaddr_in *)&addrs[0]; + lin = (struct sockaddr_in *)&addrs[1]; + break; + } + error = sa6_embedscope(fin6, V_ip6_use_defzone); + if (error) + return (error); + error = sa6_embedscope(lin6, V_ip6_use_defzone); + if (error) + return (error); + break; +#endif +#ifdef INET + case AF_INET: + fin = (struct sockaddr_in *)&addrs[0]; + lin = (struct sockaddr_in *)&addrs[1]; + if (fin->sin_len != sizeof(struct sockaddr_in) || + lin->sin_len != sizeof(struct sockaddr_in)) + return (EINVAL); + break; +#endif + default: + return (EINVAL); + } + INP_INFO_RLOCK_ET(&V_tcbinfo, et); + switch (addrs[0].ss_family) { +#ifdef INET6 + case AF_INET6: + inp = in6_pcblookup(&V_tcbinfo, &fin6->sin6_addr, + fin6->sin6_port, &lin6->sin6_addr, lin6->sin6_port, + INPLOOKUP_WLOCKPCB, NULL); + break; +#endif +#ifdef INET + case AF_INET: + inp = in_pcblookup(&V_tcbinfo, fin->sin_addr, fin->sin_port, + lin->sin_addr, lin->sin_port, INPLOOKUP_WLOCKPCB, NULL); + break; +#endif + } + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); + if (inp != NULL) { + if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) != 0 || + inp->inp_socket == NULL) { + error = ECONNRESET; + INP_WUNLOCK(inp); + } else { + struct socket *so; + + so = inp->inp_socket; + soref(so); + error = ktls_set_tx_mode(so, + arg2 == 0 ? TCP_TLS_MODE_SW : TCP_TLS_MODE_IFNET); + INP_WUNLOCK(inp); + SOCK_LOCK(so); + sorele(so); + } + } else + error = ESRCH; + return (error); +} + +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, switch_to_sw_tls, + CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP, NULL, + 0, sysctl_switch_tls, "", "Switch TCP connection to SW TLS"); +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, switch_to_ifnet_tls, + CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP, NULL, + 1, sysctl_switch_tls, "", "Switch TCP connection to ifnet TLS"); +#endif + /* * Generate a standardized TCP log line for use throughout the * tcp subsystem. Memory allocation is done with M_NOWAIT to diff --git a/freebsd/sys/netinet/tcp_syncache.c b/freebsd/sys/netinet/tcp_syncache.c index bfbf9f42..468aaab7 100644 --- a/freebsd/sys/netinet/tcp_syncache.c +++ b/freebsd/sys/netinet/tcp_syncache.c @@ -156,7 +156,12 @@ static int syncookie_cmp(struct in_conninfo *inc, struct syncache_head *sch, /* * Transmit the SYN,ACK fewer times than TCP_MAXRXTSHIFT specifies. - * 3 retransmits corresponds to a timeout of 3 * (1 + 2 + 4 + 8) == 45 seconds, + * 3 retransmits corresponds to a timeout with default values of + * tcp_rexmit_initial * ( 1 + + * tcp_backoff[1] + + * tcp_backoff[2] + + * tcp_backoff[3]) + 3 * tcp_rexmit_slop, + * 1000 ms * (1 + 2 + 4 + 8) + 3 * 200 ms = 15600 ms, * the odds are that the user has given up attempting to connect by then. */ #define SYNCACHE_MAXREXMTS 3 @@ -421,9 +426,10 @@ syncache_timeout(struct syncache *sc, struct syncache_head *sch, int docallout) int rexmt; if (sc->sc_rxmits == 0) - rexmt = TCPTV_RTOBASE; + rexmt = tcp_rexmit_initial; else - TCPT_RANGESET(rexmt, TCPTV_RTOBASE * tcp_syn_backoff[sc->sc_rxmits], + TCPT_RANGESET(rexmt, + tcp_rexmit_initial * tcp_backoff[sc->sc_rxmits], tcp_rexmit_min, TCPTV_REXMTMAX); sc->sc_rxttime = ticks + rexmt; sc->sc_rxmits++; @@ -773,6 +779,9 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) if (m != NULL && M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { inp->inp_flowid = m->m_pkthdr.flowid; inp->inp_flowtype = M_HASHTYPE_GET(m); +#ifdef NUMA + inp->inp_numa_domain = m->m_pkthdr.numa_domain; +#endif } /* @@ -1143,6 +1152,28 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, } } #endif /* TCP_SIGNATURE */ + + /* + * RFC 7323 PAWS: If we have a timestamp on this segment and + * it's less than ts_recent, drop it. + * XXXMT: RFC 7323 also requires to send an ACK. + * In tcp_input.c this is only done for TCP segments + * with user data, so be consistent here and just drop + * the segment. + */ + if (sc->sc_flags & SCF_TIMESTAMP && to->to_flags & TOF_TS && + TSTMP_LT(to->to_tsval, sc->sc_tsreflect)) { + SCH_UNLOCK(sch); + if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { + log(LOG_DEBUG, + "%s; %s: SEG.TSval %u < TS.Recent %u, " + "segment dropped\n", s, __func__, + to->to_tsval, sc->sc_tsreflect); + free(s, M_TCPLOG); + } + return (-1); /* Do not send RST */ + } + /* * Pull out the entry to unlock the bucket row. * @@ -1522,7 +1553,6 @@ skip_alloc: sc->sc_todctx = todctx; #endif sc->sc_irs = th->th_seq; - sc->sc_iss = arc4random(); sc->sc_flags = 0; sc->sc_flowlabel = 0; @@ -1596,6 +1626,8 @@ skip_alloc: if (V_tcp_syncookies) sc->sc_iss = syncookie_generate(sch, sc); + else + sc->sc_iss = arc4random(); #ifdef INET6 if (autoflowlabel) { if (V_tcp_syncookies) diff --git a/freebsd/sys/netinet/tcp_timer.c b/freebsd/sys/netinet/tcp_timer.c index c50af2bb..cf6ceff5 100644 --- a/freebsd/sys/netinet/tcp_timer.c +++ b/freebsd/sys/netinet/tcp_timer.c @@ -112,6 +112,11 @@ int tcp_msl; SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW, &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime"); +int tcp_rexmit_initial; +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_initial, CTLTYPE_INT|CTLFLAG_RW, + &tcp_rexmit_initial, 0, sysctl_msec_to_ticks, "I", + "Initial Retransmission Timeout"); + int tcp_rexmit_min; SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT|CTLFLAG_RW, &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I", @@ -235,9 +240,6 @@ tcp_slowtimo(void) VNET_LIST_RUNLOCK_NOSLEEP(); } -int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] = - { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 }; - int tcp_backoff[TCP_MAXRXTSHIFT + 1] = { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 }; @@ -673,7 +675,7 @@ tcp_timer_rexmt(void * xtp) TCPSTAT_INC(tcps_rexmttimeo); if ((tp->t_state == TCPS_SYN_SENT) || (tp->t_state == TCPS_SYN_RECEIVED)) - rexmt = TCPTV_RTOBASE * tcp_syn_backoff[tp->t_rxtshift]; + rexmt = tcp_rexmit_initial * tcp_backoff[tp->t_rxtshift]; else rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; TCPT_RANGESET(tp->t_rxtcur, rexmt, diff --git a/freebsd/sys/netinet/tcp_timer.h b/freebsd/sys/netinet/tcp_timer.h index a2ab6ca5..3e985bdf 100644 --- a/freebsd/sys/netinet/tcp_timer.h +++ b/freebsd/sys/netinet/tcp_timer.h @@ -77,7 +77,7 @@ #define TCPTV_MSL ( 30*hz) /* max seg lifetime (hah!) */ #define TCPTV_SRTTBASE 0 /* base roundtrip time; if 0, no idea yet */ -#define TCPTV_RTOBASE ( 3*hz) /* assumed RTO if no info */ +#define TCPTV_RTOBASE ( 1*hz) /* assumed RTO if no info */ #define TCPTV_PERSMIN ( 5*hz) /* minimum persist interval */ #define TCPTV_PERSMAX ( 60*hz) /* maximum persist interval */ @@ -194,12 +194,12 @@ extern int tcp_keepintvl; /* time between keepalive probes */ extern int tcp_keepcnt; /* number of keepalives */ extern int tcp_delacktime; /* time before sending a delayed ACK */ extern int tcp_maxpersistidle; +extern int tcp_rexmit_initial; extern int tcp_rexmit_min; extern int tcp_rexmit_slop; extern int tcp_msl; extern int tcp_ttl; /* time to live for TCP segs */ extern int tcp_backoff[]; -extern int tcp_syn_backoff[]; extern int tcp_totbackoff; extern int tcp_rexmit_drop_options; diff --git a/freebsd/sys/netinet/tcp_timewait.c b/freebsd/sys/netinet/tcp_timewait.c index 8a28283f..6965d391 100644 --- a/freebsd/sys/netinet/tcp_timewait.c +++ b/freebsd/sys/netinet/tcp_timewait.c @@ -304,7 +304,7 @@ tcp_twstart(struct tcpcb *tp) if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && recwin < (tp->rcv_adv - tp->rcv_nxt)) recwin = (tp->rcv_adv - tp->rcv_nxt); - tw->last_win = htons((u_short)(recwin >> tp->rcv_scale)); + tw->last_win = (u_short)(recwin >> tp->rcv_scale); /* * Set t_recent if timestamps are used on the connection. diff --git a/freebsd/sys/netinet/tcp_usrreq.c b/freebsd/sys/netinet/tcp_usrreq.c index 27ab745b..9f670278 100644 --- a/freebsd/sys/netinet/tcp_usrreq.c +++ b/freebsd/sys/netinet/tcp_usrreq.c @@ -46,6 +46,7 @@ __FBSDID("$FreeBSD$"); #include <rtems/bsd/local/opt_inet.h> #include <rtems/bsd/local/opt_inet6.h> #include <rtems/bsd/local/opt_ipsec.h> +#include <rtems/bsd/local/opt_kern_tls.h> #include <rtems/bsd/local/opt_tcpdebug.h> #include <sys/param.h> @@ -54,6 +55,7 @@ __FBSDID("$FreeBSD$"); #include <sys/malloc.h> #include <sys/refcount.h> #include <sys/kernel.h> +#include <sys/ktls.h> #include <sys/sysctl.h> #include <sys/mbuf.h> #ifdef INET6 @@ -346,17 +348,17 @@ tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; - struct sockaddr_in6 *sin6p; + struct sockaddr_in6 *sin6; - sin6p = (struct sockaddr_in6 *)nam; - if (nam->sa_len != sizeof (*sin6p)) + sin6 = (struct sockaddr_in6 *)nam; + if (nam->sa_len != sizeof (*sin6)) return (EINVAL); /* * Must check for multicast addresses and disallow binding * to them. */ - if (sin6p->sin6_family == AF_INET6 && - IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) + if (sin6->sin6_family == AF_INET6 && + IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) return (EAFNOSUPPORT); TCPDEBUG0; @@ -374,12 +376,12 @@ tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) inp->inp_vflag |= INP_IPV6; #ifdef INET if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { - if (IN6_IS_ADDR_UNSPECIFIED(&sin6p->sin6_addr)) + if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) inp->inp_vflag |= INP_IPV4; - else if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) { + else if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { struct sockaddr_in sin; - in6_sin6_2_sin(&sin, sin6p); + in6_sin6_2_sin(&sin, sin6); if (IN_MULTICAST(ntohl(sin.sin_addr.s_addr))) { error = EAFNOSUPPORT; INP_HASH_WUNLOCK(&V_tcbinfo); @@ -568,18 +570,18 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; - struct sockaddr_in6 *sin6p; + struct sockaddr_in6 *sin6; TCPDEBUG0; - sin6p = (struct sockaddr_in6 *)nam; - if (nam->sa_len != sizeof (*sin6p)) + sin6 = (struct sockaddr_in6 *)nam; + if (nam->sa_len != sizeof (*sin6)) return (EINVAL); /* * Must disallow TCP ``connections'' to multicast addresses. */ - if (sin6p->sin6_family == AF_INET6 - && IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) + if (sin6->sin6_family == AF_INET6 + && IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) return (EAFNOSUPPORT); inp = sotoinpcb(so); @@ -601,7 +603,7 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) * therefore probably require the hash lock, which isn't held here. * Is this a significant problem? */ - if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) { + if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { struct sockaddr_in sin; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) { @@ -613,7 +615,7 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) goto out; } - in6_sin6_2_sin(&sin, sin6p); + in6_sin6_2_sin(&sin, sin6); if (IN_MULTICAST(ntohl(sin.sin_addr.s_addr))) { error = EAFNOSUPPORT; goto out; @@ -643,7 +645,7 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) inp->inp_vflag &= ~INP_IPV4; inp->inp_vflag |= INP_IPV6; inp->inp_inc.inc_flags |= INC_ISIPV6; - if ((error = prison_remote_ip6(td->td_ucred, &sin6p->sin6_addr)) != 0) + if ((error = prison_remote_ip6(td->td_ucred, &sin6->sin6_addr)) != 0) goto out; if ((error = tcp6_connect(tp, nam, td)) != 0) goto out; @@ -974,22 +976,22 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, #ifdef INET6 case AF_INET6: { - struct sockaddr_in6 *sin6p; + struct sockaddr_in6 *sin6; - sin6p = (struct sockaddr_in6 *)nam; - if (sin6p->sin6_len != sizeof(struct sockaddr_in6)) { + sin6 = (struct sockaddr_in6 *)nam; + if (sin6->sin6_len != sizeof(*sin6)) { if (m) m_freem(m); error = EINVAL; goto out; } - if (IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) { + if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) { if (m) m_freem(m); error = EAFNOSUPPORT; goto out; } - if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) { + if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { #ifdef INET if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) { error = EINVAL; @@ -1005,7 +1007,7 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, } inp->inp_vflag &= ~INP_IPV6; sinp = &sin; - in6_sin6_2_sin(sinp, sin6p); + in6_sin6_2_sin(sinp, sin6); if (IN_MULTICAST( ntohl(sinp->sin_addr.s_addr))) { error = EAFNOSUPPORT; @@ -1036,7 +1038,7 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, inp->inp_vflag &= ~INP_IPV4; inp->inp_inc.inc_flags |= INC_ISIPV6; if ((error = prison_remote_ip6(td->td_ucred, - &sin6p->sin6_addr))) { + &sin6->sin6_addr))) { if (m) m_freem(m); goto out; @@ -1192,8 +1194,7 @@ tcp_usr_ready(struct socket *so, struct mbuf *m, int count) INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { INP_WUNLOCK(inp); - for (int i = 0; i < count; i++) - m = m_free(m); + mb_free_notready(m, count); return (ECONNRESET); } tp = intotcpcb(inp); @@ -1580,11 +1581,9 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt) error = 0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_ctloutput: inp == NULL")); - INP_WLOCK(inp); if (sopt->sopt_level != IPPROTO_TCP) { #ifdef INET6 if (inp->inp_vflag & INP_IPV6PROTO) { - INP_WUNLOCK(inp); error = ip6_ctloutput(so, sopt); /* * In case of the IPV6_USE_MIN_MTU socket option, @@ -1629,12 +1628,12 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt) #endif #ifdef INET { - INP_WUNLOCK(inp); error = ip_ctloutput(so, sopt); } #endif return (error); } + INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { INP_WUNLOCK(inp); return (ECONNRESET); @@ -1760,6 +1759,9 @@ tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp int error, opt, optval; u_int ui; struct tcp_info ti; +#ifdef KERN_TLS + struct tls_enable tls; +#endif struct cc_algo *algo; char *pbuf, buf[TCP_LOG_ID_LEN]; size_t len; @@ -1922,6 +1924,29 @@ unlock_and_done: INP_WUNLOCK(inp); break; +#ifdef KERN_TLS + case TCP_TXTLS_ENABLE: + INP_WUNLOCK(inp); + error = sooptcopyin(sopt, &tls, sizeof(tls), + sizeof(tls)); + if (error) + break; + error = ktls_enable_tx(so, &tls); + break; + case TCP_TXTLS_MODE: + INP_WUNLOCK(inp); + error = sooptcopyin(sopt, &ui, sizeof(ui), sizeof(ui)); + if (error) + return (error); + if (ui != TCP_TLS_MODE_SW && ui != TCP_TLS_MODE_IFNET) + return (EINVAL); + + INP_WLOCK_RECHECK(inp); + error = ktls_set_tx_mode(so, ui); + INP_WUNLOCK(inp); + break; +#endif + case TCP_KEEPIDLE: case TCP_KEEPINTVL: case TCP_KEEPINIT: @@ -2203,6 +2228,13 @@ unlock_and_done: error = EINVAL; break; #endif +#ifdef KERN_TLS + case TCP_TXTLS_MODE: + optval = ktls_get_tx_mode(so); + INP_WUNLOCK(inp); + error = sooptcopyout(sopt, &optval, sizeof(optval)); + break; +#endif default: INP_WUNLOCK(inp); error = ENOPROTOOPT; diff --git a/freebsd/sys/netinet/tcp_var.h b/freebsd/sys/netinet/tcp_var.h index 2fbe07ad..48136abe 100644 --- a/freebsd/sys/netinet/tcp_var.h +++ b/freebsd/sys/netinet/tcp_var.h @@ -102,7 +102,8 @@ struct tcpcb { t_state:4, /* state of this connection */ t_idle_reduce : 1, t_delayed_ack: 7, /* Delayed ack variable */ - bits_spare : 4; + t_fin_is_rst: 1, /* Are fin's treated as resets */ + bits_spare : 3; u_int t_flags; tcp_seq snd_una; /* sent but unacknowledged */ tcp_seq snd_max; /* highest sequence number sent; @@ -271,6 +272,11 @@ struct tcp_function_block { void (*tfb_tcp_do_segment)(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int, int, uint8_t); + int (*tfb_do_queued_segments)(struct socket *, struct tcpcb *, int); + int (*tfb_do_segment_nounlock)(struct mbuf *, struct tcphdr *, + struct socket *, struct tcpcb *, + int, int, uint8_t, + int, struct timeval *); void (*tfb_tcp_hpts_do_segment)(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int, int, uint8_t, @@ -754,7 +760,6 @@ extern int tcp_log_in_vain; VNET_DECLARE(int, drop_synfin); VNET_DECLARE(int, path_mtu_discovery); VNET_DECLARE(int, tcp_abc_l_var); -VNET_DECLARE(int, tcp_autorcvbuf_inc); VNET_DECLARE(int, tcp_autorcvbuf_max); VNET_DECLARE(int, tcp_autosndbuf_inc); VNET_DECLARE(int, tcp_autosndbuf_max); @@ -789,7 +794,6 @@ VNET_DECLARE(struct inpcbinfo, tcbinfo); #define V_tcb VNET(tcb) #define V_tcbinfo VNET(tcbinfo) #define V_tcp_abc_l_var VNET(tcp_abc_l_var) -#define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc) #define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max) #define V_tcp_autosndbuf_inc VNET(tcp_autosndbuf_inc) #define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max) @@ -798,6 +802,7 @@ VNET_DECLARE(struct inpcbinfo, tcbinfo); #define V_tcp_do_autosndbuf VNET(tcp_do_autosndbuf) #define V_tcp_do_ecn VNET(tcp_do_ecn) #define V_tcp_do_rfc1323 VNET(tcp_do_rfc1323) +#define V_tcp_ts_offset_per_conn VNET(tcp_ts_offset_per_conn) #define V_tcp_do_rfc3042 VNET(tcp_do_rfc3042) #define V_tcp_do_rfc3390 VNET(tcp_do_rfc3390) #define V_tcp_do_rfc3465 VNET(tcp_do_rfc3465) @@ -882,6 +887,13 @@ struct tcp_function_block * find_and_ref_tcp_fb(struct tcp_function_block *fs); int tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp); +extern counter_u64_t tcp_inp_lro_direct_queue; +extern counter_u64_t tcp_inp_lro_wokeup_queue; +extern counter_u64_t tcp_inp_lro_compressed; +extern counter_u64_t tcp_inp_lro_single_push; +extern counter_u64_t tcp_inp_lro_locks_taken; +extern counter_u64_t tcp_inp_lro_sack_wake; + uint32_t tcp_maxmtu(struct in_conninfo *, struct tcp_ifcap *); uint32_t tcp_maxmtu6(struct in_conninfo *, struct tcp_ifcap *); u_int tcp_maxseg(const struct tcpcb *); @@ -934,7 +946,9 @@ uint32_t tcp_new_ts_offset(struct in_conninfo *); tcp_seq tcp_new_isn(struct in_conninfo *); int tcp_sack_doack(struct tcpcb *, struct tcpopt *, tcp_seq); +void tcp_update_dsack_list(struct tcpcb *, tcp_seq, tcp_seq); void tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, tcp_seq rcv_lastend); +void tcp_clean_dsack_blocks(struct tcpcb *tp); void tcp_clean_sackreport(struct tcpcb *tp); void tcp_sack_adjust(struct tcpcb *tp); struct sackhole *tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt); @@ -942,10 +956,11 @@ void tcp_sack_partialack(struct tcpcb *, struct tcphdr *); void tcp_free_sackholes(struct tcpcb *tp); int tcp_newreno(struct tcpcb *, struct tcphdr *); int tcp_compute_pipe(struct tcpcb *); +uint32_t tcp_compute_initwnd(uint32_t); void tcp_sndbuf_autoscale(struct tcpcb *, struct socket *, uint32_t); struct mbuf * tcp_m_copym(struct mbuf *m, int32_t off0, int32_t *plen, - int32_t seglimit, int32_t segsize, struct sockbuf *sb); + int32_t seglimit, int32_t segsize, struct sockbuf *sb, bool hw_tls); static inline void diff --git a/freebsd/sys/netinet/toecore.h b/freebsd/sys/netinet/toecore.h index f2374d70..f8bda614 100644 --- a/freebsd/sys/netinet/toecore.h +++ b/freebsd/sys/netinet/toecore.h @@ -35,6 +35,8 @@ #error "no user-serviceable parts inside" #endif +#include <sys/_eventhandler.h> + struct tcpopt; struct tcphdr; struct in_conninfo; @@ -108,7 +110,6 @@ struct toedev { struct tcp_info *); }; -#include <sys/eventhandler.h> typedef void (*tcp_offload_listen_start_fn)(void *, struct tcpcb *); typedef void (*tcp_offload_listen_stop_fn)(void *, struct tcpcb *); EVENTHANDLER_DECLARE(tcp_offload_listen_start, tcp_offload_listen_start_fn); diff --git a/freebsd/sys/netinet/udp_usrreq.c b/freebsd/sys/netinet/udp_usrreq.c index 33b89c21..f89660d6 100644 --- a/freebsd/sys/netinet/udp_usrreq.c +++ b/freebsd/sys/netinet/udp_usrreq.c @@ -1162,9 +1162,23 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, src.sin_family = 0; sin = (struct sockaddr_in *)addr; +retry: if (sin == NULL || (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0)) { INP_WLOCK(inp); + /* + * In case we lost a race and another thread bound addr/port + * on the inp we cannot keep the wlock (which still would be + * fine) as further down, based on these values we make + * decisions for the pcbinfo lock. If the locks are not in + * synch the assertions on unlock will fire, hence we go for + * one retry loop. + */ + if (sin != NULL && (inp->inp_laddr.s_addr != INADDR_ANY || + inp->inp_lport != 0)) { + INP_WUNLOCK(inp); + goto retry; + } unlock_inp = UH_WLOCKED; } else { INP_RLOCK(inp); @@ -1264,36 +1278,44 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, } /* - * Depending on whether or not the application has bound or connected - * the socket, we may have to do varying levels of work. The optimal - * case is for a connected UDP socket, as a global lock isn't - * required at all. - * - * In order to decide which we need, we require stability of the - * inpcb binding, which we ensure by acquiring a read lock on the - * inpcb. This doesn't strictly follow the lock order, so we play - * the trylock and retry game; note that we may end up with more - * conservative locks than required the second time around, so later - * assertions have to accept that. Further analysis of the number of - * misses under contention is required. - * - * XXXRW: Check that hash locking update here is correct. + * In the old days, depending on whether or not the application had + * bound or connected the socket, we had to do varying levels of work. + * The optimal case was for a connected UDP socket, as a global lock + * wasn't required at all. + * In order to decide which we need, we required stability of the + * inpcb binding, which we ensured by acquiring a read lock on the + * inpcb. This didn't strictly follow the lock order, so we played + * the trylock and retry game. + * With the re-introduction of the route-cache in some cases, we started + * to acquire an early inp wlock and a possible race during re-lock + * went away. With the introduction of epoch(9) some read locking + * became epoch(9) and the lock-order issues also went away. + * Due to route-cache we may now hold more conservative locks than + * otherwise required and have split up the 2nd case in case 2 and 3 + * in order to keep the udpinfo lock level in sync with the inp one + * for the IP_SENDSRCADDR case below. */ pr = inp->inp_socket->so_proto->pr_protocol; pcbinfo = udp_get_inpcbinfo(pr); - sin = (struct sockaddr_in *)addr; if (sin != NULL && (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0)) { INP_HASH_WLOCK(pcbinfo); unlock_udbinfo = UH_WLOCKED; - } else if ((sin != NULL && ( - (sin->sin_addr.s_addr == INADDR_ANY) || - (sin->sin_addr.s_addr == INADDR_BROADCAST) || - (inp->inp_laddr.s_addr == INADDR_ANY) || - (inp->inp_lport == 0))) || - (src.sin_family == AF_INET)) { + } else if (sin != NULL && + (sin->sin_addr.s_addr == INADDR_ANY || + sin->sin_addr.s_addr == INADDR_BROADCAST || + inp->inp_laddr.s_addr == INADDR_ANY || + inp->inp_lport == 0)) { INP_HASH_RLOCK_ET(pcbinfo, et); unlock_udbinfo = UH_RLOCKED; + } else if (src.sin_family == AF_INET) { + if (unlock_inp == UH_WLOCKED) { + INP_HASH_WLOCK(pcbinfo); + unlock_udbinfo = UH_WLOCKED; + } else { + INP_HASH_RLOCK_ET(pcbinfo, et); + unlock_udbinfo = UH_RLOCKED; + } } else unlock_udbinfo = UH_UNLOCKED; @@ -1503,8 +1525,9 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, if (flowtype != M_HASHTYPE_NONE) { m->m_pkthdr.flowid = flowid; M_HASHTYPE_SET(m, flowtype); + } #ifdef RSS - } else { + else { uint32_t hash_val, hash_type; /* * Calculate an appropriate RSS hash for UDP and @@ -1527,10 +1550,8 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, m->m_pkthdr.flowid = hash_val; M_HASHTYPE_SET(m, hash_type); } -#endif } -#ifdef RSS /* * Don't override with the inp cached flowid value. * @@ -1565,12 +1586,22 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, release: if (unlock_udbinfo == UH_WLOCKED) { KASSERT(unlock_inp == UH_WLOCKED, - ("%s: excl udbinfo lock, shared inp lock", __func__)); + ("%s: excl udbinfo lock %#03x, shared inp lock %#03x, " + "sin %p daddr %#010x inp %p laddr %#010x lport %#06x " + "src fam %#04x", + __func__, unlock_udbinfo, unlock_inp, sin, + (sin != NULL) ? sin->sin_addr.s_addr : 0xfefefefe, inp, + inp->inp_laddr.s_addr, inp->inp_lport, src.sin_family)); INP_HASH_WUNLOCK(pcbinfo); INP_WUNLOCK(inp); } else if (unlock_udbinfo == UH_RLOCKED) { KASSERT(unlock_inp == UH_RLOCKED, - ("%s: shared udbinfo lock, excl inp lock", __func__)); + ("%s: shared udbinfo lock %#03x, excl inp lock %#03x, " + "sin %p daddr %#010x inp %p laddr %#010x lport %#06x " + "src fam %#04x", + __func__, unlock_udbinfo, unlock_inp, sin, + (sin != NULL) ? sin->sin_addr.s_addr : 0xfefefefe, inp, + inp->inp_laddr.s_addr, inp->inp_lport, src.sin_family)); INP_HASH_RUNLOCK_ET(pcbinfo, et); INP_RUNLOCK(inp); } else if (unlock_inp == UH_WLOCKED) diff --git a/freebsd/sys/netinet6/frag6.c b/freebsd/sys/netinet6/frag6.c index 0b0c7b91..6f16c712 100644 --- a/freebsd/sys/netinet6/frag6.c +++ b/freebsd/sys/netinet6/frag6.c @@ -40,20 +40,17 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/systm.h> +#include <sys/domain.h> +#include <sys/eventhandler.h> #include <sys/hash.h> +#include <sys/kernel.h> #include <sys/malloc.h> #include <sys/mbuf.h> -#include <sys/domain.h> -#include <sys/eventhandler.h> #include <sys/protosw.h> #include <sys/socket.h> -#include <sys/errno.h> -#include <sys/time.h> -#include <sys/kernel.h> +#include <sys/sysctl.h> #include <sys/syslog.h> -#include <machine/atomic.h> - #include <net/if.h> #include <net/if_var.h> #include <net/netisr.h> @@ -65,14 +62,14 @@ __FBSDID("$FreeBSD$"); #include <netinet/ip6.h> #include <netinet6/ip6_var.h> #include <netinet/icmp6.h> -#include <netinet/in_systm.h> /* for ECN definitions */ -#include <netinet/ip.h> /* for ECN definitions */ +#include <netinet/in_systm.h> /* For ECN definitions. */ +#include <netinet/ip.h> /* For ECN definitions. */ +#ifdef MAC #include <security/mac/mac_framework.h> +#endif -/* - * Reassembly headers are stored in hash buckets. - */ +/* Reassembly headers are stored in hash buckets. */ #define IP6REASS_NHASH_LOG2 10 #define IP6REASS_NHASH (1 << IP6REASS_NHASH_LOG2) #define IP6REASS_HMASK (IP6REASS_NHASH - 1) @@ -91,22 +88,47 @@ struct ip6qbucket { int count; }; -VNET_DEFINE_STATIC(volatile u_int, frag6_nfragpackets); -volatile u_int frag6_nfrags = 0; -VNET_DEFINE_STATIC(struct ip6qbucket, ip6q[IP6REASS_NHASH]); -VNET_DEFINE_STATIC(uint32_t, ip6q_hashseed); +struct ip6asfrag { + struct ip6asfrag *ip6af_down; + struct ip6asfrag *ip6af_up; + struct mbuf *ip6af_m; + int ip6af_offset; /* offset in ip6af_m to next header */ + int ip6af_frglen; /* fragmentable part length */ + int ip6af_off; /* fragment offset */ + u_int16_t ip6af_mff; /* more fragment bit in frag off */ +}; +#define IP6_REASS_MBUF(ip6af) (*(struct mbuf **)&((ip6af)->ip6af_m)) + +static MALLOC_DEFINE(M_FRAG6, "frag6", "IPv6 fragment reassembly header"); + +/* System wide (global) maximum and count of packets in reassembly queues. */ +static int ip6_maxfrags; +static volatile u_int frag6_nfrags = 0; + +/* Maximum and current packets in per-VNET reassembly queue. */ +VNET_DEFINE_STATIC(int, ip6_maxfragpackets); +VNET_DEFINE_STATIC(volatile u_int, frag6_nfragpackets); +#define V_ip6_maxfragpackets VNET(ip6_maxfragpackets) #define V_frag6_nfragpackets VNET(frag6_nfragpackets) -#define V_ip6q VNET(ip6q) -#define V_ip6q_hashseed VNET(ip6q_hashseed) -#define IP6Q_LOCK(i) mtx_lock(&V_ip6q[(i)].lock) -#define IP6Q_TRYLOCK(i) mtx_trylock(&V_ip6q[(i)].lock) -#define IP6Q_LOCK_ASSERT(i) mtx_assert(&V_ip6q[(i)].lock, MA_OWNED) -#define IP6Q_UNLOCK(i) mtx_unlock(&V_ip6q[(i)].lock) -#define IP6Q_HEAD(i) (&V_ip6q[(i)].ip6q) +/* Maximum per-VNET reassembly queues per bucket and fragments per packet. */ +VNET_DEFINE_STATIC(int, ip6_maxfragbucketsize); +VNET_DEFINE_STATIC(int, ip6_maxfragsperpacket); +#define V_ip6_maxfragbucketsize VNET(ip6_maxfragbucketsize) +#define V_ip6_maxfragsperpacket VNET(ip6_maxfragsperpacket) -static MALLOC_DEFINE(M_FTABLE, "fragment", "fragment reassembly header"); +/* Per-VNET reassembly queue buckets. */ +VNET_DEFINE_STATIC(struct ip6qbucket, ip6qb[IP6REASS_NHASH]); +VNET_DEFINE_STATIC(uint32_t, ip6qb_hashseed); +#define V_ip6qb VNET(ip6qb) +#define V_ip6qb_hashseed VNET(ip6qb_hashseed) + +#define IP6QB_LOCK(_b) mtx_lock(&V_ip6qb[(_b)].lock) +#define IP6QB_TRYLOCK(_b) mtx_trylock(&V_ip6qb[(_b)].lock) +#define IP6QB_LOCK_ASSERT(_b) mtx_assert(&V_ip6qb[(_b)].lock, MA_OWNED) +#define IP6QB_UNLOCK(_b) mtx_unlock(&V_ip6qb[(_b)].lock) +#define IP6QB_HEAD(_b) (&V_ip6qb[(_b)].ip6q) /* * By default, limit the number of IP6 fragments across all reassembly @@ -124,11 +146,14 @@ static MALLOC_DEFINE(M_FTABLE, "fragment", "fragment reassembly header"); #define IP6_MAXFRAGS (nmbclusters / 32) #define IP6_MAXFRAGPACKETS (imin(IP6_MAXFRAGS, IP6REASS_NHASH * 50)) + /* - * Initialise reassembly queue and fragment identifier. + * Sysctls and helper function. */ -void -frag6_set_bucketsize() +SYSCTL_DECL(_net_inet6_ip6); + +static void +frag6_set_bucketsize(void) { int i; @@ -136,68 +161,140 @@ frag6_set_bucketsize() V_ip6_maxfragbucketsize = imax(i / (IP6REASS_NHASH / 2), 1); } -static void -frag6_change(void *tag) +SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGS, maxfrags, + CTLFLAG_RW, &ip6_maxfrags, 0, + "Maximum allowed number of outstanding IPv6 packet fragments. " + "A value of 0 means no fragmented packets will be accepted, while a " + "a value of -1 means no limit"); + +static int +sysctl_ip6_maxfragpackets(SYSCTL_HANDLER_ARGS) { - VNET_ITERATOR_DECL(vnet_iter); + int error, val; - ip6_maxfrags = IP6_MAXFRAGS; - VNET_LIST_RLOCK_NOSLEEP(); - VNET_FOREACH(vnet_iter) { - CURVNET_SET(vnet_iter); - V_ip6_maxfragpackets = IP6_MAXFRAGPACKETS; - frag6_set_bucketsize(); - CURVNET_RESTORE(); - } - VNET_LIST_RUNLOCK_NOSLEEP(); + val = V_ip6_maxfragpackets; + error = sysctl_handle_int(oidp, &val, 0, req); + if (error != 0 || !req->newptr) + return (error); + V_ip6_maxfragpackets = val; + frag6_set_bucketsize(); + return (0); } +SYSCTL_PROC(_net_inet6_ip6, IPV6CTL_MAXFRAGPACKETS, maxfragpackets, + CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, NULL, 0, + sysctl_ip6_maxfragpackets, "I", + "Default maximum number of outstanding fragmented IPv6 packets. " + "A value of 0 means no fragmented packets will be accepted, while a " + "a value of -1 means no limit"); +SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGSPERPACKET, maxfragsperpacket, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_maxfragsperpacket), 0, + "Maximum allowed number of fragments per packet"); +SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGBUCKETSIZE, maxfragbucketsize, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_maxfragbucketsize), 0, + "Maximum number of reassembly queues per hash bucket"); -void -frag6_init(void) + +/* + * Remove the IPv6 fragmentation header from the mbuf. + */ +int +ip6_deletefraghdr(struct mbuf *m, int offset, int wait) { - struct ip6q *q6; - int i; + struct ip6_hdr *ip6; + struct mbuf *t; - V_ip6_maxfragpackets = IP6_MAXFRAGPACKETS; - frag6_set_bucketsize(); - for (i = 0; i < IP6REASS_NHASH; i++) { - q6 = IP6Q_HEAD(i); - q6->ip6q_next = q6->ip6q_prev = q6; - mtx_init(&V_ip6q[i].lock, "ip6qlock", NULL, MTX_DEF); - V_ip6q[i].count = 0; + /* Delete frag6 header. */ + if (m->m_len >= offset + sizeof(struct ip6_frag)) { + + /* This is the only possible case with !PULLDOWN_TEST. */ + ip6 = mtod(m, struct ip6_hdr *); + bcopy(ip6, (char *)ip6 + sizeof(struct ip6_frag), + offset); + m->m_data += sizeof(struct ip6_frag); + m->m_len -= sizeof(struct ip6_frag); + } else { + + /* This comes with no copy if the boundary is on cluster. */ + if ((t = m_split(m, offset, wait)) == NULL) + return (ENOMEM); + m_adj(t, sizeof(struct ip6_frag)); + m_cat(m, t); } - V_ip6q_hashseed = arc4random(); - V_ip6_maxfragsperpacket = 64; - if (!IS_DEFAULT_VNET(curvnet)) - return; - ip6_maxfrags = IP6_MAXFRAGS; - EVENTHANDLER_REGISTER(nmbclusters_change, - frag6_change, NULL, EVENTHANDLER_PRI_ANY); + m->m_flags |= M_FRAGMENTED; + return (0); +} + +/* + * Free a fragment reassembly header and all associated datagrams. + */ +static void +frag6_freef(struct ip6q *q6, uint32_t bucket) +{ + struct ip6_hdr *ip6; + struct ip6asfrag *af6, *down6; + struct mbuf *m; + + IP6QB_LOCK_ASSERT(bucket); + + for (af6 = q6->ip6q_down; af6 != (struct ip6asfrag *)q6; + af6 = down6) { + + m = IP6_REASS_MBUF(af6); + down6 = af6->ip6af_down; + frag6_deq(af6, bucket); + + /* + * Return ICMP time exceeded error for the 1st fragment. + * Just free other fragments. + */ + if (af6->ip6af_off == 0) { + + /* Adjust pointer. */ + ip6 = mtod(m, struct ip6_hdr *); + + /* Restore source and destination addresses. */ + ip6->ip6_src = q6->ip6q_src; + ip6->ip6_dst = q6->ip6q_dst; + + icmp6_error(m, ICMP6_TIME_EXCEEDED, + ICMP6_TIME_EXCEED_REASSEMBLY, 0); + } else + m_freem(m); + + free(af6, M_FRAG6); + } + frag6_remque(q6, bucket); + atomic_subtract_int(&frag6_nfrags, q6->ip6q_nfrag); +#ifdef MAC + mac_ip6q_destroy(q6); +#endif + free(q6, M_FRAG6); + atomic_subtract_int(&V_frag6_nfragpackets, 1); } /* - * In RFC2460, fragment and reassembly rule do not agree with each other, - * in terms of next header field handling in fragment header. + * Like in RFC2460, in RFC8200, fragment and reassembly rules do not agree with + * each other, in terms of next header field handling in fragment header. * While the sender will use the same value for all of the fragmented packets, - * receiver is suggested not to check the consistency. + * receiver is suggested not to check for consistency. * - * fragment rule (p20): - * (2) A Fragment header containing: - * The Next Header value that identifies the first header of - * the Fragmentable Part of the original packet. + * Fragment rules (p18,p19): + * (2) A Fragment header containing: + * The Next Header value that identifies the first header + * after the Per-Fragment headers of the original packet. * -> next header field is same for all fragments * - * reassembly rule (p21): - * The Next Header field of the last header of the Unfragmentable - * Part is obtained from the Next Header field of the first + * Reassembly rule (p20): + * The Next Header field of the last header of the Per-Fragment + * headers is obtained from the Next Header field of the first * fragment's Fragment header. * -> should grab it from the first fragment only * * The following note also contradicts with fragment rule - no one is going to * send different fragment with different next header field. * - * additional note (p22): + * Additional note (p22) [not an error]: * The Next Header values in the Fragment headers of different * fragments of the same original packet may differ. Only the value * from the Offset zero fragment packet is used for reassembly. @@ -206,33 +303,32 @@ frag6_init(void) * There is no explicit reason given in the RFC. Historical reason maybe? */ /* - * Fragment input + * Fragment input. */ int frag6_input(struct mbuf **mp, int *offp, int proto) { - struct mbuf *m = *mp, *t; + struct ifnet *dstifp; + struct in6_ifaddr *ia6; struct ip6_hdr *ip6; struct ip6_frag *ip6f; struct ip6q *head, *q6; - struct ip6asfrag *af6, *ip6af, *af6dwn; - struct in6_ifaddr *ia; - int offset = *offp, nxt, i, next; - int first_frag = 0; - int fragoff, frgpartlen; /* must be larger than u_int16_t */ + struct ip6asfrag *af6, *af6dwn, *ip6af; + struct mbuf *m, *t; uint32_t hashkey[(sizeof(struct in6_addr) * 2 + sizeof(ip6f->ip6f_ident)) / sizeof(uint32_t)]; - uint32_t hash, *hashkeyp; - struct ifnet *dstifp; - u_int8_t ecn, ecn0; + uint32_t bucket, *hashkeyp; + int fragoff, frgpartlen; /* Must be larger than uint16_t. */ + int nxt, offset, plen; + uint8_t ecn, ecn0; + bool only_frag; #ifdef RSS - struct m_tag *mtag; struct ip6_direct_ctx *ip6dc; + struct m_tag *mtag; #endif -#if 0 - char ip6buf[INET6_ADDRSTRLEN]; -#endif + m = *mp; + offset = *offp; ip6 = mtod(m, struct ip6_hdr *); #ifndef PULLDOWN_TEST @@ -245,22 +341,23 @@ frag6_input(struct mbuf **mp, int *offp, int proto) #endif dstifp = NULL; - /* find the destination interface of the packet. */ - ia = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */); - if (ia != NULL) { - dstifp = ia->ia_ifp; - ifa_free(&ia->ia_ifa); + /* Find the destination interface of the packet. */ + ia6 = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */); + if (ia6 != NULL) { + dstifp = ia6->ia_ifp; + ifa_free(&ia6->ia_ifa); } - /* jumbo payload can't contain a fragment header */ + + /* Jumbo payload cannot contain a fragment header. */ if (ip6->ip6_plen == 0) { icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, offset); in6_ifstat_inc(dstifp, ifs6_reass_fail); - return IPPROTO_DONE; + return (IPPROTO_DONE); } /* - * check whether fragment packet's fragment length is - * multiple of 8 octets. + * Check whether fragment packet's fragment length is a + * multiple of 8 octets (unless it is the last one). * sizeof(struct ip6_frag) == 8 * sizeof(struct ip6_hdr) = 40 */ @@ -269,22 +366,23 @@ frag6_input(struct mbuf **mp, int *offp, int proto) icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, offsetof(struct ip6_hdr, ip6_plen)); in6_ifstat_inc(dstifp, ifs6_reass_fail); - return IPPROTO_DONE; + return (IPPROTO_DONE); } IP6STAT_INC(ip6s_fragments); in6_ifstat_inc(dstifp, ifs6_reass_reqd); - /* offset now points to data portion */ + /* Offset now points to data portion. */ offset += sizeof(struct ip6_frag); /* - * RFC 6946: Handle "atomic" fragments (offset and m bit set to 0) - * upfront, unrelated to any reassembly. Just skip the fragment header. + * Handle "atomic" fragments (offset and m bit set to 0) upfront, + * unrelated to any reassembly. Still need to remove the frag hdr. + * See RFC 6946 and section 4.5 of RFC 8200. */ if ((ip6f->ip6f_offlg & ~IP6F_RESERVED_MASK) == 0) { - /* XXX-BZ we want dedicated counters for this. */ - IP6STAT_INC(ip6s_reassembled); + IP6STAT_INC(ip6s_atomicfrags); + /* XXX-BZ handle correctly. */ in6_ifstat_inc(dstifp, ifs6_reass_ok); *offp = offset; m->m_flags |= M_FRAGMENTED; @@ -298,22 +396,23 @@ frag6_input(struct mbuf **mp, int *offp, int proto) offsetof(struct ip6_hdr, ip6_plen)); in6_ifstat_inc(dstifp, ifs6_reass_fail); IP6STAT_INC(ip6s_fragdropped); - return IPPROTO_DONE; + return (IPPROTO_DONE); } + /* Generate a hash value for fragment bucket selection. */ hashkeyp = hashkey; memcpy(hashkeyp, &ip6->ip6_src, sizeof(struct in6_addr)); hashkeyp += sizeof(struct in6_addr) / sizeof(*hashkeyp); memcpy(hashkeyp, &ip6->ip6_dst, sizeof(struct in6_addr)); hashkeyp += sizeof(struct in6_addr) / sizeof(*hashkeyp); *hashkeyp = ip6f->ip6f_ident; - hash = jenkins_hash32(hashkey, nitems(hashkey), V_ip6q_hashseed); - hash &= IP6REASS_HMASK; - head = IP6Q_HEAD(hash); - IP6Q_LOCK(hash); + bucket = jenkins_hash32(hashkey, nitems(hashkey), V_ip6qb_hashseed); + bucket &= IP6REASS_HMASK; + head = IP6QB_HEAD(bucket); + IP6QB_LOCK(bucket); /* - * Enforce upper bound on number of fragments. + * Enforce upper bound on number of fragments for the entire system. * If maxfrag is 0, never accept fragments. * If maxfrag is -1, accept all fragments without limitation. */ @@ -332,11 +431,11 @@ frag6_input(struct mbuf **mp, int *offp, int proto) ) break; + only_frag = false; if (q6 == head) { - /* - * the first fragment to arrive, create a reassembly queue. - */ - first_frag = 1; + + /* A first fragment to arrive creates a reassembly queue. */ + only_frag = true; /* * Enforce upper bound on number of fragmented packets @@ -347,26 +446,27 @@ frag6_input(struct mbuf **mp, int *offp, int proto) */ if (V_ip6_maxfragpackets < 0) ; - else if (V_ip6q[hash].count >= V_ip6_maxfragbucketsize || + else if (V_ip6qb[bucket].count >= V_ip6_maxfragbucketsize || atomic_load_int(&V_frag6_nfragpackets) >= (u_int)V_ip6_maxfragpackets) goto dropfrag; atomic_add_int(&V_frag6_nfragpackets, 1); - q6 = (struct ip6q *)malloc(sizeof(struct ip6q), M_FTABLE, - M_NOWAIT); + + /* Allocate IPv6 fragement packet queue entry. */ + q6 = (struct ip6q *)malloc(sizeof(struct ip6q), M_FRAG6, + M_NOWAIT | M_ZERO); if (q6 == NULL) goto dropfrag; - bzero(q6, sizeof(*q6)); #ifdef MAC if (mac_ip6q_init(q6, M_NOWAIT) != 0) { - free(q6, M_FTABLE); + free(q6, M_FRAG6); goto dropfrag; } mac_ip6q_create(m, q6); #endif - frag6_insque_head(q6, head, hash); + frag6_insque_head(q6, head, bucket); - /* ip6q_nxt will be filled afterwards, from 1st fragment */ + /* ip6q_nxt will be filled afterwards, from 1st fragment. */ q6->ip6q_down = q6->ip6q_up = (struct ip6asfrag *)q6; #ifdef notyet q6->ip6q_nxtp = (u_char *)nxtp; @@ -383,7 +483,7 @@ frag6_input(struct mbuf **mp, int *offp, int proto) } /* - * If it's the 1st fragment, record the length of the + * If it is the 1st fragment, record the length of the * unfragmentable part and the next header of the fragment header. */ fragoff = ntohs(ip6f->ip6f_offlg & IP6F_OFF_MASK); @@ -404,18 +504,18 @@ frag6_input(struct mbuf **mp, int *offp, int proto) icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, offset - sizeof(struct ip6_frag) + offsetof(struct ip6_frag, ip6f_offlg)); - IP6Q_UNLOCK(hash); + IP6QB_UNLOCK(bucket); return (IPPROTO_DONE); } } else if (fragoff + frgpartlen > IPV6_MAXPACKET) { icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, offset - sizeof(struct ip6_frag) + offsetof(struct ip6_frag, ip6f_offlg)); - IP6Q_UNLOCK(hash); + IP6QB_UNLOCK(bucket); return (IPPROTO_DONE); } /* - * If it's the first fragment, do the above check for each + * If it is the first fragment, do the above check for each * fragment already stored in the reassembly queue. */ if (fragoff == 0) { @@ -425,15 +525,18 @@ frag6_input(struct mbuf **mp, int *offp, int proto) if (q6->ip6q_unfrglen + af6->ip6af_off + af6->ip6af_frglen > IPV6_MAXPACKET) { - struct mbuf *merr = IP6_REASS_MBUF(af6); struct ip6_hdr *ip6err; - int erroff = af6->ip6af_offset; + struct mbuf *merr; + int erroff; + + merr = IP6_REASS_MBUF(af6); + erroff = af6->ip6af_offset; - /* dequeue the fragment. */ - frag6_deq(af6, hash); - free(af6, M_FTABLE); + /* Dequeue the fragment. */ + frag6_deq(af6, bucket); + free(af6, M_FRAG6); - /* adjust pointer. */ + /* Adjust pointer. */ ip6err = mtod(merr, struct ip6_hdr *); /* @@ -451,174 +554,113 @@ frag6_input(struct mbuf **mp, int *offp, int proto) } } - ip6af = (struct ip6asfrag *)malloc(sizeof(struct ip6asfrag), M_FTABLE, - M_NOWAIT); + /* Allocate an IPv6 fragement queue entry for this fragmented part. */ + ip6af = (struct ip6asfrag *)malloc(sizeof(struct ip6asfrag), M_FRAG6, + M_NOWAIT | M_ZERO); if (ip6af == NULL) goto dropfrag; - bzero(ip6af, sizeof(*ip6af)); ip6af->ip6af_mff = ip6f->ip6f_offlg & IP6F_MORE_FRAG; ip6af->ip6af_off = fragoff; ip6af->ip6af_frglen = frgpartlen; ip6af->ip6af_offset = offset; IP6_REASS_MBUF(ip6af) = m; - if (first_frag) { + if (only_frag) { af6 = (struct ip6asfrag *)q6; goto insert; } + /* Do duplicate, condition, and boundry checks. */ /* * Handle ECN by comparing this segment with the first one; * if CE is set, do not lose CE. - * drop if CE and not-ECT are mixed for the same packet. + * Drop if CE and not-ECT are mixed for the same packet. */ ecn = (ntohl(ip6->ip6_flow) >> 20) & IPTOS_ECN_MASK; ecn0 = q6->ip6q_ecn; if (ecn == IPTOS_ECN_CE) { if (ecn0 == IPTOS_ECN_NOTECT) { - free(ip6af, M_FTABLE); + free(ip6af, M_FRAG6); goto dropfrag; } if (ecn0 != IPTOS_ECN_CE) q6->ip6q_ecn = IPTOS_ECN_CE; } if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT) { - free(ip6af, M_FTABLE); + free(ip6af, M_FRAG6); goto dropfrag; } - /* - * Find a segment which begins after this one does. - */ + /* Find a fragmented part which begins after this one does. */ for (af6 = q6->ip6q_down; af6 != (struct ip6asfrag *)q6; af6 = af6->ip6af_down) if (af6->ip6af_off > ip6af->ip6af_off) break; -#if 0 - /* - * If there is a preceding segment, it may provide some of - * our data already. If so, drop the data from the incoming - * segment. If it provides all of our data, drop us. - */ - if (af6->ip6af_up != (struct ip6asfrag *)q6) { - i = af6->ip6af_up->ip6af_off + af6->ip6af_up->ip6af_frglen - - ip6af->ip6af_off; - if (i > 0) { - if (i >= ip6af->ip6af_frglen) - goto dropfrag; - m_adj(IP6_REASS_MBUF(ip6af), i); - ip6af->ip6af_off += i; - ip6af->ip6af_frglen -= i; - } - } - - /* - * While we overlap succeeding segments trim them or, - * if they are completely covered, dequeue them. - */ - while (af6 != (struct ip6asfrag *)q6 && - ip6af->ip6af_off + ip6af->ip6af_frglen > af6->ip6af_off) { - i = (ip6af->ip6af_off + ip6af->ip6af_frglen) - af6->ip6af_off; - if (i < af6->ip6af_frglen) { - af6->ip6af_frglen -= i; - af6->ip6af_off += i; - m_adj(IP6_REASS_MBUF(af6), i); - break; - } - af6 = af6->ip6af_down; - m_freem(IP6_REASS_MBUF(af6->ip6af_up)); - frag6_deq(af6->ip6af_up, hash); - } -#else /* * If the incoming framgent overlaps some existing fragments in - * the reassembly queue, drop it, since it is dangerous to override - * existing fragments from a security point of view. - * We don't know which fragment is the bad guy - here we trust - * fragment that came in earlier, with no real reason. - * - * Note: due to changes after disabling this part, mbuf passed to - * m_adj() below now does not meet the requirement. + * the reassembly queue, drop both the new fragment and the + * entire reassembly queue. However, if the new fragment + * is an exact duplicate of an existing fragment, only silently + * drop the existing fragment and leave the fragmentation queue + * unchanged, as allowed by the RFC. (RFC 8200, 4.5) */ if (af6->ip6af_up != (struct ip6asfrag *)q6) { - i = af6->ip6af_up->ip6af_off + af6->ip6af_up->ip6af_frglen - - ip6af->ip6af_off; - if (i > 0) { -#if 0 /* suppress the noisy log */ - log(LOG_ERR, "%d bytes of a fragment from %s " - "overlaps the previous fragment\n", - i, ip6_sprintf(ip6buf, &q6->ip6q_src)); -#endif - free(ip6af, M_FTABLE); + if (af6->ip6af_up->ip6af_off + af6->ip6af_up->ip6af_frglen - + ip6af->ip6af_off > 0) { + free(ip6af, M_FRAG6); goto dropfrag; } } if (af6 != (struct ip6asfrag *)q6) { - i = (ip6af->ip6af_off + ip6af->ip6af_frglen) - af6->ip6af_off; - if (i > 0) { -#if 0 /* suppress the noisy log */ - log(LOG_ERR, "%d bytes of a fragment from %s " - "overlaps the succeeding fragment", - i, ip6_sprintf(ip6buf, &q6->ip6q_src)); -#endif - free(ip6af, M_FTABLE); + if (ip6af->ip6af_off + ip6af->ip6af_frglen - + af6->ip6af_off > 0) { + free(ip6af, M_FRAG6); goto dropfrag; } } -#endif insert: #ifdef MAC - if (!first_frag) + if (!only_frag) mac_ip6q_update(m, q6); #endif /* - * Stick new segment in its place; - * check for complete reassembly. - * If not complete, check fragment limit. - * Move to front of packet queue, as we are - * the most recently active fragmented packet. + * Stick new segment in its place; check for complete reassembly. + * If not complete, check fragment limit. Move to front of packet + * queue, as we are the most recently active fragmented packet. */ - frag6_enq(ip6af, af6->ip6af_up, hash); + frag6_enq(ip6af, af6->ip6af_up, bucket); atomic_add_int(&frag6_nfrags, 1); q6->ip6q_nfrag++; -#if 0 /* xxx */ - if (q6 != head->ip6q_next) { - frag6_remque(q6, hash); - frag6_insque_head(q6, head, hash); - } -#endif - next = 0; + plen = 0; for (af6 = q6->ip6q_down; af6 != (struct ip6asfrag *)q6; af6 = af6->ip6af_down) { - if (af6->ip6af_off != next) { + if (af6->ip6af_off != plen) { if (q6->ip6q_nfrag > V_ip6_maxfragsperpacket) { - IP6STAT_INC(ip6s_fragdropped); - frag6_freef(q6, hash); + IP6STAT_ADD(ip6s_fragdropped, q6->ip6q_nfrag); + frag6_freef(q6, bucket); } - IP6Q_UNLOCK(hash); - return IPPROTO_DONE; + IP6QB_UNLOCK(bucket); + return (IPPROTO_DONE); } - next += af6->ip6af_frglen; + plen += af6->ip6af_frglen; } if (af6->ip6af_up->ip6af_mff) { if (q6->ip6q_nfrag > V_ip6_maxfragsperpacket) { - IP6STAT_INC(ip6s_fragdropped); - frag6_freef(q6, hash); + IP6STAT_ADD(ip6s_fragdropped, q6->ip6q_nfrag); + frag6_freef(q6, bucket); } - IP6Q_UNLOCK(hash); - return IPPROTO_DONE; + IP6QB_UNLOCK(bucket); + return (IPPROTO_DONE); } - /* - * Reassembly is complete; concatenate fragments. - */ + /* Reassembly is complete; concatenate fragments. */ ip6af = q6->ip6q_down; t = m = IP6_REASS_MBUF(ip6af); af6 = ip6af->ip6af_down; - frag6_deq(ip6af, hash); + frag6_deq(ip6af, bucket); while (af6 != (struct ip6asfrag *)q6) { m->m_pkthdr.csum_flags &= IP6_REASS_MBUF(af6)->m_pkthdr.csum_flags; @@ -626,13 +668,13 @@ insert: IP6_REASS_MBUF(af6)->m_pkthdr.csum_data; af6dwn = af6->ip6af_down; - frag6_deq(af6, hash); + frag6_deq(af6, bucket); while (t->m_next) t = t->m_next; m_adj(IP6_REASS_MBUF(af6), af6->ip6af_offset); m_demote_pkthdr(IP6_REASS_MBUF(af6)); m_cat(t, IP6_REASS_MBUF(af6)); - free(af6, M_FTABLE); + free(af6, M_FRAG6); af6 = af6dwn; } @@ -640,47 +682,43 @@ insert: m->m_pkthdr.csum_data = (m->m_pkthdr.csum_data & 0xffff) + (m->m_pkthdr.csum_data >> 16); - /* adjust offset to point where the original next header starts */ + /* Adjust offset to point where the original next header starts. */ offset = ip6af->ip6af_offset - sizeof(struct ip6_frag); - free(ip6af, M_FTABLE); + free(ip6af, M_FRAG6); ip6 = mtod(m, struct ip6_hdr *); - ip6->ip6_plen = htons((u_short)next + offset - sizeof(struct ip6_hdr)); + ip6->ip6_plen = htons((u_short)plen + offset - sizeof(struct ip6_hdr)); if (q6->ip6q_ecn == IPTOS_ECN_CE) ip6->ip6_flow |= htonl(IPTOS_ECN_CE << 20); nxt = q6->ip6q_nxt; -#ifdef notyet - *q6->ip6q_nxtp = (u_char)(nxt & 0xff); -#endif if (ip6_deletefraghdr(m, offset, M_NOWAIT) != 0) { - frag6_remque(q6, hash); + frag6_remque(q6, bucket); atomic_subtract_int(&frag6_nfrags, q6->ip6q_nfrag); #ifdef MAC mac_ip6q_destroy(q6); #endif - free(q6, M_FTABLE); + free(q6, M_FRAG6); atomic_subtract_int(&V_frag6_nfragpackets, 1); goto dropfrag; } - /* - * Store NXT to the original. - */ + /* Set nxt(-hdr field value) to the original value. */ m_copyback(m, ip6_get_prevhdr(m, offset), sizeof(uint8_t), (caddr_t)&nxt); - frag6_remque(q6, hash); + frag6_remque(q6, bucket); atomic_subtract_int(&frag6_nfrags, q6->ip6q_nfrag); #ifdef MAC mac_ip6q_reassemble(q6, m); mac_ip6q_destroy(q6); #endif - free(q6, M_FTABLE); + free(q6, M_FRAG6); atomic_subtract_int(&V_frag6_nfragpackets, 1); if (m->m_flags & M_PKTHDR) { /* Isn't it always true? */ - int plen = 0; + + plen = 0; for (t = m; t; t = t->m_next) plen += t->m_len; m->m_pkthdr.len = plen; @@ -699,173 +737,64 @@ insert: m_tag_prepend(m, mtag); #endif - IP6Q_UNLOCK(hash); + IP6QB_UNLOCK(bucket); IP6STAT_INC(ip6s_reassembled); in6_ifstat_inc(dstifp, ifs6_reass_ok); #ifdef RSS - /* - * Queue/dispatch for reprocessing. - */ + /* Queue/dispatch for reprocessing. */ netisr_dispatch(NETISR_IPV6_DIRECT, m); - return IPPROTO_DONE; + return (IPPROTO_DONE); #endif - /* - * Tell launch routine the next header - */ - + /* Tell launch routine the next header. */ *mp = m; *offp = offset; - return nxt; + return (nxt); - dropfrag: - IP6Q_UNLOCK(hash); +dropfrag: + IP6QB_UNLOCK(bucket); in6_ifstat_inc(dstifp, ifs6_reass_fail); IP6STAT_INC(ip6s_fragdropped); m_freem(m); - return IPPROTO_DONE; -} - -/* - * Free a fragment reassembly header and all - * associated datagrams. - */ -static void -frag6_freef(struct ip6q *q6, uint32_t bucket) -{ - struct ip6asfrag *af6, *down6; - - IP6Q_LOCK_ASSERT(bucket); - - for (af6 = q6->ip6q_down; af6 != (struct ip6asfrag *)q6; - af6 = down6) { - struct mbuf *m = IP6_REASS_MBUF(af6); - - down6 = af6->ip6af_down; - frag6_deq(af6, bucket); - - /* - * Return ICMP time exceeded error for the 1st fragment. - * Just free other fragments. - */ - if (af6->ip6af_off == 0) { - struct ip6_hdr *ip6; - - /* adjust pointer */ - ip6 = mtod(m, struct ip6_hdr *); - - /* restore source and destination addresses */ - ip6->ip6_src = q6->ip6q_src; - ip6->ip6_dst = q6->ip6q_dst; - - icmp6_error(m, ICMP6_TIME_EXCEEDED, - ICMP6_TIME_EXCEED_REASSEMBLY, 0); - } else - m_freem(m); - free(af6, M_FTABLE); - } - frag6_remque(q6, bucket); - atomic_subtract_int(&frag6_nfrags, q6->ip6q_nfrag); -#ifdef MAC - mac_ip6q_destroy(q6); -#endif - free(q6, M_FTABLE); - atomic_subtract_int(&V_frag6_nfragpackets, 1); -} - -/* - * Put an ip fragment on a reassembly chain. - * Like insque, but pointers in middle of structure. - */ -static void -frag6_enq(struct ip6asfrag *af6, struct ip6asfrag *up6, - uint32_t bucket __unused) -{ - - IP6Q_LOCK_ASSERT(bucket); - - af6->ip6af_up = up6; - af6->ip6af_down = up6->ip6af_down; - up6->ip6af_down->ip6af_up = af6; - up6->ip6af_down = af6; -} - -/* - * To frag6_enq as remque is to insque. - */ -static void -frag6_deq(struct ip6asfrag *af6, uint32_t bucket __unused) -{ - - IP6Q_LOCK_ASSERT(bucket); - - af6->ip6af_up->ip6af_down = af6->ip6af_down; - af6->ip6af_down->ip6af_up = af6->ip6af_up; -} - -static void -frag6_insque_head(struct ip6q *new, struct ip6q *old, uint32_t bucket) -{ - - IP6Q_LOCK_ASSERT(bucket); - KASSERT(IP6Q_HEAD(bucket) == old, - ("%s: attempt to insert at head of wrong bucket" - " (bucket=%u, old=%p)", __func__, bucket, old)); - - new->ip6q_prev = old; - new->ip6q_next = old->ip6q_next; - old->ip6q_next->ip6q_prev= new; - old->ip6q_next = new; - V_ip6q[bucket].count++; -} - -static void -frag6_remque(struct ip6q *p6, uint32_t bucket) -{ - - IP6Q_LOCK_ASSERT(bucket); - - p6->ip6q_prev->ip6q_next = p6->ip6q_next; - p6->ip6q_next->ip6q_prev = p6->ip6q_prev; - V_ip6q[bucket].count--; + return (IPPROTO_DONE); } /* * IPv6 reassembling timer processing; - * if a timer expires on a reassembly - * queue, discard it. + * if a timer expires on a reassembly queue, discard it. */ void frag6_slowtimo(void) { VNET_ITERATOR_DECL(vnet_iter); struct ip6q *head, *q6; - int i; + uint32_t bucket; VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); - for (i = 0; i < IP6REASS_NHASH; i++) { - IP6Q_LOCK(i); - head = IP6Q_HEAD(i); + for (bucket = 0; bucket < IP6REASS_NHASH; bucket++) { + IP6QB_LOCK(bucket); + head = IP6QB_HEAD(bucket); q6 = head->ip6q_next; if (q6 == NULL) { /* * XXXJTL: This should never happen. This * should turn into an assertion. */ - IP6Q_UNLOCK(i); + IP6QB_UNLOCK(bucket); continue; } while (q6 != head) { --q6->ip6q_ttl; q6 = q6->ip6q_next; if (q6->ip6q_prev->ip6q_ttl == 0) { - IP6STAT_INC(ip6s_fragtimeout); + IP6STAT_ADD(ip6s_fragtimeout, + q6->ip6q_prev->ip6q_nfrag); /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */ - frag6_freef(q6->ip6q_prev, i); + frag6_freef(q6->ip6q_prev, bucket); } } /* @@ -874,36 +803,38 @@ frag6_slowtimo(void) * enough to get down to the new limit. * Note that we drain all reassembly queues if * maxfragpackets is 0 (fragmentation is disabled), - * and don't enforce a limit when maxfragpackets + * and do not enforce a limit when maxfragpackets * is negative. */ while ((V_ip6_maxfragpackets == 0 || (V_ip6_maxfragpackets > 0 && - V_ip6q[i].count > V_ip6_maxfragbucketsize)) && + V_ip6qb[bucket].count > V_ip6_maxfragbucketsize)) && head->ip6q_prev != head) { - IP6STAT_INC(ip6s_fragoverflow); + IP6STAT_ADD(ip6s_fragoverflow, + q6->ip6q_prev->ip6q_nfrag); /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */ - frag6_freef(head->ip6q_prev, i); + frag6_freef(head->ip6q_prev, bucket); } - IP6Q_UNLOCK(i); + IP6QB_UNLOCK(bucket); } /* * If we are still over the maximum number of fragmented * packets, drain off enough to get down to the new limit. */ - i = 0; + bucket = 0; while (V_ip6_maxfragpackets >= 0 && atomic_load_int(&V_frag6_nfragpackets) > (u_int)V_ip6_maxfragpackets) { - IP6Q_LOCK(i); - head = IP6Q_HEAD(i); + IP6QB_LOCK(bucket); + head = IP6QB_HEAD(bucket); if (head->ip6q_prev != head) { - IP6STAT_INC(ip6s_fragoverflow); + IP6STAT_ADD(ip6s_fragoverflow, + q6->ip6q_prev->ip6q_nfrag); /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */ - frag6_freef(head->ip6q_prev, i); + frag6_freef(head->ip6q_prev, bucket); } - IP6Q_UNLOCK(i); - i = (i + 1) % IP6REASS_NHASH; + IP6QB_UNLOCK(bucket); + bucket = (bucket + 1) % IP6REASS_NHASH; } CURVNET_RESTORE(); } @@ -911,6 +842,52 @@ frag6_slowtimo(void) } /* + * Eventhandler to adjust limits in case nmbclusters change. + */ +static void +frag6_change(void *tag) +{ + VNET_ITERATOR_DECL(vnet_iter); + + ip6_maxfrags = IP6_MAXFRAGS; + VNET_LIST_RLOCK_NOSLEEP(); + VNET_FOREACH(vnet_iter) { + CURVNET_SET(vnet_iter); + V_ip6_maxfragpackets = IP6_MAXFRAGPACKETS; + frag6_set_bucketsize(); + CURVNET_RESTORE(); + } + VNET_LIST_RUNLOCK_NOSLEEP(); +} + +/* + * Initialise reassembly queue and fragment identifier. + */ +void +frag6_init(void) +{ + struct ip6q *q6; + uint32_t bucket; + + V_ip6_maxfragpackets = IP6_MAXFRAGPACKETS; + frag6_set_bucketsize(); + for (bucket = 0; bucket < IP6REASS_NHASH; bucket++) { + q6 = IP6QB_HEAD(bucket); + q6->ip6q_next = q6->ip6q_prev = q6; + mtx_init(&V_ip6qb[bucket].lock, "ip6qlock", NULL, MTX_DEF); + V_ip6qb[bucket].count = 0; + } + V_ip6qb_hashseed = arc4random(); + V_ip6_maxfragsperpacket = 64; + if (!IS_DEFAULT_VNET(curvnet)) + return; + + ip6_maxfrags = IP6_MAXFRAGS; + EVENTHANDLER_REGISTER(nmbclusters_change, + frag6_change, NULL, EVENTHANDLER_PRI_ANY); +} + +/* * Drain off all datagram fragments. */ void @@ -918,48 +895,80 @@ frag6_drain(void) { VNET_ITERATOR_DECL(vnet_iter); struct ip6q *head; - int i; + uint32_t bucket; VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); - for (i = 0; i < IP6REASS_NHASH; i++) { - if (IP6Q_TRYLOCK(i) == 0) + for (bucket = 0; bucket < IP6REASS_NHASH; bucket++) { + if (IP6QB_TRYLOCK(bucket) == 0) continue; - head = IP6Q_HEAD(i); + head = IP6QB_HEAD(bucket); while (head->ip6q_next != head) { IP6STAT_INC(ip6s_fragdropped); /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */ - frag6_freef(head->ip6q_next, i); + frag6_freef(head->ip6q_next, bucket); } - IP6Q_UNLOCK(i); + IP6QB_UNLOCK(bucket); } CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } -int -ip6_deletefraghdr(struct mbuf *m, int offset, int wait) +/* + * Put an ip fragment on a reassembly chain. + * Like insque, but pointers in middle of structure. + */ +static void +frag6_enq(struct ip6asfrag *af6, struct ip6asfrag *up6, + uint32_t bucket __unused) { - struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); - struct mbuf *t; - /* Delete frag6 header. */ - if (m->m_len >= offset + sizeof(struct ip6_frag)) { - /* This is the only possible case with !PULLDOWN_TEST. */ - bcopy(ip6, (char *)ip6 + sizeof(struct ip6_frag), - offset); - m->m_data += sizeof(struct ip6_frag); - m->m_len -= sizeof(struct ip6_frag); - } else { - /* This comes with no copy if the boundary is on cluster. */ - if ((t = m_split(m, offset, wait)) == NULL) - return (ENOMEM); - m_adj(t, sizeof(struct ip6_frag)); - m_cat(m, t); - } + IP6QB_LOCK_ASSERT(bucket); - m->m_flags |= M_FRAGMENTED; - return (0); + af6->ip6af_up = up6; + af6->ip6af_down = up6->ip6af_down; + up6->ip6af_down->ip6af_up = af6; + up6->ip6af_down = af6; +} + +/* + * To frag6_enq as remque is to insque. + */ +static void +frag6_deq(struct ip6asfrag *af6, uint32_t bucket __unused) +{ + + IP6QB_LOCK_ASSERT(bucket); + + af6->ip6af_up->ip6af_down = af6->ip6af_down; + af6->ip6af_down->ip6af_up = af6->ip6af_up; +} + +static void +frag6_insque_head(struct ip6q *new, struct ip6q *old, uint32_t bucket) +{ + + IP6QB_LOCK_ASSERT(bucket); + KASSERT(IP6QB_HEAD(bucket) == old, + ("%s: attempt to insert at head of wrong bucket" + " (bucket=%u, old=%p)", __func__, bucket, old)); + + new->ip6q_prev = old; + new->ip6q_next = old->ip6q_next; + old->ip6q_next->ip6q_prev= new; + old->ip6q_next = new; + V_ip6qb[bucket].count++; +} + +static void +frag6_remque(struct ip6q *p6, uint32_t bucket) +{ + + IP6QB_LOCK_ASSERT(bucket); + + p6->ip6q_prev->ip6q_next = p6->ip6q_next; + p6->ip6q_next->ip6q_prev = p6->ip6q_prev; + V_ip6qb[bucket].count--; } diff --git a/freebsd/sys/netinet6/icmp6.c b/freebsd/sys/netinet6/icmp6.c index 6dd25e98..4a35eb8d 100644 --- a/freebsd/sys/netinet6/icmp6.c +++ b/freebsd/sys/netinet6/icmp6.c @@ -142,7 +142,7 @@ static int icmp6_rip6_input(struct mbuf **, int); static int icmp6_ratelimit(const struct in6_addr *, const int, const int); static const char *icmp6_redirect_diag(struct in6_addr *, struct in6_addr *, struct in6_addr *); -static struct mbuf *ni6_input(struct mbuf *, int); +static struct mbuf *ni6_input(struct mbuf *, int, struct prison *); static struct mbuf *ni6_nametodns(const char *, int, int); static int ni6_dnsmatch(const char *, int, const char *, int); static int ni6_addrs(struct icmp6_nodeinfo *, struct mbuf *, @@ -629,6 +629,7 @@ icmp6_input(struct mbuf **mp, int *offp, int proto) case ICMP6_WRUREQUEST: /* ICMP6_FQDN_QUERY */ { enum { WRU, FQDN } mode; + struct prison *pr; if (!V_icmp6_nodeinfo) break; @@ -640,6 +641,18 @@ icmp6_input(struct mbuf **mp, int *offp, int proto) else goto badlen; +#ifndef __rtems__ + pr = NULL; + sx_slock(&allprison_lock); + TAILQ_FOREACH(pr, &allprison, pr_list) + if (pr->pr_vnet == ifp->if_vnet) + break; + sx_sunlock(&allprison_lock); + if (pr == NULL) + pr = curthread->td_ucred->cr_prison; +#else /* __rtems__ */ + pr = &prison0; +#endif /* __rtems__ */ if (mode == FQDN) { #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(struct icmp6_nodeinfo), @@ -647,11 +660,10 @@ icmp6_input(struct mbuf **mp, int *offp, int proto) #endif n = m_copym(m, 0, M_COPYALL, M_NOWAIT); if (n) - n = ni6_input(n, off); + n = ni6_input(n, off, pr); /* XXX meaningless if n == NULL */ noff = sizeof(struct ip6_hdr); } else { - struct prison *pr; u_char *p; int maxhlen, hlen; @@ -685,17 +697,6 @@ icmp6_input(struct mbuf **mp, int *offp, int proto) n = NULL; break; } - maxhlen = M_TRAILINGSPACE(n) - - (sizeof(*nip6) + sizeof(*nicmp6) + 4); -#ifndef __rtems__ - pr = curthread->td_ucred->cr_prison; -#else /* __rtems__ */ - pr = &prison0; -#endif /* __rtems__ */ - mtx_lock(&pr->pr_mtx); - hlen = strlen(pr->pr_hostname); - if (maxhlen > hlen) - maxhlen = hlen; /* * Copy IPv6 and ICMPv6 only. */ @@ -705,6 +706,13 @@ icmp6_input(struct mbuf **mp, int *offp, int proto) bcopy(icmp6, nicmp6, sizeof(struct icmp6_hdr)); p = (u_char *)(nicmp6 + 1); bzero(p, 4); + + maxhlen = M_TRAILINGSPACE(n) - + (sizeof(*nip6) + sizeof(*nicmp6) + 4); + mtx_lock(&pr->pr_mtx); + hlen = strlen(pr->pr_hostname); + if (maxhlen > hlen) + maxhlen = hlen; /* meaningless TTL */ bcopy(pr->pr_hostname, p + 4, maxhlen); mtx_unlock(&pr->pr_mtx); @@ -1173,11 +1181,10 @@ icmp6_mtudisc_update(struct ip6ctlparam *ip6cp, int validated) * with hostname changes by sethostname(3) */ static struct mbuf * -ni6_input(struct mbuf *m, int off) +ni6_input(struct mbuf *m, int off, struct prison *pr) { struct icmp6_nodeinfo *ni6, *nni6; struct mbuf *n = NULL; - struct prison *pr; u_int16_t qtype; int subjlen; int replylen = sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo); @@ -1329,11 +1336,6 @@ ni6_input(struct mbuf *m, int off) * wildcard match, if gethostname(3) side has * truncated hostname. */ -#ifndef __rtems__ - pr = curthread->td_ucred->cr_prison; -#else /* __rtems__ */ - pr = &prison0; -#endif /* __rtems__ */ mtx_lock(&pr->pr_mtx); n = ni6_nametodns(pr->pr_hostname, strlen(pr->pr_hostname), 0); @@ -1458,11 +1460,6 @@ ni6_input(struct mbuf *m, int off) /* * XXX do we really have FQDN in hostname? */ -#ifndef __rtems__ - pr = curthread->td_ucred->cr_prison; -#else /* __rtems__ */ - pr = &prison0; -#endif /* __rtems__ */ mtx_lock(&pr->pr_mtx); n->m_next = ni6_nametodns(pr->pr_hostname, strlen(pr->pr_hostname), oldfqdn); @@ -1669,6 +1666,7 @@ static int ni6_addrs(struct icmp6_nodeinfo *ni6, struct mbuf *m, struct ifnet **ifpp, struct in6_addr *subj) { + struct epoch_tracker et; struct ifnet *ifp; struct in6_ifaddr *ifa6; struct ifaddr *ifa; @@ -1690,10 +1688,9 @@ ni6_addrs(struct icmp6_nodeinfo *ni6, struct mbuf *m, struct ifnet **ifpp, } } - IFNET_RLOCK_NOSLEEP(); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { addrsofif = 0; - IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; @@ -1744,16 +1741,15 @@ ni6_addrs(struct icmp6_nodeinfo *ni6, struct mbuf *m, struct ifnet **ifpp, } addrsofif++; /* count the address */ } - IF_ADDR_RUNLOCK(ifp); if (iffound) { *ifpp = ifp; - IFNET_RUNLOCK_NOSLEEP(); + NET_EPOCH_EXIT(et); return (addrsofif); } addrs += addrsofif; } - IFNET_RUNLOCK_NOSLEEP(); + NET_EPOCH_EXIT(et); return (addrs); } @@ -1762,6 +1758,7 @@ static int ni6_store_addrs(struct icmp6_nodeinfo *ni6, struct icmp6_nodeinfo *nni6, struct ifnet *ifp0, int resid) { + struct epoch_tracker et; struct ifnet *ifp; struct in6_ifaddr *ifa6; struct ifaddr *ifa; @@ -1774,12 +1771,11 @@ ni6_store_addrs(struct icmp6_nodeinfo *ni6, struct icmp6_nodeinfo *nni6, if (ifp0 == NULL && !(niflags & NI_NODEADDR_FLAG_ALL)) return (0); /* needless to copy */ - IFNET_RLOCK_NOSLEEP(); + NET_EPOCH_ENTER(et); ifp = ifp0 ? ifp0 : CK_STAILQ_FIRST(&V_ifnet); again: for (; ifp; ifp = CK_STAILQ_NEXT(ifp, if_link)) { - IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; @@ -1834,13 +1830,12 @@ ni6_store_addrs(struct icmp6_nodeinfo *ni6, struct icmp6_nodeinfo *nni6, /* now we can copy the address */ if (resid < sizeof(struct in6_addr) + sizeof(u_int32_t)) { - IF_ADDR_RUNLOCK(ifp); /* * We give up much more copy. * Set the truncate flag and return. */ nni6->ni_flags |= NI_NODEADDR_FLAG_TRUNCATE; - IFNET_RUNLOCK_NOSLEEP(); + NET_EPOCH_EXIT(et); return (copied); } @@ -1881,7 +1876,6 @@ ni6_store_addrs(struct icmp6_nodeinfo *ni6, struct icmp6_nodeinfo *nni6, resid -= (sizeof(struct in6_addr) + sizeof(u_int32_t)); copied += (sizeof(struct in6_addr) + sizeof(u_int32_t)); } - IF_ADDR_RUNLOCK(ifp); if (ifp0) /* we need search only on the specified IF */ break; } @@ -1893,7 +1887,7 @@ ni6_store_addrs(struct icmp6_nodeinfo *ni6, struct icmp6_nodeinfo *nni6, goto again; } - IFNET_RUNLOCK_NOSLEEP(); + NET_EPOCH_EXIT(et); return (copied); } @@ -1906,7 +1900,7 @@ icmp6_rip6_input(struct mbuf **mp, int off) { struct mbuf *m = *mp; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); - struct inpcb *in6p; + struct inpcb *inp; struct inpcb *last = NULL; struct sockaddr_in6 fromsa; struct icmp6_hdr *icmp6; @@ -1938,25 +1932,25 @@ icmp6_rip6_input(struct mbuf **mp, int off) } INP_INFO_RLOCK_ET(&V_ripcbinfo, et); - CK_LIST_FOREACH(in6p, &V_ripcb, inp_list) { - if ((in6p->inp_vflag & INP_IPV6) == 0) + CK_LIST_FOREACH(inp, &V_ripcb, inp_list) { + if ((inp->inp_vflag & INP_IPV6) == 0) continue; - if (in6p->inp_ip_p != IPPROTO_ICMPV6) + if (inp->inp_ip_p != IPPROTO_ICMPV6) continue; - if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr) && - !IN6_ARE_ADDR_EQUAL(&in6p->in6p_laddr, &ip6->ip6_dst)) + if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) && + !IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &ip6->ip6_dst)) continue; - if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr) && - !IN6_ARE_ADDR_EQUAL(&in6p->in6p_faddr, &ip6->ip6_src)) + if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) && + !IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &ip6->ip6_src)) continue; - INP_RLOCK(in6p); - if (__predict_false(in6p->inp_flags2 & INP_FREED)) { - INP_RUNLOCK(in6p); + INP_RLOCK(inp); + if (__predict_false(inp->inp_flags2 & INP_FREED)) { + INP_RUNLOCK(inp); continue; } if (ICMP6_FILTER_WILLBLOCK(icmp6->icmp6_type, - in6p->in6p_icmp6filt)) { - INP_RUNLOCK(in6p); + inp->in6p_icmp6filt)) { + INP_RUNLOCK(inp); continue; } if (last != NULL) { @@ -2017,7 +2011,7 @@ icmp6_rip6_input(struct mbuf **mp, int off) } INP_RUNLOCK(last); } - last = in6p; + last = inp; } INP_INFO_RUNLOCK_ET(&V_ripcbinfo, et); if (last != NULL) { @@ -2575,13 +2569,14 @@ icmp6_redirect_output(struct mbuf *m0, struct rtentry *rt) { /* target lladdr option */ + struct epoch_tracker et; int len; struct nd_opt_hdr *nd_opt; char *lladdr; - IF_AFDATA_RLOCK(ifp); + NET_EPOCH_ENTER(et); ln = nd6_lookup(router_ll6, 0, ifp); - IF_AFDATA_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); if (ln == NULL) goto nolladdropt; diff --git a/freebsd/sys/netinet6/in6.c b/freebsd/sys/netinet6/in6.c index ef59203e..f3306bc3 100644 --- a/freebsd/sys/netinet6/in6.c +++ b/freebsd/sys/netinet6/in6.c @@ -1392,13 +1392,15 @@ in6_notify_ifa(struct ifnet *ifp, struct in6_ifaddr *ia, * if this is its first address, */ if (hostIsNew != 0) { - IF_ADDR_RLOCK(ifp); + struct epoch_tracker et; + + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ifacount++; } - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); } if (ifacount <= 1 && ifp->if_ioctl) { @@ -1476,9 +1478,10 @@ done: struct in6_ifaddr * in6ifa_ifpforlinklocal(struct ifnet *ifp, int ignoreflags) { + struct epoch_tracker et; struct ifaddr *ifa; - IF_ADDR_RLOCK(ifp); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; @@ -1490,7 +1493,7 @@ in6ifa_ifpforlinklocal(struct ifnet *ifp, int ignoreflags) break; } } - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); return ((struct in6_ifaddr *)ifa); } @@ -1527,9 +1530,10 @@ in6ifa_ifwithaddr(const struct in6_addr *addr, uint32_t zoneid) struct in6_ifaddr * in6ifa_ifpwithaddr(struct ifnet *ifp, const struct in6_addr *addr) { + struct epoch_tracker et; struct ifaddr *ifa; - IF_ADDR_RLOCK(ifp); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; @@ -1538,7 +1542,7 @@ in6ifa_ifpwithaddr(struct ifnet *ifp, const struct in6_addr *addr) break; } } - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); return ((struct in6_ifaddr *)ifa); } @@ -1549,12 +1553,13 @@ in6ifa_ifpwithaddr(struct ifnet *ifp, const struct in6_addr *addr) struct in6_ifaddr * in6ifa_llaonifp(struct ifnet *ifp) { + struct epoch_tracker et; struct sockaddr_in6 *sin6; struct ifaddr *ifa; if (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) return (NULL); - IF_ADDR_RLOCK(ifp); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; @@ -1564,7 +1569,7 @@ in6ifa_llaonifp(struct ifnet *ifp) IN6_IS_ADDR_MC_NODELOCAL(&sin6->sin6_addr)) break; } - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); return ((struct in6_ifaddr *)ifa); } @@ -1701,6 +1706,7 @@ int in6_ifhasaddr(struct ifnet *ifp, struct in6_addr *addr) { struct in6_addr in6; + struct epoch_tracker et; struct ifaddr *ifa; struct in6_ifaddr *ia6; @@ -1709,17 +1715,17 @@ in6_ifhasaddr(struct ifnet *ifp, struct in6_addr *addr) return (0); in6_setscope(&in6, ifp, NULL); - IF_ADDR_RLOCK(ifp); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia6 = (struct in6_ifaddr *)ifa; if (IN6_ARE_ADDR_EQUAL(&ia6->ia_addr.sin6_addr, &in6)) { - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); return (1); } } - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); return (0); } @@ -1823,6 +1829,7 @@ in6_prefixlen2mask(struct in6_addr *maskp, int len) struct in6_ifaddr * in6_ifawithifp(struct ifnet *ifp, struct in6_addr *dst) { + struct epoch_tracker et; int dst_scope = in6_addrscope(dst), blen = -1, tlen; struct ifaddr *ifa; struct in6_ifaddr *besta = NULL; @@ -1836,7 +1843,7 @@ in6_ifawithifp(struct ifnet *ifp, struct in6_addr *dst) * If two or more, return one which matches the dst longest. * If none, return one of global addresses assigned other ifs. */ - IF_ADDR_RLOCK(ifp); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; @@ -1870,7 +1877,7 @@ in6_ifawithifp(struct ifnet *ifp, struct in6_addr *dst) } if (besta) { ifa_ref(&besta->ia_ifa); - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); return (besta); } @@ -1891,23 +1898,23 @@ in6_ifawithifp(struct ifnet *ifp, struct in6_addr *dst) if (ifa != NULL) ifa_ref(ifa); - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); return (struct in6_ifaddr *)ifa; } /* use the last-resort values, that are, deprecated addresses */ if (dep[0]) { ifa_ref((struct ifaddr *)dep[0]); - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); return dep[0]; } if (dep[1]) { ifa_ref((struct ifaddr *)dep[1]); - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); return dep[1]; } - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); return NULL; } @@ -1917,10 +1924,11 @@ in6_ifawithifp(struct ifnet *ifp, struct in6_addr *dst) void in6_if_up(struct ifnet *ifp) { + struct epoch_tracker et; struct ifaddr *ifa; struct in6_ifaddr *ia; - IF_ADDR_RLOCK(ifp); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; @@ -1936,7 +1944,7 @@ in6_if_up(struct ifnet *ifp) arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz)); } } - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); /* * special cases, like 6to4, are handled in in6_ifattach @@ -1947,26 +1955,14 @@ in6_if_up(struct ifnet *ifp) int in6if_do_dad(struct ifnet *ifp) { + if ((ifp->if_flags & IFF_LOOPBACK) != 0) return (0); - - if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) || - (ND_IFINFO(ifp)->flags & ND6_IFF_NO_DAD)) + if ((ifp->if_flags & IFF_MULTICAST) == 0) + return (0); + if ((ND_IFINFO(ifp)->flags & + (ND6_IFF_IFDISABLED | ND6_IFF_NO_DAD)) != 0) return (0); - - /* - * Our DAD routine requires the interface up and running. - * However, some interfaces can be up before the RUNNING - * status. Additionally, users may try to assign addresses - * before the interface becomes up (or running). - * This function returns EAGAIN in that case. - * The caller should mark "tentative" on the address instead of - * performing DAD immediately. - */ - if (!((ifp->if_flags & IFF_UP) && - (ifp->if_drv_flags & IFF_DRV_RUNNING))) - return (EAGAIN); - return (1); } @@ -1977,10 +1973,11 @@ in6if_do_dad(struct ifnet *ifp) void in6_setmaxmtu(void) { + struct epoch_tracker et; unsigned long maxmtu = 0; struct ifnet *ifp; - IFNET_RLOCK_NOSLEEP(); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { /* this function can be called during ifnet initialization */ if (!ifp->if_afdata[AF_INET6]) @@ -1989,7 +1986,7 @@ in6_setmaxmtu(void) IN6_LINKMTU(ifp) > maxmtu) maxmtu = IN6_LINKMTU(ifp); } - IFNET_RUNLOCK_NOSLEEP(); + NET_EPOCH_EXIT(et); if (maxmtu) /* update only when maxmtu is positive */ V_in6_maxmtu = maxmtu; } @@ -2167,18 +2164,19 @@ in6_lltable_rtcheck(struct ifnet *ifp, fibnum = V_rt_add_addr_allfibs ? RT_DEFAULT_FIB : ifp->if_fib; error = fib6_lookup_nh_basic(fibnum, &dst, scopeid, 0, 0, &nh6); if (error != 0 || (nh6.nh_flags & NHF_GATEWAY) || nh6.nh_ifp != ifp) { + struct epoch_tracker et; struct ifaddr *ifa; /* * Create an ND6 cache for an IPv6 neighbor * that is not covered by our own prefix. */ - NET_EPOCH_ENTER(); + NET_EPOCH_ENTER(et); ifa = ifaof_ifpforaddr(l3addr, ifp); if (ifa != NULL) { - NET_EPOCH_EXIT(); + NET_EPOCH_EXIT(et); return 0; } - NET_EPOCH_EXIT(); + NET_EPOCH_EXIT(et); log(LOG_INFO, "IPv6 address: \"%s\" is not on the network\n", ip6_sprintf(ip6buf, &sin6->sin6_addr)); return EINVAL; @@ -2319,16 +2317,13 @@ in6_lltable_lookup(struct lltable *llt, u_int flags, IF_AFDATA_LOCK_ASSERT(llt->llt_ifp); KASSERT(l3addr->sa_family == AF_INET6, ("sin_family %d", l3addr->sa_family)); + KASSERT((flags & (LLE_UNLOCKED | LLE_EXCLUSIVE)) != + (LLE_UNLOCKED | LLE_EXCLUSIVE), + ("wrong lle request flags: %#x", flags)); lle = in6_lltable_find_dst(llt, &sin6->sin6_addr); - if (lle == NULL) return (NULL); - - KASSERT((flags & (LLE_UNLOCKED|LLE_EXCLUSIVE)) != - (LLE_UNLOCKED|LLE_EXCLUSIVE),("wrong lle request flags: 0x%X", - flags)); - if (flags & LLE_UNLOCKED) return (lle); @@ -2336,6 +2331,18 @@ in6_lltable_lookup(struct lltable *llt, u_int flags, LLE_WLOCK(lle); else LLE_RLOCK(lle); + + /* + * If the afdata lock is not held, the LLE may have been unlinked while + * we were blocked on the LLE lock. Check for this case. + */ + if (__predict_false((lle->la_flags & LLE_LINKED) == 0)) { + if (flags & LLE_EXCLUSIVE) + LLE_WUNLOCK(lle); + else + LLE_RUNLOCK(lle); + return (NULL); + } return (lle); } diff --git a/freebsd/sys/netinet6/in6_ifattach.c b/freebsd/sys/netinet6/in6_ifattach.c index 6af4b557..560b4255 100644 --- a/freebsd/sys/netinet6/in6_ifattach.c +++ b/freebsd/sys/netinet6/in6_ifattach.c @@ -246,6 +246,7 @@ generate_tmp_ifid(u_int8_t *seed0, const u_int8_t *seed1, u_int8_t *ret) int in6_get_hw_ifid(struct ifnet *ifp, struct in6_addr *in6) { + struct epoch_tracker et; struct ifaddr *ifa; struct sockaddr_dl *sdl; u_int8_t *addr; @@ -254,7 +255,7 @@ in6_get_hw_ifid(struct ifnet *ifp, struct in6_addr *in6) static u_int8_t allone[8] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; - IF_ADDR_RLOCK(ifp); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_LINK) continue; @@ -266,7 +267,7 @@ in6_get_hw_ifid(struct ifnet *ifp, struct in6_addr *in6) goto found; } - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); return -1; @@ -289,7 +290,7 @@ found: /* look at IEEE802/EUI64 only */ if (addrlen != 8 && addrlen != 6) { - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); return -1; } @@ -299,11 +300,11 @@ found: * card insertion. */ if (bcmp(addr, allzero, addrlen) == 0) { - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); return -1; } if (bcmp(addr, allone, addrlen) == 0) { - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); return -1; } @@ -330,17 +331,25 @@ found: * identifier source (can be renumbered). * we don't do this. */ - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); return -1; + case IFT_INFINIBAND: + if (addrlen != 20) { + NET_EPOCH_EXIT(et); + return -1; + } + bcopy(addr + 12, &in6->s6_addr[8], 8); + break; + default: - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); return -1; } /* sanity check: g bit must not indicate "group" */ if (EUI64_GROUP(in6)) { - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); return -1; } @@ -353,11 +362,11 @@ found: */ if ((in6->s6_addr[8] & ~(EUI64_GBIT | EUI64_UBIT)) == 0x00 && bcmp(&in6->s6_addr[9], allzero, 7) == 0) { - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); return -1; } - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); return 0; } @@ -372,6 +381,7 @@ static int get_ifid(struct ifnet *ifp0, struct ifnet *altifp, struct in6_addr *in6) { + struct epoch_tracker et; struct ifnet *ifp; /* first, try to get it from the interface itself */ @@ -389,7 +399,7 @@ get_ifid(struct ifnet *ifp0, struct ifnet *altifp, } /* next, try to get it from some other hardware interface */ - IFNET_RLOCK_NOSLEEP(); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (ifp == ifp0) continue; @@ -404,11 +414,11 @@ get_ifid(struct ifnet *ifp0, struct ifnet *altifp, nd6log((LOG_DEBUG, "%s: borrow interface identifier from %s\n", if_name(ifp0), if_name(ifp))); - IFNET_RUNLOCK_NOSLEEP(); + NET_EPOCH_EXIT(et); goto success; } } - IFNET_RUNLOCK_NOSLEEP(); + NET_EPOCH_EXIT(et); /* last resort: get from random number source */ if (get_rand_ifid(ifp, in6) == 0) { @@ -700,6 +710,7 @@ in6_ifattach(struct ifnet *ifp, struct ifnet *altifp) * it is rather harmful to have one. */ ND_IFINFO(ifp)->flags &= ~ND6_IFF_AUTO_LINKLOCAL; + ND_IFINFO(ifp)->flags |= ND6_IFF_NO_DAD; break; default: break; @@ -773,9 +784,11 @@ _in6_ifdetach(struct ifnet *ifp, int purgeulp) in6_purgeaddr(ifa); } if (purgeulp) { + IN6_MULTI_LOCK(); in6_pcbpurgeif0(&V_udbinfo, ifp); in6_pcbpurgeif0(&V_ulitecbinfo, ifp); in6_pcbpurgeif0(&V_ripcbinfo, ifp); + IN6_MULTI_UNLOCK(); } /* leave from all multicast groups joined */ in6_purgemaddrs(ifp); @@ -862,36 +875,22 @@ in6_tmpaddrtimer(void *arg) static void in6_purgemaddrs(struct ifnet *ifp) { - struct in6_multi_head purgeinms; - struct in6_multi *inm; - struct ifmultiaddr *ifma, *next; + struct in6_multi_head inmh; - SLIST_INIT(&purgeinms); + SLIST_INIT(&inmh); IN6_MULTI_LOCK(); IN6_MULTI_LIST_LOCK(); - IF_ADDR_WLOCK(ifp); - /* - * Extract list of in6_multi associated with the detaching ifp - * which the PF_INET6 layer is about to release. - */ - restart: - CK_STAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) { - if (ifma->ifma_addr->sa_family != AF_INET6 || - ifma->ifma_protospec == NULL) - continue; - inm = (struct in6_multi *)ifma->ifma_protospec; - in6m_disconnect(inm); - in6m_rele_locked(&purgeinms, inm); - if (__predict_false(ifma6_restart)) { - ifma6_restart = false; - goto restart; - } - } - IF_ADDR_WUNLOCK(ifp); - mld_ifdetach(ifp); + mld_ifdetach(ifp, &inmh); IN6_MULTI_LIST_UNLOCK(); IN6_MULTI_UNLOCK(); - in6m_release_list_deferred(&purgeinms); + in6m_release_list_deferred(&inmh); + + /* + * Make sure all multicast deletions invoking if_ioctl() are + * completed before returning. Else we risk accessing a freed + * ifnet structure pointer. + */ + in6m_release_wait(); } void diff --git a/freebsd/sys/netinet6/in6_mcast.c b/freebsd/sys/netinet6/in6_mcast.c index 3824645d..44d20612 100644 --- a/freebsd/sys/netinet6/in6_mcast.c +++ b/freebsd/sys/netinet6/in6_mcast.c @@ -104,7 +104,8 @@ RB_GENERATE(ip6_msource_tree, ip6_msource, im6s_link, ip6_msource_cmp); /* * Locking: - * - Lock order is: Giant, INP_WLOCK, IN6_MULTI_LOCK, MLD_LOCK, IF_ADDR_LOCK. + * - Lock order is: Giant, IN6_MULTI_LOCK, INP_WLOCK, + * IN6_MULTI_LIST_LOCK, MLD_LOCK, IF_ADDR_LOCK. * - The IF_ADDR_LOCK is implicitly taken by in6m_lookup() earlier, however * it can be taken by code in net/if.c also. * - ip6_moptions and in6_mfilter are covered by the INP_WLOCK. @@ -136,12 +137,11 @@ static int im6f_prune(struct in6_mfilter *, const struct sockaddr_in6 *); static void im6f_purge(struct in6_mfilter *); static void im6f_rollback(struct in6_mfilter *); static void im6f_reap(struct in6_mfilter *); -static int im6o_grow(struct ip6_moptions *); -static size_t im6o_match_group(const struct ip6_moptions *, +static struct in6_mfilter * + im6o_match_group(const struct ip6_moptions *, const struct ifnet *, const struct sockaddr *); static struct in6_msource * - im6o_match_source(const struct ip6_moptions *, const size_t, - const struct sockaddr *); + im6o_match_source(struct in6_mfilter *, const struct sockaddr *); static void im6s_merge(struct ip6_msource *ims, const struct in6_msource *lims, const int rollback); static int in6_getmulti(struct ifnet *, const struct in6_addr *, @@ -192,7 +192,6 @@ static SYSCTL_NODE(_net_inet6_ip6_mcast, OID_AUTO, filters, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_ip6_mcast_filters, "Per-interface stack-wide source filters"); -int ifma6_restart = 0; #ifdef KTR /* * Inline function which wraps assertions for a valid ifp. @@ -231,55 +230,25 @@ im6f_init(struct in6_mfilter *imf, const int st0, const int st1) imf->im6f_st[1] = st1; } -/* - * Resize the ip6_moptions vector to the next power-of-two minus 1. - * May be called with locks held; do not sleep. - */ -static int -im6o_grow(struct ip6_moptions *imo) +struct in6_mfilter * +ip6_mfilter_alloc(const int mflags, const int st0, const int st1) { - struct in6_multi **nmships; - struct in6_multi **omships; - struct in6_mfilter *nmfilters; - struct in6_mfilter *omfilters; - size_t idx; - size_t newmax; - size_t oldmax; - - nmships = NULL; - nmfilters = NULL; - omships = imo->im6o_membership; - omfilters = imo->im6o_mfilters; - oldmax = imo->im6o_max_memberships; - newmax = ((oldmax + 1) * 2) - 1; - - if (newmax <= IPV6_MAX_MEMBERSHIPS) { - nmships = (struct in6_multi **)realloc(omships, - sizeof(struct in6_multi *) * newmax, M_IP6MOPTS, M_NOWAIT); - nmfilters = (struct in6_mfilter *)realloc(omfilters, - sizeof(struct in6_mfilter) * newmax, M_IN6MFILTER, - M_NOWAIT); - if (nmships != NULL && nmfilters != NULL) { - /* Initialize newly allocated source filter heads. */ - for (idx = oldmax; idx < newmax; idx++) { - im6f_init(&nmfilters[idx], MCAST_UNDEFINED, - MCAST_EXCLUDE); - } - imo->im6o_max_memberships = newmax; - imo->im6o_membership = nmships; - imo->im6o_mfilters = nmfilters; - } - } + struct in6_mfilter *imf; - if (nmships == NULL || nmfilters == NULL) { - if (nmships != NULL) - free(nmships, M_IP6MOPTS); - if (nmfilters != NULL) - free(nmfilters, M_IN6MFILTER); - return (ETOOMANYREFS); - } + imf = malloc(sizeof(*imf), M_IN6MFILTER, mflags); - return (0); + if (imf != NULL) + im6f_init(imf, st0, st1); + + return (imf); +} + +void +ip6_mfilter_free(struct in6_mfilter *imf) +{ + + im6f_purge(imf); + free(imf, M_IN6MFILTER); } /* @@ -287,36 +256,27 @@ im6o_grow(struct ip6_moptions *imo) * which matches the specified group, and optionally an interface. * Return its index into the array, or -1 if not found. */ -static size_t +static struct in6_mfilter * im6o_match_group(const struct ip6_moptions *imo, const struct ifnet *ifp, const struct sockaddr *group) { const struct sockaddr_in6 *gsin6; - struct in6_multi **pinm; - int idx; - int nmships; + struct in6_mfilter *imf; + struct in6_multi *inm; - gsin6 = (const struct sockaddr_in6 *)group; + gsin6 = (const struct sockaddr_in6 *)group; - /* The im6o_membership array may be lazy allocated. */ - if (imo->im6o_membership == NULL || imo->im6o_num_memberships == 0) - return (-1); - - nmships = imo->im6o_num_memberships; - pinm = &imo->im6o_membership[0]; - for (idx = 0; idx < nmships; idx++, pinm++) { - if (*pinm == NULL) + IP6_MFILTER_FOREACH(imf, &imo->im6o_head) { + inm = imf->im6f_in6m; + if (inm == NULL) continue; - if ((ifp == NULL || ((*pinm)->in6m_ifp == ifp)) && - IN6_ARE_ADDR_EQUAL(&(*pinm)->in6m_addr, + if ((ifp == NULL || (inm->in6m_ifp == ifp)) && + IN6_ARE_ADDR_EQUAL(&inm->in6m_addr, &gsin6->sin6_addr)) { break; } } - if (idx >= nmships) - idx = -1; - - return (idx); + return (imf); } /* @@ -331,22 +291,13 @@ im6o_match_group(const struct ip6_moptions *imo, const struct ifnet *ifp, * it exists, which may not be the desired behaviour. */ static struct in6_msource * -im6o_match_source(const struct ip6_moptions *imo, const size_t gidx, - const struct sockaddr *src) +im6o_match_source(struct in6_mfilter *imf, const struct sockaddr *src) { struct ip6_msource find; - struct in6_mfilter *imf; struct ip6_msource *ims; const sockunion_t *psa; KASSERT(src->sa_family == AF_INET6, ("%s: !AF_INET6", __func__)); - KASSERT(gidx != -1 && gidx < imo->im6o_num_memberships, - ("%s: invalid index %d\n", __func__, (int)gidx)); - - /* The im6o_mfilters array may be lazy allocated. */ - if (imo->im6o_mfilters == NULL) - return (NULL); - imf = &imo->im6o_mfilters[gidx]; psa = (const sockunion_t *)src; find.im6s_addr = psa->sin6.sin6_addr; @@ -366,14 +317,14 @@ int im6o_mc_filter(const struct ip6_moptions *imo, const struct ifnet *ifp, const struct sockaddr *group, const struct sockaddr *src) { - size_t gidx; + struct in6_mfilter *imf; struct in6_msource *ims; int mode; KASSERT(ifp != NULL, ("%s: null ifp", __func__)); - gidx = im6o_match_group(imo, ifp, group); - if (gidx == -1) + imf = im6o_match_group(imo, ifp, group); + if (imf == NULL) return (MCAST_NOTGMEMBER); /* @@ -385,8 +336,8 @@ im6o_mc_filter(const struct ip6_moptions *imo, const struct ifnet *ifp, * NOTE: We are comparing group state here at MLD t1 (now) * with socket-layer t0 (since last downcall). */ - mode = imo->im6o_mfilters[gidx].im6f_st[1]; - ims = im6o_match_source(imo, gidx, src); + mode = imf->im6f_st[1]; + ims = im6o_match_source(imf, src); if ((ims == NULL && mode == MCAST_INCLUDE) || (ims != NULL && ims->im6sl_st[0] != mode)) @@ -407,6 +358,7 @@ static int in6_getmulti(struct ifnet *ifp, const struct in6_addr *group, struct in6_multi **pinm) { + struct epoch_tracker et; struct sockaddr_in6 gsin6; struct ifmultiaddr *ifma; struct in6_multi *inm; @@ -422,7 +374,10 @@ in6_getmulti(struct ifnet *ifp, const struct in6_addr *group, IN6_MULTI_LOCK_ASSERT(); IN6_MULTI_LIST_LOCK(); IF_ADDR_WLOCK(ifp); + NET_EPOCH_ENTER(et); inm = in6m_lookup_locked(ifp, group); + NET_EPOCH_EXIT(et); + if (inm != NULL) { /* * If we already joined this group, just bump the @@ -587,7 +542,15 @@ in6m_release_list_deferred(struct in6_multi_head *inmh) } void -in6m_disconnect(struct in6_multi *inm) +in6m_release_wait(void) +{ + + /* Wait for all jobs to complete. */ + gtaskqueue_drain_all(free_gtask.gt_taskqueue); +} + +void +in6m_disconnect_locked(struct in6_multi_head *inmh, struct in6_multi *inm) { struct ifnet *ifp; struct ifaddr *ifa; @@ -595,10 +558,12 @@ in6m_disconnect(struct in6_multi *inm) struct in6_multi_mship *imm, *imm_tmp; struct ifmultiaddr *ifma, *ll_ifma; - ifp = inm->in6m_ifp; + IN6_MULTI_LIST_LOCK_ASSERT(); + ifp = inm->in6m_ifp; if (ifp == NULL) - return; + return; /* already called */ + inm->in6m_ifp = NULL; IF_ADDR_WLOCK_ASSERT(ifp); ifma = inm->in6m_ifma; @@ -617,7 +582,6 @@ in6m_disconnect(struct in6_multi *inm) MPASS(ll_ifma->ifma_llifma == NULL); MPASS(ll_ifma->ifma_ifp == ifp); if (--ll_ifma->ifma_refcount == 0) { - ifma6_restart = true; if (ll_ifma->ifma_flags & IFMA_F_ENQUEUED) { CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma, ifmultiaddr, ifma_link); ll_ifma->ifma_flags &= ~IFMA_F_ENQUEUED; @@ -635,28 +599,12 @@ in6m_disconnect(struct in6_multi *inm) if (inm == imm->i6mm_maddr) { LIST_REMOVE(imm, i6mm_chain); free(imm, M_IP6MADDR); + in6m_rele_locked(inmh, inm); } } } } -void -in6m_release_deferred(struct in6_multi *inm) -{ - struct in6_multi_head tmp; - - IN6_MULTI_LIST_LOCK_ASSERT(); - KASSERT(inm->in6m_refcount > 0, ("refcount == %d inm: %p", inm->in6m_refcount, inm)); - if (--inm->in6m_refcount == 0) { - MPASS(inm->in6m_ifp == NULL); - SLIST_INIT(&tmp); - inm->in6m_ifma->ifma_protospec = NULL; - MPASS(inm->in6m_ifma->ifma_llifma == NULL); - SLIST_INSERT_HEAD(&tmp, inm, in6m_nrele); - in6m_release_list_deferred(&tmp); - } -} - static void in6m_release_task(void *arg __unused) { @@ -1256,6 +1204,7 @@ in6_joingroup_locked(struct ifnet *ifp, const struct in6_addr *mcaddr, /*const*/ struct in6_mfilter *imf, struct in6_multi **pinm, const int delay) { + struct in6_multi_head inmh; struct in6_mfilter timf; struct in6_multi *inm; struct ifmultiaddr *ifma; @@ -1264,7 +1213,6 @@ in6_joingroup_locked(struct ifnet *ifp, const struct in6_addr *mcaddr, char ip6tbuf[INET6_ADDRSTRLEN]; #endif -#ifdef INVARIANTS /* * Sanity: Check scope zone ID was set for ifp, if and * only if group is scoped to an interface. @@ -1276,7 +1224,6 @@ in6_joingroup_locked(struct ifnet *ifp, const struct in6_addr *mcaddr, KASSERT(mcaddr->s6_addr16[1] != 0, ("%s: scope zone ID not set", __func__)); } -#endif IN6_MULTI_LOCK_ASSERT(); IN6_MULTI_LIST_UNLOCK_ASSERT(); @@ -1317,22 +1264,26 @@ in6_joingroup_locked(struct ifnet *ifp, const struct in6_addr *mcaddr, } out_in6m_release: + SLIST_INIT(&inmh); if (error) { + struct epoch_tracker et; + CTR2(KTR_MLD, "%s: dropping ref on %p", __func__, inm); - IF_ADDR_RLOCK(ifp); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_protospec == inm) { ifma->ifma_protospec = NULL; break; } } - in6m_disconnect(inm); - in6m_release_deferred(inm); - IF_ADDR_RUNLOCK(ifp); + in6m_disconnect_locked(&inmh, inm); + in6m_rele_locked(&inmh, inm); + NET_EPOCH_EXIT(et); } else { *pinm = inm; } IN6_MULTI_LIST_UNLOCK(); + in6m_release_list_deferred(&inmh); return (error); } @@ -1366,6 +1317,7 @@ in6_leavegroup(struct in6_multi *inm, /*const*/ struct in6_mfilter *imf) int in6_leavegroup_locked(struct in6_multi *inm, /*const*/ struct in6_mfilter *imf) { + struct in6_multi_head inmh; struct in6_mfilter timf; struct ifnet *ifp; int error; @@ -1415,13 +1367,15 @@ in6_leavegroup_locked(struct in6_multi *inm, /*const*/ struct in6_mfilter *imf) CTR2(KTR_MLD, "%s: dropping ref on %p", __func__, inm); if (ifp) IF_ADDR_WLOCK(ifp); - if (inm->in6m_refcount == 1 && inm->in6m_ifp != NULL) - in6m_disconnect(inm); - in6m_release_deferred(inm); + + SLIST_INIT(&inmh); + if (inm->in6m_refcount == 1) + in6m_disconnect_locked(&inmh, inm); + in6m_rele_locked(&inmh, inm); if (ifp) IF_ADDR_WUNLOCK(ifp); IN6_MULTI_LIST_UNLOCK(); - + in6m_release_list_deferred(&inmh); return (error); } @@ -1447,7 +1401,6 @@ in6p_block_unblock_source(struct inpcb *inp, struct sockopt *sopt) struct ip6_moptions *imo; struct in6_msource *ims; struct in6_multi *inm; - size_t idx; uint16_t fmode; int error, doblock; #ifdef KTR @@ -1504,16 +1457,12 @@ in6p_block_unblock_source(struct inpcb *inp, struct sockopt *sopt) * Check if we are actually a member of this group. */ imo = in6p_findmoptions(inp); - idx = im6o_match_group(imo, ifp, &gsa->sa); - if (idx == -1 || imo->im6o_mfilters == NULL) { + imf = im6o_match_group(imo, ifp, &gsa->sa); + if (imf == NULL) { error = EADDRNOTAVAIL; goto out_in6p_locked; } - - KASSERT(imo->im6o_mfilters != NULL, - ("%s: im6o_mfilters not allocated", __func__)); - imf = &imo->im6o_mfilters[idx]; - inm = imo->im6o_membership[idx]; + inm = imf->im6f_in6m; /* * Attempting to use the delta-based API on an @@ -1531,7 +1480,7 @@ in6p_block_unblock_source(struct inpcb *inp, struct sockopt *sopt) * Asked to unblock, but nothing to unblock. * If adding a new block entry, allocate it. */ - ims = im6o_match_source(imo, idx, &ssa->sa); + ims = im6o_match_source(imf, &ssa->sa); if ((ims != NULL && doblock) || (ims == NULL && !doblock)) { CTR3(KTR_MLD, "%s: source %s %spresent", __func__, ip6_sprintf(ip6tbuf, &ssa->sin6.sin6_addr), @@ -1601,9 +1550,6 @@ static struct ip6_moptions * in6p_findmoptions(struct inpcb *inp) { struct ip6_moptions *imo; - struct in6_multi **immp; - struct in6_mfilter *imfp; - size_t idx; INP_WLOCK(inp); if (inp->in6p_moptions != NULL) @@ -1612,27 +1558,14 @@ in6p_findmoptions(struct inpcb *inp) INP_WUNLOCK(inp); imo = malloc(sizeof(*imo), M_IP6MOPTS, M_WAITOK); - immp = malloc(sizeof(*immp) * IPV6_MIN_MEMBERSHIPS, M_IP6MOPTS, - M_WAITOK | M_ZERO); - imfp = malloc(sizeof(struct in6_mfilter) * IPV6_MIN_MEMBERSHIPS, - M_IN6MFILTER, M_WAITOK); imo->im6o_multicast_ifp = NULL; imo->im6o_multicast_hlim = V_ip6_defmcasthlim; imo->im6o_multicast_loop = in6_mcast_loop; - imo->im6o_num_memberships = 0; - imo->im6o_max_memberships = IPV6_MIN_MEMBERSHIPS; - imo->im6o_membership = immp; - - /* Initialize per-group source filters. */ - for (idx = 0; idx < IPV6_MIN_MEMBERSHIPS; idx++) - im6f_init(&imfp[idx], MCAST_UNDEFINED, MCAST_EXCLUDE); - imo->im6o_mfilters = imfp; + STAILQ_INIT(&imo->im6o_head); INP_WLOCK(inp); if (inp->in6p_moptions != NULL) { - free(imfp, M_IN6MFILTER); - free(immp, M_IP6MOPTS); free(imo, M_IP6MOPTS); return (inp->in6p_moptions); } @@ -1652,33 +1585,26 @@ in6p_findmoptions(struct inpcb *inp) static void inp_gcmoptions(struct ip6_moptions *imo) { - struct in6_mfilter *imf; + struct in6_mfilter *imf; struct in6_multi *inm; struct ifnet *ifp; - size_t idx, nmships; - - nmships = imo->im6o_num_memberships; - for (idx = 0; idx < nmships; ++idx) { - imf = imo->im6o_mfilters ? &imo->im6o_mfilters[idx] : NULL; - if (imf) - im6f_leave(imf); - inm = imo->im6o_membership[idx]; - ifp = inm->in6m_ifp; - if (ifp != NULL) { - CURVNET_SET(ifp->if_vnet); - (void)in6_leavegroup(inm, imf); - CURVNET_RESTORE(); - } else { - (void)in6_leavegroup(inm, imf); - } - if (imf) - im6f_purge(imf); - } - if (imo->im6o_mfilters) - free(imo->im6o_mfilters, M_IN6MFILTER); - free(imo->im6o_membership, M_IP6MOPTS); - free(imo, M_IP6MOPTS); + while ((imf = ip6_mfilter_first(&imo->im6o_head)) != NULL) { + ip6_mfilter_remove(&imo->im6o_head, imf); + + im6f_leave(imf); + if ((inm = imf->im6f_in6m) != NULL) { + if ((ifp = inm->in6m_ifp) != NULL) { + CURVNET_SET(ifp->if_vnet); + (void)in6_leavegroup(inm, imf); + CURVNET_RESTORE(); + } else { + (void)in6_leavegroup(inm, imf); + } + } + ip6_mfilter_free(imf); + } + free(imo, M_IP6MOPTS); } void @@ -1707,7 +1633,7 @@ in6p_get_source_filters(struct inpcb *inp, struct sockopt *sopt) struct sockaddr_storage *ptss; struct sockaddr_storage *tss; int error; - size_t idx, nsrcs, ncsrcs; + size_t nsrcs, ncsrcs; INP_WLOCK_ASSERT(inp); @@ -1741,12 +1667,11 @@ in6p_get_source_filters(struct inpcb *inp, struct sockopt *sopt) /* * Lookup group on the socket. */ - idx = im6o_match_group(imo, ifp, &gsa->sa); - if (idx == -1 || imo->im6o_mfilters == NULL) { + imf = im6o_match_group(imo, ifp, &gsa->sa); + if (imf == NULL) { INP_WUNLOCK(inp); return (EADDRNOTAVAIL); } - imf = &imo->im6o_mfilters[idx]; /* * Ignore memberships which are in limbo. @@ -1905,7 +1830,7 @@ ip6_getmoptions(struct inpcb *inp, struct sockopt *sopt) * Returns NULL if no ifp could be found. */ static struct ifnet * -in6p_lookup_mcast_ifp(const struct inpcb *in6p, +in6p_lookup_mcast_ifp(const struct inpcb *inp, const struct sockaddr_in6 *gsin6) { struct nhop6_basic nh6; @@ -1913,13 +1838,13 @@ in6p_lookup_mcast_ifp(const struct inpcb *in6p, uint32_t scopeid; uint32_t fibnum; - KASSERT(in6p->inp_vflag & INP_IPV6, + KASSERT(inp->inp_vflag & INP_IPV6, ("%s: not INP_IPV6 inpcb", __func__)); KASSERT(gsin6->sin6_family == AF_INET6, ("%s: not AF_INET6 group", __func__)); in6_splitscope(&gsin6->sin6_addr, &dst, &scopeid); - fibnum = in6p ? in6p->inp_inc.inc_fibnum : RT_DEFAULT_FIB; + fibnum = inp ? inp->inp_inc.inc_fibnum : RT_DEFAULT_FIB; if (fib6_lookup_nh_basic(fibnum, &dst, scopeid, 0, 0, &nh6) != 0) return (NULL); @@ -1935,6 +1860,7 @@ in6p_lookup_mcast_ifp(const struct inpcb *in6p, static int in6p_join_group(struct inpcb *inp, struct sockopt *sopt) { + struct in6_multi_head inmh; struct group_source_req gsr; sockunion_t *gsa, *ssa; struct ifnet *ifp; @@ -1942,14 +1868,12 @@ in6p_join_group(struct inpcb *inp, struct sockopt *sopt) struct ip6_moptions *imo; struct in6_multi *inm; struct in6_msource *lims; - size_t idx; int error, is_new; + SLIST_INIT(&inmh); ifp = NULL; - imf = NULL; lims = NULL; error = 0; - is_new = 0; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; @@ -2050,13 +1974,25 @@ in6p_join_group(struct inpcb *inp, struct sockopt *sopt) */ (void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL); + IN6_MULTI_LOCK(); + + /* + * Find the membership in the membership list. + */ imo = in6p_findmoptions(inp); - idx = im6o_match_group(imo, ifp, &gsa->sa); - if (idx == -1) { + imf = im6o_match_group(imo, ifp, &gsa->sa); + if (imf == NULL) { is_new = 1; + inm = NULL; + + if (ip6_mfilter_count(&imo->im6o_head) >= IPV6_MAX_MEMBERSHIPS) { + error = ENOMEM; + goto out_in6p_locked; + } } else { - inm = imo->im6o_membership[idx]; - imf = &imo->im6o_mfilters[idx]; + is_new = 0; + inm = imf->im6f_in6m; + if (ssa->ss.ss_family != AF_UNSPEC) { /* * MCAST_JOIN_SOURCE_GROUP on an exclusive membership @@ -2083,7 +2019,7 @@ in6p_join_group(struct inpcb *inp, struct sockopt *sopt) * full-state SSM API with the delta-based API, * which is discouraged in the relevant RFCs. */ - lims = im6o_match_source(imo, idx, &ssa->sa); + lims = im6o_match_source(imf, &ssa->sa); if (lims != NULL /*&& lims->im6sl_st[1] == MCAST_INCLUDE*/) { error = EADDRNOTAVAIL; @@ -2111,27 +2047,6 @@ in6p_join_group(struct inpcb *inp, struct sockopt *sopt) */ INP_WLOCK_ASSERT(inp); - if (is_new) { - if (imo->im6o_num_memberships == imo->im6o_max_memberships) { - error = im6o_grow(imo); - if (error) - goto out_in6p_locked; - } - /* - * Allocate the new slot upfront so we can deal with - * grafting the new source filter in same code path - * as for join-source on existing membership. - */ - idx = imo->im6o_num_memberships; - imo->im6o_membership[idx] = NULL; - imo->im6o_num_memberships++; - KASSERT(imo->im6o_mfilters != NULL, - ("%s: im6f_mfilters vector was not allocated", __func__)); - imf = &imo->im6o_mfilters[idx]; - KASSERT(RB_EMPTY(&imf->im6f_sources), - ("%s: im6f_sources not empty", __func__)); - } - /* * Graft new source into filter list for this inpcb's * membership of the group. The in6_multi may not have @@ -2147,7 +2062,11 @@ in6p_join_group(struct inpcb *inp, struct sockopt *sopt) /* Membership starts in IN mode */ if (is_new) { CTR1(KTR_MLD, "%s: new join w/source", __func__); - im6f_init(imf, MCAST_UNDEFINED, MCAST_INCLUDE); + imf = ip6_mfilter_alloc(M_NOWAIT, MCAST_UNDEFINED, MCAST_INCLUDE); + if (imf == NULL) { + error = ENOMEM; + goto out_in6p_locked; + } } else { CTR2(KTR_MLD, "%s: %s source", __func__, "allow"); } @@ -2156,77 +2075,88 @@ in6p_join_group(struct inpcb *inp, struct sockopt *sopt) CTR1(KTR_MLD, "%s: merge imf state failed", __func__); error = ENOMEM; - goto out_im6o_free; + goto out_in6p_locked; } } else { /* No address specified; Membership starts in EX mode */ if (is_new) { CTR1(KTR_MLD, "%s: new join w/o source", __func__); - im6f_init(imf, MCAST_UNDEFINED, MCAST_EXCLUDE); + imf = ip6_mfilter_alloc(M_NOWAIT, MCAST_UNDEFINED, MCAST_EXCLUDE); + if (imf == NULL) { + error = ENOMEM; + goto out_in6p_locked; + } } } /* * Begin state merge transaction at MLD layer. */ - in_pcbref(inp); - INP_WUNLOCK(inp); - IN6_MULTI_LOCK(); - if (is_new) { + in_pcbref(inp); + INP_WUNLOCK(inp); + error = in6_joingroup_locked(ifp, &gsa->sin6.sin6_addr, imf, - &inm, 0); + &imf->im6f_in6m, 0); + + INP_WLOCK(inp); + if (in_pcbrele_wlocked(inp)) { + error = ENXIO; + goto out_in6p_unlocked; + } if (error) { - IN6_MULTI_UNLOCK(); - goto out_im6o_free; + goto out_in6p_locked; } - in6m_acquire(inm); - imo->im6o_membership[idx] = inm; + /* + * NOTE: Refcount from in6_joingroup_locked() + * is protecting membership. + */ } else { CTR1(KTR_MLD, "%s: merge inm state", __func__); IN6_MULTI_LIST_LOCK(); error = in6m_merge(inm, imf); - if (error) + if (error) { CTR1(KTR_MLD, "%s: failed to merge inm state", __func__); - else { - CTR1(KTR_MLD, "%s: doing mld downcall", __func__); - error = mld_change_state(inm, 0); - if (error) - CTR1(KTR_MLD, "%s: failed mld downcall", - __func__); + IN6_MULTI_LIST_UNLOCK(); + im6f_rollback(imf); + im6f_reap(imf); + goto out_in6p_locked; } + CTR1(KTR_MLD, "%s: doing mld downcall", __func__); + error = mld_change_state(inm, 0); IN6_MULTI_LIST_UNLOCK(); - } - IN6_MULTI_UNLOCK(); - INP_WLOCK(inp); - if (in_pcbrele_wlocked(inp)) - return (ENXIO); - if (error) { - im6f_rollback(imf); - if (is_new) - im6f_purge(imf); - else + if (error) { + CTR1(KTR_MLD, "%s: failed mld downcall", + __func__); + im6f_rollback(imf); im6f_reap(imf); - } else { - im6f_commit(imf); - } - -out_im6o_free: - if (error && is_new) { - inm = imo->im6o_membership[idx]; - if (inm != NULL) { - IN6_MULTI_LIST_LOCK(); - in6m_release_deferred(inm); - IN6_MULTI_LIST_UNLOCK(); + goto out_in6p_locked; } - imo->im6o_membership[idx] = NULL; - --imo->im6o_num_memberships; } + if (is_new) + ip6_mfilter_insert(&imo->im6o_head, imf); + + im6f_commit(imf); + imf = NULL; + out_in6p_locked: INP_WUNLOCK(inp); +out_in6p_unlocked: + IN6_MULTI_UNLOCK(); + + if (is_new && imf) { + if (imf->im6f_in6m != NULL) { + struct in6_multi_head inmh; + + SLIST_INIT(&inmh); + SLIST_INSERT_HEAD(&inmh, imf->im6f_in6m, in6m_defer); + in6m_release_list_deferred(&inmh); + } + ip6_mfilter_free(imf); + } return (error); } @@ -2245,8 +2175,8 @@ in6p_leave_group(struct inpcb *inp, struct sockopt *sopt) struct in6_msource *ims; struct in6_multi *inm; uint32_t ifindex; - size_t idx; - int error, is_final; + int error; + bool is_final; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif @@ -2254,7 +2184,7 @@ in6p_leave_group(struct inpcb *inp, struct sockopt *sopt) ifp = NULL; ifindex = 0; error = 0; - is_final = 1; + is_final = true; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; @@ -2372,20 +2302,21 @@ in6p_leave_group(struct inpcb *inp, struct sockopt *sopt) CTR2(KTR_MLD, "%s: ifp = %p", __func__, ifp); KASSERT(ifp != NULL, ("%s: ifp did not resolve", __func__)); + IN6_MULTI_LOCK(); + /* - * Find the membership in the membership array. + * Find the membership in the membership list. */ imo = in6p_findmoptions(inp); - idx = im6o_match_group(imo, ifp, &gsa->sa); - if (idx == -1) { + imf = im6o_match_group(imo, ifp, &gsa->sa); + if (imf == NULL) { error = EADDRNOTAVAIL; goto out_in6p_locked; } - inm = imo->im6o_membership[idx]; - imf = &imo->im6o_mfilters[idx]; + inm = imf->im6f_in6m; if (ssa->ss.ss_family != AF_UNSPEC) - is_final = 0; + is_final = false; /* * Begin state merge transaction at socket layer. @@ -2397,13 +2328,14 @@ in6p_leave_group(struct inpcb *inp, struct sockopt *sopt) * MCAST_LEAVE_SOURCE_GROUP is only valid for inclusive memberships. */ if (is_final) { + ip6_mfilter_remove(&imo->im6o_head, imf); im6f_leave(imf); } else { if (imf->im6f_st[0] == MCAST_EXCLUDE) { error = EADDRNOTAVAIL; goto out_in6p_locked; } - ims = im6o_match_source(imo, idx, &ssa->sa); + ims = im6o_match_source(imf, &ssa->sa); if (ims == NULL) { CTR3(KTR_MLD, "%s: source %p %spresent", __func__, ip6_sprintf(ip6tbuf, &ssa->sin6.sin6_addr), @@ -2423,56 +2355,47 @@ in6p_leave_group(struct inpcb *inp, struct sockopt *sopt) /* * Begin state merge transaction at MLD layer. */ - in_pcbref(inp); - INP_WUNLOCK(inp); - IN6_MULTI_LOCK(); - - if (is_final) { - /* - * Give up the multicast address record to which - * the membership points. - */ - (void)in6_leavegroup_locked(inm, imf); - } else { + if (!is_final) { CTR1(KTR_MLD, "%s: merge inm state", __func__); IN6_MULTI_LIST_LOCK(); error = in6m_merge(inm, imf); - if (error) + if (error) { CTR1(KTR_MLD, "%s: failed to merge inm state", __func__); - else { - CTR1(KTR_MLD, "%s: doing mld downcall", __func__); - error = mld_change_state(inm, 0); - if (error) - CTR1(KTR_MLD, "%s: failed mld downcall", - __func__); + IN6_MULTI_LIST_UNLOCK(); + im6f_rollback(imf); + im6f_reap(imf); + goto out_in6p_locked; } + + CTR1(KTR_MLD, "%s: doing mld downcall", __func__); + error = mld_change_state(inm, 0); IN6_MULTI_LIST_UNLOCK(); + if (error) { + CTR1(KTR_MLD, "%s: failed mld downcall", + __func__); + im6f_rollback(imf); + im6f_reap(imf); + goto out_in6p_locked; + } } - IN6_MULTI_UNLOCK(); - INP_WLOCK(inp); - if (in_pcbrele_wlocked(inp)) - return (ENXIO); - - if (error) - im6f_rollback(imf); - else - im6f_commit(imf); - + im6f_commit(imf); im6f_reap(imf); - if (is_final) { - /* Remove the gap in the membership array. */ - for (++idx; idx < imo->im6o_num_memberships; ++idx) { - imo->im6o_membership[idx-1] = imo->im6o_membership[idx]; - imo->im6o_mfilters[idx-1] = imo->im6o_mfilters[idx]; - } - imo->im6o_num_memberships--; - } - out_in6p_locked: INP_WUNLOCK(inp); + + if (is_final && imf) { + /* + * Give up the multicast address record to which + * the membership points. + */ + (void)in6_leavegroup_locked(inm, imf); + ip6_mfilter_free(imf); + } + + IN6_MULTI_UNLOCK(); return (error); } @@ -2530,7 +2453,6 @@ in6p_set_source_filters(struct inpcb *inp, struct sockopt *sopt) struct in6_mfilter *imf; struct ip6_moptions *imo; struct in6_multi *inm; - size_t idx; int error; error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq), @@ -2567,13 +2489,12 @@ in6p_set_source_filters(struct inpcb *inp, struct sockopt *sopt) * Check if this socket is a member of this group. */ imo = in6p_findmoptions(inp); - idx = im6o_match_group(imo, ifp, &gsa->sa); - if (idx == -1 || imo->im6o_mfilters == NULL) { + imf = im6o_match_group(imo, ifp, &gsa->sa); + if (imf == NULL) { error = EADDRNOTAVAIL; goto out_in6p_locked; } - inm = imo->im6o_membership[idx]; - imf = &imo->im6o_mfilters[idx]; + inm = imf->im6f_in6m; /* * Begin state merge transaction at socket layer. @@ -2814,6 +2735,7 @@ sysctl_ip6_mcast_filters(SYSCTL_HANDLER_ARGS) { struct in6_addr mcaddr; struct in6_addr src; + struct epoch_tracker et; struct ifnet *ifp; struct ifmultiaddr *ifma; struct in6_multi *inm; @@ -2868,12 +2790,11 @@ sysctl_ip6_mcast_filters(SYSCTL_HANDLER_ARGS) IN6_MULTI_LOCK(); IN6_MULTI_LIST_LOCK(); - IF_ADDR_RLOCK(ifp); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { - if (ifma->ifma_addr->sa_family != AF_INET6 || - ifma->ifma_protospec == NULL) + inm = in6m_ifmultiaddr_get_inm(ifma); + if (inm == NULL) continue; - inm = (struct in6_multi *)ifma->ifma_protospec; if (!IN6_ARE_ADDR_EQUAL(&inm->in6m_addr, &mcaddr)) continue; fmode = inm->in6m_st[1].iss_fmode; @@ -2897,7 +2818,7 @@ sysctl_ip6_mcast_filters(SYSCTL_HANDLER_ARGS) break; } } - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); IN6_MULTI_LIST_UNLOCK(); IN6_MULTI_UNLOCK(); diff --git a/freebsd/sys/netinet6/in6_pcb.c b/freebsd/sys/netinet6/in6_pcb.c index a6beba43..b66aa8a4 100644 --- a/freebsd/sys/netinet6/in6_pcb.c +++ b/freebsd/sys/netinet6/in6_pcb.c @@ -186,14 +186,15 @@ in6_pcbbind(struct inpcb *inp, struct sockaddr *nam, (SO_REUSEADDR|SO_REUSEPORT_LB)) != 0) reuseport_lb = SO_REUSEADDR|SO_REUSEPORT_LB; } else if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { + struct epoch_tracker et; struct ifaddr *ifa; sin6->sin6_port = 0; /* yech... */ - NET_EPOCH_ENTER(); + NET_EPOCH_ENTER(et); if ((ifa = ifa_ifwithaddr((struct sockaddr *)sin6)) == NULL && (inp->inp_flags & INP_BINDANY) == 0) { - NET_EPOCH_EXIT(); + NET_EPOCH_EXIT(et); return (EADDRNOTAVAIL); } @@ -206,10 +207,10 @@ in6_pcbbind(struct inpcb *inp, struct sockaddr *nam, if (ifa != NULL && ((struct in6_ifaddr *)ifa)->ia6_flags & (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY|IN6_IFF_DETACHED)) { - NET_EPOCH_EXIT(); + NET_EPOCH_EXIT(et); return (EADDRNOTAVAIL); } - NET_EPOCH_EXIT(); + NET_EPOCH_EXIT(et); } if (lport) { struct inpcb *t; @@ -814,19 +815,20 @@ in6_pcblookup_local(struct inpcbinfo *pcbinfo, struct in6_addr *laddr, void in6_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) { - struct inpcb *in6p; + struct inpcb *inp; + struct in6_multi *inm; + struct in6_mfilter *imf; struct ip6_moptions *im6o; - int i, gap; INP_INFO_WLOCK(pcbinfo); - CK_LIST_FOREACH(in6p, pcbinfo->ipi_listhead, inp_list) { - INP_WLOCK(in6p); - if (__predict_false(in6p->inp_flags2 & INP_FREED)) { - INP_WUNLOCK(in6p); + CK_LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) { + INP_WLOCK(inp); + if (__predict_false(inp->inp_flags2 & INP_FREED)) { + INP_WUNLOCK(inp); continue; } - im6o = in6p->in6p_moptions; - if ((in6p->inp_vflag & INP_IPV6) && im6o != NULL) { + im6o = inp->in6p_moptions; + if ((inp->inp_vflag & INP_IPV6) && im6o != NULL) { /* * Unselect the outgoing ifp for multicast if it * is being detached. @@ -837,20 +839,20 @@ in6_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) * Drop multicast group membership if we joined * through the interface being detached. */ - gap = 0; - for (i = 0; i < im6o->im6o_num_memberships; i++) { - if (im6o->im6o_membership[i]->in6m_ifp == - ifp) { - in6_leavegroup(im6o->im6o_membership[i], NULL); - gap++; - } else if (gap != 0) { - im6o->im6o_membership[i - gap] = - im6o->im6o_membership[i]; - } +restart: + IP6_MFILTER_FOREACH(imf, &im6o->im6o_head) { + if ((inm = imf->im6f_in6m) == NULL) + continue; + if (inm->in6m_ifp != ifp) + continue; + ip6_mfilter_remove(&im6o->im6o_head, imf); + IN6_MULTI_LOCK_ASSERT(); + in6_leavegroup_locked(inm, NULL); + ip6_mfilter_free(imf); + goto restart; } - im6o->im6o_num_memberships -= gap; } - INP_WUNLOCK(in6p); + INP_WUNLOCK(inp); } INP_INFO_WUNLOCK(pcbinfo); } diff --git a/freebsd/sys/netinet6/in6_pcb.h b/freebsd/sys/netinet6/in6_pcb.h index 2c6bcdc6..56ea6eeb 100644 --- a/freebsd/sys/netinet6/in6_pcb.h +++ b/freebsd/sys/netinet6/in6_pcb.h @@ -113,7 +113,7 @@ int in6_getpeeraddr(struct socket *so, struct sockaddr **nam); int in6_getsockaddr(struct socket *so, struct sockaddr **nam); int in6_mapped_sockaddr(struct socket *so, struct sockaddr **nam); int in6_mapped_peeraddr(struct socket *so, struct sockaddr **nam); -int in6_selecthlim(struct in6pcb *, struct ifnet *); +int in6_selecthlim(struct inpcb *, struct ifnet *); int in6_pcbsetport(struct in6_addr *, struct inpcb *, struct ucred *); void init_sin6(struct sockaddr_in6 *sin6, struct mbuf *m, int); #endif /* _KERNEL */ diff --git a/freebsd/sys/netinet6/in6_proto.c b/freebsd/sys/netinet6/in6_proto.c index cf62e60c..652d70b6 100644 --- a/freebsd/sys/netinet6/in6_proto.c +++ b/freebsd/sys/netinet6/in6_proto.c @@ -386,10 +386,6 @@ VNET_DEFINE(int, ip6_accept_rtadv) = 0; VNET_DEFINE(int, ip6_no_radr) = 0; VNET_DEFINE(int, ip6_norbit_raif) = 0; VNET_DEFINE(int, ip6_rfc6204w3) = 0; -VNET_DEFINE(int, ip6_maxfragpackets); /* initialized in frag6.c:frag6_init() */ -int ip6_maxfrags; /* initialized in frag6.c:frag6_init() */ -VNET_DEFINE(int, ip6_maxfragbucketsize);/* initialized in frag6.c:frag6_init() */ -VNET_DEFINE(int, ip6_maxfragsperpacket); /* initialized in frag6.c:frag6_init() */ VNET_DEFINE(int, ip6_log_interval) = 5; VNET_DEFINE(int, ip6_hdrnestlimit) = 15;/* How many header options will we * process? */ @@ -476,20 +472,6 @@ sysctl_ip6_tempvltime(SYSCTL_HANDLER_ARGS) return (0); } -static int -sysctl_ip6_maxfragpackets(SYSCTL_HANDLER_ARGS) -{ - int error, val; - - val = V_ip6_maxfragpackets; - error = sysctl_handle_int(oidp, &val, 0, req); - if (error != 0 || !req->newptr) - return (error); - V_ip6_maxfragpackets = val; - frag6_set_bucketsize(); - return (0); -} - SYSCTL_INT(_net_inet6_ip6, IPV6CTL_FORWARDING, forwarding, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_forwarding), 0, "Enable forwarding of IPv6 packets between interfaces"); @@ -502,12 +484,6 @@ SYSCTL_INT(_net_inet6_ip6, IPV6CTL_DEFHLIM, hlim, SYSCTL_VNET_PCPUSTAT(_net_inet6_ip6, IPV6CTL_STATS, stats, struct ip6stat, ip6stat, "IP6 statistics (struct ip6stat, netinet6/ip6_var.h)"); -SYSCTL_PROC(_net_inet6_ip6, IPV6CTL_MAXFRAGPACKETS, maxfragpackets, - CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, NULL, 0, - sysctl_ip6_maxfragpackets, "I", - "Default maximum number of outstanding fragmented IPv6 packets. " - "A value of 0 means no fragmented packets will be accepted, while a " - "a value of -1 means no limit"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_ACCEPT_RTADV, accept_rtadv, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_accept_rtadv), 0, "Default value of per-interface flag for accepting ICMPv6 RA messages"); @@ -577,17 +553,6 @@ SYSCTL_INT(_net_inet6_ip6, IPV6CTL_PREFER_TEMPADDR, prefer_tempaddr, SYSCTL_INT(_net_inet6_ip6, IPV6CTL_USE_DEFAULTZONE, use_defaultzone, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_use_defzone), 0, "Use the default scope zone when none is specified"); -SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGS, maxfrags, - CTLFLAG_RW, &ip6_maxfrags, 0, - "Maximum allowed number of outstanding IPv6 packet fragments. " - "A value of 0 means no fragmented packets will be accepted, while a " - "a value of -1 means no limit"); -SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGBUCKETSIZE, maxfragbucketsize, - CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_maxfragbucketsize), 0, - "Maximum number of reassembly queues per hash bucket"); -SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGSPERPACKET, maxfragsperpacket, - CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_maxfragsperpacket), 0, - "Maximum allowed number of fragments per packet"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MCAST_PMTU, mcast_pmtu, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_mcast_pmtu), 0, "Enable path MTU discovery for multicast packets"); @@ -643,3 +608,10 @@ SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_ONLINKNSRFC4861, nd6_onlink_ns_rfc4861, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_onlink_ns_rfc4861), 0, "Accept 'on-link' ICMPv6 NS messages in compliance with RFC 4861"); +#ifdef EXPERIMENTAL +SYSCTL_INT(_net_inet6_icmp6, OID_AUTO, + nd6_ignore_ipv6_only_ra, CTLFLAG_VNET | CTLFLAG_RW, + &VNET_NAME(nd6_ignore_ipv6_only_ra), 0, + "Ignore the 'IPv6-Only flag' in RA messages in compliance with " + "draft-ietf-6man-ipv6only-flag"); +#endif diff --git a/freebsd/sys/netinet6/in6_src.c b/freebsd/sys/netinet6/in6_src.c index 1cb71b88..0bd8bba4 100644 --- a/freebsd/sys/netinet6/in6_src.c +++ b/freebsd/sys/netinet6/in6_src.c @@ -726,6 +726,10 @@ selectroute(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, if (ron->ro_rt == NULL || (ron->ro_rt->rt_flags & RTF_GATEWAY) != 0) error = EHOSTUNREACH; + else { + rt = ron->ro_rt; + ifp = rt->rt_ifp; + } goto done; } @@ -929,21 +933,21 @@ in6_selectroute_fib(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, * 3. The system default hoplimit. */ int -in6_selecthlim(struct inpcb *in6p, struct ifnet *ifp) +in6_selecthlim(struct inpcb *inp, struct ifnet *ifp) { - if (in6p && in6p->in6p_hops >= 0) - return (in6p->in6p_hops); + if (inp && inp->in6p_hops >= 0) + return (inp->in6p_hops); else if (ifp) return (ND_IFINFO(ifp)->chlim); - else if (in6p && !IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr)) { + else if (inp && !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { struct nhop6_basic nh6; struct in6_addr dst; uint32_t fibnum, scopeid; int hlim; - fibnum = in6p->inp_inc.inc_fibnum; - in6_splitscope(&in6p->in6p_faddr, &dst, &scopeid); + fibnum = inp->inp_inc.inc_fibnum; + in6_splitscope(&inp->in6p_faddr, &dst, &scopeid); if (fib6_lookup_nh_basic(fibnum, &dst, scopeid, 0, 0, &nh6)==0){ hlim = ND_IFINFO(nh6.nh_ifp)->chlim; return (hlim); diff --git a/freebsd/sys/netinet6/in6_var.h b/freebsd/sys/netinet6/in6_var.h index 5ed0ae90..3e535310 100644 --- a/freebsd/sys/netinet6/in6_var.h +++ b/freebsd/sys/netinet6/in6_var.h @@ -602,9 +602,61 @@ struct in6_mfilter { struct ip6_msource_tree im6f_sources; /* source list for (S,G) */ u_long im6f_nsrc; /* # of source entries */ uint8_t im6f_st[2]; /* state before/at commit */ + struct in6_multi *im6f_in6m; /* associated multicast address */ + STAILQ_ENTRY(in6_mfilter) im6f_entry; /* list entry */ }; /* + * Helper types and functions for IPv4 multicast filters. + */ +STAILQ_HEAD(ip6_mfilter_head, in6_mfilter); + +struct in6_mfilter *ip6_mfilter_alloc(int mflags, int st0, int st1); +void ip6_mfilter_free(struct in6_mfilter *); + +static inline void +ip6_mfilter_init(struct ip6_mfilter_head *head) +{ + + STAILQ_INIT(head); +} + +static inline struct in6_mfilter * +ip6_mfilter_first(const struct ip6_mfilter_head *head) +{ + + return (STAILQ_FIRST(head)); +} + +static inline void +ip6_mfilter_insert(struct ip6_mfilter_head *head, struct in6_mfilter *imf) +{ + + STAILQ_INSERT_TAIL(head, imf, im6f_entry); +} + +static inline void +ip6_mfilter_remove(struct ip6_mfilter_head *head, struct in6_mfilter *imf) +{ + + STAILQ_REMOVE(head, imf, in6_mfilter, im6f_entry); +} + +#define IP6_MFILTER_FOREACH(imf, head) \ + STAILQ_FOREACH(imf, head, im6f_entry) + +static inline size_t +ip6_mfilter_count(struct ip6_mfilter_head *head) +{ + struct in6_mfilter *imf; + size_t num = 0; + + STAILQ_FOREACH(imf, head, im6f_entry) + num++; + return (num); +} + +/* * Legacy KAME IPv6 multicast membership descriptor. */ struct in6_multi_mship { @@ -645,6 +697,7 @@ struct in6_multi { /* New fields for MLDv2 follow. */ struct mld_ifsoftc *in6m_mli; /* MLD info */ SLIST_ENTRY(in6_multi) in6m_nrele; /* to-be-released by MLD */ + SLIST_ENTRY(in6_multi) in6m_defer; /* deferred MLDv1 */ struct ip6_msource_tree in6m_srcs; /* tree of sources */ u_long in6m_nsrc; /* # of tree entries */ @@ -670,8 +723,8 @@ struct in6_multi { } in6m_st[2]; /* state at t0, t1 */ }; -void in6m_disconnect(struct in6_multi *inm); -extern int ifma6_restart; +void in6m_disconnect_locked(struct in6_multi_head *inmh, struct in6_multi *inm); + /* * Helper function to derive the filter mode on a source entry * from its internal counters. Predicates are: @@ -713,13 +766,25 @@ extern struct sx in6_multi_sx; #define IN6_MULTI_LOCK_ASSERT() sx_assert(&in6_multi_sx, SA_XLOCKED) #define IN6_MULTI_UNLOCK_ASSERT() sx_assert(&in6_multi_sx, SA_XUNLOCKED) +/* + * Get the in6_multi pointer from a ifmultiaddr. + * Returns NULL if ifmultiaddr is no longer valid. + */ +static __inline struct in6_multi * +in6m_ifmultiaddr_get_inm(struct ifmultiaddr *ifma) +{ + + NET_EPOCH_ASSERT(); + + return ((ifma->ifma_addr->sa_family != AF_INET6 || + (ifma->ifma_flags & IFMA_F_ENQUEUED) == 0) ? NULL : + ifma->ifma_protospec); +} /* * Look up an in6_multi record for an IPv6 multicast address * on the interface ifp. * If no record found, return NULL. - * - * SMPng: The IN6_MULTI_LOCK and IF_ADDR_LOCK on ifp must be held. */ static __inline struct in6_multi * in6m_lookup_locked(struct ifnet *ifp, const struct in6_addr *mcaddr) @@ -727,18 +792,14 @@ in6m_lookup_locked(struct ifnet *ifp, const struct in6_addr *mcaddr) struct ifmultiaddr *ifma; struct in6_multi *inm; - inm = NULL; - CK_STAILQ_FOREACH(ifma, &((ifp)->if_multiaddrs), ifma_link) { - if (ifma->ifma_addr->sa_family == AF_INET6) { - inm = (struct in6_multi *)ifma->ifma_protospec; - if (inm == NULL) - continue; - if (IN6_ARE_ADDR_EQUAL(&inm->in6m_addr, mcaddr)) - break; - inm = NULL; - } + CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { + inm = in6m_ifmultiaddr_get_inm(ifma); + if (inm == NULL) + continue; + if (IN6_ARE_ADDR_EQUAL(&inm->in6m_addr, mcaddr)) + return (inm); } - return (inm); + return (NULL); } /* @@ -749,12 +810,13 @@ in6m_lookup_locked(struct ifnet *ifp, const struct in6_addr *mcaddr) static __inline struct in6_multi * in6m_lookup(struct ifnet *ifp, const struct in6_addr *mcaddr) { + struct epoch_tracker et; struct in6_multi *inm; IN6_MULTI_LIST_LOCK(); - IF_ADDR_RLOCK(ifp); + NET_EPOCH_ENTER(et); inm = in6m_lookup_locked(ifp, mcaddr); - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); IN6_MULTI_LIST_UNLOCK(); return (inm); @@ -808,8 +870,8 @@ void in6m_clear_recorded(struct in6_multi *); void in6m_commit(struct in6_multi *); void in6m_print(const struct in6_multi *); int in6m_record_source(struct in6_multi *, const struct in6_addr *); -void in6m_release_deferred(struct in6_multi *); void in6m_release_list_deferred(struct in6_multi_head *); +void in6m_release_wait(void); void ip6_freemoptions(struct ip6_moptions *); int ip6_getmoptions(struct inpcb *, struct sockopt *); int ip6_setmoptions(struct inpcb *, struct sockopt *); diff --git a/freebsd/sys/netinet6/ip6_fastfwd.c b/freebsd/sys/netinet6/ip6_fastfwd.c index f63c51bf..0c04200c 100644 --- a/freebsd/sys/netinet6/ip6_fastfwd.c +++ b/freebsd/sys/netinet6/ip6_fastfwd.c @@ -158,10 +158,10 @@ ip6_tryforward(struct mbuf *m) /* * Incoming packet firewall processing. */ - if (!PFIL_HOOKED(&V_inet6_pfil_hook)) + if (!PFIL_HOOKED_IN(V_inet6_pfil_head)) goto passin; - if (pfil_run_hooks(&V_inet6_pfil_hook, &m, rcvif, PFIL_IN, 0, - NULL) != 0 || m == NULL) + if (pfil_run_hooks(V_inet6_pfil_head, &m, rcvif, PFIL_IN, NULL) != + PFIL_PASS) goto dropin; /* * If packet filter sets the M_FASTFWD_OURS flag, this means @@ -197,7 +197,7 @@ passin: in6_ifstat_inc(rcvif, ifs6_in_noroute); goto dropin; } - if (!PFIL_HOOKED(&V_inet6_pfil_hook)) { + if (!PFIL_HOOKED_OUT(V_inet6_pfil_head)) { if (m->m_pkthdr.len > nh.nh_mtu) { in6_ifstat_inc(nh.nh_ifp, ifs6_in_toobig); icmp6_error(m, ICMP6_PACKET_TOO_BIG, 0, nh.nh_mtu); @@ -210,8 +210,8 @@ passin: /* * Outgoing packet firewall processing. */ - if (pfil_run_hooks(&V_inet6_pfil_hook, &m, nh.nh_ifp, PFIL_OUT, - PFIL_FWD, NULL) != 0 || m == NULL) + if (pfil_run_hooks(V_inet6_pfil_head, &m, nh.nh_ifp, PFIL_OUT | + PFIL_FWD, NULL) != PFIL_PASS) goto dropout; /* diff --git a/freebsd/sys/netinet6/ip6_forward.c b/freebsd/sys/netinet6/ip6_forward.c index 80535efe..ca73977f 100644 --- a/freebsd/sys/netinet6/ip6_forward.c +++ b/freebsd/sys/netinet6/ip6_forward.c @@ -322,15 +322,14 @@ again2: in6_clearscope(&ip6->ip6_dst); /* Jump over all PFIL processing if hooks are not active. */ - if (!PFIL_HOOKED(&V_inet6_pfil_hook)) + if (!PFIL_HOOKED_OUT(V_inet6_pfil_head)) goto pass; odst = ip6->ip6_dst; /* Run through list of hooks for forwarded packets. */ - error = pfil_run_hooks(&V_inet6_pfil_hook, &m, rt->rt_ifp, PFIL_OUT, - PFIL_FWD, NULL); - if (error != 0 || m == NULL) - goto freecopy; /* consumed by filter */ + if (pfil_run_hooks(V_inet6_pfil_head, &m, rt->rt_ifp, PFIL_OUT | + PFIL_FWD, NULL) != PFIL_PASS) + goto freecopy; ip6 = mtod(m, struct ip6_hdr *); /* See if destination IP address was changed by packet filter. */ diff --git a/freebsd/sys/netinet6/ip6_id.c b/freebsd/sys/netinet6/ip6_id.c index 0905ab3f..847dc403 100644 --- a/freebsd/sys/netinet6/ip6_id.c +++ b/freebsd/sys/netinet6/ip6_id.c @@ -91,6 +91,7 @@ __FBSDID("$FreeBSD$"); #include <sys/types.h> #include <sys/param.h> #include <sys/kernel.h> +#include <sys/random.h> #include <sys/socket.h> #include <sys/libkern.h> @@ -260,5 +261,15 @@ u_int32_t ip6_randomflowlabel(void) { + /* + * It's ok to emit zero flow labels early, before random is available + * (seeded). RFC 6437: + * + * "A Flow Label of zero is used to indicate packets that have not been + * labeled." + */ + if (__predict_false(!is_random_seeded())) + return (0); + return randomid(&randomtab_20) & 0xfffff; } diff --git a/freebsd/sys/netinet6/ip6_input.c b/freebsd/sys/netinet6/ip6_input.c index 25ab624c..c5c040f0 100644 --- a/freebsd/sys/netinet6/ip6_input.c +++ b/freebsd/sys/netinet6/ip6_input.c @@ -193,7 +193,7 @@ SYSCTL_PROC(_net_inet6_ip6, IPV6CTL_INTRDQMAXLEN, intr_direct_queue_maxlen, #endif -VNET_DEFINE(struct pfil_head, inet6_pfil_hook); +VNET_DEFINE(pfil_head_t, inet6_pfil_head); VNET_PCPUSTAT_DEFINE(struct ip6stat, ip6stat); VNET_PCPUSTAT_SYSINIT(ip6stat); @@ -216,6 +216,7 @@ static struct mbuf *ip6_pullexthdr(struct mbuf *, size_t, int); void ip6_init(void) { + struct pfil_head_args args; struct protosw *pr; int i; @@ -229,11 +230,11 @@ ip6_init(void) &V_in6_ifaddrhmask); /* Initialize packet filter hooks. */ - V_inet6_pfil_hook.ph_type = PFIL_TYPE_AF; - V_inet6_pfil_hook.ph_af = AF_INET6; - if ((i = pfil_head_register(&V_inet6_pfil_hook)) != 0) - printf("%s: WARNING: unable to register pfil hook, " - "error %d\n", __func__, i); + args.pa_version = PFIL_VERSION; + args.pa_flags = PFIL_IN | PFIL_OUT; + args.pa_type = PFIL_TYPE_IP6; + args.pa_headname = PFIL_INET6_NAME; + V_inet6_pfil_head = pfil_head_register(&args); if (hhook_head_register(HHOOK_TYPE_IPSEC_IN, AF_INET6, &V_ipsec_hhh_in[HHOOK_IPSEC_INET6], @@ -361,9 +362,7 @@ ip6_destroy(void *unused __unused) #endif netisr_unregister_vnet(&ip6_nh); - if ((error = pfil_head_unregister(&V_inet6_pfil_hook)) != 0) - printf("%s: WARNING: unable to unregister pfil hook, " - "error %d\n", __func__, error); + pfil_head_unregister(V_inet6_pfil_head); error = hhook_head_deregister(V_ipsec_hhh_in[HHOOK_IPSEC_INET6]); if (error != 0) { printf("%s: WARNING: unable to deregister input helper hook " @@ -406,20 +405,22 @@ VNET_SYSUNINIT(inet6, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, ip6_destroy, NULL); #endif static int -ip6_input_hbh(struct mbuf *m, uint32_t *plen, uint32_t *rtalert, int *off, +ip6_input_hbh(struct mbuf **mp, uint32_t *plen, uint32_t *rtalert, int *off, int *nxt, int *ours) { + struct mbuf *m; struct ip6_hdr *ip6; struct ip6_hbh *hbh; - if (ip6_hopopts_input(plen, rtalert, &m, off)) { + if (ip6_hopopts_input(plen, rtalert, mp, off)) { #if 0 /*touches NULL pointer*/ - in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); + in6_ifstat_inc((*mp)->m_pkthdr.rcvif, ifs6_in_discard); #endif goto out; /* m have already been freed */ } /* adjust pointer */ + m = *mp; ip6 = mtod(m, struct ip6_hdr *); /* @@ -760,14 +761,12 @@ ip6_input(struct mbuf *m) */ /* Jump over all PFIL processing if hooks are not active. */ - if (!PFIL_HOOKED(&V_inet6_pfil_hook)) + if (!PFIL_HOOKED_IN(V_inet6_pfil_head)) goto passin; odst = ip6->ip6_dst; - if (pfil_run_hooks(&V_inet6_pfil_hook, &m, - m->m_pkthdr.rcvif, PFIL_IN, 0, NULL)) - return; - if (m == NULL) /* consumed by filter */ + if (pfil_run_hooks(V_inet6_pfil_head, &m, m->m_pkthdr.rcvif, PFIL_IN, + NULL) != PFIL_PASS) return; ip6 = mtod(m, struct ip6_hdr *); srcrt = !IN6_ARE_ADDR_EQUAL(&odst, &ip6->ip6_dst); @@ -859,7 +858,7 @@ passin: */ plen = (u_int32_t)ntohs(ip6->ip6_plen); if (ip6->ip6_nxt == IPPROTO_HOPOPTS) { - if (ip6_input_hbh(m, &plen, &rtalert, &off, &nxt, &ours) != 0) + if (ip6_input_hbh(&m, &plen, &rtalert, &off, &nxt, &ours) != 0) return; } else nxt = ip6->ip6_nxt; @@ -1409,12 +1408,12 @@ ip6_savecontrol_v4(struct inpcb *inp, struct mbuf *m, struct mbuf **mp, } void -ip6_savecontrol(struct inpcb *in6p, struct mbuf *m, struct mbuf **mp) +ip6_savecontrol(struct inpcb *inp, struct mbuf *m, struct mbuf **mp) { struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); int v4only = 0; - mp = ip6_savecontrol_v4(in6p, m, mp, &v4only); + mp = ip6_savecontrol_v4(inp, m, mp, &v4only); if (v4only) return; @@ -1425,7 +1424,7 @@ ip6_savecontrol(struct inpcb *in6p, struct mbuf *m, struct mbuf **mp) * returned to normal user. * See also RFC 2292 section 6 (or RFC 3542 section 8). */ - if ((in6p->inp_flags & IN6P_HOPOPTS) != 0) { + if ((inp->inp_flags & IN6P_HOPOPTS) != 0) { /* * Check if a hop-by-hop options header is contatined in the * received packet, and if so, store the options as ancillary @@ -1467,7 +1466,7 @@ ip6_savecontrol(struct inpcb *in6p, struct mbuf *m, struct mbuf **mp) * Note: this constraint is removed in RFC3542 */ *mp = sbcreatecontrol((caddr_t)hbh, hbhlen, - IS2292(in6p, IPV6_2292HOPOPTS, IPV6_HOPOPTS), + IS2292(inp, IPV6_2292HOPOPTS, IPV6_HOPOPTS), IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; @@ -1477,7 +1476,7 @@ ip6_savecontrol(struct inpcb *in6p, struct mbuf *m, struct mbuf **mp) } } - if ((in6p->inp_flags & (IN6P_RTHDR | IN6P_DSTOPTS)) != 0) { + if ((inp->inp_flags & (IN6P_RTHDR | IN6P_DSTOPTS)) != 0) { int nxt = ip6->ip6_nxt, off = sizeof(struct ip6_hdr); /* @@ -1538,22 +1537,22 @@ ip6_savecontrol(struct inpcb *in6p, struct mbuf *m, struct mbuf **mp) switch (nxt) { case IPPROTO_DSTOPTS: - if (!(in6p->inp_flags & IN6P_DSTOPTS)) + if (!(inp->inp_flags & IN6P_DSTOPTS)) break; *mp = sbcreatecontrol((caddr_t)ip6e, elen, - IS2292(in6p, + IS2292(inp, IPV6_2292DSTOPTS, IPV6_DSTOPTS), IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; break; case IPPROTO_ROUTING: - if (!(in6p->inp_flags & IN6P_RTHDR)) + if (!(inp->inp_flags & IN6P_RTHDR)) break; *mp = sbcreatecontrol((caddr_t)ip6e, elen, - IS2292(in6p, IPV6_2292RTHDR, IPV6_RTHDR), + IS2292(inp, IPV6_2292RTHDR, IPV6_RTHDR), IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; @@ -1589,7 +1588,7 @@ ip6_savecontrol(struct inpcb *in6p, struct mbuf *m, struct mbuf **mp) ; } - if (in6p->inp_flags2 & INP_RECVFLOWID) { + if (inp->inp_flags2 & INP_RECVFLOWID) { uint32_t flowid, flow_type; flowid = m->m_pkthdr.flowid; @@ -1610,7 +1609,7 @@ ip6_savecontrol(struct inpcb *in6p, struct mbuf *m, struct mbuf **mp) } #ifdef RSS - if (in6p->inp_flags2 & INP_RECVRSSBUCKETID) { + if (inp->inp_flags2 & INP_RECVRSSBUCKETID) { uint32_t flowid, flow_type; uint32_t rss_bucketid; diff --git a/freebsd/sys/netinet6/ip6_output.c b/freebsd/sys/netinet6/ip6_output.c index 0851bef8..483f17f0 100644 --- a/freebsd/sys/netinet6/ip6_output.c +++ b/freebsd/sys/netinet6/ip6_output.c @@ -69,14 +69,16 @@ __FBSDID("$FreeBSD$"); #include <rtems/bsd/local/opt_inet.h> #include <rtems/bsd/local/opt_inet6.h> -#include <rtems/bsd/local/opt_ratelimit.h> #include <rtems/bsd/local/opt_ipsec.h> -#include <rtems/bsd/local/opt_sctp.h> +#include <rtems/bsd/local/opt_kern_tls.h> +#include <rtems/bsd/local/opt_ratelimit.h> #include <rtems/bsd/local/opt_route.h> #include <rtems/bsd/local/opt_rss.h> +#include <rtems/bsd/local/opt_sctp.h> #include <sys/param.h> #include <sys/kernel.h> +#include <sys/ktls.h> #include <sys/malloc.h> #include <sys/mbuf.h> #include <sys/errno.h> @@ -230,7 +232,20 @@ ip6_fragment(struct ifnet *ifp, struct mbuf *m0, int hlen, u_char nextproto, IP6STAT_INC(ip6s_odropped); return (ENOBUFS); } - m->m_flags = m0->m_flags & M_COPYFLAGS; + + /* + * Make sure the complete packet header gets copied + * from the originating mbuf to the newly created + * mbuf. This also ensures that existing firewall + * classification(s), VLAN tags and so on get copied + * to the resulting fragmented packet(s): + */ + if (m_dup_pkthdr(m, m0, M_NOWAIT) == 0) { + m_free(m); + IP6STAT_INC(ip6s_odropped); + return (ENOBUFS); + } + *mnext = m; mnext = &m->m_nextpkt; m->m_data += max_linkhdr; @@ -255,8 +270,6 @@ ip6_fragment(struct ifnet *ifp, struct mbuf *m0, int hlen, u_char nextproto, } m_cat(m, m_frgpart); m->m_pkthdr.len = fraglen + hlen + sizeof(*ip6f); - m->m_pkthdr.fibnum = m0->m_pkthdr.fibnum; - m->m_pkthdr.rcvif = NULL; ip6f->ip6f_reserved = 0; ip6f->ip6f_ident = id; ip6f->ip6f_nxt = nextproto; @@ -267,6 +280,83 @@ ip6_fragment(struct ifnet *ifp, struct mbuf *m0, int hlen, u_char nextproto, return (0); } +static int +ip6_output_send(struct inpcb *inp, struct ifnet *ifp, struct ifnet *origifp, + struct mbuf *m, struct sockaddr_in6 *dst, struct route_in6 *ro) +{ +#ifdef KERN_TLS + struct ktls_session *tls = NULL; +#endif + struct m_snd_tag *mst; + int error; + + MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0); + mst = NULL; + +#ifdef KERN_TLS + /* + * If this is an unencrypted TLS record, save a reference to + * the record. This local reference is used to call + * ktls_output_eagain after the mbuf has been freed (thus + * dropping the mbuf's reference) in if_output. + */ + if (m->m_next != NULL && mbuf_has_tls_session(m->m_next)) { + tls = ktls_hold(m->m_next->m_ext.ext_pgs->tls); + mst = tls->snd_tag; + + /* + * If a TLS session doesn't have a valid tag, it must + * have had an earlier ifp mismatch, so drop this + * packet. + */ + if (mst == NULL) { + error = EAGAIN; + goto done; + } + } +#endif +#ifdef RATELIMIT + if (inp != NULL && mst == NULL) { + if ((inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) != 0 || + (inp->inp_snd_tag != NULL && + inp->inp_snd_tag->ifp != ifp)) + in_pcboutput_txrtlmt(inp, ifp, m); + + if (inp->inp_snd_tag != NULL) + mst = inp->inp_snd_tag; + } +#endif + if (mst != NULL) { + KASSERT(m->m_pkthdr.rcvif == NULL, + ("trying to add a send tag to a forwarded packet")); + if (mst->ifp != ifp) { + error = EAGAIN; + goto done; + } + + /* stamp send tag on mbuf */ + m->m_pkthdr.snd_tag = m_snd_tag_ref(mst); + m->m_pkthdr.csum_flags |= CSUM_SND_TAG; + } + + error = nd6_output_ifp(ifp, origifp, m, dst, (struct route *)ro); + +done: + /* Check for route change invalidating send tags. */ +#ifdef KERN_TLS + if (tls != NULL) { + if (error == EAGAIN) + error = ktls_output_eagain(inp, tls); + ktls_free(tls); + } +#endif +#ifdef RATELIMIT + if (error == EAGAIN) + in_pcboutput_eagain(inp); +#endif + return (error); +} + /* * IP6 output. The packet in mbuf chain m contains a skeletal IP6 * header (with pri, len, nxt, hlim, src, dst). @@ -324,6 +414,9 @@ ip6_output(struct mbuf *m0, struct ip6_pktopts *opt, m->m_pkthdr.flowid = inp->inp_flowid; M_HASHTYPE_SET(m, inp->inp_flowtype); } +#ifdef NUMA + m->m_pkthdr.numa_domain = inp->inp_numa_domain; +#endif } #if defined(IPSEC) || defined(IPSEC_SUPPORT) @@ -573,52 +666,72 @@ again: counter_u64_add(rt->rt_pksent, 1); } - - /* - * The outgoing interface must be in the zone of source and - * destination addresses. - */ - origifp = ifp; - + /* Setup data structures for scope ID checks. */ src0 = ip6->ip6_src; - if (in6_setscope(&src0, origifp, &zone)) - goto badscope; bzero(&src_sa, sizeof(src_sa)); src_sa.sin6_family = AF_INET6; src_sa.sin6_len = sizeof(src_sa); src_sa.sin6_addr = ip6->ip6_src; - if (sa6_recoverscope(&src_sa) || zone != src_sa.sin6_scope_id) - goto badscope; dst0 = ip6->ip6_dst; - if (in6_setscope(&dst0, origifp, &zone)) - goto badscope; /* re-initialize to be sure */ bzero(&dst_sa, sizeof(dst_sa)); dst_sa.sin6_family = AF_INET6; dst_sa.sin6_len = sizeof(dst_sa); dst_sa.sin6_addr = ip6->ip6_dst; - if (sa6_recoverscope(&dst_sa) || zone != dst_sa.sin6_scope_id) { - goto badscope; - } - - /* We should use ia_ifp to support the case of - * sending packets to an address of our own. - */ - if (ia != NULL && ia->ia_ifp) - ifp = ia->ia_ifp; - /* scope check is done. */ - goto routefound; + /* Check for valid scope ID. */ + if (in6_setscope(&src0, ifp, &zone) == 0 && + sa6_recoverscope(&src_sa) == 0 && zone == src_sa.sin6_scope_id && + in6_setscope(&dst0, ifp, &zone) == 0 && + sa6_recoverscope(&dst_sa) == 0 && zone == dst_sa.sin6_scope_id) { + /* + * The outgoing interface is in the zone of the source + * and destination addresses. + * + * Because the loopback interface cannot receive + * packets with a different scope ID than its own, + * there is a trick is to pretend the outgoing packet + * was received by the real network interface, by + * setting "origifp" different from "ifp". This is + * only allowed when "ifp" is a loopback network + * interface. Refer to code in nd6_output_ifp() for + * more details. + */ + origifp = ifp; + + /* + * We should use ia_ifp to support the case of sending + * packets to an address of our own. + */ + if (ia != NULL && ia->ia_ifp) + ifp = ia->ia_ifp; + + } else if ((ifp->if_flags & IFF_LOOPBACK) == 0 || + sa6_recoverscope(&src_sa) != 0 || + sa6_recoverscope(&dst_sa) != 0 || + dst_sa.sin6_scope_id == 0 || + (src_sa.sin6_scope_id != 0 && + src_sa.sin6_scope_id != dst_sa.sin6_scope_id) || + (origifp = ifnet_byindex(dst_sa.sin6_scope_id)) == NULL) { + /* + * If the destination network interface is not a + * loopback interface, or the destination network + * address has no scope ID, or the source address has + * a scope ID set which is different from the + * destination address one, or there is no network + * interface representing this scope ID, the address + * pair is considered invalid. + */ + IP6STAT_INC(ip6s_badscope); + in6_ifstat_inc(ifp, ifs6_out_discard); + if (error == 0) + error = EHOSTUNREACH; /* XXX */ + goto bad; + } - badscope: - IP6STAT_INC(ip6s_badscope); - in6_ifstat_inc(origifp, ifs6_out_discard); - if (error == 0) - error = EHOSTUNREACH; /* XXX */ - goto bad; + /* All scope ID checks are successful. */ - routefound: if (rt && !IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { if (opt && opt->ip6po_nextroute.ro_rt) { /* @@ -774,16 +887,21 @@ again: } /* Jump over all PFIL processing if hooks are not active. */ - if (!PFIL_HOOKED(&V_inet6_pfil_hook)) + if (!PFIL_HOOKED_OUT(V_inet6_pfil_head)) goto passout; odst = ip6->ip6_dst; /* Run through list of hooks for output packets. */ - error = pfil_run_hooks(&V_inet6_pfil_hook, &m, ifp, PFIL_OUT, 0, inp); - if (error != 0 || m == NULL) + switch (pfil_run_hooks(V_inet6_pfil_head, &m, ifp, PFIL_OUT, inp)) { + case PFIL_PASS: + ip6 = mtod(m, struct ip6_hdr *); + break; + case PFIL_DROPPED: + error = EPERM; + /* FALLTHROUGH */ + case PFIL_CONSUMED: goto done; - /* adjust pointer */ - ip6 = mtod(m, struct ip6_hdr *); + } needfiblookup = 0; /* See if destination IP address was changed by packet filter. */ @@ -881,11 +999,30 @@ passout: */ if (sw_csum & CSUM_DELAY_DATA_IPV6) { sw_csum &= ~CSUM_DELAY_DATA_IPV6; + m = mb_unmapped_to_ext(m); + if (m == NULL) { + error = ENOBUFS; + IP6STAT_INC(ip6s_odropped); + goto bad; + } in6_delayed_cksum(m, plen, sizeof(struct ip6_hdr)); + } else if ((ifp->if_capenable & IFCAP_NOMAP) == 0) { + m = mb_unmapped_to_ext(m); + if (m == NULL) { + error = ENOBUFS; + IP6STAT_INC(ip6s_odropped); + goto bad; + } } #ifdef SCTP if (sw_csum & CSUM_SCTP_IPV6) { sw_csum &= ~CSUM_SCTP_IPV6; + m = mb_unmapped_to_ext(m); + if (m == NULL) { + error = ENOBUFS; + IP6STAT_INC(ip6s_odropped); + goto bad; + } sctp_delayed_cksum(m, sizeof(struct ip6_hdr)); } #endif @@ -931,23 +1068,7 @@ passout: m->m_pkthdr.len); ifa_free(&ia6->ia_ifa); } -#ifdef RATELIMIT - if (inp != NULL) { - if (inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) - in_pcboutput_txrtlmt(inp, ifp, m); - /* stamp send tag on mbuf */ - m->m_pkthdr.snd_tag = inp->inp_snd_tag; - } else { - m->m_pkthdr.snd_tag = NULL; - } -#endif - error = nd6_output_ifp(ifp, origifp, m, dst, - (struct route *)ro); -#ifdef RATELIMIT - /* check for route change */ - if (error == EAGAIN) - in_pcboutput_eagain(inp); -#endif + error = ip6_output_send(inp, ifp, origifp, m, dst, ro); goto done; } @@ -989,11 +1110,23 @@ passout: * XXX-BZ handle the hw offloading case. Need flags. */ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) { + m = mb_unmapped_to_ext(m); + if (m == NULL) { + in6_ifstat_inc(ifp, ifs6_out_fragfail); + error = ENOBUFS; + goto bad; + } in6_delayed_cksum(m, plen, hlen); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA_IPV6; } #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6) { + m = mb_unmapped_to_ext(m); + if (m == NULL) { + in6_ifstat_inc(ifp, ifs6_out_fragfail); + error = ENOBUFS; + goto bad; + } sctp_delayed_cksum(m, hlen); m->m_pkthdr.csum_flags &= ~CSUM_SCTP_IPV6; } @@ -1046,23 +1179,7 @@ sendorfree: counter_u64_add(ia->ia_ifa.ifa_obytes, m->m_pkthdr.len); } -#ifdef RATELIMIT - if (inp != NULL) { - if (inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) - in_pcboutput_txrtlmt(inp, ifp, m); - /* stamp send tag on mbuf */ - m->m_pkthdr.snd_tag = inp->inp_snd_tag; - } else { - m->m_pkthdr.snd_tag = NULL; - } -#endif - error = nd6_output_ifp(ifp, origifp, m, dst, - (struct route *)ro); -#ifdef RATELIMIT - /* check for route change */ - if (error == EAGAIN) - in_pcboutput_eagain(inp); -#endif + error = ip6_output_send(inp, ifp, origifp, m, dst, ro); } else m_freem(m); } @@ -1390,7 +1507,7 @@ ip6_ctloutput(struct socket *so, struct sockopt *sopt) { int optdatalen, uproto; void *optdata; - struct inpcb *in6p = sotoinpcb(so); + struct inpcb *inp = sotoinpcb(so); int error, optval; int level, op, optname; int optlen; @@ -1425,43 +1542,43 @@ ip6_ctloutput(struct socket *so, struct sockopt *sopt) sopt->sopt_dir == SOPT_SET) { switch (sopt->sopt_name) { case SO_REUSEADDR: - INP_WLOCK(in6p); + INP_WLOCK(inp); if ((so->so_options & SO_REUSEADDR) != 0) - in6p->inp_flags2 |= INP_REUSEADDR; + inp->inp_flags2 |= INP_REUSEADDR; else - in6p->inp_flags2 &= ~INP_REUSEADDR; - INP_WUNLOCK(in6p); + inp->inp_flags2 &= ~INP_REUSEADDR; + INP_WUNLOCK(inp); error = 0; break; case SO_REUSEPORT: - INP_WLOCK(in6p); + INP_WLOCK(inp); if ((so->so_options & SO_REUSEPORT) != 0) - in6p->inp_flags2 |= INP_REUSEPORT; + inp->inp_flags2 |= INP_REUSEPORT; else - in6p->inp_flags2 &= ~INP_REUSEPORT; - INP_WUNLOCK(in6p); + inp->inp_flags2 &= ~INP_REUSEPORT; + INP_WUNLOCK(inp); error = 0; break; case SO_REUSEPORT_LB: - INP_WLOCK(in6p); + INP_WLOCK(inp); if ((so->so_options & SO_REUSEPORT_LB) != 0) - in6p->inp_flags2 |= INP_REUSEPORT_LB; + inp->inp_flags2 |= INP_REUSEPORT_LB; else - in6p->inp_flags2 &= ~INP_REUSEPORT_LB; - INP_WUNLOCK(in6p); + inp->inp_flags2 &= ~INP_REUSEPORT_LB; + INP_WUNLOCK(inp); error = 0; break; case SO_SETFIB: - INP_WLOCK(in6p); - in6p->inp_inc.inc_fibnum = so->so_fibnum; - INP_WUNLOCK(in6p); + INP_WLOCK(inp); + inp->inp_inc.inc_fibnum = so->so_fibnum; + INP_WUNLOCK(inp); error = 0; break; case SO_MAX_PACING_RATE: #ifdef RATELIMIT - INP_WLOCK(in6p); - in6p->inp_flags2 |= INP_RATE_LIMIT_CHANGED; - INP_WUNLOCK(in6p); + INP_WLOCK(inp); + inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; + INP_WUNLOCK(inp); error = 0; #else error = EOPNOTSUPP; @@ -1495,7 +1612,7 @@ ip6_ctloutput(struct socket *so, struct sockopt *sopt) error = soopt_mcopyin(sopt, m); /* XXX */ if (error != 0) break; - error = ip6_pcbopts(&in6p->in6p_outputopts, + error = ip6_pcbopts(&inp->in6p_outputopts, m, so, sopt); m_freem(m); /* XXX */ break; @@ -1566,57 +1683,57 @@ ip6_ctloutput(struct socket *so, struct sockopt *sopt) error = EINVAL; else { /* -1 = kernel default */ - in6p->in6p_hops = optval; - if ((in6p->inp_vflag & + inp->in6p_hops = optval; + if ((inp->inp_vflag & INP_IPV4) != 0) - in6p->inp_ip_ttl = optval; + inp->inp_ip_ttl = optval; } break; #define OPTSET(bit) \ do { \ - INP_WLOCK(in6p); \ + INP_WLOCK(inp); \ if (optval) \ - in6p->inp_flags |= (bit); \ + inp->inp_flags |= (bit); \ else \ - in6p->inp_flags &= ~(bit); \ - INP_WUNLOCK(in6p); \ + inp->inp_flags &= ~(bit); \ + INP_WUNLOCK(inp); \ } while (/*CONSTCOND*/ 0) #define OPTSET2292(bit) \ do { \ - INP_WLOCK(in6p); \ - in6p->inp_flags |= IN6P_RFC2292; \ + INP_WLOCK(inp); \ + inp->inp_flags |= IN6P_RFC2292; \ if (optval) \ - in6p->inp_flags |= (bit); \ + inp->inp_flags |= (bit); \ else \ - in6p->inp_flags &= ~(bit); \ - INP_WUNLOCK(in6p); \ + inp->inp_flags &= ~(bit); \ + INP_WUNLOCK(inp); \ } while (/*CONSTCOND*/ 0) -#define OPTBIT(bit) (in6p->inp_flags & (bit) ? 1 : 0) +#define OPTBIT(bit) (inp->inp_flags & (bit) ? 1 : 0) #define OPTSET2_N(bit, val) do { \ if (val) \ - in6p->inp_flags2 |= bit; \ + inp->inp_flags2 |= bit; \ else \ - in6p->inp_flags2 &= ~bit; \ + inp->inp_flags2 &= ~bit; \ } while (0) #define OPTSET2(bit, val) do { \ - INP_WLOCK(in6p); \ + INP_WLOCK(inp); \ OPTSET2_N(bit, val); \ - INP_WUNLOCK(in6p); \ + INP_WUNLOCK(inp); \ } while (0) -#define OPTBIT2(bit) (in6p->inp_flags2 & (bit) ? 1 : 0) +#define OPTBIT2(bit) (inp->inp_flags2 & (bit) ? 1 : 0) #define OPTSET2292_EXCLUSIVE(bit) \ do { \ - INP_WLOCK(in6p); \ + INP_WLOCK(inp); \ if (OPTBIT(IN6P_RFC2292)) { \ error = EINVAL; \ } else { \ if (optval) \ - in6p->inp_flags |= (bit); \ + inp->inp_flags |= (bit); \ else \ - in6p->inp_flags &= ~(bit); \ + inp->inp_flags &= ~(bit); \ } \ - INP_WUNLOCK(in6p); \ + INP_WUNLOCK(inp); \ } while (/*CONSTCOND*/ 0) case IPV6_RECVPKTINFO: @@ -1632,17 +1749,17 @@ do { \ error = EINVAL; break; } - INP_WLOCK(in6p); - if (in6p->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { - INP_WUNLOCK(in6p); + INP_WLOCK(inp); + if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { + INP_WUNLOCK(inp); return (ECONNRESET); } - optp = &in6p->in6p_outputopts; + optp = &inp->in6p_outputopts; error = ip6_pcbopt(IPV6_HOPLIMIT, (u_char *)&optval, sizeof(optval), optp, (td != NULL) ? td->td_ucred : NULL, uproto); - INP_WUNLOCK(in6p); + INP_WUNLOCK(inp); break; } @@ -1693,16 +1810,16 @@ do { \ * available only prior to bind(2). * see ipng mailing list, Jun 22 2001. */ - if (in6p->inp_lport || - !IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr)) { + if (inp->inp_lport || + !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { error = EINVAL; break; } OPTSET(IN6P_IPV6_V6ONLY); if (optval) - in6p->inp_vflag &= ~INP_IPV4; + inp->inp_vflag &= ~INP_IPV4; else - in6p->inp_vflag |= INP_IPV4; + inp->inp_vflag |= INP_IPV4; break; case IPV6_RECVTCLASS: /* cannot mix with RFC2292 XXX */ @@ -1726,10 +1843,10 @@ do { \ case IPV6_RSS_LISTEN_BUCKET: if ((optval >= 0) && (optval < rss_getnumbuckets())) { - INP_WLOCK(in6p); - in6p->inp_rss_listen_bucket = optval; + INP_WLOCK(inp); + inp->inp_rss_listen_bucket = optval; OPTSET2_N(INP_RSS_BUCKET_SET, 1); - INP_WUNLOCK(in6p); + INP_WUNLOCK(inp); } else { error = EINVAL; } @@ -1752,17 +1869,17 @@ do { \ break; { struct ip6_pktopts **optp; - INP_WLOCK(in6p); - if (in6p->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { - INP_WUNLOCK(in6p); + INP_WLOCK(inp); + if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { + INP_WUNLOCK(inp); return (ECONNRESET); } - optp = &in6p->in6p_outputopts; + optp = &inp->in6p_outputopts; error = ip6_pcbopt(optname, (u_char *)&optval, sizeof(optval), optp, (td != NULL) ? td->td_ucred : NULL, uproto); - INP_WUNLOCK(in6p); + INP_WUNLOCK(inp); break; } @@ -1844,16 +1961,16 @@ do { \ break; optlen = sopt->sopt_valsize; optbuf = optbuf_storage; - INP_WLOCK(in6p); - if (in6p->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { - INP_WUNLOCK(in6p); + INP_WLOCK(inp); + if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { + INP_WUNLOCK(inp); return (ECONNRESET); } - optp = &in6p->in6p_outputopts; + optp = &inp->in6p_outputopts; error = ip6_pcbopt(optname, optbuf, optlen, optp, (td != NULL) ? td->td_ucred : NULL, uproto); - INP_WUNLOCK(in6p); + INP_WUNLOCK(inp); break; } #undef OPTSET @@ -1870,7 +1987,7 @@ do { \ case MCAST_LEAVE_GROUP: case MCAST_JOIN_SOURCE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: - error = ip6_setmoptions(in6p, sopt); + error = ip6_setmoptions(inp, sopt); break; case IPV6_PORTRANGE: @@ -1879,34 +1996,34 @@ do { \ if (error) break; - INP_WLOCK(in6p); + INP_WLOCK(inp); switch (optval) { case IPV6_PORTRANGE_DEFAULT: - in6p->inp_flags &= ~(INP_LOWPORT); - in6p->inp_flags &= ~(INP_HIGHPORT); + inp->inp_flags &= ~(INP_LOWPORT); + inp->inp_flags &= ~(INP_HIGHPORT); break; case IPV6_PORTRANGE_HIGH: - in6p->inp_flags &= ~(INP_LOWPORT); - in6p->inp_flags |= INP_HIGHPORT; + inp->inp_flags &= ~(INP_LOWPORT); + inp->inp_flags |= INP_HIGHPORT; break; case IPV6_PORTRANGE_LOW: - in6p->inp_flags &= ~(INP_HIGHPORT); - in6p->inp_flags |= INP_LOWPORT; + inp->inp_flags &= ~(INP_HIGHPORT); + inp->inp_flags |= INP_LOWPORT; break; default: error = EINVAL; break; } - INP_WUNLOCK(in6p); + INP_WUNLOCK(inp); break; #if defined(IPSEC) || defined(IPSEC_SUPPORT) case IPV6_IPSEC_POLICY: if (IPSEC_ENABLED(ipv6)) { - error = IPSEC_PCBCTL(ipv6, in6p, sopt); + error = IPSEC_PCBCTL(ipv6, inp, sopt); break; } /* FALLTHROUGH */ @@ -1974,7 +2091,7 @@ do { \ break; case IPV6_UNICAST_HOPS: - optval = in6p->in6p_hops; + optval = inp->in6p_hops; break; case IPV6_RECVPKTINFO: @@ -2000,7 +2117,7 @@ do { \ case IPV6_PORTRANGE: { int flags; - flags = in6p->inp_flags; + flags = inp->inp_flags; if (flags & INP_HIGHPORT) optval = IPV6_PORTRANGE_HIGH; else if (flags & INP_LOWPORT) @@ -2026,11 +2143,11 @@ do { \ break; case IPV6_FLOWID: - optval = in6p->inp_flowid; + optval = inp->inp_flowid; break; case IPV6_FLOWTYPE: - optval = in6p->inp_flowtype; + optval = inp->inp_flowtype; break; case IPV6_RECVFLOWID: @@ -2039,8 +2156,8 @@ do { \ #ifdef RSS case IPV6_RSSBUCKETID: retval = - rss_hash2bucket(in6p->inp_flowid, - in6p->inp_flowtype, + rss_hash2bucket(inp->inp_flowid, + inp->inp_flowtype, &rss_bucket); if (retval == 0) optval = rss_bucket; @@ -2076,12 +2193,12 @@ do { \ * XXX: we dot not consider the case of source * routing, or optional information to specify * the outgoing interface. - * Copy faddr out of in6p to avoid holding lock + * Copy faddr out of inp to avoid holding lock * on inp during route lookup. */ - INP_RLOCK(in6p); - bcopy(&in6p->in6p_faddr, &addr, sizeof(addr)); - INP_RUNLOCK(in6p); + INP_RLOCK(inp); + bcopy(&inp->in6p_faddr, &addr, sizeof(addr)); + INP_RUNLOCK(inp); error = ip6_getpmtu_ctl(so->so_fibnum, &addr, &pmtu); if (error) @@ -2133,20 +2250,20 @@ do { \ case IPV6_DONTFRAG: case IPV6_USE_MIN_MTU: case IPV6_PREFER_TEMPADDR: - error = ip6_getpcbopt(in6p, optname, sopt); + error = ip6_getpcbopt(inp, optname, sopt); break; case IPV6_MULTICAST_IF: case IPV6_MULTICAST_HOPS: case IPV6_MULTICAST_LOOP: case IPV6_MSFILTER: - error = ip6_getmoptions(in6p, sopt); + error = ip6_getmoptions(inp, sopt); break; #if defined(IPSEC) || defined(IPSEC_SUPPORT) case IPV6_IPSEC_POLICY: if (IPSEC_ENABLED(ipv6)) { - error = IPSEC_PCBCTL(ipv6, in6p, sopt); + error = IPSEC_PCBCTL(ipv6, inp, sopt); break; } /* FALLTHROUGH */ @@ -2166,7 +2283,7 @@ ip6_raw_ctloutput(struct socket *so, struct sockopt *sopt) { int error = 0, optval, optlen; const int icmp6off = offsetof(struct icmp6_hdr, icmp6_cksum); - struct inpcb *in6p = sotoinpcb(so); + struct inpcb *inp = sotoinpcb(so); int level, op, optname; level = sopt->sopt_level; @@ -2198,22 +2315,25 @@ ip6_raw_ctloutput(struct socket *so, struct sockopt *sopt) sizeof(optval)); if (error) break; - if ((optval % 2) != 0) { - /* the API assumes even offset values */ + if (optval < -1 || (optval % 2) != 0) { + /* + * The API assumes non-negative even offset + * values or -1 as a special value. + */ error = EINVAL; } else if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) { if (optval != icmp6off) error = EINVAL; } else - in6p->in6p_cksum = optval; + inp->in6p_cksum = optval; break; case SOPT_GET: if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) optval = icmp6off; else - optval = in6p->in6p_cksum; + optval = inp->in6p_cksum; error = sooptcopyout(sopt, &optval, sizeof(optval)); break; @@ -2312,16 +2432,16 @@ ip6_pcbopt(int optname, u_char *buf, int len, struct ip6_pktopts **pktopt, #define GET_PKTOPT_VAR(field, lenexpr) do { \ if (pktopt && pktopt->field) { \ - INP_RUNLOCK(in6p); \ + INP_RUNLOCK(inp); \ optdata = malloc(sopt->sopt_valsize, M_TEMP, M_WAITOK); \ malloc_optdata = true; \ - INP_RLOCK(in6p); \ - if (in6p->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { \ - INP_RUNLOCK(in6p); \ + INP_RLOCK(inp); \ + if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { \ + INP_RUNLOCK(inp); \ free(optdata, M_TEMP); \ return (ECONNRESET); \ } \ - pktopt = in6p->in6p_outputopts; \ + pktopt = inp->in6p_outputopts; \ if (pktopt && pktopt->field) { \ optdatalen = min(lenexpr, sopt->sopt_valsize); \ bcopy(&pktopt->field, optdata, optdatalen); \ @@ -2340,7 +2460,7 @@ ip6_pcbopt(int optname, u_char *buf, int len, struct ip6_pktopts **pktopt, pktopt->field->sa_len) static int -ip6_getpcbopt(struct inpcb *in6p, int optname, struct sockopt *sopt) +ip6_getpcbopt(struct inpcb *inp, int optname, struct sockopt *sopt) { void *optdata = NULL; bool malloc_optdata = false; @@ -2352,8 +2472,8 @@ ip6_getpcbopt(struct inpcb *in6p, int optname, struct sockopt *sopt) int defpreftemp = IP6PO_TEMPADDR_SYSTEM; struct ip6_pktopts *pktopt; - INP_RLOCK(in6p); - pktopt = in6p->in6p_outputopts; + INP_RLOCK(inp); + pktopt = inp->in6p_outputopts; switch (optname) { case IPV6_PKTINFO: @@ -2413,10 +2533,10 @@ ip6_getpcbopt(struct inpcb *in6p, int optname, struct sockopt *sopt) #ifdef DIAGNOSTIC panic("ip6_getpcbopt: unexpected option\n"); #endif - INP_RUNLOCK(in6p); + INP_RUNLOCK(inp); return (ENOPROTOOPT); } - INP_RUNLOCK(in6p); + INP_RUNLOCK(inp); error = sooptcopyout(sopt, optdata, optdatalen); if (malloc_optdata) @@ -3098,23 +3218,23 @@ ip6_splithdr(struct mbuf *m, struct ip6_exthdrs *exthdrs) * Compute IPv6 extension header length. */ int -ip6_optlen(struct inpcb *in6p) +ip6_optlen(struct inpcb *inp) { int len; - if (!in6p->in6p_outputopts) + if (!inp->in6p_outputopts) return 0; len = 0; #define elen(x) \ (((struct ip6_ext *)(x)) ? (((struct ip6_ext *)(x))->ip6e_len + 1) << 3 : 0) - len += elen(in6p->in6p_outputopts->ip6po_hbh); - if (in6p->in6p_outputopts->ip6po_rthdr) + len += elen(inp->in6p_outputopts->ip6po_hbh); + if (inp->in6p_outputopts->ip6po_rthdr) /* dest1 is valid with rthdr only */ - len += elen(in6p->in6p_outputopts->ip6po_dest1); - len += elen(in6p->in6p_outputopts->ip6po_rthdr); - len += elen(in6p->in6p_outputopts->ip6po_dest2); + len += elen(inp->in6p_outputopts->ip6po_dest1); + len += elen(inp->in6p_outputopts->ip6po_rthdr); + len += elen(inp->in6p_outputopts->ip6po_dest2); return len; #undef elen } diff --git a/freebsd/sys/netinet6/ip6_var.h b/freebsd/sys/netinet6/ip6_var.h index f235572d..be748b31 100644 --- a/freebsd/sys/netinet6/ip6_var.h +++ b/freebsd/sys/netinet6/ip6_var.h @@ -68,6 +68,7 @@ #include <sys/epoch.h> +struct ip6asfrag; /* * IP6 reassembly queue structure. Each fragment * being reassembled is attached to one of these structures. @@ -83,25 +84,10 @@ struct ip6q { struct ip6q *ip6q_next; struct ip6q *ip6q_prev; int ip6q_unfrglen; /* len of unfragmentable part */ -#ifdef notyet - u_char *ip6q_nxtp; -#endif int ip6q_nfrag; /* # of fragments */ struct label *ip6q_label; }; -struct ip6asfrag { - struct ip6asfrag *ip6af_down; - struct ip6asfrag *ip6af_up; - struct mbuf *ip6af_m; - int ip6af_offset; /* offset in ip6af_m to next header */ - int ip6af_frglen; /* fragmentable part length */ - int ip6af_off; /* fragment offset */ - u_int16_t ip6af_mff; /* more fragment bit in frag off */ -}; - -#define IP6_REASS_MBUF(ip6af) (*(struct mbuf **)&((ip6af)->ip6af_m)) - /* * IP6 reinjecting structure. */ @@ -110,6 +96,7 @@ struct ip6_direct_ctx { uint32_t ip6dc_off; /* offset to next header */ }; +#if defined(_NETINET6_IN6_VAR_H_) && defined(_KERNEL) /* * Structure attached to inpcb.in6p_moptions and * passed to ip6_output when IPv6 multicast options are in use. @@ -119,13 +106,11 @@ struct ip6_moptions { struct ifnet *im6o_multicast_ifp; /* ifp for outgoing multicasts */ u_char im6o_multicast_hlim; /* hoplimit for outgoing multicasts */ u_char im6o_multicast_loop; /* 1 >= hear sends if a member */ - u_short im6o_num_memberships; /* no. memberships this socket */ - u_short im6o_max_memberships; /* max memberships this socket */ - struct in6_multi **im6o_membership; /* group memberships */ - struct in6_mfilter *im6o_mfilters; /* source filters */ - struct epoch_context imo6_epoch_ctx; + struct ip6_mfilter_head im6o_head; /* group membership list */ }; - +#else +struct ip6_moptions; +#endif /* * Control options for outgoing packets */ @@ -208,6 +193,7 @@ struct ip6stat { uint64_t ip6s_localout; /* total ip packets generated here */ uint64_t ip6s_odropped; /* lost packets due to nobufs, etc. */ uint64_t ip6s_reassembled; /* total packets reassembled ok */ + uint64_t ip6s_atomicfrags; /* atomic fragments */ uint64_t ip6s_fragmented; /* datagrams successfully fragmented */ uint64_t ip6s_ofragments; /* output fragments created */ uint64_t ip6s_cantfrag; /* don't fragment flag was set, etc. */ @@ -299,12 +285,6 @@ VNET_DECLARE(int, ip6_v6only); VNET_DECLARE(struct socket *, ip6_mrouter); /* multicast routing daemon */ VNET_DECLARE(int, ip6_sendredirects); /* send IP redirects when forwarding? */ -VNET_DECLARE(int, ip6_maxfragpackets); /* Maximum packets in reassembly - * queue */ -extern int ip6_maxfrags; /* Maximum fragments in reassembly - * queue */ -VNET_DECLARE(int, ip6_maxfragbucketsize); /* Maximum reassembly queues per bucket */ -VNET_DECLARE(int, ip6_maxfragsperpacket); /* Maximum fragments per packet */ VNET_DECLARE(int, ip6_accept_rtadv); /* Acts as a host not a router */ VNET_DECLARE(int, ip6_no_radr); /* No defroute from RA */ VNET_DECLARE(int, ip6_norbit_raif); /* Disable R-bit in NA on RA @@ -318,9 +298,6 @@ VNET_DECLARE(int, ip6_hdrnestlimit); /* upper limit of # of extension VNET_DECLARE(int, ip6_dad_count); /* DupAddrDetectionTransmits */ #define V_ip6_mrouter VNET(ip6_mrouter) #define V_ip6_sendredirects VNET(ip6_sendredirects) -#define V_ip6_maxfragpackets VNET(ip6_maxfragpackets) -#define V_ip6_maxfragbucketsize VNET(ip6_maxfragbucketsize) -#define V_ip6_maxfragsperpacket VNET(ip6_maxfragsperpacket) #define V_ip6_accept_rtadv VNET(ip6_accept_rtadv) #define V_ip6_no_radr VNET(ip6_no_radr) #define V_ip6_norbit_raif VNET(ip6_norbit_raif) @@ -346,13 +323,20 @@ VNET_DECLARE(int, ip6_use_defzone); /* Whether to use the default scope * zone when unspecified */ #define V_ip6_use_defzone VNET(ip6_use_defzone) -VNET_DECLARE (struct pfil_head, inet6_pfil_hook); /* packet filter hooks */ -#define V_inet6_pfil_hook VNET(inet6_pfil_hook) +VNET_DECLARE(struct pfil_head *, inet6_pfil_head); +#define V_inet6_pfil_head VNET(inet6_pfil_head) +#define PFIL_INET6_NAME "inet6" + #ifdef IPSTEALTH VNET_DECLARE(int, ip6stealth); #define V_ip6stealth VNET(ip6stealth) #endif +#ifdef EXPERIMENTAL +VNET_DECLARE(int, nd6_ignore_ipv6_only_ra); +#define V_nd6_ignore_ipv6_only_ra VNET(nd6_ignore_ipv6_only_ra) +#endif + extern struct pr_usrreqs rip6_usrreqs; struct sockopt; @@ -407,7 +391,6 @@ int ip6_fragment(struct ifnet *, struct mbuf *, int, u_char, int, int route6_input(struct mbuf **, int *, int); -void frag6_set_bucketsize(void); void frag6_init(void); int frag6_input(struct mbuf **, int *, int); void frag6_slowtimo(void); diff --git a/freebsd/sys/netinet6/mld6.c b/freebsd/sys/netinet6/mld6.c index b00f03ef..cc946f67 100644 --- a/freebsd/sys/netinet6/mld6.c +++ b/freebsd/sys/netinet6/mld6.c @@ -112,7 +112,7 @@ static void mli_delete_locked(const struct ifnet *); static void mld_dispatch_packet(struct mbuf *); static void mld_dispatch_queue(struct mbufq *, int); static void mld_final_leave(struct in6_multi *, struct mld_ifsoftc *); -static void mld_fasttimo_vnet(void); +static void mld_fasttimo_vnet(struct in6_multi_head *inmh); static int mld_handle_state_change(struct in6_multi *, struct mld_ifsoftc *); static int mld_initial_join(struct in6_multi *, struct mld_ifsoftc *, @@ -141,14 +141,15 @@ static int mld_v2_enqueue_group_record(struct mbufq *, struct in6_multi *, const int, const int, const int, const int); static int mld_v2_input_query(struct ifnet *, const struct ip6_hdr *, - struct mbuf *, const int, const int); + struct mbuf *, struct mldv2_query *, const int, const int); static int mld_v2_merge_state_changes(struct in6_multi *, struct mbufq *); static void mld_v2_process_group_timers(struct in6_multi_head *, struct mbufq *, struct mbufq *, struct in6_multi *, const int); static int mld_v2_process_group_query(struct in6_multi *, - struct mld_ifsoftc *mli, int, struct mbuf *, const int); + struct mld_ifsoftc *mli, int, struct mbuf *, + struct mldv2_query *, const int); static int sysctl_mld_gsr(SYSCTL_HANDLER_ARGS); static int sysctl_mld_ifinfo(SYSCTL_HANDLER_ARGS); @@ -245,6 +246,10 @@ static int mld_v1enable = 1; SYSCTL_INT(_net_inet6_mld, OID_AUTO, v1enable, CTLFLAG_RWTUN, &mld_v1enable, 0, "Enable fallback to MLDv1"); +static int mld_v2enable = 1; +SYSCTL_INT(_net_inet6_mld, OID_AUTO, v2enable, CTLFLAG_RWTUN, + &mld_v2enable, 0, "Enable MLDv2"); + static int mld_use_allow = 1; SYSCTL_INT(_net_inet6_mld, OID_AUTO, use_allow, CTLFLAG_RWTUN, &mld_use_allow, 0, "Use ALLOW/BLOCK for RFC 4604 SSM joins/leaves"); @@ -535,45 +540,48 @@ out: * XXX This routine is also bitten by unlocked ifma_protospec access. */ void -mld_ifdetach(struct ifnet *ifp) +mld_ifdetach(struct ifnet *ifp, struct in6_multi_head *inmh) { + struct epoch_tracker et; struct mld_ifsoftc *mli; - struct ifmultiaddr *ifma, *next; + struct ifmultiaddr *ifma; struct in6_multi *inm; - struct in6_multi_head inmh; CTR3(KTR_MLD, "%s: called for ifp %p(%s)", __func__, ifp, if_name(ifp)); - SLIST_INIT(&inmh); IN6_MULTI_LIST_LOCK_ASSERT(); MLD_LOCK(); mli = MLD_IFINFO(ifp); - if (mli->mli_version == MLD_VERSION_2) { - IF_ADDR_WLOCK(ifp); - restart: - CK_STAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) { - if (ifma->ifma_addr->sa_family != AF_INET6 || - ifma->ifma_protospec == NULL) - continue; - inm = (struct in6_multi *)ifma->ifma_protospec; - if (inm->in6m_state == MLD_LEAVING_MEMBER) { - in6m_disconnect(inm); - in6m_rele_locked(&inmh, inm); - ifma->ifma_protospec = NULL; - } + IF_ADDR_WLOCK(ifp); + /* + * Extract list of in6_multi associated with the detaching ifp + * which the PF_INET6 layer is about to release. + */ + NET_EPOCH_ENTER(et); + CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { + inm = in6m_ifmultiaddr_get_inm(ifma); + if (inm == NULL) + continue; + in6m_disconnect_locked(inmh, inm); + + if (mli->mli_version == MLD_VERSION_2) { in6m_clear_recorded(inm); - if (__predict_false(ifma6_restart)) { - ifma6_restart = false; - goto restart; + + /* + * We need to release the final reference held + * for issuing the INCLUDE {}. + */ + if (inm->in6m_state == MLD_LEAVING_MEMBER) { + inm->in6m_state = MLD_NOT_MEMBER; + in6m_rele_locked(inmh, inm); } } - IF_ADDR_WUNLOCK(ifp); } - + NET_EPOCH_EXIT(et); + IF_ADDR_WUNLOCK(ifp); MLD_UNLOCK(); - in6m_release_list_deferred(&inmh); } /* @@ -630,6 +638,7 @@ static int mld_v1_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6, /*const*/ struct mld_hdr *mld) { + struct epoch_tracker et; struct ifmultiaddr *ifma; struct mld_ifsoftc *mli; struct in6_multi *inm; @@ -697,7 +706,7 @@ mld_v1_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6, if (timer == 0) timer = 1; - IF_ADDR_RLOCK(ifp); + NET_EPOCH_ENTER(et); if (is_general_query) { /* * For each reporting group joined on this @@ -706,10 +715,9 @@ mld_v1_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6, CTR2(KTR_MLD, "process v1 general query on ifp %p(%s)", ifp, if_name(ifp)); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { - if (ifma->ifma_addr->sa_family != AF_INET6 || - ifma->ifma_protospec == NULL) + inm = in6m_ifmultiaddr_get_inm(ifma); + if (inm == NULL) continue; - inm = (struct in6_multi *)ifma->ifma_protospec; mld_v1_update_group(inm, timer); } } else { @@ -729,7 +737,7 @@ mld_v1_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6, in6_clearscope(&mld->mld_addr); } - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); MLD_UNLOCK(); IN6_MULTI_LIST_UNLOCK(); @@ -799,16 +807,16 @@ mld_v1_update_group(struct in6_multi *inm, const int timer) * Process a received MLDv2 general, group-specific or * group-and-source-specific query. * - * Assumes that the query header has been pulled up to sizeof(mldv2_query). + * Assumes that mld points to a struct mldv2_query which is stored in + * contiguous memory. * * Return 0 if successful, otherwise an appropriate error code is returned. */ static int mld_v2_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6, - struct mbuf *m, const int off, const int icmp6len) + struct mbuf *m, struct mldv2_query *mld, const int off, const int icmp6len) { struct mld_ifsoftc *mli; - struct mldv2_query *mld; struct in6_multi *inm; uint32_t maxdelay, nsrc, qqi; int is_general_query; @@ -818,7 +826,12 @@ mld_v2_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6, char ip6tbuf[INET6_ADDRSTRLEN]; #endif - is_general_query = 0; + if (!mld_v2enable) { + CTR3(KTR_MLD, "ignore v2 query src %s on ifp %p(%s)", + ip6_sprintf(ip6tbuf, &ip6->ip6_src), + ifp, if_name(ifp)); + return (0); + } /* * RFC3810 Section 6.2: MLD queries must originate from @@ -831,9 +844,9 @@ mld_v2_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6, return (0); } - CTR2(KTR_MLD, "input v2 query on ifp %p(%s)", ifp, if_name(ifp)); + is_general_query = 0; - mld = (struct mldv2_query *)(mtod(m, uint8_t *) + off); + CTR2(KTR_MLD, "input v2 query on ifp %p(%s)", ifp, if_name(ifp)); maxdelay = ntohs(mld->mld_maxdelay); /* in 1/10ths of a second */ if (maxdelay >= 32768) { @@ -926,6 +939,8 @@ mld_v2_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6, V_interface_timers_running6 = 1; } } else { + struct epoch_tracker et; + /* * MLDv2 Group-specific or Group-and-source-specific Query. * @@ -934,10 +949,10 @@ mld_v2_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6, * Queries for groups we are not a member of on this * link are simply ignored. */ - IF_ADDR_RLOCK(ifp); + NET_EPOCH_ENTER(et); inm = in6m_lookup_locked(ifp, &mld->mld_addr); if (inm == NULL) { - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); goto out_locked; } if (nsrc > 0) { @@ -945,7 +960,7 @@ mld_v2_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6, &V_mld_gsrdelay)) { CTR1(KTR_MLD, "%s: GS query throttled.", __func__); - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); goto out_locked; } } @@ -959,11 +974,11 @@ mld_v2_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6, * group-specific or group-and-source query. */ if (mli->mli_v2_timer == 0 || mli->mli_v2_timer >= timer) - mld_v2_process_group_query(inm, mli, timer, m, off); + mld_v2_process_group_query(inm, mli, timer, m, mld, off); /* XXX Clear embedded scope ID as userland won't expect it. */ in6_clearscope(&mld->mld_addr); - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); } out_locked: @@ -980,9 +995,8 @@ out_locked: */ static int mld_v2_process_group_query(struct in6_multi *inm, struct mld_ifsoftc *mli, - int timer, struct mbuf *m0, const int off) + int timer, struct mbuf *m0, struct mldv2_query *mld, const int off) { - struct mldv2_query *mld; int retval; uint16_t nsrc; @@ -990,7 +1004,6 @@ mld_v2_process_group_query(struct in6_multi *inm, struct mld_ifsoftc *mli, MLD_LOCK_ASSERT(); retval = 0; - mld = (struct mldv2_query *)(mtod(m0, uint8_t *) + off); switch (inm->in6m_state) { case MLD_NOT_MEMBER: @@ -1010,6 +1023,15 @@ mld_v2_process_group_query(struct in6_multi *inm, struct mld_ifsoftc *mli, nsrc = ntohs(mld->mld_numsrc); + /* Length should be checked by calling function. */ + KASSERT((m0->m_flags & M_PKTHDR) == 0 || + m0->m_pkthdr.len >= off + sizeof(struct mldv2_query) + + nsrc * sizeof(struct in6_addr), + ("mldv2 packet is too short: (%d bytes < %zd bytes, m=%p)", + m0->m_pkthdr.len, off + sizeof(struct mldv2_query) + + nsrc * sizeof(struct in6_addr), m0)); + + /* * Deal with group-specific queries upfront. * If any group query is already pending, purge any recorded @@ -1051,28 +1073,20 @@ mld_v2_process_group_query(struct in6_multi *inm, struct mld_ifsoftc *mli, * report for those sources. */ if (inm->in6m_nsrc > 0) { - struct mbuf *m; - uint8_t *sp; + struct in6_addr srcaddr; int i, nrecorded; int soff; - m = m0; soff = off + sizeof(struct mldv2_query); nrecorded = 0; for (i = 0; i < nsrc; i++) { - sp = mtod(m, uint8_t *) + soff; - retval = in6m_record_source(inm, - (const struct in6_addr *)sp); + m_copydata(m0, soff, sizeof(struct in6_addr), + (caddr_t)&srcaddr); + retval = in6m_record_source(inm, &srcaddr); if (retval < 0) break; nrecorded += retval; soff += sizeof(struct in6_addr); - if (soff >= m->m_len) { - soff = soff - m->m_len; - m = m->m_next; - if (m == NULL) - break; - } } if (nrecorded > 0) { CTR1(KTR_MLD, @@ -1098,6 +1112,7 @@ mld_v1_input_report(struct ifnet *ifp, const struct ip6_hdr *ip6, /*const*/ struct mld_hdr *mld) { struct in6_addr src, dst; + struct epoch_tracker et; struct in6_ifaddr *ia; struct in6_multi *inm; #ifdef KTR @@ -1173,7 +1188,7 @@ mld_v1_input_report(struct ifnet *ifp, const struct ip6_hdr *ip6, IN6_MULTI_LIST_LOCK(); MLD_LOCK(); - IF_ADDR_RLOCK(ifp); + NET_EPOCH_ENTER(et); /* * MLDv1 report suppression. @@ -1221,7 +1236,7 @@ mld_v1_input_report(struct ifnet *ifp, const struct ip6_hdr *ip6, } out_locked: - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); MLD_UNLOCK(); IN6_MULTI_LIST_UNLOCK(); @@ -1281,8 +1296,8 @@ mld_input(struct mbuf *m, int off, int icmp6len) if (mld_v1_input_query(ifp, ip6, mld) != 0) return (0); } else if (icmp6len >= sizeof(struct mldv2_query)) { - if (mld_v2_input_query(ifp, ip6, m, off, - icmp6len) != 0) + if (mld_v2_input_query(ifp, ip6, m, + (struct mldv2_query *)mld, off, icmp6len) != 0) return (0); } break; @@ -1311,15 +1326,19 @@ mld_input(struct mbuf *m, int off, int icmp6len) void mld_fasttimo(void) { + struct in6_multi_head inmh; VNET_ITERATOR_DECL(vnet_iter); + SLIST_INIT(&inmh); + VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); - mld_fasttimo_vnet(); + mld_fasttimo_vnet(&inmh); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); + in6m_release_list_deferred(&inmh); } /* @@ -1328,15 +1347,15 @@ mld_fasttimo(void) * VIMAGE: Assume caller has set up our curvnet. */ static void -mld_fasttimo_vnet(void) +mld_fasttimo_vnet(struct in6_multi_head *inmh) { + struct epoch_tracker et; struct mbufq scq; /* State-change packets */ struct mbufq qrq; /* Query response packets */ struct ifnet *ifp; struct mld_ifsoftc *mli; - struct ifmultiaddr *ifma, *next; - struct in6_multi *inm, *tinm; - struct in6_multi_head inmh; + struct ifmultiaddr *ifma; + struct in6_multi *inm; int uri_fasthz; uri_fasthz = 0; @@ -1351,7 +1370,6 @@ mld_fasttimo_vnet(void) !V_state_change_timers_running6) return; - SLIST_INIT(&inmh); IN6_MULTI_LIST_LOCK(); MLD_LOCK(); @@ -1397,25 +1415,20 @@ mld_fasttimo_vnet(void) } IF_ADDR_WLOCK(ifp); - restart: - CK_STAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) { - if (ifma->ifma_addr->sa_family != AF_INET6 || - ifma->ifma_protospec == NULL) + NET_EPOCH_ENTER(et); + CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { + inm = in6m_ifmultiaddr_get_inm(ifma); + if (inm == NULL) continue; - inm = (struct in6_multi *)ifma->ifma_protospec; switch (mli->mli_version) { case MLD_VERSION_1: - mld_v1_process_group_timer(&inmh, inm); + mld_v1_process_group_timer(inmh, inm); break; case MLD_VERSION_2: - mld_v2_process_group_timers(&inmh, &qrq, + mld_v2_process_group_timers(inmh, &qrq, &scq, inm, uri_fasthz); break; } - if (__predict_false(ifma6_restart)) { - ifma6_restart = false; - goto restart; - } } IF_ADDR_WUNLOCK(ifp); @@ -1429,9 +1442,8 @@ mld_fasttimo_vnet(void) * IF_ADDR_LOCK internally as well as * ip6_output() to transmit a packet. */ - SLIST_FOREACH_SAFE(inm, &inmh, in6m_nrele, tinm) { - SLIST_REMOVE_HEAD(&inmh, - in6m_nrele); + while ((inm = SLIST_FIRST(inmh)) != NULL) { + SLIST_REMOVE_HEAD(inmh, in6m_defer); (void)mld_v1_transmit_report(inm, MLD_LISTENER_REPORT); } @@ -1439,14 +1451,9 @@ mld_fasttimo_vnet(void) case MLD_VERSION_2: mld_dispatch_queue(&qrq, 0); mld_dispatch_queue(&scq, 0); - - /* - * Free the in_multi reference(s) for - * this lifecycle. - */ - in6m_release_list_deferred(&inmh); break; } + NET_EPOCH_EXIT(et); } out_locked: @@ -1486,8 +1493,7 @@ mld_v1_process_group_timer(struct in6_multi_head *inmh, struct in6_multi *inm) case MLD_REPORTING_MEMBER: if (report_timer_expired) { inm->in6m_state = MLD_IDLE_MEMBER; - in6m_disconnect(inm); - in6m_rele_locked(inmh, inm); + SLIST_INSERT_HEAD(inmh, inm, in6m_defer); } break; case MLD_G_QUERY_PENDING_MEMBER: @@ -1611,7 +1617,7 @@ mld_v2_process_group_timers(struct in6_multi_head *inmh, if (inm->in6m_state == MLD_LEAVING_MEMBER && inm->in6m_scrv == 0) { inm->in6m_state = MLD_NOT_MEMBER; - in6m_disconnect(inm); + in6m_disconnect_locked(inmh, inm); in6m_rele_locked(inmh, inm); } } @@ -1656,10 +1662,11 @@ mld_set_version(struct mld_ifsoftc *mli, const int version) static void mld_v2_cancel_link_timers(struct mld_ifsoftc *mli) { - struct ifmultiaddr *ifma, *next; + struct epoch_tracker et; + struct in6_multi_head inmh; + struct ifmultiaddr *ifma; struct ifnet *ifp; struct in6_multi *inm; - struct in6_multi_head inmh; CTR3(KTR_MLD, "%s: cancel v2 timers on ifp %p(%s)", __func__, mli->mli_ifp, if_name(mli->mli_ifp)); @@ -1682,12 +1689,11 @@ mld_v2_cancel_link_timers(struct mld_ifsoftc *mli) ifp = mli->mli_ifp; IF_ADDR_WLOCK(ifp); - restart: - CK_STAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) { - if (ifma->ifma_addr->sa_family != AF_INET6 || - ifma->ifma_protospec == NULL) + NET_EPOCH_ENTER(et); + CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { + inm = in6m_ifmultiaddr_get_inm(ifma); + if (inm == NULL) continue; - inm = (struct in6_multi *)ifma->ifma_protospec; switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: @@ -1702,9 +1708,9 @@ mld_v2_cancel_link_timers(struct mld_ifsoftc *mli) * version, we need to release the final * reference held for issuing the INCLUDE {}. */ - in6m_disconnect(inm); + if (inm->in6m_refcount == 1) + in6m_disconnect_locked(&inmh, inm); in6m_rele_locked(&inmh, inm); - ifma->ifma_protospec = NULL; /* FALLTHROUGH */ case MLD_G_QUERY_PENDING_MEMBER: case MLD_SG_QUERY_PENDING_MEMBER: @@ -1720,11 +1726,8 @@ mld_v2_cancel_link_timers(struct mld_ifsoftc *mli) mbufq_drain(&inm->in6m_scq); break; } - if (__predict_false(ifma6_restart)) { - ifma6_restart = false; - goto restart; - } } + NET_EPOCH_EXIT(et); IF_ADDR_WUNLOCK(ifp); in6m_release_list_deferred(&inmh); } @@ -1897,6 +1900,14 @@ mld_change_state(struct in6_multi *inm, const int delay) error = 0; /* + * Check if the in6_multi has already been disconnected. + */ + if (inm->in6m_ifp == NULL) { + CTR1(KTR_MLD, "%s: inm is disconnected", __func__); + return (0); + } + + /* * Try to detect if the upper layer just asked us to change state * for an interface which has now gone away. */ @@ -2006,6 +2017,7 @@ mld_initial_join(struct in6_multi *inm, struct mld_ifsoftc *mli, if (mli->mli_version == MLD_VERSION_2 && inm->in6m_state == MLD_LEAVING_MEMBER) { inm->in6m_refcount--; + MPASS(inm->in6m_refcount > 0); } inm->in6m_state = MLD_REPORTING_MEMBER; @@ -2985,6 +2997,7 @@ mld_v2_merge_state_changes(struct in6_multi *inm, struct mbufq *scq) static void mld_v2_dispatch_general_query(struct mld_ifsoftc *mli) { + struct epoch_tracker et; struct ifmultiaddr *ifma; struct ifnet *ifp; struct in6_multi *inm; @@ -3007,13 +3020,11 @@ mld_v2_dispatch_general_query(struct mld_ifsoftc *mli) ifp = mli->mli_ifp; - IF_ADDR_RLOCK(ifp); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { - if (ifma->ifma_addr->sa_family != AF_INET6 || - ifma->ifma_protospec == NULL) + inm = in6m_ifmultiaddr_get_inm(ifma); + if (inm == NULL) continue; - - inm = (struct in6_multi *)ifma->ifma_protospec; KASSERT(ifp == inm->in6m_ifp, ("%s: inconsistent ifp", __func__)); @@ -3038,7 +3049,7 @@ mld_v2_dispatch_general_query(struct mld_ifsoftc *mli) break; } } - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); send: mld_dispatch_queue(&mli->mli_gq, MLD_MAX_RESPONSE_BURST); diff --git a/freebsd/sys/netinet6/mld6_var.h b/freebsd/sys/netinet6/mld6_var.h index 166c2055..8dc2ffa4 100644 --- a/freebsd/sys/netinet6/mld6_var.h +++ b/freebsd/sys/netinet6/mld6_var.h @@ -160,12 +160,13 @@ struct mld_ifsoftc { #define MLD_IFINFO(ifp) \ (((struct in6_ifextra *)(ifp)->if_afdata[AF_INET6])->mld_ifinfo) +struct in6_multi_head; int mld_change_state(struct in6_multi *, const int); struct mld_ifsoftc * mld_domifattach(struct ifnet *); void mld_domifdetach(struct ifnet *); void mld_fasttimo(void); -void mld_ifdetach(struct ifnet *); +void mld_ifdetach(struct ifnet *, struct in6_multi_head *); int mld_input(struct mbuf *, int, int); void mld_slowtimo(void); diff --git a/freebsd/sys/netinet6/nd6.c b/freebsd/sys/netinet6/nd6.c index f065815c..201b4d40 100644 --- a/freebsd/sys/netinet6/nd6.c +++ b/freebsd/sys/netinet6/nd6.c @@ -41,6 +41,7 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/systm.h> +#include <sys/eventhandler.h> #include <sys/callout.h> #include <sys/lock.h> #include <sys/malloc.h> @@ -115,7 +116,7 @@ VNET_DEFINE(int, nd6_debug) = 1; VNET_DEFINE(int, nd6_debug) = 0; #endif -static eventhandler_tag lle_event_eh, iflladdr_event_eh; +static eventhandler_tag lle_event_eh, iflladdr_event_eh, ifnet_link_event_eh; VNET_DEFINE(struct nd_drhead, nd_defrouter); VNET_DEFINE(struct nd_prhead, nd_prefix); @@ -235,6 +236,8 @@ nd6_init(void) NULL, EVENTHANDLER_PRI_ANY); iflladdr_event_eh = EVENTHANDLER_REGISTER(iflladdr_event, nd6_iflladdr, NULL, EVENTHANDLER_PRI_ANY); + ifnet_link_event_eh = EVENTHANDLER_REGISTER(ifnet_link_event, + nd6_ifnet_link_event, NULL, EVENTHANDLER_PRI_ANY); } } @@ -246,6 +249,7 @@ nd6_destroy() callout_drain(&V_nd6_slowtimo_ch); callout_drain(&V_nd6_timer_ch); if (IS_DEFAULT_VNET(curvnet)) { + EVENTHANDLER_DEREGISTER(ifnet_link_event, ifnet_link_event_eh); EVENTHANDLER_DEREGISTER(lle_event, lle_event_eh); EVENTHANDLER_DEREGISTER(iflladdr_event, iflladdr_event_eh); } @@ -300,9 +304,10 @@ nd6_ifattach(struct ifnet *ifp) void nd6_ifdetach(struct ifnet *ifp, struct nd_ifinfo *nd) { + struct epoch_tracker et; struct ifaddr *ifa, *next; - IF_ADDR_RLOCK(ifp); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, next) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; @@ -310,7 +315,7 @@ nd6_ifdetach(struct ifnet *ifp, struct nd_ifinfo *nd) /* stop DAD processing */ nd6_dad_stop(ifa); } - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); free(nd, M_IP6NDP); } @@ -898,6 +903,7 @@ nd6_timer(void *arg) struct nd_prhead prl; struct nd_defrouter *dr, *ndr; struct nd_prefix *pr, *npr; + struct ifnet *ifp; struct in6_ifaddr *ia6, *nia6; uint64_t genid; @@ -994,14 +1000,15 @@ nd6_timer(void *arg) * Check status of the interface. If it is down, * mark the address as tentative for future DAD. */ - if ((ia6->ia_ifp->if_flags & IFF_UP) == 0 || - (ia6->ia_ifp->if_drv_flags & IFF_DRV_RUNNING) - == 0 || - (ND_IFINFO(ia6->ia_ifp)->flags & - ND6_IFF_IFDISABLED) != 0) { + ifp = ia6->ia_ifp; + if ((ND_IFINFO(ifp)->flags & ND6_IFF_NO_DAD) == 0 && + ((ifp->if_flags & IFF_UP) == 0 || + (ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || + (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) != 0)){ ia6->ia6_flags &= ~IN6_IFF_DUPLICATED; ia6->ia6_flags |= IN6_IFF_TENTATIVE; } + /* * A new RA might have made a deprecated address * preferred. @@ -1064,12 +1071,13 @@ restart: static int regen_tmpaddr(struct in6_ifaddr *ia6) { + struct epoch_tracker et; struct ifaddr *ifa; struct ifnet *ifp; struct in6_ifaddr *public_ifa6 = NULL; ifp = ia6->ia_ifa.ifa_ifp; - IF_ADDR_RLOCK(ifp); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct in6_ifaddr *it6; @@ -1110,7 +1118,7 @@ regen_tmpaddr(struct in6_ifaddr *ia6) } if (public_ifa6 != NULL) ifa_ref(&public_ifa6->ia_ifa); - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); if (public_ifa6 != NULL) { int e; @@ -1345,17 +1353,19 @@ restart: * a p2p interface, the address should be a neighbor. */ if (ifp->if_flags & IFF_POINTOPOINT) { - IF_ADDR_RLOCK(ifp); + struct epoch_tracker et; + + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sin6_family) continue; if (ifa->ifa_dstaddr != NULL && sa_equal(addr, ifa->ifa_dstaddr)) { - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); return 1; } } - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); } /* @@ -1379,6 +1389,7 @@ restart: int nd6_is_addr_neighbor(const struct sockaddr_in6 *addr, struct ifnet *ifp) { + struct epoch_tracker et; struct llentry *lle; int rc = 0; @@ -1390,12 +1401,12 @@ nd6_is_addr_neighbor(const struct sockaddr_in6 *addr, struct ifnet *ifp) * Even if the address matches none of our addresses, it might be * in the neighbor cache. */ - IF_AFDATA_RLOCK(ifp); + NET_EPOCH_ENTER(et); if ((lle = nd6_lookup(&addr->sin6_addr, 0, ifp)) != NULL) { LLE_RUNLOCK(lle); rc = 1; } - IF_AFDATA_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); return (rc); } @@ -1624,6 +1635,7 @@ nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) struct in6_ndireq *ndi = (struct in6_ndireq *)data; struct in6_nbrinfo *nbi = (struct in6_nbrinfo *)data; struct in6_ndifreq *ndif = (struct in6_ndifreq *)data; + struct epoch_tracker et; int error = 0; if (ifp->if_afdata[AF_INET6] == NULL) @@ -1688,7 +1700,7 @@ nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) * do not clear ND6_IFF_IFDISABLED. * See RFC 4862, Section 5.4.5. */ - IF_ADDR_RLOCK(ifp); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; @@ -1697,7 +1709,7 @@ nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) IN6_IS_ADDR_LINKLOCAL(IA6_IN6(ia))) break; } - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); if (ifa != NULL) { /* LLA is duplicated. */ @@ -1718,7 +1730,7 @@ nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) ND_IFINFO(ifp)->flags |= ND6_IFF_IFDISABLED; if (V_ip6_dad_count > 0 && (ND_IFINFO(ifp)->flags & ND6_IFF_NO_DAD) == 0) { - IF_ADDR_RLOCK(ifp); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != @@ -1727,7 +1739,7 @@ nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) ia = (struct in6_ifaddr *)ifa; ia->ia6_flags |= IN6_IFF_TENTATIVE; } - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); } } @@ -1746,7 +1758,7 @@ nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) * address is assigned, and IFF_UP, try to * assign one. */ - IF_ADDR_RLOCK(ifp); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != @@ -1756,7 +1768,7 @@ nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) if (IN6_IS_ADDR_LINKLOCAL(IA6_IN6(ia))) break; } - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); if (ifa != NULL) /* No LLA is configured. */ in6_ifattach(ifp, NULL); @@ -1833,9 +1845,9 @@ nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) if ((error = in6_setscope(&nb_addr, ifp, NULL)) != 0) return (error); - IF_AFDATA_RLOCK(ifp); + NET_EPOCH_ENTER(et); ln = nd6_lookup(&nb_addr, 0, ifp); - IF_AFDATA_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); if (ln == NULL) { error = EINVAL; @@ -1960,6 +1972,7 @@ nd6_cache_lladdr(struct ifnet *ifp, struct in6_addr *from, char *lladdr, int flags; uint16_t router = 0; struct sockaddr_in6 sin6; + struct epoch_tracker et; struct mbuf *chain = NULL; u_char linkhdr[LLE_MAX_LINKHDR]; size_t linkhdrsize; @@ -1984,9 +1997,9 @@ nd6_cache_lladdr(struct ifnet *ifp, struct in6_addr *from, char *lladdr, * description on it in NS section (RFC 2461 7.2.3). */ flags = lladdr ? LLE_EXCLUSIVE : 0; - IF_AFDATA_RLOCK(ifp); + NET_EPOCH_ENTER(et); ln = nd6_lookup(from, flags, ifp); - IF_AFDATA_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); is_newentry = 0; if (ln == NULL) { flags |= LLE_EXCLUSIVE; @@ -2128,13 +2141,14 @@ nd6_cache_lladdr(struct ifnet *ifp, struct in6_addr *from, char *lladdr, static void nd6_slowtimo(void *arg) { + struct epoch_tracker et; CURVNET_SET((struct vnet *) arg); struct nd_ifinfo *nd6if; struct ifnet *ifp; callout_reset(&V_nd6_slowtimo_ch, ND6_SLOWTIMER_INTERVAL * hz, nd6_slowtimo, curvnet); - IFNET_RLOCK_NOSLEEP(); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (ifp->if_afdata[AF_INET6] == NULL) continue; @@ -2151,7 +2165,7 @@ nd6_slowtimo(void *arg) nd6if->reachable = ND_COMPUTE_RTIME(nd6if->basereachable); } } - IFNET_RUNLOCK_NOSLEEP(); + NET_EPOCH_EXIT(et); CURVNET_RESTORE(); } @@ -2244,6 +2258,7 @@ nd6_resolve(struct ifnet *ifp, int is_gw, struct mbuf *m, const struct sockaddr *sa_dst, u_char *desten, uint32_t *pflags, struct llentry **plle) { + struct epoch_tracker et; struct llentry *ln = NULL; const struct sockaddr_in6 *dst6; @@ -2272,7 +2287,7 @@ nd6_resolve(struct ifnet *ifp, int is_gw, struct mbuf *m, } } - IF_AFDATA_RLOCK(ifp); + NET_EPOCH_ENTER(et); ln = nd6_lookup(&dst6->sin6_addr, plle ? LLE_EXCLUSIVE : LLE_UNLOCKED, ifp); if (ln != NULL && (ln->r_flags & RLLE_VALID) != 0) { @@ -2292,11 +2307,11 @@ nd6_resolve(struct ifnet *ifp, int is_gw, struct mbuf *m, *plle = ln; LLE_WUNLOCK(ln); } - IF_AFDATA_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); return (0); } else if (plle && ln) LLE_WUNLOCK(ln); - IF_AFDATA_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); return (nd6_resolve_slow(ifp, 0, m, dst6, desten, pflags, plle)); } @@ -2330,9 +2345,11 @@ nd6_resolve_slow(struct ifnet *ifp, int flags, struct mbuf *m, * or an anycast address(i.e. not a multicast). */ if (lle == NULL) { - IF_AFDATA_RLOCK(ifp); + struct epoch_tracker et; + + NET_EPOCH_ENTER(et); lle = nd6_lookup(&dst->sin6_addr, LLE_EXCLUSIVE, ifp); - IF_AFDATA_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); if ((lle == NULL) && nd6_is_addr_neighbor(dst, ifp)) { /* * Since nd6_is_addr_neighbor() internally calls nd6_lookup(), diff --git a/freebsd/sys/netinet6/nd6.h b/freebsd/sys/netinet6/nd6.h index 7544d23c..ffc88cb5 100644 --- a/freebsd/sys/netinet6/nd6.h +++ b/freebsd/sys/netinet6/nd6.h @@ -91,7 +91,10 @@ struct nd_ifinfo { #define ND6_IFF_NO_PREFER_IFACE 0x80 /* XXX: not related to ND. */ #define ND6_IFF_NO_DAD 0x100 #ifdef EXPERIMENTAL +/* XXX: not related to ND. */ #define ND6_IFF_IPV6_ONLY 0x200 /* draft-ietf-6man-ipv6only-flag */ +#define ND6_IFF_IPV6_ONLY_MANUAL 0x400 +#define ND6_IFF_IPV6_ONLY_MASK (ND6_IFF_IPV6_ONLY|ND6_IFF_IPV6_ONLY_MANUAL) #endif #ifdef _KERNEL @@ -473,6 +476,7 @@ void nd6_dad_stop(struct ifaddr *); /* nd6_rtr.c */ void nd6_rs_input(struct mbuf *, int, int); void nd6_ra_input(struct mbuf *, int, int); +void nd6_ifnet_link_event(void *, struct ifnet *, int); void defrouter_reset(void); void defrouter_select_fib(int fibnum); void defrouter_select(void); diff --git a/freebsd/sys/netinet6/nd6_nbr.c b/freebsd/sys/netinet6/nd6_nbr.c index 49810020..136fbecc 100644 --- a/freebsd/sys/netinet6/nd6_nbr.c +++ b/freebsd/sys/netinet6/nd6_nbr.c @@ -43,6 +43,7 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/systm.h> +#include <sys/eventhandler.h> #include <sys/malloc.h> #include <sys/libkern.h> #include <sys/lock.h> @@ -615,6 +616,7 @@ nd6_ns_output(struct ifnet *ifp, const struct in6_addr *saddr6, void nd6_na_input(struct mbuf *m, int off, int icmp6len) { + struct epoch_tracker et; struct ifnet *ifp = m->m_pkthdr.rcvif; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct nd_neighbor_advert *nd_na; @@ -742,9 +744,9 @@ nd6_na_input(struct mbuf *m, int off, int icmp6len) * If no neighbor cache entry is found, NA SHOULD silently be * discarded. */ - IF_AFDATA_RLOCK(ifp); + NET_EPOCH_ENTER(et); ln = nd6_lookup(&taddr6, LLE_EXCLUSIVE, ifp); - IF_AFDATA_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); if (ln == NULL) { goto freeit; } diff --git a/freebsd/sys/netinet6/nd6_rtr.c b/freebsd/sys/netinet6/nd6_rtr.c index 59868383..0ba1e416 100644 --- a/freebsd/sys/netinet6/nd6_rtr.c +++ b/freebsd/sys/netinet6/nd6_rtr.c @@ -108,6 +108,10 @@ VNET_DEFINE(u_int32_t, ip6_temp_valid_lifetime) = DEF_TEMP_VALID_LIFETIME; VNET_DEFINE(int, ip6_temp_regen_advance) = TEMPADDR_REGEN_ADVANCE; +#ifdef EXPERIMENTAL +VNET_DEFINE(int, nd6_ignore_ipv6_only_ra) = 1; +#endif + /* RTPREF_MEDIUM has to be 0! */ #define RTPREF_HIGH 1 #define RTPREF_MEDIUM 0 @@ -210,7 +214,7 @@ nd6_rs_input(struct mbuf *m, int off, int icmp6len) /* * An initial update routine for draft-ietf-6man-ipv6only-flag. * We need to iterate over all default routers for the given - * interface to see whether they are all advertising the "6" + * interface to see whether they are all advertising the "S" * (IPv6-Only) flag. If they do set, otherwise unset, the * interface flag we later use to filter on. */ @@ -218,7 +222,15 @@ static void defrtr_ipv6_only_ifp(struct ifnet *ifp) { struct nd_defrouter *dr; - bool ipv6_only; + bool ipv6_only, ipv6_only_old; +#ifdef INET + struct epoch_tracker et; + struct ifaddr *ifa; + bool has_ipv4_addr; +#endif + + if (V_nd6_ignore_ipv6_only_ra != 0) + return; ipv6_only = true; ND6_RLOCK(); @@ -229,13 +241,78 @@ defrtr_ipv6_only_ifp(struct ifnet *ifp) ND6_RUNLOCK(); IF_AFDATA_WLOCK(ifp); + ipv6_only_old = ND_IFINFO(ifp)->flags & ND6_IFF_IPV6_ONLY; + IF_AFDATA_WUNLOCK(ifp); + + /* If nothing changed, we have an early exit. */ + if (ipv6_only == ipv6_only_old) + return; + +#ifdef INET + /* + * Should we want to set the IPV6-ONLY flag, check if the + * interface has a non-0/0 and non-link-local IPv4 address + * configured on it. If it has we will assume working + * IPv4 operations and will clear the interface flag. + */ + has_ipv4_addr = false; + if (ipv6_only) { + NET_EPOCH_ENTER(et); + CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + if (ifa->ifa_addr->sa_family != AF_INET) + continue; + if (in_canforward( + satosin(ifa->ifa_addr)->sin_addr)) { + has_ipv4_addr = true; + break; + } + } + NET_EPOCH_EXIT(et); + } + if (ipv6_only && has_ipv4_addr) { + log(LOG_NOTICE, "%s rcvd RA w/ IPv6-Only flag set but has IPv4 " + "configured, ignoring IPv6-Only flag.\n", ifp->if_xname); + ipv6_only = false; + } +#endif + + IF_AFDATA_WLOCK(ifp); if (ipv6_only) ND_IFINFO(ifp)->flags |= ND6_IFF_IPV6_ONLY; else ND_IFINFO(ifp)->flags &= ~ND6_IFF_IPV6_ONLY; IF_AFDATA_WUNLOCK(ifp); + +#ifdef notyet + /* Send notification of flag change. */ +#endif +} + +static void +defrtr_ipv6_only_ipf_down(struct ifnet *ifp) +{ + + IF_AFDATA_WLOCK(ifp); + ND_IFINFO(ifp)->flags &= ~ND6_IFF_IPV6_ONLY; + IF_AFDATA_WUNLOCK(ifp); } +#endif /* EXPERIMENTAL */ + +void +nd6_ifnet_link_event(void *arg __unused, struct ifnet *ifp, int linkstate) +{ + + /* + * XXX-BZ we might want to trigger re-evaluation of our default router + * availability. E.g., on link down the default router might be + * unreachable but a different interface might still have connectivity. + */ + +#ifdef EXPERIMENTAL + if (linkstate == LINK_STATE_DOWN) + defrtr_ipv6_only_ipf_down(ifp); #endif +} /* * Receive Router Advertisement Message. @@ -513,11 +590,13 @@ nd6_rtmsg(int cmd, struct rtentry *rt) info.rti_info[RTAX_NETMASK] = rt_mask(rt); ifp = rt->rt_ifp; if (ifp != NULL) { - IF_ADDR_RLOCK(ifp); + struct epoch_tracker et; + + NET_EPOCH_ENTER(et); ifa = CK_STAILQ_FIRST(&ifp->if_addrhead); info.rti_info[RTAX_IFP] = ifa->ifa_addr; ifa_ref(ifa); - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr; } else ifa = NULL; @@ -791,6 +870,7 @@ defrouter_del(struct nd_defrouter *dr) void defrouter_select_fib(int fibnum) { + struct epoch_tracker et; struct nd_defrouter *dr, *selected_dr, *installed_dr; struct llentry *ln = NULL; @@ -817,14 +897,14 @@ defrouter_select_fib(int fibnum) */ selected_dr = installed_dr = NULL; TAILQ_FOREACH(dr, &V_nd_defrouter, dr_entry) { - IF_AFDATA_RLOCK(dr->ifp); + NET_EPOCH_ENTER(et); if (selected_dr == NULL && dr->ifp->if_fib == fibnum && (ln = nd6_lookup(&dr->rtaddr, 0, dr->ifp)) && ND6_IS_LLINFO_PROBREACH(ln)) { selected_dr = dr; defrouter_ref(selected_dr); } - IF_AFDATA_RUNLOCK(dr->ifp); + NET_EPOCH_EXIT(et); if (ln != NULL) { LLE_RUNLOCK(ln); ln = NULL; @@ -868,7 +948,7 @@ defrouter_select_fib(int fibnum) } } } else if (installed_dr != NULL) { - IF_AFDATA_RLOCK(installed_dr->ifp); + NET_EPOCH_ENTER(et); if ((ln = nd6_lookup(&installed_dr->rtaddr, 0, installed_dr->ifp)) && ND6_IS_LLINFO_PROBREACH(ln) && @@ -877,7 +957,7 @@ defrouter_select_fib(int fibnum) defrouter_rele(selected_dr); selected_dr = installed_dr; } - IF_AFDATA_RUNLOCK(installed_dr->ifp); + NET_EPOCH_EXIT(et); if (ln != NULL) LLE_RUNLOCK(ln); } @@ -1273,6 +1353,7 @@ prelist_update(struct nd_prefixctl *new, struct nd_defrouter *dr, int auth; struct in6_addrlifetime lt6_tmp; char ip6buf[INET6_ADDRSTRLEN]; + struct epoch_tracker et; auth = 0; if (m) { @@ -1386,7 +1467,7 @@ prelist_update(struct nd_prefixctl *new, struct nd_defrouter *dr, * consider autoconfigured addresses while RFC2462 simply said * "address". */ - IF_ADDR_RLOCK(ifp); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct in6_ifaddr *ifa6; u_int32_t remaininglifetime; @@ -1509,7 +1590,7 @@ prelist_update(struct nd_prefixctl *new, struct nd_defrouter *dr, ifa6->ia6_lifetime = lt6_tmp; ifa6->ia6_updatetime = time_uptime; } - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); if (ia6_match == NULL && new->ndpr_vltime) { int ifidlen; @@ -1598,6 +1679,7 @@ end: static struct nd_pfxrouter * find_pfxlist_reachable_router(struct nd_prefix *pr) { + struct epoch_tracker et; struct nd_pfxrouter *pfxrtr; struct llentry *ln; int canreach; @@ -1605,9 +1687,9 @@ find_pfxlist_reachable_router(struct nd_prefix *pr) ND6_LOCK_ASSERT(); LIST_FOREACH(pfxrtr, &pr->ndpr_advrtrs, pfr_entry) { - IF_AFDATA_RLOCK(pfxrtr->router->ifp); + NET_EPOCH_ENTER(et); ln = nd6_lookup(&pfxrtr->router->rtaddr, 0, pfxrtr->router->ifp); - IF_AFDATA_RUNLOCK(pfxrtr->router->ifp); + NET_EPOCH_EXIT(et); if (ln == NULL) continue; canreach = ND6_IS_LLINFO_PROBREACH(ln); @@ -1814,8 +1896,7 @@ restart: static int nd6_prefix_onlink_rtrequest(struct nd_prefix *pr, struct ifaddr *ifa) { - static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK}; - struct rib_head *rnh; + struct sockaddr_dl sdl; struct rtentry *rt; struct sockaddr_in6 mask6; u_long rtflags; @@ -1830,6 +1911,12 @@ nd6_prefix_onlink_rtrequest(struct nd_prefix *pr, struct ifaddr *ifa) mask6.sin6_addr = pr->ndpr_mask; rtflags = (ifa->ifa_flags & ~IFA_RTSELF) | RTF_UP; + bzero(&sdl, sizeof(struct sockaddr_dl)); + sdl.sdl_len = sizeof(struct sockaddr_dl); + sdl.sdl_family = AF_LINK; + sdl.sdl_type = ifa->ifa_ifp->if_type; + sdl.sdl_index = ifa->ifa_ifp->if_index; + if(V_rt_add_addr_allfibs) { fibnum = 0; maxfib = rt_numfibs; @@ -1842,26 +1929,13 @@ nd6_prefix_onlink_rtrequest(struct nd_prefix *pr, struct ifaddr *ifa) rt = NULL; error = in6_rtrequest(RTM_ADD, - (struct sockaddr *)&pr->ndpr_prefix, ifa->ifa_addr, + (struct sockaddr *)&pr->ndpr_prefix, (struct sockaddr *)&sdl, (struct sockaddr *)&mask6, rtflags, &rt, fibnum); if (error == 0) { KASSERT(rt != NULL, ("%s: in6_rtrequest return no " "error(%d) but rt is NULL, pr=%p, ifa=%p", __func__, error, pr, ifa)); - - rnh = rt_tables_get_rnh(rt->rt_fibnum, AF_INET6); - /* XXX what if rhn == NULL? */ - RIB_WLOCK(rnh); RT_LOCK(rt); - if (rt_setgate(rt, rt_key(rt), - (struct sockaddr *)&null_sdl) == 0) { - struct sockaddr_dl *dl; - - dl = (struct sockaddr_dl *)rt->rt_gateway; - dl->sdl_type = rt->rt_ifp->if_type; - dl->sdl_index = rt->rt_ifp->if_index; - } - RIB_WUNLOCK(rnh); nd6_rtmsg(RTM_ADD, rt); RT_UNLOCK(rt); pr->ndpr_stateflags |= NDPRF_ONLINK; @@ -1946,15 +2020,17 @@ nd6_prefix_onlink(struct nd_prefix *pr) ifa = (struct ifaddr *)in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY | IN6_IFF_ANYCAST); if (ifa == NULL) { + struct epoch_tracker et; + /* XXX: freebsd does not have ifa_ifwithaf */ - IF_ADDR_RLOCK(ifp); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family == AF_INET6) { ifa_ref(ifa); break; } } - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); /* should we care about ia6_flags? */ } if (ifa == NULL) { diff --git a/freebsd/sys/netinet6/raw_ip6.c b/freebsd/sys/netinet6/raw_ip6.c index 73d0832a..b4aa9664 100644 --- a/freebsd/sys/netinet6/raw_ip6.c +++ b/freebsd/sys/netinet6/raw_ip6.c @@ -163,7 +163,7 @@ rip6_input(struct mbuf **mp, int *offp, int proto) struct ifnet *ifp; struct mbuf *m = *mp; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); - struct inpcb *in6p; + struct inpcb *inp; struct inpcb *last = NULL; struct mbuf *opts = NULL; struct sockaddr_in6 fromsa; @@ -176,18 +176,18 @@ rip6_input(struct mbuf **mp, int *offp, int proto) ifp = m->m_pkthdr.rcvif; INP_INFO_RLOCK_ET(&V_ripcbinfo, et); - CK_LIST_FOREACH(in6p, &V_ripcb, inp_list) { + CK_LIST_FOREACH(inp, &V_ripcb, inp_list) { /* XXX inp locking */ - if ((in6p->inp_vflag & INP_IPV6) == 0) + if ((inp->inp_vflag & INP_IPV6) == 0) continue; - if (in6p->inp_ip_p && - in6p->inp_ip_p != proto) + if (inp->inp_ip_p && + inp->inp_ip_p != proto) continue; - if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr) && - !IN6_ARE_ADDR_EQUAL(&in6p->in6p_laddr, &ip6->ip6_dst)) + if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) && + !IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &ip6->ip6_dst)) continue; - if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr) && - !IN6_ARE_ADDR_EQUAL(&in6p->in6p_faddr, &ip6->ip6_src)) + if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) && + !IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &ip6->ip6_src)) continue; if (last != NULL) { struct mbuf *n = m_copym(m, 0, M_COPYALL, M_NOWAIT); @@ -225,25 +225,32 @@ rip6_input(struct mbuf **mp, int *offp, int proto) INP_RUNLOCK(last); last = NULL; } - INP_RLOCK(in6p); - if (__predict_false(in6p->inp_flags2 & INP_FREED)) + INP_RLOCK(inp); + if (__predict_false(inp->inp_flags2 & INP_FREED)) goto skip_2; - if (jailed_without_vnet(in6p->inp_cred)) { + if (jailed_without_vnet(inp->inp_cred)) { /* * Allow raw socket in jail to receive multicast; * assume process had PRIV_NETINET_RAW at attach, * and fall through into normal filter path if so. */ if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) && - prison_check_ip6(in6p->inp_cred, + prison_check_ip6(inp->inp_cred, &ip6->ip6_dst) != 0) goto skip_2; } - if (in6p->in6p_cksum != -1) { + if (inp->in6p_cksum != -1) { RIP6STAT_INC(rip6s_isum); - if (in6_cksum(m, proto, *offp, + if (m->m_pkthdr.len - (*offp + inp->in6p_cksum) < 2 || + in6_cksum(m, proto, *offp, m->m_pkthdr.len - *offp)) { RIP6STAT_INC(rip6s_badsum); + /* + * Drop the received message, don't send an + * ICMP6 message. Set proto to IPPROTO_NONE + * to achieve that. + */ + proto = IPPROTO_NONE; goto skip_2; } } @@ -253,7 +260,7 @@ rip6_input(struct mbuf **mp, int *offp, int proto) * should receive it, as multicast filtering is now * the responsibility of the transport layer. */ - if (in6p->in6p_moptions && + if (inp->in6p_moptions && IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { /* * If the incoming datagram is for MLD, allow it @@ -283,7 +290,7 @@ rip6_input(struct mbuf **mp, int *offp, int proto) mcaddr.sin6_family = AF_INET6; mcaddr.sin6_addr = ip6->ip6_dst; - blocked = im6o_mc_filter(in6p->in6p_moptions, + blocked = im6o_mc_filter(inp->in6p_moptions, ifp, (struct sockaddr *)&mcaddr, (struct sockaddr *)&fromsa); @@ -293,10 +300,10 @@ rip6_input(struct mbuf **mp, int *offp, int proto) goto skip_2; } } - last = in6p; + last = inp; continue; skip_2: - INP_RUNLOCK(in6p); + INP_RUNLOCK(inp); } INP_INFO_RUNLOCK_ET(&V_ripcbinfo, et); #if defined(IPSEC) || defined(IPSEC_SUPPORT) @@ -389,7 +396,7 @@ rip6_output(struct mbuf *m, struct socket *so, ...) struct m_tag *mtag; struct sockaddr_in6 *dstsock; struct ip6_hdr *ip6; - struct inpcb *in6p; + struct inpcb *inp; u_int plen = m->m_pkthdr.len; int error = 0; struct ip6_pktopts opt, *optp; @@ -406,18 +413,18 @@ rip6_output(struct mbuf *m, struct socket *so, ...) control = va_arg(ap, struct mbuf *); va_end(ap); - in6p = sotoinpcb(so); - INP_WLOCK(in6p); + inp = sotoinpcb(so); + INP_WLOCK(inp); if (control != NULL) { if ((error = ip6_setpktopts(control, &opt, - in6p->in6p_outputopts, so->so_cred, + inp->in6p_outputopts, so->so_cred, so->so_proto->pr_protocol)) != 0) { goto bad; } optp = &opt; } else - optp = in6p->in6p_outputopts; + optp = inp->in6p_outputopts; /* * Check and convert scope zone ID into internal form. @@ -460,12 +467,12 @@ rip6_output(struct mbuf *m, struct socket *so, ...) /* * Source address selection. */ - error = in6_selectsrc_socket(dstsock, optp, in6p, so->so_cred, + error = in6_selectsrc_socket(dstsock, optp, inp, so->so_cred, scope_ambiguous, &in6a, &hlim); if (error) goto bad; - error = prison_check_ip6(in6p->inp_cred, &in6a); + error = prison_check_ip6(inp->inp_cred, &in6a); if (error != 0) goto bad; ip6->ip6_src = in6a; @@ -476,18 +483,18 @@ rip6_output(struct mbuf *m, struct socket *so, ...) * Fill in the rest of the IPv6 header fields. */ ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) | - (in6p->inp_flow & IPV6_FLOWINFO_MASK); + (inp->inp_flow & IPV6_FLOWINFO_MASK); ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) | (IPV6_VERSION & IPV6_VERSION_MASK); /* * ip6_plen will be filled in ip6_output, so not fill it here. */ - ip6->ip6_nxt = in6p->inp_ip_p; + ip6->ip6_nxt = inp->inp_ip_p; ip6->ip6_hlim = hlim; if (so->so_proto->pr_protocol == IPPROTO_ICMPV6 || - in6p->in6p_cksum != -1) { + inp->in6p_cksum != -1) { struct mbuf *n; int off; u_int16_t *p; @@ -496,8 +503,8 @@ rip6_output(struct mbuf *m, struct socket *so, ...) if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) off = offsetof(struct icmp6_hdr, icmp6_cksum); else - off = in6p->in6p_cksum; - if (plen < off + 1) { + off = inp->in6p_cksum; + if (plen < off + 2) { error = EINVAL; goto bad; } @@ -532,7 +539,7 @@ rip6_output(struct mbuf *m, struct socket *so, ...) } } - error = ip6_output(m, optp, NULL, 0, in6p->in6p_moptions, &oifp, in6p); + error = ip6_output(m, optp, NULL, 0, inp->in6p_moptions, &oifp, inp); if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) { if (oifp) icmp6_ifoutstat_inc(oifp, type, code); @@ -551,7 +558,7 @@ rip6_output(struct mbuf *m, struct socket *so, ...) ip6_clearpktopts(&opt, -1); m_freem(control); } - INP_WUNLOCK(in6p); + INP_WUNLOCK(inp); return (error); } @@ -729,6 +736,7 @@ rip6_disconnect(struct socket *so) static int rip6_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { + struct epoch_tracker et; struct inpcb *inp; struct sockaddr_in6 *addr = (struct sockaddr_in6 *)nam; struct ifaddr *ifa = NULL; @@ -746,20 +754,20 @@ rip6_bind(struct socket *so, struct sockaddr *nam, struct thread *td) if ((error = sa6_embedscope(addr, V_ip6_use_defzone)) != 0) return (error); - NET_EPOCH_ENTER(); + NET_EPOCH_ENTER(et); if (!IN6_IS_ADDR_UNSPECIFIED(&addr->sin6_addr) && (ifa = ifa_ifwithaddr((struct sockaddr *)addr)) == NULL) { - NET_EPOCH_EXIT(); + NET_EPOCH_EXIT(et); return (EADDRNOTAVAIL); } if (ifa != NULL && ((struct in6_ifaddr *)ifa)->ia6_flags & (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY| IN6_IFF_DETACHED|IN6_IFF_DEPRECATED)) { - NET_EPOCH_EXIT(); + NET_EPOCH_EXIT(et); return (EADDRNOTAVAIL); } - NET_EPOCH_EXIT(); + NET_EPOCH_EXIT(et); INP_INFO_WLOCK(&V_ripcbinfo); INP_WLOCK(inp); inp->in6p_laddr = addr->sin6_addr; diff --git a/freebsd/sys/netinet6/scope6.c b/freebsd/sys/netinet6/scope6.c index 64b866dd..d556f3a4 100644 --- a/freebsd/sys/netinet6/scope6.c +++ b/freebsd/sys/netinet6/scope6.c @@ -211,19 +211,20 @@ scope6_set(struct ifnet *ifp, struct scope6_id *idlist) static int scope6_get(struct ifnet *ifp, struct scope6_id *idlist) { + struct epoch_tracker et; struct scope6_id *sid; /* We only need to lock the interface's afdata for SID() to work. */ - IF_AFDATA_RLOCK(ifp); + NET_EPOCH_ENTER(et); sid = SID(ifp); if (sid == NULL) { /* paranoid? */ - IF_AFDATA_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); return (EINVAL); } *idlist = *sid; - IF_AFDATA_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); return (0); } @@ -420,10 +421,12 @@ in6_setscope(struct in6_addr *in6, struct ifnet *ifp, u_int32_t *ret_id) zoneid = ifp->if_index; in6->s6_addr16[1] = htons(zoneid & 0xffff); /* XXX */ } else if (scope != IPV6_ADDR_SCOPE_GLOBAL) { - IF_AFDATA_RLOCK(ifp); + struct epoch_tracker et; + + NET_EPOCH_ENTER(et); sid = SID(ifp); zoneid = sid->s6id_list[scope]; - IF_AFDATA_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); } } diff --git a/freebsd/sys/netinet6/sctp6_usrreq.c b/freebsd/sys/netinet6/sctp6_usrreq.c index 6a3391ee..dd320c32 100644 --- a/freebsd/sys/netinet6/sctp6_usrreq.c +++ b/freebsd/sys/netinet6/sctp6_usrreq.c @@ -522,7 +522,6 @@ sctp_must_try_again: static int sctp6_attach(struct socket *so, int proto SCTP_UNUSED, struct thread *p SCTP_UNUSED) { - struct in6pcb *inp6; int error; struct sctp_inpcb *inp; uint32_t vrf_id = SCTP_DEFAULT_VRFID; @@ -544,18 +543,17 @@ sctp6_attach(struct socket *so, int proto SCTP_UNUSED, struct thread *p SCTP_UNU inp = (struct sctp_inpcb *)so->so_pcb; SCTP_INP_WLOCK(inp); inp->sctp_flags |= SCTP_PCB_FLAGS_BOUND_V6; /* I'm v6! */ - inp6 = (struct in6pcb *)inp; - inp6->inp_vflag |= INP_IPV6; - inp6->in6p_hops = -1; /* use kernel default */ - inp6->in6p_cksum = -1; /* just to be sure */ + inp->ip_inp.inp.inp_vflag |= INP_IPV6; + inp->ip_inp.inp.in6p_hops = -1; /* use kernel default */ + inp->ip_inp.inp.in6p_cksum = -1; /* just to be sure */ #ifdef INET /* * XXX: ugly!! IPv4 TTL initialization is necessary for an IPv6 * socket as well, because the socket may be bound to an IPv6 * wildcard address, which may match an IPv4-mapped IPv6 address. */ - inp6->inp_ip_ttl = MODULE_GLOBAL(ip_defttl); + inp->ip_inp.inp.inp_ip_ttl = MODULE_GLOBAL(ip_defttl); #endif SCTP_INP_WUNLOCK(inp); return (0); @@ -565,7 +563,6 @@ static int sctp6_bind(struct socket *so, struct sockaddr *addr, struct thread *p) { struct sctp_inpcb *inp; - struct in6pcb *inp6; int error; inp = (struct sctp_inpcb *)so->so_pcb; @@ -597,16 +594,15 @@ sctp6_bind(struct socket *so, struct sockaddr *addr, struct thread *p) return (EINVAL); } } - inp6 = (struct in6pcb *)inp; - inp6->inp_vflag &= ~INP_IPV4; - inp6->inp_vflag |= INP_IPV6; - if ((addr != NULL) && (SCTP_IPV6_V6ONLY(inp6) == 0)) { + inp->ip_inp.inp.inp_vflag &= ~INP_IPV4; + inp->ip_inp.inp.inp_vflag |= INP_IPV6; + if ((addr != NULL) && (SCTP_IPV6_V6ONLY(inp) == 0)) { switch (addr->sa_family) { #ifdef INET case AF_INET: /* binding v4 addr to v6 socket, so reset flags */ - inp6->inp_vflag |= INP_IPV4; - inp6->inp_vflag &= ~INP_IPV6; + inp->ip_inp.inp.inp_vflag |= INP_IPV4; + inp->ip_inp.inp.inp_vflag &= ~INP_IPV6; break; #endif #ifdef INET6 @@ -617,15 +613,15 @@ sctp6_bind(struct socket *so, struct sockaddr *addr, struct thread *p) sin6_p = (struct sockaddr_in6 *)addr; if (IN6_IS_ADDR_UNSPECIFIED(&sin6_p->sin6_addr)) { - inp6->inp_vflag |= INP_IPV4; + inp->ip_inp.inp.inp_vflag |= INP_IPV4; } #ifdef INET if (IN6_IS_ADDR_V4MAPPED(&sin6_p->sin6_addr)) { struct sockaddr_in sin; in6_sin6_2_sin(&sin, sin6_p); - inp6->inp_vflag |= INP_IPV4; - inp6->inp_vflag &= ~INP_IPV6; + inp->ip_inp.inp.inp_vflag |= INP_IPV4; + inp->ip_inp.inp.inp_vflag &= ~INP_IPV6; error = sctp_inpcb_bind(so, (struct sockaddr *)&sin, NULL, p); return (error); } @@ -687,7 +683,6 @@ sctp6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *p) { struct sctp_inpcb *inp; - struct in6pcb *inp6; #ifdef INET struct sockaddr_in6 *sin6; @@ -704,7 +699,6 @@ sctp6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL); return (EINVAL); } - inp6 = (struct in6pcb *)inp; /* * For the TCP model we may get a NULL addr, if we are a connected * socket thats ok. @@ -724,7 +718,7 @@ sctp6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, } #ifdef INET sin6 = (struct sockaddr_in6 *)addr; - if (SCTP_IPV6_V6ONLY(inp6)) { + if (SCTP_IPV6_V6ONLY(inp)) { /* * if IPV6_V6ONLY flag, we discard datagrams destined to a * v4 addr or v4-mapped addr @@ -793,14 +787,10 @@ sctp6_connect(struct socket *so, struct sockaddr *addr, struct thread *p) struct sctp_inpcb *inp; struct sctp_tcb *stcb; #ifdef INET - struct in6pcb *inp6; struct sockaddr_in6 *sin6; union sctp_sockstore store; #endif -#ifdef INET - inp6 = (struct in6pcb *)so->so_pcb; -#endif inp = (struct sctp_inpcb *)so->so_pcb; if (inp == NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, ECONNRESET); @@ -858,7 +848,7 @@ sctp6_connect(struct socket *so, struct sockaddr *addr, struct thread *p) } #ifdef INET sin6 = (struct sockaddr_in6 *)addr; - if (SCTP_IPV6_V6ONLY(inp6)) { + if (SCTP_IPV6_V6ONLY(inp)) { /* * if IPV6_V6ONLY flag, ignore connections destined to a v4 * addr or v4-mapped addr @@ -912,7 +902,8 @@ sctp6_connect(struct socket *so, struct sockaddr *addr, struct thread *p) /* We are GOOD to go */ stcb = sctp_aloc_assoc(inp, addr, &error, 0, vrf_id, inp->sctp_ep.pre_open_stream_count, - inp->sctp_ep.port, p); + inp->sctp_ep.port, p, + SCTP_INITIALIZE_AUTH_PARAMS); SCTP_ASOC_CREATE_UNLOCK(inp); if (stcb == NULL) { /* Gak! no memory */ @@ -925,10 +916,6 @@ sctp6_connect(struct socket *so, struct sockaddr *addr, struct thread *p) } SCTP_SET_STATE(stcb, SCTP_STATE_COOKIE_WAIT); (void)SCTP_GETTIME_TIMEVAL(&stcb->asoc.time_entered); - - /* initialize authentication parameters for the assoc */ - sctp_initialize_auth_params(inp, stcb); - sctp_send_initiate(inp, stcb, SCTP_SO_LOCKED); SCTP_TCB_UNLOCK(stcb); return (error); @@ -1103,10 +1090,10 @@ sctp6_peeraddr(struct socket *so, struct sockaddr **addr) static int sctp6_in6getaddr(struct socket *so, struct sockaddr **nam) { - struct in6pcb *inp6 = sotoin6pcb(so); + struct inpcb *inp = sotoinpcb(so); int error; - if (inp6 == NULL) { + if (inp == NULL) { SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL); return (EINVAL); } @@ -1139,10 +1126,10 @@ sctp6_in6getaddr(struct socket *so, struct sockaddr **nam) static int sctp6_getpeeraddr(struct socket *so, struct sockaddr **nam) { - struct in6pcb *inp6 = sotoin6pcb(so); + struct inpcb *inp = sotoinpcb(so); int error; - if (inp6 == NULL) { + if (inp == NULL) { SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL); return (EINVAL); } diff --git a/freebsd/sys/netinet6/udp6_usrreq.c b/freebsd/sys/netinet6/udp6_usrreq.c index e0fcd06d..270b4880 100644 --- a/freebsd/sys/netinet6/udp6_usrreq.c +++ b/freebsd/sys/netinet6/udp6_usrreq.c @@ -744,9 +744,24 @@ udp6_output(struct socket *so, int flags_arg, struct mbuf *m, * - when we are not bound to an address and source port (it is * in6_pcbsetport() which will require the write lock). */ +retry: if (sin6 == NULL || (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) && inp->inp_lport == 0)) { INP_WLOCK(inp); + /* + * In case we lost a race and another thread bound addr/port + * on the inp we cannot keep the wlock (which still would be + * fine) as further down, based on these values we make + * decisions for the pcbinfo lock. If the locks are not in + * synch the assertions on unlock will fire, hence we go for + * one retry loop. + */ + if (sin6 != NULL && + (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) || + inp->inp_lport != 0)) { + INP_WUNLOCK(inp); + goto retry; + } unlock_inp = UH_WLOCKED; } else { INP_RLOCK(inp); diff --git a/freebsd/sys/netipsec/ipsec.c b/freebsd/sys/netipsec/ipsec.c index 116557ed..f5c3967c 100644 --- a/freebsd/sys/netipsec/ipsec.c +++ b/freebsd/sys/netipsec/ipsec.c @@ -218,6 +218,11 @@ SYSCTL_INT(_net_inet_ipsec, OID_AUTO, filtertunnel, SYSCTL_VNET_PCPUSTAT(_net_inet_ipsec, OID_AUTO, ipsecstats, struct ipsecstat, ipsec4stat, "IPsec IPv4 statistics."); +struct timeval ipsec_warn_interval = { .tv_sec = 1, .tv_usec = 0 }; +SYSCTL_TIMEVAL_SEC(_net_inet_ipsec, OID_AUTO, crypto_warn_interval, CTLFLAG_RW, + &ipsec_warn_interval, + "Delay in seconds between warnings of deprecated IPsec crypto algorithms."); + #ifdef REGRESSION /* * When set to 1, IPsec will send packets with the same sequence number. @@ -1320,6 +1325,8 @@ ok: __func__, replay->overflow, ipsec_sa2str(sav, buf, sizeof(buf)))); } + + replay->count++; return (0); } diff --git a/freebsd/sys/netipsec/ipsec.h b/freebsd/sys/netipsec/ipsec.h index eed2d077..b9b6eca2 100644 --- a/freebsd/sys/netipsec/ipsec.h +++ b/freebsd/sys/netipsec/ipsec.h @@ -287,6 +287,8 @@ VNET_DECLARE(int, crypto_support); VNET_DECLARE(int, async_crypto); VNET_DECLARE(int, natt_cksum_policy); +extern struct timeval ipsec_warn_interval; + #define IPSECSTAT_INC(name) \ VNET_PCPUSTAT_ADD(struct ipsecstat, ipsec4stat, name, 1) #define V_ip4_esp_trans_deflev VNET(ip4_esp_trans_deflev) diff --git a/freebsd/sys/netipsec/key.c b/freebsd/sys/netipsec/key.c index 4b79f881..062fbf28 100644 --- a/freebsd/sys/netipsec/key.c +++ b/freebsd/sys/netipsec/key.c @@ -286,7 +286,7 @@ key_addrprotohash(const union sockaddr_union *src, #endif default: hval = 0; - ipseclog((LOG_DEBUG, "%s: unknown address family %d", + ipseclog((LOG_DEBUG, "%s: unknown address family %d\n", __func__, dst->sa.sa_family)); } return (hval); @@ -2041,8 +2041,8 @@ key_spdadd(struct socket *so, struct mbuf *m, const struct sadb_msghdr *mhp) key_freesp(&newsp); } else { key_freesp(&newsp); - ipseclog((LOG_DEBUG, "%s: a SP entry exists already.", - __func__)); + ipseclog((LOG_DEBUG, + "%s: a SP entry exists already.\n", __func__)); return (key_senderror(so, m, EEXIST)); } } @@ -4762,34 +4762,10 @@ key_random() { u_long value; - key_randomfill(&value, sizeof(value)); + arc4random_buf(&value, sizeof(value)); return value; } -void -key_randomfill(void *p, size_t l) -{ - size_t n; - u_long v; - static int warn = 1; - - n = 0; - n = (size_t)read_random(p, (u_int)l); - /* last resort */ - while (n < l) { - v = random(); - bcopy(&v, (u_int8_t *)p + n, - l - n < sizeof(v) ? l - n : sizeof(v)); - n += sizeof(v); - - if (warn) { - printf("WARNING: pseudo-random number generator " - "used for IPsec processing\n"); - warn = 0; - } - } -} - /* * map SADB_SATYPE_* to IPPROTO_*. * if satype == SADB_SATYPE then satype is mapped to ~0. @@ -5435,7 +5411,7 @@ key_update(struct socket *so, struct mbuf *m, const struct sadb_msghdr *mhp) } /* saidx should match with SA. */ if (key_cmpsaidx(&sav->sah->saidx, &saidx, CMP_MODE_REQID) == 0) { - ipseclog((LOG_DEBUG, "%s: saidx mismatched for SPI %u", + ipseclog((LOG_DEBUG, "%s: saidx mismatched for SPI %u\n", __func__, ntohl(sav->spi))); key_freesav(&sav); return key_senderror(so, m, ESRCH); @@ -6911,14 +6887,14 @@ key_acqdone(const struct secasindex *saidx, uint32_t seq) if (acq != NULL) { if (key_cmpsaidx(&acq->saidx, saidx, CMP_EXACTLY) == 0) { ipseclog((LOG_DEBUG, - "%s: Mismatched saidx for ACQ %u", __func__, seq)); + "%s: Mismatched saidx for ACQ %u\n", __func__, seq)); acq = NULL; } else { acq->created = 0; } } else { ipseclog((LOG_DEBUG, - "%s: ACQ %u is not found.", __func__, seq)); + "%s: ACQ %u is not found.\n", __func__, seq)); } ACQ_UNLOCK(); if (acq == NULL) @@ -7190,7 +7166,7 @@ key_register(struct socket *so, struct mbuf *m, const struct sadb_msghdr *mhp) return key_senderror(so, m, ENOBUFS); MGETHDR(n, M_NOWAIT, MT_DATA); - if (len > MHLEN) { + if (n != NULL && len > MHLEN) { if (!(MCLGET(n, M_NOWAIT))) { m_freem(n); n = NULL; diff --git a/freebsd/sys/netipsec/key.h b/freebsd/sys/netipsec/key.h index 7d7ae69f..2ee7c208 100644 --- a/freebsd/sys/netipsec/key.h +++ b/freebsd/sys/netipsec/key.h @@ -78,7 +78,6 @@ void key_unregister_ifnet(struct secpolicy **, u_int); void key_delete_xform(const struct xformsw *); extern u_long key_random(void); -extern void key_randomfill(void *, size_t); extern void key_freereg(struct socket *); extern int key_parse(struct mbuf *, struct socket *); extern void key_init(void); diff --git a/freebsd/sys/netipsec/xform_ah.c b/freebsd/sys/netipsec/xform_ah.c index 84ba6c16..618fbd9b 100644 --- a/freebsd/sys/netipsec/xform_ah.c +++ b/freebsd/sys/netipsec/xform_ah.c @@ -110,6 +110,7 @@ SYSCTL_VNET_PCPUSTAT(_net_inet_ah, IPSECCTL_STATS, stats, struct ahstat, #endif static unsigned char ipseczeroes[256]; /* larger than an ip6 extension hdr */ +static struct timeval md5warn, ripewarn, kpdkmd5warn, kpdksha1warn; static int ah_input_cb(struct cryptop*); static int ah_output_cb(struct cryptop*); @@ -186,6 +187,26 @@ ah_init0(struct secasvar *sav, struct xformsw *xsp, struct cryptoini *cria) __func__, sav->alg_auth)); return EINVAL; } + + switch (sav->alg_auth) { + case SADB_AALG_MD5HMAC: + if (ratecheck(&md5warn, &ipsec_warn_interval)) + gone_in(13, "MD5-HMAC authenticator for IPsec"); + break; + case SADB_X_AALG_RIPEMD160HMAC: + if (ratecheck(&ripewarn, &ipsec_warn_interval)) + gone_in(13, "RIPEMD160-HMAC authenticator for IPsec"); + break; + case SADB_X_AALG_MD5: + if (ratecheck(&kpdkmd5warn, &ipsec_warn_interval)) + gone_in(13, "Keyed-MD5 authenticator for IPsec"); + break; + case SADB_X_AALG_SHA: + if (ratecheck(&kpdksha1warn, &ipsec_warn_interval)) + gone_in(13, "Keyed-SHA1 authenticator for IPsec"); + break; + } + /* * Verify the replay state block allocation is consistent with * the protocol type. We check here so we can make assumptions diff --git a/freebsd/sys/netipsec/xform_esp.c b/freebsd/sys/netipsec/xform_esp.c index f8473575..388fe499 100644 --- a/freebsd/sys/netipsec/xform_esp.c +++ b/freebsd/sys/netipsec/xform_esp.c @@ -96,6 +96,8 @@ SYSCTL_VNET_PCPUSTAT(_net_inet_esp, IPSECCTL_STATS, stats, struct espstat, espstat, "ESP statistics (struct espstat, netipsec/esp_var.h"); +static struct timeval deswarn, blfwarn, castwarn, camelliawarn; + static int esp_input_cb(struct cryptop *op); static int esp_output_cb(struct cryptop *crp); @@ -158,6 +160,26 @@ esp_init(struct secasvar *sav, struct xformsw *xsp) __func__)); return EINVAL; } + + switch (sav->alg_enc) { + case SADB_EALG_DESCBC: + if (ratecheck(&deswarn, &ipsec_warn_interval)) + gone_in(13, "DES cipher for IPsec"); + break; + case SADB_X_EALG_BLOWFISHCBC: + if (ratecheck(&blfwarn, &ipsec_warn_interval)) + gone_in(13, "Blowfish cipher for IPsec"); + break; + case SADB_X_EALG_CAST128CBC: + if (ratecheck(&castwarn, &ipsec_warn_interval)) + gone_in(13, "CAST cipher for IPsec"); + break; + case SADB_X_EALG_CAMELLIACBC: + if (ratecheck(&camelliawarn, &ipsec_warn_interval)) + gone_in(13, "Camellia cipher for IPsec"); + break; + } + /* subtract off the salt, RFC4106, 8.1 and RFC3686, 5.1 */ keylen = _KEYLEN(sav->key_enc) - SAV_ISCTRORGCM(sav) * 4; if (txform->minkey > keylen || keylen > txform->maxkey) { @@ -770,7 +792,7 @@ esp_output(struct mbuf *m, struct secpolicy *sp, struct secasvar *sav, */ switch (sav->flags & SADB_X_EXT_PMASK) { case SADB_X_EXT_PRAND: - (void) read_random(pad, padding - 2); + arc4random_buf(pad, padding - 2); break; case SADB_X_EXT_PZERO: bzero(pad, padding - 2); diff --git a/freebsd/sys/netpfil/ipfw/ip_fw_private.h b/freebsd/sys/netpfil/ipfw/ip_fw_private.h index 7e966d0a..57fa7464 100644 --- a/freebsd/sys/netpfil/ipfw/ip_fw_private.h +++ b/freebsd/sys/netpfil/ipfw/ip_fw_private.h @@ -61,6 +61,7 @@ enum { IP_FW_NGTEE, IP_FW_NAT, IP_FW_REASS, + IP_FW_NAT64, }; /* @@ -83,11 +84,20 @@ struct _ip6dn_args { * efficient to pass variables around and extend the interface. */ struct ip_fw_args { - struct mbuf *m; /* the mbuf chain */ - struct ifnet *oif; /* output interface */ - struct sockaddr_in *next_hop; /* forward address */ - struct sockaddr_in6 *next_hop6; /* ipv6 forward address */ - + uint32_t flags; +#define IPFW_ARGS_ETHER 0x00010000 /* valid ethernet header */ +#define IPFW_ARGS_NH4 0x00020000 /* IPv4 next hop in hopstore */ +#define IPFW_ARGS_NH6 0x00040000 /* IPv6 next hop in hopstore */ +#define IPFW_ARGS_NH4PTR 0x00080000 /* IPv4 next hop in next_hop */ +#define IPFW_ARGS_NH6PTR 0x00100000 /* IPv6 next hop in next_hop6 */ +#define IPFW_ARGS_REF 0x00200000 /* valid ipfw_rule_ref */ +#define IPFW_ARGS_IN 0x00400000 /* called on input */ +#define IPFW_ARGS_OUT 0x00800000 /* called on output */ +#define IPFW_ARGS_IP4 0x01000000 /* belongs to v4 ISR */ +#define IPFW_ARGS_IP6 0x02000000 /* belongs to v6 ISR */ +#define IPFW_ARGS_DROP 0x04000000 /* drop it (dummynet) */ +#define IPFW_ARGS_LENMASK 0x0000ffff /* length of data in *mem */ +#define IPFW_ARGS_LENGTH(f) ((f) & IPFW_ARGS_LENMASK) /* * On return, it points to the matching rule. * On entry, rule.slot > 0 means the info is valid and @@ -95,45 +105,36 @@ struct ip_fw_args { * If chain_id == chain->id && slot >0 then jump to that slot. * Otherwise, we locate the first rule >= rulenum:rule_id */ - struct ipfw_rule_ref rule; /* match/restart info */ - - struct ether_header *eh; /* for bridged packets */ - - struct ipfw_flow_id f_id; /* grabbed from IP header */ - //uint32_t cookie; /* a cookie depending on rule action */ - struct inpcb *inp; - - struct _ip6dn_args dummypar; /* dummynet->ip6_output */ - union { /* store here if cannot use a pointer */ - struct sockaddr_in hopstore; - struct sockaddr_in6 hopstore6; + struct ipfw_rule_ref rule; /* match/restart info */ + + struct ifnet *ifp; /* input/output interface */ + struct inpcb *inp; + union { + /* + * next_hop[6] pointers can be used to point to next hop + * stored in rule's opcode to avoid copying into hopstore. + * Also, it is expected that all 0x1-0x10 flags are mutually + * exclusive. + */ + struct sockaddr_in *next_hop; + struct sockaddr_in6 *next_hop6; + /* ipfw next hop storage */ + struct sockaddr_in hopstore; + struct ip_fw_nh6 { + struct in6_addr sin6_addr; + uint32_t sin6_scope_id; + uint16_t sin6_port; + } hopstore6; }; + union { + struct mbuf *m; /* the mbuf chain */ + void *mem; /* or memory pointer */ + }; + struct ipfw_flow_id f_id; /* grabbed from IP header */ }; MALLOC_DECLARE(M_IPFW); -/* - * Hooks sometime need to know the direction of the packet - * (divert, dummynet, netgraph, ...) - * We use a generic definition here, with bit0-1 indicating the - * direction, bit 2 indicating layer2 or 3, bit 3-4 indicating the - * specific protocol - * indicating the protocol (if necessary) - */ -enum { - DIR_MASK = 0x3, - DIR_OUT = 0, - DIR_IN = 1, - DIR_FWD = 2, - DIR_DROP = 3, - PROTO_LAYER2 = 0x4, /* set for layer 2 */ - /* PROTO_DEFAULT = 0, */ - PROTO_IPV4 = 0x08, - PROTO_IPV6 = 0x10, - PROTO_IFB = 0x0c, /* layer2 + ifbridge */ - /* PROTO_OLDBDG = 0x14, unused, old bridge */ -}; - /* wrapper for freeing a packet, in case we need to do more work */ #ifndef FREE_PKT #if defined(__linux__) || defined(_WIN32) @@ -150,8 +151,8 @@ int ipfw_chk(struct ip_fw_args *args); struct mbuf *ipfw_send_pkt(struct mbuf *, struct ipfw_flow_id *, u_int32_t, u_int32_t, int); -/* attach (arg = 1) or detach (arg = 0) hooks */ -int ipfw_attach_hooks(int); +int ipfw_attach_hooks(void); +void ipfw_detach_hooks(void); #ifdef NOTYET void ipfw_nat_destroy(void); #endif @@ -162,10 +163,11 @@ struct ip_fw_chain; void ipfw_bpf_init(int); void ipfw_bpf_uninit(int); +void ipfw_bpf_tap(u_char *, u_int); +void ipfw_bpf_mtap(struct mbuf *); void ipfw_bpf_mtap2(void *, u_int, struct mbuf *); void ipfw_log(struct ip_fw_chain *chain, struct ip_fw *f, u_int hlen, - struct ip_fw_args *args, struct mbuf *m, struct ifnet *oif, - u_short offset, uint32_t tablearg, struct ip *ip); + struct ip_fw_args *args, u_short offset, uint32_t tablearg, struct ip *ip); VNET_DECLARE(u_int64_t, norule_counter); #define V_norule_counter VNET(norule_counter) VNET_DECLARE(int, verbose_limit); @@ -296,6 +298,8 @@ struct ip_fw_chain { void **srvstate; /* runtime service mappings */ #if defined( __linux__ ) || defined( _WIN32 ) spinlock_t rwmtx; +#else + struct rmlock rwmtx; #endif int static_len; /* total len of static rules (v0) */ uint32_t gencnt; /* NAT generation count */ @@ -436,23 +440,25 @@ struct ipfw_ifc { #define IPFW_PF_RUNLOCK(p) IPFW_RUNLOCK(p) #else /* FreeBSD */ #define IPFW_LOCK_INIT(_chain) do { \ + rm_init_flags(&(_chain)->rwmtx, "IPFW static rules", RM_RECURSE); \ rw_init(&(_chain)->uh_lock, "IPFW UH lock"); \ } while (0) #define IPFW_LOCK_DESTROY(_chain) do { \ + rm_destroy(&(_chain)->rwmtx); \ rw_destroy(&(_chain)->uh_lock); \ } while (0) -#define IPFW_RLOCK_ASSERT(_chain) rm_assert(&V_pfil_lock, RA_RLOCKED) -#define IPFW_WLOCK_ASSERT(_chain) rm_assert(&V_pfil_lock, RA_WLOCKED) +#define IPFW_RLOCK_ASSERT(_chain) rm_assert(&(_chain)->rwmtx, RA_RLOCKED) +#define IPFW_WLOCK_ASSERT(_chain) rm_assert(&(_chain)->rwmtx, RA_WLOCKED) #define IPFW_RLOCK_TRACKER struct rm_priotracker _tracker -#define IPFW_RLOCK(p) rm_rlock(&V_pfil_lock, &_tracker) -#define IPFW_RUNLOCK(p) rm_runlock(&V_pfil_lock, &_tracker) -#define IPFW_WLOCK(p) rm_wlock(&V_pfil_lock) -#define IPFW_WUNLOCK(p) rm_wunlock(&V_pfil_lock) -#define IPFW_PF_RLOCK(p) -#define IPFW_PF_RUNLOCK(p) +#define IPFW_RLOCK(p) rm_rlock(&(p)->rwmtx, &_tracker) +#define IPFW_RUNLOCK(p) rm_runlock(&(p)->rwmtx, &_tracker) +#define IPFW_WLOCK(p) rm_wlock(&(p)->rwmtx) +#define IPFW_WUNLOCK(p) rm_wunlock(&(p)->rwmtx) +#define IPFW_PF_RLOCK(p) IPFW_RLOCK(p) +#define IPFW_PF_RUNLOCK(p) IPFW_RUNLOCK(p) #endif #define IPFW_UH_RLOCK_ASSERT(_chain) rw_assert(&(_chain)->uh_lock, RA_RLOCKED) @@ -659,6 +665,7 @@ struct ip_fw *ipfw_alloc_rule(struct ip_fw_chain *chain, size_t rulesize); void ipfw_free_rule(struct ip_fw *rule); int ipfw_match_range(struct ip_fw *rule, ipfw_range_tlv *rt); int ipfw_mark_object_kidx(uint32_t *bmask, uint16_t etlv, uint16_t kidx); +ipfw_insn *ipfw_get_action(struct ip_fw *); typedef int (sopt_handler_f)(struct ip_fw_chain *ch, ip_fw3_opheader *op3, struct sockopt_data *sd); diff --git a/freebsd/sys/netpfil/pf/if_pfsync.c b/freebsd/sys/netpfil/pf/if_pfsync.c index 026d19a3..9d87cf67 100644 --- a/freebsd/sys/netpfil/pf/if_pfsync.c +++ b/freebsd/sys/netpfil/pf/if_pfsync.c @@ -266,7 +266,7 @@ static void pfsync_push(struct pfsync_bucket *); static void pfsync_push_all(struct pfsync_softc *); static void pfsyncintr(void *); static int pfsync_multicast_setup(struct pfsync_softc *, struct ifnet *, - void *); + struct in_mfilter *imf); static void pfsync_multicast_cleanup(struct pfsync_softc *); static void pfsync_pointers_init(void); static void pfsync_pointers_uninit(void); @@ -337,6 +337,7 @@ pfsync_clone_create(struct if_clone *ifc, int unit, caddr_t param) pfsync_buckets = mp_ncpus * 2; sc = malloc(sizeof(struct pfsync_softc), M_PFSYNC, M_WAITOK | M_ZERO); + sc->sc_flags |= PFSYNCF_OK; sc->sc_maxupdates = 128; ifp = sc->sc_ifp = if_alloc(IFT_PFSYNC); @@ -364,7 +365,7 @@ pfsync_clone_create(struct if_clone *ifc, int unit, caddr_t param) M_PFSYNC, M_ZERO | M_WAITOK); for (c = 0; c < pfsync_buckets; c++) { b = &sc->sc_buckets[c]; - mtx_init(&b->b_mtx, pfsyncname, NULL, MTX_DEF); + mtx_init(&b->b_mtx, "pfsync bucket", NULL, MTX_DEF); b->b_id = c; b->b_sc = sc; @@ -431,8 +432,7 @@ pfsync_clone_destroy(struct ifnet *ifp) pfsync_drop(sc); if_free(ifp); - if (sc->sc_imo.imo_membership) - pfsync_multicast_cleanup(sc); + pfsync_multicast_cleanup(sc); mtx_destroy(&sc->sc_mtx); mtx_destroy(&sc->sc_bulk_mtx); @@ -1374,10 +1374,9 @@ pfsyncioctl(struct ifnet *ifp, u_long cmd, caddr_t data) case SIOCSETPFSYNC: { - struct ip_moptions *imo = &sc->sc_imo; + struct in_mfilter *imf = NULL; struct ifnet *sifp; struct ip *ip; - void *mship = NULL; if ((error = priv_check(curthread, PRIV_NETINET_PF)) != 0) return (error); @@ -1397,8 +1396,7 @@ pfsyncioctl(struct ifnet *ifp, u_long cmd, caddr_t data) pfsyncr.pfsyncr_syncpeer.s_addr == 0 || pfsyncr.pfsyncr_syncpeer.s_addr == htonl(INADDR_PFSYNC_GROUP))) - mship = malloc((sizeof(struct in_multi *) * - IP_MIN_MEMBERSHIPS), M_PFSYNC, M_WAITOK | M_ZERO); + imf = ip_mfilter_alloc(M_WAITOK, 0, 0); PFSYNC_LOCK(sc); if (pfsyncr.pfsyncr_syncpeer.s_addr == 0) @@ -1420,8 +1418,7 @@ pfsyncioctl(struct ifnet *ifp, u_long cmd, caddr_t data) if (sc->sc_sync_if) if_rele(sc->sc_sync_if); sc->sc_sync_if = NULL; - if (imo->imo_membership) - pfsync_multicast_cleanup(sc); + pfsync_multicast_cleanup(sc); PFSYNC_UNLOCK(sc); break; } @@ -1437,14 +1434,13 @@ pfsyncioctl(struct ifnet *ifp, u_long cmd, caddr_t data) PFSYNC_BUCKET_UNLOCK(&sc->sc_buckets[c]); } - if (imo->imo_membership) - pfsync_multicast_cleanup(sc); + pfsync_multicast_cleanup(sc); if (sc->sc_sync_peer.s_addr == htonl(INADDR_PFSYNC_GROUP)) { - error = pfsync_multicast_setup(sc, sifp, mship); + error = pfsync_multicast_setup(sc, sifp, imf); if (error) { if_rele(sifp); - free(mship, M_PFSYNC); + ip_mfilter_free(imf); PFSYNC_UNLOCK(sc); return (error); } @@ -2354,7 +2350,8 @@ pfsyncintr(void *arg) } static int -pfsync_multicast_setup(struct pfsync_softc *sc, struct ifnet *ifp, void *mship) +pfsync_multicast_setup(struct pfsync_softc *sc, struct ifnet *ifp, + struct in_mfilter *imf) { struct ip_moptions *imo = &sc->sc_imo; int error; @@ -2362,16 +2359,14 @@ pfsync_multicast_setup(struct pfsync_softc *sc, struct ifnet *ifp, void *mship) if (!(ifp->if_flags & IFF_MULTICAST)) return (EADDRNOTAVAIL); - imo->imo_membership = (struct in_multi **)mship; - imo->imo_max_memberships = IP_MIN_MEMBERSHIPS; imo->imo_multicast_vif = -1; if ((error = in_joingroup(ifp, &sc->sc_sync_peer, NULL, - &imo->imo_membership[0])) != 0) { - imo->imo_membership = NULL; + &imf->imf_inm)) != 0) return (error); - } - imo->imo_num_memberships++; + + ip_mfilter_init(&imo->imo_head); + ip_mfilter_insert(&imo->imo_head, imf); imo->imo_multicast_ifp = ifp; imo->imo_multicast_ttl = PFSYNC_DFLTTL; imo->imo_multicast_loop = 0; @@ -2383,10 +2378,13 @@ static void pfsync_multicast_cleanup(struct pfsync_softc *sc) { struct ip_moptions *imo = &sc->sc_imo; + struct in_mfilter *imf; - in_leavegroup(imo->imo_membership[0], NULL); - free(imo->imo_membership, M_PFSYNC); - imo->imo_membership = NULL; + while ((imf = ip_mfilter_first(&imo->imo_head)) != NULL) { + ip_mfilter_remove(&imo->imo_head, imf); + in_leavegroup(imf->imf_inm, NULL); + ip_mfilter_free(imf); + } imo->imo_multicast_ifp = NULL; } @@ -2405,7 +2403,7 @@ pfsync_detach_ifnet(struct ifnet *ifp) * is going away. We do need to ensure we don't try to do * cleanup later. */ - sc->sc_imo.imo_membership = NULL; + ip_mfilter_init(&sc->sc_imo.imo_head); sc->sc_imo.imo_multicast_ifp = NULL; sc->sc_sync_if = NULL; } diff --git a/freebsd/sys/netpfil/pf/pf.c b/freebsd/sys/netpfil/pf/pf.c index 9b4653e2..c0f6459b 100644 --- a/freebsd/sys/netpfil/pf/pf.c +++ b/freebsd/sys/netpfil/pf/pf.c @@ -50,6 +50,7 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/bus.h> #include <sys/endian.h> +#include <sys/gsb_crc32.h> #include <sys/hash.h> #include <sys/interrupt.h> #include <sys/kernel.h> @@ -93,8 +94,6 @@ __FBSDID("$FreeBSD$"); #include <netinet/udp.h> #include <netinet/udp_var.h> -#include <netpfil/ipfw/ip_fw_private.h> /* XXX: only for DIR_IN/DIR_OUT */ - #ifdef INET6 #include <netinet/ip6.h> #include <netinet/icmp6.h> @@ -115,10 +114,12 @@ __FBSDID("$FreeBSD$"); */ /* state tables */ -VNET_DEFINE(struct pf_altqqueue, pf_altqs[2]); +VNET_DEFINE(struct pf_altqqueue, pf_altqs[4]); VNET_DEFINE(struct pf_palist, pf_pabuf); VNET_DEFINE(struct pf_altqqueue *, pf_altqs_active); +VNET_DEFINE(struct pf_altqqueue *, pf_altq_ifs_active); VNET_DEFINE(struct pf_altqqueue *, pf_altqs_inactive); +VNET_DEFINE(struct pf_altqqueue *, pf_altq_ifs_inactive); VNET_DEFINE(struct pf_kstatus, pf_status); VNET_DEFINE(u_int32_t, ticket_altqs_active); @@ -360,7 +361,7 @@ VNET_DEFINE(struct pf_limit, pf_limits[PF_LIMIT_MAX]); counter_u64_add(s->rule.ptr->states_cur, -1); \ } while (0) -static MALLOC_DEFINE(M_PFHASH, "pf_hash", "pf(4) hash header structures"); +MALLOC_DEFINE(M_PFHASH, "pf_hash", "pf(4) hash header structures"); VNET_DEFINE(struct pf_keyhash *, pf_keyhash); VNET_DEFINE(struct pf_idhash *, pf_idhash); VNET_DEFINE(struct pf_srchash *, pf_srchash); @@ -862,9 +863,13 @@ pf_initialize() /* ALTQ */ TAILQ_INIT(&V_pf_altqs[0]); TAILQ_INIT(&V_pf_altqs[1]); + TAILQ_INIT(&V_pf_altqs[2]); + TAILQ_INIT(&V_pf_altqs[3]); TAILQ_INIT(&V_pf_pabuf); V_pf_altqs_active = &V_pf_altqs[0]; - V_pf_altqs_inactive = &V_pf_altqs[1]; + V_pf_altq_ifs_active = &V_pf_altqs[1]; + V_pf_altqs_inactive = &V_pf_altqs[2]; + V_pf_altq_ifs_inactive = &V_pf_altqs[3]; /* Send & overload+flush queues. */ STAILQ_INIT(&V_pf_sendqueue); @@ -1560,7 +1565,7 @@ pf_state_expires(const struct pf_state *state) if (!timeout) timeout = V_pf_default_rule.timeout[state->timeout]; start = state->rule.ptr->timeout[PFTM_ADAPTIVE_START]; - if (start) { + if (start && state->rule.ptr != &V_pf_default_rule) { end = state->rule.ptr->timeout[PFTM_ADAPTIVE_END]; states = counter_u64_fetch(state->rule.ptr->states_cur); } else { @@ -3210,7 +3215,7 @@ pf_tcp_iss(struct pf_pdesc *pd) u_int32_t digest[4]; if (V_pf_tcp_secret_init == 0) { - read_random(&V_pf_tcp_secret, sizeof(V_pf_tcp_secret)); + arc4random_buf(&V_pf_tcp_secret, sizeof(V_pf_tcp_secret)); MD5Init(&V_pf_tcp_secret_ctx); MD5Update(&V_pf_tcp_secret_ctx, V_pf_tcp_secret, sizeof(V_pf_tcp_secret)); @@ -4602,7 +4607,7 @@ pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif, { struct pf_addr *saddr = pd->src, *daddr = pd->dst; u_int16_t icmpid = 0, *icmpsum; - u_int8_t icmptype; + u_int8_t icmptype, icmpcode; int state_icmp = 0; struct pf_state_key_cmp key; @@ -4611,6 +4616,7 @@ pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif, #ifdef INET case IPPROTO_ICMP: icmptype = pd->hdr.icmp->icmp_type; + icmpcode = pd->hdr.icmp->icmp_code; icmpid = pd->hdr.icmp->icmp_id; icmpsum = &pd->hdr.icmp->icmp_cksum; @@ -4625,6 +4631,7 @@ pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif, #ifdef INET6 case IPPROTO_ICMPV6: icmptype = pd->hdr.icmp6->icmp6_type; + icmpcode = pd->hdr.icmp6->icmp6_code; icmpid = pd->hdr.icmp6->icmp6_id; icmpsum = &pd->hdr.icmp6->icmp6_cksum; @@ -4823,6 +4830,23 @@ pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif, #endif /* INET6 */ } + if (PF_ANEQ(pd->dst, pd2.src, pd->af)) { + if (V_pf_status.debug >= PF_DEBUG_MISC) { + printf("pf: BAD ICMP %d:%d outer dst: ", + icmptype, icmpcode); + pf_print_host(pd->src, 0, pd->af); + printf(" -> "); + pf_print_host(pd->dst, 0, pd->af); + printf(" inner src: "); + pf_print_host(pd2.src, 0, pd2.af); + printf(" -> "); + pf_print_host(pd2.dst, 0, pd2.af); + printf("\n"); + } + REASON_SET(reason, PFRES_BADSTATE); + return (PF_DROP); + } + switch (pd2.proto) { case IPPROTO_TCP: { struct tcphdr th; @@ -4879,7 +4903,7 @@ pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif, !SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)))) { if (V_pf_status.debug >= PF_DEBUG_MISC) { printf("pf: BAD ICMP %d:%d ", - icmptype, pd->hdr.icmp->icmp_code); + icmptype, icmpcode); pf_print_host(pd->src, 0, pd->af); printf(" -> "); pf_print_host(pd->dst, 0, pd->af); @@ -4892,7 +4916,7 @@ pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif, } else { if (V_pf_status.debug >= PF_DEBUG_MISC) { printf("pf: OK ICMP %d:%d ", - icmptype, pd->hdr.icmp->icmp_code); + icmptype, icmpcode); pf_print_host(pd->src, 0, pd->af); printf(" -> "); pf_print_host(pd->dst, 0, pd->af); @@ -5249,7 +5273,7 @@ pf_test_state_other(struct pf_state **state, int direction, struct pfi_kif *kif, nk->addr[pd->didx].v4.s_addr, 0); - break; + break; #endif /* INET */ #ifdef INET6 case AF_INET6: @@ -6159,7 +6183,7 @@ done: pd.proto == IPPROTO_UDP) && s != NULL && s->nat_rule.ptr != NULL && (s->nat_rule.ptr->action == PF_RDR || s->nat_rule.ptr->action == PF_BINAT) && - (ntohl(pd.dst->v4.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) + IN_LOOPBACK(ntohl(pd.dst->v4.s_addr))) m->m_flags |= M_SKIP_FIREWALL; if (action == PF_PASS && r->divert.port && ip_divert_ptr != NULL && @@ -6190,7 +6214,7 @@ done: m->m_flags &= ~M_FASTFWD_OURS; } } - ip_divert_ptr(*m0, dir == PF_IN ? DIR_IN : DIR_OUT); + ip_divert_ptr(*m0, dir == PF_IN); *m0 = NULL; return (action); @@ -6339,9 +6363,8 @@ pf_test6(int dir, int pflags, struct ifnet *ifp, struct mbuf **m0, struct inpcb m = *m0; /* pf_normalize messes with m0 */ h = mtod(m, struct ip6_hdr *); -#if 1 /* - * we do not support jumbogram yet. if we keep going, zero ip6_plen + * we do not support jumbogram. if we keep going, zero ip6_plen * will do something bad, so drop the packet for now. */ if (htons(h->ip6_plen) == 0) { @@ -6349,7 +6372,6 @@ pf_test6(int dir, int pflags, struct ifnet *ifp, struct mbuf **m0, struct inpcb REASON_SET(&reason, PFRES_NORM); /*XXX*/ goto done; } -#endif pd.src = (struct pf_addr *)&h->ip6_src; pd.dst = (struct pf_addr *)&h->ip6_dst; diff --git a/freebsd/sys/netpfil/pf/pf_if.c b/freebsd/sys/netpfil/pf/pf_if.c index 4314bbce..44b6f7a3 100644 --- a/freebsd/sys/netpfil/pf/pf_if.c +++ b/freebsd/sys/netpfil/pf/pf_if.c @@ -302,13 +302,15 @@ pfi_kif_match(struct pfi_kif *rule_kif, struct pfi_kif *packet_kif) return (1); if (rule_kif->pfik_group != NULL) { - IF_ADDR_RLOCK(packet_kif->pfik_ifp); + struct epoch_tracker et; + + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(p, &packet_kif->pfik_ifp->if_groups, ifgl_next) if (p->ifgl_group == rule_kif->pfik_group) { - IF_ADDR_RUNLOCK(packet_kif->pfik_ifp); + NET_EPOCH_EXIT(et); return (1); } - IF_ADDR_RUNLOCK(packet_kif->pfik_ifp); + NET_EPOCH_EXIT(et); } @@ -475,11 +477,13 @@ pfi_kif_update(struct pfi_kif *kif) /* again for all groups kif is member of */ if (kif->pfik_ifp != NULL) { - IF_ADDR_RLOCK(kif->pfik_ifp); + struct epoch_tracker et; + + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifgl, &kif->pfik_ifp->if_groups, ifgl_next) pfi_kif_update((struct pfi_kif *) ifgl->ifgl_group->ifg_pf_kif); - IF_ADDR_RUNLOCK(kif->pfik_ifp); + NET_EPOCH_EXIT(et); } } @@ -515,10 +519,12 @@ pfi_table_update(struct pfr_ktable *kt, struct pfi_kif *kif, int net, int flags) if (kif->pfik_ifp != NULL) pfi_instance_add(kif->pfik_ifp, net, flags); else if (kif->pfik_group != NULL) { - IFNET_RLOCK_NOSLEEP(); + struct epoch_tracker et; + + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifgm, &kif->pfik_group->ifg_members, ifgm_next) pfi_instance_add(ifgm->ifgm_ifp, net, flags); - IFNET_RUNLOCK_NOSLEEP(); + NET_EPOCH_EXIT(et); } if ((e = pfr_set_addrs(&kt->pfrkt_t, V_pfi_buffer, V_pfi_buffer_cnt, &size2, @@ -530,11 +536,12 @@ pfi_table_update(struct pfr_ktable *kt, struct pfi_kif *kif, int net, int flags) static void pfi_instance_add(struct ifnet *ifp, int net, int flags) { + struct epoch_tracker et; struct ifaddr *ia; int got4 = 0, got6 = 0; int net2, af; - IF_ADDR_RLOCK(ifp); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) { if (ia->ifa_addr == NULL) continue; @@ -592,7 +599,7 @@ pfi_instance_add(struct ifnet *ifp, int net, int flags) else pfi_address_add(ia->ifa_addr, af, net2); } - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); } static void @@ -760,15 +767,17 @@ pfi_skip_if(const char *filter, struct pfi_kif *p) if (filter[n-1] >= '0' && filter[n-1] <= '9') return (1); /* group names may not end in a digit */ if (p->pfik_ifp != NULL) { - IF_ADDR_RLOCK(p->pfik_ifp); + struct epoch_tracker et; + + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(i, &p->pfik_ifp->if_groups, ifgl_next) { if (!strncmp(i->ifgl_group->ifg_group, filter, IFNAMSIZ)) { - IF_ADDR_RUNLOCK(p->pfik_ifp); + NET_EPOCH_EXIT(et); return (0); /* iface is in group "filter" */ } } - IF_ADDR_RUNLOCK(p->pfik_ifp); + NET_EPOCH_EXIT(et); } return (1); } diff --git a/freebsd/sys/netpfil/pf/pf_ioctl.c b/freebsd/sys/netpfil/pf/pf_ioctl.c index eaac7abc..06b308b5 100644 --- a/freebsd/sys/netpfil/pf/pf_ioctl.c +++ b/freebsd/sys/netpfil/pf/pf_ioctl.c @@ -48,11 +48,14 @@ __FBSDID("$FreeBSD$"); #include <rtems/bsd/local/opt_pf.h> #include <sys/param.h> +#include <sys/_bitset.h> +#include <sys/bitset.h> #include <sys/bus.h> #include <sys/conf.h> #include <sys/endian.h> #include <sys/fcntl.h> #include <sys/filio.h> +#include <sys/hash.h> #include <sys/interrupt.h> #include <sys/jail.h> #include <sys/kernel.h> @@ -131,18 +134,40 @@ VNET_DEFINE_STATIC(int, pf_altq_running); #define TAGID_MAX 50000 struct pf_tagname { - TAILQ_ENTRY(pf_tagname) entries; + TAILQ_ENTRY(pf_tagname) namehash_entries; + TAILQ_ENTRY(pf_tagname) taghash_entries; char name[PF_TAG_NAME_SIZE]; uint16_t tag; int ref; }; -TAILQ_HEAD(pf_tags, pf_tagname); -#define V_pf_tags VNET(pf_tags) -VNET_DEFINE(struct pf_tags, pf_tags); -#define V_pf_qids VNET(pf_qids) -VNET_DEFINE(struct pf_tags, pf_qids); -static MALLOC_DEFINE(M_PFTAG, "pf_tag", "pf(4) tag names"); +struct pf_tagset { + TAILQ_HEAD(, pf_tagname) *namehash; + TAILQ_HEAD(, pf_tagname) *taghash; + unsigned int mask; + uint32_t seed; + BITSET_DEFINE(, TAGID_MAX) avail; +}; + +VNET_DEFINE(struct pf_tagset, pf_tags); +#define V_pf_tags VNET(pf_tags) +static unsigned int pf_rule_tag_hashsize; +#define PF_RULE_TAG_HASH_SIZE_DEFAULT 128 +SYSCTL_UINT(_net_pf, OID_AUTO, rule_tag_hashsize, CTLFLAG_RDTUN, + &pf_rule_tag_hashsize, PF_RULE_TAG_HASH_SIZE_DEFAULT, + "Size of pf(4) rule tag hashtable"); + +#ifdef ALTQ +VNET_DEFINE(struct pf_tagset, pf_qids); +#define V_pf_qids VNET(pf_qids) +static unsigned int pf_queue_tag_hashsize; +#define PF_QUEUE_TAG_HASH_SIZE_DEFAULT 128 +SYSCTL_UINT(_net_pf, OID_AUTO, queue_tag_hashsize, CTLFLAG_RDTUN, + &pf_queue_tag_hashsize, PF_QUEUE_TAG_HASH_SIZE_DEFAULT, + "Size of pf(4) queue tag hashtable"); +#endif +VNET_DEFINE(uma_zone_t, pf_tag_z); +#define V_pf_tag_z VNET(pf_tag_z) static MALLOC_DEFINE(M_PFALTQ, "pf_altq", "pf(4) altq configuration db"); static MALLOC_DEFINE(M_PFRULE, "pf_rule", "pf(4) rules"); @@ -150,9 +175,14 @@ static MALLOC_DEFINE(M_PFRULE, "pf_rule", "pf(4) rules"); #error PF_QNAME_SIZE must be equal to PF_TAG_NAME_SIZE #endif -static u_int16_t tagname2tag(struct pf_tags *, char *); +static void pf_init_tagset(struct pf_tagset *, unsigned int *, + unsigned int); +static void pf_cleanup_tagset(struct pf_tagset *); +static uint16_t tagname2hashindex(const struct pf_tagset *, const char *); +static uint16_t tag2hashindex(const struct pf_tagset *, uint16_t); +static u_int16_t tagname2tag(struct pf_tagset *, char *); static u_int16_t pf_tagname2tag(char *); -static void tag_unref(struct pf_tags *, u_int16_t); +static void tag_unref(struct pf_tagset *, u_int16_t); #define DPFPRINTF(n, x) if (V_pf_status.debug >= (n)) printf x @@ -171,16 +201,16 @@ static void pf_tbladdr_copyout(struct pf_addr_wrap *); * Wrapper functions for pfil(9) hooks */ #ifdef INET -static int pf_check_in(void *arg, struct mbuf **m, struct ifnet *ifp, - int dir, int flags, struct inpcb *inp); -static int pf_check_out(void *arg, struct mbuf **m, struct ifnet *ifp, - int dir, int flags, struct inpcb *inp); +static pfil_return_t pf_check_in(struct mbuf **m, struct ifnet *ifp, + int flags, void *ruleset __unused, struct inpcb *inp); +static pfil_return_t pf_check_out(struct mbuf **m, struct ifnet *ifp, + int flags, void *ruleset __unused, struct inpcb *inp); #endif #ifdef INET6 -static int pf_check6_in(void *arg, struct mbuf **m, struct ifnet *ifp, - int dir, int flags, struct inpcb *inp); -static int pf_check6_out(void *arg, struct mbuf **m, struct ifnet *ifp, - int dir, int flags, struct inpcb *inp); +static pfil_return_t pf_check6_in(struct mbuf **m, struct ifnet *ifp, + int flags, void *ruleset __unused, struct inpcb *inp); +static pfil_return_t pf_check6_out(struct mbuf **m, struct ifnet *ifp, + int flags, void *ruleset __unused, struct inpcb *inp); #endif static int hook_pf(void); @@ -438,68 +468,141 @@ pf_free_rule(struct pf_rule *rule) free(rule, M_PFRULE); } +static void +pf_init_tagset(struct pf_tagset *ts, unsigned int *tunable_size, + unsigned int default_size) +{ + unsigned int i; + unsigned int hashsize; + + if (*tunable_size == 0 || !powerof2(*tunable_size)) + *tunable_size = default_size; + + hashsize = *tunable_size; + ts->namehash = mallocarray(hashsize, sizeof(*ts->namehash), M_PFHASH, + M_WAITOK); + ts->taghash = mallocarray(hashsize, sizeof(*ts->taghash), M_PFHASH, + M_WAITOK); + ts->mask = hashsize - 1; + ts->seed = arc4random(); + for (i = 0; i < hashsize; i++) { + TAILQ_INIT(&ts->namehash[i]); + TAILQ_INIT(&ts->taghash[i]); + } + BIT_FILL(TAGID_MAX, &ts->avail); +} + +static void +pf_cleanup_tagset(struct pf_tagset *ts) +{ + unsigned int i; + unsigned int hashsize; + struct pf_tagname *t, *tmp; + + /* + * Only need to clean up one of the hashes as each tag is hashed + * into each table. + */ + hashsize = ts->mask + 1; + for (i = 0; i < hashsize; i++) + TAILQ_FOREACH_SAFE(t, &ts->namehash[i], namehash_entries, tmp) + uma_zfree(V_pf_tag_z, t); + + free(ts->namehash, M_PFHASH); + free(ts->taghash, M_PFHASH); +} + +static uint16_t +tagname2hashindex(const struct pf_tagset *ts, const char *tagname) +{ + + return (murmur3_32_hash(tagname, strlen(tagname), ts->seed) & ts->mask); +} + +static uint16_t +tag2hashindex(const struct pf_tagset *ts, uint16_t tag) +{ + + return (tag & ts->mask); +} + static u_int16_t -tagname2tag(struct pf_tags *head, char *tagname) +tagname2tag(struct pf_tagset *ts, char *tagname) { - struct pf_tagname *tag, *p = NULL; - u_int16_t new_tagid = 1; + struct pf_tagname *tag; + u_int32_t index; + u_int16_t new_tagid; PF_RULES_WASSERT(); - TAILQ_FOREACH(tag, head, entries) + index = tagname2hashindex(ts, tagname); + TAILQ_FOREACH(tag, &ts->namehash[index], namehash_entries) if (strcmp(tagname, tag->name) == 0) { tag->ref++; return (tag->tag); } /* + * new entry + * * to avoid fragmentation, we do a linear search from the beginning - * and take the first free slot we find. if there is none or the list - * is empty, append a new entry at the end. + * and take the first free slot we find. */ - - /* new entry */ - if (!TAILQ_EMPTY(head)) - for (p = TAILQ_FIRST(head); p != NULL && - p->tag == new_tagid; p = TAILQ_NEXT(p, entries)) - new_tagid = p->tag + 1; - - if (new_tagid > TAGID_MAX) + new_tagid = BIT_FFS(TAGID_MAX, &ts->avail); + /* + * Tags are 1-based, with valid tags in the range [1..TAGID_MAX]. + * BIT_FFS() returns a 1-based bit number, with 0 indicating no bits + * set. It may also return a bit number greater than TAGID_MAX due + * to rounding of the number of bits in the vector up to a multiple + * of the vector word size at declaration/allocation time. + */ + if ((new_tagid == 0) || (new_tagid > TAGID_MAX)) return (0); + /* Mark the tag as in use. Bits are 0-based for BIT_CLR() */ + BIT_CLR(TAGID_MAX, new_tagid - 1, &ts->avail); + /* allocate and fill new struct pf_tagname */ - tag = malloc(sizeof(*tag), M_PFTAG, M_NOWAIT|M_ZERO); + tag = uma_zalloc(V_pf_tag_z, M_NOWAIT); if (tag == NULL) return (0); strlcpy(tag->name, tagname, sizeof(tag->name)); tag->tag = new_tagid; - tag->ref++; + tag->ref = 1; - if (p != NULL) /* insert new entry before p */ - TAILQ_INSERT_BEFORE(p, tag, entries); - else /* either list empty or no free slot in between */ - TAILQ_INSERT_TAIL(head, tag, entries); + /* Insert into namehash */ + TAILQ_INSERT_TAIL(&ts->namehash[index], tag, namehash_entries); + /* Insert into taghash */ + index = tag2hashindex(ts, new_tagid); + TAILQ_INSERT_TAIL(&ts->taghash[index], tag, taghash_entries); + return (tag->tag); } static void -tag_unref(struct pf_tags *head, u_int16_t tag) +tag_unref(struct pf_tagset *ts, u_int16_t tag) { - struct pf_tagname *p, *next; - + struct pf_tagname *t; + uint16_t index; + PF_RULES_WASSERT(); - for (p = TAILQ_FIRST(head); p != NULL; p = next) { - next = TAILQ_NEXT(p, entries); - if (tag == p->tag) { - if (--p->ref == 0) { - TAILQ_REMOVE(head, p, entries); - free(p, M_PFTAG); + index = tag2hashindex(ts, tag); + TAILQ_FOREACH(t, &ts->taghash[index], taghash_entries) + if (tag == t->tag) { + if (--t->ref == 0) { + TAILQ_REMOVE(&ts->taghash[index], t, + taghash_entries); + index = tagname2hashindex(ts, t->name); + TAILQ_REMOVE(&ts->namehash[index], t, + namehash_entries); + /* Bits are 0-based for BIT_SET() */ + BIT_SET(TAGID_MAX, tag - 1, &ts->avail); + uma_zfree(V_pf_tag_z, t); } break; } - } } static u_int16_t @@ -524,22 +627,25 @@ pf_qid_unref(u_int32_t qid) static int pf_begin_altq(u_int32_t *ticket) { - struct pf_altq *altq; + struct pf_altq *altq, *tmp; int error = 0; PF_RULES_WASSERT(); - /* Purge the old altq list */ - while ((altq = TAILQ_FIRST(V_pf_altqs_inactive)) != NULL) { - TAILQ_REMOVE(V_pf_altqs_inactive, altq, entries); - if (altq->qname[0] == 0 && - (altq->local_flags & PFALTQ_FLAG_IF_REMOVED) == 0) { + /* Purge the old altq lists */ + TAILQ_FOREACH_SAFE(altq, V_pf_altq_ifs_inactive, entries, tmp) { + if ((altq->local_flags & PFALTQ_FLAG_IF_REMOVED) == 0) { /* detach and destroy the discipline */ error = altq_remove(altq); - } else - pf_qid_unref(altq->qid); + } + free(altq, M_PFALTQ); + } + TAILQ_INIT(V_pf_altq_ifs_inactive); + TAILQ_FOREACH_SAFE(altq, V_pf_altqs_inactive, entries, tmp) { + pf_qid_unref(altq->qid); free(altq, M_PFALTQ); } + TAILQ_INIT(V_pf_altqs_inactive); if (error) return (error); *ticket = ++V_ticket_altqs_inactive; @@ -550,24 +656,27 @@ pf_begin_altq(u_int32_t *ticket) static int pf_rollback_altq(u_int32_t ticket) { - struct pf_altq *altq; + struct pf_altq *altq, *tmp; int error = 0; PF_RULES_WASSERT(); if (!V_altqs_inactive_open || ticket != V_ticket_altqs_inactive) return (0); - /* Purge the old altq list */ - while ((altq = TAILQ_FIRST(V_pf_altqs_inactive)) != NULL) { - TAILQ_REMOVE(V_pf_altqs_inactive, altq, entries); - if (altq->qname[0] == 0 && - (altq->local_flags & PFALTQ_FLAG_IF_REMOVED) == 0) { + /* Purge the old altq lists */ + TAILQ_FOREACH_SAFE(altq, V_pf_altq_ifs_inactive, entries, tmp) { + if ((altq->local_flags & PFALTQ_FLAG_IF_REMOVED) == 0) { /* detach and destroy the discipline */ error = altq_remove(altq); - } else - pf_qid_unref(altq->qid); + } + free(altq, M_PFALTQ); + } + TAILQ_INIT(V_pf_altq_ifs_inactive); + TAILQ_FOREACH_SAFE(altq, V_pf_altqs_inactive, entries, tmp) { + pf_qid_unref(altq->qid); free(altq, M_PFALTQ); } + TAILQ_INIT(V_pf_altqs_inactive); V_altqs_inactive_open = 0; return (error); } @@ -575,8 +684,8 @@ pf_rollback_altq(u_int32_t ticket) static int pf_commit_altq(u_int32_t ticket) { - struct pf_altqqueue *old_altqs; - struct pf_altq *altq; + struct pf_altqqueue *old_altqs, *old_altq_ifs; + struct pf_altq *altq, *tmp; int err, error = 0; PF_RULES_WASSERT(); @@ -586,14 +695,16 @@ pf_commit_altq(u_int32_t ticket) /* swap altqs, keep the old. */ old_altqs = V_pf_altqs_active; + old_altq_ifs = V_pf_altq_ifs_active; V_pf_altqs_active = V_pf_altqs_inactive; + V_pf_altq_ifs_active = V_pf_altq_ifs_inactive; V_pf_altqs_inactive = old_altqs; + V_pf_altq_ifs_inactive = old_altq_ifs; V_ticket_altqs_active = V_ticket_altqs_inactive; /* Attach new disciplines */ - TAILQ_FOREACH(altq, V_pf_altqs_active, entries) { - if (altq->qname[0] == 0 && - (altq->local_flags & PFALTQ_FLAG_IF_REMOVED) == 0) { + TAILQ_FOREACH(altq, V_pf_altq_ifs_active, entries) { + if ((altq->local_flags & PFALTQ_FLAG_IF_REMOVED) == 0) { /* attach the discipline */ error = altq_pfattach(altq); if (error == 0 && V_pf_altq_running) @@ -603,11 +714,9 @@ pf_commit_altq(u_int32_t ticket) } } - /* Purge the old altq list */ - while ((altq = TAILQ_FIRST(V_pf_altqs_inactive)) != NULL) { - TAILQ_REMOVE(V_pf_altqs_inactive, altq, entries); - if (altq->qname[0] == 0 && - (altq->local_flags & PFALTQ_FLAG_IF_REMOVED) == 0) { + /* Purge the old altq lists */ + TAILQ_FOREACH_SAFE(altq, V_pf_altq_ifs_inactive, entries, tmp) { + if ((altq->local_flags & PFALTQ_FLAG_IF_REMOVED) == 0) { /* detach and destroy the discipline */ if (V_pf_altq_running) error = pf_disable_altq(altq); @@ -617,10 +726,15 @@ pf_commit_altq(u_int32_t ticket) err = altq_remove(altq); if (err != 0 && error == 0) error = err; - } else - pf_qid_unref(altq->qid); + } + free(altq, M_PFALTQ); + } + TAILQ_INIT(V_pf_altq_ifs_inactive); + TAILQ_FOREACH_SAFE(altq, V_pf_altqs_inactive, entries, tmp) { + pf_qid_unref(altq->qid); free(altq, M_PFALTQ); } + TAILQ_INIT(V_pf_altqs_inactive); V_altqs_inactive_open = 0; return (error); @@ -677,14 +791,46 @@ pf_disable_altq(struct pf_altq *altq) return (error); } +static int +pf_altq_ifnet_event_add(struct ifnet *ifp, int remove, u_int32_t ticket, + struct pf_altq *altq) +{ + struct ifnet *ifp1; + int error = 0; + + /* Deactivate the interface in question */ + altq->local_flags &= ~PFALTQ_FLAG_IF_REMOVED; + if ((ifp1 = ifunit(altq->ifname)) == NULL || + (remove && ifp1 == ifp)) { + altq->local_flags |= PFALTQ_FLAG_IF_REMOVED; + } else { + error = altq_add(ifp1, altq); + + if (ticket != V_ticket_altqs_inactive) + error = EBUSY; + + if (error) + free(altq, M_PFALTQ); + } + + return (error); +} + void pf_altq_ifnet_event(struct ifnet *ifp, int remove) { - struct ifnet *ifp1; struct pf_altq *a1, *a2, *a3; u_int32_t ticket; int error = 0; + /* + * No need to re-evaluate the configuration for events on interfaces + * that do not support ALTQ, as it's not possible for such + * interfaces to be part of the configuration. + */ + if (!ALTQ_IS_READY(&ifp->if_snd)) + return; + /* Interrupt userland queue modifications */ if (V_altqs_inactive_open) pf_rollback_altq(V_ticket_altqs_inactive); @@ -694,7 +840,7 @@ pf_altq_ifnet_event(struct ifnet *ifp, int remove) return; /* Copy the current active set */ - TAILQ_FOREACH(a1, V_pf_altqs_active, entries) { + TAILQ_FOREACH(a1, V_pf_altq_ifs_active, entries) { a2 = malloc(sizeof(*a2), M_PFALTQ, M_NOWAIT); if (a2 == NULL) { error = ENOMEM; @@ -702,41 +848,43 @@ pf_altq_ifnet_event(struct ifnet *ifp, int remove) } bcopy(a1, a2, sizeof(struct pf_altq)); - if (a2->qname[0] != 0) { - if ((a2->qid = pf_qname2qid(a2->qname)) == 0) { - error = EBUSY; - free(a2, M_PFALTQ); - break; - } - a2->altq_disc = NULL; - TAILQ_FOREACH(a3, V_pf_altqs_inactive, entries) { - if (strncmp(a3->ifname, a2->ifname, - IFNAMSIZ) == 0 && a3->qname[0] == 0) { - a2->altq_disc = a3->altq_disc; - break; - } - } - } - /* Deactivate the interface in question */ - a2->local_flags &= ~PFALTQ_FLAG_IF_REMOVED; - if ((ifp1 = ifunit(a2->ifname)) == NULL || - (remove && ifp1 == ifp)) { - a2->local_flags |= PFALTQ_FLAG_IF_REMOVED; - } else { - error = altq_add(a2); + error = pf_altq_ifnet_event_add(ifp, remove, ticket, a2); + if (error) + break; - if (ticket != V_ticket_altqs_inactive) - error = EBUSY; + TAILQ_INSERT_TAIL(V_pf_altq_ifs_inactive, a2, entries); + } + if (error) + goto out; + TAILQ_FOREACH(a1, V_pf_altqs_active, entries) { + a2 = malloc(sizeof(*a2), M_PFALTQ, M_NOWAIT); + if (a2 == NULL) { + error = ENOMEM; + break; + } + bcopy(a1, a2, sizeof(struct pf_altq)); - if (error) { - free(a2, M_PFALTQ); + if ((a2->qid = pf_qname2qid(a2->qname)) == 0) { + error = EBUSY; + free(a2, M_PFALTQ); + break; + } + a2->altq_disc = NULL; + TAILQ_FOREACH(a3, V_pf_altq_ifs_inactive, entries) { + if (strncmp(a3->ifname, a2->ifname, + IFNAMSIZ) == 0) { + a2->altq_disc = a3->altq_disc; break; } } + error = pf_altq_ifnet_event_add(ifp, remove, ticket, a2); + if (error) + break; TAILQ_INSERT_TAIL(V_pf_altqs_inactive, a2, entries); } +out: if (error != 0) pf_rollback_altq(ticket); else @@ -1214,6 +1362,28 @@ pf_import_kaltq(struct pfioc_altq_v1 *pa, struct pf_altq *q, size_t ioc_size) return (0); } + +static struct pf_altq * +pf_altq_get_nth_active(u_int32_t n) +{ + struct pf_altq *altq; + u_int32_t nr; + + nr = 0; + TAILQ_FOREACH(altq, V_pf_altq_ifs_active, entries) { + if (nr == n) + return (altq); + nr++; + } + + TAILQ_FOREACH(altq, V_pf_altqs_active, entries) { + if (nr == n) + return (altq); + nr++; + } + + return (NULL); +} #endif /* ALTQ */ static int @@ -2011,7 +2181,7 @@ relock_DIOCKILLSTATES: break; } - p = pstore = malloc(ps->ps_len, M_TEMP, M_WAITOK); + p = pstore = malloc(ps->ps_len, M_TEMP, M_WAITOK | M_ZERO); nr = 0; for (i = 0; i <= pf_hashmask; i++) { @@ -2273,9 +2443,8 @@ DIOCGETSTATES_full: PF_RULES_WLOCK(); /* enable all altq interfaces on active list */ - TAILQ_FOREACH(altq, V_pf_altqs_active, entries) { - if (altq->qname[0] == 0 && (altq->local_flags & - PFALTQ_FLAG_IF_REMOVED) == 0) { + TAILQ_FOREACH(altq, V_pf_altq_ifs_active, entries) { + if ((altq->local_flags & PFALTQ_FLAG_IF_REMOVED) == 0) { error = pf_enable_altq(altq); if (error != 0) break; @@ -2293,9 +2462,8 @@ DIOCGETSTATES_full: PF_RULES_WLOCK(); /* disable all altq interfaces on active list */ - TAILQ_FOREACH(altq, V_pf_altqs_active, entries) { - if (altq->qname[0] == 0 && (altq->local_flags & - PFALTQ_FLAG_IF_REMOVED) == 0) { + TAILQ_FOREACH(altq, V_pf_altq_ifs_active, entries) { + if ((altq->local_flags & PFALTQ_FLAG_IF_REMOVED) == 0) { error = pf_disable_altq(altq); if (error != 0) break; @@ -2340,9 +2508,9 @@ DIOCGETSTATES_full: break; } altq->altq_disc = NULL; - TAILQ_FOREACH(a, V_pf_altqs_inactive, entries) { + TAILQ_FOREACH(a, V_pf_altq_ifs_inactive, entries) { if (strncmp(a->ifname, altq->ifname, - IFNAMSIZ) == 0 && a->qname[0] == 0) { + IFNAMSIZ) == 0) { altq->altq_disc = a->altq_disc; break; } @@ -2352,7 +2520,7 @@ DIOCGETSTATES_full: if ((ifp = ifunit(altq->ifname)) == NULL) altq->local_flags |= PFALTQ_FLAG_IF_REMOVED; else - error = altq_add(altq); + error = altq_add(ifp, altq); if (error) { PF_RULES_WUNLOCK(); @@ -2360,7 +2528,10 @@ DIOCGETSTATES_full: break; } - TAILQ_INSERT_TAIL(V_pf_altqs_inactive, altq, entries); + if (altq->qname[0] != 0) + TAILQ_INSERT_TAIL(V_pf_altqs_inactive, altq, entries); + else + TAILQ_INSERT_TAIL(V_pf_altq_ifs_inactive, altq, entries); /* version error check done on import above */ pf_export_kaltq(altq, pa, IOCPARM_LEN(cmd)); PF_RULES_WUNLOCK(); @@ -2374,6 +2545,8 @@ DIOCGETSTATES_full: PF_RULES_RLOCK(); pa->nr = 0; + TAILQ_FOREACH(altq, V_pf_altq_ifs_active, entries) + pa->nr++; TAILQ_FOREACH(altq, V_pf_altqs_active, entries) pa->nr++; pa->ticket = V_ticket_altqs_active; @@ -2385,7 +2558,6 @@ DIOCGETSTATES_full: case DIOCGETALTQV1: { struct pfioc_altq_v1 *pa = (struct pfioc_altq_v1 *)addr; struct pf_altq *altq; - u_int32_t nr; PF_RULES_RLOCK(); if (pa->ticket != V_ticket_altqs_active) { @@ -2393,12 +2565,7 @@ DIOCGETSTATES_full: error = EBUSY; break; } - nr = 0; - altq = TAILQ_FIRST(V_pf_altqs_active); - while ((altq != NULL) && (nr < pa->nr)) { - altq = TAILQ_NEXT(altq, entries); - nr++; - } + altq = pf_altq_get_nth_active(pa->nr); if (altq == NULL) { PF_RULES_RUNLOCK(); error = EBUSY; @@ -2419,7 +2586,6 @@ DIOCGETSTATES_full: case DIOCGETQSTATSV1: { struct pfioc_qstats_v1 *pq = (struct pfioc_qstats_v1 *)addr; struct pf_altq *altq; - u_int32_t nr; int nbytes; u_int32_t version; @@ -2430,12 +2596,7 @@ DIOCGETSTATES_full: break; } nbytes = pq->nbytes; - nr = 0; - altq = TAILQ_FIRST(V_pf_altqs_active); - while ((altq != NULL) && (nr < pq->nr)) { - altq = TAILQ_NEXT(altq, entries); - nr++; - } + altq = pf_altq_get_nth_active(pq->nr); if (altq == NULL) { PF_RULES_RUNLOCK(); error = EBUSY; @@ -2954,24 +3115,20 @@ DIOCCHANGEADDR_error: break; } - PF_RULES_WLOCK(); + PF_RULES_RLOCK(); n = pfr_table_count(&io->pfrio_table, io->pfrio_flags); io->pfrio_size = min(io->pfrio_size, n); + PF_RULES_RUNLOCK(); totlen = io->pfrio_size * sizeof(struct pfr_table); pfrts = mallocarray(io->pfrio_size, sizeof(struct pfr_table), - M_TEMP, M_NOWAIT); - if (pfrts == NULL) { - error = ENOMEM; - PF_RULES_WUNLOCK(); - break; - } + M_TEMP, M_WAITOK); error = copyin(io->pfrio_buffer, pfrts, totlen); if (error) { free(pfrts, M_TEMP); - PF_RULES_WUNLOCK(); break; } + PF_RULES_WLOCK(); error = pfr_set_tflags(pfrts, io->pfrio_size, io->pfrio_setflag, io->pfrio_clrflag, &io->pfrio_nchange, &io->pfrio_ndel, io->pfrio_flags | PFR_FLAG_USERIOCTL); @@ -3589,19 +3746,25 @@ DIOCCHANGEADDR_error: struct pf_src_node *n, *p, *pstore; uint32_t i, nr = 0; + for (i = 0, sh = V_pf_srchash; i <= pf_srchashmask; + i++, sh++) { + PF_HASHROW_LOCK(sh); + LIST_FOREACH(n, &sh->nodes, entry) + nr++; + PF_HASHROW_UNLOCK(sh); + } + + psn->psn_len = min(psn->psn_len, + sizeof(struct pf_src_node) * nr); + if (psn->psn_len == 0) { - for (i = 0, sh = V_pf_srchash; i <= pf_srchashmask; - i++, sh++) { - PF_HASHROW_LOCK(sh); - LIST_FOREACH(n, &sh->nodes, entry) - nr++; - PF_HASHROW_UNLOCK(sh); - } psn->psn_len = sizeof(struct pf_src_node) * nr; break; } - p = pstore = malloc(psn->psn_len, M_TEMP, M_WAITOK); + nr = 0; + + p = pstore = malloc(psn->psn_len, M_TEMP, M_WAITOK | M_ZERO); for (i = 0, sh = V_pf_srchash; i <= pf_srchashmask; i++, sh++) { PF_HASHROW_LOCK(sh); @@ -3997,65 +4160,59 @@ shutdown_pf(void) /* status does not use malloced mem so no need to cleanup */ /* fingerprints and interfaces have their own cleanup code */ - - /* Free counters last as we updated them during shutdown. */ - counter_u64_free(V_pf_default_rule.states_cur); - counter_u64_free(V_pf_default_rule.states_tot); - counter_u64_free(V_pf_default_rule.src_nodes); - - for (int i = 0; i < PFRES_MAX; i++) - counter_u64_free(V_pf_status.counters[i]); - for (int i = 0; i < LCNT_MAX; i++) - counter_u64_free(V_pf_status.lcounters[i]); - for (int i = 0; i < FCNT_MAX; i++) - counter_u64_free(V_pf_status.fcounters[i]); - for (int i = 0; i < SCNT_MAX; i++) - counter_u64_free(V_pf_status.scounters[i]); } while(0); return (error); } +static pfil_return_t +pf_check_return(int chk, struct mbuf **m) +{ + + switch (chk) { + case PF_PASS: + if (*m == NULL) + return (PFIL_CONSUMED); + else + return (PFIL_PASS); + break; + default: + if (*m != NULL) { + m_freem(*m); + *m = NULL; + } + return (PFIL_DROPPED); + } +} + #ifdef INET -static int -pf_check_in(void *arg, struct mbuf **m, struct ifnet *ifp, int dir, int flags, - struct inpcb *inp) +static pfil_return_t +pf_check_in(struct mbuf **m, struct ifnet *ifp, int flags, + void *ruleset __unused, struct inpcb *inp) { int chk; chk = pf_test(PF_IN, flags, ifp, m, inp); - if (chk && *m) { - m_freem(*m); - *m = NULL; - } - if (chk != PF_PASS) - return (EACCES); - return (0); + return (pf_check_return(chk, m)); } -static int -pf_check_out(void *arg, struct mbuf **m, struct ifnet *ifp, int dir, int flags, - struct inpcb *inp) +static pfil_return_t +pf_check_out(struct mbuf **m, struct ifnet *ifp, int flags, + void *ruleset __unused, struct inpcb *inp) { int chk; chk = pf_test(PF_OUT, flags, ifp, m, inp); - if (chk && *m) { - m_freem(*m); - *m = NULL; - } - if (chk != PF_PASS) - return (EACCES); - return (0); + return (pf_check_return(chk, m)); } #endif #ifdef INET6 -static int -pf_check6_in(void *arg, struct mbuf **m, struct ifnet *ifp, int dir, int flags, - struct inpcb *inp) +static pfil_return_t +pf_check6_in(struct mbuf **m, struct ifnet *ifp, int flags, + void *ruleset __unused, struct inpcb *inp) { int chk; @@ -4067,67 +4224,89 @@ pf_check6_in(void *arg, struct mbuf **m, struct ifnet *ifp, int dir, int flags, CURVNET_SET(ifp->if_vnet); chk = pf_test6(PF_IN, flags, (*m)->m_flags & M_LOOP ? V_loif : ifp, m, inp); CURVNET_RESTORE(); - if (chk && *m) { - m_freem(*m); - *m = NULL; - } - if (chk != PF_PASS) - return (EACCES); - return (0); + + return (pf_check_return(chk, m)); } -static int -pf_check6_out(void *arg, struct mbuf **m, struct ifnet *ifp, int dir, int flags, - struct inpcb *inp) +static pfil_return_t +pf_check6_out(struct mbuf **m, struct ifnet *ifp, int flags, + void *ruleset __unused, struct inpcb *inp) { int chk; CURVNET_SET(ifp->if_vnet); chk = pf_test6(PF_OUT, flags, ifp, m, inp); CURVNET_RESTORE(); - if (chk && *m) { - m_freem(*m); - *m = NULL; - } - if (chk != PF_PASS) - return (EACCES); - return (0); + + return (pf_check_return(chk, m)); } #endif /* INET6 */ -static int -hook_pf(void) -{ #ifdef INET - struct pfil_head *pfh_inet; +VNET_DEFINE_STATIC(pfil_hook_t, pf_ip4_in_hook); +VNET_DEFINE_STATIC(pfil_hook_t, pf_ip4_out_hook); +#define V_pf_ip4_in_hook VNET(pf_ip4_in_hook) +#define V_pf_ip4_out_hook VNET(pf_ip4_out_hook) #endif #ifdef INET6 - struct pfil_head *pfh_inet6; +VNET_DEFINE_STATIC(pfil_hook_t, pf_ip6_in_hook); +VNET_DEFINE_STATIC(pfil_hook_t, pf_ip6_out_hook); +#define V_pf_ip6_in_hook VNET(pf_ip6_in_hook) +#define V_pf_ip6_out_hook VNET(pf_ip6_out_hook) #endif +static int +hook_pf(void) +{ + struct pfil_hook_args pha; + struct pfil_link_args pla; + if (V_pf_pfil_hooked) return (0); + pha.pa_version = PFIL_VERSION; + pha.pa_modname = "pf"; + pha.pa_ruleset = NULL; + + pla.pa_version = PFIL_VERSION; + #ifdef INET - pfh_inet = pfil_head_get(PFIL_TYPE_AF, AF_INET); - if (pfh_inet == NULL) - return (ESRCH); /* XXX */ - pfil_add_hook_flags(pf_check_in, NULL, PFIL_IN | PFIL_WAITOK, pfh_inet); - pfil_add_hook_flags(pf_check_out, NULL, PFIL_OUT | PFIL_WAITOK, pfh_inet); + pha.pa_type = PFIL_TYPE_IP4; + pha.pa_func = pf_check_in; + pha.pa_flags = PFIL_IN; + pha.pa_rulname = "default-in"; + V_pf_ip4_in_hook = pfil_add_hook(&pha); + pla.pa_flags = PFIL_IN | PFIL_HEADPTR | PFIL_HOOKPTR; + pla.pa_head = V_inet_pfil_head; + pla.pa_hook = V_pf_ip4_in_hook; + (void)pfil_link(&pla); + pha.pa_func = pf_check_out; + pha.pa_flags = PFIL_OUT; + pha.pa_rulname = "default-out"; + V_pf_ip4_out_hook = pfil_add_hook(&pha); + pla.pa_flags = PFIL_OUT | PFIL_HEADPTR | PFIL_HOOKPTR; + pla.pa_head = V_inet_pfil_head; + pla.pa_hook = V_pf_ip4_out_hook; + (void)pfil_link(&pla); #endif #ifdef INET6 - pfh_inet6 = pfil_head_get(PFIL_TYPE_AF, AF_INET6); - if (pfh_inet6 == NULL) { -#ifdef INET - pfil_remove_hook_flags(pf_check_in, NULL, PFIL_IN | PFIL_WAITOK, - pfh_inet); - pfil_remove_hook_flags(pf_check_out, NULL, PFIL_OUT | PFIL_WAITOK, - pfh_inet); -#endif - return (ESRCH); /* XXX */ - } - pfil_add_hook_flags(pf_check6_in, NULL, PFIL_IN | PFIL_WAITOK, pfh_inet6); - pfil_add_hook_flags(pf_check6_out, NULL, PFIL_OUT | PFIL_WAITOK, pfh_inet6); + pha.pa_type = PFIL_TYPE_IP6; + pha.pa_func = pf_check6_in; + pha.pa_flags = PFIL_IN; + pha.pa_rulname = "default-in6"; + V_pf_ip6_in_hook = pfil_add_hook(&pha); + pla.pa_flags = PFIL_IN | PFIL_HEADPTR | PFIL_HOOKPTR; + pla.pa_head = V_inet6_pfil_head; + pla.pa_hook = V_pf_ip6_in_hook; + (void)pfil_link(&pla); + pha.pa_func = pf_check6_out; + pha.pa_rulname = "default-out6"; + pha.pa_flags = PFIL_OUT; + V_pf_ip6_out_hook = pfil_add_hook(&pha); + pla.pa_flags = PFIL_OUT | PFIL_HEADPTR | PFIL_HOOKPTR; + pla.pa_head = V_inet6_pfil_head; + pla.pa_hook = V_pf_ip6_out_hook; + (void)pfil_link(&pla); #endif V_pf_pfil_hooked = 1; @@ -4137,33 +4316,17 @@ hook_pf(void) static int dehook_pf(void) { -#ifdef INET - struct pfil_head *pfh_inet; -#endif -#ifdef INET6 - struct pfil_head *pfh_inet6; -#endif if (V_pf_pfil_hooked == 0) return (0); #ifdef INET - pfh_inet = pfil_head_get(PFIL_TYPE_AF, AF_INET); - if (pfh_inet == NULL) - return (ESRCH); /* XXX */ - pfil_remove_hook_flags(pf_check_in, NULL, PFIL_IN | PFIL_WAITOK, - pfh_inet); - pfil_remove_hook_flags(pf_check_out, NULL, PFIL_OUT | PFIL_WAITOK, - pfh_inet); + pfil_remove_hook(V_pf_ip4_in_hook); + pfil_remove_hook(V_pf_ip4_out_hook); #endif #ifdef INET6 - pfh_inet6 = pfil_head_get(PFIL_TYPE_AF, AF_INET6); - if (pfh_inet6 == NULL) - return (ESRCH); /* XXX */ - pfil_remove_hook_flags(pf_check6_in, NULL, PFIL_IN | PFIL_WAITOK, - pfh_inet6); - pfil_remove_hook_flags(pf_check6_out, NULL, PFIL_OUT | PFIL_WAITOK, - pfh_inet6); + pfil_remove_hook(V_pf_ip6_in_hook); + pfil_remove_hook(V_pf_ip6_out_hook); #endif V_pf_pfil_hooked = 0; @@ -4173,8 +4336,15 @@ dehook_pf(void) static void pf_load_vnet(void) { - TAILQ_INIT(&V_pf_tags); - TAILQ_INIT(&V_pf_qids); + V_pf_tag_z = uma_zcreate("pf tags", sizeof(struct pf_tagname), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); + + pf_init_tagset(&V_pf_tags, &pf_rule_tag_hashsize, + PF_RULE_TAG_HASH_SIZE_DEFAULT); +#ifdef ALTQ + pf_init_tagset(&V_pf_qids, &pf_queue_tag_hashsize, + PF_QUEUE_TAG_HASH_SIZE_DEFAULT); +#endif pfattach_vnet(); V_pf_vnet_active = 1; @@ -4191,7 +4361,7 @@ pf_load(void) pf_mtag_initialize(); - pf_dev = make_dev(&pf_cdevsw, 0, 0, 0, 0600, PF_NAME); + pf_dev = make_dev(&pf_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, PF_NAME); if (pf_dev == NULL) return (ENOMEM); @@ -4241,6 +4411,26 @@ pf_unload_vnet(void) pf_cleanup(); if (IS_DEFAULT_VNET(curvnet)) pf_mtag_cleanup(); + + pf_cleanup_tagset(&V_pf_tags); +#ifdef ALTQ + pf_cleanup_tagset(&V_pf_qids); +#endif + uma_zdestroy(V_pf_tag_z); + + /* Free counters last as we updated them during shutdown. */ + counter_u64_free(V_pf_default_rule.states_cur); + counter_u64_free(V_pf_default_rule.states_tot); + counter_u64_free(V_pf_default_rule.src_nodes); + + for (int i = 0; i < PFRES_MAX; i++) + counter_u64_free(V_pf_status.counters[i]); + for (int i = 0; i < LCNT_MAX; i++) + counter_u64_free(V_pf_status.lcounters[i]); + for (int i = 0; i < FCNT_MAX; i++) + counter_u64_free(V_pf_status.fcounters[i]); + for (int i = 0; i < SCNT_MAX; i++) + counter_u64_free(V_pf_status.scounters[i]); } #endif /* __rtems__ */ diff --git a/freebsd/sys/netpfil/pf/pf_norm.c b/freebsd/sys/netpfil/pf/pf_norm.c index 9538e97c..eb25bbc8 100644 --- a/freebsd/sys/netpfil/pf/pf_norm.c +++ b/freebsd/sys/netpfil/pf/pf_norm.c @@ -838,11 +838,11 @@ pf_reassemble6(struct mbuf **m0, struct ip6_hdr *ip6, struct ip6_frag *fraghdr, } /* We have all the data. */ + frent = TAILQ_FIRST(&frag->fr_queue); + KASSERT(frent != NULL, ("frent != NULL")); extoff = frent->fe_extoff; maxlen = frag->fr_maxlen; frag_id = frag->fr_id; - frent = TAILQ_FIRST(&frag->fr_queue); - KASSERT(frent != NULL, ("frent != NULL")); total = TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_off + TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_len; hdrlen = frent->fe_hdrlen - sizeof(struct ip6_frag); @@ -1141,9 +1141,8 @@ pf_normalize_ip6(struct mbuf **m0, int dir, struct pfi_kif *kif, int off; struct ip6_ext ext; struct ip6_opt opt; - struct ip6_opt_jumbo jumbo; struct ip6_frag frag; - u_int32_t jumbolen = 0, plen; + u_int32_t plen; int optend; int ooff; u_int8_t proto; @@ -1187,6 +1186,11 @@ pf_normalize_ip6(struct mbuf **m0, int dir, struct pfi_kif *kif, if (sizeof(struct ip6_hdr) + IPV6_MAXPACKET < m->m_pkthdr.len) goto drop; + plen = ntohs(h->ip6_plen); + /* jumbo payload option not supported */ + if (plen == 0) + goto drop; + extoff = 0; off = sizeof(struct ip6_hdr); proto = h->ip6_nxt; @@ -1230,26 +1234,8 @@ pf_normalize_ip6(struct mbuf **m0, int dir, struct pfi_kif *kif, goto shortpkt; if (ooff + sizeof(opt) + opt.ip6o_len > optend) goto drop; - switch (opt.ip6o_type) { - case IP6OPT_JUMBO: - if (h->ip6_plen != 0) - goto drop; - if (!pf_pull_hdr(m, ooff, &jumbo, - sizeof(jumbo), NULL, NULL, - AF_INET6)) - goto shortpkt; - memcpy(&jumbolen, jumbo.ip6oj_jumbo_len, - sizeof(jumbolen)); - jumbolen = ntohl(jumbolen); - if (jumbolen <= IPV6_MAXPACKET) - goto drop; - if (sizeof(struct ip6_hdr) + jumbolen != - m->m_pkthdr.len) - goto drop; - break; - default: - break; - } + if (opt.ip6o_type == IP6OPT_JUMBO) + goto drop; ooff += sizeof(opt) + opt.ip6o_len; } while (ooff < optend); @@ -1262,13 +1248,6 @@ pf_normalize_ip6(struct mbuf **m0, int dir, struct pfi_kif *kif, } } while (!terminal); - /* jumbo payload option must be present, or plen > 0 */ - if (ntohs(h->ip6_plen) == 0) - plen = jumbolen; - else - plen = ntohs(h->ip6_plen); - if (plen == 0) - goto drop; if (sizeof(struct ip6_hdr) + plen > m->m_pkthdr.len) goto shortpkt; @@ -1277,10 +1256,6 @@ pf_normalize_ip6(struct mbuf **m0, int dir, struct pfi_kif *kif, return (PF_PASS); fragment: - /* Jumbo payload packets cannot be fragmented. */ - plen = ntohs(h->ip6_plen); - if (plen == 0 || jumbolen) - goto drop; if (sizeof(struct ip6_hdr) + plen > m->m_pkthdr.len) goto shortpkt; diff --git a/freebsd/sys/netpfil/pf/pf_table.c b/freebsd/sys/netpfil/pf/pf_table.c index 3f15fb0e..96ed849c 100644 --- a/freebsd/sys/netpfil/pf/pf_table.c +++ b/freebsd/sys/netpfil/pf/pf_table.c @@ -53,6 +53,8 @@ __FBSDID("$FreeBSD$"); #include <net/vnet.h> #include <net/pfvar.h> +#define DPFPRINTF(n, x) if (V_pf_status.debug >= (n)) printf x + #define ACCEPT_FLAGS(flags, oklist) \ do { \ if ((flags & ~(oklist)) & \ @@ -113,6 +115,7 @@ struct pfr_walktree { struct pfi_dynaddr *pfrw1_dyn; } pfrw_1; int pfrw_free; + int pfrw_flags; }; #define pfrw_addr pfrw_1.pfrw1_addr #define pfrw_astats pfrw_1.pfrw1_astats @@ -126,15 +129,16 @@ struct pfr_walktree { static MALLOC_DEFINE(M_PFTABLE, "pf_table", "pf(4) tables structures"); VNET_DEFINE_STATIC(uma_zone_t, pfr_kentry_z); #define V_pfr_kentry_z VNET(pfr_kentry_z) -VNET_DEFINE_STATIC(uma_zone_t, pfr_kcounters_z); -#define V_pfr_kcounters_z VNET(pfr_kcounters_z) static struct pf_addr pfr_ffaddr = { .addr32 = { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff } }; +static void pfr_copyout_astats(struct pfr_astats *, + const struct pfr_kentry *, + const struct pfr_walktree *); static void pfr_copyout_addr(struct pfr_addr *, - struct pfr_kentry *ke); + const struct pfr_kentry *ke); static int pfr_validate_addr(struct pfr_addr *); static void pfr_enqueue_addrs(struct pfr_ktable *, struct pfr_kentryworkq *, int *, int); @@ -142,8 +146,12 @@ static void pfr_mark_addrs(struct pfr_ktable *); static struct pfr_kentry *pfr_lookup_addr(struct pfr_ktable *, struct pfr_addr *, int); +static bool pfr_create_kentry_counter(struct pfr_kcounters *, + int, int); static struct pfr_kentry *pfr_create_kentry(struct pfr_addr *); static void pfr_destroy_kentries(struct pfr_kentryworkq *); +static void pfr_destroy_kentry_counter(struct pfr_kcounters *, + int, int); static void pfr_destroy_kentry(struct pfr_kentry *); static void pfr_insert_kentries(struct pfr_ktable *, struct pfr_kentryworkq *, long); @@ -202,9 +210,6 @@ pfr_initialize(void) V_pfr_kentry_z = uma_zcreate("pf table entries", sizeof(struct pfr_kentry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); - V_pfr_kcounters_z = uma_zcreate("pf table counters", - sizeof(struct pfr_kcounters), NULL, NULL, NULL, NULL, - UMA_ALIGN_PTR, 0); V_pf_limits[PF_LIMIT_TABLE_ENTRIES].zone = V_pfr_kentry_z; V_pf_limits[PF_LIMIT_TABLE_ENTRIES].limit = PFR_KENTRY_HIWAT; } @@ -214,7 +219,6 @@ pfr_cleanup(void) { uma_zdestroy(V_pfr_kentry_z); - uma_zdestroy(V_pfr_kcounters_z); } int @@ -608,6 +612,13 @@ pfr_get_astats(struct pfr_table *tbl, struct pfr_astats *addr, int *size, w.pfrw_op = PFRW_GET_ASTATS; w.pfrw_astats = addr; w.pfrw_free = kt->pfrkt_cnt; + /* + * Flags below are for backward compatibility. It was possible to have + * a table without per-entry counters. Now they are always allocated, + * we just discard data when reading it if table is not configured to + * have counters. + */ + w.pfrw_flags = kt->pfrkt_flags; rv = kt->pfrkt_ip4->rnh_walktree(&kt->pfrkt_ip4->rh, pfr_walktree, &w); if (!rv) rv = kt->pfrkt_ip6->rnh_walktree(&kt->pfrkt_ip6->rh, @@ -774,10 +785,30 @@ pfr_lookup_addr(struct pfr_ktable *kt, struct pfr_addr *ad, int exact) return (ke); } +static bool +pfr_create_kentry_counter(struct pfr_kcounters *kc, int pfr_dir, int pfr_op) +{ + kc->pfrkc_packets[pfr_dir][pfr_op] = counter_u64_alloc(M_NOWAIT); + if (! kc->pfrkc_packets[pfr_dir][pfr_op]) + return (false); + + kc->pfrkc_bytes[pfr_dir][pfr_op] = counter_u64_alloc(M_NOWAIT); + if (! kc->pfrkc_bytes[pfr_dir][pfr_op]) { + /* Previous allocation will be freed through + * pfr_destroy_kentry() */ + return (false); + } + + kc->pfrkc_tzero = 0; + + return (true); +} + static struct pfr_kentry * pfr_create_kentry(struct pfr_addr *ad) { struct pfr_kentry *ke; + int pfr_dir, pfr_op; ke = uma_zalloc(V_pfr_kentry_z, M_NOWAIT | M_ZERO); if (ke == NULL) @@ -790,6 +821,14 @@ pfr_create_kentry(struct pfr_addr *ad) ke->pfrke_af = ad->pfra_af; ke->pfrke_net = ad->pfra_net; ke->pfrke_not = ad->pfra_not; + for (pfr_dir = 0; pfr_dir < PFR_DIR_MAX; pfr_dir ++) + for (pfr_op = 0; pfr_op < PFR_OP_ADDR_MAX; pfr_op ++) { + if (! pfr_create_kentry_counter(&ke->pfrke_counters, + pfr_dir, pfr_op)) { + pfr_destroy_kentry(ke); + return (NULL); + } + } return (ke); } @@ -805,10 +844,22 @@ pfr_destroy_kentries(struct pfr_kentryworkq *workq) } static void +pfr_destroy_kentry_counter(struct pfr_kcounters *kc, int pfr_dir, int pfr_op) +{ + counter_u64_free(kc->pfrkc_packets[pfr_dir][pfr_op]); + counter_u64_free(kc->pfrkc_bytes[pfr_dir][pfr_op]); +} + +static void pfr_destroy_kentry(struct pfr_kentry *ke) { - if (ke->pfrke_counters) - uma_zfree(V_pfr_kcounters_z, ke->pfrke_counters); + int pfr_dir, pfr_op; + + for (pfr_dir = 0; pfr_dir < PFR_DIR_MAX; pfr_dir ++) + for (pfr_op = 0; pfr_op < PFR_OP_ADDR_MAX; pfr_op ++) + pfr_destroy_kentry_counter(&ke->pfrke_counters, + pfr_dir, pfr_op); + uma_zfree(V_pfr_kentry_z, ke); } @@ -826,7 +877,7 @@ pfr_insert_kentries(struct pfr_ktable *kt, "(code=%d).\n", rv); break; } - p->pfrke_tzero = tzero; + p->pfrke_counters.pfrkc_tzero = tzero; n++; } kt->pfrkt_cnt += n; @@ -849,7 +900,7 @@ pfr_insert_kentry(struct pfr_ktable *kt, struct pfr_addr *ad, long tzero) if (rv) return (rv); - p->pfrke_tzero = tzero; + p->pfrke_counters.pfrkc_tzero = tzero; kt->pfrkt_cnt++; return (0); @@ -884,15 +935,20 @@ static void pfr_clstats_kentries(struct pfr_kentryworkq *workq, long tzero, int negchange) { struct pfr_kentry *p; + int pfr_dir, pfr_op; SLIST_FOREACH(p, workq, pfrke_workq) { if (negchange) p->pfrke_not = !p->pfrke_not; - if (p->pfrke_counters) { - uma_zfree(V_pfr_kcounters_z, p->pfrke_counters); - p->pfrke_counters = NULL; + for (pfr_dir = 0; pfr_dir < PFR_DIR_MAX; pfr_dir ++) { + for (pfr_op = 0; pfr_op < PFR_OP_ADDR_MAX; pfr_op ++) { + counter_u64_zero(p->pfrke_counters. + pfrkc_packets[pfr_dir][pfr_op]); + counter_u64_zero(p->pfrke_counters. + pfrkc_bytes[pfr_dir][pfr_op]); + } } - p->pfrke_tzero = tzero; + p->pfrke_counters.pfrkc_tzero = tzero; } } @@ -981,7 +1037,7 @@ pfr_unroute_kentry(struct pfr_ktable *kt, struct pfr_kentry *ke) } static void -pfr_copyout_addr(struct pfr_addr *ad, struct pfr_kentry *ke) +pfr_copyout_addr(struct pfr_addr *ad, const struct pfr_kentry *ke) { bzero(ad, sizeof(*ad)); if (ke == NULL) @@ -995,6 +1051,33 @@ pfr_copyout_addr(struct pfr_addr *ad, struct pfr_kentry *ke) ad->pfra_ip6addr = ke->pfrke_sa.sin6.sin6_addr; } +static void +pfr_copyout_astats(struct pfr_astats *as, const struct pfr_kentry *ke, + const struct pfr_walktree *w) +{ + int dir, op; + const struct pfr_kcounters *kc = &ke->pfrke_counters; + + pfr_copyout_addr(&as->pfras_a, ke); + as->pfras_tzero = kc->pfrkc_tzero; + + if (! (w->pfrw_flags & PFR_TFLAG_COUNTERS)) { + bzero(as->pfras_packets, sizeof(as->pfras_packets)); + bzero(as->pfras_bytes, sizeof(as->pfras_bytes)); + as->pfras_a.pfra_fback = PFR_FB_NOCOUNT; + return; + } + + for (dir = 0; dir < PFR_DIR_MAX; dir ++) { + for (op = 0; op < PFR_OP_ADDR_MAX; op ++) { + as->pfras_packets[dir][op] = + counter_u64_fetch(kc->pfrkc_packets[dir][op]); + as->pfras_bytes[dir][op] = + counter_u64_fetch(kc->pfrkc_bytes[dir][op]); + } + } +} + static int pfr_walktree(struct radix_node *rn, void *arg) { @@ -1023,19 +1106,7 @@ pfr_walktree(struct radix_node *rn, void *arg) if (w->pfrw_free-- > 0) { struct pfr_astats as; - pfr_copyout_addr(&as.pfras_a, ke); - - if (ke->pfrke_counters) { - bcopy(ke->pfrke_counters->pfrkc_packets, - as.pfras_packets, sizeof(as.pfras_packets)); - bcopy(ke->pfrke_counters->pfrkc_bytes, - as.pfras_bytes, sizeof(as.pfras_bytes)); - } else { - bzero(as.pfras_packets, sizeof(as.pfras_packets)); - bzero(as.pfras_bytes, sizeof(as.pfras_bytes)); - as.pfras_a.pfra_fback = PFR_FB_NOCOUNT; - } - as.pfras_tzero = ke->pfrke_tzero; + pfr_copyout_astats(&as, ke, w); bcopy(&as, w->pfrw_astats, sizeof(as)); w->pfrw_astats++; @@ -1260,6 +1331,7 @@ pfr_get_tstats(struct pfr_table *filter, struct pfr_tstats *tbl, int *size, struct pfr_ktableworkq workq; int n, nn; long tzero = time_second; + int pfr_dir, pfr_op; /* XXX PFR_FLAG_CLSTATS disabled */ ACCEPT_FLAGS(flags, PFR_FLAG_ALLRSETS); @@ -1278,7 +1350,25 @@ pfr_get_tstats(struct pfr_table *filter, struct pfr_tstats *tbl, int *size, continue; if (n-- <= 0) continue; - bcopy(&p->pfrkt_ts, tbl++, sizeof(*tbl)); + bcopy(&p->pfrkt_kts.pfrts_t, &tbl->pfrts_t, + sizeof(struct pfr_table)); + for (pfr_dir = 0; pfr_dir < PFR_DIR_MAX; pfr_dir ++) { + for (pfr_op = 0; pfr_op < PFR_OP_TABLE_MAX; pfr_op ++) { + tbl->pfrts_packets[pfr_dir][pfr_op] = + counter_u64_fetch( + p->pfrkt_packets[pfr_dir][pfr_op]); + tbl->pfrts_bytes[pfr_dir][pfr_op] = + counter_u64_fetch( + p->pfrkt_bytes[pfr_dir][pfr_op]); + } + } + tbl->pfrts_match = counter_u64_fetch(p->pfrkt_match); + tbl->pfrts_nomatch = counter_u64_fetch(p->pfrkt_nomatch); + tbl->pfrts_tzero = p->pfrkt_tzero; + tbl->pfrts_cnt = p->pfrkt_cnt; + for (pfr_op = 0; pfr_op < PFR_REFCNT_MAX; pfr_op++) + tbl->pfrts_refcnt[pfr_op] = p->pfrkt_refcnt[pfr_op]; + tbl++; SLIST_INSERT_HEAD(&workq, p, pfrkt_workq); } if (flags & PFR_FLAG_CLSTATS) @@ -1612,7 +1702,7 @@ pfr_commit_ktable(struct pfr_ktable *kt, long tzero) q->pfrke_mark = 1; SLIST_INSERT_HEAD(&garbageq, p, pfrke_workq); } else { - p->pfrke_tzero = tzero; + p->pfrke_counters.pfrkc_tzero = tzero; SLIST_INSERT_HEAD(&addq, p, pfrke_workq); } } @@ -1796,14 +1886,20 @@ static void pfr_clstats_ktable(struct pfr_ktable *kt, long tzero, int recurse) { struct pfr_kentryworkq addrq; + int pfr_dir, pfr_op; if (recurse) { pfr_enqueue_addrs(kt, &addrq, NULL, 0); pfr_clstats_kentries(&addrq, tzero, 0); } - bzero(kt->pfrkt_packets, sizeof(kt->pfrkt_packets)); - bzero(kt->pfrkt_bytes, sizeof(kt->pfrkt_bytes)); - kt->pfrkt_match = kt->pfrkt_nomatch = 0; + for (pfr_dir = 0; pfr_dir < PFR_DIR_MAX; pfr_dir ++) { + for (pfr_op = 0; pfr_op < PFR_OP_TABLE_MAX; pfr_op ++) { + counter_u64_zero(kt->pfrkt_packets[pfr_dir][pfr_op]); + counter_u64_zero(kt->pfrkt_bytes[pfr_dir][pfr_op]); + } + } + counter_u64_zero(kt->pfrkt_match); + counter_u64_zero(kt->pfrkt_nomatch); kt->pfrkt_tzero = tzero; } @@ -1812,6 +1908,7 @@ pfr_create_ktable(struct pfr_table *tbl, long tzero, int attachruleset) { struct pfr_ktable *kt; struct pf_ruleset *rs; + int pfr_dir, pfr_op; PF_RULES_WASSERT(); @@ -1830,6 +1927,34 @@ pfr_create_ktable(struct pfr_table *tbl, long tzero, int attachruleset) rs->tables++; } + for (pfr_dir = 0; pfr_dir < PFR_DIR_MAX; pfr_dir ++) { + for (pfr_op = 0; pfr_op < PFR_OP_TABLE_MAX; pfr_op ++) { + kt->pfrkt_packets[pfr_dir][pfr_op] = + counter_u64_alloc(M_NOWAIT); + if (! kt->pfrkt_packets[pfr_dir][pfr_op]) { + pfr_destroy_ktable(kt, 0); + return (NULL); + } + kt->pfrkt_bytes[pfr_dir][pfr_op] = + counter_u64_alloc(M_NOWAIT); + if (! kt->pfrkt_bytes[pfr_dir][pfr_op]) { + pfr_destroy_ktable(kt, 0); + return (NULL); + } + } + } + kt->pfrkt_match = counter_u64_alloc(M_NOWAIT); + if (! kt->pfrkt_match) { + pfr_destroy_ktable(kt, 0); + return (NULL); + } + + kt->pfrkt_nomatch = counter_u64_alloc(M_NOWAIT); + if (! kt->pfrkt_nomatch) { + pfr_destroy_ktable(kt, 0); + return (NULL); + } + if (!rn_inithead((void **)&kt->pfrkt_ip4, offsetof(struct sockaddr_in, sin_addr) * 8) || !rn_inithead((void **)&kt->pfrkt_ip6, @@ -1857,6 +1982,7 @@ static void pfr_destroy_ktable(struct pfr_ktable *kt, int flushaddr) { struct pfr_kentryworkq addrq; + int pfr_dir, pfr_op; if (flushaddr) { pfr_enqueue_addrs(kt, &addrq, NULL, 0); @@ -1873,6 +1999,15 @@ pfr_destroy_ktable(struct pfr_ktable *kt, int flushaddr) kt->pfrkt_rs->tables--; pf_remove_if_empty_ruleset(kt->pfrkt_rs); } + for (pfr_dir = 0; pfr_dir < PFR_DIR_MAX; pfr_dir ++) { + for (pfr_op = 0; pfr_op < PFR_OP_TABLE_MAX; pfr_op ++) { + counter_u64_free(kt->pfrkt_packets[pfr_dir][pfr_op]); + counter_u64_free(kt->pfrkt_bytes[pfr_dir][pfr_op]); + } + } + counter_u64_free(kt->pfrkt_match); + counter_u64_free(kt->pfrkt_nomatch); + free(kt, M_PFTABLE); } @@ -1941,9 +2076,9 @@ pfr_match_addr(struct pfr_ktable *kt, struct pf_addr *a, sa_family_t af) } match = (ke && !ke->pfrke_not); if (match) - kt->pfrkt_match++; + counter_u64_add(kt->pfrkt_match, 1); else - kt->pfrkt_nomatch++; + counter_u64_add(kt->pfrkt_nomatch, 1); return (match); } @@ -1994,20 +2129,18 @@ pfr_update_stats(struct pfr_ktable *kt, struct pf_addr *a, sa_family_t af, } if ((ke == NULL || ke->pfrke_not) != notrule) { if (op_pass != PFR_OP_PASS) - printf("pfr_update_stats: assertion failed.\n"); + DPFPRINTF(PF_DEBUG_URGENT, + ("pfr_update_stats: assertion failed.\n")); op_pass = PFR_OP_XPASS; } - kt->pfrkt_packets[dir_out][op_pass]++; - kt->pfrkt_bytes[dir_out][op_pass] += len; + counter_u64_add(kt->pfrkt_packets[dir_out][op_pass], 1); + counter_u64_add(kt->pfrkt_bytes[dir_out][op_pass], len); if (ke != NULL && op_pass != PFR_OP_XPASS && (kt->pfrkt_flags & PFR_TFLAG_COUNTERS)) { - if (ke->pfrke_counters == NULL) - ke->pfrke_counters = uma_zalloc(V_pfr_kcounters_z, - M_NOWAIT | M_ZERO); - if (ke->pfrke_counters != NULL) { - ke->pfrke_counters->pfrkc_packets[dir_out][op_pass]++; - ke->pfrke_counters->pfrkc_bytes[dir_out][op_pass] += len; - } + counter_u64_add(ke->pfrke_counters. + pfrkc_packets[dir_out][op_pass], 1); + counter_u64_add(ke->pfrke_counters. + pfrkc_bytes[dir_out][op_pass], len); } } @@ -2097,7 +2230,7 @@ pfr_pool_get(struct pfr_ktable *kt, int *pidx, struct pf_addr *counter, _next_block: ke = pfr_kentry_byidx(kt, idx, af); if (ke == NULL) { - kt->pfrkt_nomatch++; + counter_u64_add(kt->pfrkt_nomatch, 1); return (1); } pfr_prepare_network(&umask, af, ke->pfrke_net); @@ -2122,7 +2255,7 @@ _next_block: /* this is a single IP address - no possible nested block */ PF_ACPY(counter, addr, af); *pidx = idx; - kt->pfrkt_match++; + counter_u64_add(kt->pfrkt_match, 1); return (0); } for (;;) { @@ -2142,7 +2275,7 @@ _next_block: /* lookup return the same block - perfect */ PF_ACPY(counter, addr, af); *pidx = idx; - kt->pfrkt_match++; + counter_u64_add(kt->pfrkt_match, 1); return (0); } diff --git a/freebsd/sys/opencrypto/cast.c b/freebsd/sys/opencrypto/cast.c index 1fb62f20..8031dabe 100644 --- a/freebsd/sys/opencrypto/cast.c +++ b/freebsd/sys/opencrypto/cast.c @@ -131,7 +131,7 @@ u_int32_t t, l, r; /***** Key Schedule *****/ -void cast_setkey(cast_key* key, u_int8_t* rawkey, int keybytes) +void cast_setkey(cast_key* key, const u_int8_t* rawkey, int keybytes) { u_int32_t t[4] = {0, 0, 0, 0}, z[4] = {0, 0, 0, 0}, x[4]; int i; diff --git a/freebsd/sys/opencrypto/cast.h b/freebsd/sys/opencrypto/cast.h index 8e2d0d19..2aca9340 100644 --- a/freebsd/sys/opencrypto/cast.h +++ b/freebsd/sys/opencrypto/cast.h @@ -16,7 +16,7 @@ typedef struct { int rounds; /* Number of rounds to use, 12 or 16 */ } cast_key; -void cast_setkey(cast_key * key, u_int8_t * rawkey, int keybytes); +void cast_setkey(cast_key * key, const u_int8_t * rawkey, int keybytes); void cast_encrypt(cast_key * key, u_int8_t * inblock, u_int8_t * outblock); void cast_decrypt(cast_key * key, u_int8_t * inblock, u_int8_t * outblock); diff --git a/freebsd/sys/opencrypto/cbc_mac.c b/freebsd/sys/opencrypto/cbc_mac.c new file mode 100644 index 00000000..1bcf356a --- /dev/null +++ b/freebsd/sys/opencrypto/cbc_mac.c @@ -0,0 +1,267 @@ +#include <machine/rtems-bsd-kernel-space.h> + +/* + * Copyright (c) 2018-2019 iXsystems Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/param.h> +#include <sys/endian.h> +#include <opencrypto/cbc_mac.h> +#include <opencrypto/xform_auth.h> + +/* + * Given two CCM_CBC_BLOCK_LEN blocks, xor + * them into dst, and then encrypt dst. + */ +static void +xor_and_encrypt(struct aes_cbc_mac_ctx *ctx, + const uint8_t *src, uint8_t *dst) +{ + const uint64_t *b1; + uint64_t *b2; + uint64_t temp_block[CCM_CBC_BLOCK_LEN/sizeof(uint64_t)]; + + b1 = (const uint64_t*)src; + b2 = (uint64_t*)dst; + + for (size_t count = 0; + count < CCM_CBC_BLOCK_LEN/sizeof(uint64_t); + count++) { + temp_block[count] = b1[count] ^ b2[count]; + } + rijndaelEncrypt(ctx->keysched, ctx->rounds, (void*)temp_block, dst); +} + +void +AES_CBC_MAC_Init(struct aes_cbc_mac_ctx *ctx) +{ + bzero(ctx, sizeof(*ctx)); +} + +void +AES_CBC_MAC_Setkey(struct aes_cbc_mac_ctx *ctx, const uint8_t *key, uint16_t klen) +{ + ctx->rounds = rijndaelKeySetupEnc(ctx->keysched, key, klen * 8); +} + +/* + * This is called to set the nonce, aka IV. + * Before this call, the authDataLength and cryptDataLength fields + * MUST have been set. Sadly, there's no way to return an error. + * + * The CBC-MAC algorithm requires that the first block contain the + * nonce, as well as information about the sizes and lengths involved. + */ +void +AES_CBC_MAC_Reinit(struct aes_cbc_mac_ctx *ctx, const uint8_t *nonce, uint16_t nonceLen) +{ + uint8_t b0[CCM_CBC_BLOCK_LEN]; + uint8_t *bp = b0, flags = 0; + uint8_t L = 0; + uint64_t dataLength = ctx->cryptDataLength; + + KASSERT(nonceLen >= 7 && nonceLen <= 13, + ("nonceLen must be between 7 and 13 bytes")); + + ctx->nonce = nonce; + ctx->nonceLength = nonceLen; + + ctx->authDataCount = 0; + ctx->blockIndex = 0; + explicit_bzero(ctx->staging_block, sizeof(ctx->staging_block)); + + /* + * Need to determine the L field value. This is the number of + * bytes needed to specify the length of the message; the length + * is whatever is left in the 16 bytes after specifying flags and + * the nonce. + */ + L = 15 - nonceLen; + + flags = ((ctx->authDataLength > 0) << 6) + + (((AES_CBC_MAC_HASH_LEN - 2) / 2) << 3) + + L - 1; + /* + * Now we need to set up the first block, which has flags, nonce, + * and the message length. + */ + b0[0] = flags; + bcopy(nonce, b0 + 1, nonceLen); + bp = b0 + 1 + nonceLen; + + /* Need to copy L' [aka L-1] bytes of cryptDataLength */ + for (uint8_t *dst = b0 + sizeof(b0) - 1; dst >= bp; dst--) { + *dst = dataLength; + dataLength >>= 8; + } + /* Now need to encrypt b0 */ + rijndaelEncrypt(ctx->keysched, ctx->rounds, b0, ctx->block); + /* If there is auth data, we need to set up the staging block */ + if (ctx->authDataLength) { + size_t addLength; + if (ctx->authDataLength < ((1<<16) - (1<<8))) { + uint16_t sizeVal = htobe16(ctx->authDataLength); + bcopy(&sizeVal, ctx->staging_block, sizeof(sizeVal)); + addLength = sizeof(sizeVal); + } else if (ctx->authDataLength < (1ULL<<32)) { + uint32_t sizeVal = htobe32(ctx->authDataLength); + ctx->staging_block[0] = 0xff; + ctx->staging_block[1] = 0xfe; + bcopy(&sizeVal, ctx->staging_block+2, sizeof(sizeVal)); + addLength = 2 + sizeof(sizeVal); + } else { + uint64_t sizeVal = htobe64(ctx->authDataLength); + ctx->staging_block[0] = 0xff; + ctx->staging_block[1] = 0xff; + bcopy(&sizeVal, ctx->staging_block+2, sizeof(sizeVal)); + addLength = 2 + sizeof(sizeVal); + } + ctx->blockIndex = addLength; + /* + * The length descriptor goes into the AAD buffer, so we + * need to account for it. + */ + ctx->authDataLength += addLength; + ctx->authDataCount = addLength; + } +} + +int +AES_CBC_MAC_Update(struct aes_cbc_mac_ctx *ctx, const uint8_t *data, + uint16_t length) +{ + size_t copy_amt; + + /* + * This will be called in one of two phases: + * (1) Applying authentication data, or + * (2) Applying the payload data. + * + * Because CBC-MAC puts the authentication data size before the + * data, subsequent calls won't be block-size-aligned. Which + * complicates things a fair bit. + * + * The payload data doesn't have that problem. + */ + + if (ctx->authDataCount < ctx->authDataLength) { + /* + * We need to process data as authentication data. + * Since we may be out of sync, we may also need + * to pad out the staging block. + */ + const uint8_t *ptr = data; + while (length > 0) { + + copy_amt = MIN(length, + sizeof(ctx->staging_block) - ctx->blockIndex); + + bcopy(ptr, ctx->staging_block + ctx->blockIndex, + copy_amt); + ptr += copy_amt; + length -= copy_amt; + ctx->authDataCount += copy_amt; + ctx->blockIndex += copy_amt; + ctx->blockIndex %= sizeof(ctx->staging_block); + + if (ctx->blockIndex == 0 || + ctx->authDataCount == ctx->authDataLength) { + /* + * We're done with this block, so we + * xor staging_block with block, and then + * encrypt it. + */ + xor_and_encrypt(ctx, ctx->staging_block, ctx->block); + bzero(ctx->staging_block, sizeof(ctx->staging_block)); + ctx->blockIndex = 0; + if (ctx->authDataCount >= ctx->authDataLength) + break; + } + } + /* + * We'd like to be able to check length == 0 and return + * here, but the way OCF calls us, length is always + * blksize (16, in this case). So we have to count on + * the fact that OCF calls us separately for the AAD and + * for the real data. + */ + return (0); + } + /* + * If we're here, then we're encoding payload data. + * This is marginally easier, except that _Update can + * be called with non-aligned update lengths. As a result, + * we still need to use the staging block. + */ + KASSERT((length + ctx->cryptDataCount) <= ctx->cryptDataLength, + ("More encryption data than allowed")); + + while (length) { + uint8_t *ptr; + + copy_amt = MIN(sizeof(ctx->staging_block) - ctx->blockIndex, + length); + ptr = ctx->staging_block + ctx->blockIndex; + bcopy(data, ptr, copy_amt); + data += copy_amt; + ctx->blockIndex += copy_amt; + ctx->cryptDataCount += copy_amt; + length -= copy_amt; + if (ctx->blockIndex == sizeof(ctx->staging_block)) { + /* We've got a full block */ + xor_and_encrypt(ctx, ctx->staging_block, ctx->block); + ctx->blockIndex = 0; + bzero(ctx->staging_block, sizeof(ctx->staging_block)); + } + } + return (0); +} + +void +AES_CBC_MAC_Final(uint8_t *buf, struct aes_cbc_mac_ctx *ctx) +{ + uint8_t s0[CCM_CBC_BLOCK_LEN]; + + /* + * We first need to check to see if we've got any data + * left over to encrypt. + */ + if (ctx->blockIndex != 0) { + xor_and_encrypt(ctx, ctx->staging_block, ctx->block); + ctx->cryptDataCount += ctx->blockIndex; + ctx->blockIndex = 0; + explicit_bzero(ctx->staging_block, sizeof(ctx->staging_block)); + } + bzero(s0, sizeof(s0)); + s0[0] = (15 - ctx->nonceLength) - 1; + bcopy(ctx->nonce, s0 + 1, ctx->nonceLength); + rijndaelEncrypt(ctx->keysched, ctx->rounds, s0, s0); + for (size_t indx = 0; indx < AES_CBC_MAC_HASH_LEN; indx++) + buf[indx] = ctx->block[indx] ^ s0[indx]; + explicit_bzero(s0, sizeof(s0)); +} diff --git a/freebsd/sys/opencrypto/cbc_mac.h b/freebsd/sys/opencrypto/cbc_mac.h new file mode 100644 index 00000000..33e61cc1 --- /dev/null +++ b/freebsd/sys/opencrypto/cbc_mac.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2014 The FreeBSD Foundation + * Copyright (c) 2018, iXsystems Inc. + * All rights reserved. + * + * This software was developed by Sean Eric Fagan, with lots of references + * to existing AES-CCM (gmac) code. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + * + */ + +#ifndef _CBC_CCM_H +# define _CBC_CCM_H + +# include <sys/types.h> +# include <crypto/rijndael/rijndael.h> + +# define CCM_CBC_BLOCK_LEN 16 /* 128 bits */ +# define CCM_CBC_MAX_DIGEST_LEN 16 +# define CCM_CBC_MIN_DIGEST_LEN 4 + +/* + * This is the authentication context structure; + * the encryption one is similar. + */ +struct aes_cbc_mac_ctx { + uint64_t authDataLength, authDataCount; + uint64_t cryptDataLength, cryptDataCount; + int blockIndex; + uint8_t staging_block[CCM_CBC_BLOCK_LEN]; + uint8_t block[CCM_CBC_BLOCK_LEN]; + const uint8_t *nonce; + int nonceLength; /* This one is in bytes, not bits! */ + /* AES state data */ + int rounds; + uint32_t keysched[4*(RIJNDAEL_MAXNR+1)]; +}; + +void AES_CBC_MAC_Init(struct aes_cbc_mac_ctx *); +void AES_CBC_MAC_Setkey(struct aes_cbc_mac_ctx *, const uint8_t *, uint16_t); +void AES_CBC_MAC_Reinit(struct aes_cbc_mac_ctx *, const uint8_t *, uint16_t); +int AES_CBC_MAC_Update(struct aes_cbc_mac_ctx *, const uint8_t *, uint16_t); +void AES_CBC_MAC_Final(uint8_t *, struct aes_cbc_mac_ctx *); + +#endif /* _CBC_CCM_H */ diff --git a/freebsd/sys/opencrypto/cryptodeflate.c b/freebsd/sys/opencrypto/cryptodeflate.c index 8ab063f4..30d0844a 100644 --- a/freebsd/sys/opencrypto/cryptodeflate.c +++ b/freebsd/sys/opencrypto/cryptodeflate.c @@ -31,7 +31,7 @@ /* * This file contains a wrapper around the deflate algo compression - * functions using the zlib library (see libkern/zlib.c and sys/zlib.h}) + * functions using the zlib library (see sys/contrib/zlib) */ #include <sys/cdefs.h> @@ -44,7 +44,7 @@ __FBSDID("$FreeBSD$"); #include <sys/kernel.h> #include <sys/sdt.h> #include <sys/systm.h> -#include <sys/zlib.h> +#include <contrib/zlib/zlib.h> #include <opencrypto/cryptodev.h> #include <opencrypto/deflate.h> @@ -52,16 +52,32 @@ __FBSDID("$FreeBSD$"); SDT_PROVIDER_DECLARE(opencrypto); SDT_PROBE_DEFINE2(opencrypto, deflate, deflate_global, entry, "int", "u_int32_t"); -SDT_PROBE_DEFINE5(opencrypto, deflate, deflate_global, bad, - "int", "int", "int", "int", "int"); -SDT_PROBE_DEFINE5(opencrypto, deflate, deflate_global, iter, - "int", "int", "int", "int", "int"); +SDT_PROBE_DEFINE6(opencrypto, deflate, deflate_global, bad, + "int", "int", "int", "int", "int", "int"); +SDT_PROBE_DEFINE6(opencrypto, deflate, deflate_global, iter, + "int", "int", "int", "int", "int", "int"); SDT_PROBE_DEFINE2(opencrypto, deflate, deflate_global, return, "int", "u_int32_t"); int window_inflate = -1 * MAX_WBITS; int window_deflate = -12; +static void * +crypto_zalloc(void *nil, u_int type, u_int size) +{ + void *ptr; + + ptr = malloc(type *size, M_CRYPTO_DATA, M_NOWAIT); + return ptr; +} + +static void +crypto_zfree(void *nil, void *ptr) +{ + + free(ptr, M_CRYPTO_DATA); +} + /* * This function takes a block of data and (de)compress it using the deflate * algorithm @@ -107,16 +123,16 @@ deflate_global(data, size, decomp, out) bufh = bufp = malloc(sizeof(*bufp) + (size_t)(size * i), M_CRYPTO_DATA, M_NOWAIT); if (bufp == NULL) { - SDT_PROBE5(opencrypto, deflate, deflate_global, bad, - decomp, 0, __LINE__, 0, 0); + SDT_PROBE6(opencrypto, deflate, deflate_global, bad, + decomp, 0, __LINE__, 0, 0, 0); goto bad2; } bufp->next = NULL; bufp->size = size * i; bzero(&zbuf, sizeof(z_stream)); - zbuf.zalloc = z_alloc; - zbuf.zfree = z_free; + zbuf.zalloc = crypto_zalloc; + zbuf.zfree = crypto_zfree; zbuf.opaque = Z_NULL; zbuf.next_in = data; /* Data that is going to be processed. */ zbuf.avail_in = size; /* Total length of data to be processed. */ @@ -127,8 +143,8 @@ deflate_global(data, size, decomp, out) deflateInit2(&zbuf, Z_DEFAULT_COMPRESSION, Z_METHOD, window_deflate, Z_MEMLEVEL, Z_DEFAULT_STRATEGY); if (error != Z_OK) { - SDT_PROBE5(opencrypto, deflate, deflate_global, bad, - decomp, error, __LINE__, 0, 0); + SDT_PROBE6(opencrypto, deflate, deflate_global, bad, + decomp, error, __LINE__, 0, 0, 0); goto bad; } @@ -136,24 +152,14 @@ deflate_global(data, size, decomp, out) error = decomp ? inflate(&zbuf, Z_SYNC_FLUSH) : deflate(&zbuf, Z_FINISH); if (error != Z_OK && error != Z_STREAM_END) { - /* - * Unfortunately we are limited to 5 arguments, - * thus use two probes. - */ - SDT_PROBE5(opencrypto, deflate, deflate_global, bad, - decomp, error, __LINE__, - zbuf.avail_in, zbuf.avail_out); - SDT_PROBE5(opencrypto, deflate, deflate_global, bad, + SDT_PROBE6(opencrypto, deflate, deflate_global, bad, decomp, error, __LINE__, - zbuf.state->dummy, zbuf.total_out); + zbuf.avail_in, zbuf.avail_out, zbuf.total_out); goto bad; } - SDT_PROBE5(opencrypto, deflate, deflate_global, iter, - decomp, error, __LINE__, - zbuf.avail_in, zbuf.avail_out); - SDT_PROBE5(opencrypto, deflate, deflate_global, iter, + SDT_PROBE6(opencrypto, deflate, deflate_global, iter, decomp, error, __LINE__, - zbuf.state->dummy, zbuf.total_out); + zbuf.avail_in, zbuf.avail_out, zbuf.total_out); if (decomp && zbuf.avail_in == 0 && error == Z_STREAM_END) { /* Done. */ break; @@ -167,8 +173,8 @@ deflate_global(data, size, decomp, out) p = malloc(sizeof(*p) + (size_t)(size * i), M_CRYPTO_DATA, M_NOWAIT); if (p == NULL) { - SDT_PROBE5(opencrypto, deflate, deflate_global, - bad, decomp, 0, __LINE__, 0, 0); + SDT_PROBE6(opencrypto, deflate, deflate_global, + bad, decomp, 0, __LINE__, 0, 0, 0); goto bad; } p->next = NULL; @@ -179,16 +185,9 @@ deflate_global(data, size, decomp, out) zbuf.avail_out = bufp->size; } else { /* Unexpect result. */ - /* - * Unfortunately we are limited to 5 arguments, - * thus, again, use two probes. - */ - SDT_PROBE5(opencrypto, deflate, deflate_global, bad, - decomp, error, __LINE__, - zbuf.avail_in, zbuf.avail_out); - SDT_PROBE5(opencrypto, deflate, deflate_global, bad, - decomp, error, __LINE__, - zbuf.state->dummy, zbuf.total_out); + SDT_PROBE6(opencrypto, deflate, deflate_global, + bad, decomp, error, __LINE__, + zbuf.avail_in, zbuf.avail_out, zbuf.total_out); goto bad; } } @@ -197,8 +196,8 @@ deflate_global(data, size, decomp, out) *out = malloc(result, M_CRYPTO_DATA, M_NOWAIT); if (*out == NULL) { - SDT_PROBE5(opencrypto, deflate, deflate_global, bad, - decomp, 0, __LINE__, 0, 0); + SDT_PROBE6(opencrypto, deflate, deflate_global, bad, + decomp, 0, __LINE__, 0, 0, 0); goto bad; } if (decomp) @@ -245,21 +244,3 @@ bad2: *out = NULL; return 0; } - -void * -z_alloc(nil, type, size) - void *nil; - u_int type, size; -{ - void *ptr; - - ptr = malloc(type *size, M_CRYPTO_DATA, M_NOWAIT); - return ptr; -} - -void -z_free(nil, ptr) - void *nil, *ptr; -{ - free(ptr, M_CRYPTO_DATA); -} diff --git a/freebsd/sys/opencrypto/cryptodev.c b/freebsd/sys/opencrypto/cryptodev.c index b569cbf7..575142f2 100644 --- a/freebsd/sys/opencrypto/cryptodev.c +++ b/freebsd/sys/opencrypto/cryptodev.c @@ -296,6 +296,11 @@ struct fcrypt { int sesn; }; +static struct timeval warninterval = { .tv_sec = 60, .tv_usec = 0 }; +SYSCTL_TIMEVAL_SEC(_kern, OID_AUTO, cryptodev_warn_interval, CTLFLAG_RW, + &warninterval, + "Delay in seconds between warnings of deprecated /dev/crypto algorithms"); + #ifndef __rtems__ static int cryptof_ioctl(struct file *, u_long, void *, struct ucred *, struct thread *); @@ -450,6 +455,9 @@ cryptof_ioctl( case CRYPTO_CHACHA20: txform = &enc_xform_chacha20; break; + case CRYPTO_AES_CCM_16: + txform = &enc_xform_ccm; + break; default: CRYPTDEB("invalid cipher"); @@ -494,6 +502,25 @@ cryptof_ioctl( thash = &auth_hash_nist_gmac_aes_256; break; + case CRYPTO_AES_CCM_CBC_MAC: + switch (sop->keylen) { + case 16: + thash = &auth_hash_ccm_cbc_mac_128; + break; + case 24: + thash = &auth_hash_ccm_cbc_mac_192; + break; + case 32: + thash = &auth_hash_ccm_cbc_mac_256; + break; + default: + CRYPTDEB("Invalid CBC MAC key size %d", + sop->keylen); + SDT_PROBE1(opencrypto, dev, ioctl, + error, __LINE__); + return (EINVAL); + } + break; #ifdef notdef case CRYPTO_MD5: thash = &auth_hash_md5; @@ -798,6 +825,47 @@ cod_free(struct cryptop_data *cod) free(cod, M_XDATA); } +static void +cryptodev_warn(struct csession *cse) +{ + static struct timeval arc4warn, blfwarn, castwarn, deswarn, md5warn; + static struct timeval skipwarn, tdeswarn; + + switch (cse->cipher) { + case CRYPTO_DES_CBC: + if (ratecheck(&deswarn, &warninterval)) + gone_in(13, "DES cipher via /dev/crypto"); + break; + case CRYPTO_3DES_CBC: + if (ratecheck(&tdeswarn, &warninterval)) + gone_in(13, "3DES cipher via /dev/crypto"); + break; + case CRYPTO_BLF_CBC: + if (ratecheck(&blfwarn, &warninterval)) + gone_in(13, "Blowfish cipher via /dev/crypto"); + break; + case CRYPTO_CAST_CBC: + if (ratecheck(&castwarn, &warninterval)) + gone_in(13, "CAST128 cipher via /dev/crypto"); + break; + case CRYPTO_SKIPJACK_CBC: + if (ratecheck(&skipwarn, &warninterval)) + gone_in(13, "Skipjack cipher via /dev/crypto"); + break; + case CRYPTO_ARC4: + if (ratecheck(&arc4warn, &warninterval)) + gone_in(13, "ARC4 cipher via /dev/crypto"); + break; + } + + switch (cse->mac) { + case CRYPTO_MD5_HMAC: + if (ratecheck(&md5warn, &warninterval)) + gone_in(13, "MD5-HMAC authenticator via /dev/crypto"); + break; + } +} + static int cryptodev_op( struct csession *cse, @@ -920,6 +988,7 @@ cryptodev_op( error = EINVAL; goto bail; } + cryptodev_warn(cse); again: /* @@ -1030,12 +1099,13 @@ cryptodev_aead( } /* - * For GCM, crd_len covers only the AAD. For other ciphers + * For GCM/CCM, crd_len covers only the AAD. For other ciphers * chained with an HMAC, crd_len covers both the AAD and the * cipher text. */ crda->crd_skip = 0; - if (cse->cipher == CRYPTO_AES_NIST_GCM_16) + if (cse->cipher == CRYPTO_AES_NIST_GCM_16 || + cse->cipher == CRYPTO_AES_CCM_16) crda->crd_len = caead->aadlen; else crda->crd_len = caead->aadlen + caead->len; @@ -1088,6 +1158,7 @@ cryptodev_aead( SDT_PROBE1(opencrypto, dev, ioctl, error, __LINE__); goto bail; } + cryptodev_warn(cse); again: /* * Let the dispatch run unlocked, then, interlock against the diff --git a/freebsd/sys/opencrypto/cryptodev.h b/freebsd/sys/opencrypto/cryptodev.h index 6431e6d8..bd71e518 100644 --- a/freebsd/sys/opencrypto/cryptodev.h +++ b/freebsd/sys/opencrypto/cryptodev.h @@ -63,10 +63,10 @@ #define _CRYPTO_CRYPTO_H_ #include <sys/ioccom.h> -#include <sys/_task.h> #ifdef _KERNEL #include <opencrypto/_cryptodev.h> +#include <sys/_task.h> #endif /* Some initial values */ @@ -86,6 +86,7 @@ #define SHA1_KPDK_HASH_LEN 20 #define AES_GMAC_HASH_LEN 16 #define POLY1305_HASH_LEN 16 +#define AES_CBC_MAC_HASH_LEN 16 /* Maximum hash algorithm result length */ #define HASH_MAX_LEN SHA2_512_HASH_LEN /* Keep this updated */ @@ -107,6 +108,9 @@ #define AES_128_GMAC_KEY_LEN 16 #define AES_192_GMAC_KEY_LEN 24 #define AES_256_GMAC_KEY_LEN 32 +#define AES_128_CBC_MAC_KEY_LEN 16 +#define AES_192_CBC_MAC_KEY_LEN 24 +#define AES_256_CBC_MAC_KEY_LEN 32 #define POLY1305_KEY_LEN 32 @@ -129,6 +133,7 @@ #define ARC4_IV_LEN 1 #define AES_GCM_IV_LEN 12 +#define AES_CCM_IV_LEN 12 #define AES_XTS_IV_LEN 8 #define AES_XTS_ALPHA 0x87 /* GF(2^128) generator polynomial */ @@ -199,7 +204,9 @@ #define CRYPTO_SHA2_384 36 #define CRYPTO_SHA2_512 37 #define CRYPTO_POLY1305 38 -#define CRYPTO_ALGORITHM_MAX 38 /* Keep updated - see below */ +#define CRYPTO_AES_CCM_CBC_MAC 39 /* auth side */ +#define CRYPTO_AES_CCM_16 40 /* cipher side */ +#define CRYPTO_ALGORITHM_MAX 40 /* Keep updated - see below */ #define CRYPTO_ALGO_VALID(x) ((x) >= CRYPTO_ALGORITHM_MIN && \ (x) <= CRYPTO_ALGORITHM_MAX) diff --git a/freebsd/sys/opencrypto/cryptosoft.c b/freebsd/sys/opencrypto/cryptosoft.c index 43455b48..5e63167a 100644 --- a/freebsd/sys/opencrypto/cryptosoft.c +++ b/freebsd/sys/opencrypto/cryptosoft.c @@ -64,6 +64,9 @@ __FBSDID("$FreeBSD$"); #include <sys/bus.h> #include <rtems/bsd/local/cryptodev_if.h> +_Static_assert(AES_CCM_IV_LEN == AES_GCM_IV_LEN, + "AES_GCM_IV_LEN must currently be the same as AES_CCM_IV_LEN"); + static int32_t swcr_id; u_int8_t hmac_ipad_buffer[HMAC_MAX_BLOCK_LEN]; @@ -508,6 +511,7 @@ swcr_authenc(struct cryptop *crp) caddr_t buf = (caddr_t)crp->crp_buf; uint32_t *blkp; int aadlen, blksz, i, ivlen, len, iskip, oskip, r; + int isccm = 0; ivlen = blksz = iskip = oskip = 0; @@ -522,13 +526,18 @@ swcr_authenc(struct cryptop *crp) sw = &ses->swcr_algorithms[i]; switch (sw->sw_alg) { + case CRYPTO_AES_CCM_16: case CRYPTO_AES_NIST_GCM_16: case CRYPTO_AES_NIST_GMAC: swe = sw; crde = crd; exf = swe->sw_exf; - ivlen = 12; + /* AES_CCM_IV_LEN and AES_GCM_IV_LEN are both 12 */ + ivlen = AES_CCM_IV_LEN; break; + case CRYPTO_AES_CCM_CBC_MAC: + isccm = 1; + /* FALLTHROUGH */ case CRYPTO_AES_128_NIST_GMAC: case CRYPTO_AES_192_NIST_GMAC: case CRYPTO_AES_256_NIST_GMAC: @@ -546,8 +555,26 @@ swcr_authenc(struct cryptop *crp) } if (crde == NULL || crda == NULL) return (EINVAL); + /* + * We need to make sure that the auth algorithm matches the + * encr algorithm. Specifically, for AES-GCM must go with + * AES NIST GMAC, and AES-CCM must go with CBC-MAC. + */ + if (crde->crd_alg == CRYPTO_AES_NIST_GCM_16) { + switch (crda->crd_alg) { + case CRYPTO_AES_128_NIST_GMAC: + case CRYPTO_AES_192_NIST_GMAC: + case CRYPTO_AES_256_NIST_GMAC: + break; /* Good! */ + default: + return (EINVAL); /* Not good! */ + } + } else if (crde->crd_alg == CRYPTO_AES_CCM_16 && + crda->crd_alg != CRYPTO_AES_CCM_CBC_MAC) + return (EINVAL); - if (crde->crd_alg == CRYPTO_AES_NIST_GCM_16 && + if ((crde->crd_alg == CRYPTO_AES_NIST_GCM_16 || + crde->crd_alg == CRYPTO_AES_CCM_16) && (crde->crd_flags & CRD_F_IV_EXPLICIT) == 0) return (EINVAL); @@ -578,6 +605,15 @@ swcr_authenc(struct cryptop *crp) } } + if (swa->sw_alg == CRYPTO_AES_CCM_CBC_MAC) { + /* + * AES CCM-CBC needs to know the length of + * both the auth data, and payload data, before + * doing the auth computation. + */ + ctx.aes_cbc_mac_ctx.authDataLength = crda->crd_len; + ctx.aes_cbc_mac_ctx.cryptDataLength = crde->crd_len; + } /* Supply MAC with IV */ if (axf->Reinit) axf->Reinit(&ctx, iv, ivlen); @@ -612,16 +648,30 @@ swcr_authenc(struct cryptop *crp) bzero(blk, blksz); crypto_copydata(crp->crp_flags, buf, crde->crd_skip + i, len, blk); + /* + * One of the problems with CCM+CBC is that the authentication + * is done on the unecncrypted data. As a result, we have + * to do the authentication update at different times, + * depending on whether it's CCM or not. + */ if (crde->crd_flags & CRD_F_ENCRYPT) { + if (isccm) + axf->Update(&ctx, blk, len); if (exf->encrypt_multi != NULL) exf->encrypt_multi(swe->sw_kschedule, blk, len); else exf->encrypt(swe->sw_kschedule, blk); - axf->Update(&ctx, blk, len); + if (!isccm) + axf->Update(&ctx, blk, len); crypto_copyback(crp->crp_flags, buf, crde->crd_skip + i, len, blk); } else { + if (isccm) { + KASSERT(exf->encrypt_multi == NULL, + ("assume CCM is single-block only")); + exf->decrypt(swe->sw_kschedule, blk); + } axf->Update(&ctx, blk, len); } } @@ -652,6 +702,11 @@ swcr_authenc(struct cryptop *crp) r = timingsafe_bcmp(aalg, uaalg, axf->hashsize); if (r == 0) { /* tag matches, decrypt data */ + if (isccm) { + KASSERT(exf->reinit != NULL, + ("AES-CCM reinit function must be set")); + exf->reinit(swe->sw_kschedule, iv); + } for (i = 0; i < crde->crd_len; i += blksz) { len = MIN(crde->crd_len - i, blksz); if (len < blksz) @@ -801,6 +856,9 @@ swcr_newsession(device_t dev, crypto_session_t cses, struct cryptoini *cri) case CRYPTO_AES_NIST_GCM_16: txf = &enc_xform_aes_nist_gcm; goto enccommon; + case CRYPTO_AES_CCM_16: + txf = &enc_xform_ccm; + goto enccommon; case CRYPTO_AES_NIST_GMAC: txf = &enc_xform_aes_nist_gmac; swd->sw_exf = txf; @@ -945,6 +1003,22 @@ swcr_newsession(device_t dev, crypto_session_t cses, struct cryptoini *cri) swd->sw_axf = axf; break; + case CRYPTO_AES_CCM_CBC_MAC: + switch (cri->cri_klen) { + case 128: + axf = &auth_hash_ccm_cbc_mac_128; + break; + case 192: + axf = &auth_hash_ccm_cbc_mac_192; + break; + case 256: + axf = &auth_hash_ccm_cbc_mac_256; + break; + default: + swcr_freesession(dev, cses); + return EINVAL; + } + goto auth4common; case CRYPTO_AES_128_NIST_GMAC: axf = &auth_hash_nist_gmac_aes_128; goto auth4common; @@ -1044,6 +1118,7 @@ swcr_freesession(device_t dev, crypto_session_t cses) case CRYPTO_CAMELLIA_CBC: case CRYPTO_NULL_CBC: case CRYPTO_CHACHA20: + case CRYPTO_AES_CCM_16: txf = swd->sw_exf; if (swd->sw_kschedule) @@ -1058,6 +1133,7 @@ swcr_freesession(device_t dev, crypto_session_t cses) case CRYPTO_SHA2_512_HMAC: case CRYPTO_RIPEMD160_HMAC: case CRYPTO_NULL_HMAC: + case CRYPTO_AES_CCM_CBC_MAC: axf = swd->sw_axf; if (swd->sw_ictx) { @@ -1203,6 +1279,8 @@ swcr_process(device_t dev, struct cryptop *crp, int hint) case CRYPTO_AES_128_NIST_GMAC: case CRYPTO_AES_192_NIST_GMAC: case CRYPTO_AES_256_NIST_GMAC: + case CRYPTO_AES_CCM_16: + case CRYPTO_AES_CCM_CBC_MAC: crp->crp_etype = swcr_authenc(crp); goto done; @@ -1293,6 +1371,8 @@ swcr_attach(device_t dev) REGISTER(CRYPTO_BLAKE2B); REGISTER(CRYPTO_BLAKE2S); REGISTER(CRYPTO_CHACHA20); + REGISTER(CRYPTO_AES_CCM_16); + REGISTER(CRYPTO_AES_CCM_CBC_MAC); REGISTER(CRYPTO_POLY1305); #undef REGISTER diff --git a/freebsd/sys/opencrypto/deflate.h b/freebsd/sys/opencrypto/deflate.h index d31a3bf2..1be746d7 100644 --- a/freebsd/sys/opencrypto/deflate.h +++ b/freebsd/sys/opencrypto/deflate.h @@ -36,16 +36,12 @@ #ifndef _CRYPTO_DEFLATE_H_ #define _CRYPTO_DEFLATE_H_ -#include <sys/zlib.h> - #define Z_METHOD 8 #define Z_MEMLEVEL 8 #define MINCOMP 2 /* won't be used, but must be defined */ #define ZBUF 10 u_int32_t deflate_global(u_int8_t *, u_int32_t, int, u_int8_t **); -void *z_alloc(void *, u_int, u_int); -void z_free(void *, void *); /* * We are going to use a combined allocation to hold the metadata diff --git a/freebsd/sys/opencrypto/skipjack.c b/freebsd/sys/opencrypto/skipjack.c index 047cf642..455d360d 100644 --- a/freebsd/sys/opencrypto/skipjack.c +++ b/freebsd/sys/opencrypto/skipjack.c @@ -67,7 +67,7 @@ static const u_int8_t ftable[0x100] = */ void -subkey_table_gen (u_int8_t *key, u_int8_t **key_tables) +subkey_table_gen (const u_int8_t *key, u_int8_t **key_tables) { int i, k; diff --git a/freebsd/sys/opencrypto/skipjack.h b/freebsd/sys/opencrypto/skipjack.h index 80367ea4..95b0b9e4 100644 --- a/freebsd/sys/opencrypto/skipjack.h +++ b/freebsd/sys/opencrypto/skipjack.h @@ -19,6 +19,6 @@ extern void skipjack_forwards(u_int8_t *plain, u_int8_t *cipher, u_int8_t **key); extern void skipjack_backwards(u_int8_t *cipher, u_int8_t *plain, u_int8_t **key); -extern void subkey_table_gen(u_int8_t *key, u_int8_t **key_tables); +extern void subkey_table_gen(const u_int8_t *key, u_int8_t **key_tables); #endif diff --git a/freebsd/sys/opencrypto/xform_aes_icm.c b/freebsd/sys/opencrypto/xform_aes_icm.c index 8d3694fa..ba3eca0a 100644 --- a/freebsd/sys/opencrypto/xform_aes_icm.c +++ b/freebsd/sys/opencrypto/xform_aes_icm.c @@ -52,11 +52,12 @@ __FBSDID("$FreeBSD$"); #include <opencrypto/xform_enc.h> -static int aes_icm_setkey(u_int8_t **, u_int8_t *, int); +static int aes_icm_setkey(u_int8_t **, const u_int8_t *, int); static void aes_icm_crypt(caddr_t, u_int8_t *); static void aes_icm_zerokey(u_int8_t **); -static void aes_icm_reinit(caddr_t, u_int8_t *); -static void aes_gcm_reinit(caddr_t, u_int8_t *); +static void aes_icm_reinit(caddr_t, const u_int8_t *); +static void aes_gcm_reinit(caddr_t, const u_int8_t *); +static void aes_ccm_reinit(caddr_t, const u_int8_t *); /* Encryption instances */ struct enc_xform enc_xform_aes_icm = { @@ -79,11 +80,23 @@ struct enc_xform enc_xform_aes_nist_gcm = { aes_gcm_reinit, }; +struct enc_xform enc_xform_ccm = { + .type = CRYPTO_AES_CCM_16, + .name = "AES-CCM", + .blocksize = AES_ICM_BLOCK_LEN, .ivsize = AES_CCM_IV_LEN, + .minkey = AES_MIN_KEY, .maxkey = AES_MAX_KEY, + .encrypt = aes_icm_crypt, + .decrypt = aes_icm_crypt, + .setkey = aes_icm_setkey, + .zerokey = aes_icm_zerokey, + .reinit = aes_ccm_reinit, +}; + /* * Encryption wrapper routines. */ static void -aes_icm_reinit(caddr_t key, u_int8_t *iv) +aes_icm_reinit(caddr_t key, const u_int8_t *iv) { struct aes_icm_ctx *ctx; @@ -92,7 +105,7 @@ aes_icm_reinit(caddr_t key, u_int8_t *iv) } static void -aes_gcm_reinit(caddr_t key, u_int8_t *iv) +aes_gcm_reinit(caddr_t key, const u_int8_t *iv) { struct aes_icm_ctx *ctx; @@ -105,6 +118,21 @@ aes_gcm_reinit(caddr_t key, u_int8_t *iv) } static void +aes_ccm_reinit(caddr_t key, const u_int8_t *iv) +{ + struct aes_icm_ctx *ctx; + + ctx = (struct aes_icm_ctx*)key; + + /* CCM has flags, then the IV, then the counter, which starts at 1 */ + bzero(ctx->ac_block, sizeof(ctx->ac_block)); + /* 3 bytes for length field; this gives a nonce of 12 bytes */ + ctx->ac_block[0] = (15 - AES_CCM_IV_LEN) - 1; + bcopy(iv, ctx->ac_block+1, AES_CCM_IV_LEN); + ctx->ac_block[AESICM_BLOCKSIZE - 1] = 1; +} + +static void aes_icm_crypt(caddr_t key, u_int8_t *data) { struct aes_icm_ctx *ctx; @@ -125,7 +153,7 @@ aes_icm_crypt(caddr_t key, u_int8_t *data) } static int -aes_icm_setkey(u_int8_t **sched, u_int8_t *key, int len) +aes_icm_setkey(u_int8_t **sched, const u_int8_t *key, int len) { struct aes_icm_ctx *ctx; @@ -138,7 +166,7 @@ aes_icm_setkey(u_int8_t **sched, u_int8_t *key, int len) return ENOMEM; ctx = (struct aes_icm_ctx *)*sched; - ctx->ac_nr = rijndaelKeySetupEnc(ctx->ac_ek, (u_char *)key, len * 8); + ctx->ac_nr = rijndaelKeySetupEnc(ctx->ac_ek, key, len * 8); return 0; } diff --git a/freebsd/sys/opencrypto/xform_aes_xts.c b/freebsd/sys/opencrypto/xform_aes_xts.c index dedbe627..33f66a5d 100644 --- a/freebsd/sys/opencrypto/xform_aes_xts.c +++ b/freebsd/sys/opencrypto/xform_aes_xts.c @@ -52,11 +52,11 @@ __FBSDID("$FreeBSD$"); #include <opencrypto/xform_enc.h> -static int aes_xts_setkey(u_int8_t **, u_int8_t *, int); +static int aes_xts_setkey(u_int8_t **, const u_int8_t *, int); static void aes_xts_encrypt(caddr_t, u_int8_t *); static void aes_xts_decrypt(caddr_t, u_int8_t *); static void aes_xts_zerokey(u_int8_t **); -static void aes_xts_reinit(caddr_t, u_int8_t *); +static void aes_xts_reinit(caddr_t, const u_int8_t *); /* Encryption instances */ struct enc_xform enc_xform_aes_xts = { @@ -73,7 +73,7 @@ struct enc_xform enc_xform_aes_xts = { * Encryption wrapper routines. */ static void -aes_xts_reinit(caddr_t key, u_int8_t *iv) +aes_xts_reinit(caddr_t key, const u_int8_t *iv) { struct aes_xts_ctx *ctx = (struct aes_xts_ctx *)key; u_int64_t blocknum; @@ -136,7 +136,7 @@ aes_xts_decrypt(caddr_t key, u_int8_t *data) } static int -aes_xts_setkey(u_int8_t **sched, u_int8_t *key, int len) +aes_xts_setkey(u_int8_t **sched, const u_int8_t *key, int len) { struct aes_xts_ctx *ctx; diff --git a/freebsd/sys/opencrypto/xform_auth.h b/freebsd/sys/opencrypto/xform_auth.h index 9af0f8e6..9b072625 100644 --- a/freebsd/sys/opencrypto/xform_auth.h +++ b/freebsd/sys/opencrypto/xform_auth.h @@ -42,6 +42,7 @@ #include <crypto/sha2/sha512.h> #include <opencrypto/rmd160.h> #include <opencrypto/gmac.h> +#include <opencrypto/cbc_mac.h> #include <opencrypto/cryptodev.h> #include <opencrypto/xform_userland.h> @@ -85,6 +86,9 @@ extern struct auth_hash auth_hash_nist_gmac_aes_256; extern struct auth_hash auth_hash_blake2b; extern struct auth_hash auth_hash_blake2s; extern struct auth_hash auth_hash_poly1305; +extern struct auth_hash auth_hash_ccm_cbc_mac_128; +extern struct auth_hash auth_hash_ccm_cbc_mac_192; +extern struct auth_hash auth_hash_ccm_cbc_mac_256; union authctx { MD5_CTX md5ctx; @@ -95,6 +99,7 @@ union authctx { SHA384_CTX sha384ctx; SHA512_CTX sha512ctx; struct aes_gmac_ctx aes_gmac_ctx; + struct aes_cbc_mac_ctx aes_cbc_mac_ctx; }; #endif /* _CRYPTO_XFORM_AUTH_H_ */ diff --git a/freebsd/sys/opencrypto/xform_blf.c b/freebsd/sys/opencrypto/xform_blf.c index b4be5f8d..d0432c99 100644 --- a/freebsd/sys/opencrypto/xform_blf.c +++ b/freebsd/sys/opencrypto/xform_blf.c @@ -53,7 +53,7 @@ __FBSDID("$FreeBSD$"); #include <crypto/blowfish/blowfish.h> #include <opencrypto/xform_enc.h> -static int blf_setkey(u_int8_t **, u_int8_t *, int); +static int blf_setkey(u_int8_t **, const u_int8_t *, int); static void blf_encrypt(caddr_t, u_int8_t *); static void blf_decrypt(caddr_t, u_int8_t *); static void blf_zerokey(u_int8_t **); @@ -104,7 +104,7 @@ blf_decrypt(caddr_t key, u_int8_t *blk) } static int -blf_setkey(u_int8_t **sched, u_int8_t *key, int len) +blf_setkey(u_int8_t **sched, const u_int8_t *key, int len) { int err; diff --git a/freebsd/sys/opencrypto/xform_cast5.c b/freebsd/sys/opencrypto/xform_cast5.c index 85b346eb..f4d9472d 100644 --- a/freebsd/sys/opencrypto/xform_cast5.c +++ b/freebsd/sys/opencrypto/xform_cast5.c @@ -53,7 +53,7 @@ __FBSDID("$FreeBSD$"); #include <opencrypto/cast.h> #include <opencrypto/xform_enc.h> -static int cast5_setkey(u_int8_t **, u_int8_t *, int); +static int cast5_setkey(u_int8_t **, const u_int8_t *, int); static void cast5_encrypt(caddr_t, u_int8_t *); static void cast5_decrypt(caddr_t, u_int8_t *); static void cast5_zerokey(u_int8_t **); @@ -85,7 +85,7 @@ cast5_decrypt(caddr_t key, u_int8_t *blk) } static int -cast5_setkey(u_int8_t **sched, u_int8_t *key, int len) +cast5_setkey(u_int8_t **sched, const u_int8_t *key, int len) { int err; diff --git a/freebsd/sys/opencrypto/xform_cbc_mac.c b/freebsd/sys/opencrypto/xform_cbc_mac.c new file mode 100644 index 00000000..1de2e976 --- /dev/null +++ b/freebsd/sys/opencrypto/xform_cbc_mac.c @@ -0,0 +1,57 @@ +#include <machine/rtems-bsd-kernel-space.h> + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <opencrypto/cbc_mac.h> +#include <opencrypto/xform_auth.h> + +/* Authentication instances */ +struct auth_hash auth_hash_ccm_cbc_mac_128 = { + .type = CRYPTO_AES_CCM_CBC_MAC, + .name = "CBC-CCM-AES-128", + .keysize = AES_128_CBC_MAC_KEY_LEN, + .hashsize = AES_CBC_MAC_HASH_LEN, + .ctxsize = sizeof(struct aes_cbc_mac_ctx), + .blocksize = CCM_CBC_BLOCK_LEN, + .Init = (void (*)(void *)) AES_CBC_MAC_Init, + .Setkey = + (void (*)(void *, const u_int8_t *, u_int16_t))AES_CBC_MAC_Setkey, + .Reinit = + (void (*)(void *, const u_int8_t *, u_int16_t)) AES_CBC_MAC_Reinit, + .Update = + (int (*)(void *, const u_int8_t *, u_int16_t)) AES_CBC_MAC_Update, + .Final = (void (*)(u_int8_t *, void *)) AES_CBC_MAC_Final, +}; +struct auth_hash auth_hash_ccm_cbc_mac_192 = { + .type = CRYPTO_AES_CCM_CBC_MAC, + .name = "CBC-CCM-AES-192", + .keysize = AES_192_CBC_MAC_KEY_LEN, + .hashsize = AES_CBC_MAC_HASH_LEN, + .ctxsize = sizeof(struct aes_cbc_mac_ctx), + .blocksize = CCM_CBC_BLOCK_LEN, + .Init = (void (*)(void *)) AES_CBC_MAC_Init, + .Setkey = + (void (*)(void *, const u_int8_t *, u_int16_t)) AES_CBC_MAC_Setkey, + .Reinit = + (void (*)(void *, const u_int8_t *, u_int16_t)) AES_CBC_MAC_Reinit, + .Update = + (int (*)(void *, const u_int8_t *, u_int16_t)) AES_CBC_MAC_Update, + .Final = (void (*)(u_int8_t *, void *)) AES_CBC_MAC_Final, +}; +struct auth_hash auth_hash_ccm_cbc_mac_256 = { + .type = CRYPTO_AES_CCM_CBC_MAC, + .name = "CBC-CCM-AES-256", + .keysize = AES_256_CBC_MAC_KEY_LEN, + .hashsize = AES_CBC_MAC_HASH_LEN, + .ctxsize = sizeof(struct aes_cbc_mac_ctx), + .blocksize = CCM_CBC_BLOCK_LEN, + .Init = (void (*)(void *)) AES_CBC_MAC_Init, + .Setkey = + (void (*)(void *, const u_int8_t *, u_int16_t)) AES_CBC_MAC_Setkey, + .Reinit = + (void (*)(void *, const u_int8_t *, u_int16_t)) AES_CBC_MAC_Reinit, + .Update = + (int (*)(void *, const u_int8_t *, u_int16_t)) AES_CBC_MAC_Update, + .Final = (void (*)(u_int8_t *, void *)) AES_CBC_MAC_Final, +}; diff --git a/freebsd/sys/opencrypto/xform_cml.c b/freebsd/sys/opencrypto/xform_cml.c index c807fa97..2f857fe6 100644 --- a/freebsd/sys/opencrypto/xform_cml.c +++ b/freebsd/sys/opencrypto/xform_cml.c @@ -53,7 +53,7 @@ __FBSDID("$FreeBSD$"); #include <crypto/camellia/camellia.h> #include <opencrypto/xform_enc.h> -static int cml_setkey(u_int8_t **, u_int8_t *, int); +static int cml_setkey(u_int8_t **, const u_int8_t *, int); static void cml_encrypt(caddr_t, u_int8_t *); static void cml_decrypt(caddr_t, u_int8_t *); static void cml_zerokey(u_int8_t **); @@ -87,7 +87,7 @@ cml_decrypt(caddr_t key, u_int8_t *blk) } static int -cml_setkey(u_int8_t **sched, u_int8_t *key, int len) +cml_setkey(u_int8_t **sched, const u_int8_t *key, int len) { int err; @@ -96,7 +96,7 @@ cml_setkey(u_int8_t **sched, u_int8_t *key, int len) *sched = KMALLOC(sizeof(camellia_ctx), M_CRYPTO_DATA, M_NOWAIT|M_ZERO); if (*sched != NULL) { - camellia_set_key((camellia_ctx *) *sched, (u_char *) key, + camellia_set_key((camellia_ctx *) *sched, key, len * 8); err = 0; } else diff --git a/freebsd/sys/opencrypto/xform_des1.c b/freebsd/sys/opencrypto/xform_des1.c index cbce5e29..0a778eef 100644 --- a/freebsd/sys/opencrypto/xform_des1.c +++ b/freebsd/sys/opencrypto/xform_des1.c @@ -53,7 +53,7 @@ __FBSDID("$FreeBSD$"); #include <crypto/des/des.h> #include <opencrypto/xform_enc.h> -static int des1_setkey(u_int8_t **, u_int8_t *, int); +static int des1_setkey(u_int8_t **, const u_int8_t *, int); static void des1_encrypt(caddr_t, u_int8_t *); static void des1_decrypt(caddr_t, u_int8_t *); static void des1_zerokey(u_int8_t **); @@ -75,23 +75,21 @@ struct enc_xform enc_xform_des = { static void des1_encrypt(caddr_t key, u_int8_t *blk) { - des_cblock *cb = (des_cblock *) blk; des_key_schedule *p = (des_key_schedule *) key; - des_ecb_encrypt(cb, cb, p[0], DES_ENCRYPT); + des_ecb_encrypt(blk, blk, p[0], DES_ENCRYPT); } static void des1_decrypt(caddr_t key, u_int8_t *blk) { - des_cblock *cb = (des_cblock *) blk; des_key_schedule *p = (des_key_schedule *) key; - des_ecb_encrypt(cb, cb, p[0], DES_DECRYPT); + des_ecb_encrypt(blk, blk, p[0], DES_DECRYPT); } static int -des1_setkey(u_int8_t **sched, u_int8_t *key, int len) +des1_setkey(u_int8_t **sched, const u_int8_t *key, int len) { des_key_schedule *p; int err; @@ -99,7 +97,7 @@ des1_setkey(u_int8_t **sched, u_int8_t *key, int len) p = KMALLOC(sizeof (des_key_schedule), M_CRYPTO_DATA, M_NOWAIT|M_ZERO); if (p != NULL) { - des_set_key((des_cblock *) key, p[0]); + des_set_key(key, p[0]); err = 0; } else err = ENOMEM; diff --git a/freebsd/sys/opencrypto/xform_des3.c b/freebsd/sys/opencrypto/xform_des3.c index 1b26b622..ea32a1ab 100644 --- a/freebsd/sys/opencrypto/xform_des3.c +++ b/freebsd/sys/opencrypto/xform_des3.c @@ -53,7 +53,7 @@ __FBSDID("$FreeBSD$"); #include <crypto/des/des.h> #include <opencrypto/xform_enc.h> -static int des3_setkey(u_int8_t **, u_int8_t *, int); +static int des3_setkey(u_int8_t **, const u_int8_t *, int); static void des3_encrypt(caddr_t, u_int8_t *); static void des3_decrypt(caddr_t, u_int8_t *); static void des3_zerokey(u_int8_t **); @@ -76,23 +76,21 @@ struct enc_xform enc_xform_3des = { static void des3_encrypt(caddr_t key, u_int8_t *blk) { - des_cblock *cb = (des_cblock *) blk; des_key_schedule *p = (des_key_schedule *) key; - des_ecb3_encrypt(cb, cb, p[0], p[1], p[2], DES_ENCRYPT); + des_ecb3_encrypt(blk, blk, p[0], p[1], p[2], DES_ENCRYPT); } static void des3_decrypt(caddr_t key, u_int8_t *blk) { - des_cblock *cb = (des_cblock *) blk; des_key_schedule *p = (des_key_schedule *) key; - des_ecb3_encrypt(cb, cb, p[0], p[1], p[2], DES_DECRYPT); + des_ecb3_encrypt(blk, blk, p[0], p[1], p[2], DES_DECRYPT); } static int -des3_setkey(u_int8_t **sched, u_int8_t *key, int len) +des3_setkey(u_int8_t **sched, const u_int8_t *key, int len) { des_key_schedule *p; int err; @@ -100,9 +98,9 @@ des3_setkey(u_int8_t **sched, u_int8_t *key, int len) p = KMALLOC(3*sizeof (des_key_schedule), M_CRYPTO_DATA, M_NOWAIT|M_ZERO); if (p != NULL) { - des_set_key((des_cblock *)(key + 0), p[0]); - des_set_key((des_cblock *)(key + 8), p[1]); - des_set_key((des_cblock *)(key + 16), p[2]); + des_set_key(key + 0, p[0]); + des_set_key(key + 8, p[1]); + des_set_key(key + 16, p[2]); err = 0; } else err = ENOMEM; diff --git a/freebsd/sys/opencrypto/xform_enc.h b/freebsd/sys/opencrypto/xform_enc.h index 545e0ec2..e2b87f5c 100644 --- a/freebsd/sys/opencrypto/xform_enc.h +++ b/freebsd/sys/opencrypto/xform_enc.h @@ -56,9 +56,9 @@ struct enc_xform { u_int16_t minkey, maxkey; void (*encrypt) (caddr_t, u_int8_t *); void (*decrypt) (caddr_t, u_int8_t *); - int (*setkey) (u_int8_t **, u_int8_t *, int len); + int (*setkey) (u_int8_t **, const u_int8_t *, int len); void (*zerokey) (u_int8_t **); - void (*reinit) (caddr_t, u_int8_t *); + void (*reinit) (caddr_t, const u_int8_t *); /* * Encrypt/decrypt 1+ blocks of input -- total size is 'len' bytes. * Len is guaranteed to be a multiple of the defined 'blocksize'. @@ -84,6 +84,7 @@ extern struct enc_xform enc_xform_aes_xts; extern struct enc_xform enc_xform_arc4; extern struct enc_xform enc_xform_camellia; extern struct enc_xform enc_xform_chacha20; +extern struct enc_xform enc_xform_ccm; struct aes_icm_ctx { u_int32_t ac_ek[4*(RIJNDAEL_MAXNR + 1)]; diff --git a/freebsd/sys/opencrypto/xform_null.c b/freebsd/sys/opencrypto/xform_null.c index 3c499b31..28f20bdf 100644 --- a/freebsd/sys/opencrypto/xform_null.c +++ b/freebsd/sys/opencrypto/xform_null.c @@ -53,7 +53,7 @@ __FBSDID("$FreeBSD$"); #include <opencrypto/xform_auth.h> #include <opencrypto/xform_enc.h> -static int null_setkey(u_int8_t **, u_int8_t *, int); +static int null_setkey(u_int8_t **, const u_int8_t *, int); static void null_encrypt(caddr_t, u_int8_t *); static void null_decrypt(caddr_t, u_int8_t *); static void null_zerokey(u_int8_t **); @@ -104,7 +104,7 @@ null_decrypt(caddr_t key, u_int8_t *blk) } static int -null_setkey(u_int8_t **sched, u_int8_t *key, int len) +null_setkey(u_int8_t **sched, const u_int8_t *key, int len) { *sched = NULL; return 0; diff --git a/freebsd/sys/opencrypto/xform_rijndael.c b/freebsd/sys/opencrypto/xform_rijndael.c index 2c974f3d..378e86c0 100644 --- a/freebsd/sys/opencrypto/xform_rijndael.c +++ b/freebsd/sys/opencrypto/xform_rijndael.c @@ -53,7 +53,7 @@ __FBSDID("$FreeBSD$"); #include <crypto/rijndael/rijndael.h> #include <opencrypto/xform_enc.h> -static int rijndael128_setkey(u_int8_t **, u_int8_t *, int); +static int rijndael128_setkey(u_int8_t **, const u_int8_t *, int); static void rijndael128_encrypt(caddr_t, u_int8_t *); static void rijndael128_decrypt(caddr_t, u_int8_t *); static void rijndael128_zerokey(u_int8_t **); @@ -87,7 +87,7 @@ rijndael128_decrypt(caddr_t key, u_int8_t *blk) } static int -rijndael128_setkey(u_int8_t **sched, u_int8_t *key, int len) +rijndael128_setkey(u_int8_t **sched, const u_int8_t *key, int len) { int err; @@ -96,7 +96,7 @@ rijndael128_setkey(u_int8_t **sched, u_int8_t *key, int len) *sched = KMALLOC(sizeof(rijndael_ctx), M_CRYPTO_DATA, M_NOWAIT|M_ZERO); if (*sched != NULL) { - rijndael_set_key((rijndael_ctx *) *sched, (u_char *) key, + rijndael_set_key((rijndael_ctx *) *sched, key, len * 8); err = 0; } else diff --git a/freebsd/sys/opencrypto/xform_skipjack.c b/freebsd/sys/opencrypto/xform_skipjack.c index 94090d0d..22d74b36 100644 --- a/freebsd/sys/opencrypto/xform_skipjack.c +++ b/freebsd/sys/opencrypto/xform_skipjack.c @@ -53,7 +53,7 @@ __FBSDID("$FreeBSD$"); #include <opencrypto/skipjack.h> #include <opencrypto/xform_enc.h> -static int skipjack_setkey(u_int8_t **, u_int8_t *, int); +static int skipjack_setkey(u_int8_t **, const u_int8_t *, int); static void skipjack_encrypt(caddr_t, u_int8_t *); static void skipjack_decrypt(caddr_t, u_int8_t *); static void skipjack_zerokey(u_int8_t **); @@ -85,7 +85,7 @@ skipjack_decrypt(caddr_t key, u_int8_t *blk) } static int -skipjack_setkey(u_int8_t **sched, u_int8_t *key, int len) +skipjack_setkey(u_int8_t **sched, const u_int8_t *key, int len) { int err; diff --git a/freebsd/sys/powerpc/include/machine/cpufunc.h b/freebsd/sys/powerpc/include/machine/cpufunc.h index 204c4801..bad2042b 100644 --- a/freebsd/sys/powerpc/include/machine/cpufunc.h +++ b/freebsd/sys/powerpc/include/machine/cpufunc.h @@ -212,6 +212,43 @@ get_pcpu(void) return (ret); } +/* "NOP" operations to signify priorities to the kernel. */ +static __inline void +nop_prio_vlow(void) +{ + __asm __volatile("or 31,31,31"); +} + +static __inline void +nop_prio_low(void) +{ + __asm __volatile("or 1,1,1"); +} + +static __inline void +nop_prio_mlow(void) +{ + __asm __volatile("or 6,6,6"); +} + +static __inline void +nop_prio_medium(void) +{ + __asm __volatile("or 2,2,2"); +} + +static __inline void +nop_prio_mhigh(void) +{ + __asm __volatile("or 5,5,5"); +} + +static __inline void +nop_prio_high(void) +{ + __asm __volatile("or 3,3,3"); +} + #endif /* _KERNEL */ #endif /* !_MACHINE_CPUFUNC_H_ */ diff --git a/freebsd/sys/powerpc/include/machine/intr_machdep.h b/freebsd/sys/powerpc/include/machine/intr_machdep.h index 6ece0fa8..7ac54253 100644 --- a/freebsd/sys/powerpc/include/machine/intr_machdep.h +++ b/freebsd/sys/powerpc/include/machine/intr_machdep.h @@ -54,7 +54,7 @@ u_int powerpc_get_irq(uint32_t, u_int); void powerpc_dispatch_intr(u_int, struct trapframe *); int powerpc_enable_intr(void); int powerpc_setup_intr(const char *, u_int, driver_filter_t, driver_intr_t, - void *, enum intr_type, void **); + void *, enum intr_type, void **, int); int powerpc_teardown_intr(void *); int powerpc_bind_intr(u_int irq, u_char cpu); int powerpc_config_intr(int, enum intr_trigger, enum intr_polarity); diff --git a/freebsd/sys/powerpc/include/machine/spr.h b/freebsd/sys/powerpc/include/machine/spr.h index 8fa828e1..807b1a0d 100644 --- a/freebsd/sys/powerpc/include/machine/spr.h +++ b/freebsd/sys/powerpc/include/machine/spr.h @@ -93,11 +93,12 @@ #define SPR_MQ 0x000 /* .6. 601 MQ register */ #define SPR_XER 0x001 /* 468 Fixed Point Exception Register */ +#define SPR_DSCR 0x003 /* .6. Data Stream Control Register (Unprivileged) */ #define SPR_RTCU_R 0x004 /* .6. 601 RTC Upper - Read */ #define SPR_RTCL_R 0x005 /* .6. 601 RTC Lower - Read */ #define SPR_LR 0x008 /* 468 Link Register */ #define SPR_CTR 0x009 /* 468 Count Register */ -#define SPR_DSCR 0x011 /* Data Stream Control Register */ +#define SPR_DSCRP 0x011 /* Data Stream Control Register (Privileged) */ #define SPR_DSISR 0x012 /* .68 DSI exception source */ #define DSISR_DIRECT 0x80000000 /* Direct-store error exception */ #define DSISR_NOTFOUND 0x40000000 /* Translation not found */ @@ -122,19 +123,27 @@ #define SPR_EID 0x051 /* ..8 Exception Interrupt ??? */ #define SPR_NRI 0x052 /* ..8 Exception Interrupt ??? */ #define SPR_FSCR 0x099 /* Facility Status and Control Register */ -#define FSCR_IC_MASK 0xFF00000000000000ULL /* FSCR[0:7] is Interrupt Cause */ -#define FSCR_IC_FP 0x0000000000000000ULL /* FP unavailable */ -#define FSCR_IC_VSX 0x0100000000000000ULL /* VSX unavailable */ -#define FSCR_IC_DSCR 0x0200000000000000ULL /* Access to the DSCR at SPRs 3 or 17 */ -#define FSCR_IC_PM 0x0300000000000000ULL /* Read or write access of a Performance Monitor SPR in group A */ -#define FSCR_IC_BHRB 0x0400000000000000ULL /* Execution of a BHRB Instruction */ -#define FSCR_IC_HTM 0x0500000000000000ULL /* Access to a Transactional Memory */ +#define FSCR_IC_MASK 0xFF00000000000000ULL /* FSCR[0:7] is Interrupt Cause */ +#define FSCR_IC_FP 0x0000000000000000ULL /* FP unavailable */ +#define FSCR_IC_VSX 0x0100000000000000ULL /* VSX unavailable */ +#define FSCR_IC_DSCR 0x0200000000000000ULL /* Access to the DSCR at SPRs 3 or 17 */ +#define FSCR_IC_PM 0x0300000000000000ULL /* Read or write access of a Performance Monitor SPR in group A */ +#define FSCR_IC_BHRB 0x0400000000000000ULL /* Execution of a BHRB Instruction */ +#define FSCR_IC_HTM 0x0500000000000000ULL /* Access to a Transactional Memory */ /* Reserved 0x0600000000000000ULL */ -#define FSCR_IC_EBB 0x0700000000000000ULL /* Access to Event-Based Branch */ -#define FSCR_IC_TAR 0x0800000000000000ULL /* Access to Target Address Register */ -#define FSCR_IC_STOP 0x0900000000000000ULL /* Access to the 'stop' instruction in privileged non-hypervisor state */ -#define FSCR_IC_MSG 0x0A00000000000000ULL /* Access to 'msgsndp' or 'msgclrp' instructions */ -#define FSCR_IC_SCV 0x0C00000000000000ULL /* Execution of a 'scv' instruction */ +#define FSCR_IC_EBB 0x0700000000000000ULL /* Access to Event-Based Branch */ +#define FSCR_IC_TAR 0x0800000000000000ULL /* Access to Target Address Register */ +#define FSCR_IC_STOP 0x0900000000000000ULL /* Access to the 'stop' instruction in privileged non-hypervisor state */ +#define FSCR_IC_MSG 0x0A00000000000000ULL /* Access to 'msgsndp' or 'msgclrp' instructions */ +#define FSCR_IC_LM 0x0A00000000000000ULL /* Access to load monitored facility */ +#define FSCR_IC_SCV 0x0C00000000000000ULL /* Execution of a 'scv' instruction */ +#define FSCR_SCV 0x0000000000001000 /* scv instruction available */ +#define FSCR_LM 0x0000000000000800 /* Load monitored facilities available */ +#define FSCR_MSGP 0x0000000000000400 /* msgsndp and SPRs available */ +#define FSCR_TAR 0x0000000000000100 /* TAR register available */ +#define FSCR_EBB 0x0000000000000080 /* Event-based branch available */ +#define FSCR_DSCR 0x0000000000000004 /* DSCR available in PR state */ +#define SPR_DPDES 0x0b0 /* .6. Directed Privileged Doorbell Exception State Register */ #define SPR_USPRG0 0x100 /* 4.. User SPR General 0 */ #define SPR_VRSAVE 0x100 /* .6. AltiVec VRSAVE */ #define SPR_SPRG0 0x110 /* 468 SPR General 0 */ @@ -188,6 +197,7 @@ #define IBMPOWERPCA2 0x0049 #define IBMPOWER7PLUS 0x004a #define IBMPOWER8E 0x004b +#define IBMPOWER8NVL 0x004c #define IBMPOWER8 0x004d #define IBMPOWER9 0x004e #define MPC860 0x0050 @@ -242,7 +252,10 @@ #define LPCR_PECE_ME (1ULL << 12) /* Machine Check and Hypervisor */ /* Maintenance exceptions */ #define SPR_LPID 0x13f /* Logical Partitioning Control */ +#define SPR_HMER 0x150 /* Hypervisor Maintenance Exception Register */ +#define SPR_HMEER 0x151 /* Hypervisor Maintenance Exception Enable Register */ +#define SPR_TIR 0x1be /* .6. Thread Identification Register */ #define SPR_PTCR 0x1d0 /* Partition Table Control Register */ #define SPR_SPEFSCR 0x200 /* ..8 Signal Processing Engine FSCR. */ #define SPEFSCR_SOVH 0x80000000 @@ -403,6 +416,16 @@ #define SPR_MD_TWC 0x31d /* ..8 DMMU tablewalk control */ #define SPR_MD_RPN 0x31e /* ..8 DMMU real (phys) page number */ #define SPR_MD_TW 0x31f /* ..8 MMU tablewalk scratch */ +#define SPR_BESCRS 0x320 /* .6. Branch Event Status and Control Set Register */ +#define SPR_BESCRSU 0x321 /* .6. Branch Event Status and Control Set Register (upper 32-bit) */ +#define SPR_BESCRR 0x322 /* .6. Branch Event Status and Control Reset Register */ +#define SPR_BESCRRU 0x323 /* .6. Branch Event Status and Control Register (upper 32-bit) */ +#define SPR_EBBHR 0x324 /* .6. Event-based Branch Handler Register */ +#define SPR_EBBRR 0x325 /* .6. Event-based Branch Return Register */ +#define SPR_BESCR 0x326 /* .6. Branch Event Status and Control Register */ +#define SPR_LMRR 0x32d /* .6. Load Monitored Region Register */ +#define SPR_LMSER 0x32e /* .6. Load Monitored Section Enable Register */ +#define SPR_TAR 0x32f /* .6. Branch Target Address Register */ #define SPR_MI_CAM 0x330 /* ..8 IMMU CAM entry read */ #define SPR_MI_RAM0 0x331 /* ..8 IMMU RAM entry read reg 0 */ #define SPR_MI_RAM1 0x332 /* ..8 IMMU RAM entry read reg 1 */ @@ -410,6 +433,19 @@ #define SPR_MD_RAM0 0x339 /* ..8 IMMU RAM entry read reg 0 */ #define SPR_MD_RAM1 0x33a /* ..8 IMMU RAM entry read reg 1 */ #define SPR_PSSCR 0x357 /* Processor Stop Status and Control Register (ISA 3.0) */ +#define PSSCR_PLS_S 60 +#define PSSCR_PLS_M (0xf << PSSCR_PLS_S) +#define PSSCR_SD (1 << 22) +#define PSSCR_ESL (1 << 21) +#define PSSCR_EC (1 << 20) +#define PSSCR_PSLL_S 16 +#define PSSCR_PSLL_M (0xf << PSSCR_PSLL_S) +#define PSSCR_TR_S 8 +#define PSSCR_TR_M (0x3 << PSSCR_TR_S) +#define PSSCR_MTL_S 4 +#define PSSCR_MTL_M (0xf << PSSCR_MTL_S) +#define PSSCR_RL_S 0 +#define PSSCR_RL_M (0xf << PSSCR_RL_S) #define SPR_PMCR 0x374 /* Processor Management Control Register */ #define SPR_UMMCR2 0x3a0 /* .6. User Monitor Mode Control Register 2 */ #define SPR_UMMCR0 0x3a8 /* .6. User Monitor Mode Control Register 0 */ diff --git a/freebsd/sys/sys/_eventhandler.h b/freebsd/sys/sys/_eventhandler.h new file mode 100644 index 00000000..f8f24d2b --- /dev/null +++ b/freebsd/sys/sys/_eventhandler.h @@ -0,0 +1,144 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 1999 Michael Smith <msmith@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SYS__EVENTHANDLER_H_ +#define _SYS__EVENTHANDLER_H_ + +#include <sys/queue.h> + +struct eventhandler_entry { + TAILQ_ENTRY(eventhandler_entry) ee_link; + int ee_priority; +#define EHE_DEAD_PRIORITY (-1) + void *ee_arg; +}; + +typedef struct eventhandler_entry *eventhandler_tag; + +/* + * You can optionally use the EVENTHANDLER_LIST and EVENTHANDLER_DIRECT macros + * to pre-define a symbol for the eventhandler list. This symbol can be used by + * EVENTHANDLER_DIRECT_INVOKE, which has the advantage of not needing to do a + * locked search of the global list of eventhandler lists. At least + * EVENTHANDLER_LIST_DEFINE must be be used for EVENTHANDLER_DIRECT_INVOKE to + * work. EVENTHANDLER_LIST_DECLARE is only needed if the call to + * EVENTHANDLER_DIRECT_INVOKE is in a different compilation unit from + * EVENTHANDLER_LIST_DEFINE. If the events are even relatively high frequency + * it is suggested that you directly define a list for them. + */ +struct eventhandler_list; +#define EVENTHANDLER_LIST_DECLARE(name) \ +extern struct eventhandler_list *_eventhandler_list_ ## name \ + +/* + * Event handlers need to be declared, but do not need to be defined. The + * declaration must be in scope wherever the handler is to be invoked. + */ +#define EVENTHANDLER_DECLARE(name, type) \ +struct eventhandler_entry_ ## name \ +{ \ + struct eventhandler_entry ee; \ + type eh_func; \ +}; \ +struct __hack + +#endif +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 1999 Michael Smith <msmith@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SYS__EVENTHANDLER_H_ +#define _SYS__EVENTHANDLER_H_ + +#include <sys/queue.h> + +struct eventhandler_entry { + TAILQ_ENTRY(eventhandler_entry) ee_link; + int ee_priority; +#define EHE_DEAD_PRIORITY (-1) + void *ee_arg; +}; + +typedef struct eventhandler_entry *eventhandler_tag; + +/* + * You can optionally use the EVENTHANDLER_LIST and EVENTHANDLER_DIRECT macros + * to pre-define a symbol for the eventhandler list. This symbol can be used by + * EVENTHANDLER_DIRECT_INVOKE, which has the advantage of not needing to do a + * locked search of the global list of eventhandler lists. At least + * EVENTHANDLER_LIST_DEFINE must be be used for EVENTHANDLER_DIRECT_INVOKE to + * work. EVENTHANDLER_LIST_DECLARE is only needed if the call to + * EVENTHANDLER_DIRECT_INVOKE is in a different compilation unit from + * EVENTHANDLER_LIST_DEFINE. If the events are even relatively high frequency + * it is suggested that you directly define a list for them. + */ +struct eventhandler_list; +#define EVENTHANDLER_LIST_DECLARE(name) \ +extern struct eventhandler_list *_eventhandler_list_ ## name \ + +/* + * Event handlers need to be declared, but do not need to be defined. The + * declaration must be in scope wherever the handler is to be invoked. + */ +#define EVENTHANDLER_DECLARE(name, type) \ +struct eventhandler_entry_ ## name \ +{ \ + struct eventhandler_entry ee; \ + type eh_func; \ +}; \ +struct __hack + +#endif diff --git a/freebsd/sys/sys/_lock.h b/freebsd/sys/sys/_lock.h index ae10254c..f5609101 100644 --- a/freebsd/sys/sys/_lock.h +++ b/freebsd/sys/sys/_lock.h @@ -44,4 +44,38 @@ struct lock_object { #endif /* __rtems__ */ }; +#ifdef _KERNEL +/* + * If any of WITNESS, INVARIANTS, or KTR_LOCK KTR tracing has been enabled, + * then turn on LOCK_DEBUG. When this option is on, extra debugging + * facilities such as tracking the file and line number of lock operations + * are enabled. Also, mutex locking operations are not inlined to avoid + * bloat from all the extra debugging code. We also have to turn on all the + * calling conventions for this debugging code in modules so that modules can + * work with both debug and non-debug kernels. + */ +#if (defined(KLD_MODULE) && !defined(KLD_TIED)) || defined(WITNESS) || defined(INVARIANTS) || \ + defined(LOCK_PROFILING) || defined(KTR) +#define LOCK_DEBUG 1 +#else +#define LOCK_DEBUG 0 +#endif + +/* + * In the LOCK_DEBUG case, use the filename and line numbers for debugging + * operations. Otherwise, use default values to avoid the unneeded bloat. + */ +#if LOCK_DEBUG > 0 +#define LOCK_FILE_LINE_ARG_DEF , const char *file, int line +#define LOCK_FILE_LINE_ARG , file, line +#define LOCK_FILE __FILE__ +#define LOCK_LINE __LINE__ +#else +#define LOCK_FILE_LINE_ARG_DEF +#define LOCK_FILE_LINE_ARG +#define LOCK_FILE NULL +#define LOCK_LINE 0 +#endif +#endif /* _KERNEL */ + #endif /* !_SYS__LOCK_H_ */ diff --git a/freebsd/sys/sys/_rwlock.h b/freebsd/sys/sys/_rwlock.h index 318592d5..6530cca8 100644 --- a/freebsd/sys/sys/_rwlock.h +++ b/freebsd/sys/sys/_rwlock.h @@ -2,7 +2,6 @@ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2006 John Baldwin <jhb@FreeBSD.org> - * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions diff --git a/freebsd/sys/sys/_task.h b/freebsd/sys/sys/_task.h index 392dc874..6ee48800 100644 --- a/freebsd/sys/sys/_task.h +++ b/freebsd/sys/sys/_task.h @@ -39,12 +39,11 @@ * field of struct task and the second argument is a count of how many * times the task was enqueued before the call to taskqueue_run(). * - * List of locks - * (c) const after init + * List of locks + * (c) const after init * (q) taskqueue lock */ typedef void task_fn_t(void *context, int pending); -typedef void gtask_fn_t(void *context); struct task { STAILQ_ENTRY(task) ta_link; /* (q) link for queue */ @@ -54,6 +53,10 @@ struct task { void *ta_context; /* (c) argument for handler */ }; +#ifdef _KERNEL + +typedef void gtask_fn_t(void *context); + struct gtask { STAILQ_ENTRY(gtask) ta_link; /* (q) link for queue */ uint16_t ta_flags; /* (q) state flags */ @@ -62,15 +65,6 @@ struct gtask { void *ta_context; /* (c) argument for handler */ }; -struct grouptask { - struct gtask gt_task; - void *gt_taskqueue; - LIST_ENTRY(grouptask) gt_list; - void *gt_uniq; -#define GROUPTASK_NAMELEN 32 - char gt_name[GROUPTASK_NAMELEN]; - int16_t gt_irq; - int16_t gt_cpu; -}; +#endif /* _KERNEL */ #endif /* !_SYS__TASK_H_ */ diff --git a/freebsd/sys/sys/ata.h b/freebsd/sys/sys/ata.h index f8a332c3..22edb557 100644 --- a/freebsd/sys/sys/ata.h +++ b/freebsd/sys/sys/ata.h @@ -66,7 +66,8 @@ struct ata_params { /*023*/ u_int8_t revision[8]; /* firmware revision */ /*027*/ u_int8_t model[40]; /* model name */ /*047*/ u_int16_t sectors_intr; /* sectors per interrupt */ -/*048*/ u_int16_t usedmovsd; /* double word read/write? */ +/*048*/ u_int16_t tcg; /* Trusted Computing Group */ +#define ATA_SUPPORT_TCG 0x0001 /*049*/ u_int16_t capabilities1; #define ATA_SUPPORT_DMA 0x0100 #define ATA_SUPPORT_LBA 0x0200 @@ -92,6 +93,12 @@ struct ata_params { /*057*/ u_int16_t current_size_1; /*058*/ u_int16_t current_size_2; /*059*/ u_int16_t multi; +#define ATA_SUPPORT_BLOCK_ERASE_EXT 0x8000 +#define ATA_SUPPORT_OVERWRITE_EXT 0x4000 +#define ATA_SUPPORT_CRYPTO_SCRAMBLE_EXT 0x2000 +#define ATA_SUPPORT_SANITIZE 0x1000 +#define ATA_SUPPORT_SANITIZE_ALLOWED 0x0800 +#define ATA_SUPPORT_ANTIFREEZE_LOCK_EXT 0x0400 #define ATA_MULTI_VALID 0x0100 /*060*/ u_int16_t lba_size_1; @@ -107,6 +114,7 @@ struct ata_params { /*069*/ u_int16_t support3; #define ATA_SUPPORT_RZAT 0x0020 #define ATA_SUPPORT_DRAT 0x4000 +#define ATA_ENCRYPTS_ALL_USER_DATA 0x0010 /* Self-encrypting drive */ #define ATA_SUPPORT_ZONE_MASK 0x0003 #define ATA_SUPPORT_ZONE_NR 0x0000 #define ATA_SUPPORT_ZONE_HOST_AWARE 0x0001 @@ -135,7 +143,8 @@ struct ata_params { /*77*/ u_int16_t satacapabilities2; #define ATA_SATA_CURR_GEN_MASK 0x0006 #define ATA_SUPPORT_NCQ_STREAM 0x0010 -#define ATA_SUPPORT_NCQ_QMANAGEMENT 0x0020 +#define ATA_SUPPORT_NCQ_NON_DATA 0x0020 +#define ATA_SUPPORT_NCQ_QMANAGEMENT ATA_SUPPORT_NCQ_NON_DATA #define ATA_SUPPORT_RCVSND_FPDMA_QUEUED 0x0040 /*78*/ u_int16_t satasupport; #define ATA_SUPPORT_NONZERO 0x0002 @@ -144,6 +153,7 @@ struct ata_params { #define ATA_SUPPORT_INORDERDATA 0x0010 #define ATA_SUPPORT_ASYNCNOTIF 0x0020 #define ATA_SUPPORT_SOFTSETPRESERVE 0x0040 +#define ATA_SUPPORT_NCQ_AUTOSENSE 0x0080 /*79*/ u_int16_t sataenabled; #define ATA_ENABLED_DAPST 0x0080 @@ -236,12 +246,15 @@ struct ata_params { #define ATA_SUPPORT_FREEFALL 0x0020 #define ATA_SUPPORT_SENSE_REPORT 0x0040 #define ATA_SUPPORT_EPC 0x0080 +#define ATA_SUPPORT_AMAX_ADDR 0x0100 +#define ATA_SUPPORT_DSN 0x0200 /*120*/ u_int16_t enabled2; #define ATA_ENABLED_WRITEREADVERIFY 0x0002 #define ATA_ENABLED_WRITEUNCORREXT 0x0004 #define ATA_ENABLED_FREEFALL 0x0020 #define ATA_ENABLED_SENSE_REPORT 0x0040 #define ATA_ENABLED_EPC 0x0080 +#define ATA_ENABLED_DSN 0x0200 u_int16_t reserved121[6]; /*127*/ u_int16_t removable_status; /*128*/ u_int16_t security_status; @@ -259,10 +272,23 @@ struct ata_params { /*162*/ u_int16_t cfa_kms_support; /*163*/ u_int16_t cfa_trueide_modes; /*164*/ u_int16_t cfa_memory_modes; - u_int16_t reserved165[4]; + u_int16_t reserved165[3]; +/*168*/ u_int16_t form_factor; +#define ATA_FORM_FACTOR_MASK 0x000f +#define ATA_FORM_FACTOR_NOT_REPORTED 0x0000 +#define ATA_FORM_FACTOR_5_25 0x0001 +#define ATA_FORM_FACTOR_3_5 0x0002 +#define ATA_FORM_FACTOR_2_5 0x0003 +#define ATA_FORM_FACTOR_1_8 0x0004 +#define ATA_FORM_FACTOR_SUB_1_8 0x0005 +#define ATA_FORM_FACTOR_MSATA 0x0006 +#define ATA_FORM_FACTOR_M_2 0x0007 +#define ATA_FORM_FACTOR_MICRO_SSD 0x0008 +#define ATA_FORM_FACTOR_C_FAST 0x0009 /*169*/ u_int16_t support_dsm; #define ATA_SUPPORT_DSM_TRIM 0x0001 - u_int16_t reserved170[6]; +/*170*/ u_int8_t product_id[8]; /* Additional Product Identifier */ + u_int16_t reserved174[2]; /*176*/ u_int8_t media_serial[60]; /*206*/ u_int16_t sct; u_int16_t reserved207[2]; @@ -393,6 +419,12 @@ struct ata_params { #define ATA_READ_LOG_DMA_EXT 0x47 /* read log DMA ext - PIO Data-In */ #define ATA_ZAC_MANAGEMENT_IN 0x4a /* ZAC management in */ #define ATA_ZM_REPORT_ZONES 0x00 /* report zones */ +#define ATA_WRITE_LOG_DMA_EXT 0x57 /* WRITE LOG DMA EXT */ +#define ATA_TRUSTED_NON_DATA 0x5b /* TRUSTED NON-DATA */ +#define ATA_TRUSTED_RECEIVE 0x5c /* TRUSTED RECEIVE */ +#define ATA_TRUSTED_RECEIVE_DMA 0x5d /* TRUSTED RECEIVE DMA */ +#define ATA_TRUSTED_SEND 0x5e /* TRUSTED SEND */ +#define ATA_TRUSTED_SEND_DMA 0x5f /* TRUSTED SEND DMA */ #define ATA_READ_FPDMA_QUEUED 0x60 /* read DMA NCQ */ #define ATA_WRITE_FPDMA_QUEUED 0x61 /* write DMA NCQ */ #define ATA_NCQ_NON_DATA 0x63 /* NCQ non-data command */ @@ -412,15 +444,22 @@ struct ata_params { #define ATA_RFPDMA_ZAC_MGMT_IN 0x02 /* NCQ ZAC mgmt in w/data */ #define ATA_SEP_ATTN 0x67 /* SEP request */ #define ATA_SEEK 0x70 /* seek */ +#define ATA_AMAX_ADDR 0x78 /* Accessible Max Address */ +#define ATA_AMAX_ADDR_GET 0x00 /* GET NATIVE MAX ADDRESS EXT */ +#define ATA_AMAX_ADDR_SET 0x01 /* SET ACCESSIBLE MAX ADDRESS EXT */ +#define ATA_AMAX_ADDR_FREEZE 0x02 /* FREEZE ACCESSIBLE MAX ADDRESS EXT */ #define ATA_ZAC_MANAGEMENT_OUT 0x9f /* ZAC management out */ #define ATA_ZM_CLOSE_ZONE 0x01 /* close zone */ #define ATA_ZM_FINISH_ZONE 0x02 /* finish zone */ #define ATA_ZM_OPEN_ZONE 0x03 /* open zone */ #define ATA_ZM_RWP 0x04 /* reset write pointer */ +#define ATA_DOWNLOAD_MICROCODE 0x92 /* DOWNLOAD MICROCODE */ +#define ATA_DOWNLOAD_MICROCODE_DMA 0x93 /* DOWNLOAD MICROCODE DMA */ #define ATA_PACKET_CMD 0xa0 /* packet command */ #define ATA_ATAPI_IDENTIFY 0xa1 /* get ATAPI params*/ #define ATA_SERVICE 0xa2 /* service command */ #define ATA_SMART_CMD 0xb0 /* SMART command */ +#define ATA_SANITIZE 0xb4 /* sanitize device */ #define ATA_CFA_ERASE 0xc0 /* CFA erase */ #define ATA_READ_MUL 0xc4 /* read multi */ #define ATA_WRITE_MUL 0xc5 /* write multi */ @@ -439,8 +478,11 @@ struct ata_params { #define ATA_CHECK_POWER_MODE 0xe5 /* device power mode */ #define ATA_SLEEP 0xe6 /* sleep */ #define ATA_FLUSHCACHE 0xe7 /* flush cache to disk */ +#define ATA_WRITE_BUFFER 0xe8 /* write buffer */ #define ATA_WRITE_PM 0xe8 /* write portmultiplier */ +#define ATA_READ_BUFFER_DMA 0xe9 /* read buffer DMA */ #define ATA_FLUSHCACHE48 0xea /* flush cache to disk */ +#define ATA_WRITE_BUFFER_DMA 0xeb /* write buffer DMA */ #define ATA_ATA_IDENTIFY 0xec /* get ATA params */ #define ATA_SETFEATURES 0xef /* features command */ #define ATA_SF_ENAB_WCACHE 0x02 /* enable write cache */ diff --git a/freebsd/sys/sys/blist.h b/freebsd/sys/sys/blist.h index 1e6deb52..87f492c8 100644 --- a/freebsd/sys/sys/blist.h +++ b/freebsd/sys/sys/blist.h @@ -33,7 +33,7 @@ * Usage: * blist = blist_create(blocks, flags) * (void) blist_destroy(blist) - * blkno = blist_alloc(blist, count) + * blkno = blist_alloc(blist, &count, maxcount) * (void) blist_free(blist, blkno, count) * nblks = blist_fill(blist, blkno, count) * (void) blist_resize(&blist, count, freeextra, flags) @@ -73,17 +73,17 @@ typedef unsigned long u_daddr_t; /* unsigned disk address */ #define SWAPBLK_NONE ((daddr_t)((u_daddr_t)SWAPBLK_MASK + 1))/* flag */ /* - * Both blmeta and bmu_bitmap MUST be a power of 2 in size. + * Both blmeta and bm_bitmap MUST be a power of 2 in size. */ typedef struct blmeta { - u_daddr_t bm_bitmap; /* bitmap if we are a leaf */ + u_daddr_t bm_bitmap; /* marking unfilled block sets */ daddr_t bm_bighint; /* biggest contiguous block hint*/ } blmeta_t; typedef struct blist { daddr_t bl_blocks; /* area of coverage */ - daddr_t bl_avail; /* # available blocks */ + daddr_t bl_avail; /* # available blocks */ u_daddr_t bl_radix; /* coverage radix */ daddr_t bl_cursor; /* next-fit search starts at */ blmeta_t bl_root[1]; /* root of radix tree */ @@ -96,7 +96,7 @@ typedef struct blist { struct sbuf; -daddr_t blist_alloc(blist_t blist, daddr_t count); +daddr_t blist_alloc(blist_t blist, int *count, int maxcount); daddr_t blist_avail(blist_t blist); blist_t blist_create(daddr_t blocks, int flags); void blist_destroy(blist_t blist); diff --git a/freebsd/sys/sys/buf.h b/freebsd/sys/sys/buf.h index a099a972..f419617a 100644 --- a/freebsd/sys/sys/buf.h +++ b/freebsd/sys/sys/buf.h @@ -44,6 +44,7 @@ #include <sys/queue.h> #include <sys/lock.h> #include <sys/lockmgr.h> +#include <vm/uma.h> struct bio; struct buf; @@ -195,6 +196,11 @@ struct buf { * may not be used with the stage 1 data write under NFS * but may be used for the commit rpc portion. * + * B_INVALONERR This flag is set on dirty buffers. It specifies that a + * write error should forcibly invalidate the buffer + * contents. This flag should be used with caution, as it + * discards data. It is incompatible with B_ASYNC. + * * B_VMIO Indicates that the buffer is tied into an VM object. * The buffer's data is always PAGE_SIZE aligned even * if b_bufsize and b_bcount are not. ( b_bufsize is @@ -225,7 +231,7 @@ struct buf { #define B_NOCACHE 0x00008000 /* Do not cache block after use. */ #define B_MALLOC 0x00010000 /* malloced b_data */ #define B_CLUSTEROK 0x00020000 /* Pagein op, so swap() can count it. */ -#define B_00040000 0x00040000 /* Available flag. */ +#define B_INVALONERR 0x00040000 /* Invalidate on write error. */ #define B_00080000 0x00080000 /* Available flag. */ #define B_00100000 0x00100000 /* Available flag. */ #define B_00200000 0x00200000 /* Available flag. */ @@ -242,7 +248,7 @@ struct buf { #define PRINT_BUF_FLAGS "\20\40remfree\37cluster\36vmio\35ram\34managed" \ "\33paging\32infreecnt\31nocopy\30b23\27relbuf\26b21\25b20" \ - "\24b19\23b18\22clusterok\21malloc\20nocache\17b14\16inval" \ + "\24b19\23invalonerr\22clusterok\21malloc\20nocache\17b14\16inval" \ "\15reuse\14noreuse\13eintr\12done\11b8\10delwri" \ "\7validsuspwrt\6cache\5deferred\4direct\3async\2needcommit\1age" @@ -275,6 +281,11 @@ struct buf { #define PRINT_BUF_VFLAGS "\20\4bkgrderr\3bkgrdwait\2bkgrdinprog\1scanned" #ifdef _KERNEL + +#ifndef NSWBUF_MIN +#define NSWBUF_MIN 16 +#endif + /* * Buffer locking */ @@ -287,7 +298,7 @@ extern const char *buf_wmesg; /* Default buffer lock message */ * Initialize a lock. */ #define BUF_LOCKINIT(bp) \ - lockinit(&(bp)->b_lock, PRIBIO + 4, buf_wmesg, 0, 0) + lockinit(&(bp)->b_lock, PRIBIO + 4, buf_wmesg, 0, LK_NEW) /* * * Get a lock sleeping non-interruptably until it becomes available. @@ -353,15 +364,11 @@ extern const char *buf_wmesg; /* Default buffer lock message */ _lockmgr_assert(&(bp)->b_lock, KA_XLOCKED, LOCK_FILE, LOCK_LINE) #define BUF_ASSERT_UNLOCKED(bp) \ _lockmgr_assert(&(bp)->b_lock, KA_UNLOCKED, LOCK_FILE, LOCK_LINE) -#define BUF_ASSERT_HELD(bp) -#define BUF_ASSERT_UNHELD(bp) #else #define BUF_ASSERT_LOCKED(bp) #define BUF_ASSERT_SLOCKED(bp) #define BUF_ASSERT_XLOCKED(bp) #define BUF_ASSERT_UNLOCKED(bp) -#define BUF_ASSERT_HELD(bp) -#define BUF_ASSERT_UNHELD(bp) #endif #ifdef _SYS_PROC_H_ /* Avoid #include <sys/proc.h> pollution */ @@ -493,10 +500,6 @@ extern int bdwriteskip; extern int dirtybufferflushes; extern int altbufferflushes; extern int nswbuf; /* Number of swap I/O buffer headers. */ -extern int cluster_pbuf_freecnt; /* Number of pbufs for clusters */ -extern int vnode_pbuf_freecnt; /* Number of pbufs for vnode pager */ -extern int vnode_async_pbuf_freecnt; /* Number of pbufs for vnode pager, - asynchronous reads */ extern caddr_t unmapped_buf; /* Data address for unmapped buffers. */ static inline int @@ -537,7 +540,6 @@ void brelse(struct buf *); void bqrelse(struct buf *); int vfs_bio_awrite(struct buf *); void vfs_drain_busy_pages(struct buf *bp); -struct buf * getpbuf(int *); struct buf *incore(struct bufobj *, daddr_t); struct buf *gbincore(struct bufobj *, daddr_t); struct buf *getblk(struct vnode *, daddr_t, int, int, int, int); @@ -549,6 +551,9 @@ int bufwrite(struct buf *); void bufdone(struct buf *); void bd_speedup(void); +extern uma_zone_t pbuf_zone; +uma_zone_t pbuf_zsecond_create(char *name, int max); + int cluster_read(struct vnode *, u_quad_t, daddr_t, long, struct ucred *, long, int, int, struct buf **); int cluster_wbuild(struct vnode *, long, daddr_t, int, int); @@ -562,7 +567,6 @@ void vfs_busy_pages(struct buf *, int clear_modify); void vfs_unbusy_pages(struct buf *); int vmapbuf(struct buf *, int); void vunmapbuf(struct buf *); -void relpbuf(struct buf *, int *); void brelvp(struct buf *); void bgetvp(struct vnode *, struct buf *); void pbgetbo(struct bufobj *bo, struct buf *bp); @@ -571,7 +575,6 @@ void pbrelbo(struct buf *); void pbrelvp(struct buf *); int allocbuf(struct buf *bp, int size); void reassignbuf(struct buf *); -struct buf *trypbuf(int *); void bwait(struct buf *, u_char, const char *); void bdone(struct buf *); diff --git a/freebsd/sys/sys/buf_ring.h b/freebsd/sys/sys/buf_ring.h index e8c69341..b8b136bd 100644 --- a/freebsd/sys/sys/buf_ring.h +++ b/freebsd/sys/sys/buf_ring.h @@ -310,15 +310,24 @@ buf_ring_peek_clear_sc(struct buf_ring *br) if (!mtx_owned(br->br_lock)) panic("lock not held on single consumer dequeue"); #endif - /* - * I believe it is safe to not have a memory barrier - * here because we control cons and tail is worst case - * a lagging indicator so we worst case we might - * return NULL immediately after a buffer has been enqueued - */ + if (br->br_cons_head == br->br_prod_tail) return (NULL); +#if defined(__arm__) || defined(__aarch64__) + /* + * The barrier is required there on ARM and ARM64 to ensure, that + * br->br_ring[br->br_cons_head] will not be fetched before the above + * condition is checked. + * Without the barrier, it is possible, that buffer will be fetched + * before the enqueue will put mbuf into br, then, in the meantime, the + * enqueue will update the array and the br_prod_tail, and the + * conditional check will be true, so we will return previously fetched + * (and invalid) buffer. + */ + atomic_thread_fence_acq(); +#endif + #ifdef DEBUG_BUFRING /* * Single consumer, i.e. cons_head will not move while we are diff --git a/freebsd/sys/sys/bufobj.h b/freebsd/sys/sys/bufobj.h index b02d4276..b075644d 100644 --- a/freebsd/sys/sys/bufobj.h +++ b/freebsd/sys/sys/bufobj.h @@ -127,7 +127,7 @@ struct bufobj { #define ASSERT_BO_LOCKED(bo) rw_assert(BO_LOCKPTR((bo)), RA_LOCKED) #define ASSERT_BO_UNLOCKED(bo) rw_assert(BO_LOCKPTR((bo)), RA_UNLOCKED) -void bufobj_init(struct bufobj *bo, void *private); +void bufobj_init(struct bufobj *bo, void *priv); void bufobj_wdrop(struct bufobj *bo); void bufobj_wref(struct bufobj *bo); void bufobj_wrefl(struct bufobj *bo); diff --git a/freebsd/sys/sys/bus.h b/freebsd/sys/sys/bus.h index 8e1ba763..26acd639 100644 --- a/freebsd/sys/sys/bus.h +++ b/freebsd/sys/sys/bus.h @@ -130,6 +130,7 @@ struct devreq { #define DEV_DELETE _IOW('D', 10, struct devreq) #define DEV_FREEZE _IOW('D', 11, struct devreq) #define DEV_THAW _IOW('D', 12, struct devreq) +#define DEV_RESET _IOW('D', 13, struct devreq) /* Flags for DEV_DETACH and DEV_DISABLE. */ #define DEVF_FORCE_DETACH 0x0000001 @@ -143,10 +144,15 @@ struct devreq { /* Flags for DEV_DELETE. */ #define DEVF_FORCE_DELETE 0x0000001 +/* Flags for DEV_RESET */ +#define DEVF_RESET_DETACH 0x0000001 /* Detach drivers vs suspend + device */ + #ifdef _KERNEL -#include <sys/eventhandler.h> +#include <sys/_eventhandler.h> #include <sys/kobj.h> +#include <sys/systm.h> /** * devctl hooks. Typically one should use the devctl_notify @@ -414,6 +420,8 @@ void root_bus_configure(void); * Useful functions for implementing buses. */ +struct _cpuset; + int bus_generic_activate_resource(device_t dev, device_t child, int type, int rid, struct resource *r); device_t @@ -426,6 +434,8 @@ struct resource * bus_generic_alloc_resource(device_t bus, device_t child, int type, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags); +int bus_generic_translate_resource(device_t dev, int type, rman_res_t start, + rman_res_t *newstart); int bus_generic_attach(device_t dev); int bus_generic_bind_intr(device_t dev, device_t child, struct resource *irq, int cpu); @@ -494,6 +504,8 @@ int bus_generic_unmap_resource(device_t dev, device_t child, int type, struct resource_map *map); int bus_generic_write_ivar(device_t dev, device_t child, int which, uintptr_t value); +int bus_helper_reset_post(device_t dev, int flags); +int bus_helper_reset_prepare(device_t dev, int flags); int bus_null_rescan(device_t dev); /* @@ -802,16 +814,24 @@ DECLARE_MODULE(name##_##busname, name##_##busname##_mod, \ static __inline type varp ## _get_ ## var(device_t dev) \ { \ uintptr_t v; \ - BUS_READ_IVAR(device_get_parent(dev), dev, \ + int e; \ + e = BUS_READ_IVAR(device_get_parent(dev), dev, \ ivarp ## _IVAR_ ## ivar, &v); \ + KASSERT(e == 0, ("%s failed for %s on bus %s, error = %d", \ + __func__, device_get_nameunit(dev), \ + device_get_nameunit(device_get_parent(dev)), e)); \ return ((type) v); \ } \ \ static __inline void varp ## _set_ ## var(device_t dev, type t) \ { \ uintptr_t v = (uintptr_t) t; \ - BUS_WRITE_IVAR(device_get_parent(dev), dev, \ + int e; \ + e = BUS_WRITE_IVAR(device_get_parent(dev), dev, \ ivarp ## _IVAR_ ## ivar, v); \ + KASSERT(e == 0, ("%s failed for %s on bus %s, error = %d", \ + __func__, device_get_nameunit(dev), \ + device_get_nameunit(device_get_parent(dev)), e)); \ } #else /* __rtems__ */ #define __BUS_ACCESSOR(varp, var, ivarp, ivar, type) \ diff --git a/freebsd/sys/sys/bus_dma.h b/freebsd/sys/sys/bus_dma.h index eb2bc42d..e99d8ece 100644 --- a/freebsd/sys/sys/bus_dma.h +++ b/freebsd/sys/sys/bus_dma.h @@ -67,7 +67,9 @@ #ifndef _BUS_DMA_H_ #define _BUS_DMA_H_ +#ifdef _KERNEL #include <sys/_bus_dma.h> +#endif /* * Machine independent interface for mapping physical addresses to peripheral @@ -133,6 +135,7 @@ typedef struct bus_dma_segment { bus_size_t ds_len; /* length of transfer */ } bus_dma_segment_t; +#ifdef _KERNEL /* * A function that returns 1 if the address cannot be accessed by * a device and 0 if it can be. @@ -302,5 +305,6 @@ BUS_DMAMAP_OP void bus_dmamap_sync(bus_dma_tag_t dmat, bus_dmamap_t dmamap, bus_ BUS_DMAMAP_OP void bus_dmamap_unload(bus_dma_tag_t dmat, bus_dmamap_t dmamap); #undef BUS_DMAMAP_OP +#endif /* _KERNEL */ #endif /* _BUS_DMA_H_ */ diff --git a/freebsd/sys/sys/capsicum.h b/freebsd/sys/sys/capsicum.h index d40b8572..ee5e4267 100644 --- a/freebsd/sys/sys/capsicum.h +++ b/freebsd/sys/sys/capsicum.h @@ -246,7 +246,12 @@ /* Process management via process descriptors. */ /* Allows for pdgetpid(2). */ #define CAP_PDGETPID CAPRIGHT(1, 0x0000000000000200ULL) -/* Allows for pdwait4(2). */ +/* + * Allows for pdwait4(2). + * + * XXX: this constant was imported unused, but is targeted to be implemented + * in the future (bug 235871). + */ #define CAP_PDWAIT CAPRIGHT(1, 0x0000000000000400ULL) /* Allows for pdkill(2). */ #define CAP_PDKILL CAPRIGHT(1, 0x0000000000000800ULL) diff --git a/freebsd/sys/sys/conf.h b/freebsd/sys/sys/conf.h index c0a66442..caa34dea 100644 --- a/freebsd/sys/sys/conf.h +++ b/freebsd/sys/sys/conf.h @@ -43,7 +43,7 @@ #define _SYS_CONF_H_ #ifdef _KERNEL -#include <sys/eventhandler.h> +#include <sys/_eventhandler.h> #else #include <sys/queue.h> #endif @@ -186,7 +186,8 @@ typedef int dumper_hdr_t(struct dumperinfo *di, struct kerneldumpheader *kdh, #define D_VERSION_01 0x17032005 /* Add d_uid,gid,mode & kind */ #define D_VERSION_02 0x28042009 /* Add d_mmap_single */ #define D_VERSION_03 0x17122009 /* d_mmap takes memattr,vm_ooffset_t */ -#define D_VERSION D_VERSION_03 +#define D_VERSION_04 0x5c48c353 /* SPECNAMELEN bumped to MAXNAMLEN */ +#define D_VERSION D_VERSION_04 /* * Flags used for internal housekeeping @@ -379,6 +380,10 @@ struct dumperinfo { off_t origdumpoff; /* Starting dump offset. */ struct kerneldumpcrypto *kdcrypto; /* Kernel dump crypto. */ struct kerneldumpcomp *kdcomp; /* Kernel dump compression. */ + + TAILQ_ENTRY(dumperinfo) di_next; + + char di_devname[]; }; #ifndef __rtems__ @@ -388,10 +393,10 @@ extern int dumping; /* system is dumping */ #endif /* __rtems__ */ int doadump(boolean_t); -int set_dumper(struct dumperinfo *di, const char *devname, struct thread *td, - uint8_t compression, uint8_t encryption, const uint8_t *key, - uint32_t encryptedkeysize, const uint8_t *encryptedkey); -int clear_dumper(struct thread *td); +struct diocskerneldump_arg; +int dumper_insert(const struct dumperinfo *di_template, const char *devname, + const struct diocskerneldump_arg *kda); +int dumper_remove(const char *devname, const struct diocskerneldump_arg *kda); int dump_start(struct dumperinfo *di, struct kerneldumpheader *kdh); int dump_append(struct dumperinfo *, void *, vm_offset_t, size_t); diff --git a/freebsd/sys/sys/counter.h b/freebsd/sys/sys/counter.h index 418141a5..9960a2c3 100644 --- a/freebsd/sys/sys/counter.h +++ b/freebsd/sys/sys/counter.h @@ -43,23 +43,23 @@ void counter_u64_zero(counter_u64_t); uint64_t counter_u64_fetch(counter_u64_t); #define COUNTER_ARRAY_ALLOC(a, n, wait) do { \ - for (int i = 0; i < (n); i++) \ - (a)[i] = counter_u64_alloc(wait); \ + for (int _i = 0; _i < (n); _i++) \ + (a)[_i] = counter_u64_alloc(wait); \ } while (0) #define COUNTER_ARRAY_FREE(a, n) do { \ - for (int i = 0; i < (n); i++) \ - counter_u64_free((a)[i]); \ + for (int _i = 0; _i < (n); _i++) \ + counter_u64_free((a)[_i]); \ } while (0) #define COUNTER_ARRAY_COPY(a, dstp, n) do { \ - for (int i = 0; i < (n); i++) \ - ((uint64_t *)(dstp))[i] = counter_u64_fetch((a)[i]);\ + for (int _i = 0; _i < (n); _i++) \ + ((uint64_t *)(dstp))[_i] = counter_u64_fetch((a)[_i]);\ } while (0) #define COUNTER_ARRAY_ZERO(a, n) do { \ - for (int i = 0; i < (n); i++) \ - counter_u64_zero((a)[i]); \ + for (int _i = 0; _i < (n); _i++) \ + counter_u64_zero((a)[_i]); \ } while (0) /* diff --git a/freebsd/sys/sys/cpu.h b/freebsd/sys/sys/cpu.h index 8a74e470..7ec7dc9e 100644 --- a/freebsd/sys/sys/cpu.h +++ b/freebsd/sys/sys/cpu.h @@ -31,7 +31,7 @@ #ifndef _SYS_CPU_H_ #define _SYS_CPU_H_ -#include <sys/eventhandler.h> +#include <sys/_eventhandler.h> /* * CPU device support. diff --git a/freebsd/sys/sys/ctype.h b/freebsd/sys/sys/ctype.h index b2a1fa93..d542e45a 100644 --- a/freebsd/sys/sys/ctype.h +++ b/freebsd/sys/sys/ctype.h @@ -41,19 +41,65 @@ #ifdef _KERNEL -#define isspace(c) ((c) == ' ' || ((c) >= '\t' && (c) <= '\r')) -#define isascii(c) (((c) & ~0x7f) == 0) -#define isupper(c) ((c) >= 'A' && (c) <= 'Z') -#define islower(c) ((c) >= 'a' && (c) <= 'z') -#define isalpha(c) (isupper(c) || islower(c)) -#define isdigit(c) ((c) >= '0' && (c) <= '9') -#define isxdigit(c) (isdigit(c) \ - || ((c) >= 'A' && (c) <= 'F') \ - || ((c) >= 'a' && (c) <= 'f')) -#define isprint(c) ((c) >= ' ' && (c) <= '~') - -#define toupper(c) ((c) - 0x20 * (((c) >= 'a') && ((c) <= 'z'))) -#define tolower(c) ((c) + 0x20 * (((c) >= 'A') && ((c) <= 'Z'))) +static __inline int +isspace(int c) +{ + return (c == ' ' || (c >= '\t' && c <= '\r')); +} + +static __inline int +isascii(int c) +{ + return ((c & ~0x7f) == 0); +} + +static __inline int +isupper(int c) +{ + return (c >= 'A' && c <= 'Z'); +} + +static __inline int +islower(int c) +{ + return (c >= 'a' && c <= 'z'); +} + +static __inline int +isalpha(int c) +{ + return (isupper(c) || islower(c)); +} + +static __inline int +isdigit(int c) +{ + return (c >= '0' && c <= '9'); +} + +static __inline int +isxdigit(int c) +{ + return (isdigit(c) || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f')); +} + +static __inline int +isprint(int c) +{ + return (c >= ' ' && c <= '~'); +} + +static __inline int +toupper(int c) +{ + return (c - 0x20 * ((c >= 'a') && (c <= 'z'))); +} + +static __inline int +tolower(int c) +{ + return (c + 0x20 * ((c >= 'A') && (c <= 'Z'))); +} #endif #endif /* !_SYS_CTYPE_H_ */ diff --git a/freebsd/sys/sys/disk.h b/freebsd/sys/sys/disk.h index 020626e2..79ce947f 100644 --- a/freebsd/sys/sys/disk.h +++ b/freebsd/sys/sys/disk.h @@ -19,6 +19,7 @@ #include <sys/kerneldump.h> #include <sys/types.h> #include <sys/disk_zone.h> +#include <sys/socket.h> #ifdef _KERNEL @@ -144,18 +145,53 @@ struct diocgattr_arg { #define DIOCZONECMD _IOWR('d', 143, struct disk_zone_args) #ifndef __rtems__ -struct diocskerneldump_arg { - uint8_t kda_enable; - uint8_t kda_compression; - uint8_t kda_encryption; - uint8_t kda_key[KERNELDUMP_KEY_MAX_SIZE]; - uint32_t kda_encryptedkeysize; - uint8_t *kda_encryptedkey; +struct diocskerneldump_arg_freebsd12 { + uint8_t kda12_enable; + uint8_t kda12_compression; + uint8_t kda12_encryption; + uint8_t kda12_key[KERNELDUMP_KEY_MAX_SIZE]; + uint32_t kda12_encryptedkeysize; + uint8_t *kda12_encryptedkey; }; -#define DIOCSKERNELDUMP _IOW('d', 144, struct diocskerneldump_arg) +#define DIOCSKERNELDUMP_FREEBSD12 \ + _IOW('d', 144, struct diocskerneldump_arg_freebsd12) + +#ifndef WITHOUT_NETDUMP +#include <net/if.h> +#include <netinet/in.h> + +union kd_ip { + struct in_addr in4; + struct in6_addr in6; +}; + +/* + * Sentinel values for kda_index. + * + * If kda_index is KDA_REMOVE_ALL, all dump configurations are cleared. + * + * If kda_index is KDA_REMOVE_DEV, all dump configurations for the specified + * device are cleared. + * + * If kda_index is KDA_REMOVE, only the specified dump configuration for the + * given device is removed from the list of fallback dump configurations. + * + * If kda_index is KDA_APPEND, the dump configuration is added after all + * existing dump configurations. + * + * Otherwise, the new configuration is inserted into the fallback dump list at + * index 'kda_index'. + */ +#define KDA_REMOVE UINT8_MAX +#define KDA_REMOVE_ALL (UINT8_MAX - 1) +#define KDA_REMOVE_DEV (UINT8_MAX - 2) +#define KDA_APPEND (UINT8_MAX - 3) + +#define DIOCGKERNELDUMP _IOWR('d', 146, struct diocskerneldump_arg) /* - * Enable/Disable the device for kernel core dumps. + * Get current kernel netdump configuration details for a given index. */ +#endif #endif /* __rtems__ */ #endif /* _SYS_DISK_H_ */ diff --git a/freebsd/sys/sys/eventhandler.h b/freebsd/sys/sys/eventhandler.h index cc423752..9e3ff019 100644 --- a/freebsd/sys/sys/eventhandler.h +++ b/freebsd/sys/sys/eventhandler.h @@ -31,18 +31,12 @@ #ifndef _SYS_EVENTHANDLER_H_ #define _SYS_EVENTHANDLER_H_ +#include <sys/_eventhandler.h> #include <sys/lock.h> #include <sys/ktr.h> #include <sys/mutex.h> #include <sys/queue.h> -struct eventhandler_entry { - TAILQ_ENTRY(eventhandler_entry) ee_link; - int ee_priority; -#define EHE_DEAD_PRIORITY (-1) - void *ee_arg; -}; - #ifdef VIMAGE struct eventhandler_entry_vimage { void (* func)(void); /* Original function registered. */ @@ -60,8 +54,6 @@ struct eventhandler_list { TAILQ_HEAD(,eventhandler_entry) el_entries; }; -typedef struct eventhandler_entry *eventhandler_tag; - #define EHL_LOCK(p) mtx_lock(&(p)->el_lock) #define EHL_UNLOCK(p) mtx_unlock(&(p)->el_lock) #define EHL_LOCK_ASSERT(p, x) mtx_assert(&(p)->el_lock, x) @@ -107,9 +99,6 @@ typedef struct eventhandler_entry *eventhandler_tag; * EVENTHANDLER_LIST_DEFINE. If the events are even relatively high frequency * it is suggested that you directly define a list for them. */ -#define EVENTHANDLER_LIST_DECLARE(name) \ -extern struct eventhandler_list *_eventhandler_list_ ## name \ - #define EVENTHANDLER_LIST_DEFINE(name) \ struct eventhandler_list *_eventhandler_list_ ## name ; \ static void _ehl_init_ ## name (void * ctx __unused) \ @@ -130,18 +119,6 @@ SYSINIT(name ## _ehl_init, SI_SUB_EVENTHANDLER, SI_ORDER_ANY, \ } \ } while (0) -/* - * Event handlers need to be declared, but do not need to be defined. The - * declaration must be in scope wherever the handler is to be invoked. - */ -#define EVENTHANDLER_DECLARE(name, type) \ -struct eventhandler_entry_ ## name \ -{ \ - struct eventhandler_entry ee; \ - type eh_func; \ -}; \ -struct __hack - #define EVENTHANDLER_DEFINE(name, func, arg, priority) \ static eventhandler_tag name ## _tag; \ static void name ## _evh_init(void *ctx) \ diff --git a/freebsd/sys/sys/fail.h b/freebsd/sys/sys/fail.h index 41e07bae..66150eb7 100644 --- a/freebsd/sys/sys/fail.h +++ b/freebsd/sys/sys/fail.h @@ -1,7 +1,7 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * - * Copyright (c) 2009 Isilon Inc http://www.isilon.com/ + * Copyright (c) 2009-2019 Dell EMC Isilon http://www.isilon.com/ * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -191,11 +191,13 @@ fail_point_eval(struct fail_point *fp, int *ret) __END_DECLS /* Declare a fail_point and its sysctl in a function. */ +#define KFAIL_POINT_DECLARE(name) \ + extern struct fail_point _FAIL_POINT_NAME(name) #define _FAIL_POINT_NAME(name) _fail_point_##name #define _FAIL_POINT_LOCATION() "(" __FILE__ ":" __XSTRING(__LINE__) ")" #ifndef __rtems__ -#define _FAIL_POINT_INIT(parent, name, flags) \ - static struct fail_point _FAIL_POINT_NAME(name) = { \ +#define KFAIL_POINT_DEFINE(parent, name, flags) \ + struct fail_point _FAIL_POINT_NAME(name) = { \ .fp_name = #name, \ .fp_location = _FAIL_POINT_LOCATION(), \ .fp_ref_cnt = 0, \ @@ -214,6 +216,9 @@ __END_DECLS CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, \ &_FAIL_POINT_NAME(name), 0, \ fail_point_sysctl_status, "A", ""); + +#define _FAIL_POINT_INIT(parent, name, flags) \ + static KFAIL_POINT_DEFINE(parent, name, flags) #define _FAIL_POINT_EVAL(name, cond, code...) \ int RETURN_VALUE; \ \ @@ -223,6 +228,8 @@ __END_DECLS code; \ \ } +#define KFAIL_POINT_EVAL(name, code...) \ + _FAIL_POINT_EVAL(name, true, code) #else /* __rtems__ */ #define _FAIL_POINT_INIT(parent, name, flags) (void)0; #define _FAIL_POINT_EVAL(name, cond, code...) (void)0; diff --git a/freebsd/sys/sys/file.h b/freebsd/sys/sys/file.h index 20beac22..0e7c296a 100644 --- a/freebsd/sys/sys/file.h +++ b/freebsd/sys/sys/file.h @@ -184,7 +184,10 @@ struct file { /* * DTYPE_VNODE specific fields. */ - int f_seqcount; /* (a) Count of sequential accesses. */ + union { + int16_t f_seqcount; /* (a) Count of sequential accesses. */ + int f_pipegen; + }; off_t f_nextoff; /* next expected read/write offset. */ union { struct cdev_privdata *fvn_cdevpriv; @@ -407,8 +410,14 @@ _fnoop(void) return (0); } -#define fhold(fp) \ - (refcount_acquire(&(fp)->f_count)) +#ifndef __rtems__ +static __inline __result_use_check bool +fhold(struct file *fp) +{ + return (refcount_acquire_checked(&fp->f_count)); +} +#endif /* __rtems__ */ + #ifndef __rtems__ #define fdrop(fp, td) \ (refcount_release(&(fp)->f_count) ? _fdrop((fp), (td)) : _fnoop()) diff --git a/freebsd/sys/sys/filedesc.h b/freebsd/sys/sys/filedesc.h index 857d1fc9..5fafffb5 100644 --- a/freebsd/sys/sys/filedesc.h +++ b/freebsd/sys/sys/filedesc.h @@ -40,7 +40,7 @@ #include <sys/event.h> #include <sys/lock.h> #include <sys/priority.h> -#include <sys/seq.h> +#include <sys/seqc.h> #include <sys/sx.h> #include <machine/_limits.h> @@ -56,19 +56,19 @@ struct filedescent { struct file *fde_file; /* file structure for open file */ struct filecaps fde_caps; /* per-descriptor rights */ uint8_t fde_flags; /* per-process open file flags */ - seq_t fde_seq; /* keep file and caps in sync */ + seqc_t fde_seqc; /* keep file and caps in sync */ }; #define fde_rights fde_caps.fc_rights #define fde_fcntls fde_caps.fc_fcntls #define fde_ioctls fde_caps.fc_ioctls #define fde_nioctls fde_caps.fc_nioctls -#define fde_change_size (offsetof(struct filedescent, fde_seq)) +#define fde_change_size (offsetof(struct filedescent, fde_seqc)) struct fdescenttbl { int fdt_nfiles; /* number of open files allocated */ struct filedescent fdt_ofiles[0]; /* open files */ }; -#define fd_seq(fdt, fd) (&(fdt)->fdt_ofiles[(fd)].fde_seq) +#define fd_seqc(fdt, fd) (&(fdt)->fdt_ofiles[(fd)].fde_seqc) /* * This structure is used for the management of descriptors. It may be @@ -269,10 +269,10 @@ int fget_cap(struct thread *td, int fd, cap_rights_t *needrightsp, /* Return a referenced file from an unlocked descriptor. */ #ifndef __rtems__ int fget_unlocked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp, - struct file **fpp, seq_t *seqp); + struct file **fpp, seqc_t *seqp); #else /* __rtems__ */ static inline int -do_fget_unlocked(struct filedesc *fdp, int fd, struct file **fpp, seq_t *seqp) +do_fget_unlocked(struct filedesc *fdp, int fd, struct file **fpp, seqc_t *seqp) { struct file *fp; @@ -320,10 +320,10 @@ fdeget_locked(struct filedesc *fdp, int fd) #ifdef CAPABILITIES static __inline bool -fd_modified(struct filedesc *fdp, int fd, seq_t seq) +fd_modified(struct filedesc *fdp, int fd, seqc_t seqc) { - return (!seq_consistent(fd_seq(fdp->fd_files, fd), seq)); + return (!seqc_consistent(fd_seqc(fdp->fd_files, fd), seqc)); } #endif #endif /* __rtems__ */ diff --git a/freebsd/sys/sys/gsb_crc32.h b/freebsd/sys/sys/gsb_crc32.h new file mode 100644 index 00000000..c5a42d3d --- /dev/null +++ b/freebsd/sys/sys/gsb_crc32.h @@ -0,0 +1,47 @@ +/*- + * COPYRIGHT (C) 1986 Gary S. Brown. You may use this program, or + * code or tables extracted from it, as desired without restriction. + * + * $FreeBSD$ + */ + +#ifndef _SYS_GSB_CRC32_H_ +#define _SYS_GSB_CRC32_H_ + +#include <sys/types.h> + +#ifdef _KERNEL + +extern const uint32_t crc32_tab[]; + +static __inline uint32_t +crc32_raw(const void *buf, size_t size, uint32_t crc) +{ + const uint8_t *p = (const uint8_t *)buf; + + while (size--) + crc = crc32_tab[(crc ^ *p++) & 0xFF] ^ (crc >> 8); + return (crc); +} + +static __inline uint32_t +crc32(const void *buf, size_t size) +{ + uint32_t crc; + + crc = crc32_raw(buf, size, ~0U); + return (crc ^ ~0U); +} + +uint32_t calculate_crc32c(uint32_t crc32c, const unsigned char *buffer, + unsigned int length); +#endif + +#if defined(__amd64__) || defined(__i386__) +uint32_t sse42_crc32c(uint32_t, const unsigned char *, unsigned); +#endif +#if defined(__aarch64__) +uint32_t armv8_crc32c(uint32_t, const unsigned char *, unsigned int); +#endif + +#endif /* !_SYS_GSB_CRC32_H_ */ diff --git a/freebsd/sys/sys/gtaskqueue.h b/freebsd/sys/sys/gtaskqueue.h index a36c770a..82307c8a 100644 --- a/freebsd/sys/sys/gtaskqueue.h +++ b/freebsd/sys/sys/gtaskqueue.h @@ -31,20 +31,37 @@ #ifndef _SYS_GTASKQUEUE_H_ #define _SYS_GTASKQUEUE_H_ -#include <sys/taskqueue.h> #ifndef _KERNEL #error "no user-serviceable parts inside" #endif +#include <sys/_task.h> +#include <sys/bus.h> +#include <sys/taskqueue.h> +#include <sys/types.h> + struct gtaskqueue; -typedef void (*gtaskqueue_enqueue_fn)(void *context); /* * Taskqueue groups. Manages dynamic thread groups and irq binding for * device and other tasks. */ +struct grouptask { + struct gtask gt_task; + void *gt_taskqueue; + LIST_ENTRY(grouptask) gt_list; + void *gt_uniq; +#define GROUPTASK_NAMELEN 32 + char gt_name[GROUPTASK_NAMELEN]; +#ifndef __rtems__ + device_t gt_dev; + struct resource *gt_irq; + int gt_cpu; +#endif /* __rtems__ */ +}; + void gtaskqueue_block(struct gtaskqueue *queue); void gtaskqueue_unblock(struct gtaskqueue *queue); @@ -55,28 +72,29 @@ void gtaskqueue_drain_all(struct gtaskqueue *queue); void grouptask_block(struct grouptask *grouptask); void grouptask_unblock(struct grouptask *grouptask); int grouptaskqueue_enqueue(struct gtaskqueue *queue, struct gtask *task); + void taskqgroup_attach(struct taskqgroup *qgroup, struct grouptask *grptask, - void *uniq, int irq, const char *name); -int taskqgroup_attach_cpu(struct taskqgroup *qgroup, struct grouptask *grptask, - void *uniq, int cpu, int irq, const char *name); + void *uniq, device_t dev, struct resource *irq, const char *name); +int taskqgroup_attach_cpu(struct taskqgroup *qgroup, + struct grouptask *grptask, void *uniq, int cpu, device_t dev, + struct resource *irq, const char *name); void taskqgroup_detach(struct taskqgroup *qgroup, struct grouptask *gtask); struct taskqgroup *taskqgroup_create(const char *name); void taskqgroup_destroy(struct taskqgroup *qgroup); int taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride); -void taskqgroup_config_gtask_init(void *ctx, struct grouptask *gtask, gtask_fn_t *fn, - const char *name); +void taskqgroup_config_gtask_init(void *ctx, struct grouptask *gtask, + gtask_fn_t *fn, const char *name); void taskqgroup_config_gtask_deinit(struct grouptask *gtask); #define TASK_ENQUEUED 0x1 #define TASK_SKIP_WAKEUP 0x2 #define TASK_NOENQUEUE 0x4 - -#define GTASK_INIT(task, flags, priority, func, context) do { \ - (task)->ta_flags = flags; \ - (task)->ta_priority = (priority); \ - (task)->ta_func = (func); \ - (task)->ta_context = (context); \ +#define GTASK_INIT(gtask, flags, priority, func, context) do { \ + (gtask)->ta_flags = flags; \ + (gtask)->ta_priority = (priority); \ + (gtask)->ta_func = (func); \ + (gtask)->ta_context = (context); \ } while (0) #define GROUPTASK_INIT(gtask, priority, func, context) \ diff --git a/freebsd/sys/sys/interrupt.h b/freebsd/sys/sys/interrupt.h index 5c634054..cf8b7a01 100644 --- a/freebsd/sys/sys/interrupt.h +++ b/freebsd/sys/sys/interrupt.h @@ -156,7 +156,7 @@ extern struct intr_event *clk_intr_event; extern void *vm_ih; /* Counts and names for statistics (defined in MD code). */ -#if defined(__amd64__) || defined(__i386__) +#if defined(__amd64__) || defined(__i386__) || defined(__powerpc__) extern u_long *intrcnt; /* counts for for each device and stray */ extern char *intrnames; /* string table containing device names */ #else @@ -176,6 +176,9 @@ int intr_event_add_handler(struct intr_event *ie, const char *name, int intr_event_bind(struct intr_event *ie, int cpu); int intr_event_bind_irqonly(struct intr_event *ie, int cpu); int intr_event_bind_ithread(struct intr_event *ie, int cpu); +struct _cpuset; +int intr_event_bind_ithread_cpuset(struct intr_event *ie, + struct _cpuset *mask); int intr_event_create(struct intr_event **event, void *source, int flags, int irq, void (*pre_ithread)(void *), void (*post_ithread)(void *), void (*post_filter)(void *), diff --git a/freebsd/sys/sys/ktls.h b/freebsd/sys/sys/ktls.h new file mode 100644 index 00000000..079d4448 --- /dev/null +++ b/freebsd/sys/sys/ktls.h @@ -0,0 +1,194 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014-2019 Netflix Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +#ifndef _SYS_KTLS_H_ +#define _SYS_KTLS_H_ + +#include <sys/refcount.h> +#include <sys/_task.h> + +struct tls_record_layer { + uint8_t tls_type; + uint8_t tls_vmajor; + uint8_t tls_vminor; + uint16_t tls_length; + uint8_t tls_data[0]; +} __attribute__ ((packed)); + +#define TLS_MAX_MSG_SIZE_V10_2 16384 +#define TLS_MAX_PARAM_SIZE 1024 /* Max key/mac/iv in sockopt */ +#define TLS_AEAD_GCM_LEN 4 +#define TLS_CBC_IMPLICIT_IV_LEN 16 + +/* Type values for the record layer */ +#define TLS_RLTYPE_APP 23 + +/* + * Nonce for GCM. + */ +struct tls_nonce_data { + uint8_t fixed[TLS_AEAD_GCM_LEN]; + uint64_t seq; +} __packed; + +/* + * AEAD additional data format per RFC. + */ +struct tls_aead_data { + uint64_t seq; /* In network order */ + uint8_t type; + uint8_t tls_vmajor; + uint8_t tls_vminor; + uint16_t tls_length; +} __packed; + +/* + * Stream Cipher MAC additional data input. This does not match the + * exact data on the wire (the sequence number is not placed on the + * wire, and any explicit IV after the record header is not covered by + * the MAC). + */ +struct tls_mac_data { + uint64_t seq; + uint8_t type; + uint8_t tls_vmajor; + uint8_t tls_vminor; + uint16_t tls_length; +} __packed; + +#define TLS_MAJOR_VER_ONE 3 +#define TLS_MINOR_VER_ZERO 1 /* 3, 1 */ +#define TLS_MINOR_VER_ONE 2 /* 3, 2 */ +#define TLS_MINOR_VER_TWO 3 /* 3, 3 */ + +/* For TCP_TXTLS_ENABLE */ +struct tls_enable { + const uint8_t *cipher_key; + const uint8_t *iv; /* Implicit IV. */ + const uint8_t *auth_key; + int cipher_algorithm; /* e.g. CRYPTO_AES_CBC */ + int cipher_key_len; + int iv_len; + int auth_algorithm; /* e.g. CRYPTO_SHA2_256_HMAC */ + int auth_key_len; + int flags; + uint8_t tls_vmajor; + uint8_t tls_vminor; +}; + +struct tls_session_params { + uint8_t *cipher_key; + uint8_t *auth_key; + uint8_t iv[TLS_CBC_IMPLICIT_IV_LEN]; + int cipher_algorithm; + int auth_algorithm; + uint16_t cipher_key_len; + uint16_t iv_len; + uint16_t auth_key_len; + uint16_t max_frame_len; + uint8_t tls_vmajor; + uint8_t tls_vminor; + uint8_t tls_hlen; + uint8_t tls_tlen; + uint8_t tls_bs; + uint8_t flags; +}; + +#ifdef _KERNEL + +#define KTLS_API_VERSION 5 + +struct iovec; +struct ktls_session; +struct m_snd_tag; +struct mbuf; +struct mbuf_ext_pgs; +struct sockbuf; +struct socket; + +struct ktls_crypto_backend { + LIST_ENTRY(ktls_crypto_backend) next; + int (*try)(struct socket *so, struct ktls_session *tls); + int prio; + int api_version; + int use_count; + const char *name; +}; + +struct ktls_session { + int (*sw_encrypt)(struct ktls_session *tls, + const struct tls_record_layer *hdr, uint8_t *trailer, + struct iovec *src, struct iovec *dst, int iovcnt, + uint64_t seqno); + union { + void *cipher; + struct m_snd_tag *snd_tag; + }; + struct ktls_crypto_backend *be; + void (*free)(struct ktls_session *tls); + struct tls_session_params params; + u_int wq_index; + volatile u_int refcount; + + struct task reset_tag_task; + struct inpcb *inp; + bool reset_pending; +} __aligned(CACHE_LINE_SIZE); + +int ktls_crypto_backend_register(struct ktls_crypto_backend *be); +int ktls_crypto_backend_deregister(struct ktls_crypto_backend *be); +int ktls_enable_tx(struct socket *so, struct tls_enable *en); +void ktls_destroy(struct ktls_session *tls); +int ktls_frame(struct mbuf *m, struct ktls_session *tls, int *enqueue_cnt, + uint8_t record_type); +void ktls_seq(struct sockbuf *sb, struct mbuf *m); +void ktls_enqueue(struct mbuf *m, struct socket *so, int page_count); +void ktls_enqueue_to_free(struct mbuf_ext_pgs *pgs); +int ktls_set_tx_mode(struct socket *so, int mode); +int ktls_get_tx_mode(struct socket *so); +int ktls_output_eagain(struct inpcb *inp, struct ktls_session *tls); + +static inline struct ktls_session * +ktls_hold(struct ktls_session *tls) +{ + + if (tls != NULL) + refcount_acquire(&tls->refcount); + return (tls); +} + +static inline void +ktls_free(struct ktls_session *tls) +{ + + if (refcount_release(&tls->refcount)) + ktls_destroy(tls); +} + +#endif /* !_KERNEL */ +#endif /* !_SYS_KTLS_H_ */ diff --git a/freebsd/sys/sys/libkern.h b/freebsd/sys/sys/libkern.h index 28da25ca..8a906fb2 100644 --- a/freebsd/sys/sys/libkern.h +++ b/freebsd/sys/sys/libkern.h @@ -227,9 +227,11 @@ void _bsd_srandom(u_long); int strcasecmp(const char *, const char *); char *strcat(char * __restrict, const char * __restrict); char *strchr(const char *, int); +char *strchrnul(const char *, int); int strcmp(const char *, const char *); char *strcpy(char * __restrict, const char * __restrict); size_t strcspn(const char * __restrict, const char * __restrict) __pure; +char *strdup_flags(const char *__restrict, struct malloc_type *, int); #ifdef __rtems__ #include <string.h> #define strdup _bsd_strdup @@ -251,39 +253,6 @@ size_t strspn(const char *, const char *); char *strstr(const char *, const char *); int strvalid(const char *, size_t); -extern const uint32_t crc32_tab[]; - -static __inline uint32_t -crc32_raw(const void *buf, size_t size, uint32_t crc) -{ - const uint8_t *p = (const uint8_t *)buf; - - while (size--) - crc = crc32_tab[(crc ^ *p++) & 0xFF] ^ (crc >> 8); - return (crc); -} - -static __inline uint32_t -crc32(const void *buf, size_t size) -{ - uint32_t crc; - - crc = crc32_raw(buf, size, ~0U); - return (crc ^ ~0U); -} - -uint32_t -calculate_crc32c(uint32_t crc32c, const unsigned char *buffer, - unsigned int length); -#ifdef _KERNEL -#if defined(__amd64__) || defined(__i386__) -uint32_t sse42_crc32c(uint32_t, const unsigned char *, unsigned); -#endif -#if defined(__aarch64__) -uint32_t armv8_crc32c(uint32_t, const unsigned char *, unsigned int); -#endif -#endif - #ifndef __rtems__ static __inline char * index(const char *p, int ch) diff --git a/freebsd/sys/sys/lockmgr.h b/freebsd/sys/sys/lockmgr.h index 03ae6f9e..d2a14230 100644 --- a/freebsd/sys/sys/lockmgr.h +++ b/freebsd/sys/sys/lockmgr.h @@ -143,7 +143,7 @@ _lockmgr_args_rw(struct lock *lk, u_int flags, struct rwlock *ilk, /* * Flags for lockinit(). */ -#define LK_INIT_MASK 0x0000FF +#define LK_INIT_MASK 0x0001FF #define LK_CANRECURSE 0x000001 #define LK_NODUP 0x000002 #define LK_NOPROFILE 0x000004 @@ -152,6 +152,7 @@ _lockmgr_args_rw(struct lock *lk, u_int flags, struct rwlock *ilk, #define LK_QUIET 0x000020 #define LK_ADAPTIVE 0x000040 #define LK_IS_VNODE 0x000080 /* Tell WITNESS about a VNODE lock */ +#define LK_NEW 0x000100 /* * Additional attributes to be used in lockmgr(). @@ -163,7 +164,6 @@ _lockmgr_args_rw(struct lock *lk, u_int flags, struct rwlock *ilk, #define LK_SLEEPFAIL 0x000800 #define LK_TIMELOCK 0x001000 #define LK_NODDLKTREAT 0x002000 -#define LK_VNHELD 0x004000 /* * Operations for lockmgr(). diff --git a/freebsd/sys/sys/lockstat.h b/freebsd/sys/sys/lockstat.h index 9a6674fa..0526f4fb 100644 --- a/freebsd/sys/sys/lockstat.h +++ b/freebsd/sys/sys/lockstat.h @@ -65,6 +65,13 @@ SDT_PROBE_DECLARE(lockstat, , , sx__spin); SDT_PROBE_DECLARE(lockstat, , , sx__upgrade); SDT_PROBE_DECLARE(lockstat, , , sx__downgrade); +SDT_PROBE_DECLARE(lockstat, , , lockmgr__acquire); +SDT_PROBE_DECLARE(lockstat, , , lockmgr__release); +SDT_PROBE_DECLARE(lockstat, , , lockmgr__disown); +SDT_PROBE_DECLARE(lockstat, , , lockmgr__block); +SDT_PROBE_DECLARE(lockstat, , , lockmgr__upgrade); +SDT_PROBE_DECLARE(lockstat, , , lockmgr__downgrade); + SDT_PROBE_DECLARE(lockstat, , , thread__spin); #define LOCKSTAT_WRITER 0 diff --git a/freebsd/sys/sys/malloc.h b/freebsd/sys/sys/malloc.h index ca81a198..83510329 100644 --- a/freebsd/sys/sys/malloc.h +++ b/freebsd/sys/sys/malloc.h @@ -57,9 +57,10 @@ #define M_NOVM 0x0200 /* don't ask VM for pages */ #define M_USE_RESERVE 0x0400 /* can alloc out of reserve memory */ #define M_NODUMP 0x0800 /* don't dump pages in this allocation */ -#define M_FIRSTFIT 0x1000 /* Only for vmem, fast fit. */ -#define M_BESTFIT 0x2000 /* Only for vmem, low fragmentation. */ -#define M_EXEC 0x4000 /* allocate executable space. */ +#define M_FIRSTFIT 0x1000 /* only for vmem, fast fit */ +#define M_BESTFIT 0x2000 /* only for vmem, low fragmentation */ +#define M_EXEC 0x4000 /* allocate executable space */ +#define M_NEXTFIT 0x8000 /* only for vmem, follow cursor */ #define M_MAGIC 877983977 /* time when first defined :-) */ @@ -182,7 +183,7 @@ void *contigmalloc(unsigned long size, struct malloc_type *type, int flags, void *contigmalloc_domainset(unsigned long size, struct malloc_type *type, struct domainset *ds, int flags, vm_paddr_t low, vm_paddr_t high, unsigned long alignment, vm_paddr_t boundary) - __malloc_like __result_use_check __alloc_size(1) __alloc_align(6); + __malloc_like __result_use_check __alloc_size(1) __alloc_align(7); void free(void *addr, struct malloc_type *type); void free_domain(void *addr, struct malloc_type *type); #ifndef __rtems__ diff --git a/freebsd/sys/sys/mbuf.h b/freebsd/sys/sys/mbuf.h index 634f7d9e..ba2e1873 100644 --- a/freebsd/sys/sys/mbuf.h +++ b/freebsd/sys/sys/mbuf.h @@ -40,6 +40,7 @@ #include <sys/queue.h> #ifdef _KERNEL #include <sys/systm.h> +#include <sys/refcount.h> #include <vm/uma.h> #ifdef WITNESS #include <sys/lock.h> @@ -98,6 +99,7 @@ struct mbuf; #define MLEN ((int)(MSIZE - MHSIZE)) #define MHLEN ((int)(MSIZE - MPKTHSIZE)) #define MINCLSIZE (MHLEN + 1) +#define M_NODOM 255 #ifdef _KERNEL /*- @@ -137,6 +139,7 @@ struct m_tag { */ struct m_snd_tag { struct ifnet *ifp; /* network interface tag belongs to */ + volatile u_int refcount; }; /* @@ -158,7 +161,7 @@ struct pkthdr { uint32_t flowid; /* packet's 4-tuple system */ uint32_t csum_flags; /* checksum and offload features */ uint16_t fibnum; /* this packet should use this fib */ - uint8_t cosqos; /* class/quality of service */ + uint8_t numa_domain; /* NUMA domain of recvd pkt */ uint8_t rsstype; /* hash type */ union { uint64_t rcv_tstmp; /* timestamp in ns */ @@ -196,6 +199,8 @@ struct pkthdr { #define lro_nsegs tso_segsz #define csum_phsum PH_per.sixteen[2] #define csum_data PH_per.thirtytwo[1] +#define lro_len PH_per.sixteen[0] /* inbound during LRO */ +#define lro_csum PH_per.sixteen[1] /* inbound during LRO */ #define pace_thoff PH_loc.sixteen[0] #define pace_tlen PH_loc.sixteen[1] #define pace_drphdrlen PH_loc.sixteen[2] @@ -224,7 +229,15 @@ struct m_ext { volatile u_int ext_count; volatile u_int *ext_cnt; }; - char *ext_buf; /* start of buffer */ + union { + /* + * If ext_type == EXT_PGS, 'ext_pgs' points to a + * structure describing the buffer. Otherwise, + * 'ext_buf' points to the start of the buffer. + */ + struct mbuf_ext_pgs *ext_pgs; + char *ext_buf; + }; uint32_t ext_size; /* size of buffer, for ext_free */ uint32_t ext_type:8, /* type of external storage */ ext_flags:24; /* external storage mbuf flags */ @@ -290,10 +303,98 @@ struct mbuf { }; }; +struct ktls_session; +struct socket; + +/* + * TLS records for TLS 1.0-1.2 can have the following header lengths: + * - 5 (AES-CBC with implicit IV) + * - 21 (AES-CBC with explicit IV) + * - 13 (AES-GCM with 8 byte explicit IV) + */ +#define MBUF_PEXT_HDR_LEN 24 + +/* + * TLS records for TLS 1.0-1.2 can have the following maximum trailer + * lengths: + * - 16 (AES-GCM) + * - 36 (AES-CBC with SHA1 and up to 16 bytes of padding) + * - 48 (AES-CBC with SHA2-256 and up to 16 bytes of padding) + * - 64 (AES-CBC with SHA2-384 and up to 16 bytes of padding) + */ +#define MBUF_PEXT_TRAIL_LEN 64 + +#ifdef __LP64__ +#define MBUF_PEXT_MAX_PGS (152 / sizeof(vm_paddr_t)) +#else +#define MBUF_PEXT_MAX_PGS (156 / sizeof(vm_paddr_t)) +#endif + +#define MBUF_PEXT_MAX_BYTES \ + (MBUF_PEXT_MAX_PGS * PAGE_SIZE + MBUF_PEXT_HDR_LEN + MBUF_PEXT_TRAIL_LEN) + +/* + * This struct is 256 bytes in size and is arranged so that the most + * common case (accessing the first 4 pages of a 16KB TLS record) will + * fit in a single 64 byte cacheline. + */ +struct mbuf_ext_pgs { + uint8_t npgs; /* Number of attached pages */ + uint8_t nrdy; /* Pages with I/O pending */ + uint8_t hdr_len; /* TLS header length */ + uint8_t trail_len; /* TLS trailer length */ + uint16_t first_pg_off; /* Offset into 1st page */ + uint16_t last_pg_len; /* Length of last page */ + vm_paddr_t pa[MBUF_PEXT_MAX_PGS]; /* phys addrs of pages */ + char hdr[MBUF_PEXT_HDR_LEN]; /* TLS header */ + struct ktls_session *tls; /* TLS session */ +#if defined(__i386__) || \ + (defined(__powerpc__) && !defined(__powerpc64__) && defined(BOOKE)) + /* + * i386 and Book-E PowerPC have 64-bit vm_paddr_t, so there is + * a 4 byte remainder from the space allocated for pa[]. + */ + uint32_t pad; +#endif + union { + char trail[MBUF_PEXT_TRAIL_LEN]; /* TLS trailer */ + struct { + struct socket *so; + struct mbuf *mbuf; + uint64_t seqno; + STAILQ_ENTRY(mbuf_ext_pgs) stailq; + int enc_cnt; + }; + }; +}; + +#ifdef _KERNEL +static inline int +mbuf_ext_pg_len(struct mbuf_ext_pgs *ext_pgs, int pidx, int pgoff) +{ + KASSERT(pgoff == 0 || pidx == 0, + ("page %d with non-zero offset %d in %p", pidx, pgoff, ext_pgs)); + if (pidx == ext_pgs->npgs - 1) { + return (ext_pgs->last_pg_len); + } else { + return (PAGE_SIZE - pgoff); + } +} + +#ifdef INVARIANT_SUPPORT +void mb_ext_pgs_check(struct mbuf_ext_pgs *ext_pgs); +#endif +#ifdef INVARIANTS +#define MBUF_EXT_PGS_ASSERT_SANITY(ext_pgs) mb_ext_pgs_check((ext_pgs)) +#else +#define MBUF_EXT_PGS_ASSERT_SANITY(ext_pgs) +#endif +#endif + /* * mbuf flags of global significance and layer crossing. * Those of only protocol/layer specific significance are to be mapped - * to M_PROTO[1-12] and cleared at layer handoff boundaries. + * to M_PROTO[1-11] and cleared at layer handoff boundaries. * NB: Limited to the lower 24 bits. */ #define M_EXT 0x00000001 /* has associated external storage */ @@ -304,25 +405,29 @@ struct mbuf { #define M_MCAST 0x00000020 /* send/received as link-level multicast */ #define M_PROMISC 0x00000040 /* packet was not for us */ #define M_VLANTAG 0x00000080 /* ether_vtag is valid */ -#define M_NOMAP 0x00000100 /* mbuf data is unmapped (soon from Drew) */ +#ifndef __rtems__ +#define M_NOMAP 0x00000100 /* mbuf data is unmapped */ +#else /* __rtems__ */ +#define M_NOMAP 0x00000000 /* disable unmapped mbuf data */ +#endif /* __rtems__ */ #define M_NOFREE 0x00000200 /* do not free mbuf, embedded in cluster */ #define M_TSTMP 0x00000400 /* rcv_tstmp field is valid */ #define M_TSTMP_HPREC 0x00000800 /* rcv_tstmp is high-prec, typically hw-stamped on port (useful for IEEE 1588 and 802.1AS) */ - -#define M_PROTO1 0x00001000 /* protocol-specific */ -#define M_PROTO2 0x00002000 /* protocol-specific */ -#define M_PROTO3 0x00004000 /* protocol-specific */ -#define M_PROTO4 0x00008000 /* protocol-specific */ -#define M_PROTO5 0x00010000 /* protocol-specific */ -#define M_PROTO6 0x00020000 /* protocol-specific */ -#define M_PROTO7 0x00040000 /* protocol-specific */ -#define M_PROTO8 0x00080000 /* protocol-specific */ -#define M_PROTO9 0x00100000 /* protocol-specific */ -#define M_PROTO10 0x00200000 /* protocol-specific */ -#define M_PROTO11 0x00400000 /* protocol-specific */ -#define M_PROTO12 0x00800000 /* protocol-specific */ +#define M_TSTMP_LRO 0x00001000 /* Time LRO pushed in pkt is valid in (PH_loc) */ + +#define M_PROTO1 0x00002000 /* protocol-specific */ +#define M_PROTO2 0x00004000 /* protocol-specific */ +#define M_PROTO3 0x00008000 /* protocol-specific */ +#define M_PROTO4 0x00010000 /* protocol-specific */ +#define M_PROTO5 0x00020000 /* protocol-specific */ +#define M_PROTO6 0x00040000 /* protocol-specific */ +#define M_PROTO7 0x00080000 /* protocol-specific */ +#define M_PROTO8 0x00100000 /* protocol-specific */ +#define M_PROTO9 0x00200000 /* protocol-specific */ +#define M_PROTO10 0x00400000 /* protocol-specific */ +#define M_PROTO11 0x00800000 /* protocol-specific */ #define MB_DTOR_SKIP 0x1 /* don't pollute the cache by touching a freed mbuf */ @@ -331,7 +436,7 @@ struct mbuf { */ #define M_PROTOFLAGS \ (M_PROTO1|M_PROTO2|M_PROTO3|M_PROTO4|M_PROTO5|M_PROTO6|M_PROTO7|M_PROTO8|\ - M_PROTO9|M_PROTO10|M_PROTO11|M_PROTO12) + M_PROTO9|M_PROTO10|M_PROTO11) /* * Flags preserved when copying m_pkthdr. @@ -345,11 +450,11 @@ struct mbuf { */ #define M_FLAG_BITS \ "\20\1M_EXT\2M_PKTHDR\3M_EOR\4M_RDONLY\5M_BCAST\6M_MCAST" \ - "\7M_PROMISC\10M_VLANTAG\13M_TSTMP\14M_TSTMP_HPREC" + "\7M_PROMISC\10M_VLANTAG\11M_NOMAP\12M_NOFREE\13M_TSTMP\14M_TSTMP_HPREC" #define M_FLAG_PROTOBITS \ "\15M_PROTO1\16M_PROTO2\17M_PROTO3\20M_PROTO4\21M_PROTO5" \ "\22M_PROTO6\23M_PROTO7\24M_PROTO8\25M_PROTO9\26M_PROTO10" \ - "\27M_PROTO11\30M_PROTO12" + "\27M_PROTO11" #define M_FLAG_PRINTF (M_FLAG_BITS M_FLAG_PROTOBITS) /* @@ -407,33 +512,6 @@ struct mbuf { #define M_HASHTYPE_ISHASH(m) (M_HASHTYPE_GET(m) & M_HASHTYPE_HASHPROP) /* - * COS/QOS class and quality of service tags. - * It uses DSCP code points as base. - */ -#define QOS_DSCP_CS0 0x00 -#define QOS_DSCP_DEF QOS_DSCP_CS0 -#define QOS_DSCP_CS1 0x20 -#define QOS_DSCP_AF11 0x28 -#define QOS_DSCP_AF12 0x30 -#define QOS_DSCP_AF13 0x38 -#define QOS_DSCP_CS2 0x40 -#define QOS_DSCP_AF21 0x48 -#define QOS_DSCP_AF22 0x50 -#define QOS_DSCP_AF23 0x58 -#define QOS_DSCP_CS3 0x60 -#define QOS_DSCP_AF31 0x68 -#define QOS_DSCP_AF32 0x70 -#define QOS_DSCP_AF33 0x78 -#define QOS_DSCP_CS4 0x80 -#define QOS_DSCP_AF41 0x88 -#define QOS_DSCP_AF42 0x90 -#define QOS_DSCP_AF43 0x98 -#define QOS_DSCP_CS5 0xa0 -#define QOS_DSCP_EF 0xb8 -#define QOS_DSCP_CS6 0xc0 -#define QOS_DSCP_CS7 0xe0 - -/* * External mbuf storage buffer types. */ #define EXT_CLUSTER 1 /* mbuf cluster */ @@ -445,6 +523,10 @@ struct mbuf { #define EXT_JUMBO16 5 /* jumbo cluster 16184 bytes */ #define EXT_PACKET 6 /* mbuf+cluster from packet zone */ #define EXT_MBUF 7 /* external mbuf reference */ +#define EXT_RXRING 8 /* data in NIC receive ring */ +#ifndef __rtems__ +#define EXT_PGS 9 /* array of unmapped pages */ +#endif /* __rtems__ */ #define EXT_VENDOR1 224 /* for vendor-internal use */ #define EXT_VENDOR2 225 /* for vendor-internal use */ @@ -489,6 +571,11 @@ struct mbuf { "\24EXT_FLAG_VENDOR4\25EXT_FLAG_EXP1\26EXT_FLAG_EXP2\27EXT_FLAG_EXP3" \ "\30EXT_FLAG_EXP4" +#define MBUF_EXT_PGS_ASSERT(m) \ + KASSERT((((m)->m_flags & M_EXT) != 0) && \ + ((m)->m_ext.ext_type == EXT_PGS), \ + ("%s: m %p !M_EXT or !EXT_PGS", __func__, m)) + /* * Flags indicating checksum, segmentation and other offload work to be * done, or already done, by hardware or lower layers. It is split into @@ -521,6 +608,8 @@ struct mbuf { #define CSUM_L5_VALID 0x20000000 /* checksum is correct */ #define CSUM_COALESCED 0x40000000 /* contains merged segments */ +#define CSUM_SND_TAG 0x80000000 /* Packet header has send tag */ + /* * CSUM flag description for use with printf(9) %b identifier. */ @@ -530,7 +619,7 @@ struct mbuf { "\12CSUM_IP6_UDP\13CSUM_IP6_TCP\14CSUM_IP6_SCTP\15CSUM_IP6_TSO" \ "\16CSUM_IP6_ISCSI" \ "\31CSUM_L3_CALC\32CSUM_L3_VALID\33CSUM_L4_CALC\34CSUM_L4_VALID" \ - "\35CSUM_L5_CALC\36CSUM_L5_VALID\37CSUM_COALESCED" + "\35CSUM_L5_CALC\36CSUM_L5_VALID\37CSUM_COALESCED\40CSUM_SND_TAG" /* CSUM flags compatibility mappings. */ #define CSUM_IP_CHECKED CSUM_L3_CALC @@ -589,6 +678,7 @@ struct mbuf { #define MBUF_JUMBO16_MEM_NAME "mbuf_jumbo_16k" #define MBUF_TAG_MEM_NAME "mbuf_tag" #define MBUF_EXTREFCNT_MEM_NAME "mbuf_ext_refcnt" +#define MBUF_EXTPGS_MEM_NAME "mbuf_extpgs" #ifdef _KERNEL @@ -613,9 +703,25 @@ extern uma_zone_t zone_pack; extern uma_zone_t zone_jumbop; extern uma_zone_t zone_jumbo9; extern uma_zone_t zone_jumbo16; +extern uma_zone_t zone_extpgs; void mb_dupcl(struct mbuf *, struct mbuf *); void mb_free_ext(struct mbuf *); +void mb_free_mext_pgs(struct mbuf *); +struct mbuf *mb_alloc_ext_pgs(int, bool, m_ext_free_t); +int mb_unmapped_compress(struct mbuf *m); +#ifndef __rtems__ +struct mbuf *mb_unmapped_to_ext(struct mbuf *m); +#else /* __rtems__ */ +static __inline struct mbuf * +mb_unmapped_to_ext(struct mbuf *m) +{ + + return (m); +} + +#endif /* __rtems__ */ +void mb_free_notready(struct mbuf *m, int count); void m_adj(struct mbuf *, int); int m_apply(struct mbuf *, int, int, int (*)(void *, void *, u_int), void *); @@ -650,6 +756,7 @@ struct mbuf *m_getm2(struct mbuf *, int, int, short, int); struct mbuf *m_getptr(struct mbuf *, int, int *); u_int m_length(struct mbuf *, struct mbuf **); int m_mbuftouio(struct uio *, const struct mbuf *, int); +int m_unmappedtouio(const struct mbuf *, int, struct uio *, int); void m_move_pkthdr(struct mbuf *, struct mbuf *); int m_pkthdr_init(struct mbuf *, int); struct mbuf *m_prepend(struct mbuf *, int, int); @@ -660,6 +767,8 @@ int m_sanity(struct mbuf *, int); struct mbuf *m_split(struct mbuf *, int, int); struct mbuf *m_uiotombuf(struct uio *, int, int, int, int); struct mbuf *m_unshare(struct mbuf *, int); +void m_snd_tag_init(struct m_snd_tag *, struct ifnet *); +void m_snd_tag_destroy(struct m_snd_tag *); static __inline int m_gettype(int size) @@ -903,7 +1012,7 @@ m_extrefcnt(struct mbuf *m) * be both the local data payload, or an external buffer area, depending on * whether M_EXT is set). */ -#define M_WRITABLE(m) (!((m)->m_flags & M_RDONLY) && \ +#define M_WRITABLE(m) (((m)->m_flags & (M_RDONLY | M_NOMAP)) == 0 && \ (!(((m)->m_flags & M_EXT)) || \ (m_extrefcnt(m) == 1))) @@ -926,7 +1035,8 @@ m_extrefcnt(struct mbuf *m) * handling external storage, packet-header mbufs, and regular data mbufs. */ #define M_START(m) \ - (((m)->m_flags & M_EXT) ? (m)->m_ext.ext_buf : \ + (((m)->m_flags & M_NOMAP) ? NULL : \ + ((m)->m_flags & M_EXT) ? (m)->m_ext.ext_buf : \ ((m)->m_flags & M_PKTHDR) ? &(m)->m_pktdat[0] : \ &(m)->m_dat[0]) @@ -1023,6 +1133,17 @@ m_align(struct mbuf *m, int len) */ #define MCHTYPE(m, t) m_chtype((m), (t)) +/* Return the rcvif of a packet header. */ +static __inline struct ifnet * +m_rcvif(struct mbuf *m) +{ + + M_ASSERTPKTHDR(m); + if (m->m_pkthdr.csum_flags & CSUM_SND_TAG) + return (NULL); + return (m->m_pkthdr.rcvif); +} + /* Length to m_copy to copy all. */ #define M_COPYALL 1000000000 @@ -1031,6 +1152,7 @@ extern int max_hdr; /* Largest link + protocol header */ extern int max_linkhdr; /* Largest link-level header */ extern int max_protohdr; /* Largest protocol header */ extern int nmbclusters; /* Maximum number of clusters */ +extern bool mb_use_ext_pgs; /* Use ext_pgs for sendfile */ /*- * Network packets may have annotations attached by affixing a list of @@ -1213,6 +1335,22 @@ m_tag_find(struct mbuf *m, int type, struct m_tag *start) m_tag_locate(m, MTAG_ABI_COMPAT, type, start)); } +static inline struct m_snd_tag * +m_snd_tag_ref(struct m_snd_tag *mst) +{ + + refcount_acquire(&mst->refcount); + return (mst); +} + +static inline void +m_snd_tag_rele(struct m_snd_tag *mst) +{ + + if (refcount_release(&mst->refcount)) + m_snd_tag_destroy(mst); +} + static __inline struct mbuf * m_free(struct mbuf *m) { @@ -1221,6 +1359,8 @@ m_free(struct mbuf *m) MBUF_PROBE1(m__free, m); if ((m->m_flags & (M_PKTHDR|M_NOFREE)) == (M_PKTHDR|M_NOFREE)) m_tag_delete_chain(m, NULL); + if (m->m_flags & M_PKTHDR && m->m_pkthdr.csum_flags & CSUM_SND_TAG) + m_snd_tag_rele(m->m_pkthdr.snd_tag); if (m->m_flags & M_EXT) mb_free_ext(m); else if ((m->m_flags & M_NOFREE) == 0) @@ -1314,7 +1454,7 @@ static inline int mbufq_full(const struct mbufq *mq) { - return (mq->mq_len >= mq->mq_maxlen); + return (mq->mq_maxlen > 0 && mq->mq_len >= mq->mq_maxlen); } static inline int @@ -1388,5 +1528,20 @@ void netdump_mbuf_dump(void); void netdump_mbuf_reinit(int nmbuf, int nclust, int clsize); #endif +static inline bool +mbuf_has_tls_session(struct mbuf *m) +{ + +#ifndef __rtems__ + if (m->m_flags & M_NOMAP) { + MBUF_EXT_PGS_ASSERT(m); + if (m->m_ext.ext_pgs->tls != NULL) { + return (true); + } + } +#endif /* __rtems__ */ + return (false); +} + #endif /* _KERNEL */ #endif /* !_SYS_MBUF_H_ */ diff --git a/freebsd/sys/sys/mount.h b/freebsd/sys/sys/mount.h index 2a5d4cff..4b60055c 100644 --- a/freebsd/sys/sys/mount.h +++ b/freebsd/sys/sys/mount.h @@ -226,6 +226,11 @@ struct mount { struct lock mnt_explock; /* vfs_export walkers lock */ TAILQ_ENTRY(mount) mnt_upper_link; /* (m) we in the all uppers */ TAILQ_HEAD(, mount) mnt_uppers; /* (m) upper mounts over us*/ + int __aligned(CACHE_LINE_SIZE) mnt_vfs_ops;/* (i) pending vfs ops */ + int *mnt_thread_in_ops_pcpu; + int *mnt_ref_pcpu; + int *mnt_lockref_pcpu; + int *mnt_writeopcount_pcpu; }; /* @@ -265,11 +270,17 @@ void __mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *); #define MNT_ITRYLOCK(mp) mtx_trylock(&(mp)->mnt_mtx) #define MNT_IUNLOCK(mp) mtx_unlock(&(mp)->mnt_mtx) #define MNT_MTX(mp) (&(mp)->mnt_mtx) -#define MNT_REF(mp) (mp)->mnt_ref++ + +#define MNT_REF(mp) do { \ + mtx_assert(MNT_MTX(mp), MA_OWNED); \ + mp->mnt_ref++; \ +} while (0) #define MNT_REL(mp) do { \ - KASSERT((mp)->mnt_ref > 0, ("negative mnt_ref")); \ + mtx_assert(MNT_MTX(mp), MA_OWNED); \ (mp)->mnt_ref--; \ - if ((mp)->mnt_ref == 0) \ + if ((mp)->mnt_vfs_ops && (mp)->mnt_ref < 0) \ + vfs_dump_mount_counters(mp); \ + if ((mp)->mnt_ref == 0 && (mp)->mnt_vfs_ops) \ wakeup((mp)); \ } while (0) @@ -296,6 +307,7 @@ void __mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *); #define MNT_NOCLUSTERW 0x0000000080000000ULL /* disable cluster write */ #define MNT_SUJ 0x0000000100000000ULL /* using journaled soft updates */ #define MNT_AUTOMOUNTED 0x0000000200000000ULL /* mounted by automountd(8) */ +#define MNT_UNTRUSTED 0x0000000800000000ULL /* filesys metadata untrusted */ /* * NFS export related mount flags. @@ -333,7 +345,8 @@ void __mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *); MNT_NOCLUSTERW | MNT_SUIDDIR | MNT_SOFTDEP | \ MNT_IGNORE | MNT_EXPUBLIC | MNT_NOSYMFOLLOW | \ MNT_GJOURNAL | MNT_MULTILABEL | MNT_ACLS | \ - MNT_NFS4ACLS | MNT_AUTOMOUNTED | MNT_VERIFIED) + MNT_NFS4ACLS | MNT_AUTOMOUNTED | MNT_VERIFIED | \ + MNT_UNTRUSTED) /* Mask of flags that can be updated. */ #define MNT_UPDATEMASK (MNT_NOSUID | MNT_NOEXEC | \ @@ -342,7 +355,7 @@ void __mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *); MNT_NOSYMFOLLOW | MNT_IGNORE | \ MNT_NOCLUSTERR | MNT_NOCLUSTERW | MNT_SUIDDIR | \ MNT_ACLS | MNT_USER | MNT_NFS4ACLS | \ - MNT_AUTOMOUNTED) + MNT_AUTOMOUNTED | MNT_UNTRUSTED) /* * External filesystem command modifier flags. @@ -360,29 +373,28 @@ void __mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *); #define MNT_SNAPSHOT 0x0000000001000000ULL /* snapshot the filesystem */ #define MNT_NONBUSY 0x0000000004000000ULL /* check vnode use counts. */ #define MNT_BYFSID 0x0000000008000000ULL /* specify filesystem by ID. */ +#define MNT_NOCOVER 0x0000001000000000ULL /* Do not cover a mount point */ +#define MNT_EMPTYDIR 0x0000002000000000ULL /* Only mount on empty dir */ #define MNT_CMDFLAGS (MNT_UPDATE | MNT_DELEXPORT | MNT_RELOAD | \ MNT_FORCE | MNT_SNAPSHOT | MNT_NONBUSY | \ - MNT_BYFSID) + MNT_BYFSID | MNT_NOCOVER | MNT_EMPTYDIR) /* * Internal filesystem control flags stored in mnt_kern_flag. * - * MNTK_UNMOUNT locks the mount entry so that name lookup cannot proceed - * past the mount point. This keeps the subtree stable during mounts - * and unmounts. + * MNTK_UNMOUNT locks the mount entry so that name lookup cannot + * proceed past the mount point. This keeps the subtree stable during + * mounts and unmounts. When non-forced unmount flushes all vnodes + * from the mp queue, the MNTK_UNMOUNT flag prevents insmntque() from + * queueing new vnodes. * * MNTK_UNMOUNTF permits filesystems to detect a forced unmount while * dounmount() is still waiting to lock the mountpoint. This allows * the filesystem to cancel operations that might otherwise deadlock * with the unmount attempt (used by NFS). - * - * MNTK_NOINSMNTQ is strict subset of MNTK_UNMOUNT. They are separated - * to allow for failed unmount attempt to restore the syncer vnode for - * the mount. */ #define MNTK_UNMOUNTF 0x00000001 /* forced unmount in progress */ #define MNTK_ASYNC 0x00000002 /* filtered async flag */ #define MNTK_SOFTDEP 0x00000004 /* async disabled by softdep */ -#define MNTK_NOINSMNTQ 0x00000008 /* insmntque is not allowed */ #define MNTK_DRAINING 0x00000010 /* lock draining is happening */ #define MNTK_REFEXPIRE 0x00000020 /* refcount expiring is happening */ #define MNTK_EXTENDED_SHARED 0x00000040 /* Allow shared locking for more ops */ @@ -396,6 +408,7 @@ void __mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *); #define MNTK_MARKER 0x00001000 #define MNTK_UNMAPPED_BUFS 0x00002000 #define MNTK_USES_BCACHE 0x00004000 /* FS uses the buffer cache. */ +#define MNTK_TEXT_REFS 0x00008000 /* Keep use ref for text */ #define MNTK_NOASYNC 0x00800000 /* disable async */ #define MNTK_UNMOUNT 0x01000000 /* unmount in progress */ #define MNTK_MWAIT 0x02000000 /* waiting for unmount to finish */ @@ -647,6 +660,18 @@ struct nameidata; struct sysctl_req; struct mntarg; +/* + * N.B., vfs_cmount is the ancient vfsop invoked by the old mount(2) syscall. + * The new way is vfs_mount. + * + * vfs_cmount implementations typically translate arguments from their + * respective old per-FS structures into the key-value list supported by + * nmount(2), then use kernel_mount(9) to mimic nmount(2) from kernelspace. + * + * Filesystems with mounters that use nmount(2) do not need to and should not + * implement vfs_cmount. Hopefully a future cleanup can remove vfs_cmount and + * mount(2) entirely. + */ typedef int vfs_cmount_t(struct mntarg *ma, void *data, uint64_t flags); typedef int vfs_unmount_t(struct mount *mp, int mntflags); typedef int vfs_root_t(struct mount *mp, int flags, struct vnode **vpp); @@ -925,6 +950,74 @@ vfs_sysctl_t vfs_stdsysctl; void syncer_suspend(void); void syncer_resume(void); +void vfs_op_barrier_wait(struct mount *); +void vfs_op_enter(struct mount *); +void vfs_op_exit_locked(struct mount *); +void vfs_op_exit(struct mount *); + +#ifdef DIAGNOSTIC +void vfs_assert_mount_counters(struct mount *); +void vfs_dump_mount_counters(struct mount *); +#else +#define vfs_assert_mount_counters(mp) do { } while (0) +#define vfs_dump_mount_counters(mp) do { } while (0) +#endif + +enum mount_counter { MNT_COUNT_REF, MNT_COUNT_LOCKREF, MNT_COUNT_WRITEOPCOUNT }; +int vfs_mount_fetch_counter(struct mount *, enum mount_counter); + +/* + * We mark ourselves as entering the section and post a sequentially consistent + * fence, meaning the store is completed before we get into the section and + * mnt_vfs_ops is only read afterwards. + * + * Any thread transitioning the ops counter 0->1 does things in the opposite + * order - first bumps the count, posts a sequentially consistent fence and + * observes all CPUs not executing within the section. + * + * This provides an invariant that by the time the last CPU is observed not + * executing, everyone else entering will see the counter > 0 and exit. + * + * Note there is no barrier between vfs_ops and the rest of the code in the + * section. It is not necessary as the writer has to wait for everyone to drain + * before making any changes or only make changes safe while the section is + * executed. + */ +#define vfs_op_thread_entered(mp) ({ \ + MPASS(curthread->td_critnest > 0); \ + *(int *)zpcpu_get(mp->mnt_thread_in_ops_pcpu) == 1; \ +}) + +#define vfs_op_thread_enter(mp) ({ \ + bool _retval = true; \ + critical_enter(); \ + MPASS(!vfs_op_thread_entered(mp)); \ + *(int *)zpcpu_get(mp->mnt_thread_in_ops_pcpu) = 1; \ + atomic_thread_fence_seq_cst(); \ + if (__predict_false(mp->mnt_vfs_ops > 0)) { \ + vfs_op_thread_exit(mp); \ + _retval = false; \ + } \ + _retval; \ +}) + +#define vfs_op_thread_exit(mp) do { \ + MPASS(vfs_op_thread_entered(mp)); \ + atomic_thread_fence_rel(); \ + *(int *)zpcpu_get(mp->mnt_thread_in_ops_pcpu) = 0; \ + critical_exit(); \ +} while (0) + +#define vfs_mp_count_add_pcpu(mp, count, val) do { \ + MPASS(vfs_op_thread_entered(mp)); \ + (*(int *)zpcpu_get(mp->mnt_##count##_pcpu)) += val; \ +} while (0) + +#define vfs_mp_count_sub_pcpu(mp, count, val) do { \ + MPASS(vfs_op_thread_entered(mp)); \ + (*(int *)zpcpu_get(mp->mnt_##count##_pcpu)) -= val; \ +} while (0) + #else /* !_KERNEL */ #include <sys/cdefs.h> diff --git a/freebsd/sys/sys/mouse.h b/freebsd/sys/sys/mouse.h index e6ea68bc..a23e09ab 100644 --- a/freebsd/sys/sys/mouse.h +++ b/freebsd/sys/sys/mouse.h @@ -136,6 +136,7 @@ typedef struct synapticshw { int infoXupmm; int infoYupmm; int forcePad; + int topButtonPad; } synapticshw_t; /* iftype */ diff --git a/freebsd/sys/sys/pcpu.h b/freebsd/sys/sys/pcpu.h index 0ce30af7..2e1cdde3 100644 --- a/freebsd/sys/sys/pcpu.h +++ b/freebsd/sys/sys/pcpu.h @@ -39,6 +39,7 @@ #error "no assembler-serviceable parts inside" #endif +#include <sys/param.h> #include <sys/_cpuset.h> #include <sys/_lock.h> #include <sys/_mutex.h> @@ -84,7 +85,8 @@ extern uintptr_t dpcpu_off[]; /* struct _hack is to stop this from being used with the static keyword. */ #define DPCPU_DEFINE(t, n) \ struct _hack; t DPCPU_NAME(n) __section(DPCPU_SETNAME) __used -#if defined(KLD_MODULE) && (defined(__aarch64__) || defined(__riscv)) +#if defined(KLD_MODULE) && (defined(__aarch64__) || defined(__riscv) \ + || defined(__powerpc64__)) /* * On some architectures the compiler will use PC-relative load to * find the address of DPCPU data with the static keyword. We then @@ -183,6 +185,7 @@ struct pcpu { struct thread *pc_fpcurthread; /* Fp state owner */ struct thread *pc_deadthread; /* Zombie thread or NULL */ struct pcb *pc_curpcb; /* Current pcb */ + void *pc_sched; /* Scheduler state */ uint64_t pc_switchtime; /* cpu_ticks() at last csw */ int pc_switchticks; /* `ticks' at last csw */ u_int pc_cpuid; /* This cpu number */ @@ -221,10 +224,6 @@ extern struct cpuhead cpuhead; extern struct pcpu *cpuid_to_pcpu[]; #define curcpu PCPU_GET(cpuid) -#define curproc (curthread->td_proc) -#ifndef curthread -#define curthread PCPU_GET(curthread) -#endif #define curvidata PCPU_GET(vidata) #ifndef __rtems__ @@ -233,20 +232,12 @@ extern struct pcpu *cpuid_to_pcpu[]; #define UMA_PCPU_ALLOC_SIZE (PAGE_SIZE / 32) #endif /* __rtems__ */ -#ifndef __rtems__ -#ifdef CTASSERT -#if defined(__i386__) || defined(__amd64__) -/* Required for counters(9) to work on x86. */ -CTASSERT(sizeof(struct pcpu) == UMA_PCPU_ALLOC_SIZE); -#else -/* - * To minimize memory waste in per-cpu UMA zones, size of struct pcpu - * should be denominator of PAGE_SIZE. - */ -CTASSERT((PAGE_SIZE / sizeof(struct pcpu)) * sizeof(struct pcpu) == PAGE_SIZE); -#endif /* UMA_PCPU_ALLOC_SIZE && x86 */ -#endif /* CTASSERT */ -#endif /* __rtems__ */ +#include <machine/pcpu_aux.h> + +#ifndef curthread +#define curthread PCPU_GET(curthread) +#endif +#define curproc (curthread->td_proc) /* Accessor to elements allocated via UMA_ZONE_PCPU zone. */ static inline void * @@ -268,6 +259,18 @@ zpcpu_get_cpu(void *base, int cpu) } /* + * This operation is NOT atomic and does not post any barriers. + * If you use this the assumption is that the target CPU will not + * be modifying this variable. + * If you need atomicity use xchg. + * */ +#define zpcpu_replace_cpu(base, val, cpu) ({ \ + __typeof(val) _old = *(__typeof(val) *)zpcpu_get_cpu(base, cpu);\ + *(__typeof(val) *)zpcpu_get_cpu(base, cpu) = val; \ + _old; \ +}) + +/* * Machine dependent callouts. cpu_pcpu_init() is responsible for * initializing machine dependent fields of struct pcpu, and * db_show_mdpcpu() is responsible for handling machine dependent diff --git a/freebsd/sys/sys/proc.h b/freebsd/sys/sys/proc.h index 6c352059..592a8ef1 100644 --- a/freebsd/sys/sys/proc.h +++ b/freebsd/sys/sys/proc.h @@ -42,6 +42,9 @@ #include <sys/callout.h> /* For struct callout. */ #include <sys/event.h> /* For struct klist. */ +#ifdef _KERNEL +#include <sys/_eventhandler.h> +#endif #include <sys/condvar.h> #ifndef _KERNEL #include <sys/filedesc.h> @@ -178,6 +181,7 @@ struct filecaps; struct filemon; struct kaioinfo; struct kaudit_record; +struct kcov_info; struct kdtrace_proc; struct kdtrace_thread; struct mqueue_notifier; @@ -334,7 +338,9 @@ struct thread { void *td_su; /* (k) FFS SU private */ sbintime_t td_sleeptimo; /* (t) Sleep timeout. */ int td_rtcgen; /* (s) rtc_generation of abs. sleep */ + int td_errno; /* (k) Error from last syscall. */ size_t td_vslock_sz; /* (k) amount of vslock-ed space */ + struct kcov_info *td_kcov_info; /* (*) Kernel code coverage data */ #define td_endzero td_sigmask /* Copied during fork1() or create_thread(). */ @@ -359,7 +365,7 @@ struct thread { * or already have been set in the allocator, constructor, etc. */ struct pcb *td_pcb; /* (k) Kernel VA of pcb and kstack. */ - enum { + enum td_states { TDS_INACTIVE = 0x0, TDS_INHIBITED, TDS_CAN_RUN, @@ -387,8 +393,6 @@ struct thread { struct kaudit_record *td_ar; /* (k) Active audit record, if any. */ struct lpohead td_lprof[2]; /* (a) lock profiling objects. */ struct kdtrace_thread *td_dtrace; /* (*) DTrace-specific data. */ - int td_errno; /* Error returned by last syscall. */ - /* LP64 hole */ struct vnet *td_vnet; /* (k) Effective vnet. */ const char *td_vnet_lpush; /* (k) Debugging vnet push / pop. */ struct trapframe *td_intr_frame;/* (k) Frame of the current irq */ @@ -620,7 +624,7 @@ struct proc { int p_flag; /* (c) P_* flags. */ int p_flag2; /* (c) P2_* flags. */ - enum { + enum p_states { PRS_NEW = 0, /* In creation */ PRS_NORMAL, /* threads can be run. */ PRS_ZOMBIE @@ -664,7 +668,6 @@ struct proc { struct sigiolst p_sigiolst; /* (c) List of sigio sources. */ int p_sigparent; /* (c) Signal to parent on exit. */ int p_sig; /* (n) For core dump/debugger XXX. */ - u_long p_code; /* (n) For core dump/debugger XXX. */ u_int p_stops; /* (c) Stop event bitmask. */ u_int p_stype; /* (c) Stop event type. */ char p_step; /* (c) Process is stopped. */ @@ -804,6 +807,13 @@ struct proc { #define P2_AST_SU 0x00000008 /* Handles SU ast for kthreads. */ #define P2_PTRACE_FSTP 0x00000010 /* SIGSTOP from PT_ATTACH not yet handled. */ #define P2_TRAPCAP 0x00000020 /* SIGTRAP on ENOTCAPABLE */ +#define P2_ASLR_ENABLE 0x00000040 /* Force enable ASLR. */ +#define P2_ASLR_DISABLE 0x00000080 /* Force disable ASLR. */ +#define P2_ASLR_IGNSTART 0x00000100 /* Enable ASLR to consume sbrk area. */ +#define P2_PROTMAX_ENABLE 0x00000200 /* Force enable implied PROT_MAX. */ +#define P2_PROTMAX_DISABLE 0x00000400 /* Force disable implied PROT_MAX. */ +#define P2_STKGAP_DISABLE 0x00000800 /* Disable stack gap for MAP_STACK */ +#define P2_STKGAP_DISABLE_EXEC 0x00001000 /* Stack gap disabled after exec */ /* Flags protected by proctree_lock, kept in p_treeflags. */ #define P_TREE_ORPHANED 0x00000001 /* Reparented, on orphan list */ @@ -1014,7 +1024,6 @@ extern u_long pgrphash; extern struct sx allproc_lock; extern int allproc_gen; -extern struct sx zombproc_lock; extern struct sx proctree_lock; extern struct mtx ppeers_lock; extern struct mtx procid_lock; @@ -1032,15 +1041,16 @@ LIST_HEAD(proclist, proc); TAILQ_HEAD(procqueue, proc); TAILQ_HEAD(threadqueue, thread); extern struct proclist allproc; /* List of all processes. */ -extern struct proclist zombproc; /* List of zombie processes. */ extern struct proc *initproc, *pageproc; /* Process slots for init, pager. */ extern struct uma_zone *proc_zone; struct proc *pfind(pid_t); /* Find process by id. */ struct proc *pfind_any(pid_t); /* Find (zombie) process by id. */ +struct proc *pfind_any_locked(pid_t pid); /* Find process by id, locked. */ struct pgrp *pgfind(pid_t); /* Find process group by id. */ -struct proc *zpfind(pid_t); /* Find zombie process by id. */ +void pidhash_slockall(void); /* Shared lock all pid hash lists. */ +void pidhash_sunlockall(void); /* Shared unlock all pid hash lists. */ struct fork_req { int fr_flags; @@ -1123,11 +1133,13 @@ void proc_linkup(struct proc *p, struct thread *td); struct proc *proc_realparent(struct proc *child); void proc_reap(struct thread *td, struct proc *p, int *status, int options); void proc_reparent(struct proc *child, struct proc *newparent, bool set_oppid); +void proc_add_orphan(struct proc *child, struct proc *parent); void proc_set_traced(struct proc *p, bool stop); void proc_wkilled(struct proc *p); struct pstats *pstats_alloc(void); void pstats_fork(struct pstats *src, struct pstats *dst); void pstats_free(struct pstats *ps); +void proc_clear_orphan(struct proc *p); void reaper_abandon_children(struct proc *p, bool exiting); #ifndef __rtems__ int securelevel_ge(struct ucred *cr, int level); @@ -1158,9 +1170,12 @@ void userret(struct thread *, struct trapframe *); void cpu_exit(struct thread *); void exit1(struct thread *, int, int) __dead2; void cpu_copy_thread(struct thread *td, struct thread *td0); +bool cpu_exec_vmspace_reuse(struct proc *p, struct vm_map *map); int cpu_fetch_syscall_args(struct thread *td); void cpu_fork(struct thread *, struct proc *, struct thread *, int); void cpu_fork_kthread_handler(struct thread *, void (*)(void *), void *); +int cpu_procctl(struct thread *td, int idtype, id_t id, int com, + void *data); void cpu_set_syscall_retval(struct thread *, int); #ifndef __rtems__ void cpu_set_upcall(struct thread *, void (*)(void *), void *, @@ -1250,6 +1265,18 @@ void proc_id_set(int type, pid_t id); void proc_id_set_cond(int type, pid_t id); void proc_id_clear(int type, pid_t id); +EVENTHANDLER_LIST_DECLARE(process_ctor); +EVENTHANDLER_LIST_DECLARE(process_dtor); +EVENTHANDLER_LIST_DECLARE(process_init); +EVENTHANDLER_LIST_DECLARE(process_fini); +EVENTHANDLER_LIST_DECLARE(process_exit); +EVENTHANDLER_LIST_DECLARE(process_fork); +EVENTHANDLER_LIST_DECLARE(process_exec); + +EVENTHANDLER_LIST_DECLARE(thread_ctor); +EVENTHANDLER_LIST_DECLARE(thread_dtor); +EVENTHANDLER_LIST_DECLARE(thread_init); + #endif /* _KERNEL */ #endif /* !_SYS_PROC_H_ */ diff --git a/freebsd/sys/sys/random.h b/freebsd/sys/sys/random.h index aa6f6458..5dd7ee0b 100644 --- a/freebsd/sys/sys/random.h +++ b/freebsd/sys/sys/random.h @@ -37,31 +37,29 @@ struct uio; -#if defined(DEV_RANDOM) -u_int read_random(void *, u_int); -int read_random_uio(struct uio *, bool); -#else -static __inline int -read_random_uio(void *a __unused, u_int b __unused) -{ - return (0); -} #ifndef __rtems__ -static __inline u_int -read_random(void *a __unused, u_int b __unused) -{ - return (0); -} +void read_random(void *, u_int); #else /* __rtems__ */ #include <unistd.h> static __inline u_int read_random(void *ptr, u_int n) { + getentropy(ptr, n); return (n); } #endif /* __rtems__ */ -#endif +int read_random_uio(struct uio *, bool); +#ifndef __rtems__ +bool is_random_seeded(void); +#else /* __rtems__ */ +static __inline bool +is_random_seeded(void) +{ + + return (true); +} +#endif /* __rtems__ */ /* * Note: if you add or remove members of random_entropy_source, remember to @@ -97,6 +95,7 @@ enum random_entropy_source { RANDOM_PURE_BROADCOM, RANDOM_PURE_CCP, RANDOM_PURE_DARN, + RANDOM_PURE_TPM, ENTROPYSOURCE }; _Static_assert(ENTROPYSOURCE <= 32, @@ -104,9 +103,8 @@ _Static_assert(ENTROPYSOURCE <= 32, #define RANDOM_LEGACY_BOOT_ENTROPY_MODULE "/boot/entropy" #define RANDOM_CACHED_BOOT_ENTROPY_MODULE "boot_entropy_cache" -#define RANDOM_CACHED_SKIP_START 256 -#if defined(DEV_RANDOM) +#ifndef __rtems__ extern u_int hc_source_mask; void random_harvest_queue_(const void *, u_int, enum random_entropy_source); void random_harvest_fast_(const void *, u_int); @@ -163,6 +161,9 @@ void random_harvest_deregister_source(enum random_entropy_source); #define GRND_NONBLOCK 0x1 #define GRND_RANDOM 0x2 + +__BEGIN_DECLS ssize_t getrandom(void *buf, size_t buflen, unsigned int flags); +__END_DECLS #endif /* _SYS_RANDOM_H_ */ diff --git a/freebsd/sys/sys/refcount.h b/freebsd/sys/sys/refcount.h index 0cc4eb41..c21a0dca 100644 --- a/freebsd/sys/sys/refcount.h +++ b/freebsd/sys/sys/refcount.h @@ -2,7 +2,6 @@ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2005 John Baldwin <jhb@FreeBSD.org> - * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -31,82 +30,150 @@ #ifndef __SYS_REFCOUNT_H__ #define __SYS_REFCOUNT_H__ -#include <sys/limits.h> #include <machine/atomic.h> #ifdef _KERNEL #include <sys/systm.h> #else +#include <stdbool.h> #define KASSERT(exp, msg) /* */ #endif +#define REFCOUNT_WAITER (1U << 31) /* Refcount has waiter. */ +#define REFCOUNT_SATURATION_VALUE (3U << 29) + +#define REFCOUNT_SATURATED(val) (((val) & (1U << 30)) != 0) +#define REFCOUNT_COUNT(x) ((x) & ~REFCOUNT_WAITER) + +bool refcount_release_last(volatile u_int *count, u_int n, u_int old); +void refcount_sleep(volatile u_int *count, const char *wmesg, int prio); + +/* + * Attempt to handle reference count overflow and underflow. Force the counter + * to stay at the saturation value so that a counter overflow cannot trigger + * destruction of the containing object and instead leads to a less harmful + * memory leak. + */ static __inline void -refcount_init(volatile u_int *count, u_int value) +_refcount_update_saturated(volatile u_int *count) { +#ifdef INVARIANTS + panic("refcount %p wraparound", count); +#else + atomic_store_int((volatile int *)count, REFCOUNT_SATURATION_VALUE); +#endif +} +static __inline void +refcount_init(volatile u_int *count, u_int value) +{ + KASSERT(!REFCOUNT_SATURATED(value), + ("invalid initial refcount value %u", value)); *count = value; } static __inline void refcount_acquire(volatile u_int *count) { + u_int old; - KASSERT(*count < UINT_MAX, ("refcount %p overflowed", count)); - atomic_add_int((volatile int *)count, 1); + old = atomic_fetchadd_int((volatile int *)count, 1); + if (__predict_false(REFCOUNT_SATURATED(old))) + _refcount_update_saturated(count); } -static __inline int -refcount_release(volatile u_int *count) +static __inline void +refcount_acquiren(volatile u_int *count, u_int n) +{ + u_int old; + + KASSERT(n < REFCOUNT_SATURATION_VALUE / 2, + ("refcount_acquiren: n=%u too large", n)); + old = atomic_fetchadd_int((volatile int *)count, n); + if (__predict_false(REFCOUNT_SATURATED(old))) + _refcount_update_saturated(count); +} + +static __inline __result_use_check bool +refcount_acquire_checked(volatile u_int *count) +{ + u_int lcount; + + for (lcount = *count;;) { + if (__predict_false(REFCOUNT_SATURATED(lcount + 1))) + return (false); + if (__predict_true(atomic_fcmpset_int((volatile int *)count, + (int *)&lcount, lcount + 1) == 1)) + return (true); + } +} + +static __inline bool +refcount_releasen(volatile u_int *count, u_int n) { u_int old; + KASSERT(n < REFCOUNT_SATURATION_VALUE / 2, + ("refcount_releasen: n=%u too large", n)); + atomic_thread_fence_rel(); - old = atomic_fetchadd_int((volatile int *)count, -1); - KASSERT(old > 0, ("refcount %p is zero", count)); - if (old > 1) - return (0); - - /* - * Last reference. Signal the user to call the destructor. - * - * Ensure that the destructor sees all updates. The fence_rel - * at the start of the function synchronized with this fence. - */ - atomic_thread_fence_acq(); - return (1); + old = atomic_fetchadd_int((volatile int *)count, -n); + if (__predict_false(n >= REFCOUNT_COUNT(old) || + REFCOUNT_SATURATED(old))) + return (refcount_release_last(count, n, old)); + return (false); +} + +static __inline bool +refcount_release(volatile u_int *count) +{ + + return (refcount_releasen(count, 1)); +} + +static __inline void +refcount_wait(volatile u_int *count, const char *wmesg, int prio) +{ + + while (*count != 0) + refcount_sleep(count, wmesg, prio); } /* * This functions returns non-zero if the refcount was * incremented. Else zero is returned. */ -static __inline __result_use_check int +static __inline __result_use_check bool refcount_acquire_if_not_zero(volatile u_int *count) { u_int old; old = *count; for (;;) { - KASSERT(old < UINT_MAX, ("refcount %p overflowed", count)); - if (old == 0) - return (0); - if (atomic_fcmpset_int(count, &old, old + 1)) - return (1); + if (REFCOUNT_COUNT(old) == 0) + return (false); + if (__predict_false(REFCOUNT_SATURATED(old))) + return (true); + if (atomic_fcmpset_int((volatile int *)count, + (int *)&old, old + 1)) + return (true); } } -static __inline __result_use_check int +static __inline __result_use_check bool refcount_release_if_not_last(volatile u_int *count) { u_int old; old = *count; for (;;) { - KASSERT(old > 0, ("refcount %p is zero", count)); - if (old == 1) - return (0); - if (atomic_fcmpset_int(count, &old, old - 1)) - return (1); + if (REFCOUNT_COUNT(old) == 1) + return (false); + if (__predict_false(REFCOUNT_SATURATED(old))) + return (true); + if (atomic_fcmpset_int((volatile int *)count, + (int *)&old, old - 1)) + return (true); } } diff --git a/freebsd/sys/sys/rmlock.h b/freebsd/sys/sys/rmlock.h index 1dd2740c..ca098b2d 100644 --- a/freebsd/sys/sys/rmlock.h +++ b/freebsd/sys/sys/rmlock.h @@ -55,7 +55,6 @@ void rm_init_flags(struct rmlock *rm, const char *name, int opts); void rm_destroy(struct rmlock *rm); int rm_wowned(const struct rmlock *rm); void rm_sysinit(void *arg); -void rm_sysinit_flags(void *arg); void _rm_wlock_debug(struct rmlock *rm, const char *file, int line); void _rm_wunlock_debug(struct rmlock *rm, const char *file, int line); diff --git a/freebsd/sys/sys/rwlock.h b/freebsd/sys/sys/rwlock.h index 531f10d2..c1e5a45c 100644 --- a/freebsd/sys/sys/rwlock.h +++ b/freebsd/sys/sys/rwlock.h @@ -2,7 +2,6 @@ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2006 John Baldwin <jhb@FreeBSD.org> - * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -136,7 +135,6 @@ void _rw_init_flags(volatile uintptr_t *c, const char *name, int opts); void _rw_destroy(volatile uintptr_t *c); void rw_sysinit(void *arg); -void rw_sysinit_flags(void *arg); int _rw_wowned(const volatile uintptr_t *c); void _rw_wlock_cookie(volatile uintptr_t *c, const char *file, int line); int __rw_try_wlock_int(struct rwlock *rw LOCK_FILE_LINE_ARG_DEF); diff --git a/freebsd/sys/sys/sbuf.h b/freebsd/sys/sys/sbuf.h index 8e958cbe..10b59f36 100644 --- a/freebsd/sys/sys/sbuf.h +++ b/freebsd/sys/sys/sbuf.h @@ -52,11 +52,13 @@ struct sbuf { #define SBUF_AUTOEXTEND 0x00000001 /* automatically extend buffer */ #define SBUF_INCLUDENUL 0x00000002 /* nulterm byte is counted in len */ #define SBUF_DRAINTOEOR 0x00000004 /* use section 0 as drain EOR marker */ +#define SBUF_NOWAIT 0x00000008 /* Extend with non-blocking malloc */ #define SBUF_USRFLAGMSK 0x0000ffff /* mask of flags the user may specify */ #define SBUF_DYNAMIC 0x00010000 /* s_buf must be freed */ #define SBUF_FINISHED 0x00020000 /* set by sbuf_finish() */ #define SBUF_DYNSTRUCT 0x00080000 /* sbuf must be freed */ #define SBUF_INSECTION 0x00100000 /* set by sbuf_start_section() */ +#define SBUF_DRAINATEOL 0x00200000 /* drained contents ended in \n */ int s_flags; /* flags */ ssize_t s_sect_len; /* current length of section */ ssize_t s_rec_off; /* current record start offset */ @@ -90,6 +92,7 @@ int sbuf_printf(struct sbuf *, const char *, ...) __printflike(2, 3); int sbuf_vprintf(struct sbuf *, const char *, __va_list) __printflike(2, 0); +int sbuf_nl_terminate(struct sbuf *); int sbuf_putc(struct sbuf *, int); void sbuf_set_drain(struct sbuf *, sbuf_drain_func *, void *); int sbuf_trim(struct sbuf *); @@ -103,6 +106,8 @@ void sbuf_start_section(struct sbuf *, ssize_t *); ssize_t sbuf_end_section(struct sbuf *, ssize_t, size_t, int); void sbuf_hexdump(struct sbuf *, const void *, int, const char *, int); +int sbuf_count_drain(void *arg, const char *data, int len); +int sbuf_printf_drain(void *arg, const char *data, int len); void sbuf_putbuf(struct sbuf *); #ifdef _KERNEL diff --git a/freebsd/sys/sys/seq.h b/freebsd/sys/sys/seq.h deleted file mode 100644 index c5f00bcb..00000000 --- a/freebsd/sys/sys/seq.h +++ /dev/null @@ -1,156 +0,0 @@ -/*- - * Copyright (c) 2014 Mateusz Guzik <mjg@FreeBSD.org> - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD$ - */ - -#ifndef _SYS_SEQ_H_ -#define _SYS_SEQ_H_ - -#ifdef _KERNEL -#include <sys/systm.h> -#endif -#include <sys/types.h> - -/* - * seq_t may be included in structs visible to userspace - */ -typedef uint32_t seq_t; - -#ifdef _KERNEL - -/* - * seq allows readers and writers to work with a consistent snapshot. Modifying - * operations must be enclosed within a transaction delineated by - * seq_write_beg/seq_write_end. The trick works by having the writer increment - * the sequence number twice, at the beginning and end of the transaction. - * The reader detects that the sequence number has not changed between its start - * and end, and that the sequence number is even, to validate consistency. - * - * Some fencing (both hard fencing and compiler barriers) may be needed, - * depending on the cpu. Modern AMD cpus provide strong enough guarantees to not - * require any fencing by the reader or writer. - * - * Example usage: - * - * writers: - * lock_exclusive(&obj->lock); - * seq_write_begin(&obj->seq); - * obj->var1 = ...; - * obj->var2 = ...; - * seq_write_end(&obj->seq); - * unlock_exclusive(&obj->lock); - * - * readers: - * int var1, var2; - * seq_t seq; - * - * for (;;) { - * seq = seq_read(&obj->seq); - * var1 = obj->var1; - * var2 = obj->var2; - * if (seq_consistent(&obj->seq, seq)) - * break; - * } - * ..... - * - * Writers may not block or sleep in any way. - * - * There are 2 minor caveats in this implementation: - * - * 1. There is no guarantee of progress. That is, a large number of writers can - * interfere with the execution of the readers and cause the code to live-lock - * in a loop trying to acquire a consistent snapshot. - * - * 2. If the reader loops long enough, the counter may overflow and eventually - * wrap back to its initial value, fooling the reader into accepting the - * snapshot. Given that this needs 4 billion transactional writes across a - * single contended reader, it is unlikely to ever happen. - */ - -/* A hack to get MPASS macro */ -#include <sys/lock.h> - -#include <machine/cpu.h> - -static __inline bool -seq_in_modify(seq_t seqp) -{ - - return (seqp & 1); -} - -static __inline void -seq_write_begin(seq_t *seqp) -{ - - critical_enter(); - MPASS(!seq_in_modify(*seqp)); - *seqp += 1; - atomic_thread_fence_rel(); -} - -static __inline void -seq_write_end(seq_t *seqp) -{ - - atomic_store_rel_32(seqp, *seqp + 1); - MPASS(!seq_in_modify(*seqp)); - critical_exit(); -} - -static __inline seq_t -seq_read(const seq_t *seqp) -{ - seq_t ret; - - for (;;) { - ret = atomic_load_acq_32(__DECONST(seq_t *, seqp)); - if (seq_in_modify(ret)) { - cpu_spinwait(); - continue; - } - break; - } - - return (ret); -} - -static __inline seq_t -seq_consistent_nomb(const seq_t *seqp, seq_t oldseq) -{ - - return (*seqp == oldseq); -} - -static __inline seq_t -seq_consistent(const seq_t *seqp, seq_t oldseq) -{ - - atomic_thread_fence_acq(); - return (seq_consistent_nomb(seqp, oldseq)); -} - -#endif /* _KERNEL */ -#endif /* _SYS_SEQ_H_ */ diff --git a/freebsd/sys/sys/seqc.h b/freebsd/sys/sys/seqc.h new file mode 100644 index 00000000..00e4cc3c --- /dev/null +++ b/freebsd/sys/sys/seqc.h @@ -0,0 +1,107 @@ +/*- + * Copyright (c) 2014 Mateusz Guzik <mjg@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SYS_SEQC_H_ +#define _SYS_SEQC_H_ + +#ifdef _KERNEL +#include <sys/systm.h> +#endif +#include <sys/types.h> + +/* + * seqc_t may be included in structs visible to userspace + */ +typedef uint32_t seqc_t; + +#ifdef _KERNEL + +/* A hack to get MPASS macro */ +#include <sys/lock.h> + +#include <machine/cpu.h> + +static __inline bool +seqc_in_modify(seqc_t seqcp) +{ + + return (seqcp & 1); +} + +static __inline void +seqc_write_begin(seqc_t *seqcp) +{ + + critical_enter(); + MPASS(!seqc_in_modify(*seqcp)); + *seqcp += 1; + atomic_thread_fence_rel(); +} + +static __inline void +seqc_write_end(seqc_t *seqcp) +{ + + atomic_store_rel_int(seqcp, *seqcp + 1); + MPASS(!seqc_in_modify(*seqcp)); + critical_exit(); +} + +static __inline seqc_t +seqc_read(const seqc_t *seqcp) +{ + seqc_t ret; + + for (;;) { + ret = atomic_load_acq_int(__DECONST(seqc_t *, seqcp)); + if (__predict_false(seqc_in_modify(ret))) { + cpu_spinwait(); + continue; + } + break; + } + + return (ret); +} + +static __inline bool +seqc_consistent_nomb(const seqc_t *seqcp, seqc_t oldseqc) +{ + + return (*seqcp == oldseqc); +} + +static __inline bool +seqc_consistent(const seqc_t *seqcp, seqc_t oldseqc) +{ + + atomic_thread_fence_acq(); + return (seqc_consistent_nomb(seqcp, oldseqc)); +} + +#endif /* _KERNEL */ +#endif /* _SYS_SEQC_H_ */ diff --git a/freebsd/sys/sys/sglist.h b/freebsd/sys/sys/sglist.h index 5674416c..f11c74a4 100644 --- a/freebsd/sys/sys/sglist.h +++ b/freebsd/sys/sys/sglist.h @@ -57,6 +57,7 @@ struct sglist { struct bio; struct mbuf; +struct mbuf_ext_pgs; struct uio; static __inline void @@ -87,6 +88,9 @@ sglist_hold(struct sglist *sg) struct sglist *sglist_alloc(int nsegs, int mflags); int sglist_append(struct sglist *sg, void *buf, size_t len); int sglist_append_bio(struct sglist *sg, struct bio *bp); +int sglist_append_ext_pgs(struct sglist *sg, struct mbuf_ext_pgs *ext_pgs, + size_t off, size_t len); +int sglist_append_mb_ext_pgs(struct sglist *sg, struct mbuf *m); int sglist_append_mbuf(struct sglist *sg, struct mbuf *m0); int sglist_append_phys(struct sglist *sg, vm_paddr_t paddr, size_t len); @@ -101,6 +105,9 @@ struct sglist *sglist_build(void *buf, size_t len, int mflags); struct sglist *sglist_clone(struct sglist *sg, int mflags); int sglist_consume_uio(struct sglist *sg, struct uio *uio, size_t resid); int sglist_count(void *buf, size_t len); +int sglist_count_ext_pgs(struct mbuf_ext_pgs *ext_pgs, size_t off, + size_t len); +int sglist_count_mb_ext_pgs(struct mbuf *m); int sglist_count_vmpages(vm_page_t *m, size_t pgoff, size_t len); void sglist_free(struct sglist *sg); int sglist_join(struct sglist *first, struct sglist *second); diff --git a/freebsd/sys/sys/sleepqueue.h b/freebsd/sys/sys/sleepqueue.h index 07530e3b..8974869f 100644 --- a/freebsd/sys/sys/sleepqueue.h +++ b/freebsd/sys/sys/sleepqueue.h @@ -2,7 +2,6 @@ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004 John Baldwin <jhb@FreeBSD.org> - * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -85,6 +84,7 @@ struct thread; #define SLEEPQ_SX 0x03 /* Used by an sx lock. */ #define SLEEPQ_LK 0x04 /* Used by a lockmgr. */ #define SLEEPQ_INTERRUPTIBLE 0x100 /* Sleep is interruptible. */ +#define SLEEPQ_UNFAIR 0x200 /* Unfair wakeup order. */ void init_sleepqueues(void); int sleepq_abort(struct thread *td, int intrval); diff --git a/freebsd/sys/sys/slicer.h b/freebsd/sys/sys/slicer.h index 1565ecce..675b5acc 100644 --- a/freebsd/sys/sys/slicer.h +++ b/freebsd/sys/sys/slicer.h @@ -58,7 +58,7 @@ typedef int (*flash_slicer_t)(device_t dev, const char *provider, #define FLASH_SLICES_TYPE_SPI 2 #define FLASH_SLICES_TYPE_MMC 3 -/* Use NULL for deregistering a slicer */ +/* Use NULL and set force to true for deregistering a slicer */ void flash_register_slicer(flash_slicer_t slicer, u_int type, bool force); #endif /* _KERNEL */ diff --git a/freebsd/sys/sys/smp.h b/freebsd/sys/sys/smp.h index aa0c3119..22b7dcd5 100644 --- a/freebsd/sys/sys/smp.h +++ b/freebsd/sys/sys/smp.h @@ -168,8 +168,10 @@ extern cpuset_t logical_cpus_mask; #ifndef __rtems__ extern u_int mp_maxid; extern int mp_maxcpus; +extern int mp_ncores; extern int mp_ncpus; extern volatile int smp_started; +extern int smp_threads_per_core; extern cpuset_t all_cpus; extern cpuset_t cpuset_domain[MAXMEMDOM]; /* CPUs in each NUMA domain. */ diff --git a/freebsd/sys/sys/sockbuf.h b/freebsd/sys/sys/sockbuf.h index 3b716283..020c6bfe 100644 --- a/freebsd/sys/sys/sockbuf.h +++ b/freebsd/sys/sys/sockbuf.h @@ -50,6 +50,7 @@ #define SB_AUTOSIZE 0x800 /* automatically size socket buffer */ #define SB_STOP 0x1000 /* backpressure indicator */ #define SB_AIO_RUNNING 0x2000 /* AIO operation running */ +#define SB_TLS_IFNET 0x4000 /* has used / is using ifnet KTLS */ #define SBS_CANTSENDMORE 0x0010 /* can't send more data to peer */ #define SBS_CANTRCVMORE 0x0020 /* can't receive more data from peer */ @@ -63,6 +64,7 @@ #define SB_MAX (2*1024*1024) /* default for max chars in sockbuf */ +struct ktls_session; struct mbuf; struct sockaddr; struct socket; @@ -74,6 +76,7 @@ struct selinfo; * * Locking key to struct sockbuf: * (a) locked by SOCKBUF_LOCK(). + * (b) locked by sblock() */ struct sockbuf { struct mtx sb_mtx; /* sockbuf lock */ @@ -98,7 +101,9 @@ struct sockbuf { u_int sb_ctl; /* (a) non-data chars in buffer */ int sb_lowat; /* (a) low water mark */ sbintime_t sb_timeo; /* (a) timeout for read/write */ - short sb_flags; /* (a) flags, see below */ + uint64_t sb_tls_seqno; /* (a) TLS seqno */ + struct ktls_session *sb_tls_info; /* (a + b) TLS state */ + short sb_flags; /* (a) flags, see above */ int (*sb_upcall)(struct socket *, void *, int); /* (a) */ void *sb_upcallarg; /* (a) */ #ifndef __rtems__ diff --git a/freebsd/sys/sys/socketvar.h b/freebsd/sys/sys/socketvar.h index af85aa05..74fc1306 100644 --- a/freebsd/sys/sys/socketvar.h +++ b/freebsd/sys/sys/socketvar.h @@ -180,13 +180,13 @@ struct socket { /* * Socket state bits. * - * Historically, this bits were all kept in the so_state field. For - * locking reasons, they are now in multiple fields, as they are - * locked differently. so_state maintains basic socket state protected - * by the socket lock. so_qstate holds information about the socket - * accept queues. Each socket buffer also has a state field holding - * information relevant to that socket buffer (can't send, rcv). Many - * fields will be read without locks to improve performance and avoid + * Historically, these bits were all kept in the so_state field. + * They are now split into separate, lock-specific fields. + * so_state maintains basic socket state protected by the socket lock. + * so_qstate holds information about the socket accept queues. + * Each socket buffer also has a state field holding information + * relevant to that socket buffer (can't send, rcv). + * Many fields will be read without locks to improve performance and avoid * lock order issues. However, this approach must be used with caution. */ #define SS_NOFDREF 0x0001 /* no file table ref any more */ diff --git a/freebsd/sys/sys/sysctl.h b/freebsd/sys/sys/sysctl.h index a8562b8d..52f92265 100644 --- a/freebsd/sys/sys/sysctl.h +++ b/freebsd/sys/sys/sysctl.h @@ -218,6 +218,7 @@ int sysctl_handle_uma_zone_cur(SYSCTL_HANDLER_ARGS); int sysctl_msec_to_sbintime(SYSCTL_HANDLER_ARGS); int sysctl_usec_to_sbintime(SYSCTL_HANDLER_ARGS); +int sysctl_sec_to_timeval(SYSCTL_HANDLER_ARGS); int sysctl_dpcpu_int(SYSCTL_HANDLER_ARGS); int sysctl_dpcpu_long(SYSCTL_HANDLER_ARGS); @@ -391,6 +392,25 @@ TAILQ_HEAD(sysctl_ctx_list, sysctl_ctx_entry); NULL); \ }) +/* Oid for a constant '\0' terminated string. */ +#define SYSCTL_CONST_STRING(parent, nbr, name, access, arg, descr) \ + SYSCTL_OID(parent, nbr, name, CTLTYPE_STRING|(access), \ + __DECONST(char *, arg), 0, sysctl_handle_string, "A", descr); \ + CTASSERT(!(access & CTLFLAG_WR)); \ + CTASSERT(((access) & CTLTYPE) == 0 || \ + ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_STRING) + +#define SYSCTL_ADD_CONST_STRING(ctx, parent, nbr, name, access, arg, descr) \ +({ \ + char *__arg = __DECONST(char *, arg); \ + CTASSERT(!(access & CTLFLAG_WR)); \ + CTASSERT(((access) & CTLTYPE) == 0 || \ + ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_STRING); \ + sysctl_add_oid(ctx, parent, nbr, name, CTLTYPE_STRING|(access), \ + __arg, 0, sysctl_handle_string, "A", __DESCR(descr), \ + NULL); \ +}) + /* Oid for a bool. If ptr is NULL, val is returned. */ #define SYSCTL_NULL_BOOL_PTR ((bool *)NULL) #define SYSCTL_BOOL(parent, nbr, name, access, ptr, val, descr) \ @@ -875,6 +895,24 @@ TAILQ_HEAD(sysctl_ctx_list, sysctl_ctx_entry); NULL); \ }) +/* OID expressing a struct timeval as seconds */ +#define SYSCTL_TIMEVAL_SEC(parent, nbr, name, access, ptr, descr) \ + SYSCTL_OID(parent, nbr, name, \ + CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RD | (access), \ + (ptr), 0, sysctl_sec_to_timeval, "I", descr); \ + CTASSERT(((access) & CTLTYPE) == 0 || \ + ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_INT) +#define SYSCTL_ADD_TIMEVAL_SEC(ctx, parent, nbr, name, access, ptr, descr) \ +({ \ + struct timeval *__ptr = (ptr); \ + CTASSERT(((access) & CTLTYPE) == 0 || \ + ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_INT); \ + sysctl_add_oid(ctx, parent, nbr, name, \ + CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RD | (access), \ + __ptr, 0, sysctl_sec_to_timeval, "I", __DESCR(descr), \ + NULL); \ +}) + /* * A macro to generate a read-only sysctl to indicate the presence of optional * kernel features. @@ -888,7 +926,7 @@ TAILQ_HEAD(sysctl_ctx_list, sysctl_ctx_entry); /* * Top-level identifiers */ -#define CTL_UNSPEC 0 /* unused */ +#define CTL_SYSCTL 0 /* "magic" numbers */ #define CTL_KERN 1 /* "high kernel": proc, limits */ #define CTL_VM 2 /* virtual memory */ #define CTL_VFS 3 /* filesystem, mount type is next */ @@ -900,6 +938,17 @@ TAILQ_HEAD(sysctl_ctx_list, sysctl_ctx_entry); #define CTL_P1003_1B 9 /* POSIX 1003.1B */ /* + * CTL_SYSCTL identifiers + */ +#define CTL_SYSCTL_DEBUG 0 /* printf all nodes */ +#define CTL_SYSCTL_NAME 1 /* string name of OID */ +#define CTL_SYSCTL_NEXT 2 /* next OID */ +#define CTL_SYSCTL_NAME2OID 3 /* int array of name */ +#define CTL_SYSCTL_OIDFMT 4 /* OID's kind and format */ +#define CTL_SYSCTL_OIDDESCR 5 /* OID's description */ +#define CTL_SYSCTL_OIDLABEL 6 /* aggregation label */ + +/* * CTL_KERN identifiers */ #define KERN_OSTYPE 1 /* string: system version */ @@ -1140,6 +1189,9 @@ int sysctl_find_oid(int *name, u_int namelen, struct sysctl_oid **noid, void sysctl_wlock(void); void sysctl_wunlock(void); int sysctl_wire_old_buffer(struct sysctl_req *req, size_t len); +int kern___sysctlbyname(struct thread *td, const char *name, + size_t namelen, void *old, size_t *oldlenp, void *new, + size_t newlen, size_t *retval, int flags, bool inkernel); struct sbuf; struct sbuf *sbuf_new_for_sysctl(struct sbuf *, char *, int, diff --git a/freebsd/sys/sys/sysproto.h b/freebsd/sys/sys/sysproto.h index faa5a62f..8fb785b3 100644 --- a/freebsd/sys/sys/sysproto.h +++ b/freebsd/sys/sys/sysproto.h @@ -1,7 +1,7 @@ /* * System call prototypes. * - * DO NOT EDIT-- this file is automatically generated. + * DO NOT EDIT-- this file is automatically @generated. * $FreeBSD$ */ @@ -597,7 +597,7 @@ struct sysctl_args { char namelen_l_[PADL_(u_int)]; u_int namelen; char namelen_r_[PADR_(u_int)]; char old_l_[PADL_(void *)]; void * old; char old_r_[PADR_(void *)]; char oldlenp_l_[PADL_(size_t *)]; size_t * oldlenp; char oldlenp_r_[PADR_(size_t *)]; - char new_l_[PADL_(void *)]; void * new; char new_r_[PADR_(void *)]; + char new_l_[PADL_(const void *)]; const void * new; char new_r_[PADR_(const void *)]; char newlen_l_[PADL_(size_t)]; size_t newlen; char newlen_r_[PADR_(size_t)]; }; struct mlock_args { @@ -1837,6 +1837,28 @@ struct fhreadlink_args { char buf_l_[PADL_(char *)]; char * buf; char buf_r_[PADR_(char *)]; char bufsize_l_[PADL_(size_t)]; size_t bufsize; char bufsize_r_[PADR_(size_t)]; }; +struct funlinkat_args { + char dfd_l_[PADL_(int)]; int dfd; char dfd_r_[PADR_(int)]; + char path_l_[PADL_(const char *)]; const char * path; char path_r_[PADR_(const char *)]; + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; + char flag_l_[PADL_(int)]; int flag; char flag_r_[PADR_(int)]; +}; +struct copy_file_range_args { + char infd_l_[PADL_(int)]; int infd; char infd_r_[PADR_(int)]; + char inoffp_l_[PADL_(off_t *)]; off_t * inoffp; char inoffp_r_[PADR_(off_t *)]; + char outfd_l_[PADL_(int)]; int outfd; char outfd_r_[PADR_(int)]; + char outoffp_l_[PADL_(off_t *)]; off_t * outoffp; char outoffp_r_[PADR_(off_t *)]; + char len_l_[PADL_(size_t)]; size_t len; char len_r_[PADR_(size_t)]; + char flags_l_[PADL_(unsigned int)]; unsigned int flags; char flags_r_[PADR_(unsigned int)]; +}; +struct __sysctlbyname_args { + char name_l_[PADL_(const char *)]; const char * name; char name_r_[PADR_(const char *)]; + char namelen_l_[PADL_(size_t)]; size_t namelen; char namelen_r_[PADR_(size_t)]; + char old_l_[PADL_(void *)]; void * old; char old_r_[PADR_(void *)]; + char oldlenp_l_[PADL_(size_t *)]; size_t * oldlenp; char oldlenp_r_[PADR_(size_t *)]; + char new_l_[PADL_(void *)]; void * new; char new_r_[PADR_(void *)]; + char newlen_l_[PADL_(size_t)]; size_t newlen; char newlen_r_[PADR_(size_t)]; +}; int nosys(struct thread *, struct nosys_args *); void sys_sys_exit(struct thread *, struct sys_exit_args *); int sys_fork(struct thread *, struct fork_args *); @@ -2221,6 +2243,9 @@ int sys_getfhat(struct thread *, struct getfhat_args *); int sys_fhlink(struct thread *, struct fhlink_args *); int sys_fhlinkat(struct thread *, struct fhlinkat_args *); int sys_fhreadlink(struct thread *, struct fhreadlink_args *); +int sys_funlinkat(struct thread *, struct funlinkat_args *); +int sys_copy_file_range(struct thread *, struct copy_file_range_args *); +int sys___sysctlbyname(struct thread *, struct __sysctlbyname_args *); #ifdef COMPAT_43 @@ -3127,6 +3152,9 @@ int freebsd11_mknodat(struct thread *, struct freebsd11_mknodat_args *); #define SYS_AUE_fhlink AUE_NULL #define SYS_AUE_fhlinkat AUE_NULL #define SYS_AUE_fhreadlink AUE_NULL +#define SYS_AUE_funlinkat AUE_UNLINKAT +#define SYS_AUE_copy_file_range AUE_NULL +#define SYS_AUE___sysctlbyname AUE_SYSCTL #endif /* __rtems__ */ #undef PAD_ diff --git a/freebsd/sys/sys/systm.h b/freebsd/sys/sys/systm.h index 1aa57670..8bb44703 100644 --- a/freebsd/sys/sys/systm.h +++ b/freebsd/sys/sys/systm.h @@ -105,7 +105,8 @@ extern int vm_guest; /* Running as virtual machine guest? */ * Keep in sync with vm_guest_sysctl_names[]. */ enum VM_GUEST { VM_GUEST_NO = 0, VM_GUEST_VM, VM_GUEST_XEN, VM_GUEST_HV, - VM_GUEST_VMWARE, VM_GUEST_KVM, VM_GUEST_BHYVE, VM_LAST }; + VM_GUEST_VMWARE, VM_GUEST_KVM, VM_GUEST_BHYVE, VM_GUEST_VBOX, + VM_GUEST_PARALLELS, VM_LAST }; /* * These functions need to be declared before the KASSERT macro is invoked in @@ -270,6 +271,13 @@ void init_param2(long physpages); void init_static_kenv(char *, size_t); void tablefull(const char *); +/* + * Allocate per-thread "current" state in the linuxkpi + */ +extern int (*lkpi_alloc_current)(struct thread *, int); +int linux_alloc_current_noop(struct thread *, int); + + #if defined(KLD_MODULE) || defined(KTR_CRITICAL) || !defined(_KERNEL) || defined(GENOFFSET) #define critical_enter() critical_enter_KBI() #define critical_exit() critical_exit_KBI() @@ -612,6 +620,7 @@ int pause_sbt(const char *wmesg, sbintime_t sbt, sbintime_t pr, _sleep((chan), NULL, (pri), (wmesg), (bt), (pr), (flags)) void wakeup(void * chan); void wakeup_one(void * chan); +void wakeup_any(void * chan); /* * Common `struct cdev *' stuff are declared here to avoid #include poisoning @@ -687,6 +696,7 @@ void counted_warning(unsigned *counter, const char *msg); /* * APIs to manage deprecation and obsolescence. */ +#ifndef __rtems__ struct device; void _gone_in(int major, const char *msg); void _gone_in_dev(struct device *dev, int major, const char *msg); @@ -699,9 +709,10 @@ void _gone_in_dev(struct device *dev, int major, const char *msg); #endif #define gone_in(major, msg) __gone_ok(major, msg) _gone_in(major, msg) #define gone_in_dev(dev, major, msg) __gone_ok(major, msg) _gone_in_dev(dev, major, msg) -#define gone_by_fcp101_dev(dev) \ - gone_in_dev((dev), 13, \ - "see https://github.com/freebsd/fcp/blob/master/fcp-0101.md") +#else /* __rtems__ */ +#define gone_in(major, msg) do { } while (0) +#define gone_in_dev(dev, major, msg) do { } while (0) +#endif /* __rtems__ */ __NULLABILITY_PRAGMA_POP diff --git a/freebsd/sys/sys/tree.h b/freebsd/sys/sys/tree.h index 539afb42..345f7dec 100644 --- a/freebsd/sys/sys/tree.h +++ b/freebsd/sys/sys/tree.h @@ -125,7 +125,7 @@ struct type *name##_SPLAY_INSERT(struct name *, struct type *); \ struct type *name##_SPLAY_REMOVE(struct name *, struct type *); \ \ /* Finds the node with the same key as elm */ \ -static __inline struct type * \ +static __unused __inline struct type * \ name##_SPLAY_FIND(struct name *head, struct type *elm) \ { \ if (SPLAY_EMPTY(head)) \ @@ -136,7 +136,7 @@ name##_SPLAY_FIND(struct name *head, struct type *elm) \ return (NULL); \ } \ \ -static __inline struct type * \ +static __unused __inline struct type * \ name##_SPLAY_NEXT(struct name *head, struct type *elm) \ { \ name##_SPLAY(head, elm); \ @@ -150,7 +150,7 @@ name##_SPLAY_NEXT(struct name *head, struct type *elm) \ return (elm); \ } \ \ -static __inline struct type * \ +static __unused __inline struct type * \ name##_SPLAY_MIN_MAX(struct name *head, int val) \ { \ name##_SPLAY_MINMAX(head, val); \ diff --git a/freebsd/sys/sys/ucred.h b/freebsd/sys/sys/ucred.h index 37a93357..cf785c63 100644 --- a/freebsd/sys/sys/ucred.h +++ b/freebsd/sys/sys/ucred.h @@ -92,7 +92,10 @@ struct xucred { uid_t cr_uid; /* effective user id */ short cr_ngroups; /* number of groups */ gid_t cr_groups[XU_NGROUPS]; /* groups */ - void *_cr_unused1; /* compatibility with old ucred */ + union { + void *_cr_unused1; /* compatibility with old ucred */ + pid_t cr_pid; + }; #endif /* __rtems__ */ }; #define XUCRED_VERSION 0 @@ -121,12 +124,14 @@ void crfree(struct ucred *cr); struct ucred *crget(void); struct ucred *crhold(struct ucred *cr); void cru2x(struct ucred *cr, struct xucred *xcr); +void cru2xt(struct thread *td, struct xucred *xcr); void crsetgroups(struct ucred *cr, int n, gid_t *groups); int groupmember(gid_t gid, struct ucred *cred); #else /* __rtems__ */ #define crfree(cr) do { } while (0) #define crhold(cr) NULL #define cru2x(cr, xcr) do { } while (0) +#define cru2xt(td, xcr) do { } while (0) #define groupmember(gid, cred) 1 #endif /* __rtems__ */ #endif /* _KERNEL */ diff --git a/freebsd/sys/sys/user.h b/freebsd/sys/sys/user.h index 80716460..b2338f01 100644 --- a/freebsd/sys/sys/user.h +++ b/freebsd/sys/sys/user.h @@ -477,6 +477,7 @@ struct kinfo_file { #define KVME_FLAG_SUPER 0x00000008 #define KVME_FLAG_GROWS_UP 0x00000010 #define KVME_FLAG_GROWS_DOWN 0x00000020 +#define KVME_FLAG_USER_WIRED 0x00000040 #if defined(__amd64__) #define KINFO_OVMENTRY_SIZE 1168 @@ -613,6 +614,7 @@ int kern_proc_vmmap_out(struct proc *p, struct sbuf *sb, ssize_t maxlen, int flags); int vntype_to_kinfo(int vtype); +void pack_kinfo(struct kinfo_file *kif); #endif /* !_KERNEL */ #endif diff --git a/freebsd/sys/sys/vmmeter.h b/freebsd/sys/sys/vmmeter.h index 579d1675..3714a069 100644 --- a/freebsd/sys/sys/vmmeter.h +++ b/freebsd/sys/sys/vmmeter.h @@ -153,6 +153,8 @@ extern domainset_t vm_severe_domains; #define VM_CNT_INC(var) VM_CNT_ADD(var, 1) #define VM_CNT_FETCH(var) counter_u64_fetch(vm_cnt.var) +extern u_long vm_user_wire_count; + static inline void vm_wire_add(int cnt) { diff --git a/freebsd/sys/sys/vnode.h b/freebsd/sys/sys/vnode.h index 81f90bde..7c75adb4 100644 --- a/freebsd/sys/sys/vnode.h +++ b/freebsd/sys/sys/vnode.h @@ -59,6 +59,7 @@ enum vtype { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO, VBAD, VMARKER }; +enum vgetstate { VGET_HOLDCNT, VGET_USECOUNT }; /* * Each underlying filesystem allocates its own private area and hangs * it from v_data. If non-null, this area is freed in getnewvnode(). @@ -170,7 +171,8 @@ struct vnode { u_int v_iflag; /* i vnode flags (see below) */ u_int v_vflag; /* v vnode flags */ u_int v_mflag; /* l mnt-specific vnode flags */ - int v_writecount; /* v ref count of writers */ + int v_writecount; /* I ref count of writers or + (negative) text users */ u_int v_hash; enum vtype v_type; /* u vnode type */ }; @@ -233,6 +235,7 @@ struct xvnode { * VI_DOOMED is doubly protected by the interlock and vnode lock. Both * are required for writing but the status may be checked with either. */ +#define VI_TEXT_REF 0x0001 /* Text ref grabbed use ref */ #define VI_MOUNT 0x0020 /* Mount in progress */ #define VI_DOOMED 0x0080 /* This vnode is being recycled */ #define VI_FREE 0x0100 /* This vnode is on the freelist */ @@ -245,7 +248,6 @@ struct xvnode { #define VV_NOSYNC 0x0004 /* unlinked, stop syncing */ #define VV_ETERNALDEV 0x0008 /* device that is never destroyed */ #define VV_CACHEDLABEL 0x0010 /* Vnode has valid cached MAC label */ -#define VV_TEXT 0x0020 /* vnode is a pure text prototype */ #define VV_COPYONWRITE 0x0040 /* vnode is doing copy-on-write */ #define VV_SYSTEM 0x0080 /* vnode being used by kernel */ #define VV_PROCDEP 0x0100 /* vnode is process dependent */ @@ -652,14 +654,18 @@ int vcount(struct vnode *vp); #define vdropl(vp) _vdrop((vp), 1) void _vdrop(struct vnode *, bool); int vflush(struct mount *mp, int rootrefs, int flags, struct thread *td); -int vget(struct vnode *vp, int lockflag, struct thread *td); +int vget(struct vnode *vp, int flags, struct thread *td); +enum vgetstate vget_prep(struct vnode *vp); +int vget_finish(struct vnode *vp, int flags, enum vgetstate vs); void vgone(struct vnode *vp); #define vhold(vp) _vhold((vp), 0) #define vholdl(vp) _vhold((vp), 1) void _vhold(struct vnode *, bool); +void vholdnz(struct vnode *); void vinactive(struct vnode *, struct thread *); int vinvalbuf(struct vnode *vp, int save, int slpflag, int slptimeo); -int vtruncbuf(struct vnode *vp, struct ucred *cred, off_t length, +int vtruncbuf(struct vnode *vp, off_t length, int blksize); +void v_inval_buf_range(struct vnode *vp, daddr_t startlbn, daddr_t endlbn, int blksize); void vunref(struct vnode *); void vn_printf(struct vnode *vp, const char *fmt, ...) __printflike(2,3); @@ -669,8 +675,18 @@ int vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred); int vn_close(struct vnode *vp, int flags, struct ucred *file_cred, struct thread *td); +int vn_copy_file_range(struct vnode *invp, off_t *inoffp, + struct vnode *outvp, off_t *outoffp, size_t *lenp, + unsigned int flags, struct ucred *incred, struct ucred *outcred, + struct thread *fsize_td); void vn_finished_write(struct mount *mp); void vn_finished_secondary_write(struct mount *mp); +int vn_fsync_buf(struct vnode *vp, int waitfor); +int vn_generic_copy_file_range(struct vnode *invp, off_t *inoffp, + struct vnode *outvp, off_t *outoffp, size_t *lenp, + unsigned int flags, struct ucred *incred, struct ucred *outcred, + struct thread *fsize_td); +int vn_need_pageq_flush(struct vnode *vp); int vn_isdisk(struct vnode *vp, int *errp); int _vn_lock(struct vnode *vp, int flags, char *file, int line); #define vn_lock(vp, flags) _vn_lock(vp, flags, __FILE__, __LINE__) @@ -696,6 +712,8 @@ int vn_stat(struct vnode *vp, struct stat *sb, struct ucred *active_cred, int vn_start_write(struct vnode *vp, struct mount **mpp, int flags); int vn_start_secondary_write(struct vnode *vp, struct mount **mpp, int flags); +int vn_truncate_locked(struct vnode *vp, off_t length, bool sync, + struct ucred *cred); int vn_writechk(struct vnode *vp); int vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int *buflen, char *buf, struct thread *td); @@ -721,8 +739,12 @@ int vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize, VI_MTX(vp)) #define vn_rangelock_rlock(vp, start, end) \ rangelock_rlock(&(vp)->v_rl, (start), (end), VI_MTX(vp)) +#define vn_rangelock_tryrlock(vp, start, end) \ + rangelock_tryrlock(&(vp)->v_rl, (start), (end), VI_MTX(vp)) #define vn_rangelock_wlock(vp, start, end) \ rangelock_wlock(&(vp)->v_rl, (start), (end), VI_MTX(vp)) +#define vn_rangelock_trywlock(vp, start, end) \ + rangelock_trywlock(&(vp)->v_rl, (start), (end), VI_MTX(vp)) int vfs_cache_lookup(struct vop_lookup_args *ap); void vfs_timestamp(struct timespec *); @@ -736,6 +758,7 @@ int vop_stdfsync(struct vop_fsync_args *); int vop_stdgetwritemount(struct vop_getwritemount_args *); int vop_stdgetpages(struct vop_getpages_args *); int vop_stdinactive(struct vop_inactive_args *); +int vop_stdneed_inactive(struct vop_need_inactive_args *); int vop_stdislocked(struct vop_islocked_args *); int vop_stdkqfilter(struct vop_kqfilter_args *); int vop_stdlock(struct vop_lock1_args *); @@ -749,6 +772,7 @@ int vop_stdadvlock(struct vop_advlock_args *ap); int vop_stdadvlockasync(struct vop_advlockasync_args *ap); int vop_stdadvlockpurge(struct vop_advlockpurge_args *ap); int vop_stdallocate(struct vop_allocate_args *ap); +int vop_stdset_text(struct vop_set_text_args *ap); int vop_stdpathconf(struct vop_pathconf_args *); int vop_stdpoll(struct vop_poll_args *); int vop_stdvptocnp(struct vop_vptocnp_args *ap); @@ -793,14 +817,18 @@ int vop_sigdefer(struct vop_vector *vop, struct vop_generic_args *a); void vop_strategy_pre(void *a); void vop_lock_pre(void *a); void vop_lock_post(void *a, int rc); -void vop_unlock_post(void *a, int rc); void vop_unlock_pre(void *a); +void vop_unlock_post(void *a, int rc); +void vop_need_inactive_pre(void *a); +void vop_need_inactive_post(void *a, int rc); #else #define vop_strategy_pre(x) do { } while (0) #define vop_lock_pre(x) do { } while (0) #define vop_lock_post(x, y) do { } while (0) -#define vop_unlock_post(x, y) do { } while (0) #define vop_unlock_pre(x) do { } while (0) +#define vop_unlock_post(x, y) do { } while (0) +#define vop_need_inactive_pre(x) do { } while (0) +#define vop_need_inactive_post(x, y) do { } while (0) #endif void vop_rename_fail(struct vop_rename_args *ap); @@ -828,6 +856,36 @@ void vop_rename_fail(struct vop_rename_args *ap); #define VOP_LOCK(vp, flags) VOP_LOCK1(vp, flags, __FILE__, __LINE__) +#ifdef INVARIANTS +#define VOP_ADD_WRITECOUNT_CHECKED(vp, cnt) \ +do { \ + int error_; \ + \ + error_ = VOP_ADD_WRITECOUNT((vp), (cnt)); \ + VNASSERT(error_ == 0, (vp), ("VOP_ADD_WRITECOUNT returned %d", \ + error_)); \ +} while (0) +#define VOP_SET_TEXT_CHECKED(vp) \ +do { \ + int error_; \ + \ + error_ = VOP_SET_TEXT((vp)); \ + VNASSERT(error_ == 0, (vp), ("VOP_SET_TEXT returned %d", \ + error_)); \ +} while (0) +#define VOP_UNSET_TEXT_CHECKED(vp) \ +do { \ + int error_; \ + \ + error_ = VOP_UNSET_TEXT((vp)); \ + VNASSERT(error_ == 0, (vp), ("VOP_UNSET_TEXT returned %d", \ + error_)); \ +} while (0) +#else +#define VOP_ADD_WRITECOUNT_CHECKED(vp, cnt) VOP_ADD_WRITECOUNT((vp), (cnt)) +#define VOP_SET_TEXT_CHECKED(vp) VOP_SET_TEXT((vp)) +#define VOP_UNSET_TEXT_CHECKED(vp) VOP_UNSET_TEXT((vp)) +#endif void vput(struct vnode *vp); void vrele(struct vnode *vp); @@ -873,6 +931,7 @@ int vfs_kqfilter(struct vop_kqfilter_args *); void vfs_mark_atime(struct vnode *vp, struct ucred *cred); struct dirent; int vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off); +int vfs_emptydir(struct vnode *vp); int vfs_unixify_accmode(accmode_t *accmode); diff --git a/freebsd/sys/sys/watchdog.h b/freebsd/sys/sys/watchdog.h index 191456a4..3728d742 100644 --- a/freebsd/sys/sys/watchdog.h +++ b/freebsd/sys/sys/watchdog.h @@ -104,7 +104,7 @@ #ifdef _KERNEL -#include <sys/eventhandler.h> +#include <sys/_eventhandler.h> typedef void (*watchdog_fn)(void *, u_int, int *); diff --git a/freebsd/sys/vm/uma.h b/freebsd/sys/vm/uma.h index f05f686c..d7c41c85 100644 --- a/freebsd/sys/vm/uma.h +++ b/freebsd/sys/vm/uma.h @@ -50,8 +50,6 @@ struct uma_zone; /* Opaque type used as a handle to the zone */ typedef struct uma_zone * uma_zone_t; -void zone_drain(uma_zone_t); - /* * Item constructor * @@ -218,17 +216,6 @@ uma_zone_t uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor, uma_init zinit, uma_fini zfini, uma_zone_t master); /* - * Add a second master to a secondary zone. This provides multiple data - * backends for objects with the same size. Both masters must have - * compatible allocation flags. Presently, UMA_ZONE_MALLOC type zones are - * the only supported. - * - * Returns: - * Error on failure, 0 on success. - */ -int uma_zsecond_add(uma_zone_t zone, uma_zone_t master); - -/* * Create cache-only zones. * * This allows uma's per-cpu cache facilities to handle arbitrary @@ -286,20 +273,23 @@ uma_zone_t uma_zcache_create(char *name, int size, uma_ctor ctor, uma_dtor dtor, * NUMA aware Zone. Implements a best * effort first-touch policy. */ +#define UMA_ZONE_MINBUCKET 0x20000 /* Use smallest buckets. */ #endif /* __rtems__ */ -#define UMA_ZONE_NOBUCKETCACHE 0x20000 /* - * Don't cache full buckets. Limit - * UMA to per-cpu state. - */ /* * These flags are shared between the keg and zone. In zones wishing to add * new kegs these flags must be compatible. Some are determined based on * physical parameters of the request and may not be provided by the consumer. */ +#ifndef __rtems__ +#define UMA_ZONE_INHERIT \ + (UMA_ZONE_OFFPAGE | UMA_ZONE_MALLOC | UMA_ZONE_NOFREE | \ + UMA_ZONE_HASH | UMA_ZONE_VTOSLAB | UMA_ZONE_PCPU | UMA_ZONE_NUMA) +#else /* __rtems__ */ #define UMA_ZONE_INHERIT \ (UMA_ZONE_OFFPAGE | UMA_ZONE_MALLOC | UMA_ZONE_NOFREE | \ UMA_ZONE_HASH | UMA_ZONE_VTOSLAB | UMA_ZONE_PCPU) +#endif /* __rtems__ */ /* Definitions for align */ #define UMA_ALIGN_PTR (sizeof(void *) - 1) /* Alignment fit for ptr */ @@ -310,6 +300,8 @@ uma_zone_t uma_zcache_create(char *name, int size, uma_ctor ctor, uma_dtor dtor, #define UMA_ALIGN_CACHE (0 - 1) /* Cache line size align */ #define UMA_ALIGNOF(type) (_Alignof(type) - 1) /* Alignment fit for 'type' */ +#define UMA_ANYDOMAIN -1 /* Special value for domain search. */ + /* * Destroys an empty uma zone. If the zone is not empty uma complains loudly. * @@ -452,17 +444,18 @@ typedef void *(*uma_alloc)(uma_zone_t zone, vm_size_t size, int domain, typedef void (*uma_free)(void *item, vm_size_t size, uint8_t pflag); /* - * Reclaims unused memory for all zones + * Reclaims unused memory * * Arguments: - * None + * req Reclamation request type. * Returns: * None - * - * This should only be called by the page out daemon. */ - -void uma_reclaim(void); +#define UMA_RECLAIM_DRAIN 1 /* release bucket cache */ +#define UMA_RECLAIM_DRAIN_CPU 2 /* release bucket and per-CPU caches */ +#define UMA_RECLAIM_TRIM 3 /* trim bucket cache to WSS */ +void uma_reclaim(int req); +void uma_zone_reclaim(uma_zone_t, int req); /* * Sets the alignment mask to be used for all zones requesting cache @@ -514,6 +507,18 @@ int uma_zone_reserve_kva(uma_zone_t zone, int nitems); int uma_zone_set_max(uma_zone_t zone, int nitems); /* + * Sets a high limit on the number of items allowed in zone's bucket cache + * + * Arguments: + * zone The zone to limit + * nitems The requested upper limit on the number of items allowed + * + * Returns: + * int The effective value of nitems set + */ +int uma_zone_set_maxcache(uma_zone_t zone, int nitems); + +/* * Obtains the effective limit on the number of items in a zone * * Arguments: @@ -657,8 +662,8 @@ int uma_zone_exhausted_nolock(uma_zone_t zone); /* * Common UMA_ZONE_PCPU zones. */ +extern uma_zone_t pcpu_zone_int; extern uma_zone_t pcpu_zone_64; -extern uma_zone_t pcpu_zone_ptr; /* * Exported statistics structures to be used by user space monitoring tools. @@ -698,7 +703,8 @@ struct uma_type_header { uint64_t uth_frees; /* Zone: number of frees. */ uint64_t uth_fails; /* Zone: number of alloc failures. */ uint64_t uth_sleeps; /* Zone: number of alloc sleeps. */ - uint64_t _uth_reserved1[2]; /* Reserved. */ + uint64_t uth_xdomain; /* Zone: Number of cross domain frees. */ + uint64_t _uth_reserved1[1]; /* Reserved. */ }; struct uma_percpu_stat { diff --git a/freebsd/sys/vm/uma_core.c b/freebsd/sys/vm/uma_core.c index e4161510..4f2127cd 100644 --- a/freebsd/sys/vm/uma_core.c +++ b/freebsd/sys/vm/uma_core.c @@ -174,11 +174,18 @@ static char *bootmem; static int boot_pages; #endif /* __rtems__ */ -static struct sx uma_drain_lock; +static struct sx uma_reclaim_lock; -/* kmem soft limit. */ +/* + * kmem soft limit, initialized by uma_set_limit(). Ensure that early + * allocations don't trigger a wakeup of the reclaim thread. + */ static unsigned long uma_kmem_limit = LONG_MAX; -static volatile unsigned long uma_kmem_total; +SYSCTL_ULONG(_vm, OID_AUTO, uma_kmem_limit, CTLFLAG_RD, &uma_kmem_limit, 0, + "UMA kernel memory soft limit"); +static unsigned long uma_kmem_total; +SYSCTL_ULONG(_vm, OID_AUTO, uma_kmem_total, CTLFLAG_RD, &uma_kmem_total, 0, + "UMA kernel memory usage"); #ifndef __rtems__ /* Is the VM done starting up? */ @@ -237,6 +244,7 @@ struct uma_bucket_zone { #ifndef __rtems__ #define BUCKET_MAX BUCKET_SIZE(256) +#define BUCKET_MIN BUCKET_SIZE(4) #else /* __rtems__ */ #define BUCKET_MAX BUCKET_SIZE(128) #endif /* __rtems__ */ @@ -259,9 +267,12 @@ struct uma_bucket_zone bucket_zones[] = { /* * Flags and enumerations to be passed to internal functions. */ -enum zfreeskip { SKIP_NONE = 0, SKIP_DTOR, SKIP_FINI }; - -#define UMA_ANYDOMAIN -1 /* Special value for domain search. */ +enum zfreeskip { + SKIP_NONE = 0, + SKIP_CNT = 0x00000001, + SKIP_DTOR = 0x00010000, + SKIP_FINI = 0x00020000, +}; /* Prototypes.. */ @@ -286,10 +297,10 @@ static void page_free(void *, vm_size_t, uint8_t); #ifndef __rtems__ static void pcpu_page_free(void *, vm_size_t, uint8_t); #endif /* __rtems__ */ -static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int, int); +static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int, int, int); static void cache_drain(uma_zone_t); static void bucket_drain(uma_zone_t, uma_bucket_t); -static void bucket_cache_drain(uma_zone_t zone); +static void bucket_cache_reclaim(uma_zone_t zone, bool); static int keg_ctor(void *, int, void *, int); static void keg_dtor(void *, int, void *); static int zone_ctor(void *, int, void *, int); @@ -299,25 +310,23 @@ static void keg_small_init(uma_keg_t keg); static void keg_large_init(uma_keg_t keg); static void zone_foreach(void (*zfunc)(uma_zone_t)); static void zone_timeout(uma_zone_t zone); -static int hash_alloc(struct uma_hash *); +static int hash_alloc(struct uma_hash *, u_int); static int hash_expand(struct uma_hash *, struct uma_hash *); static void hash_free(struct uma_hash *hash); static void uma_timeout(void *); static void uma_startup3(void); static void *zone_alloc_item(uma_zone_t, void *, int, int); +static void *zone_alloc_item_locked(uma_zone_t, void *, int, int); static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip); static void bucket_enable(void); static void bucket_init(void); static uma_bucket_t bucket_alloc(uma_zone_t zone, void *, int); static void bucket_free(uma_zone_t zone, uma_bucket_t, void *); static void bucket_zone_drain(void); -static uma_bucket_t zone_alloc_bucket(uma_zone_t, void *, int, int); +static uma_bucket_t zone_alloc_bucket(uma_zone_t, void *, int, int, int); static uma_slab_t zone_fetch_slab(uma_zone_t, uma_keg_t, int, int); -#ifndef __rtems__ -static uma_slab_t zone_fetch_slab_multi(uma_zone_t, uma_keg_t, int, int); -#endif /* __rtems__ */ static void *slab_alloc_item(uma_keg_t keg, uma_slab_t slab); -static void slab_free_item(uma_keg_t keg, uma_slab_t slab, void *item); +static void slab_free_item(uma_zone_t zone, uma_slab_t slab, void *item); static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini, int align, uint32_t flags); static int zone_import(uma_zone_t, void **, int, int, int); @@ -520,37 +529,53 @@ bucket_zone_drain(void) struct uma_bucket_zone *ubz; for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) - zone_drain(ubz->ubz_zone); + uma_zone_reclaim(ubz->ubz_zone, UMA_RECLAIM_DRAIN); } +/* + * Attempt to satisfy an allocation by retrieving a full bucket from one of the + * zone's caches. + */ static uma_bucket_t -zone_try_fetch_bucket(uma_zone_t zone, uma_zone_domain_t zdom, const bool ws) +zone_fetch_bucket(uma_zone_t zone, uma_zone_domain_t zdom) { uma_bucket_t bucket; ZONE_LOCK_ASSERT(zone); - if ((bucket = LIST_FIRST(&zdom->uzd_buckets)) != NULL) { + if ((bucket = TAILQ_FIRST(&zdom->uzd_buckets)) != NULL) { MPASS(zdom->uzd_nitems >= bucket->ub_cnt); - LIST_REMOVE(bucket, ub_link); + TAILQ_REMOVE(&zdom->uzd_buckets, bucket, ub_link); zdom->uzd_nitems -= bucket->ub_cnt; - if (ws && zdom->uzd_imin > zdom->uzd_nitems) + if (zdom->uzd_imin > zdom->uzd_nitems) zdom->uzd_imin = zdom->uzd_nitems; + zone->uz_bkt_count -= bucket->ub_cnt; } return (bucket); } +/* + * Insert a full bucket into the specified cache. The "ws" parameter indicates + * whether the bucket's contents should be counted as part of the zone's working + * set. + */ static void zone_put_bucket(uma_zone_t zone, uma_zone_domain_t zdom, uma_bucket_t bucket, const bool ws) { ZONE_LOCK_ASSERT(zone); + KASSERT(zone->uz_bkt_count < zone->uz_bkt_max, ("%s: zone %p overflow", + __func__, zone)); - LIST_INSERT_HEAD(&zdom->uzd_buckets, bucket, ub_link); + if (ws) + TAILQ_INSERT_HEAD(&zdom->uzd_buckets, bucket, ub_link); + else + TAILQ_INSERT_TAIL(&zdom->uzd_buckets, bucket, ub_link); zdom->uzd_nitems += bucket->ub_cnt; if (ws && zdom->uzd_imax < zdom->uzd_nitems) zdom->uzd_imax = zdom->uzd_nitems; + zone->uz_bkt_count += bucket->ub_cnt; } static void @@ -573,15 +598,6 @@ zone_maxaction(uma_zone_t zone) taskqueue_enqueue(taskqueue_thread, &zone->uz_maxaction); } -static void -zone_foreach_keg(uma_zone_t zone, void (*kegfn)(uma_keg_t)) -{ - uma_klink_t klink; - - LIST_FOREACH(klink, &zone->uz_kegs, kl_link) - kegfn(klink->kl_keg); -} - /* * Routine called by timeout which is used to fire off some time interval * based calculations. (stats, hash size, etc.) @@ -616,7 +632,7 @@ zone_domain_update_wss(uma_zone_domain_t zdom) MPASS(zdom->uzd_imax >= zdom->uzd_imin); wss = zdom->uzd_imax - zdom->uzd_imin; zdom->uzd_imax = zdom->uzd_imin = zdom->uzd_nitems; - zdom->uzd_wss = (3 * wss + 2 * zdom->uzd_wss) / 5; + zdom->uzd_wss = (4 * wss + zdom->uzd_wss) / 5; } /* @@ -626,8 +642,10 @@ zone_domain_update_wss(uma_zone_domain_t zdom) * Returns nothing. */ static void -keg_timeout(uma_keg_t keg) +zone_timeout(uma_zone_t zone) { + uma_keg_t keg = zone->uz_keg; + u_int slabs; KEG_LOCK(keg); /* @@ -638,7 +656,8 @@ keg_timeout(uma_keg_t keg) * may be a little aggressive. Should I allow for two collisions max? */ if (keg->uk_flags & UMA_ZONE_HASH && - keg->uk_pages / keg->uk_ppera >= keg->uk_hash.uh_hashsize) { + (slabs = keg->uk_pages / keg->uk_ppera) > + keg->uk_hash.uh_hashsize) { struct uma_hash newhash; struct uma_hash oldhash; int ret; @@ -649,9 +668,8 @@ keg_timeout(uma_keg_t keg) * I have to do everything in stages and check for * races. */ - newhash = keg->uk_hash; KEG_UNLOCK(keg); - ret = hash_alloc(&newhash); + ret = hash_alloc(&newhash, 1 << fls(slabs)); KEG_LOCK(keg); if (ret) { if (hash_expand(&keg->uk_hash, &newhash)) { @@ -666,17 +684,9 @@ keg_timeout(uma_keg_t keg) } } KEG_UNLOCK(keg); -} - -static void -zone_timeout(uma_zone_t zone) -{ - int i; - - zone_foreach_keg(zone, &keg_timeout); ZONE_LOCK(zone); - for (i = 0; i < vm_ndomains; i++) + for (int i = 0; i < vm_ndomains; i++) zone_domain_update_wss(&zone->uz_domain[i]); ZONE_UNLOCK(zone); } @@ -692,16 +702,13 @@ zone_timeout(uma_zone_t zone) * 1 on success and 0 on failure. */ static int -hash_alloc(struct uma_hash *hash) +hash_alloc(struct uma_hash *hash, u_int size) { - int oldsize; - int alloc; + size_t alloc; - oldsize = hash->uh_hashsize; - - /* We're just going to go to a power of two greater */ - if (oldsize) { - hash->uh_hashsize = oldsize * 2; + KASSERT(powerof2(size), ("hash size must be power of 2")); + if (size > UMA_HASH_SIZE_INIT) { + hash->uh_hashsize = size; alloc = sizeof(hash->uh_slab_hash[0]) * hash->uh_hashsize; hash->uh_slab_hash = (struct slabhead *)malloc(alloc, M_UMAHASH, M_NOWAIT); @@ -738,8 +745,8 @@ static int hash_expand(struct uma_hash *oldhash, struct uma_hash *newhash) { uma_slab_t slab; - int hval; - int i; + u_int hval; + u_int idx; if (!newhash->uh_slab_hash) return (0); @@ -752,10 +759,10 @@ hash_expand(struct uma_hash *oldhash, struct uma_hash *newhash) * full rehash. */ - for (i = 0; i < oldhash->uh_hashsize; i++) - while (!SLIST_EMPTY(&oldhash->uh_slab_hash[i])) { - slab = SLIST_FIRST(&oldhash->uh_slab_hash[i]); - SLIST_REMOVE_HEAD(&oldhash->uh_slab_hash[i], us_hlink); + for (idx = 0; idx < oldhash->uh_hashsize; idx++) + while (!SLIST_EMPTY(&oldhash->uh_slab_hash[idx])) { + slab = SLIST_FIRST(&oldhash->uh_slab_hash[idx]); + SLIST_REMOVE_HEAD(&oldhash->uh_slab_hash[idx], us_hlink); hval = UMA_HASH(newhash, slab->us_data); SLIST_INSERT_HEAD(&newhash->uh_slab_hash[hval], slab, us_hlink); @@ -808,6 +815,13 @@ bucket_drain(uma_zone_t zone, uma_bucket_t bucket) for (i = 0; i < bucket->ub_cnt; i++) zone->uz_fini(bucket->ub_bucket[i], zone->uz_size); zone->uz_release(zone->uz_arg, bucket->ub_bucket, bucket->ub_cnt); + if (zone->uz_max_items > 0) { + ZONE_LOCK(zone); + zone->uz_items -= bucket->ub_cnt; + if (zone->uz_sleepers && zone->uz_items < zone->uz_max_items) + wakeup_one(zone); + ZONE_UNLOCK(zone); + } bucket->ub_cnt = 0; } @@ -838,22 +852,27 @@ cache_drain(uma_zone_t zone) * XXX: It would good to be able to assert that the zone is being * torn down to prevent improper use of cache_drain(). * - * XXX: We lock the zone before passing into bucket_cache_drain() as + * XXX: We lock the zone before passing into bucket_cache_reclaim() as * it is used elsewhere. Should the tear-down path be made special * there in some form? */ CPU_FOREACH(cpu) { cache = &zone->uz_cpu[cpu]; bucket_drain(zone, cache->uc_allocbucket); - bucket_drain(zone, cache->uc_freebucket); if (cache->uc_allocbucket != NULL) bucket_free(zone, cache->uc_allocbucket, NULL); + cache->uc_allocbucket = NULL; + bucket_drain(zone, cache->uc_freebucket); if (cache->uc_freebucket != NULL) bucket_free(zone, cache->uc_freebucket, NULL); - cache->uc_allocbucket = cache->uc_freebucket = NULL; + cache->uc_freebucket = NULL; + bucket_drain(zone, cache->uc_crossbucket); + if (cache->uc_crossbucket != NULL) + bucket_free(zone, cache->uc_crossbucket, NULL); + cache->uc_crossbucket = NULL; } ZONE_LOCK(zone); - bucket_cache_drain(zone); + bucket_cache_reclaim(zone, true); ZONE_UNLOCK(zone); } @@ -874,13 +893,13 @@ static void cache_drain_safe_cpu(uma_zone_t zone) { uma_cache_t cache; - uma_bucket_t b1, b2; + uma_bucket_t b1, b2, b3; int domain; if (zone->uz_flags & UMA_ZFLAG_INTERNAL) return; - b1 = b2 = NULL; + b1 = b2 = b3 = NULL; ZONE_LOCK(zone); critical_enter(); #ifndef __rtems__ @@ -906,12 +925,18 @@ cache_drain_safe_cpu(uma_zone_t zone) b2 = cache->uc_freebucket; cache->uc_freebucket = NULL; } + b3 = cache->uc_crossbucket; + cache->uc_crossbucket = NULL; critical_exit(); ZONE_UNLOCK(zone); if (b1) bucket_free(zone, b1, NULL); if (b2) bucket_free(zone, b2, NULL); + if (b3) { + bucket_drain(zone, b3); + bucket_free(zone, b3, NULL); + } } /* @@ -922,7 +947,7 @@ cache_drain_safe_cpu(uma_zone_t zone) * Zone lock must not be held on call this function. */ static void -cache_drain_safe(uma_zone_t zone) +pcpu_cache_drain_safe(uma_zone_t zone) { int cpu; @@ -951,22 +976,46 @@ cache_drain_safe(uma_zone_t zone) #endif /* __rtems__ */ /* - * Drain the cached buckets from a zone. Expects a locked zone on entry. + * Reclaim cached buckets from a zone. All buckets are reclaimed if the caller + * requested a drain, otherwise the per-domain caches are trimmed to either + * estimated working set size. */ static void -bucket_cache_drain(uma_zone_t zone) +bucket_cache_reclaim(uma_zone_t zone, bool drain) { uma_zone_domain_t zdom; uma_bucket_t bucket; + long target, tofree; int i; - /* - * Drain the bucket queues and free the buckets. - */ for (i = 0; i < vm_ndomains; i++) { zdom = &zone->uz_domain[i]; - while ((bucket = zone_try_fetch_bucket(zone, zdom, false)) != - NULL) { + + /* + * If we were asked to drain the zone, we are done only once + * this bucket cache is empty. Otherwise, we reclaim items in + * excess of the zone's estimated working set size. If the + * difference nitems - imin is larger than the WSS estimate, + * then the estimate will grow at the end of this interval and + * we ignore the historical average. + */ + target = drain ? 0 : lmax(zdom->uzd_wss, zdom->uzd_nitems - + zdom->uzd_imin); + while (zdom->uzd_nitems > target) { + bucket = TAILQ_LAST(&zdom->uzd_buckets, uma_bucketlist); + if (bucket == NULL) + break; + tofree = bucket->ub_cnt; + TAILQ_REMOVE(&zdom->uzd_buckets, bucket, ub_link); + zdom->uzd_nitems -= tofree; + + /* + * Shift the bounds of the current WSS interval to avoid + * perturbing the estimate. + */ + zdom->uzd_imax -= lmin(zdom->uzd_imax, tofree); + zdom->uzd_imin -= lmin(zdom->uzd_imin, tofree); + ZONE_UNLOCK(zone); bucket_drain(zone, bucket); bucket_free(zone, bucket, NULL); @@ -975,8 +1024,8 @@ bucket_cache_drain(uma_zone_t zone) } /* - * Shrink further bucket sizes. Price of single zone lock collision - * is probably lower then price of global cache drain. + * Shrink the zone bucket size to ensure that the per-CPU caches + * don't grow too large. */ if (zone->uz_count > zone->uz_count_min) zone->uz_count--; @@ -1076,7 +1125,7 @@ finished: } static void -zone_drain_wait(uma_zone_t zone, int waitok) +zone_reclaim(uma_zone_t zone, int waitok, bool drain) { /* @@ -1086,32 +1135,40 @@ zone_drain_wait(uma_zone_t zone, int waitok) * when it wakes up. */ ZONE_LOCK(zone); - while (zone->uz_flags & UMA_ZFLAG_DRAINING) { + while (zone->uz_flags & UMA_ZFLAG_RECLAIMING) { if (waitok == M_NOWAIT) goto out; msleep(zone, zone->uz_lockptr, PVM, "zonedrain", 1); } - zone->uz_flags |= UMA_ZFLAG_DRAINING; - bucket_cache_drain(zone); + zone->uz_flags |= UMA_ZFLAG_RECLAIMING; + bucket_cache_reclaim(zone, drain); ZONE_UNLOCK(zone); + /* * The DRAINING flag protects us from being freed while * we're running. Normally the uma_rwlock would protect us but we * must be able to release and acquire the right lock for each keg. */ - zone_foreach_keg(zone, &keg_drain); + keg_drain(zone->uz_keg); ZONE_LOCK(zone); - zone->uz_flags &= ~UMA_ZFLAG_DRAINING; + zone->uz_flags &= ~UMA_ZFLAG_RECLAIMING; wakeup(zone); out: ZONE_UNLOCK(zone); } -void +static void zone_drain(uma_zone_t zone) { - zone_drain_wait(zone, M_NOWAIT); + zone_reclaim(zone, M_NOWAIT, true); +} + +static void +zone_trim(uma_zone_t zone) +{ + + zone_reclaim(zone, M_NOWAIT, false); } /* @@ -1120,25 +1177,28 @@ zone_drain(uma_zone_t zone) * otherwise the keg will be left unlocked. * * Arguments: - * wait Shall we wait? + * flags Wait flags for the item initialization routine + * aflags Wait flags for the slab allocation * * Returns: * The slab that was allocated or NULL if there is no memory and the * caller specified M_NOWAIT. */ static uma_slab_t -keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int domain, int wait) +keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int domain, int flags, + int aflags) { uma_alloc allocf; uma_slab_t slab; unsigned long size; uint8_t *mem; - uint8_t flags; + uint8_t sflags; int i; KASSERT(domain >= 0 && domain < vm_ndomains, ("keg_alloc_slab: domain %d out of range", domain)); - mtx_assert(&keg->uk_lock, MA_OWNED); + KEG_LOCK_ASSERT(keg); + MPASS(zone->uz_lockptr == &keg->uk_lock); allocf = keg->uk_allocf; KEG_UNLOCK(keg); @@ -1146,7 +1206,7 @@ keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int domain, int wait) slab = NULL; mem = NULL; if (keg->uk_flags & UMA_ZONE_OFFPAGE) { - slab = zone_alloc_item(keg->uk_slabzone, NULL, domain, wait); + slab = zone_alloc_item(keg->uk_slabzone, NULL, domain, aflags); if (slab == NULL) goto out; } @@ -1159,16 +1219,16 @@ keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int domain, int wait) */ if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0) - wait |= M_ZERO; + aflags |= M_ZERO; else - wait &= ~M_ZERO; + aflags &= ~M_ZERO; if (keg->uk_flags & UMA_ZONE_NODUMP) - wait |= M_NODUMP; + aflags |= M_NODUMP; /* zone is passed for legacy reasons. */ size = keg->uk_ppera * PAGE_SIZE; - mem = allocf(zone, size, domain, &flags, wait); + mem = allocf(zone, size, domain, &sflags, aflags); if (mem == NULL) { if (keg->uk_flags & UMA_ZONE_OFFPAGE) zone_free_item(keg->uk_slabzone, slab, NULL, SKIP_NONE); @@ -1188,7 +1248,7 @@ keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int domain, int wait) slab->us_keg = keg; slab->us_data = mem; slab->us_freecount = keg->uk_ipers; - slab->us_flags = flags; + slab->us_flags = sflags; slab->us_domain = domain; BIT_FILL(SLAB_SETSIZE, &slab->us_free); #ifdef INVARIANTS @@ -1198,7 +1258,7 @@ keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int domain, int wait) if (keg->uk_init != NULL) { for (i = 0; i < keg->uk_ipers; i++) if (keg->uk_init(slab->us_data + (keg->uk_rsize * i), - keg->uk_size, wait) != 0) + keg->uk_size, flags) != 0) break; if (i != keg->uk_ipers) { keg_free_slab(keg, slab, i); @@ -1235,8 +1295,7 @@ startup_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag, void *mem; int pages; - keg = zone_first_keg(zone); - + keg = zone->uz_keg; /* * If we are in BOOT_BUCKETS or higher, than switch to real * allocator. Zones with page sized slabs switch at BOOT_PAGEALLOC. @@ -1351,9 +1410,9 @@ pcpu_page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag, zkva += PAGE_SIZE; } return ((void*)addr); - fail: +fail: TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) { - vm_page_unwire(p, PQ_NONE); + vm_page_unwire_noq(p); vm_page_free(p); } return (NULL); @@ -1381,7 +1440,7 @@ noobj_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags, uma_keg_t keg; TAILQ_INIT(&alloctail); - keg = zone_first_keg(zone); + keg = zone->uz_keg; npages = howmany(bytes, PAGE_SIZE); while (npages > 0) { @@ -1403,7 +1462,7 @@ noobj_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags, * exit. */ TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) { - vm_page_unwire(p, PQ_NONE); + vm_page_unwire_noq(p); vm_page_free(p); } return (NULL); @@ -1473,7 +1532,7 @@ pcpu_page_free(void *mem, vm_size_t size, uint8_t flags) for (curva = sva; curva < sva + size; curva += PAGE_SIZE) { paddr = pmap_kextract(curva); m = PHYS_TO_VM_PAGE(paddr); - vm_page_unwire(m, PQ_NONE); + vm_page_unwire_noq(m); vm_page_free(m); } pmap_qremove(sva, size >> PAGE_SHIFT); @@ -1613,8 +1672,6 @@ keg_large_init(uma_keg_t keg) { KASSERT(keg != NULL, ("Keg is null in keg_large_init")); - KASSERT((keg->uk_flags & UMA_ZFLAG_CACHEONLY) == 0, - ("keg_large_init: Cannot large-init a UMA_ZFLAG_CACHEONLY keg")); KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0, ("%s: Cannot large-init a UMA_ZONE_PCPU keg", __func__)); @@ -1799,7 +1856,7 @@ keg_ctor(void *mem, int size, void *udata, int flags) } if (keg->uk_flags & UMA_ZONE_HASH) - hash_alloc(&keg->uk_hash); + hash_alloc(&keg->uk_hash, 0); CTR5(KTR_UMA, "keg_ctor %p zone %s(%p) out %d free %d\n", keg, zone->uz_name, zone, @@ -1814,6 +1871,15 @@ keg_ctor(void *mem, int size, void *udata, int flags) return (0); } +static void +zone_alloc_counters(uma_zone_t zone) +{ + + zone->uz_allocs = counter_u64_alloc(M_WAITOK); + zone->uz_frees = counter_u64_alloc(M_WAITOK); + zone->uz_fails = counter_u64_alloc(M_WAITOK); +} + /* * Zone header ctor. This initializes all fields, locks, etc. * @@ -1827,30 +1893,42 @@ zone_ctor(void *mem, int size, void *udata, int flags) uma_zone_t zone = mem; uma_zone_t z; uma_keg_t keg; + int i; bzero(zone, size); zone->uz_name = arg->name; zone->uz_ctor = arg->ctor; zone->uz_dtor = arg->dtor; - zone->uz_slab = zone_fetch_slab; zone->uz_init = NULL; zone->uz_fini = NULL; - zone->uz_allocs = 0; - zone->uz_frees = 0; - zone->uz_fails = 0; zone->uz_sleeps = 0; + zone->uz_xdomain = 0; zone->uz_count = 0; zone->uz_count_min = 0; + zone->uz_count_max = BUCKET_MAX; zone->uz_flags = 0; zone->uz_warning = NULL; #ifndef __rtems__ /* The domain structures follow the cpu structures. */ zone->uz_domain = (struct uma_zone_domain *)&zone->uz_cpu[mp_ncpus]; #endif /* __rtems__ */ + zone->uz_bkt_max = ULONG_MAX; timevalclear(&zone->uz_ratecheck); - keg = arg->keg; - ZONE_LOCK_INIT(zone, (arg->flags & UMA_ZONE_MTXCLASS)); +#ifndef __rtems__ + if (__predict_true(booted == BOOT_RUNNING)) +#else /* __rtems__ */ + if (__predict_true(pcpu_zone_64 != NULL)) +#endif /* __rtems__ */ + zone_alloc_counters(zone); + else { + zone->uz_allocs = EARLY_COUNTER; + zone->uz_frees = EARLY_COUNTER; + zone->uz_fails = EARLY_COUNTER; + } + + for (i = 0; i < vm_ndomains; i++) + TAILQ_INIT(&zone->uz_domain[i].uzd_buckets); /* * This is a pure cache zone, no kegs. @@ -1864,6 +1942,7 @@ zone_ctor(void *mem, int size, void *udata, int flags) zone->uz_release = arg->release; zone->uz_arg = arg->arg; zone->uz_lockptr = &zone->uz_lock; + ZONE_LOCK_INIT(zone, (arg->flags & UMA_ZONE_MTXCLASS)); rw_wlock(&uma_rwlock); LIST_INSERT_HEAD(&uma_cachezones, zone, uz_link); rw_wunlock(&uma_rwlock); @@ -1876,6 +1955,7 @@ zone_ctor(void *mem, int size, void *udata, int flags) zone->uz_import = (uma_import)zone_import; zone->uz_release = (uma_release)zone_release; zone->uz_arg = zone; + keg = arg->keg; if (arg->flags & UMA_ZONE_SECONDARY) { KASSERT(arg->keg != NULL, ("Secondary zone on zero'd keg")); @@ -1914,12 +1994,7 @@ zone_ctor(void *mem, int size, void *udata, int flags) return (error); } - /* - * Link in the first keg. - */ - zone->uz_klink.kl_keg = keg; - LIST_INSERT_HEAD(&zone->uz_kegs, &zone->uz_klink, kl_link); - zone->uz_lockptr = &keg->uk_lock; + zone->uz_keg = keg; zone->uz_size = keg->uk_size; zone->uz_flags |= (keg->uk_flags & (UMA_ZONE_INHERIT | UMA_ZFLAG_INHERIT)); @@ -1938,9 +2013,14 @@ out: KASSERT((arg->flags & (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET)) != (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET), ("Invalid zone flag combination")); - if ((arg->flags & UMA_ZONE_MAXBUCKET) != 0) + if ((arg->flags & UMA_ZONE_MAXBUCKET) != 0) { zone->uz_count = BUCKET_MAX; - else if ((arg->flags & UMA_ZONE_NOBUCKET) != 0) +#ifndef __rtems__ + } else if ((arg->flags & UMA_ZONE_MINBUCKET) != 0) { + zone->uz_count = BUCKET_MIN; + zone->uz_count_max = BUCKET_MIN; +#endif /* __rtems__ */ + } else if ((arg->flags & UMA_ZONE_NOBUCKET) != 0) zone->uz_count = 0; else zone->uz_count = bucket_select(zone->uz_size); @@ -1985,12 +2065,10 @@ keg_dtor(void *arg, int size, void *udata) static void zone_dtor(void *arg, int size, void *udata) { - uma_klink_t klink; uma_zone_t zone; uma_keg_t keg; zone = (uma_zone_t)arg; - keg = zone_first_keg(zone); if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL)) cache_drain(zone); @@ -2004,27 +2082,22 @@ zone_dtor(void *arg, int size, void *udata) * released and then refilled before we * remove it... we dont care for now */ - zone_drain_wait(zone, M_WAITOK); + zone_reclaim(zone, M_WAITOK, true); /* - * Unlink all of our kegs. + * We only destroy kegs from non secondary/non cache zones. */ - while ((klink = LIST_FIRST(&zone->uz_kegs)) != NULL) { - klink->kl_keg = NULL; - LIST_REMOVE(klink, kl_link); - if (klink == &zone->uz_klink) - continue; - free(klink, M_TEMP); - } - /* - * We only destroy kegs from non secondary zones. - */ - if (keg != NULL && (zone->uz_flags & UMA_ZONE_SECONDARY) == 0) { + if ((zone->uz_flags & (UMA_ZONE_SECONDARY | UMA_ZFLAG_CACHE)) == 0) { + keg = zone->uz_keg; rw_wlock(&uma_rwlock); LIST_REMOVE(keg, uk_link); rw_wunlock(&uma_rwlock); zone_free_item(kegs, keg, NULL, SKIP_NONE); } - ZONE_LOCK_FINI(zone); + counter_u64_free(zone->uz_allocs); + counter_u64_free(zone->uz_frees); + counter_u64_free(zone->uz_fails); + if (zone->uz_lockptr == &zone->uz_lock) + ZONE_LOCK_FINI(zone); } /* @@ -2043,12 +2116,23 @@ zone_foreach(void (*zfunc)(uma_zone_t)) uma_keg_t keg; uma_zone_t zone; - rw_rlock(&uma_rwlock); + /* + * Before BOOT_RUNNING we are guaranteed to be single + * threaded, so locking isn't needed. Startup functions + * are allowed to use M_WAITOK. + */ +#ifndef __rtems__ + if (__predict_true(booted == BOOT_RUNNING)) +#endif /* __rtems__ */ + rw_rlock(&uma_rwlock); LIST_FOREACH(keg, &uma_kegs, uk_link) { LIST_FOREACH(zone, &keg->uk_zones, uz_link) zfunc(zone); } - rw_runlock(&uma_rwlock); +#ifndef __rtems__ + if (__predict_true(booted == BOOT_RUNNING)) +#endif /* __rtems__ */ + rw_runlock(&uma_rwlock); } #ifndef __rtems__ @@ -2235,7 +2319,7 @@ uma_startup2(void) printf("Entering %s with %d boot pages left\n", __func__, boot_pages); #endif booted = BOOT_BUCKETS; - sx_init(&uma_drain_lock, "umadrain"); + sx_init(&uma_reclaim_lock, "umareclaim"); bucket_enable(); } #endif /* __rtems__ */ @@ -2255,6 +2339,7 @@ uma_startup3(void) uma_skip_cnt = counter_u64_alloc(M_WAITOK); #endif /* __rtems__ */ #endif + zone_foreach(zone_alloc_counters); callout_init(&uma_callout, 1); callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL); #ifndef __rtems__ @@ -2302,6 +2387,11 @@ uma_zcreate(const char *name, size_t size, uma_ctor ctor, uma_dtor dtor, KASSERT(powerof2(align + 1), ("invalid zone alignment %d for \"%s\"", align, name)); + /* Sets all zones to a first-touch domain policy. */ +#ifdef UMA_FIRSTTOUCH + flags |= UMA_ZONE_NUMA; +#endif + /* This stuff is essential for the zone ctor */ memset(&args, 0, sizeof(args)); args.name = name; @@ -2333,7 +2423,7 @@ uma_zcreate(const char *name, size_t size, uma_ctor ctor, uma_dtor dtor, locked = false; } else { #endif /* __rtems__ */ - sx_slock(&uma_drain_lock); + sx_slock(&uma_reclaim_lock); #ifndef __rtems__ locked = true; } @@ -2342,7 +2432,7 @@ uma_zcreate(const char *name, size_t size, uma_ctor ctor, uma_dtor dtor, #ifndef __rtems__ if (locked) #endif /* __rtems__ */ - sx_sunlock(&uma_drain_lock); + sx_sunlock(&uma_reclaim_lock); return (res); } @@ -2358,7 +2448,7 @@ uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor, bool locked; #endif /* __rtems__ */ - keg = zone_first_keg(master); + keg = master->uz_keg; memset(&args, 0, sizeof(args)); args.name = name; args.size = keg->uk_size; @@ -2375,7 +2465,7 @@ uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor, locked = false; } else { #endif /* __rtems__ */ - sx_slock(&uma_drain_lock); + sx_slock(&uma_reclaim_lock); #ifndef __rtems__ locked = true; } @@ -2385,7 +2475,7 @@ uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor, #ifndef __rtems__ if (locked) #endif /* __rtems__ */ - sx_sunlock(&uma_drain_lock); + sx_sunlock(&uma_reclaim_lock); return (res); } @@ -2408,100 +2498,19 @@ uma_zcache_create(char *name, int size, uma_ctor ctor, uma_dtor dtor, args.release = zrelease; args.arg = arg; args.align = 0; - args.flags = flags; + args.flags = flags | UMA_ZFLAG_CACHE; return (zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK)); } -#ifndef __rtems__ -static void -zone_lock_pair(uma_zone_t a, uma_zone_t b) -{ - if (a < b) { - ZONE_LOCK(a); - mtx_lock_flags(b->uz_lockptr, MTX_DUPOK); - } else { - ZONE_LOCK(b); - mtx_lock_flags(a->uz_lockptr, MTX_DUPOK); - } -} - -static void -zone_unlock_pair(uma_zone_t a, uma_zone_t b) -{ - - ZONE_UNLOCK(a); - ZONE_UNLOCK(b); -} - -int -uma_zsecond_add(uma_zone_t zone, uma_zone_t master) -{ - uma_klink_t klink; - uma_klink_t kl; - int error; - - error = 0; - klink = malloc(sizeof(*klink), M_TEMP, M_WAITOK | M_ZERO); - - zone_lock_pair(zone, master); - /* - * zone must use vtoslab() to resolve objects and must already be - * a secondary. - */ - if ((zone->uz_flags & (UMA_ZONE_VTOSLAB | UMA_ZONE_SECONDARY)) - != (UMA_ZONE_VTOSLAB | UMA_ZONE_SECONDARY)) { - error = EINVAL; - goto out; - } - /* - * The new master must also use vtoslab(). - */ - if ((zone->uz_flags & UMA_ZONE_VTOSLAB) != UMA_ZONE_VTOSLAB) { - error = EINVAL; - goto out; - } - - /* - * The underlying object must be the same size. rsize - * may be different. - */ - if (master->uz_size != zone->uz_size) { - error = E2BIG; - goto out; - } - /* - * Put it at the end of the list. - */ - klink->kl_keg = zone_first_keg(master); - LIST_FOREACH(kl, &zone->uz_kegs, kl_link) { - if (LIST_NEXT(kl, kl_link) == NULL) { - LIST_INSERT_AFTER(kl, klink, kl_link); - break; - } - } - klink = NULL; - zone->uz_flags |= UMA_ZFLAG_MULTI; - zone->uz_slab = zone_fetch_slab_multi; - -out: - zone_unlock_pair(zone, master); - if (klink != NULL) - free(klink, M_TEMP); - - return (error); -} -#endif /* __rtems__ */ - - /* See uma.h */ void uma_zdestroy(uma_zone_t zone) { - sx_slock(&uma_drain_lock); + sx_slock(&uma_reclaim_lock); zone_free_item(zones, zone, NULL, SKIP_NONE); - sx_sunlock(&uma_drain_lock); + sx_sunlock(&uma_reclaim_lock); } void @@ -2555,7 +2564,7 @@ uma_zalloc_arg(uma_zone_t zone, void *udata, int flags) uma_bucket_t bucket; uma_cache_t cache; void *item; - int cpu, domain, lockfail; + int cpu, domain, lockfail, maxbucket; #ifdef INVARIANTS bool skipdbg; #endif @@ -2634,8 +2643,8 @@ zalloc_start: zone->uz_dtor != trash_dtor) && #endif zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) { - atomic_add_long(&zone->uz_fails, 1); - zone_free_item(zone, item, udata, SKIP_DTOR); + counter_u64_add(zone->uz_fails, 1); + zone_free_item(zone, item, udata, SKIP_DTOR | SKIP_CNT); return (NULL); } #ifdef INVARIANTS @@ -2670,18 +2679,17 @@ zalloc_start: if (bucket != NULL) bucket_free(zone, bucket, udata); + /* Short-circuit for zones without buckets and low memory. */ + if (zone->uz_count == 0 || bucketdisable) { + ZONE_LOCK(zone); #ifndef __rtems__ - if (zone->uz_flags & UMA_ZONE_NUMA) { - domain = PCPU_GET(domain); - if (VM_DOMAIN_EMPTY(domain)) - domain = UMA_ANYDOMAIN; - } else + if (zone->uz_flags & UMA_ZONE_NUMA) + domain = PCPU_GET(domain); + else #endif /* __rtems__ */ - domain = UMA_ANYDOMAIN; - - /* Short-circuit for zones without buckets and low memory. */ - if (zone->uz_count == 0 || bucketdisable) + domain = UMA_ANYDOMAIN; goto zalloc_item; + } /* * Attempt to retrieve the item from the per-CPU cache has failed, so @@ -2711,11 +2719,19 @@ zalloc_start: /* * Check the zone's cache of buckets. */ - if (domain == UMA_ANYDOMAIN) - zdom = &zone->uz_domain[0]; - else +#ifndef __rtems__ + if (zone->uz_flags & UMA_ZONE_NUMA) { + domain = PCPU_GET(domain); zdom = &zone->uz_domain[domain]; - if ((bucket = zone_try_fetch_bucket(zone, zdom, true)) != NULL) { + } else { +#endif /* __rtems__ */ + domain = UMA_ANYDOMAIN; + zdom = &zone->uz_domain[0]; +#ifndef __rtems__ + } +#endif /* __rtems__ */ + + if ((bucket = zone_fetch_bucket(zone, zdom)) != NULL) { KASSERT(bucket->ub_cnt != 0, ("uma_zalloc_arg: Returning an empty bucket.")); cache->uc_allocbucket = bucket; @@ -2729,8 +2745,17 @@ zalloc_start: * We bump the uz count when the cache size is insufficient to * handle the working set. */ - if (lockfail && zone->uz_count < BUCKET_MAX) + if (lockfail && zone->uz_count < zone->uz_count_max) zone->uz_count++; + + if (zone->uz_max_items > 0) { + if (zone->uz_items >= zone->uz_max_items) + goto zalloc_item; + maxbucket = MIN(zone->uz_count, + zone->uz_max_items - zone->uz_items); + zone->uz_items += maxbucket; + } else + maxbucket = zone->uz_count; ZONE_UNLOCK(zone); /* @@ -2738,11 +2763,18 @@ zalloc_start: * works we'll restart the allocation from the beginning and it * will use the just filled bucket. */ - bucket = zone_alloc_bucket(zone, udata, domain, flags); + bucket = zone_alloc_bucket(zone, udata, domain, flags, maxbucket); CTR3(KTR_UMA, "uma_zalloc: zone %s(%p) bucket zone returned %p", zone->uz_name, zone, bucket); + ZONE_LOCK(zone); if (bucket != NULL) { - ZONE_LOCK(zone); + if (zone->uz_max_items > 0 && bucket->ub_cnt < maxbucket) { + MPASS(zone->uz_items >= maxbucket - bucket->ub_cnt); + zone->uz_items -= maxbucket - bucket->ub_cnt; + if (zone->uz_sleepers > 0 && + zone->uz_items < zone->uz_max_items) + wakeup_one(zone); + } critical_enter(); cpu = curcpu; cache = &zone->uz_cpu[cpu]; @@ -2761,7 +2793,7 @@ zalloc_start: #endif /* __rtems__ */ cache->uc_allocbucket = bucket; zdom->uzd_imax += bucket->ub_cnt; - } else if ((zone->uz_flags & UMA_ZONE_NOBUCKETCACHE) != 0) { + } else if (zone->uz_bkt_count >= zone->uz_bkt_max) { critical_exit(); ZONE_UNLOCK(zone); bucket_drain(zone, bucket); @@ -2771,13 +2803,18 @@ zalloc_start: zone_put_bucket(zone, zdom, bucket, false); ZONE_UNLOCK(zone); goto zalloc_start; + } else if (zone->uz_max_items > 0) { + zone->uz_items -= maxbucket; + if (zone->uz_sleepers > 0 && + zone->uz_items + 1 < zone->uz_max_items) + wakeup_one(zone); } /* * We may not be able to get a bucket so return an actual item. */ zalloc_item: - item = zone_alloc_item(zone, udata, domain, flags); + item = zone_alloc_item_locked(zone, udata, domain, flags); return (item); } @@ -2822,6 +2859,7 @@ keg_first_slab(uma_keg_t keg, int domain, bool rr) KASSERT(domain >= 0 && domain < vm_ndomains, ("keg_first_slab: domain %d out of range", domain)); + KEG_LOCK_ASSERT(keg); slab = NULL; start = domain; @@ -2849,7 +2887,7 @@ keg_fetch_free_slab(uma_keg_t keg, int domain, bool rr, int flags) { uint32_t reserve; - mtx_assert(&keg->uk_lock, MA_OWNED); + KEG_LOCK_ASSERT(keg); reserve = (flags & M_USE_RESERVE) != 0 ? 0 : keg->uk_reserve; if (keg->uk_free <= reserve) @@ -2871,7 +2909,7 @@ keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int rdomain, const int flags) #ifndef __rtems__ restart: #endif /* __rtems__ */ - mtx_assert(&keg->uk_lock, MA_OWNED); + KEG_LOCK_ASSERT(keg); /* * Use the keg's policy if upper layers haven't already specified a @@ -2910,24 +2948,11 @@ restart: if (flags & M_NOVM) break; - if (keg->uk_maxpages && keg->uk_pages >= keg->uk_maxpages) { - keg->uk_flags |= UMA_ZFLAG_FULL; - /* - * If this is not a multi-zone, set the FULL bit. - * Otherwise slab_multi() takes care of it. - */ - if ((zone->uz_flags & UMA_ZFLAG_MULTI) == 0) { - zone->uz_flags |= UMA_ZFLAG_FULL; - zone_log_warning(zone); - zone_maxaction(zone); - } - if (flags & M_NOWAIT) - return (NULL); - zone->uz_sleeps++; - msleep(keg, &keg->uk_lock, PVM, "keglimit", 0); - continue; - } - slab = keg_alloc_slab(keg, zone, domain, aflags); + KASSERT(zone->uz_max_items == 0 || + zone->uz_items <= zone->uz_max_items, + ("%s: zone %p overflow", __func__, zone)); + + slab = keg_alloc_slab(keg, zone, domain, flags, aflags); /* * If we got a slab here it's safe to mark it partially used * and return. We assume that the caller is going to remove @@ -2973,7 +2998,7 @@ zone_fetch_slab(uma_zone_t zone, uma_keg_t keg, int domain, int flags) uma_slab_t slab; if (keg == NULL) { - keg = zone_first_keg(zone); + keg = zone->uz_keg; KEG_LOCK(keg); } @@ -2988,89 +3013,6 @@ zone_fetch_slab(uma_zone_t zone, uma_keg_t keg, int domain, int flags) return (NULL); } -#ifndef __rtems__ -/* - * uma_zone_fetch_slab_multi: Fetches a slab from one available keg. Returns - * with the keg locked. On NULL no lock is held. - * - * The last pointer is used to seed the search. It is not required. - */ -static uma_slab_t -zone_fetch_slab_multi(uma_zone_t zone, uma_keg_t last, int domain, int rflags) -{ - uma_klink_t klink; - uma_slab_t slab; - uma_keg_t keg; - int flags; - int empty; - int full; - - /* - * Don't wait on the first pass. This will skip limit tests - * as well. We don't want to block if we can find a provider - * without blocking. - */ - flags = (rflags & ~M_WAITOK) | M_NOWAIT; - /* - * Use the last slab allocated as a hint for where to start - * the search. - */ - if (last != NULL) { - slab = keg_fetch_slab(last, zone, domain, flags); - if (slab) - return (slab); - KEG_UNLOCK(last); - } - /* - * Loop until we have a slab incase of transient failures - * while M_WAITOK is specified. I'm not sure this is 100% - * required but we've done it for so long now. - */ - for (;;) { - empty = 0; - full = 0; - /* - * Search the available kegs for slabs. Be careful to hold the - * correct lock while calling into the keg layer. - */ - LIST_FOREACH(klink, &zone->uz_kegs, kl_link) { - keg = klink->kl_keg; - KEG_LOCK(keg); - if ((keg->uk_flags & UMA_ZFLAG_FULL) == 0) { - slab = keg_fetch_slab(keg, zone, domain, flags); - if (slab) - return (slab); - } - if (keg->uk_flags & UMA_ZFLAG_FULL) - full++; - else - empty++; - KEG_UNLOCK(keg); - } - if (rflags & (M_NOWAIT | M_NOVM)) - break; - flags = rflags; - /* - * All kegs are full. XXX We can't atomically check all kegs - * and sleep so just sleep for a short period and retry. - */ - if (full && !empty) { - ZONE_LOCK(zone); - zone->uz_flags |= UMA_ZFLAG_FULL; - zone->uz_sleeps++; - zone_log_warning(zone); - zone_maxaction(zone); - msleep(zone, zone->uz_lockptr, PVM, - "zonelimit", hz/100); - zone->uz_flags &= ~UMA_ZFLAG_FULL; - ZONE_UNLOCK(zone); - continue; - } - } - return (NULL); -} -#endif /* __rtems__ */ - static void * slab_alloc_item(uma_keg_t keg, uma_slab_t slab) { @@ -3079,7 +3021,7 @@ slab_alloc_item(uma_keg_t keg, uma_slab_t slab) uint8_t freei; MPASS(keg == slab->us_keg); - mtx_assert(&keg->uk_lock, MA_OWNED); + KEG_LOCK_ASSERT(keg); freei = BIT_FFS(SLAB_SETSIZE, &slab->us_free) - 1; BIT_CLR(SLAB_SETSIZE, freei, &slab->us_free); @@ -3111,7 +3053,7 @@ zone_import(uma_zone_t zone, void **bucket, int max, int domain, int flags) keg = NULL; /* Try to keep the buckets totally full */ for (i = 0; i < max; ) { - if ((slab = zone->uz_slab(zone, keg, domain, flags)) == NULL) + if ((slab = zone_fetch_slab(zone, keg, domain, flags)) == NULL) break; keg = slab->us_keg; #ifdef NUMA @@ -3146,21 +3088,25 @@ zone_import(uma_zone_t zone, void **bucket, int max, int domain, int flags) } static uma_bucket_t -zone_alloc_bucket(uma_zone_t zone, void *udata, int domain, int flags) +zone_alloc_bucket(uma_zone_t zone, void *udata, int domain, int flags, int max) { uma_bucket_t bucket; - int max; CTR1(KTR_UMA, "zone_alloc:_bucket domain %d)", domain); +#ifndef __rtems__ + /* Avoid allocs targeting empty domains. */ + if (domain != UMA_ANYDOMAIN && VM_DOMAIN_EMPTY(domain)) + domain = UMA_ANYDOMAIN; +#endif /* __rtems__ */ + /* Don't wait for buckets, preserve caller's NOVM setting. */ bucket = bucket_alloc(zone, udata, M_NOWAIT | (flags & M_NOVM)); if (bucket == NULL) return (NULL); - max = MIN(bucket->ub_entries, zone->uz_count); bucket->ub_cnt = zone->uz_import(zone->uz_arg, bucket->ub_bucket, - max, domain, flags); + MIN(max, bucket->ub_entries), domain, flags); /* * Initialize the memory if necessary. @@ -3189,7 +3135,7 @@ zone_alloc_bucket(uma_zone_t zone, void *udata, int domain, int flags) if (bucket->ub_cnt == 0) { bucket_free(zone, bucket, udata); - atomic_add_long(&zone->uz_fails, 1); + counter_u64_add(zone->uz_fails, 1); return (NULL); } @@ -3213,23 +3159,54 @@ zone_alloc_bucket(uma_zone_t zone, void *udata, int domain, int flags) static void * zone_alloc_item(uma_zone_t zone, void *udata, int domain, int flags) { + + ZONE_LOCK(zone); + return (zone_alloc_item_locked(zone, udata, domain, flags)); +} + +/* + * Returns with zone unlocked. + */ +static void * +zone_alloc_item_locked(uma_zone_t zone, void *udata, int domain, int flags) +{ void *item; #ifdef INVARIANTS bool skipdbg; #endif - item = NULL; + ZONE_LOCK_ASSERT(zone); -#ifndef __rtems__ - if (domain != UMA_ANYDOMAIN) { - /* avoid allocs targeting empty domains */ - if (VM_DOMAIN_EMPTY(domain)) - domain = UMA_ANYDOMAIN; + if (zone->uz_max_items > 0) { + if (zone->uz_items >= zone->uz_max_items) { + zone_log_warning(zone); + zone_maxaction(zone); + if (flags & M_NOWAIT) { + ZONE_UNLOCK(zone); + return (NULL); + } + zone->uz_sleeps++; + zone->uz_sleepers++; + while (zone->uz_items >= zone->uz_max_items) + mtx_sleep(zone, zone->uz_lockptr, PVM, + "zonelimit", 0); + zone->uz_sleepers--; + if (zone->uz_sleepers > 0 && + zone->uz_items + 1 < zone->uz_max_items) + wakeup_one(zone); + } + zone->uz_items++; } + ZONE_UNLOCK(zone); + +#ifndef __rtems__ + /* Avoid allocs targeting empty domains. */ + if (domain != UMA_ANYDOMAIN && VM_DOMAIN_EMPTY(domain)) + domain = UMA_ANYDOMAIN; #endif /* __rtems__ */ + if (zone->uz_import(zone->uz_arg, &item, 1, domain, flags) != 1) goto fail; - atomic_add_long(&zone->uz_allocs, 1); #ifdef INVARIANTS skipdbg = uma_dbg_zskip(zone, item); @@ -3242,7 +3219,7 @@ zone_alloc_item(uma_zone_t zone, void *udata, int domain, int flags) */ if (zone->uz_init != NULL) { if (zone->uz_init(item, zone->uz_size, flags) != 0) { - zone_free_item(zone, item, udata, SKIP_FINI); + zone_free_item(zone, item, udata, SKIP_FINI | SKIP_CNT); goto fail; } } @@ -3252,7 +3229,7 @@ zone_alloc_item(uma_zone_t zone, void *udata, int domain, int flags) zone->uz_dtor != trash_dtor) && #endif zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) { - zone_free_item(zone, item, udata, SKIP_DTOR); + zone_free_item(zone, item, udata, SKIP_DTOR | SKIP_CNT); goto fail; } #ifdef INVARIANTS @@ -3262,15 +3239,21 @@ zone_alloc_item(uma_zone_t zone, void *udata, int domain, int flags) if (flags & M_ZERO) uma_zero_item(item, zone); + counter_u64_add(zone->uz_allocs, 1); CTR3(KTR_UMA, "zone_alloc_item item %p from %s(%p)", item, zone->uz_name, zone); return (item); fail: + if (zone->uz_max_items > 0) { + ZONE_LOCK(zone); + zone->uz_items--; + ZONE_UNLOCK(zone); + } + counter_u64_add(zone->uz_fails, 1); CTR2(KTR_UMA, "zone_alloc_item failed from %s(%p)", zone->uz_name, zone); - atomic_add_long(&zone->uz_fails, 1); return (NULL); } @@ -3282,10 +3265,14 @@ uma_zfree_arg(uma_zone_t zone, void *item, void *udata) uma_bucket_t bucket; uma_zone_domain_t zdom; #ifndef __rtems__ - int cpu, domain, lockfail; + int cpu, domain; #else /* __rtems__ */ - int cpu, lockfail; + int cpu; #endif /* __rtems__ */ +#ifdef UMA_XDOMAIN + int itemdomain; +#endif + bool lockfail; #ifdef INVARIANTS bool skipdbg; #endif @@ -3333,9 +3320,14 @@ uma_zfree_arg(uma_zone_t zone, void *item, void *udata) * The race here is acceptable. If we miss it we'll just have to wait * a little longer for the limits to be reset. */ - if (zone->uz_flags & UMA_ZFLAG_FULL) + if (zone->uz_sleepers > 0) goto zfree_item; +#ifdef UMA_XDOMAIN + if ((zone->uz_flags & UMA_ZONE_NUMA) != 0) + itemdomain = _vm_phys_domain(pmap_kextract((vm_offset_t)item)); +#endif + /* * If possible, free to the per-CPU cache. There are two * requirements for safe access to the per-CPU cache: (1) the thread @@ -3353,14 +3345,28 @@ zfree_restart: cache = &zone->uz_cpu[cpu]; zfree_start: +#ifndef __rtems__ + domain = PCPU_GET(domain); +#endif /* __rtems__ */ +#ifdef UMA_XDOMAIN + if ((zone->uz_flags & UMA_ZONE_NUMA) == 0) + itemdomain = domain; +#endif /* * Try to free into the allocbucket first to give LIFO ordering * for cache-hot datastructures. Spill over into the freebucket * if necessary. Alloc will swap them if one runs dry. */ - bucket = cache->uc_allocbucket; - if (bucket == NULL || bucket->ub_cnt >= bucket->ub_entries) - bucket = cache->uc_freebucket; +#ifdef UMA_XDOMAIN + if (domain != itemdomain) { + bucket = cache->uc_crossbucket; + } else +#endif + { + bucket = cache->uc_allocbucket; + if (bucket == NULL || bucket->ub_cnt >= bucket->ub_entries) + bucket = cache->uc_freebucket; + } if (bucket != NULL && bucket->ub_cnt < bucket->ub_entries) { KASSERT(bucket->ub_bucket[bucket->ub_cnt] == NULL, ("uma_zfree: Freeing to non free bucket index.")); @@ -3383,34 +3389,80 @@ zfree_start: if (zone->uz_count == 0 || bucketdisable) goto zfree_item; - lockfail = 0; + lockfail = false; if (ZONE_TRYLOCK(zone) == 0) { /* Record contention to size the buckets. */ ZONE_LOCK(zone); - lockfail = 1; + lockfail = true; } critical_enter(); cpu = curcpu; +#ifndef __rtems__ + domain = PCPU_GET(domain); +#endif /* __rtems__ */ cache = &zone->uz_cpu[cpu]; - bucket = cache->uc_freebucket; +#ifdef UMA_XDOMAIN + if (domain != itemdomain) + bucket = cache->uc_crossbucket; + else +#endif + bucket = cache->uc_freebucket; if (bucket != NULL && bucket->ub_cnt < bucket->ub_entries) { ZONE_UNLOCK(zone); goto zfree_start; } - cache->uc_freebucket = NULL; +#ifdef UMA_XDOMAIN + if (domain != itemdomain) + cache->uc_crossbucket = NULL; + else +#endif + cache->uc_freebucket = NULL; /* We are no longer associated with this CPU. */ critical_exit(); +#ifdef UMA_XDOMAIN + if (domain != itemdomain) { + if (bucket != NULL) { + zone->uz_xdomain += bucket->ub_cnt; + if (vm_ndomains > 2 || + zone->uz_bkt_count >= zone->uz_bkt_max) { + ZONE_UNLOCK(zone); + bucket_drain(zone, bucket); + bucket_free(zone, bucket, udata); + } else { + zdom = &zone->uz_domain[itemdomain]; + zone_put_bucket(zone, zdom, bucket, true); + ZONE_UNLOCK(zone); + } + } else + ZONE_UNLOCK(zone); + bucket = bucket_alloc(zone, udata, M_NOWAIT); + if (bucket == NULL) + goto zfree_item; + critical_enter(); + cpu = curcpu; + cache = &zone->uz_cpu[cpu]; + if (cache->uc_crossbucket == NULL) { + cache->uc_crossbucket = bucket; + goto zfree_start; + } + critical_exit(); + bucket_free(zone, bucket, udata); + goto zfree_restart; + } +#endif + #ifndef __rtems__ if ((zone->uz_flags & UMA_ZONE_NUMA) != 0) { - domain = PCPU_GET(domain); - if (VM_DOMAIN_EMPTY(domain)) - domain = UMA_ANYDOMAIN; - } else + zdom = &zone->uz_domain[domain]; + } else { domain = 0; -#endif /* __rtems__ */ + zdom = &zone->uz_domain[0]; + } +#else /* __rtems__ */ zdom = &zone->uz_domain[0]; +#endif /* __rtems__ */ /* Can we throw this on the zone full list? */ if (bucket != NULL) { @@ -3418,9 +3470,9 @@ zfree_start: "uma_zfree: zone %s(%p) putting bucket %p on free list", zone->uz_name, zone, bucket); /* ub_cnt is pointing to the last free item */ - KASSERT(bucket->ub_cnt != 0, - ("uma_zfree: Attempting to insert an empty bucket onto the full list.\n")); - if ((zone->uz_flags & UMA_ZONE_NOBUCKETCACHE) != 0) { + KASSERT(bucket->ub_cnt == bucket->ub_entries, + ("uma_zfree: Attempting to insert not full bucket onto the full list.\n")); + if (zone->uz_bkt_count >= zone->uz_bkt_max) { ZONE_UNLOCK(zone); bucket_drain(zone, bucket); bucket_free(zone, bucket, udata); @@ -3433,7 +3485,7 @@ zfree_start: * We bump the uz count when the cache size is insufficient to * handle the working set. */ - if (lockfail && zone->uz_count < BUCKET_MAX) + if (lockfail && zone->uz_count < zone->uz_count_max) zone->uz_count++; ZONE_UNLOCK(zone); @@ -3468,8 +3520,6 @@ zfree_start: */ zfree_item: zone_free_item(zone, item, udata, SKIP_DTOR); - - return; } void @@ -3494,12 +3544,15 @@ uma_zfree_domain(uma_zone_t zone, void *item, void *udata) } static void -slab_free_item(uma_keg_t keg, uma_slab_t slab, void *item) +slab_free_item(uma_zone_t zone, uma_slab_t slab, void *item) { + uma_keg_t keg; uma_domain_t dom; uint8_t freei; - mtx_assert(&keg->uk_lock, MA_OWNED); + keg = zone->uz_keg; + MPASS(zone->uz_lockptr == &keg->uk_lock); + KEG_LOCK_ASSERT(keg); MPASS(keg == slab->us_keg); dom = &keg->uk_domain[slab->us_domain]; @@ -3529,11 +3582,9 @@ zone_release(uma_zone_t zone, void **bucket, int cnt) uma_slab_t slab; uma_keg_t keg; uint8_t *mem; - int clearfull; int i; - clearfull = 0; - keg = zone_first_keg(zone); + keg = zone->uz_keg; KEG_LOCK(keg); for (i = 0; i < cnt; i++) { item = bucket[i]; @@ -3547,37 +3598,11 @@ zone_release(uma_zone_t zone, void **bucket, int cnt) } } else { slab = vtoslab((vm_offset_t)item); - if (slab->us_keg != keg) { - KEG_UNLOCK(keg); - keg = slab->us_keg; - KEG_LOCK(keg); - } - } - slab_free_item(keg, slab, item); - if (keg->uk_flags & UMA_ZFLAG_FULL) { - if (keg->uk_pages < keg->uk_maxpages) { - keg->uk_flags &= ~UMA_ZFLAG_FULL; - clearfull = 1; - } - - /* - * We can handle one more allocation. Since we're - * clearing ZFLAG_FULL, wake up all procs blocked - * on pages. This should be uncommon, so keeping this - * simple for now (rather than adding count of blocked - * threads etc). - */ - wakeup(keg); + MPASS(slab->us_keg == keg); } + slab_free_item(zone, slab, item); } KEG_UNLOCK(keg); - if (clearfull) { - ZONE_LOCK(zone); - zone->uz_flags &= ~UMA_ZFLAG_FULL; - wakeup(zone); - ZONE_UNLOCK(zone); - } - } /* @@ -3614,34 +3639,60 @@ zone_free_item(uma_zone_t zone, void *item, void *udata, enum zfreeskip skip) if (skip < SKIP_FINI && zone->uz_fini) zone->uz_fini(item, zone->uz_size); - atomic_add_long(&zone->uz_frees, 1); zone->uz_release(zone->uz_arg, &item, 1); + + if (skip & SKIP_CNT) + return; + + counter_u64_add(zone->uz_frees, 1); + + if (zone->uz_max_items > 0) { + ZONE_LOCK(zone); + zone->uz_items--; + if (zone->uz_sleepers > 0 && + zone->uz_items < zone->uz_max_items) + wakeup_one(zone); + ZONE_UNLOCK(zone); + } } /* See uma.h */ int uma_zone_set_max(uma_zone_t zone, int nitems) { - uma_keg_t keg; + struct uma_bucket_zone *ubz; - keg = zone_first_keg(zone); - if (keg == NULL) - return (0); - KEG_LOCK(keg); -#ifdef __rtems__ -#ifdef SMP /* - * Ensure we have enough items to fill the per-processor caches. This - * is a heuristic approach and works not under all conditions. + * If limit is very low we may need to limit how + * much items are allowed in CPU caches. */ - nitems += 2 * BUCKET_MAX * (mp_maxid + 1); -#endif -#endif /* __rtems__ */ - keg->uk_maxpages = (nitems / keg->uk_ipers) * keg->uk_ppera; - if (keg->uk_maxpages * keg->uk_ipers < nitems) - keg->uk_maxpages += keg->uk_ppera; - nitems = (keg->uk_maxpages / keg->uk_ppera) * keg->uk_ipers; - KEG_UNLOCK(keg); + ubz = &bucket_zones[0]; + for (; ubz->ubz_entries != 0; ubz++) + if (ubz->ubz_entries * 2 * mp_ncpus > nitems) + break; + if (ubz == &bucket_zones[0]) + nitems = ubz->ubz_entries * 2 * mp_ncpus; + else + ubz--; + + ZONE_LOCK(zone); + zone->uz_count_max = zone->uz_count = ubz->ubz_entries; + if (zone->uz_count_min > zone->uz_count_max) + zone->uz_count_min = zone->uz_count_max; + zone->uz_max_items = nitems; + ZONE_UNLOCK(zone); + + return (nitems); +} + +/* See uma.h */ +int +uma_zone_set_maxcache(uma_zone_t zone, int nitems) +{ + + ZONE_LOCK(zone); + zone->uz_bkt_max = nitems; + ZONE_UNLOCK(zone); return (nitems); } @@ -3651,14 +3702,10 @@ int uma_zone_get_max(uma_zone_t zone) { int nitems; - uma_keg_t keg; - keg = zone_first_keg(zone); - if (keg == NULL) - return (0); - KEG_LOCK(keg); - nitems = (keg->uk_maxpages / keg->uk_ppera) * keg->uk_ipers; - KEG_UNLOCK(keg); + ZONE_LOCK(zone); + nitems = zone->uz_max_items; + ZONE_UNLOCK(zone); return (nitems); } @@ -3691,10 +3738,11 @@ uma_zone_get_cur(uma_zone_t zone) u_int i; ZONE_LOCK(zone); - nitems = zone->uz_allocs - zone->uz_frees; + nitems = counter_u64_fetch(zone->uz_allocs) - + counter_u64_fetch(zone->uz_frees); CPU_FOREACH(i) { /* - * See the comment in sysctl_vm_zone_stats() regarding the + * See the comment in uma_vm_zone_stats() regarding the * safety of accessing the per-cpu caches. With the zone lock * held, it is safe, but can potentially result in stale data. */ @@ -3712,8 +3760,7 @@ uma_zone_set_init(uma_zone_t zone, uma_init uminit) { uma_keg_t keg; - keg = zone_first_keg(zone); - KASSERT(keg != NULL, ("uma_zone_set_init: Invalid zone type")); + KEG_GET(zone, keg); KEG_LOCK(keg); KASSERT(keg->uk_pages == 0, ("uma_zone_set_init on non-empty keg")); @@ -3727,8 +3774,7 @@ uma_zone_set_fini(uma_zone_t zone, uma_fini fini) { uma_keg_t keg; - keg = zone_first_keg(zone); - KASSERT(keg != NULL, ("uma_zone_set_fini: Invalid zone type")); + KEG_GET(zone, keg); KEG_LOCK(keg); KASSERT(keg->uk_pages == 0, ("uma_zone_set_fini on non-empty keg")); @@ -3742,7 +3788,7 @@ uma_zone_set_zinit(uma_zone_t zone, uma_init zinit) { ZONE_LOCK(zone); - KASSERT(zone_first_keg(zone)->uk_pages == 0, + KASSERT(zone->uz_keg->uk_pages == 0, ("uma_zone_set_zinit on non-empty keg")); zone->uz_init = zinit; ZONE_UNLOCK(zone); @@ -3754,7 +3800,7 @@ uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini) { ZONE_LOCK(zone); - KASSERT(zone_first_keg(zone)->uk_pages == 0, + KASSERT(zone->uz_keg->uk_pages == 0, ("uma_zone_set_zfini on non-empty keg")); zone->uz_fini = zfini; ZONE_UNLOCK(zone); @@ -3767,7 +3813,7 @@ uma_zone_set_freef(uma_zone_t zone, uma_free freef) { uma_keg_t keg; - keg = zone_first_keg(zone); + KEG_GET(zone, keg); KASSERT(keg != NULL, ("uma_zone_set_freef: Invalid zone type")); KEG_LOCK(keg); keg->uk_freef = freef; @@ -3781,7 +3827,7 @@ uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf) { uma_keg_t keg; - keg = zone_first_keg(zone); + KEG_GET(zone, keg); KEG_LOCK(keg); keg->uk_allocf = allocf; KEG_UNLOCK(keg); @@ -3793,14 +3839,10 @@ uma_zone_reserve(uma_zone_t zone, int items) { uma_keg_t keg; - keg = zone_first_keg(zone); - if (keg == NULL) - return; + KEG_GET(zone, keg); KEG_LOCK(keg); keg->uk_reserve = items; KEG_UNLOCK(keg); - - return; } #ifndef __rtems__ @@ -3812,11 +3854,9 @@ uma_zone_reserve_kva(uma_zone_t zone, int count) vm_offset_t kva; u_int pages; - keg = zone_first_keg(zone); - if (keg == NULL) - return (0); - pages = count / keg->uk_ipers; + KEG_GET(zone, keg); + pages = count / keg->uk_ipers; if (pages * keg->uk_ipers < count) pages++; pages *= keg->uk_ppera; @@ -3831,17 +3871,19 @@ uma_zone_reserve_kva(uma_zone_t zone, int count) return (0); } else kva = 0; - KEG_LOCK(keg); + + ZONE_LOCK(zone); + MPASS(keg->uk_kva == 0); keg->uk_kva = kva; keg->uk_offset = 0; - keg->uk_maxpages = pages; + zone->uz_max_items = pages * keg->uk_ipers; #ifdef UMA_MD_SMALL_ALLOC keg->uk_allocf = (keg->uk_ppera > 1) ? noobj_alloc : uma_small_alloc; #else keg->uk_allocf = noobj_alloc; #endif keg->uk_flags |= UMA_ZONE_NOFREE; - KEG_UNLOCK(keg); + ZONE_UNLOCK(zone); return (1); } @@ -3854,46 +3896,65 @@ uma_prealloc(uma_zone_t zone, int items) uma_domain_t dom; uma_slab_t slab; uma_keg_t keg; - int domain, flags, slabs; + int aflags, domain, slabs; - keg = zone_first_keg(zone); - if (keg == NULL) - return; + KEG_GET(zone, keg); KEG_LOCK(keg); slabs = items / keg->uk_ipers; if (slabs * keg->uk_ipers < items) slabs++; - flags = M_WAITOK; - vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain, &flags); while (slabs-- > 0) { - slab = keg_alloc_slab(keg, zone, domain, flags); - if (slab == NULL) - return; - MPASS(slab->us_keg == keg); - dom = &keg->uk_domain[slab->us_domain]; - LIST_INSERT_HEAD(&dom->ud_free_slab, slab, us_link); - if (vm_domainset_iter_policy(&di, &domain) != 0) - break; + aflags = M_NOWAIT; + vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain, + &aflags); + for (;;) { + slab = keg_alloc_slab(keg, zone, domain, M_WAITOK, + aflags); + if (slab != NULL) { + MPASS(slab->us_keg == keg); + dom = &keg->uk_domain[slab->us_domain]; + LIST_INSERT_HEAD(&dom->ud_free_slab, slab, + us_link); + break; + } + KEG_LOCK(keg); + if (vm_domainset_iter_policy(&di, &domain) != 0) { + KEG_UNLOCK(keg); + vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask); + KEG_LOCK(keg); + } + } } KEG_UNLOCK(keg); } #endif /* __rtems__ */ /* See uma.h */ -static void -uma_reclaim_locked(bool kmem_danger) +void +uma_reclaim(int req) { CTR0(KTR_UMA, "UMA: vm asked us to release pages!"); - sx_assert(&uma_drain_lock, SA_XLOCKED); + sx_xlock(&uma_reclaim_lock); bucket_enable(); - zone_foreach(zone_drain); -#ifndef __rtems__ - if (vm_page_count_min() || kmem_danger) { - cache_drain_safe(NULL); + + switch (req) { + case UMA_RECLAIM_TRIM: + zone_foreach(zone_trim); + break; + case UMA_RECLAIM_DRAIN: + case UMA_RECLAIM_DRAIN_CPU: zone_foreach(zone_drain); - } +#ifndef __rtems__ + if (req == UMA_RECLAIM_DRAIN_CPU) { + pcpu_cache_drain_safe(NULL); + zone_foreach(zone_drain); + } #endif /* __rtems__ */ + break; + default: + panic("unhandled reclamation request %d", req); + } /* * Some slabs may have been freed but this zone will be visited early @@ -3902,15 +3963,7 @@ uma_reclaim_locked(bool kmem_danger) */ zone_drain(slabzone); bucket_zone_drain(); -} - -void -uma_reclaim(void) -{ - - sx_xlock(&uma_drain_lock); - uma_reclaim_locked(false); - sx_xunlock(&uma_drain_lock); + sx_xunlock(&uma_reclaim_lock); } static volatile int uma_reclaim_needed; @@ -3928,31 +3981,52 @@ uma_reclaim_worker(void *arg __unused) { for (;;) { - sx_xlock(&uma_drain_lock); + sx_xlock(&uma_reclaim_lock); while (atomic_load_int(&uma_reclaim_needed) == 0) - sx_sleep(uma_reclaim, &uma_drain_lock, PVM, "umarcl", + sx_sleep(uma_reclaim, &uma_reclaim_lock, PVM, "umarcl", hz); + sx_xunlock(&uma_reclaim_lock); #ifndef __rtems__ - sx_xunlock(&uma_drain_lock); EVENTHANDLER_INVOKE(vm_lowmem, VM_LOW_KMEM); - sx_xlock(&uma_drain_lock); #endif /* __rtems__ */ - uma_reclaim_locked(true); + uma_reclaim(UMA_RECLAIM_DRAIN_CPU); atomic_store_int(&uma_reclaim_needed, 0); - sx_xunlock(&uma_drain_lock); /* Don't fire more than once per-second. */ pause("umarclslp", hz); } } /* See uma.h */ +void +uma_zone_reclaim(uma_zone_t zone, int req) +{ + + switch (req) { + case UMA_RECLAIM_TRIM: + zone_trim(zone); + break; + case UMA_RECLAIM_DRAIN: + zone_drain(zone); + break; +#ifndef __rtems__ + case UMA_RECLAIM_DRAIN_CPU: + pcpu_cache_drain_safe(zone); + zone_drain(zone); + break; +#endif /* __rtems__ */ + default: + panic("unhandled reclamation request %d", req); + } +} + +/* See uma.h */ int uma_zone_exhausted(uma_zone_t zone) { int full; ZONE_LOCK(zone); - full = (zone->uz_flags & UMA_ZFLAG_FULL); + full = zone->uz_sleepers > 0; ZONE_UNLOCK(zone); return (full); } @@ -3960,7 +4034,7 @@ uma_zone_exhausted(uma_zone_t zone) int uma_zone_exhausted_nolock(uma_zone_t zone) { - return (zone->uz_flags & UMA_ZFLAG_FULL); + return (zone->uz_sleepers > 0); } #ifndef __rtems__ @@ -4041,14 +4115,14 @@ unsigned long uma_size(void) { - return (uma_kmem_total); + return (atomic_load_long(&uma_kmem_total)); } long uma_avail(void) { - return (uma_kmem_limit - uma_kmem_total); + return (uma_kmem_limit - uma_size()); } void @@ -4067,11 +4141,13 @@ slab_print(uma_slab_t slab) static void cache_print(uma_cache_t cache) { - printf("alloc: %p(%d), free: %p(%d)\n", + printf("alloc: %p(%d), free: %p(%d), cross: %p(%d)j\n", cache->uc_allocbucket, cache->uc_allocbucket?cache->uc_allocbucket->ub_cnt:0, cache->uc_freebucket, - cache->uc_freebucket?cache->uc_freebucket->ub_cnt:0); + cache->uc_freebucket?cache->uc_freebucket->ub_cnt:0, + cache->uc_crossbucket, + cache->uc_crossbucket?cache->uc_crossbucket->ub_cnt:0); } static void @@ -4082,11 +4158,11 @@ uma_print_keg(uma_keg_t keg) int i; printf("keg: %s(%p) size %d(%d) flags %#x ipers %d ppera %d " - "out %d free %d limit %d\n", + "out %d free %d\n", keg->uk_name, keg, keg->uk_size, keg->uk_rsize, keg->uk_flags, keg->uk_ipers, keg->uk_ppera, (keg->uk_pages / keg->uk_ppera) * keg->uk_ipers - keg->uk_free, - keg->uk_free, (keg->uk_maxpages / keg->uk_ppera) * keg->uk_ipers); + keg->uk_free); for (i = 0; i < vm_ndomains; i++) { dom = &keg->uk_domain[i]; printf("Part slabs:\n"); @@ -4105,13 +4181,13 @@ void uma_print_zone(uma_zone_t zone) { uma_cache_t cache; - uma_klink_t kl; int i; - printf("zone: %s(%p) size %d flags %#x\n", - zone->uz_name, zone, zone->uz_size, zone->uz_flags); - LIST_FOREACH(kl, &zone->uz_kegs, kl_link) - uma_print_keg(kl->kl_keg); + printf("zone: %s(%p) size %d maxitems %ju flags %#x\n", + zone->uz_name, zone, zone->uz_size, (uintmax_t)zone->uz_max_items, + zone->uz_flags); + if (zone->uz_lockptr != &zone->uz_lock) + uma_print_keg(zone->uz_keg); CPU_FOREACH(i) { cache = &zone->uz_cpu[i]; printf("CPU %d Cache:\n", i); @@ -4134,13 +4210,13 @@ uma_print_zone(uma_zone_t zone) */ static void uma_zone_sumstat(uma_zone_t z, long *cachefreep, uint64_t *allocsp, - uint64_t *freesp, uint64_t *sleepsp) + uint64_t *freesp, uint64_t *sleepsp, uint64_t *xdomainp) { uma_cache_t cache; - uint64_t allocs, frees, sleeps; + uint64_t allocs, frees, sleeps, xdomain; int cachefree, cpu; - allocs = frees = sleeps = 0; + allocs = frees = sleeps = xdomain = 0; cachefree = 0; CPU_FOREACH(cpu) { cache = &z->uz_cpu[cpu]; @@ -4148,12 +4224,17 @@ uma_zone_sumstat(uma_zone_t z, long *cachefreep, uint64_t *allocsp, cachefree += cache->uc_allocbucket->ub_cnt; if (cache->uc_freebucket != NULL) cachefree += cache->uc_freebucket->ub_cnt; + if (cache->uc_crossbucket != NULL) { + xdomain += cache->uc_crossbucket->ub_cnt; + cachefree += cache->uc_crossbucket->ub_cnt; + } allocs += cache->uc_allocs; frees += cache->uc_frees; } - allocs += z->uz_allocs; - frees += z->uz_frees; + allocs += counter_u64_fetch(z->uz_allocs); + frees += counter_u64_fetch(z->uz_frees); sleeps += z->uz_sleeps; + xdomain += z->uz_xdomain; if (cachefreep != NULL) *cachefreep = cachefree; if (allocsp != NULL) @@ -4162,6 +4243,8 @@ uma_zone_sumstat(uma_zone_t z, long *cachefreep, uint64_t *allocsp, *freesp = frees; if (sleepsp != NULL) *sleepsp = sleeps; + if (xdomainp != NULL) + *xdomainp = xdomain; } #endif /* DDB */ #endif /* __rtems__ */ @@ -4179,23 +4262,67 @@ sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS) LIST_FOREACH(z, &kz->uk_zones, uz_link) count++; } + LIST_FOREACH(z, &uma_cachezones, uz_link) + count++; + rw_runlock(&uma_rwlock); return (sysctl_handle_int(oidp, &count, 0, req)); } +static void +uma_vm_zone_stats(struct uma_type_header *uth, uma_zone_t z, struct sbuf *sbuf, + struct uma_percpu_stat *ups, bool internal) +{ + uma_zone_domain_t zdom; + uma_cache_t cache; + int i; + + + for (i = 0; i < vm_ndomains; i++) { + zdom = &z->uz_domain[i]; + uth->uth_zone_free += zdom->uzd_nitems; + } + uth->uth_allocs = counter_u64_fetch(z->uz_allocs); + uth->uth_frees = counter_u64_fetch(z->uz_frees); + uth->uth_fails = counter_u64_fetch(z->uz_fails); + uth->uth_sleeps = z->uz_sleeps; + uth->uth_xdomain = z->uz_xdomain; + /* + * While it is not normally safe to access the cache + * bucket pointers while not on the CPU that owns the + * cache, we only allow the pointers to be exchanged + * without the zone lock held, not invalidated, so + * accept the possible race associated with bucket + * exchange during monitoring. + */ + for (i = 0; i < mp_maxid + 1; i++) { + bzero(&ups[i], sizeof(*ups)); + if (internal || CPU_ABSENT(i)) + continue; + cache = &z->uz_cpu[i]; + if (cache->uc_allocbucket != NULL) + ups[i].ups_cache_free += + cache->uc_allocbucket->ub_cnt; + if (cache->uc_freebucket != NULL) + ups[i].ups_cache_free += + cache->uc_freebucket->ub_cnt; + if (cache->uc_crossbucket != NULL) + ups[i].ups_cache_free += + cache->uc_crossbucket->ub_cnt; + ups[i].ups_allocs = cache->uc_allocs; + ups[i].ups_frees = cache->uc_frees; + } +} + static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS) { struct uma_stream_header ush; struct uma_type_header uth; struct uma_percpu_stat *ups; - uma_zone_domain_t zdom; struct sbuf sbuf; - uma_cache_t cache; - uma_klink_t kl; uma_keg_t kz; uma_zone_t z; - uma_keg_t k; int count, error, i; error = sysctl_wire_old_buffer(req, 0); @@ -4212,6 +4339,9 @@ sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS) count++; } + LIST_FOREACH(z, &uma_cachezones, uz_link) + count++; + /* * Insert stream header. */ @@ -4229,14 +4359,15 @@ sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS) uth.uth_align = kz->uk_align; uth.uth_size = kz->uk_size; uth.uth_rsize = kz->uk_rsize; - LIST_FOREACH(kl, &z->uz_kegs, kl_link) { - k = kl->kl_keg; - uth.uth_maxpages += k->uk_maxpages; - uth.uth_pages += k->uk_pages; - uth.uth_keg_free += k->uk_free; - uth.uth_limit = (k->uk_maxpages / k->uk_ppera) - * k->uk_ipers; - } + if (z->uz_max_items > 0) + uth.uth_pages = (z->uz_items / kz->uk_ipers) * + kz->uk_ppera; + else + uth.uth_pages = kz->uk_pages; + uth.uth_maxpages = (z->uz_max_items / kz->uk_ipers) * + kz->uk_ppera; + uth.uth_limit = z->uz_max_items; + uth.uth_keg_free = z->uz_keg->uk_free; /* * A zone is secondary is it is not the first entry @@ -4245,44 +4376,26 @@ sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS) if ((z->uz_flags & UMA_ZONE_SECONDARY) && (LIST_FIRST(&kz->uk_zones) != z)) uth.uth_zone_flags = UTH_ZONE_SECONDARY; - - for (i = 0; i < vm_ndomains; i++) { - zdom = &z->uz_domain[i]; - uth.uth_zone_free += zdom->uzd_nitems; - } - uth.uth_allocs = z->uz_allocs; - uth.uth_frees = z->uz_frees; - uth.uth_fails = z->uz_fails; - uth.uth_sleeps = z->uz_sleeps; - /* - * While it is not normally safe to access the cache - * bucket pointers while not on the CPU that owns the - * cache, we only allow the pointers to be exchanged - * without the zone lock held, not invalidated, so - * accept the possible race associated with bucket - * exchange during monitoring. - */ - for (i = 0; i < mp_maxid + 1; i++) { - bzero(&ups[i], sizeof(*ups)); - if (kz->uk_flags & UMA_ZFLAG_INTERNAL || - CPU_ABSENT(i)) - continue; - cache = &z->uz_cpu[i]; - if (cache->uc_allocbucket != NULL) - ups[i].ups_cache_free += - cache->uc_allocbucket->ub_cnt; - if (cache->uc_freebucket != NULL) - ups[i].ups_cache_free += - cache->uc_freebucket->ub_cnt; - ups[i].ups_allocs = cache->uc_allocs; - ups[i].ups_frees = cache->uc_frees; - } + uma_vm_zone_stats(&uth, z, &sbuf, ups, + kz->uk_flags & UMA_ZFLAG_INTERNAL); ZONE_UNLOCK(z); (void)sbuf_bcat(&sbuf, &uth, sizeof(uth)); for (i = 0; i < mp_maxid + 1; i++) (void)sbuf_bcat(&sbuf, &ups[i], sizeof(ups[i])); } } + LIST_FOREACH(z, &uma_cachezones, uz_link) { + bzero(&uth, sizeof(uth)); + ZONE_LOCK(z); + strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME); + uth.uth_size = z->uz_size; + uma_vm_zone_stats(&uth, z, &sbuf, ups, false); + ZONE_UNLOCK(z); + (void)sbuf_bcat(&sbuf, &uth, sizeof(uth)); + for (i = 0; i < mp_maxid + 1; i++) + (void)sbuf_bcat(&sbuf, &ups[i], sizeof(ups[i])); + } + rw_runlock(&uma_rwlock); error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); @@ -4333,8 +4446,10 @@ uma_dbg_getslab(uma_zone_t zone, void *item) * zone is unlocked because the item's allocation state * essentially holds a reference. */ + if (zone->uz_lockptr == &zone->uz_lock) + return (NULL); ZONE_LOCK(zone); - keg = LIST_FIRST(&zone->uz_kegs)->kl_keg; + keg = zone->uz_keg; if (keg->uk_flags & UMA_ZONE_HASH) slab = hash_sfind(&keg->uk_hash, mem); else @@ -4348,12 +4463,11 @@ uma_dbg_getslab(uma_zone_t zone, void *item) static bool uma_dbg_zskip(uma_zone_t zone, void *mem) { - uma_keg_t keg; - if ((keg = zone_first_keg(zone)) == NULL) + if (zone->uz_lockptr == &zone->uz_lock) return (true); - return (uma_dbg_kskip(keg, mem)); + return (uma_dbg_kskip(zone->uz_keg, mem)); } static bool @@ -4453,32 +4567,32 @@ DB_SHOW_COMMAND(uma, db_show_uma) { uma_keg_t kz; uma_zone_t z; - uint64_t allocs, frees, sleeps; + uint64_t allocs, frees, sleeps, xdomain; long cachefree; int i; - db_printf("%18s %8s %8s %8s %12s %8s %8s\n", "Zone", "Size", "Used", - "Free", "Requests", "Sleeps", "Bucket"); + db_printf("%18s %8s %8s %8s %12s %8s %8s %8s\n", "Zone", "Size", "Used", + "Free", "Requests", "Sleeps", "Bucket", "XFree"); LIST_FOREACH(kz, &uma_kegs, uk_link) { LIST_FOREACH(z, &kz->uk_zones, uz_link) { if (kz->uk_flags & UMA_ZFLAG_INTERNAL) { - allocs = z->uz_allocs; - frees = z->uz_frees; + allocs = counter_u64_fetch(z->uz_allocs); + frees = counter_u64_fetch(z->uz_frees); sleeps = z->uz_sleeps; cachefree = 0; } else uma_zone_sumstat(z, &cachefree, &allocs, - &frees, &sleeps); + &frees, &sleeps, &xdomain); if (!((z->uz_flags & UMA_ZONE_SECONDARY) && (LIST_FIRST(&kz->uk_zones) != z))) cachefree += kz->uk_free; for (i = 0; i < vm_ndomains; i++) cachefree += z->uz_domain[i].uzd_nitems; - db_printf("%18s %8ju %8jd %8ld %12ju %8ju %8u\n", + db_printf("%18s %8ju %8jd %8ld %12ju %8ju %8u %8ju\n", z->uz_name, (uintmax_t)kz->uk_size, (intmax_t)(allocs - frees), cachefree, - (uintmax_t)allocs, sleeps, z->uz_count); + (uintmax_t)allocs, sleeps, z->uz_count, xdomain); if (db_pager_quit) return; } @@ -4495,7 +4609,7 @@ DB_SHOW_COMMAND(umacache, db_show_umacache) db_printf("%18s %8s %8s %8s %12s %8s\n", "Zone", "Size", "Used", "Free", "Requests", "Bucket"); LIST_FOREACH(z, &uma_cachezones, uz_link) { - uma_zone_sumstat(z, &cachefree, &allocs, &frees, NULL); + uma_zone_sumstat(z, &cachefree, &allocs, &frees, NULL, NULL); for (i = 0; i < vm_ndomains; i++) cachefree += z->uz_domain[i].uzd_nitems; db_printf("%18s %8ju %8jd %8ld %12ju %8u\n", @@ -4516,7 +4630,7 @@ rtems_bsd_uma_startup(void *unused) uma_kmem_limit = rtems_bsd_get_allocator_domain_size( RTEMS_BSD_ALLOCATOR_DOMAIN_PAGE); - sx_init_flags(&uma_drain_lock, "umadrain", SX_RECURSE); + sx_init_flags(&uma_reclaim_lock, "umareclaim", SX_RECURSE); uma_startup(NULL, 0); } diff --git a/freebsd/sys/vm/uma_int.h b/freebsd/sys/vm/uma_int.h index 0429bac6..1d055ed6 100644 --- a/freebsd/sys/vm/uma_int.h +++ b/freebsd/sys/vm/uma_int.h @@ -30,6 +30,7 @@ * */ +#include <sys/counter.h> #include <sys/_bitset.h> #include <sys/_domainset.h> #include <sys/_task.h> @@ -178,8 +179,8 @@ SLIST_HEAD(slabhead, uma_slab); struct uma_hash { struct slabhead *uh_slab_hash; /* Hash table for slabs */ - int uh_hashsize; /* Current size of the hash table */ - int uh_hashmask; /* Mask used during hashing */ + u_int uh_hashsize; /* Current size of the hash table */ + u_int uh_hashmask; /* Mask used during hashing */ }; /* @@ -196,7 +197,7 @@ struct uma_hash { */ struct uma_bucket { - LIST_ENTRY(uma_bucket) ub_link; /* Link into the zone */ + TAILQ_ENTRY(uma_bucket) ub_link; /* Link into the zone */ int16_t ub_cnt; /* Count of items in bucket. */ int16_t ub_entries; /* Max items. */ void *ub_bucket[]; /* actual allocation storage */ @@ -207,6 +208,7 @@ typedef struct uma_bucket * uma_bucket_t; struct uma_cache { uma_bucket_t uc_freebucket; /* Bucket we're freeing to */ uma_bucket_t uc_allocbucket; /* Bucket to allocate from */ + uma_bucket_t uc_crossbucket; /* cross domain bucket */ uint64_t uc_allocs; /* Count of allocations */ uint64_t uc_frees; /* Count of frees */ } UMA_ALIGN; @@ -231,7 +233,9 @@ typedef struct uma_domain * uma_domain_t; * */ struct uma_keg { - struct mtx uk_lock; /* Lock for the keg */ + struct mtx uk_lock; /* Lock for the keg must be first. + * See shared uz_keg/uz_lockptr + * member of struct uma_zone. */ struct uma_hash uk_hash; LIST_HEAD(,uma_zone) uk_zones; /* Keg's zones */ @@ -244,7 +248,6 @@ struct uma_keg { uint32_t uk_reserve; /* Number of reserved items. */ uint32_t uk_size; /* Requested size of each item */ uint32_t uk_rsize; /* Real size of each item */ - uint32_t uk_maxpages; /* Maximum number of pages to alloc */ uma_init uk_init; /* Keg's init routine */ uma_fini uk_fini; /* Keg's fini routine */ @@ -308,16 +311,11 @@ struct uma_slab { #endif typedef struct uma_slab * uma_slab_t; -typedef uma_slab_t (*uma_slaballoc)(uma_zone_t, uma_keg_t, int, int); -struct uma_klink { - LIST_ENTRY(uma_klink) kl_link; - uma_keg_t kl_keg; -}; -typedef struct uma_klink *uma_klink_t; +TAILQ_HEAD(uma_bucketlist, uma_bucket); struct uma_zone_domain { - LIST_HEAD(,uma_bucket) uzd_buckets; /* full buckets */ + struct uma_bucketlist uzd_buckets; /* full buckets */ long uzd_nitems; /* total item count */ long uzd_imax; /* maximum item count this period */ long uzd_imin; /* minimum item count this period */ @@ -334,8 +332,10 @@ typedef struct uma_zone_domain * uma_zone_domain_t; */ struct uma_zone { /* Offset 0, used in alloc/free fast/medium fast path and const. */ - struct mtx *uz_lockptr; - const char *uz_name; /* Text name of the zone */ + union { + uma_keg_t uz_keg; /* This zone's keg */ + struct mtx *uz_lockptr; /* To keg or to self */ + }; #ifndef __rtems__ struct uma_zone_domain *uz_domain; /* per-domain buckets */ #else /* __rtems__ */ @@ -345,19 +345,21 @@ struct uma_zone { uint32_t uz_size; /* Size inherited from kegs */ uma_ctor uz_ctor; /* Constructor for each allocation */ uma_dtor uz_dtor; /* Destructor */ - uma_init uz_init; /* Initializer for each item */ - uma_fini uz_fini; /* Finalizer for each item. */ + uint64_t uz_items; /* Total items count */ + uint64_t uz_max_items; /* Maximum number of items to alloc */ + uint32_t uz_sleepers; /* Number of sleepers on memory */ + uint16_t uz_count; /* Amount of items in full bucket */ + uint16_t uz_count_max; /* Maximum amount of items there */ /* Offset 64, used in bucket replenish. */ uma_import uz_import; /* Import new memory to cache. */ uma_release uz_release; /* Release memory from cache. */ void *uz_arg; /* Import/release argument. */ - uma_slaballoc uz_slab; /* Allocate a slab from the backend. */ - uint16_t uz_count; /* Amount of items in full bucket */ - uint16_t uz_count_min; /* Minimal amount of items there */ - /* 32bit pad on 64bit. */ - LIST_ENTRY(uma_zone) uz_link; /* List of all zones in keg */ - LIST_HEAD(,uma_klink) uz_kegs; /* List of kegs. */ + uma_init uz_init; /* Initializer for each item */ + uma_fini uz_fini; /* Finalizer for each item. */ + void *uz_spare; + uint64_t uz_bkt_count; /* Items in bucket cache */ + uint64_t uz_bkt_max; /* Maximum bucket cache size */ /* Offset 128 Rare. */ /* @@ -366,19 +368,20 @@ struct uma_zone { * members to reduce alignment overhead. */ struct mtx uz_lock; /* Lock for the zone */ - struct uma_klink uz_klink; /* klink for first keg. */ + LIST_ENTRY(uma_zone) uz_link; /* List of all zones in keg */ + const char *uz_name; /* Text name of the zone */ /* The next two fields are used to print a rate-limited warnings. */ const char *uz_warning; /* Warning to print on failure */ struct timeval uz_ratecheck; /* Warnings rate-limiting */ struct task uz_maxaction; /* Task to run when at limit */ + uint16_t uz_count_min; /* Minimal amount of items in bucket */ - /* 16 bytes of pad. */ - - /* Offset 256, atomic stats. */ - volatile u_long uz_allocs UMA_ALIGN; /* Total number of allocations */ - volatile u_long uz_fails; /* Total number of alloc failures */ - volatile u_long uz_frees; /* Total number of frees */ + /* Offset 256, stats. */ + counter_u64_t uz_allocs; /* Total number of allocations */ + counter_u64_t uz_frees; /* Total number of frees */ + counter_u64_t uz_fails; /* Total number of alloc failures */ uint64_t uz_sleeps; /* Total number of alloc sleeps */ + uint64_t uz_xdomain; /* Total number of cross-domain frees */ /* * This HAS to be the last item because we adjust the zone size @@ -392,25 +395,15 @@ struct uma_zone { /* * These flags must not overlap with the UMA_ZONE flags specified in uma.h. */ -#define UMA_ZFLAG_MULTI 0x04000000 /* Multiple kegs in the zone. */ -#define UMA_ZFLAG_DRAINING 0x08000000 /* Running zone_drain. */ +#define UMA_ZFLAG_CACHE 0x04000000 /* uma_zcache_create()d it */ +#define UMA_ZFLAG_RECLAIMING 0x08000000 /* Running zone_reclaim(). */ #define UMA_ZFLAG_BUCKET 0x10000000 /* Bucket zone. */ #define UMA_ZFLAG_INTERNAL 0x20000000 /* No offpage no PCPU. */ -#define UMA_ZFLAG_FULL 0x40000000 /* Reached uz_maxpages */ #define UMA_ZFLAG_CACHEONLY 0x80000000 /* Don't ask VM for buckets. */ #define UMA_ZFLAG_INHERIT \ (UMA_ZFLAG_INTERNAL | UMA_ZFLAG_CACHEONLY | UMA_ZFLAG_BUCKET) -static inline uma_keg_t -zone_first_keg(uma_zone_t zone) -{ - uma_klink_t klink; - - klink = LIST_FIRST(&zone->uz_kegs); - return (klink != NULL) ? klink->kl_keg : NULL; -} - #undef UMA_ALIGN #ifdef _KERNEL @@ -435,6 +428,13 @@ void uma_large_free(uma_slab_t slab); #define KEG_LOCK_FINI(k) mtx_destroy(&(k)->uk_lock) #define KEG_LOCK(k) mtx_lock(&(k)->uk_lock) #define KEG_UNLOCK(k) mtx_unlock(&(k)->uk_lock) +#define KEG_LOCK_ASSERT(k) mtx_assert(&(k)->uk_lock, MA_OWNED) + +#define KEG_GET(zone, keg) do { \ + (keg) = (zone)->uz_keg; \ + KASSERT((void *)(keg) != (void *)&(zone)->uz_lock, \ + ("%s: Invalid zone %p type", __func__, (zone))); \ + } while (0) #define ZONE_LOCK_INIT(z, lc) \ do { \ @@ -467,7 +467,7 @@ static __inline uma_slab_t hash_sfind(struct uma_hash *hash, uint8_t *data) { uma_slab_t slab; - int hval; + u_int hval; hval = UMA_HASH(hash, data); diff --git a/freebsd/sys/x86/include/machine/bus.h b/freebsd/sys/x86/include/machine/bus.h index 297b5edc..2427ae51 100644 --- a/freebsd/sys/x86/include/machine/bus.h +++ b/freebsd/sys/x86/include/machine/bus.h @@ -114,7 +114,11 @@ #define BUS_SPACE_MAXSIZE_24BIT 0xFFFFFF #define BUS_SPACE_MAXSIZE_32BIT 0xFFFFFFFF +#if defined(__amd64__) +#define BUS_SPACE_MAXSIZE 0xFFFFFFFFFFFFFFFFULL +#else #define BUS_SPACE_MAXSIZE 0xFFFFFFFF +#endif #define BUS_SPACE_MAXADDR_24BIT 0xFFFFFF #define BUS_SPACE_MAXADDR_32BIT 0xFFFFFFFF #if defined(__amd64__) || defined(PAE) diff --git a/freebsd/sys/x86/include/machine/pci_cfgreg.h b/freebsd/sys/x86/include/machine/pci_cfgreg.h index 8083eb0e..85d6485c 100644 --- a/freebsd/sys/x86/include/machine/pci_cfgreg.h +++ b/freebsd/sys/x86/include/machine/pci_cfgreg.h @@ -48,6 +48,15 @@ #define CONF2_ENABLE_CHK 0x0e #define CONF2_ENABLE_RES 0x0e +enum { + CFGMECH_NONE = 0, + CFGMECH_1, + CFGMECH_2, + CFGMECH_PCIE, +}; + +extern int cfgmech; + rman_res_t hostb_alloc_start(int type, rman_res_t start, rman_res_t end, rman_res_t count); int pcie_cfgregopen(uint64_t base, uint8_t minbus, uint8_t maxbus); int pci_cfgregopen(void); |