diff options
Diffstat (limited to 'freebsd/sys')
144 files changed, 5827 insertions, 5481 deletions
diff --git a/freebsd/sys/arm/freescale/imx/imx_gpio.c b/freebsd/sys/arm/freescale/imx/imx_gpio.c index 983e4d74..f5b476d7 100644 --- a/freebsd/sys/arm/freescale/imx/imx_gpio.c +++ b/freebsd/sys/arm/freescale/imx/imx_gpio.c @@ -870,6 +870,15 @@ imx51_gpio_detach(device_t dev) return(0); } +static phandle_t +imx51_gpio_get_node(device_t bus, device_t dev) +{ + /* + * Share controller node with gpiobus device + */ + return ofw_bus_get_node(bus); +} + static device_method_t imx51_gpio_methods[] = { DEVMETHOD(device_probe, imx51_gpio_probe), DEVMETHOD(device_attach, imx51_gpio_attach), @@ -887,6 +896,9 @@ static device_method_t imx51_gpio_methods[] = { DEVMETHOD(pic_pre_ithread, gpio_pic_pre_ithread), #endif + /* OFW methods */ + DEVMETHOD(ofw_bus_get_node, imx51_gpio_get_node), + /* GPIO protocol */ DEVMETHOD(gpio_get_bus, imx51_gpio_get_bus), DEVMETHOD(gpio_pin_max, imx51_gpio_pin_max), diff --git a/freebsd/sys/arm/ti/ti_sdhci.c b/freebsd/sys/arm/ti/ti_sdhci.c index a2be1f19..e3502099 100644 --- a/freebsd/sys/arm/ti/ti_sdhci.c +++ b/freebsd/sys/arm/ti/ti_sdhci.c @@ -483,15 +483,16 @@ ti_sdhci_hw_init(device_t dev) /* * The attach() routine has examined fdt data and set flags in * slot.host.caps to reflect what voltages we can handle. Set those - * values in the CAPA register. The manual says that these values can - * only be set once, and that they survive a reset so unless u-boot didn't - * set this register this code is a no-op. + * values in the CAPA register. Empirical testing shows that the + * values in this register can be overwritten at any time, but the + * manual says that these values should only be set once, "before + * initialization" whatever that means, and that they survive a reset. 
*/ regval = ti_mmchs_read_4(sc, MMCHS_SD_CAPA); if (sc->slot.host.caps & MMC_OCR_LOW_VOLTAGE) regval |= MMCHS_SD_CAPA_VS18; - if (sc->slot.host.caps & (MMC_OCR_320_330 | MMC_OCR_330_340)) - regval |= MMCHS_SD_CAPA_VS33; + if (sc->slot.host.caps & (MMC_OCR_290_300 | MMC_OCR_300_310)) + regval |= MMCHS_SD_CAPA_VS30; ti_mmchs_write_4(sc, MMCHS_SD_CAPA, regval); /* Set initial host configuration (1-bit, std speed, pwr off). */ @@ -525,20 +526,16 @@ ti_sdhci_attach(device_t dev) } /* - * The hardware can inherently do dual-voltage (1p8v, 3p3v) on the first + * The hardware can inherently do dual-voltage (1p8v, 3p0v) on the first * device, and only 1p8v on other devices unless an external transceiver * is used. The only way we could know about a transceiver is fdt data. * Note that we have to do this before calling ti_sdhci_hw_init() so - * that it can set the right values in the CAPA register, which can only - * be done once and never reset. + * that it can set the right values in the CAPA register. */ - if (OF_hasprop(node, "ti,dual-volt")) { - sc->slot.host.caps |= MMC_OCR_LOW_VOLTAGE | MMC_OCR_320_330 | MMC_OCR_330_340; - } else if (OF_hasprop(node, "no-1-8-v")) { - sc->slot.host.caps |= MMC_OCR_320_330 | MMC_OCR_330_340; - } else - sc->slot.host.caps |= MMC_OCR_LOW_VOLTAGE; - + sc->slot.host.caps |= MMC_OCR_LOW_VOLTAGE; + if (sc->mmchs_clk_id == MMC1_CLK || OF_hasprop(node, "ti,dual-volt")) { + sc->slot.host.caps |= MMC_OCR_290_300 | MMC_OCR_300_310; + } /* * Set the offset from the device's memory start to the MMCHS registers. 
diff --git a/freebsd/sys/cam/cam_periph.h b/freebsd/sys/cam/cam_periph.h index b087b872..d5dcfed0 100644 --- a/freebsd/sys/cam/cam_periph.h +++ b/freebsd/sys/cam/cam_periph.h @@ -132,6 +132,8 @@ struct cam_periph { #define CAM_PERIPH_RUN_TASK 0x40 #define CAM_PERIPH_FREE 0x80 #define CAM_PERIPH_ANNOUNCED 0x100 +#define CAM_PERIPH_RECOVERY_WAIT 0x200 +#define CAM_PERIPH_RECOVERY_WAIT_FAILED 0x400 uint32_t scheduled_priority; uint32_t immediate_priority; int periph_allocating; diff --git a/freebsd/sys/cam/scsi/scsi_all.c b/freebsd/sys/cam/scsi/scsi_all.c index 99d82fee..b547fbbd 100644 --- a/freebsd/sys/cam/scsi/scsi_all.c +++ b/freebsd/sys/cam/scsi/scsi_all.c @@ -1115,7 +1115,7 @@ static struct asc_table_entry asc_table[] = { { SST(0x04, 0x08, SS_FATAL | EBUSY, "Logical unit not ready, long write in progress") }, /* DTLPWROMAEBKVF */ - { SST(0x04, 0x09, SS_RDEF, /* XXX TBD */ + { SST(0x04, 0x09, SS_FATAL | EBUSY, "Logical unit not ready, self-test in progress") }, /* DTLPWROMAEBKVF */ { SST(0x04, 0x0A, SS_WAIT | ENXIO, @@ -1133,37 +1133,37 @@ static struct asc_table_entry asc_table[] = { { SST(0x04, 0x0E, SS_RDEF, /* XXX TBD */ "Logical unit not ready, security session in progress") }, /* DT WROM B */ - { SST(0x04, 0x10, SS_RDEF, /* XXX TBD */ + { SST(0x04, 0x10, SS_FATAL | ENODEV, "Logical unit not ready, auxiliary memory not accessible") }, /* DT WRO AEB VF */ - { SST(0x04, 0x11, SS_WAIT | EBUSY, + { SST(0x04, 0x11, SS_WAIT | ENXIO, "Logical unit not ready, notify (enable spinup) required") }, /* M V */ - { SST(0x04, 0x12, SS_RDEF, /* XXX TBD */ + { SST(0x04, 0x12, SS_FATAL | ENXIO, "Logical unit not ready, offline") }, /* DT R MAEBKV */ - { SST(0x04, 0x13, SS_RDEF, /* XXX TBD */ + { SST(0x04, 0x13, SS_WAIT | EBUSY, "Logical unit not ready, SA creation in progress") }, /* D B */ - { SST(0x04, 0x14, SS_RDEF, /* XXX TBD */ + { SST(0x04, 0x14, SS_WAIT | ENOSPC, "Logical unit not ready, space allocation in progress") }, /* M */ - { SST(0x04, 0x15, SS_RDEF, /* XXX TBD 
*/ + { SST(0x04, 0x15, SS_FATAL | ENXIO, "Logical unit not ready, robotics disabled") }, /* M */ - { SST(0x04, 0x16, SS_RDEF, /* XXX TBD */ + { SST(0x04, 0x16, SS_FATAL | ENXIO, "Logical unit not ready, configuration required") }, /* M */ - { SST(0x04, 0x17, SS_RDEF, /* XXX TBD */ + { SST(0x04, 0x17, SS_FATAL | ENXIO, "Logical unit not ready, calibration required") }, /* M */ - { SST(0x04, 0x18, SS_RDEF, /* XXX TBD */ + { SST(0x04, 0x18, SS_FATAL | ENXIO, "Logical unit not ready, a door is open") }, /* M */ - { SST(0x04, 0x19, SS_RDEF, /* XXX TBD */ + { SST(0x04, 0x19, SS_FATAL | ENODEV, "Logical unit not ready, operating in sequential mode") }, /* DT B */ - { SST(0x04, 0x1A, SS_RDEF, /* XXX TBD */ + { SST(0x04, 0x1A, SS_WAIT | EBUSY, "Logical unit not ready, START/STOP UNIT command in progress") }, /* D B */ { SST(0x04, 0x1B, SS_WAIT | EBUSY, @@ -1172,7 +1172,7 @@ static struct asc_table_entry asc_table[] = { { SST(0x04, 0x1C, SS_START | SSQ_DECREMENT_COUNT | ENXIO, "Logical unit not ready, additional power use not yet granted") }, /* D */ - { SST(0x04, 0x1D, SS_RDEF, /* XXX TBD */ + { SST(0x04, 0x1D, SS_WAIT | EBUSY, "Logical unit not ready, configuration in progress") }, /* D */ { SST(0x04, 0x1E, SS_FATAL | ENXIO, @@ -1181,14 +1181,20 @@ static struct asc_table_entry asc_table[] = { { SST(0x04, 0x1F, SS_FATAL | ENXIO, "Logical unit not ready, microcode download required") }, /* DTLPWROMAEBKVF */ - { SST(0x04, 0x20, SS_RDEF, /* XXX TBD */ + { SST(0x04, 0x20, SS_FATAL | ENXIO, "Logical unit not ready, logical unit reset required") }, /* DTLPWROMAEBKVF */ - { SST(0x04, 0x21, SS_RDEF, /* XXX TBD */ + { SST(0x04, 0x21, SS_FATAL | ENXIO, "Logical unit not ready, hard reset required") }, /* DTLPWROMAEBKVF */ - { SST(0x04, 0x22, SS_RDEF, /* XXX TBD */ + { SST(0x04, 0x22, SS_FATAL | ENXIO, "Logical unit not ready, power cycle required") }, + /* D */ + { SST(0x04, 0x23, SS_FATAL | ENXIO, + "Logical unit not ready, affiliation required") }, + /* D */ + { SST(0x04, 0x24, 
SS_FATAL | EBUSY, + "Depopulation in progress") }, /* DTL WROMAEBKVF */ { SST(0x05, 0x00, SS_RDEF, "Logical unit does not respond to selection") }, @@ -3387,7 +3393,7 @@ scsi_error_action(struct ccb_scsiio *csio, struct scsi_inquiry_data *inq_data, if (!scsi_extract_sense_ccb((union ccb *)csio, &error_code, &sense_key, &asc, &ascq)) { - action = SS_RETRY | SSQ_DECREMENT_COUNT | SSQ_PRINT_SENSE | EIO; + action = SS_RDEF; } else if ((error_code == SSD_DEFERRED_ERROR) || (error_code == SSD_DESC_DEFERRED_ERROR)) { /* diff --git a/freebsd/sys/dev/dwc/if_dwc.c b/freebsd/sys/dev/dwc/if_dwc.c index 895fdfe5..7e249414 100644 --- a/freebsd/sys/dev/dwc/if_dwc.c +++ b/freebsd/sys/dev/dwc/if_dwc.c @@ -1239,16 +1239,13 @@ dwc_reset(device_t dev) if (OF_hasprop(node, "snps,reset-active-low")) pin_value = GPIO_PIN_HIGH; - if (flags & GPIO_ACTIVE_LOW) - pin_value = !pin_value; - GPIO_PIN_SETFLAGS(gpio, pin, GPIO_PIN_OUTPUT); GPIO_PIN_SET(gpio, pin, pin_value); - DELAY(delay_prop[0]); + DELAY(delay_prop[0] * 5); GPIO_PIN_SET(gpio, pin, !pin_value); - DELAY(delay_prop[1]); + DELAY(delay_prop[1] * 5); GPIO_PIN_SET(gpio, pin, pin_value); - DELAY(delay_prop[2]); + DELAY(delay_prop[2] * 5); #endif /* __rtems__ */ return (0); diff --git a/freebsd/sys/dev/e1000/e1000_api.c b/freebsd/sys/dev/e1000/e1000_api.c index c351901c..21b11bc2 100644 --- a/freebsd/sys/dev/e1000/e1000_api.c +++ b/freebsd/sys/dev/e1000/e1000_api.c @@ -321,6 +321,7 @@ s32 e1000_set_mac_type(struct e1000_hw *hw) case E1000_DEV_ID_PCH_ICP_I219_V8: case E1000_DEV_ID_PCH_ICP_I219_LM9: case E1000_DEV_ID_PCH_ICP_I219_V9: + case E1000_DEV_ID_PCH_ICP_I219_V10: mac->type = e1000_pch_cnp; break; case E1000_DEV_ID_82575EB_COPPER: diff --git a/freebsd/sys/dev/e1000/e1000_hw.h b/freebsd/sys/dev/e1000/e1000_hw.h index 7e4e7f1a..6c0b5203 100644 --- a/freebsd/sys/dev/e1000/e1000_hw.h +++ b/freebsd/sys/dev/e1000/e1000_hw.h @@ -155,6 +155,7 @@ struct e1000_hw; #define E1000_DEV_ID_PCH_ICP_I219_V8 0x15E0 #define 
E1000_DEV_ID_PCH_ICP_I219_LM9 0x15E1 #define E1000_DEV_ID_PCH_ICP_I219_V9 0x15E2 +#define E1000_DEV_ID_PCH_ICP_I219_V10 0x0D4F #define E1000_DEV_ID_82576 0x10C9 #define E1000_DEV_ID_82576_FIBER 0x10E6 #define E1000_DEV_ID_82576_SERDES 0x10E7 diff --git a/freebsd/sys/dev/e1000/if_em.c b/freebsd/sys/dev/e1000/if_em.c index 32eb4afe..9b52c35a 100644 --- a/freebsd/sys/dev/e1000/if_em.c +++ b/freebsd/sys/dev/e1000/if_em.c @@ -176,6 +176,7 @@ static pci_vendor_info_t em_vendor_info_array[] = PVID(0x8086, E1000_DEV_ID_PCH_ICP_I219_V8, "Intel(R) PRO/1000 Network Connection"), PVID(0x8086, E1000_DEV_ID_PCH_ICP_I219_LM9, "Intel(R) PRO/1000 Network Connection"), PVID(0x8086, E1000_DEV_ID_PCH_ICP_I219_V9, "Intel(R) PRO/1000 Network Connection"), + PVID(0x8086, E1000_DEV_ID_PCH_ICP_I219_V10, "Intel(R) PRO/1000 Network Connection"), /* required last entry */ PVID_END }; @@ -1397,10 +1398,8 @@ em_intr(void *arg) IFDI_INTR_DISABLE(ctx); /* Link status change */ - if (reg_icr & (E1000_ICR_RXSEQ | E1000_ICR_LSC)) { - adapter->hw.mac.get_link_status = 1; - iflib_admin_intr_deferred(ctx); - } + if (reg_icr & (E1000_ICR_RXSEQ | E1000_ICR_LSC)) + em_handle_link(ctx); if (reg_icr & E1000_ICR_RXO) adapter->rx_overruns++; @@ -1483,22 +1482,24 @@ em_msix_link(void *arg) if (reg_icr & (E1000_ICR_RXSEQ | E1000_ICR_LSC)) { em_handle_link(adapter->ctx); - } else { - E1000_WRITE_REG(&adapter->hw, E1000_IMS, - EM_MSIX_LINK | E1000_IMS_LSC); - if (adapter->hw.mac.type >= igb_mac_min) - E1000_WRITE_REG(&adapter->hw, E1000_EIMS, adapter->link_mask); + } else if (adapter->hw.mac.type == e1000_82574) { + /* Only re-arm 82574 if em_if_update_admin_status() won't. */ + E1000_WRITE_REG(&adapter->hw, E1000_IMS, EM_MSIX_LINK | + E1000_IMS_LSC); } - /* - * Because we must read the ICR for this interrupt - * it may clear other causes using autoclear, for - * this reason we simply create a soft interrupt - * for all these vectors. 
- */ - if (reg_icr && adapter->hw.mac.type < igb_mac_min) { - E1000_WRITE_REG(&adapter->hw, - E1000_ICS, adapter->ims); + if (adapter->hw.mac.type == e1000_82574) { + /* + * Because we must read the ICR for this interrupt it may + * clear other causes using autoclear, for this reason we + * simply create a soft interrupt for all these vectors. + */ + if (reg_icr) + E1000_WRITE_REG(&adapter->hw, E1000_ICS, adapter->ims); + } else { + /* Re-arm unconditionally */ + E1000_WRITE_REG(&adapter->hw, E1000_IMS, E1000_IMS_LSC); + E1000_WRITE_REG(&adapter->hw, E1000_EIMS, adapter->link_mask); } return (FILTER_HANDLED); @@ -1514,7 +1515,6 @@ em_handle_link(void *context) iflib_admin_intr_deferred(ctx); } - /********************************************************************* * * Media Ioctl callback @@ -1831,14 +1831,15 @@ em_if_update_admin_status(if_ctx_t ctx) em_update_stats_counters(adapter); /* Reset LAA into RAR[0] on 82571 */ - if ((adapter->hw.mac.type == e1000_82571) && - e1000_get_laa_state_82571(&adapter->hw)) - e1000_rar_set(&adapter->hw, adapter->hw.mac.addr, 0); + if (hw->mac.type == e1000_82571 && e1000_get_laa_state_82571(hw)) + e1000_rar_set(hw, hw->mac.addr, 0); - if (adapter->hw.mac.type < em_mac_min) + if (hw->mac.type < em_mac_min) lem_smartspeed(adapter); - - E1000_WRITE_REG(&adapter->hw, E1000_IMS, EM_MSIX_LINK | E1000_IMS_LSC); + else if (hw->mac.type == e1000_82574 && + adapter->intr_type == IFLIB_INTR_MSIX) + E1000_WRITE_REG(&adapter->hw, E1000_IMS, EM_MSIX_LINK | + E1000_IMS_LSC); } static void @@ -3905,6 +3906,7 @@ em_disable_aspm(struct adapter *adapter) static void em_update_stats_counters(struct adapter *adapter) { + u64 prev_xoffrxc = adapter->stats.xoffrxc; if(adapter->hw.phy.media_type == e1000_media_type_copper || (E1000_READ_REG(&adapter->hw, E1000_STATUS) & E1000_STATUS_LU)) { @@ -3928,7 +3930,8 @@ em_update_stats_counters(struct adapter *adapter) ** For watchdog management we need to know if we have been ** paused during the last 
interval, so capture that here. */ - adapter->shared->isc_pause_frames = adapter->stats.xoffrxc; + if (adapter->stats.xoffrxc != prev_xoffrxc) + adapter->shared->isc_pause_frames = 1; adapter->stats.xofftxc += E1000_READ_REG(&adapter->hw, E1000_XOFFTXC); adapter->stats.fcruc += E1000_READ_REG(&adapter->hw, E1000_FCRUC); adapter->stats.prc64 += E1000_READ_REG(&adapter->hw, E1000_PRC64); diff --git a/freebsd/sys/dev/gpio/gpiobus.c b/freebsd/sys/dev/gpio/gpiobus.c index d256ee4a..25daf717 100644 --- a/freebsd/sys/dev/gpio/gpiobus.c +++ b/freebsd/sys/dev/gpio/gpiobus.c @@ -80,6 +80,18 @@ static int gpiobus_pin_get(device_t, device_t, uint32_t, unsigned int*); static int gpiobus_pin_toggle(device_t, device_t, uint32_t); /* + * gpiobus_pin flags + * The flags in struct gpiobus_pin are not related to the flags used by the + * low-level controller driver in struct gpio_pin. Currently, only pins + * acquired via FDT data have gpiobus_pin.flags set, sourced from the flags in + * the FDT properties. In theory, these flags are defined per-platform. In + * practice they are always the flags from the dt-bindings/gpio/gpio.h file. + * The only one of those flags we currently support is for handling active-low + * pins, so we just define that flag here instead of including a GPL'd header. + */ +#define GPIO_ACTIVE_LOW 1 + +/* * XXX -> Move me to better place - gpio_subr.c? * Also, this function must be changed when interrupt configuration * data will be moved into struct resource. 
@@ -137,6 +149,114 @@ gpio_check_flags(uint32_t caps, uint32_t flags) return (0); } +int +gpio_pin_get_by_bus_pinnum(device_t busdev, uint32_t pinnum, gpio_pin_t *ppin) +{ + gpio_pin_t pin; + int err; + + err = gpiobus_acquire_pin(busdev, pinnum); + if (err != 0) + return (EBUSY); + + pin = malloc(sizeof(*pin), M_DEVBUF, M_WAITOK | M_ZERO); + + pin->dev = device_get_parent(busdev); + pin->pin = pinnum; + pin->flags = 0; + + *ppin = pin; + return (0); +} + +int +gpio_pin_get_by_child_index(device_t childdev, uint32_t idx, gpio_pin_t *ppin) +{ + struct gpiobus_ivar *devi; + + devi = GPIOBUS_IVAR(childdev); + if (idx >= devi->npins) + return (EINVAL); + + return (gpio_pin_get_by_bus_pinnum(device_get_parent(childdev), + devi->pins[idx], ppin)); +} + +int +gpio_pin_getcaps(gpio_pin_t pin, uint32_t *caps) +{ + + KASSERT(pin != NULL, ("GPIO pin is NULL.")); + KASSERT(pin->dev != NULL, ("GPIO pin device is NULL.")); + return (GPIO_PIN_GETCAPS(pin->dev, pin->pin, caps)); +} + +int +gpio_pin_is_active(gpio_pin_t pin, bool *active) +{ + int rv; + uint32_t tmp; + + KASSERT(pin != NULL, ("GPIO pin is NULL.")); + KASSERT(pin->dev != NULL, ("GPIO pin device is NULL.")); + rv = GPIO_PIN_GET(pin->dev, pin->pin, &tmp); + if (rv != 0) { + return (rv); + } + + if (pin->flags & GPIO_ACTIVE_LOW) + *active = tmp == 0; + else + *active = tmp != 0; + return (0); +} + +void +gpio_pin_release(gpio_pin_t gpio) +{ + device_t busdev; + + if (gpio == NULL) + return; + + KASSERT(gpio->dev != NULL, ("GPIO pin device is NULL.")); + + busdev = GPIO_GET_BUS(gpio->dev); + if (busdev != NULL) + gpiobus_release_pin(busdev, gpio->pin); + + free(gpio, M_DEVBUF); +} + +int +gpio_pin_set_active(gpio_pin_t pin, bool active) +{ + int rv; + uint32_t tmp; + + if (pin->flags & GPIO_ACTIVE_LOW) + tmp = active ? 0 : 1; + else + tmp = active ? 
1 : 0; + + KASSERT(pin != NULL, ("GPIO pin is NULL.")); + KASSERT(pin->dev != NULL, ("GPIO pin device is NULL.")); + rv = GPIO_PIN_SET(pin->dev, pin->pin, tmp); + return (rv); +} + +int +gpio_pin_setflags(gpio_pin_t pin, uint32_t flags) +{ + int rv; + + KASSERT(pin != NULL, ("GPIO pin is NULL.")); + KASSERT(pin->dev != NULL, ("GPIO pin device is NULL.")); + + rv = GPIO_PIN_SETFLAGS(pin->dev, pin->pin, flags); + return (rv); +} + static void gpiobus_print_pins(struct gpiobus_ivar *devi, char *buf, size_t buflen) { @@ -372,8 +492,6 @@ gpiobus_parse_pins(struct gpiobus_softc *sc, device_t child, int mask) devi->pins[npins++] = i; } - if (gpiobus_acquire_child_pins(sc->sc_busdev, child) != 0) - return (EINVAL); return (0); } @@ -427,8 +545,6 @@ gpiobus_parse_pin_list(struct gpiobus_softc *sc, device_t child, p = endp + 1; } - if (gpiobus_acquire_child_pins(sc->sc_busdev, child) != 0) - return (EINVAL); return (0); } @@ -602,6 +718,21 @@ gpiobus_add_child(device_t dev, u_int order, const char *name, int unit) return (child); } +static int +gpiobus_rescan(device_t dev) +{ + + /* + * Re-scan is supposed to remove and add children, but if someone has + * deleted the hints for a child we attached earlier, we have no easy + * way to handle that. So this just attaches new children for whom new + * hints or drivers have arrived since we last tried. 
+ */ + bus_enumerate_hinted_children(dev); + bus_generic_attach(dev); + return (0); +} + static void gpiobus_hinted_child(device_t bus, const char *dname, int dunit) { @@ -611,6 +742,10 @@ gpiobus_hinted_child(device_t bus, const char *dname, int dunit) const char *pins; int irq, pinmask; + if (device_find_child(bus, dname, dunit) != NULL) { + return; + } + child = BUS_ADD_CHILD(bus, 0, dname, dunit); devi = GPIOBUS_IVAR(child); if (resource_int_value(dname, dunit, "pins", &pinmask) == 0) { @@ -963,6 +1098,7 @@ static device_method_t gpiobus_methods[] = { DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource), DEVMETHOD(bus_get_resource_list, gpiobus_get_resource_list), DEVMETHOD(bus_add_child, gpiobus_add_child), + DEVMETHOD(bus_rescan, gpiobus_rescan), DEVMETHOD(bus_probe_nomatch, gpiobus_probe_nomatch), DEVMETHOD(bus_print_child, gpiobus_print_child), DEVMETHOD(bus_child_pnpinfo_str, gpiobus_child_pnpinfo_str), diff --git a/freebsd/sys/dev/gpio/gpiobusvar.h b/freebsd/sys/dev/gpio/gpiobusvar.h index 3ba8993e..ff49784a 100644 --- a/freebsd/sys/dev/gpio/gpiobusvar.h +++ b/freebsd/sys/dev/gpio/gpiobusvar.h @@ -141,7 +141,7 @@ int ofw_gpiobus_parse_gpios(device_t, char *, struct gpiobus_pin **); void ofw_gpiobus_register_provider(device_t); void ofw_gpiobus_unregister_provider(device_t); -/* Consumers interface. */ +/* Acquire a pin by parsing FDT data. */ int gpio_pin_get_by_ofw_name(device_t consumer, phandle_t node, char *name, gpio_pin_t *gpio); int gpio_pin_get_by_ofw_idx(device_t consumer, phandle_t node, @@ -150,14 +150,29 @@ int gpio_pin_get_by_ofw_property(device_t consumer, phandle_t node, char *name, gpio_pin_t *gpio); int gpio_pin_get_by_ofw_propidx(device_t consumer, phandle_t node, char *name, int idx, gpio_pin_t *gpio); +#endif /* FDT */ + +/* Acquire a pin by bus and pin number. */ +int gpio_pin_get_by_bus_pinnum(device_t _bus, uint32_t _pinnum, gpio_pin_t *_gp); + +/* Acquire a pin by child and index (used by direct children of gpiobus). 
*/ +int gpio_pin_get_by_child_index(device_t _child, uint32_t _idx, gpio_pin_t *_gp); + +/* Release a pin acquired via any gpio_pin_get_xxx() function. */ void gpio_pin_release(gpio_pin_t gpio); + +/* Work with gpio pins acquired using the functions above. */ int gpio_pin_getcaps(gpio_pin_t pin, uint32_t *caps); int gpio_pin_is_active(gpio_pin_t pin, bool *active); int gpio_pin_set_active(gpio_pin_t pin, bool active); int gpio_pin_setflags(gpio_pin_t pin, uint32_t flags); -#endif struct resource *gpio_alloc_intr_resource(device_t consumer_dev, int *rid, u_int alloc_flags, gpio_pin_t pin, uint32_t intr_mode); + +/* + * Functions shared between gpiobus and other bus classes that derive from it; + * these should not be called directly by other drivers. + */ int gpio_check_flags(uint32_t, uint32_t); device_t gpiobus_attach_bus(device_t); int gpiobus_detach_bus(device_t); diff --git a/freebsd/sys/dev/gpio/ofw_gpiobus.c b/freebsd/sys/dev/gpio/ofw_gpiobus.c index 1cf3aa82..bd617ead 100644 --- a/freebsd/sys/dev/gpio/ofw_gpiobus.c +++ b/freebsd/sys/dev/gpio/ofw_gpiobus.c @@ -49,8 +49,6 @@ __FBSDID("$FreeBSD$"); #include <rtems/bsd/local/gpiobus_if.h> -#define GPIO_ACTIVE_LOW 1 - static struct ofw_gpiobus_devinfo *ofw_gpiobus_setup_devinfo(device_t, device_t, phandle_t); static void ofw_gpiobus_destroy_devinfo(device_t, struct ofw_gpiobus_devinfo *); @@ -146,82 +144,6 @@ gpio_pin_get_by_ofw_name(device_t consumer, phandle_t node, return (gpio_pin_get_by_ofw_idx(consumer, node, idx, pin)); } -void -gpio_pin_release(gpio_pin_t gpio) -{ - device_t busdev; - - if (gpio == NULL) - return; - - KASSERT(gpio->dev != NULL, ("invalid pin state")); - - busdev = GPIO_GET_BUS(gpio->dev); - if (busdev != NULL) - gpiobus_release_pin(busdev, gpio->pin); - - /* XXXX Unreserve pin. 
*/ - free(gpio, M_DEVBUF); -} - -int -gpio_pin_getcaps(gpio_pin_t pin, uint32_t *caps) -{ - - KASSERT(pin != NULL, ("GPIO pin is NULL.")); - KASSERT(pin->dev != NULL, ("GPIO pin device is NULL.")); - return (GPIO_PIN_GETCAPS(pin->dev, pin->pin, caps)); -} - -int -gpio_pin_is_active(gpio_pin_t pin, bool *active) -{ - int rv; - uint32_t tmp; - - KASSERT(pin != NULL, ("GPIO pin is NULL.")); - KASSERT(pin->dev != NULL, ("GPIO pin device is NULL.")); - rv = GPIO_PIN_GET(pin->dev, pin->pin, &tmp); - if (rv != 0) { - return (rv); - } - - if (pin->flags & GPIO_ACTIVE_LOW) - *active = tmp == 0; - else - *active = tmp != 0; - return (0); -} - -int -gpio_pin_set_active(gpio_pin_t pin, bool active) -{ - int rv; - uint32_t tmp; - - if (pin->flags & GPIO_ACTIVE_LOW) - tmp = active ? 0 : 1; - else - tmp = active ? 1 : 0; - - KASSERT(pin != NULL, ("GPIO pin is NULL.")); - KASSERT(pin->dev != NULL, ("GPIO pin device is NULL.")); - rv = GPIO_PIN_SET(pin->dev, pin->pin, tmp); - return (rv); -} - -int -gpio_pin_setflags(gpio_pin_t pin, uint32_t flags) -{ - int rv; - - KASSERT(pin != NULL, ("GPIO pin is NULL.")); - KASSERT(pin->dev != NULL, ("GPIO pin device is NULL.")); - - rv = GPIO_PIN_SETFLAGS(pin->dev, pin->pin, flags); - return (rv); -} - /* * OFW_GPIOBUS driver. 
*/ @@ -498,7 +420,7 @@ ofw_gpiobus_probe(device_t dev) return (ENXIO); device_set_desc(dev, "OFW GPIO bus"); - return (0); + return (BUS_PROBE_DEFAULT); } static int @@ -517,6 +439,8 @@ ofw_gpiobus_attach(device_t dev) */ for (child = OF_child(ofw_bus_get_node(dev)); child != 0; child = OF_peer(child)) { + if (OF_hasprop(child, "gpio-hog")) + continue; if (!OF_hasprop(child, "gpios")) continue; if (ofw_gpiobus_add_fdt_child(dev, NULL, child) == NULL) diff --git a/freebsd/sys/dev/kbd/kbd.c b/freebsd/sys/dev/kbd/kbd.c index b157e57e..235e8f78 100644 --- a/freebsd/sys/dev/kbd/kbd.c +++ b/freebsd/sys/dev/kbd/kbd.c @@ -178,6 +178,10 @@ kbd_add_driver(keyboard_driver_t *driver) { if (SLIST_NEXT(driver, link)) return (EINVAL); + if (driver->kbdsw->get_fkeystr == NULL) + driver->kbdsw->get_fkeystr = genkbd_get_fkeystr; + if (driver->kbdsw->diag == NULL) + driver->kbdsw->diag = genkbd_diag; SLIST_INSERT_HEAD(&keyboard_drivers, driver, link); return (0); } @@ -1516,3 +1520,20 @@ kbd_ev_event(keyboard_t *kbd, uint16_t type, uint16_t code, int32_t value) kbdd_ioctl(kbd, KDSETREPEAT, (caddr_t)delay); } } + +static void +kbd_drv_init(void) +{ + const keyboard_driver_t **list; + const keyboard_driver_t *p; + + SET_FOREACH(list, kbddriver_set) { + p = *list; + if (p->kbdsw->get_fkeystr == NULL) + p->kbdsw->get_fkeystr = genkbd_get_fkeystr; + if (p->kbdsw->diag == NULL) + p->kbdsw->diag = genkbd_diag; + } +} + +SYSINIT(kbd_drv_init, SI_SUB_DRIVERS, SI_ORDER_FIRST, kbd_drv_init, NULL); diff --git a/freebsd/sys/dev/kbd/kbdreg.h b/freebsd/sys/dev/kbd/kbdreg.h index 07c4cfd9..886b6c49 100644 --- a/freebsd/sys/dev/kbd/kbdreg.h +++ b/freebsd/sys/dev/kbd/kbdreg.h @@ -205,14 +205,19 @@ typedef struct keyboard_switch { #define kbdd_poll(kbd, on) \ (*kbdsw[(kbd)->kb_index]->poll)((kbd), (on)) #define kbdd_diag(kbd, level) \ - (*kbdsw[(kbd)->kb_index]->diag)((kbd), (leve)) + (*kbdsw[(kbd)->kb_index]->diag)((kbd), (level)) -/* keyboard driver */ +/* + * Keyboard driver definition. 
Some of these be immutable after definition + * time, e.g. one shouldn't be able to rename a driver or use a different kbdsw + * entirely, but patching individual methods is acceptable. + */ typedef struct keyboard_driver { SLIST_ENTRY(keyboard_driver) link; - char *name; - keyboard_switch_t *kbdsw; - int (*configure)(int); /* backdoor for the console driver */ + const char * const name; + keyboard_switch_t * const kbdsw; + /* backdoor for the console driver */ + int (* const configure)(int); } keyboard_driver_t; #ifdef _KERNEL diff --git a/freebsd/sys/dev/mii/mii.c b/freebsd/sys/dev/mii/mii.c index 2ed40543..d0428f24 100644 --- a/freebsd/sys/dev/mii/mii.c +++ b/freebsd/sys/dev/mii/mii.c @@ -62,6 +62,7 @@ MODULE_VERSION(miibus, 1); #include <rtems/bsd/local/miibus_if.h> static device_attach_t miibus_attach; +static bus_child_detached_t miibus_child_detached; static bus_child_location_str_t miibus_child_location_str; static bus_child_pnpinfo_str_t miibus_child_pnpinfo_str; static device_detach_t miibus_detach; @@ -87,6 +88,7 @@ static device_method_t miibus_methods[] = { /* bus interface */ DEVMETHOD(bus_print_child, miibus_print_child), DEVMETHOD(bus_read_ivar, miibus_read_ivar), + DEVMETHOD(bus_child_detached, miibus_child_detached), DEVMETHOD(bus_child_pnpinfo_str, miibus_child_pnpinfo_str), DEVMETHOD(bus_child_location_str, miibus_child_location_str), DEVMETHOD(bus_hinted_child, miibus_hinted_child), @@ -162,15 +164,27 @@ static int miibus_detach(device_t dev) { struct mii_data *mii; + struct miibus_ivars *ivars; + ivars = device_get_ivars(dev); bus_generic_detach(dev); mii = device_get_softc(dev); ifmedia_removeall(&mii->mii_media); + free(ivars, M_DEVBUF); mii->mii_ifp = NULL; return (0); } +static void +miibus_child_detached(device_t dev, device_t child) +{ + struct mii_attach_args *args; + + args = device_get_ivars(child); + free(args, M_DEVBUF); +} + static int miibus_print_child(device_t dev, device_t child) { diff --git a/freebsd/sys/dev/nvme/nvme.c 
b/freebsd/sys/dev/nvme/nvme.c index 20b328c9..00759aa3 100644 --- a/freebsd/sys/dev/nvme/nvme.c +++ b/freebsd/sys/dev/nvme/nvme.c @@ -134,25 +134,6 @@ nvme_attach(device_t dev) int status; status = nvme_ctrlr_construct(ctrlr, dev); - - if (status != 0) { - nvme_ctrlr_destruct(ctrlr, dev); - return (status); - } - - /* - * Reset controller twice to ensure we do a transition from cc.en==1 to - * cc.en==0. This is because we don't really know what status the - * controller was left in when boot handed off to OS. Linux doesn't do - * this, however. If we adopt that policy, see also nvme_ctrlr_resume(). - */ - status = nvme_ctrlr_hw_reset(ctrlr); - if (status != 0) { - nvme_ctrlr_destruct(ctrlr, dev); - return (status); - } - - status = nvme_ctrlr_hw_reset(ctrlr); if (status != 0) { nvme_ctrlr_destruct(ctrlr, dev); return (status); diff --git a/freebsd/sys/dev/nvme/nvme.h b/freebsd/sys/dev/nvme/nvme.h index 16b9aa5f..21ae79cb 100644 --- a/freebsd/sys/dev/nvme/nvme.h +++ b/freebsd/sys/dev/nvme/nvme.h @@ -1561,9 +1561,19 @@ struct nvme_get_nsid { uint32_t nsid; }; +struct nvme_hmb_desc { + uint64_t addr; + uint32_t size; + uint32_t reserved; +}; + #define nvme_completion_is_error(cpl) \ (NVME_STATUS_GET_SC((cpl)->status) != 0 || NVME_STATUS_GET_SCT((cpl)->status) != 0) +#ifdef __rtems__ +/* This function is also used by user-space programs */ +#define nvme_strvis _bsd_nvme_strvis +#endif /* __rtems__ */ void nvme_strvis(uint8_t *dst, const uint8_t *src, int dstlen, int srclen); #ifdef _KERNEL @@ -1596,6 +1606,8 @@ int nvme_ctrlr_passthrough_cmd(struct nvme_controller *ctrlr, /* Admin functions */ void nvme_ctrlr_cmd_set_feature(struct nvme_controller *ctrlr, uint8_t feature, uint32_t cdw11, + uint32_t cdw12, uint32_t cdw13, + uint32_t cdw14, uint32_t cdw15, void *payload, uint32_t payload_size, nvme_cb_fn_t cb_fn, void *cb_arg); void nvme_ctrlr_cmd_get_feature(struct nvme_controller *ctrlr, diff --git a/freebsd/sys/dev/nvme/nvme_ctrlr.c b/freebsd/sys/dev/nvme/nvme_ctrlr.c 
index 86cabfba..2c19e694 100644 --- a/freebsd/sys/dev/nvme/nvme_ctrlr.c +++ b/freebsd/sys/dev/nvme/nvme_ctrlr.c @@ -43,6 +43,7 @@ __FBSDID("$FreeBSD$"); #include <sys/smp.h> #include <sys/uio.h> #include <sys/endian.h> +#include <vm/vm.h> #include "nvme_private.h" #ifdef __rtems__ @@ -66,6 +67,11 @@ nvme_ctrlr_construct_admin_qpair(struct nvme_controller *ctrlr) int error; qpair = &ctrlr->adminq; + qpair->id = 0; +#ifndef __rtems__ + qpair->cpu = CPU_FFS(&cpuset_domain[ctrlr->domain]) - 1; + qpair->domain = ctrlr->domain; +#endif /* __rtems__ */ num_entries = NVME_ADMIN_ENTRIES; TUNABLE_INT_FETCH("hw.nvme.admin_entries", &num_entries); @@ -84,34 +90,39 @@ nvme_ctrlr_construct_admin_qpair(struct nvme_controller *ctrlr) * The admin queue's max xfer size is treated differently than the * max I/O xfer size. 16KB is sufficient here - maybe even less? */ - error = nvme_qpair_construct(qpair, - 0, /* qpair ID */ - 0, /* vector */ - num_entries, - NVME_ADMIN_TRACKERS, - ctrlr); + error = nvme_qpair_construct(qpair, num_entries, NVME_ADMIN_TRACKERS, + ctrlr); return (error); } +#define QP(ctrlr, c) ((c) * (ctrlr)->num_io_queues / mp_ncpus) + static int nvme_ctrlr_construct_io_qpairs(struct nvme_controller *ctrlr) { struct nvme_qpair *qpair; uint32_t cap_lo; uint16_t mqes; - int i, error, num_entries, num_trackers; - - num_entries = NVME_IO_ENTRIES; - TUNABLE_INT_FETCH("hw.nvme.io_entries", &num_entries); + int c, error, i, n; + int num_entries, num_trackers, max_entries; /* - * NVMe spec sets a hard limit of 64K max entries, but - * devices may specify a smaller limit, so we need to check - * the MQES field in the capabilities register. + * NVMe spec sets a hard limit of 64K max entries, but devices may + * specify a smaller limit, so we need to check the MQES field in the + * capabilities register. We have to cap the number of entries to the + * current stride allows for in BAR 0/1, otherwise the remainder entries + * are inaccessable. 
MQES should reflect this, and this is just a + * fail-safe. */ + max_entries = + (rman_get_size(ctrlr->resource) - nvme_mmio_offsetof(doorbell[0])) / + (1 << (ctrlr->dstrd + 1)); + num_entries = NVME_IO_ENTRIES; + TUNABLE_INT_FETCH("hw.nvme.io_entries", &num_entries); cap_lo = nvme_mmio_read_4(ctrlr, cap_lo); mqes = NVME_CAP_LO_MQES(cap_lo); num_entries = min(num_entries, mqes + 1); + num_entries = min(num_entries, max_entries); num_trackers = NVME_IO_TRACKERS; TUNABLE_INT_FETCH("hw.nvme.io_trackers", &num_trackers); @@ -119,9 +130,9 @@ nvme_ctrlr_construct_io_qpairs(struct nvme_controller *ctrlr) num_trackers = max(num_trackers, NVME_MIN_IO_TRACKERS); num_trackers = min(num_trackers, NVME_MAX_IO_TRACKERS); /* - * No need to have more trackers than entries in the submit queue. - * Note also that for a queue size of N, we can only have (N-1) - * commands outstanding, hence the "-1" here. + * No need to have more trackers than entries in the submit queue. Note + * also that for a queue size of N, we can only have (N-1) commands + * outstanding, hence the "-1" here. */ num_trackers = min(num_trackers, (num_entries-1)); @@ -133,32 +144,37 @@ nvme_ctrlr_construct_io_qpairs(struct nvme_controller *ctrlr) */ ctrlr->max_hw_pend_io = num_trackers * ctrlr->num_io_queues * 3 / 4; - /* - * This was calculated previously when setting up interrupts, but - * a controller could theoretically support fewer I/O queues than - * MSI-X vectors. So calculate again here just to be safe. - */ - ctrlr->num_cpus_per_ioq = howmany(mp_ncpus, ctrlr->num_io_queues); - ctrlr->ioq = malloc(ctrlr->num_io_queues * sizeof(struct nvme_qpair), M_NVME, M_ZERO | M_WAITOK); - for (i = 0; i < ctrlr->num_io_queues; i++) { + for (i = c = n = 0; i < ctrlr->num_io_queues; i++, c += n) { qpair = &ctrlr->ioq[i]; /* * Admin queue has ID=0. IO queues start at ID=1 - * hence the 'i+1' here. 
- * + */ + qpair->id = i + 1; +#ifndef __rtems__ + if (ctrlr->num_io_queues > 1) { + /* Find number of CPUs served by this queue. */ + for (n = 1; QP(ctrlr, c + n) == i; n++) + ; + /* Shuffle multiple NVMe devices between CPUs. */ + qpair->cpu = c + (device_get_unit(ctrlr->dev)+n/2) % n; + qpair->domain = pcpu_find(qpair->cpu)->pc_domain; + } else { + qpair->cpu = CPU_FFS(&cpuset_domain[ctrlr->domain]) - 1; + qpair->domain = ctrlr->domain; + } +#endif /* __rtems__ */ + + /* * For I/O queues, use the controller-wide max_xfer_size * calculated in nvme_attach(). */ - error = nvme_qpair_construct(qpair, - i+1, /* qpair ID */ - ctrlr->msix_enabled ? i+1 : 0, /* vector */ - num_entries, - num_trackers, - ctrlr); + error = nvme_qpair_construct(qpair, num_entries, num_trackers, + ctrlr); if (error) return (error); @@ -167,8 +183,11 @@ nvme_ctrlr_construct_io_qpairs(struct nvme_controller *ctrlr) * interrupt thread for this controller. */ if (ctrlr->num_io_queues > 1) - bus_bind_intr(ctrlr->dev, qpair->res, - i * ctrlr->num_cpus_per_ioq); +#ifndef __rtems__ + bus_bind_intr(ctrlr->dev, qpair->res, qpair->cpu); +#else /* __rtems__ */ + bus_bind_intr(ctrlr->dev, qpair->res, QP(ctrlr, i)); +#endif /* __rtems__ */ } return (0); @@ -179,7 +198,7 @@ nvme_ctrlr_fail(struct nvme_controller *ctrlr) { int i; - ctrlr->is_failed = TRUE; + ctrlr->is_failed = true; nvme_admin_qpair_disable(&ctrlr->adminq); nvme_qpair_fail(&ctrlr->adminq); if (ctrlr->ioq != NULL) { @@ -461,6 +480,8 @@ nvme_ctrlr_set_num_qpairs(struct nvme_controller *ctrlr) */ ctrlr->num_io_queues = min(ctrlr->num_io_queues, sq_allocated); ctrlr->num_io_queues = min(ctrlr->num_io_queues, cq_allocated); + if (ctrlr->num_io_queues > vm_ndomains) + ctrlr->num_io_queues -= ctrlr->num_io_queues % vm_ndomains; return (0); } @@ -476,7 +497,7 @@ nvme_ctrlr_create_qpairs(struct nvme_controller *ctrlr) qpair = &ctrlr->ioq[i]; status.done = 0; - nvme_ctrlr_cmd_create_io_cq(ctrlr, qpair, qpair->vector, + 
nvme_ctrlr_cmd_create_io_cq(ctrlr, qpair, nvme_completion_poll_cb, &status); nvme_completion_poll(&status); if (nvme_completion_is_error(&status.cpl)) { @@ -542,7 +563,7 @@ nvme_ctrlr_construct_namespaces(struct nvme_controller *ctrlr) return (0); } -static boolean_t +static bool is_log_page_id_valid(uint8_t page_id) { @@ -554,10 +575,10 @@ is_log_page_id_valid(uint8_t page_id) case NVME_LOG_COMMAND_EFFECT: case NVME_LOG_RES_NOTIFICATION: case NVME_LOG_SANITIZE_STATUS: - return (TRUE); + return (true); } - return (FALSE); + return (false); } static uint32_t @@ -778,7 +799,7 @@ nvme_ctrlr_construct_and_submit_aer(struct nvme_controller *ctrlr, * Disable timeout here, since asynchronous event requests should by * nature never be timed out. */ - req->timeout = FALSE; + req->timeout = false; req->cmd.opc = NVME_OPC_ASYNC_EVENT_REQUEST; nvme_ctrlr_submit_admin_request(ctrlr, req); } @@ -837,6 +858,173 @@ nvme_ctrlr_configure_int_coalescing(struct nvme_controller *ctrlr) } static void +nvme_ctrlr_hmb_free(struct nvme_controller *ctrlr) +{ +#ifndef __rtems__ + struct nvme_hmb_chunk *hmbc; + int i; + + if (ctrlr->hmb_desc_paddr) { + bus_dmamap_unload(ctrlr->hmb_desc_tag, ctrlr->hmb_desc_map); + bus_dmamem_free(ctrlr->hmb_desc_tag, ctrlr->hmb_desc_vaddr, + ctrlr->hmb_desc_map); + ctrlr->hmb_desc_paddr = 0; + } + if (ctrlr->hmb_desc_tag) { + bus_dma_tag_destroy(ctrlr->hmb_desc_tag); + ctrlr->hmb_desc_tag = NULL; + } + for (i = 0; i < ctrlr->hmb_nchunks; i++) { + hmbc = &ctrlr->hmb_chunks[i]; + bus_dmamap_unload(ctrlr->hmb_tag, hmbc->hmbc_map); + bus_dmamem_free(ctrlr->hmb_tag, hmbc->hmbc_vaddr, + hmbc->hmbc_map); + } + ctrlr->hmb_nchunks = 0; + if (ctrlr->hmb_tag) { + bus_dma_tag_destroy(ctrlr->hmb_tag); + ctrlr->hmb_tag = NULL; + } + if (ctrlr->hmb_chunks) { + free(ctrlr->hmb_chunks, M_NVME); + ctrlr->hmb_chunks = NULL; + } +#endif /* __rtems__ */ +} + +#ifndef __rtems__ +static void +nvme_ctrlr_hmb_alloc(struct nvme_controller *ctrlr) +{ + struct nvme_hmb_chunk *hmbc; + 
size_t pref, min, minc, size; + int err, i; + uint64_t max; + + /* Limit HMB to 5% of RAM size per device by default. */ + max = (uint64_t)physmem * PAGE_SIZE / 20; + TUNABLE_UINT64_FETCH("hw.nvme.hmb_max", &max); + + min = (long long unsigned)ctrlr->cdata.hmmin * 4096; + if (max == 0 || max < min) + return; + pref = MIN((long long unsigned)ctrlr->cdata.hmpre * 4096, max); + minc = MAX(ctrlr->cdata.hmminds * 4096, PAGE_SIZE); + if (min > 0 && ctrlr->cdata.hmmaxd > 0) + minc = MAX(minc, min / ctrlr->cdata.hmmaxd); + ctrlr->hmb_chunk = pref; + +again: + ctrlr->hmb_chunk = roundup2(ctrlr->hmb_chunk, PAGE_SIZE); + ctrlr->hmb_nchunks = howmany(pref, ctrlr->hmb_chunk); + if (ctrlr->cdata.hmmaxd > 0 && ctrlr->hmb_nchunks > ctrlr->cdata.hmmaxd) + ctrlr->hmb_nchunks = ctrlr->cdata.hmmaxd; + ctrlr->hmb_chunks = malloc(sizeof(struct nvme_hmb_chunk) * + ctrlr->hmb_nchunks, M_NVME, M_WAITOK); + err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev), + PAGE_SIZE, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, + ctrlr->hmb_chunk, 1, ctrlr->hmb_chunk, 0, NULL, NULL, &ctrlr->hmb_tag); + if (err != 0) { + nvme_printf(ctrlr, "HMB tag create failed %d\n", err); + nvme_ctrlr_hmb_free(ctrlr); + return; + } + + for (i = 0; i < ctrlr->hmb_nchunks; i++) { + hmbc = &ctrlr->hmb_chunks[i]; + if (bus_dmamem_alloc(ctrlr->hmb_tag, + (void **)&hmbc->hmbc_vaddr, BUS_DMA_NOWAIT, + &hmbc->hmbc_map)) { + nvme_printf(ctrlr, "failed to alloc HMB\n"); + break; + } + if (bus_dmamap_load(ctrlr->hmb_tag, hmbc->hmbc_map, + hmbc->hmbc_vaddr, ctrlr->hmb_chunk, nvme_single_map, + &hmbc->hmbc_paddr, BUS_DMA_NOWAIT) != 0) { + bus_dmamem_free(ctrlr->hmb_tag, hmbc->hmbc_vaddr, + hmbc->hmbc_map); + nvme_printf(ctrlr, "failed to load HMB\n"); + break; + } + bus_dmamap_sync(ctrlr->hmb_tag, hmbc->hmbc_map, + BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); + } + + if (i < ctrlr->hmb_nchunks && i * ctrlr->hmb_chunk < min && + ctrlr->hmb_chunk / 2 >= minc) { + ctrlr->hmb_nchunks = i; + nvme_ctrlr_hmb_free(ctrlr); + 
ctrlr->hmb_chunk /= 2; + goto again; + } + ctrlr->hmb_nchunks = i; + if (ctrlr->hmb_nchunks * ctrlr->hmb_chunk < min) { + nvme_ctrlr_hmb_free(ctrlr); + return; + } + + size = sizeof(struct nvme_hmb_desc) * ctrlr->hmb_nchunks; + err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev), + 16, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, + size, 1, size, 0, NULL, NULL, &ctrlr->hmb_desc_tag); + if (err != 0) { + nvme_printf(ctrlr, "HMB desc tag create failed %d\n", err); + nvme_ctrlr_hmb_free(ctrlr); + return; + } + if (bus_dmamem_alloc(ctrlr->hmb_desc_tag, + (void **)&ctrlr->hmb_desc_vaddr, BUS_DMA_WAITOK, + &ctrlr->hmb_desc_map)) { + nvme_printf(ctrlr, "failed to alloc HMB desc\n"); + nvme_ctrlr_hmb_free(ctrlr); + return; + } + if (bus_dmamap_load(ctrlr->hmb_desc_tag, ctrlr->hmb_desc_map, + ctrlr->hmb_desc_vaddr, size, nvme_single_map, + &ctrlr->hmb_desc_paddr, BUS_DMA_NOWAIT) != 0) { + bus_dmamem_free(ctrlr->hmb_desc_tag, ctrlr->hmb_desc_vaddr, + ctrlr->hmb_desc_map); + nvme_printf(ctrlr, "failed to load HMB desc\n"); + nvme_ctrlr_hmb_free(ctrlr); + return; + } + + for (i = 0; i < ctrlr->hmb_nchunks; i++) { + ctrlr->hmb_desc_vaddr[i].addr = + htole64(ctrlr->hmb_chunks[i].hmbc_paddr); + ctrlr->hmb_desc_vaddr[i].size = htole32(ctrlr->hmb_chunk / 4096); + } + bus_dmamap_sync(ctrlr->hmb_desc_tag, ctrlr->hmb_desc_map, + BUS_DMASYNC_PREWRITE); + + nvme_printf(ctrlr, "Allocated %lluMB host memory buffer\n", + (long long unsigned)ctrlr->hmb_nchunks * ctrlr->hmb_chunk + / 1024 / 1024); +} + +static void +nvme_ctrlr_hmb_enable(struct nvme_controller *ctrlr, bool enable, bool memret) +{ + struct nvme_completion_poll_status status; + uint32_t cdw11; + + cdw11 = 0; + if (enable) + cdw11 |= 1; + if (memret) + cdw11 |= 2; + status.done = 0; + nvme_ctrlr_cmd_set_feature(ctrlr, NVME_FEAT_HOST_MEMORY_BUFFER, cdw11, + ctrlr->hmb_nchunks * ctrlr->hmb_chunk / 4096, ctrlr->hmb_desc_paddr, + ctrlr->hmb_desc_paddr >> 32, ctrlr->hmb_nchunks, NULL, 0, + nvme_completion_poll_cb, &status); 
+ nvme_completion_poll(&status); + if (nvme_completion_is_error(&status.cpl)) + nvme_printf(ctrlr, "nvme_ctrlr_hmb_enable failed!\n"); +} +#endif /* __rtems__ */ + +static void nvme_ctrlr_start(void *ctrlr_arg, bool resetting) { struct nvme_controller *ctrlr = ctrlr_arg; @@ -884,6 +1072,15 @@ nvme_ctrlr_start(void *ctrlr_arg, bool resetting) } } +#ifndef __rtems__ + if (ctrlr->cdata.hmpre > 0 && ctrlr->hmb_nchunks == 0) { + nvme_ctrlr_hmb_alloc(ctrlr); + if (ctrlr->hmb_nchunks > 0) + nvme_ctrlr_hmb_enable(ctrlr, true, false); + } else if (ctrlr->hmb_nchunks > 0) + nvme_ctrlr_hmb_enable(ctrlr, true, true); +#endif /* __rtems__ */ + if (nvme_ctrlr_create_qpairs(ctrlr) != 0) { nvme_ctrlr_fail(ctrlr); return; @@ -905,6 +1102,25 @@ void nvme_ctrlr_start_config_hook(void *arg) { struct nvme_controller *ctrlr = arg; + int status; + + /* + * Reset controller twice to ensure we do a transition from cc.en==1 to + * cc.en==0. This is because we don't really know what status the + * controller was left in when boot handed off to OS. Linux doesn't do + * this, however. If we adopt that policy, see also nvme_ctrlr_resume(). + */ + status = nvme_ctrlr_hw_reset(ctrlr); + if (status != 0) { + nvme_ctrlr_fail(ctrlr); + return; + } + + status = nvme_ctrlr_hw_reset(ctrlr); + if (status != 0) { + nvme_ctrlr_fail(ctrlr); + return; + } nvme_qpair_reset(&ctrlr->adminq); nvme_admin_qpair_enable(&ctrlr->adminq); @@ -1135,22 +1351,19 @@ nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev) uint32_t cap_lo; uint32_t cap_hi; uint32_t to; - uint8_t dstrd; uint8_t mpsmin; int status, timeout_period; ctrlr->dev = dev; mtx_init(&ctrlr->lock, "nvme ctrlr lock", NULL, MTX_DEF); +#ifndef __rtems__ + if (bus_get_domain(dev, &ctrlr->domain) != 0) + ctrlr->domain = 0; +#endif /* __rtems__ */ - /* - * Software emulators may set the doorbell stride to something - * other than zero, but this driver is not set up to handle that. 
- */ cap_hi = nvme_mmio_read_4(ctrlr, cap_hi); - dstrd = NVME_CAP_HI_DSTRD(cap_hi); - if (dstrd != 0) - return (ENXIO); + ctrlr->dstrd = NVME_CAP_HI_DSTRD(cap_hi) + 2; mpsmin = NVME_CAP_HI_MPSMIN(cap_hi); ctrlr->min_page_size = 1 << (12 + mpsmin); @@ -1186,7 +1399,7 @@ nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev) TASK_INIT(&ctrlr->reset_task, 0, nvme_ctrlr_reset_task, ctrlr); TASK_INIT(&ctrlr->fail_req_task, 0, nvme_ctrlr_fail_req_task, ctrlr); STAILQ_INIT(&ctrlr->fail_req); - ctrlr->is_failed = FALSE; + ctrlr->is_failed = false; make_dev_args_init(&md_args); md_args.mda_devsw = &nvme_ctrlr_cdevsw; @@ -1228,11 +1441,17 @@ nvme_ctrlr_destruct(struct nvme_controller *ctrlr, device_t dev) destroy_dev(ctrlr->cdev); if (ctrlr->is_initialized) { - if (!gone) + if (!gone) { +#ifndef __rtems__ + if (ctrlr->hmb_nchunks > 0) + nvme_ctrlr_hmb_enable(ctrlr, false, false); +#endif /* __rtems__ */ nvme_ctrlr_delete_qpairs(ctrlr); + } for (i = 0; i < ctrlr->num_io_queues; i++) nvme_io_qpair_destroy(&ctrlr->ioq[i]); free(ctrlr->ioq, M_NVME); + nvme_ctrlr_hmb_free(ctrlr); nvme_admin_qpair_destroy(&ctrlr->adminq); } @@ -1312,7 +1531,7 @@ nvme_ctrlr_submit_io_request(struct nvme_controller *ctrlr, { struct nvme_qpair *qpair; - qpair = &ctrlr->ioq[curcpu / ctrlr->num_cpus_per_ioq]; + qpair = &ctrlr->ioq[QP(ctrlr, curcpu)]; nvme_qpair_submit_request(qpair, req); } @@ -1356,6 +1575,11 @@ nvme_ctrlr_suspend(struct nvme_controller *ctrlr) return (EWOULDBLOCK); } +#ifndef __rtems__ + if (ctrlr->hmb_nchunks > 0) + nvme_ctrlr_hmb_enable(ctrlr, false, false); +#endif /* __rtems__ */ + /* * Per Section 7.6.2 of NVMe spec 1.4, to properly suspend, we need to * delete the hardware I/O queues, and then shutdown. 
This properly diff --git a/freebsd/sys/dev/nvme/nvme_ctrlr_cmd.c b/freebsd/sys/dev/nvme/nvme_ctrlr_cmd.c index f5c1832c..8ce51e1f 100644 --- a/freebsd/sys/dev/nvme/nvme_ctrlr_cmd.c +++ b/freebsd/sys/dev/nvme/nvme_ctrlr_cmd.c @@ -78,8 +78,7 @@ nvme_ctrlr_cmd_identify_namespace(struct nvme_controller *ctrlr, uint32_t nsid, void nvme_ctrlr_cmd_create_io_cq(struct nvme_controller *ctrlr, - struct nvme_qpair *io_que, uint16_t vector, nvme_cb_fn_t cb_fn, - void *cb_arg) + struct nvme_qpair *io_que, nvme_cb_fn_t cb_fn, void *cb_arg) { struct nvme_request *req; struct nvme_command *cmd; @@ -95,7 +94,7 @@ nvme_ctrlr_cmd_create_io_cq(struct nvme_controller *ctrlr, */ cmd->cdw10 = htole32(((io_que->num_entries-1) << 16) | io_que->id); /* 0x3 = interrupts enabled | physically contiguous */ - cmd->cdw11 = htole32((vector << 16) | 0x3); + cmd->cdw11 = htole32((io_que->vector << 16) | 0x3); cmd->prp1 = htole64(io_que->cpl_bus_addr); nvme_ctrlr_submit_admin_request(ctrlr, req); @@ -169,7 +168,8 @@ nvme_ctrlr_cmd_delete_io_sq(struct nvme_controller *ctrlr, void nvme_ctrlr_cmd_set_feature(struct nvme_controller *ctrlr, uint8_t feature, - uint32_t cdw11, void *payload, uint32_t payload_size, + uint32_t cdw11, uint32_t cdw12, uint32_t cdw13, uint32_t cdw14, + uint32_t cdw15, void *payload, uint32_t payload_size, nvme_cb_fn_t cb_fn, void *cb_arg) { struct nvme_request *req; @@ -181,6 +181,10 @@ nvme_ctrlr_cmd_set_feature(struct nvme_controller *ctrlr, uint8_t feature, cmd->opc = NVME_OPC_SET_FEATURES; cmd->cdw10 = htole32(feature); cmd->cdw11 = htole32(cdw11); + cmd->cdw12 = htole32(cdw12); + cmd->cdw13 = htole32(cdw13); + cmd->cdw14 = htole32(cdw14); + cmd->cdw15 = htole32(cdw15); nvme_ctrlr_submit_admin_request(ctrlr, req); } @@ -211,7 +215,7 @@ nvme_ctrlr_cmd_set_num_queues(struct nvme_controller *ctrlr, cdw11 = ((num_queues - 1) << 16) | (num_queues - 1); nvme_ctrlr_cmd_set_feature(ctrlr, NVME_FEAT_NUMBER_OF_QUEUES, cdw11, - NULL, 0, cb_fn, cb_arg); + 0, 0, 0, 0, NULL, 0, cb_fn, 
cb_arg); } void @@ -222,8 +226,8 @@ nvme_ctrlr_cmd_set_async_event_config(struct nvme_controller *ctrlr, cdw11 = state; nvme_ctrlr_cmd_set_feature(ctrlr, - NVME_FEAT_ASYNC_EVENT_CONFIGURATION, cdw11, NULL, 0, cb_fn, - cb_arg); + NVME_FEAT_ASYNC_EVENT_CONFIGURATION, cdw11, 0, 0, 0, 0, NULL, 0, + cb_fn, cb_arg); } void @@ -248,7 +252,7 @@ nvme_ctrlr_cmd_set_interrupt_coalescing(struct nvme_controller *ctrlr, cdw11 = ((microseconds/100) << 8) | threshold; nvme_ctrlr_cmd_set_feature(ctrlr, NVME_FEAT_INTERRUPT_COALESCING, cdw11, - NULL, 0, cb_fn, cb_arg); + 0, 0, 0, 0, NULL, 0, cb_fn, cb_arg); } void diff --git a/freebsd/sys/dev/nvme/nvme_pci.c b/freebsd/sys/dev/nvme/nvme_pci.c index b9d46a8b..6b07a5ab 100644 --- a/freebsd/sys/dev/nvme/nvme_pci.c +++ b/freebsd/sys/dev/nvme/nvme_pci.c @@ -36,6 +36,7 @@ __FBSDID("$FreeBSD$"); #include <sys/conf.h> #include <sys/proc.h> #include <sys/smp.h> +#include <vm/vm.h> #include <dev/pci/pcireg.h> #include <dev/pci/pcivar.h> @@ -235,7 +236,6 @@ nvme_ctrlr_configure_intx(struct nvme_controller *ctrlr) ctrlr->msix_enabled = 0; ctrlr->num_io_queues = 1; - ctrlr->num_cpus_per_ioq = mp_ncpus; ctrlr->rid = 0; ctrlr->res = bus_alloc_resource_any(ctrlr->dev, SYS_RES_IRQ, &ctrlr->rid, RF_SHAREABLE | RF_ACTIVE); @@ -261,82 +261,65 @@ static void nvme_ctrlr_setup_interrupts(struct nvme_controller *ctrlr) { device_t dev; - int per_cpu_io_queues; + int force_intx, num_io_queues, per_cpu_io_queues; int min_cpus_per_ioq; int num_vectors_requested, num_vectors_allocated; - int num_vectors_available; dev = ctrlr->dev; - min_cpus_per_ioq = 1; - TUNABLE_INT_FETCH("hw.nvme.min_cpus_per_ioq", &min_cpus_per_ioq); - if (min_cpus_per_ioq < 1) { - min_cpus_per_ioq = 1; - } else if (min_cpus_per_ioq > mp_ncpus) { - min_cpus_per_ioq = mp_ncpus; + force_intx = 0; + TUNABLE_INT_FETCH("hw.nvme.force_intx", &force_intx); + if (force_intx || pci_msix_count(dev) < 2) { + nvme_ctrlr_configure_intx(ctrlr); + return; } + num_io_queues = mp_ncpus; + 
TUNABLE_INT_FETCH("hw.nvme.num_io_queues", &num_io_queues); + if (num_io_queues < 1 || num_io_queues > mp_ncpus) + num_io_queues = mp_ncpus; + per_cpu_io_queues = 1; TUNABLE_INT_FETCH("hw.nvme.per_cpu_io_queues", &per_cpu_io_queues); + if (per_cpu_io_queues == 0) + num_io_queues = 1; - if (per_cpu_io_queues == 0) { - min_cpus_per_ioq = mp_ncpus; +#ifndef __rtems__ + min_cpus_per_ioq = smp_threads_per_core; +#else /* __rtems__ */ + min_cpus_per_ioq = 1; +#endif /* __rtems__ */ + TUNABLE_INT_FETCH("hw.nvme.min_cpus_per_ioq", &min_cpus_per_ioq); + if (min_cpus_per_ioq > 1) { + num_io_queues = min(num_io_queues, + max(1, mp_ncpus / min_cpus_per_ioq)); } - ctrlr->force_intx = 0; - TUNABLE_INT_FETCH("hw.nvme.force_intx", &ctrlr->force_intx); - - /* - * FreeBSD currently cannot allocate more than about 190 vectors at - * boot, meaning that systems with high core count and many devices - * requesting per-CPU interrupt vectors will not get their full - * allotment. So first, try to allocate as many as we may need to - * understand what is available, then immediately release them. - * Then figure out how many of those we will actually use, based on - * assigning an equal number of cores to each I/O queue. - */ + num_io_queues = min(num_io_queues, pci_msix_count(dev) - 1); +again: + if (num_io_queues > vm_ndomains) + num_io_queues -= num_io_queues % vm_ndomains; /* One vector for per core I/O queue, plus one vector for admin queue. */ - num_vectors_available = min(pci_msix_count(dev), mp_ncpus + 1); - if (pci_alloc_msix(dev, &num_vectors_available) != 0) { - num_vectors_available = 0; - } - pci_release_msi(dev); - - if (ctrlr->force_intx || num_vectors_available < 2) { - nvme_ctrlr_configure_intx(ctrlr); - return; - } - - /* - * Do not use all vectors for I/O queues - one must be saved for the - * admin queue. 
- */ - ctrlr->num_cpus_per_ioq = max(min_cpus_per_ioq, - howmany(mp_ncpus, num_vectors_available - 1)); - - ctrlr->num_io_queues = howmany(mp_ncpus, ctrlr->num_cpus_per_ioq); - num_vectors_requested = ctrlr->num_io_queues + 1; + num_vectors_requested = num_io_queues + 1; num_vectors_allocated = num_vectors_requested; - - /* - * Now just allocate the number of vectors we need. This should - * succeed, since we previously called pci_alloc_msix() - * successfully returning at least this many vectors, but just to - * be safe, if something goes wrong just revert to INTx. - */ if (pci_alloc_msix(dev, &num_vectors_allocated) != 0) { nvme_ctrlr_configure_intx(ctrlr); return; } - - if (num_vectors_allocated < num_vectors_requested) { + if (num_vectors_allocated < 2) { pci_release_msi(dev); nvme_ctrlr_configure_intx(ctrlr); return; } + if (num_vectors_allocated != num_vectors_requested) { + pci_release_msi(dev); + num_io_queues = num_vectors_allocated - 1; + goto again; + } ctrlr->msix_enabled = 1; + ctrlr->num_io_queues = num_io_queues; } static int diff --git a/freebsd/sys/dev/nvme/nvme_private.h b/freebsd/sys/dev/nvme/nvme_private.h index a49d2b94..54ce1bfc 100644 --- a/freebsd/sys/dev/nvme/nvme_private.h +++ b/freebsd/sys/dev/nvme/nvme_private.h @@ -147,7 +147,7 @@ struct nvme_request { } u; uint32_t type; uint32_t payload_size; - boolean_t timeout; + bool timeout; nvme_cb_fn_t cb_fn; void *cb_arg; int32_t retries; @@ -187,7 +187,10 @@ struct nvme_qpair { struct nvme_controller *ctrlr; uint32_t id; - uint32_t phase; +#ifndef __rtems__ + int domain; + int cpu; +#endif /* __rtems__ */ uint16_t vector; int rid; @@ -199,6 +202,7 @@ struct nvme_qpair { uint32_t sq_tdbl_off; uint32_t cq_hdbl_off; + uint32_t phase; uint32_t sq_head; uint32_t sq_tail; uint32_t cq_head; @@ -226,7 +230,7 @@ struct nvme_qpair { struct nvme_tracker **act_tr; - boolean_t is_enabled; + bool is_enabled; struct mtx lock __aligned(CACHE_LINE_SIZE); @@ -252,7 +256,9 @@ struct nvme_controller { device_t 
dev; struct mtx lock; - +#ifndef __rtems__ + int domain; +#endif /* __rtems__ */ uint32_t ready_timeout_in_ms; uint32_t quirks; #define QUIRK_DELAY_B4_CHK_RDY 1 /* Can't touch MMIO on disable */ @@ -272,11 +278,9 @@ struct nvme_controller { struct resource *bar4_resource; uint32_t msix_enabled; - uint32_t force_intx; uint32_t enable_aborts; uint32_t num_io_queues; - uint32_t num_cpus_per_ioq; uint32_t max_hw_pend_io; /* Fields for tracking progress during controller initialization. */ @@ -293,9 +297,6 @@ struct nvme_controller { struct resource *res; void *tag; - bus_dma_tag_t hw_desc_tag; - bus_dmamap_t hw_desc_map; - /** maximum i/o size in bytes */ uint32_t max_xfer_size; @@ -311,6 +312,9 @@ struct nvme_controller { /** timeout period in seconds */ uint32_t timeout_period; + /** doorbell stride */ + uint32_t dstrd; + struct nvme_qpair adminq; struct nvme_qpair *ioq; @@ -333,8 +337,24 @@ struct nvme_controller { uint32_t is_initialized; uint32_t notification_sent; - boolean_t is_failed; + bool is_failed; STAILQ_HEAD(, nvme_request) fail_req; + + /* Host Memory Buffer */ +#ifndef __rtems__ + int hmb_nchunks; + size_t hmb_chunk; + bus_dma_tag_t hmb_tag; + struct nvme_hmb_chunk { + bus_dmamap_t hmbc_map; + void *hmbc_vaddr; + uint64_t hmbc_paddr; + } *hmb_chunks; + bus_dma_tag_t hmb_desc_tag; + bus_dmamap_t hmb_desc_map; + struct nvme_hmb_desc *hmb_desc_vaddr; + uint64_t hmb_desc_paddr; +#endif /* __rtems__ */ }; #define nvme_mmio_offsetof(reg) \ @@ -388,7 +408,7 @@ void nvme_ctrlr_cmd_get_firmware_page(struct nvme_controller *ctrlr, nvme_cb_fn_t cb_fn, void *cb_arg); void nvme_ctrlr_cmd_create_io_cq(struct nvme_controller *ctrlr, - struct nvme_qpair *io_que, uint16_t vector, + struct nvme_qpair *io_que, nvme_cb_fn_t cb_fn, void *cb_arg); void nvme_ctrlr_cmd_create_io_sq(struct nvme_controller *ctrlr, struct nvme_qpair *io_que, @@ -424,9 +444,8 @@ void nvme_ctrlr_submit_io_request(struct nvme_controller *ctrlr, void nvme_ctrlr_post_failed_request(struct 
nvme_controller *ctrlr, struct nvme_request *req); -int nvme_qpair_construct(struct nvme_qpair *qpair, uint32_t id, - uint16_t vector, uint32_t num_entries, - uint32_t num_trackers, +int nvme_qpair_construct(struct nvme_qpair *qpair, + uint32_t num_entries, uint32_t num_trackers, struct nvme_controller *ctrlr); void nvme_qpair_submit_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr); @@ -499,7 +518,7 @@ _nvme_allocate_request(nvme_cb_fn_t cb_fn, void *cb_arg) if (req != NULL) { req->cb_fn = cb_fn; req->cb_arg = cb_arg; - req->timeout = TRUE; + req->timeout = true; } return (req); } diff --git a/freebsd/sys/dev/nvme/nvme_qpair.c b/freebsd/sys/dev/nvme/nvme_qpair.c index 6c16240d..3955f09b 100644 --- a/freebsd/sys/dev/nvme/nvme_qpair.c +++ b/freebsd/sys/dev/nvme/nvme_qpair.c @@ -34,6 +34,7 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/bus.h> #include <sys/conf.h> +#include <sys/domainset.h> #include <sys/proc.h> #include <dev/pci/pcivar.h> @@ -358,7 +359,7 @@ nvme_qpair_print_completion(struct nvme_qpair *qpair, cpl->cdw0); } -static boolean_t +static bool nvme_completion_is_retry(const struct nvme_completion *cpl) { uint8_t sct, sc, dnr; @@ -419,11 +420,12 @@ nvme_completion_is_retry(const struct nvme_completion *cpl) } static void -nvme_qpair_complete_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr, +nvme_qpair_complete_tracker(struct nvme_tracker *tr, struct nvme_completion *cpl, error_print_t print_on_error) { + struct nvme_qpair * qpair = tr->qpair; struct nvme_request *req; - boolean_t retry, error, retriable; + bool retry, error, retriable; req = tr->req; error = nvme_completion_is_error(cpl); @@ -444,8 +446,17 @@ nvme_qpair_complete_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr, KASSERT(cpl->cid == req->cmd.cid, ("cpl cid does not match cmd cid\n")); - if (req->cb_fn && !retry) - req->cb_fn(req->cb_arg, cpl); + if (!retry) { +#ifndef __rtems__ + if (req->type != NVME_REQUEST_NULL) { + 
bus_dmamap_sync(qpair->dma_tag_payload, + tr->payload_dma_map, + BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); + } +#endif /* __rtems__ */ + if (req->cb_fn) + req->cb_fn(req->cb_arg, cpl); + } mtx_lock(&qpair->lock); callout_stop(&tr->timer); @@ -456,9 +467,6 @@ nvme_qpair_complete_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr, } else { #ifndef __rtems__ if (req->type != NVME_REQUEST_NULL) { - bus_dmamap_sync(qpair->dma_tag_payload, - tr->payload_dma_map, - BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(qpair->dma_tag_payload, tr->payload_dma_map); } @@ -487,19 +495,22 @@ nvme_qpair_complete_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr, } static void -nvme_qpair_manual_complete_tracker(struct nvme_qpair *qpair, +nvme_qpair_manual_complete_tracker( struct nvme_tracker *tr, uint32_t sct, uint32_t sc, uint32_t dnr, error_print_t print_on_error) { struct nvme_completion cpl; memset(&cpl, 0, sizeof(cpl)); + + struct nvme_qpair * qpair = tr->qpair; + cpl.sqid = qpair->id; cpl.cid = tr->cid; cpl.status |= (sct & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT; cpl.status |= (sc & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT; cpl.status |= (dnr & NVME_STATUS_DNR_MASK) << NVME_STATUS_DNR_SHIFT; - nvme_qpair_complete_tracker(qpair, tr, &cpl, print_on_error); + nvme_qpair_complete_tracker(tr, &cpl, print_on_error); } void @@ -507,7 +518,7 @@ nvme_qpair_manual_complete_request(struct nvme_qpair *qpair, struct nvme_request *req, uint32_t sct, uint32_t sc) { struct nvme_completion cpl; - boolean_t error; + bool error; memset(&cpl, 0, sizeof(cpl)); cpl.sqid = qpair->id; @@ -596,7 +607,7 @@ nvme_qpair_process_completions(struct nvme_qpair *qpair) tr = qpair->act_tr[cpl.cid]; if (tr != NULL) { - nvme_qpair_complete_tracker(qpair, tr, &cpl, ERROR_PRINT_ALL); + nvme_qpair_complete_tracker(tr, &cpl, ERROR_PRINT_ALL); qpair->sq_head = cpl.sqhd; done++; } else if (!in_panic) { @@ -630,8 +641,13 @@ nvme_qpair_process_completions(struct 
nvme_qpair *qpair) qpair->phase = !qpair->phase; /* 3 */ } - nvme_mmio_write_4(qpair->ctrlr, doorbell[qpair->id].cq_hdbl, - qpair->cq_head); +#ifndef __rtems__ + bus_space_write_4(qpair->ctrlr->bus_tag, qpair->ctrlr->bus_handle, + qpair->cq_hdbl_off, qpair->cq_head); +#else /* __rtems__ */ + bus_space_write_4(qpair->ctrlr->bus_tag, qpair->ctrlr->bus_handle, + qpair->cq_hdbl_off, htole32(qpair->cq_head)); +#endif /* __rtems__ */ } return (done != 0); } @@ -645,8 +661,8 @@ nvme_qpair_msix_handler(void *arg) } int -nvme_qpair_construct(struct nvme_qpair *qpair, uint32_t id, - uint16_t vector, uint32_t num_entries, uint32_t num_trackers, +nvme_qpair_construct(struct nvme_qpair *qpair, + uint32_t num_entries, uint32_t num_trackers, struct nvme_controller *ctrlr) { struct nvme_tracker *tr; @@ -655,8 +671,7 @@ nvme_qpair_construct(struct nvme_qpair *qpair, uint32_t id, uint8_t *queuemem, *prpmem, *prp_list; int i, err; - qpair->id = id; - qpair->vector = vector; + qpair->vector = ctrlr->msix_enabled ? qpair->id : 0; qpair->num_entries = num_entries; qpair->num_trackers = num_trackers; qpair->ctrlr = ctrlr; @@ -667,19 +682,19 @@ nvme_qpair_construct(struct nvme_qpair *qpair, uint32_t id, * MSI-X vector resource IDs start at 1, so we add one to * the queue's vector to get the corresponding rid to use. 
*/ - qpair->rid = vector + 1; + qpair->rid = qpair->vector + 1; qpair->res = bus_alloc_resource_any(ctrlr->dev, SYS_RES_IRQ, &qpair->rid, RF_ACTIVE); bus_setup_intr(ctrlr->dev, qpair->res, INTR_TYPE_MISC | INTR_MPSAFE, NULL, nvme_qpair_msix_handler, qpair, &qpair->tag); - if (id == 0) { + if (qpair->id == 0) { bus_describe_intr(ctrlr->dev, qpair->res, qpair->tag, "admin"); } else { bus_describe_intr(ctrlr->dev, qpair->res, qpair->tag, - "io%d", id - 1); + "io%d", qpair->id - 1); } } @@ -717,6 +732,9 @@ nvme_qpair_construct(struct nvme_qpair *qpair, uint32_t id, nvme_printf(ctrlr, "tag create failed %d\n", err); goto out; } +#ifndef __rtems__ + bus_dma_tag_set_domain(qpair->dma_tag, qpair->domain); +#endif /* __rtems__ */ if (bus_dmamem_alloc(qpair->dma_tag, (void **)&queuemem, BUS_DMA_NOWAIT, &qpair->queuemem_map)) { @@ -741,8 +759,15 @@ nvme_qpair_construct(struct nvme_qpair *qpair, uint32_t id, qpair->cpl_bus_addr = queuemem_phys + cmdsz; prpmem_phys = queuemem_phys + cmdsz + cplsz; - qpair->sq_tdbl_off = nvme_mmio_offsetof(doorbell[id].sq_tdbl); - qpair->cq_hdbl_off = nvme_mmio_offsetof(doorbell[id].cq_hdbl); + /* + * Calcuate the stride of the doorbell register. Many emulators set this + * value to correspond to a cache line. However, some hardware has set + * it to various small values. 
+ */ + qpair->sq_tdbl_off = nvme_mmio_offsetof(doorbell[0]) + + (qpair->id << (ctrlr->dstrd + 1)); + qpair->cq_hdbl_off = nvme_mmio_offsetof(doorbell[0]) + + (qpair->id << (ctrlr->dstrd + 1)) + (1 << ctrlr->dstrd); TAILQ_INIT(&qpair->free_tr); TAILQ_INIT(&qpair->outstanding_tr); @@ -768,7 +793,8 @@ nvme_qpair_construct(struct nvme_qpair *qpair, uint32_t id, (uint8_t *)roundup2((uintptr_t)prp_list, PAGE_SIZE); } - tr = malloc(sizeof(*tr), M_NVME, M_ZERO | M_WAITOK); + tr = malloc_domainset(sizeof(*tr), M_NVME, + DOMAINSET_PREF(qpair->domain), M_ZERO | M_WAITOK); #ifndef __rtems__ bus_dmamap_create(qpair->dma_tag_payload, 0, &tr->payload_dma_map); @@ -788,8 +814,9 @@ nvme_qpair_construct(struct nvme_qpair *qpair, uint32_t id, goto out; } - qpair->act_tr = malloc(sizeof(struct nvme_tracker *) * - qpair->num_entries, M_NVME, M_ZERO | M_WAITOK); + qpair->act_tr = malloc_domainset(sizeof(struct nvme_tracker *) * + qpair->num_entries, M_NVME, DOMAINSET_PREF(qpair->domain), + M_ZERO | M_WAITOK); return (0); out: @@ -819,7 +846,7 @@ nvme_qpair_destroy(struct nvme_qpair *qpair) } if (qpair->act_tr) - free(qpair->act_tr, M_NVME); + free_domain(qpair->act_tr, M_NVME); while (!TAILQ_EMPTY(&qpair->free_tr)) { tr = TAILQ_FIRST(&qpair->free_tr); @@ -828,7 +855,7 @@ nvme_qpair_destroy(struct nvme_qpair *qpair) bus_dmamap_destroy(qpair->dma_tag_payload, tr->payload_dma_map); #endif /* __rtems__ */ - free(tr, M_NVME); + free_domain(tr, M_NVME); } if (qpair->dma_tag) @@ -848,7 +875,7 @@ nvme_admin_qpair_abort_aers(struct nvme_qpair *qpair) tr = TAILQ_FIRST(&qpair->outstanding_tr); while (tr != NULL) { if (tr->req->cmd.opc == NVME_OPC_ASYNC_EVENT_REQUEST) { - nvme_qpair_manual_complete_tracker(qpair, tr, + nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC, NVME_SC_ABORTED_SQ_DELETION, 0, ERROR_PRINT_NONE); tr = TAILQ_FIRST(&qpair->outstanding_tr); @@ -892,7 +919,7 @@ nvme_abort_complete(void *arg, const struct nvme_completion *status) */ nvme_printf(tr->qpair->ctrlr, "abort 
command failed, aborting command manually\n"); - nvme_qpair_manual_complete_tracker(tr->qpair, tr, + nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, 0, ERROR_PRINT_ALL); } } @@ -947,8 +974,13 @@ nvme_qpair_submit_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr) ctrlr = qpair->ctrlr; if (req->timeout) - callout_reset_curcpu(&tr->timer, ctrlr->timeout_period * hz, - nvme_timeout, tr); +#ifndef __rtems__ + callout_reset_on(&tr->timer, ctrlr->timeout_period * hz, + nvme_timeout, tr, qpair->cpu); +#else /* __rtems__ */ + callout_reset_on(&tr->timer, ctrlr->timeout_period * hz, + nvme_timeout, tr, -1); +#endif /* __rtems__ */ /* Copy the command from the tracker to the submission queue. */ memcpy(&qpair->cmd[qpair->sq_tail], &req->cmd, sizeof(req->cmd)); @@ -970,9 +1002,13 @@ nvme_qpair_submit_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr) wmb(); #endif /* __rtems__ */ - nvme_mmio_write_4(qpair->ctrlr, doorbell[qpair->id].sq_tdbl, - qpair->sq_tail); - +#ifndef __rtems__ + bus_space_write_4(qpair->ctrlr->bus_tag, qpair->ctrlr->bus_handle, + qpair->sq_tdbl_off, qpair->sq_tail); +#else /* __rtems__ */ + bus_space_write_4(qpair->ctrlr->bus_tag, qpair->ctrlr->bus_handle, + qpair->sq_tdbl_off, htole32(qpair->sq_tail)); +#endif /* __rtems__ */ qpair->num_cmds++; } @@ -1199,7 +1235,7 @@ _nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req) * with the qpair lock held. 
*/ mtx_unlock(&qpair->lock); - nvme_qpair_manual_complete_tracker(qpair, tr, NVME_SCT_GENERIC, + nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC, NVME_SC_DATA_TRANSFER_ERROR, DO_NOT_RETRY, ERROR_PRINT_ALL); mtx_lock(&qpair->lock); } @@ -1218,7 +1254,7 @@ static void nvme_qpair_enable(struct nvme_qpair *qpair) { - qpair->is_enabled = TRUE; + qpair->is_enabled = true; } void @@ -1257,7 +1293,7 @@ nvme_admin_qpair_enable(struct nvme_qpair *qpair) TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) { nvme_printf(qpair->ctrlr, "aborting outstanding admin command\n"); - nvme_qpair_manual_complete_tracker(qpair, tr, NVME_SCT_GENERIC, + nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, DO_NOT_RETRY, ERROR_PRINT_ALL); } @@ -1279,7 +1315,7 @@ nvme_io_qpair_enable(struct nvme_qpair *qpair) */ TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) { nvme_printf(qpair->ctrlr, "aborting outstanding i/o\n"); - nvme_qpair_manual_complete_tracker(qpair, tr, NVME_SCT_GENERIC, + nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, 0, ERROR_PRINT_NO_RETRY); } @@ -1306,7 +1342,7 @@ nvme_qpair_disable(struct nvme_qpair *qpair) { struct nvme_tracker *tr; - qpair->is_enabled = FALSE; + qpair->is_enabled = false; mtx_lock(&qpair->lock); TAILQ_FOREACH(tr, &qpair->outstanding_tr, tailq) callout_stop(&tr->timer); @@ -1358,11 +1394,10 @@ nvme_qpair_fail(struct nvme_qpair *qpair) */ nvme_printf(qpair->ctrlr, "failing outstanding i/o\n"); mtx_unlock(&qpair->lock); - nvme_qpair_manual_complete_tracker(qpair, tr, NVME_SCT_GENERIC, + nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, DO_NOT_RETRY, ERROR_PRINT_ALL); mtx_lock(&qpair->lock); } mtx_unlock(&qpair->lock); } - diff --git a/freebsd/sys/dev/nvme/nvme_sysctl.c b/freebsd/sys/dev/nvme/nvme_sysctl.c index 7110cb80..589f4f43 100644 --- a/freebsd/sys/dev/nvme/nvme_sysctl.c +++ b/freebsd/sys/dev/nvme/nvme_sysctl.c @@ 
-308,9 +308,9 @@ nvme_sysctl_initialize_ctrlr(struct nvme_controller *ctrlr) ctrlr_tree = device_get_sysctl_tree(ctrlr->dev); ctrlr_list = SYSCTL_CHILDREN(ctrlr_tree); - SYSCTL_ADD_UINT(ctrlr_ctx, ctrlr_list, OID_AUTO, "num_cpus_per_ioq", - CTLFLAG_RD, &ctrlr->num_cpus_per_ioq, 0, - "Number of CPUs assigned per I/O queue pair"); + SYSCTL_ADD_UINT(ctrlr_ctx, ctrlr_list, OID_AUTO, "num_io_queues", + CTLFLAG_RD, &ctrlr->num_io_queues, 0, + "Number of I/O queue pairs"); SYSCTL_ADD_PROC(ctrlr_ctx, ctrlr_list, OID_AUTO, "int_coal_time", CTLTYPE_UINT | CTLFLAG_RW, ctrlr, 0, diff --git a/freebsd/sys/dev/ofw/ofw_bus_subr.h b/freebsd/sys/dev/ofw/ofw_bus_subr.h index 218ba710..4a55037b 100644 --- a/freebsd/sys/dev/ofw/ofw_bus_subr.h +++ b/freebsd/sys/dev/ofw/ofw_bus_subr.h @@ -69,7 +69,8 @@ struct intr_map_data_fdt { #define FDTCOMPAT_PNP_INFO(t, busname) \ MODULE_PNP_INFO(FDTCOMPAT_PNP_DESCR, busname, t, t, sizeof(t) / sizeof(t[0])); -#define SIMPLEBUS_PNP_INFO(t) FDTCOMPAT_PNP_INFO(t, simplebus) +#define OFWBUS_PNP_INFO(t) FDTCOMPAT_PNP_INFO(t, ofwbus) +#define SIMPLEBUS_PNP_INFO(t) FDTCOMPAT_PNP_INFO(t, simplebus) /* Generic implementation of ofw_bus_if.m methods and helper routines */ int ofw_bus_gen_setup_devinfo(struct ofw_bus_devinfo *, phandle_t); diff --git a/freebsd/sys/dev/pci/pci.c b/freebsd/sys/dev/pci/pci.c index 586efc3d..f1501208 100644 --- a/freebsd/sys/dev/pci/pci.c +++ b/freebsd/sys/dev/pci/pci.c @@ -108,8 +108,6 @@ static void pci_assign_interrupt(device_t bus, device_t dev, static int pci_add_map(device_t bus, device_t dev, int reg, struct resource_list *rl, int force, int prefetch); static int pci_probe(device_t dev); -static int pci_attach(device_t dev); -static int pci_detach(device_t dev); static void pci_load_vendor_data(void); static int pci_describe_parse_line(char **ptr, int *vendor, int *device, char **desc); @@ -250,6 +248,7 @@ struct pci_quirk { #define PCI_QUIRK_UNMAP_REG 4 /* Ignore PCI map register */ #define PCI_QUIRK_DISABLE_MSIX 5 /* 
MSI-X doesn't work */ #define PCI_QUIRK_MSI_INTX_BUG 6 /* PCIM_CMD_INTxDIS disables MSI */ +#define PCI_QUIRK_REALLOC_BAR 7 /* Can't allocate memory at the default address */ int arg1; int arg2; }; @@ -331,6 +330,12 @@ static const struct pci_quirk pci_quirks[] = { { 0x167814e4, PCI_QUIRK_MSI_INTX_BUG, 0, 0 }, /* BCM5715 */ { 0x167914e4, PCI_QUIRK_MSI_INTX_BUG, 0, 0 }, /* BCM5715S */ + /* + * HPE Gen 10 VGA has a memory range that can't be allocated in the + * expected place. + */ + { 0x98741002, PCI_QUIRK_REALLOC_BAR, 0, 0 }, + { 0 } }; @@ -3311,7 +3316,9 @@ pci_add_map(device_t bus, device_t dev, int reg, struct resource_list *rl, */ res = resource_list_reserve(rl, bus, dev, type, ®, start, end, count, flags); - if (pci_do_realloc_bars && res == NULL && (start != 0 || end != ~0)) { + if ((pci_do_realloc_bars + || pci_has_quirk(pci_get_devid(dev), PCI_QUIRK_REALLOC_BAR)) + && res == NULL && (start != 0 || end != ~0)) { /* * If the allocation fails, try to allocate a resource for * this BAR using any available range. 
The firmware felt @@ -4404,7 +4411,7 @@ pci_attach_common(device_t dev) return (0); } -static int +int pci_attach(device_t dev) { int busno, domain, error; @@ -4425,7 +4432,7 @@ pci_attach(device_t dev) return (bus_generic_attach(dev)); } -static int +int pci_detach(device_t dev) { #ifdef PCI_RES_BUS diff --git a/freebsd/sys/dev/pci/pci_private.h b/freebsd/sys/dev/pci/pci_private.h index f468152b..d891f592 100644 --- a/freebsd/sys/dev/pci/pci_private.h +++ b/freebsd/sys/dev/pci/pci_private.h @@ -58,7 +58,9 @@ void pci_add_resources(device_t bus, device_t dev, int force, uint32_t prefetchmask); void pci_add_resources_ea(device_t bus, device_t dev, int alloc_iov); struct pci_devinfo *pci_alloc_devinfo_method(device_t dev); +int pci_attach(device_t dev); int pci_attach_common(device_t dev); +int pci_detach(device_t dev); int pci_rescan_method(device_t dev); void pci_driver_added(device_t dev, driver_t *driver); int pci_ea_is_enabled(device_t dev, int rid); diff --git a/freebsd/sys/dev/sdhci/sdhci.c b/freebsd/sys/dev/sdhci/sdhci.c index 5d9cf26c..ed6010e8 100644 --- a/freebsd/sys/dev/sdhci/sdhci.c +++ b/freebsd/sys/dev/sdhci/sdhci.c @@ -904,8 +904,13 @@ sdhci_init_slot(device_t dev, struct sdhci_slot *slot, int num) slot->host.host_ocr |= MMC_OCR_320_330 | MMC_OCR_330_340; if (caps & SDHCI_CAN_VDD_300) slot->host.host_ocr |= MMC_OCR_290_300 | MMC_OCR_300_310; - /* 1.8V VDD is not supposed to be used for removable cards. */ - if ((caps & SDHCI_CAN_VDD_180) && (slot->opt & SDHCI_SLOT_EMBEDDED)) + /* + * 1.8V VDD is not supposed to be used for removable cards. Hardware + * prior to v3.0 had no way to indicate embedded slots, but did + * sometimes support 1.8v for non-removable devices. 
+ */ + if ((caps & SDHCI_CAN_VDD_180) && (slot->version < SDHCI_SPEC_300 || + (slot->opt & SDHCI_SLOT_EMBEDDED))) slot->host.host_ocr |= MMC_OCR_LOW_VOLTAGE; if (slot->host.host_ocr == 0) { slot_printf(slot, "Hardware doesn't report any " diff --git a/freebsd/sys/dev/usb/controller/dwc_otg_fdt.c b/freebsd/sys/dev/usb/controller/dwc_otg_fdt.c index a7110887..65343be9 100644 --- a/freebsd/sys/dev/usb/controller/dwc_otg_fdt.c +++ b/freebsd/sys/dev/usb/controller/dwc_otg_fdt.c @@ -83,6 +83,20 @@ dwc_otg_probe(device_t dev) return (BUS_PROBE_DEFAULT); } +static int +dwc_otg_irq_index(device_t dev, int *rid) +{ + int idx, rv; + phandle_t node; + + node = ofw_bus_get_node(dev); + rv = ofw_bus_find_string_index(node, "interrupt-names", "usb", &idx); + if (rv != 0) + return (rv); + *rid = idx; + return (0); +} + int dwc_otg_attach(device_t dev) { @@ -135,10 +149,16 @@ dwc_otg_attach(device_t dev) /* - * brcm,bcm2708-usb FDT provides two interrupts, - * we need only second one (VC_USB) + * brcm,bcm2708-usb FDT provides two interrupts, we need only the USB + * interrupt (VC_USB). The latest FDT for it provides an + * interrupt-names property and swapped them around, while older ones + * did not have interrupt-names and put the usb interrupt in the second + * position. We'll attempt to use interrupt-names first with a fallback + * to the old method of assuming the index based on the compatible + * string. */ - rid = ofw_bus_is_compatible(dev, "brcm,bcm2708-usb") ? 1 : 0; + if (dwc_otg_irq_index(dev, &rid) != 0) + rid = ofw_bus_is_compatible(dev, "brcm,bcm2708-usb") ? 
1 : 0; sc->sc_otg.sc_irq_res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, RF_ACTIVE); if (sc->sc_otg.sc_irq_res == NULL) diff --git a/freebsd/sys/dev/usb/input/uep.c b/freebsd/sys/dev/usb/input/uep.c index 247bfb9c..6d11b1f7 100644 --- a/freebsd/sys/dev/usb/input/uep.c +++ b/freebsd/sys/dev/usb/input/uep.c @@ -59,7 +59,6 @@ #else #include <sys/ioccom.h> #include <sys/fcntl.h> -#include <sys/tty.h> #endif #define USB_DEBUG_VAR uep_debug diff --git a/freebsd/sys/dev/usb/input/ukbd.c b/freebsd/sys/dev/usb/input/ukbd.c index 770d1d3f..5cdaaffd 100644 --- a/freebsd/sys/dev/usb/input/ukbd.c +++ b/freebsd/sys/dev/usb/input/ukbd.c @@ -82,7 +82,6 @@ __FBSDID("$FreeBSD$"); #include <sys/ioccom.h> #include <sys/filio.h> -#include <sys/tty.h> #include <sys/kbio.h> #include <dev/kbd/kbdreg.h> @@ -1372,7 +1371,7 @@ ukbd_attach(device_t dev) sc->sc_flags |= UKBD_FLAG_ATTACHED; if (bootverbose) { - genkbd_diag(kbd, bootverbose); + kbdd_diag(kbd, bootverbose); } #ifdef USB_DEBUG @@ -2287,9 +2286,7 @@ static keyboard_switch_t ukbdsw = { .clear_state = &ukbd_clear_state, .get_state = &ukbd_get_state, .set_state = &ukbd_set_state, - .get_fkeystr = &genkbd_get_fkeystr, .poll = &ukbd_poll, - .diag = &genkbd_diag, }; KEYBOARD_DRIVER(ukbd, ukbdsw, ukbd_configure); diff --git a/freebsd/sys/dev/usb/input/ums.c b/freebsd/sys/dev/usb/input/ums.c index 4a0d1f34..65c76b4a 100644 --- a/freebsd/sys/dev/usb/input/ums.c +++ b/freebsd/sys/dev/usb/input/ums.c @@ -81,7 +81,6 @@ __FBSDID("$FreeBSD$"); #include <sys/ioccom.h> #include <sys/filio.h> -#include <sys/tty.h> #include <sys/mouse.h> #ifdef USB_DEBUG diff --git a/freebsd/sys/dev/usb/serial/uslcom.c b/freebsd/sys/dev/usb/serial/uslcom.c index 4128802d..26986b8b 100644 --- a/freebsd/sys/dev/usb/serial/uslcom.c +++ b/freebsd/sys/dev/usb/serial/uslcom.c @@ -315,6 +315,7 @@ static const STRUCT_USB_HOST_ID uslcom_devs[] = { USLCOM_DEV(SILABS, HAMLINKUSB), USLCOM_DEV(SILABS, HELICOM), USLCOM_DEV(SILABS, HUBZ), + USLCOM_DEV(SILABS, BV_AV2010_10), 
USLCOM_DEV(SILABS, IMS_USB_RS422), USLCOM_DEV(SILABS, INFINITY_MIC), USLCOM_DEV(SILABS, INGENI_ZIGBEE), @@ -626,7 +627,11 @@ uslcom_pre_param(struct ucom_softc *ucom, struct termios *t) case USLCOM_PARTNUM_CP2102: case USLCOM_PARTNUM_CP2103: default: - maxspeed = 921600; + /* + * Datasheet for cp2102 says 921600 max. Testing shows that + * 1228800 and 1843200 work fine. + */ + maxspeed = 1843200; break; } if (t->c_ospeed <= 0 || t->c_ospeed > maxspeed) diff --git a/freebsd/sys/dev/usb/usb_bus.h b/freebsd/sys/dev/usb/usb_bus.h index 710436c1..07784ded 100644 --- a/freebsd/sys/dev/usb/usb_bus.h +++ b/freebsd/sys/dev/usb/usb_bus.h @@ -2,7 +2,7 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * - * Copyright (c) 2008 Hans Petter Selasky. All rights reserved. + * Copyright (c) 2008-2019 Hans Petter Selasky. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -42,19 +42,10 @@ struct usb_bus_msg { }; /* - * The following structure defines the USB statistics structure. - */ -struct usb_bus_stat { - uint32_t uds_requests[4]; -}; - -/* * The following structure defines an USB BUS. There is one USB BUS * for every Host or Device controller. 
*/ struct usb_bus { - struct usb_bus_stat stats_err; - struct usb_bus_stat stats_ok; #if USB_HAVE_ROOT_MOUNT_HOLD struct root_hold_token *bus_roothold; #endif @@ -131,6 +122,7 @@ struct usb_bus { uint8_t do_probe; /* set if USB should be re-probed */ uint8_t no_explore; /* don't explore USB ports */ uint8_t dma_bits; /* number of DMA address lines */ + uint8_t control_ep_quirk; /* need 64kByte buffer for data stage */ }; #endif /* _USB_BUS_H_ */ diff --git a/freebsd/sys/dev/usb/usb_device.h b/freebsd/sys/dev/usb/usb_device.h index 1cf48ea1..691b2b38 100644 --- a/freebsd/sys/dev/usb/usb_device.h +++ b/freebsd/sys/dev/usb/usb_device.h @@ -2,7 +2,7 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * - * Copyright (c) 2008 Hans Petter Selasky. All rights reserved. + * Copyright (c) 2008-2019 Hans Petter Selasky. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -177,10 +177,22 @@ union usb_device_scratch { }; /* + * Helper structure to keep track of USB device statistics. + */ +struct usb_device_statistics { + uint32_t uds_requests[4]; +}; + +/* * The following structure defines an USB device. There exists one of * these structures for every USB device. 
*/ struct usb_device { + /* statistics */ + struct usb_device_statistics stats_err; + struct usb_device_statistics stats_ok; + struct usb_device_statistics stats_cancelled; + /* generic clear stall message */ struct usb_udev_msg cs_msg[2]; struct sx enum_sx; diff --git a/freebsd/sys/dev/usb/usb_generic.c b/freebsd/sys/dev/usb/usb_generic.c index d45a0bf6..b3f12249 100644 --- a/freebsd/sys/dev/usb/usb_generic.c +++ b/freebsd/sys/dev/usb/usb_generic.c @@ -2229,10 +2229,9 @@ ugen_ioctl_post(struct usb_fifo *f, u_long cmd, void *addr, int fflags) for (n = 0; n != 4; n++) { u.stat->uds_requests_fail[n] = - f->udev->bus->stats_err.uds_requests[n]; - + f->udev->stats_err.uds_requests[n]; u.stat->uds_requests_ok[n] = - f->udev->bus->stats_ok.uds_requests[n]; + f->udev->stats_ok.uds_requests[n]; } break; diff --git a/freebsd/sys/dev/usb/usb_ioctl.h b/freebsd/sys/dev/usb/usb_ioctl.h index e7e63fb9..c4023cab 100644 --- a/freebsd/sys/dev/usb/usb_ioctl.h +++ b/freebsd/sys/dev/usb/usb_ioctl.h @@ -224,7 +224,7 @@ struct usb_fs_uninit { } USB_IOCTL_STRUCT_ALIGN(1); struct usb_fs_open { -#define USB_FS_MAX_BUFSIZE (1 << 18) +#define USB_FS_MAX_BUFSIZE (1 << 25) /* 32 MBytes */ uint32_t max_bufsize; #define USB_FS_MAX_FRAMES (1U << 12) #define USB_FS_MAX_FRAMES_PRE_SCALE (1U << 31) /* for ISOCHRONOUS transfers */ diff --git a/freebsd/sys/dev/usb/usb_transfer.c b/freebsd/sys/dev/usb/usb_transfer.c index 7ea25337..2478d937 100644 --- a/freebsd/sys/dev/usb/usb_transfer.c +++ b/freebsd/sys/dev/usb/usb_transfer.c @@ -111,6 +111,33 @@ static const struct usb_config usb_control_ep_cfg[USB_CTRL_XFER_MAX] = { }, }; +static const struct usb_config usb_control_ep_quirk_cfg[USB_CTRL_XFER_MAX] = { + + /* This transfer is used for generic control endpoint transfers */ + + [0] = { + .type = UE_CONTROL, + .endpoint = 0x00, /* Control endpoint */ + .direction = UE_DIR_ANY, + .bufsize = 65535, /* bytes */ + .callback = &usb_request_callback, + .usb_mode = USB_MODE_DUAL, /* both modes */ + }, + + /* 
This transfer is used for generic clear stall only */ + + [1] = { + .type = UE_CONTROL, + .endpoint = 0x00, /* Control pipe */ + .direction = UE_DIR_ANY, + .bufsize = sizeof(struct usb_device_request), + .callback = &usb_do_clear_stall_callback, + .timeout = 1000, /* 1 second */ + .interval = 50, /* 50ms */ + .usb_mode = USB_MODE_HOST, + }, +}; + /* function prototypes */ static void usbd_update_max_frame_size(struct usb_xfer *); @@ -1051,7 +1078,8 @@ usbd_transfer_setup(struct usb_device *udev, * context, else there is a chance of * deadlock! */ - if (setup_start == usb_control_ep_cfg) + if (setup_start == usb_control_ep_cfg || + setup_start == usb_control_ep_quirk_cfg) info->done_p = USB_BUS_CONTROL_XFER_PROC(udev->bus); else if (xfer_mtx == &Giant) @@ -2595,11 +2623,14 @@ usbd_transfer_done(struct usb_xfer *xfer, usb_error_t error) } #endif /* keep some statistics */ - if (xfer->error) { - info->bus->stats_err.uds_requests + if (xfer->error == USB_ERR_CANCELLED) { + info->udev->stats_cancelled.uds_requests + [xfer->endpoint->edesc->bmAttributes & UE_XFERTYPE]++; + } else if (xfer->error != USB_ERR_NORMAL_COMPLETION) { + info->udev->stats_err.uds_requests [xfer->endpoint->edesc->bmAttributes & UE_XFERTYPE]++; } else { - info->bus->stats_ok.uds_requests + info->udev->stats_ok.uds_requests [xfer->endpoint->edesc->bmAttributes & UE_XFERTYPE]++; } @@ -3179,7 +3210,8 @@ repeat: */ iface_index = 0; if (usbd_transfer_setup(udev, &iface_index, - udev->ctrl_xfer, usb_control_ep_cfg, USB_CTRL_XFER_MAX, NULL, + udev->ctrl_xfer, udev->bus->control_ep_quirk ? 
+ usb_control_ep_quirk_cfg : usb_control_ep_cfg, USB_CTRL_XFER_MAX, NULL, &udev->device_mtx)) { DPRINTFN(0, "could not setup default " "USB transfer\n"); diff --git a/freebsd/sys/fs/devfs/devfs_vnops.c b/freebsd/sys/fs/devfs/devfs_vnops.c index 86808e21..f1027e6f 100644 --- a/freebsd/sys/fs/devfs/devfs_vnops.c +++ b/freebsd/sys/fs/devfs/devfs_vnops.c @@ -298,38 +298,27 @@ devfs_vptocnp(struct vop_vptocnp_args *ap) if (error != 0) return (error); - i = *buflen; + if (vp->v_type != VCHR && vp->v_type != VDIR) { + error = ENOENT; + goto finished; + } + dd = vp->v_data; + if (vp->v_type == VDIR && dd == dmp->dm_rootdir) { + *dvp = vp; + vref(*dvp); + goto finished; + } - if (vp->v_type == VCHR) { - i -= strlen(dd->de_cdp->cdp_c.si_name); - if (i < 0) { - error = ENOMEM; - goto finished; - } - bcopy(dd->de_cdp->cdp_c.si_name, buf + i, - strlen(dd->de_cdp->cdp_c.si_name)); - de = dd->de_dir; - } else if (vp->v_type == VDIR) { - if (dd == dmp->dm_rootdir) { - *dvp = vp; - vref(*dvp); - goto finished; - } - i -= dd->de_dirent->d_namlen; - if (i < 0) { - error = ENOMEM; - goto finished; - } - bcopy(dd->de_dirent->d_name, buf + i, - dd->de_dirent->d_namlen); - de = dd; - } else { - error = ENOENT; + i = *buflen; + i -= dd->de_dirent->d_namlen; + if (i < 0) { + error = ENOMEM; goto finished; } + bcopy(dd->de_dirent->d_name, buf + i, dd->de_dirent->d_namlen); *buflen = i; - de = devfs_parent_dirent(de); + de = devfs_parent_dirent(dd); if (de == NULL) { error = ENOENT; goto finished; @@ -828,9 +817,16 @@ out: error = ENOTTY; if (error == 0 && com == TIOCSCTTY) { - /* Do nothing if reassigning same control tty */ + /* + * Do nothing if reassigning same control tty, or if the + * control tty has already disappeared. If it disappeared, + * it's because we were racing with TIOCNOTTY. TIOCNOTTY + * already took care of releasing the old vnode and we have + * nothing left to do. 
+ */ sx_slock(&proctree_lock); - if (td->td_proc->p_session->s_ttyvp == vp) { + if (td->td_proc->p_session->s_ttyvp == vp || + td->td_proc->p_session->s_ttyp == NULL) { sx_sunlock(&proctree_lock); return (0); } @@ -938,8 +934,8 @@ devfs_lookupx(struct vop_lookup_args *ap, int *dm_unlock) if ((flags & ISDOTDOT) && (dvp->v_vflag & VV_ROOT)) return (EIO); - error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, td); - if (error) + error = vn_dir_check_exec(dvp, cnp); + if (error != 0) return (error); if (cnp->cn_namelen == 1 && *pname == '.') { diff --git a/freebsd/sys/kern/kern_conf.c b/freebsd/sys/kern/kern_conf.c index 8605cc43..e1553915 100644 --- a/freebsd/sys/kern/kern_conf.c +++ b/freebsd/sys/kern/kern_conf.c @@ -174,12 +174,6 @@ dev_rel(struct cdev *dev) dev->si_refcount--; KASSERT(dev->si_refcount >= 0, ("dev_rel(%s) gave negative count", devtoname(dev))); -#if 0 - if (dev->si_usecount == 0 && - (dev->si_flags & SI_CHEAPCLONE) && (dev->si_flags & SI_NAMED)) - ; - else -#endif if (dev->si_devsw == NULL && dev->si_refcount == 0) { LIST_REMOVE(dev, si_list); flag = 1; @@ -601,20 +595,41 @@ newdev(struct make_dev_args *args, struct cdev *si) mtx_assert(&devmtx, MA_OWNED); csw = args->mda_devsw; + si2 = NULL; if (csw->d_flags & D_NEEDMINOR) { /* We may want to return an existing device */ LIST_FOREACH(si2, &csw->d_devs, si_list) { if (dev2unit(si2) == args->mda_unit) { dev_free_devlocked(si); - return (si2); + si = si2; + break; } } + + /* + * If we're returning an existing device, we should make sure + * it isn't already initialized. This would have been caught + * in consumers anyways, but it's good to catch such a case + * early. We still need to complete initialization of the + * device, and we'll use whatever make_dev_args were passed in + * to do so. 
+ */ + KASSERT(si2 == NULL || (si2->si_flags & SI_NAMED) == 0, + ("make_dev() by driver %s on pre-existing device (min=%x, name=%s)", + args->mda_devsw->d_name, dev2unit(si2), devtoname(si2))); } si->si_drv0 = args->mda_unit; - si->si_devsw = csw; si->si_drv1 = args->mda_si_drv1; si->si_drv2 = args->mda_si_drv2; - LIST_INSERT_HEAD(&csw->d_devs, si, si_list); + /* Only push to csw->d_devs if it's not a cloned device. */ + if (si2 == NULL) { + si->si_devsw = csw; + LIST_INSERT_HEAD(&csw->d_devs, si, si_list); + } else { + KASSERT(si->si_devsw == csw, + ("%s: inconsistent devsw between clone_create() and make_dev()", + __func__)); + } return (si); } @@ -832,17 +847,6 @@ make_dev_sv(struct make_dev_args *args1, struct cdev **dres, dev_refl(dev); if ((args.mda_flags & MAKEDEV_ETERNAL) != 0) dev->si_flags |= SI_ETERNAL; - if (dev->si_flags & SI_CHEAPCLONE && - dev->si_flags & SI_NAMED) { - /* - * This is allowed as it removes races and generally - * simplifies cloning devices. - * XXX: still ?? 
- */ - dev_unlock_and_free(); - *dres = dev; - return (0); - } KASSERT(!(dev->si_flags & SI_NAMED), ("make_dev() by driver %s on pre-existing device (min=%x, name=%s)", args.mda_devsw->d_name, dev2unit(dev), devtoname(dev))); @@ -1592,7 +1596,6 @@ DB_SHOW_COMMAND(cdev, db_show_cdev) SI_FLAG(SI_ETERNAL); SI_FLAG(SI_ALIAS); SI_FLAG(SI_NAMED); - SI_FLAG(SI_CHEAPCLONE); SI_FLAG(SI_CHILD); SI_FLAG(SI_DUMPDEV); SI_FLAG(SI_CLONELIST); diff --git a/freebsd/sys/kern/kern_linker.c b/freebsd/sys/kern/kern_linker.c index 07fbd418..4571cbed 100644 --- a/freebsd/sys/kern/kern_linker.c +++ b/freebsd/sys/kern/kern_linker.c @@ -640,6 +640,10 @@ linker_make_file(const char *pathname, linker_class_t lc) lf->ndeps = 0; lf->deps = NULL; lf->loadcnt = ++loadcnt; +#ifdef __arm__ + lf->exidx_addr = 0; + lf->exidx_size = 0; +#endif STAILQ_INIT(&lf->common); TAILQ_INIT(&lf->modules); TAILQ_INSERT_TAIL(&linker_files, lf, link); @@ -2077,14 +2081,18 @@ linker_load_module(const char *kldname, const char *modname, */ KASSERT(verinfo == NULL, ("linker_load_module: verinfo" " is not NULL")); + /* check if root file system is not mounted */ + if (rootvnode == NULL || curproc->p_fd->fd_rdir == NULL) + return (ENXIO); pathname = linker_search_kld(kldname); } else { if (modlist_lookup2(modname, verinfo) != NULL) return (EEXIST); + /* check if root file system is not mounted */ + if (rootvnode == NULL || curproc->p_fd->fd_rdir == NULL) + return (ENXIO); if (kldname != NULL) pathname = strdup(kldname, M_LINKER); - else if (rootvnode == NULL) - pathname = NULL; else /* * Need to find a KLD with required module diff --git a/freebsd/sys/kern/kern_mib.c b/freebsd/sys/kern/kern_mib.c index b1c02570..d7d8a356 100644 --- a/freebsd/sys/kern/kern_mib.c +++ b/freebsd/sys/kern/kern_mib.c @@ -46,6 +46,7 @@ __FBSDID("$FreeBSD$"); #include <rtems/bsd/local/opt_config.h> #include <sys/param.h> +#include <sys/boot.h> #include <sys/jail.h> #include <sys/kernel.h> #include <sys/limits.h> @@ -87,6 +88,8 @@ 
SYSCTL_ROOT_NODE(CTL_HW, hw, CTLFLAG_RW, 0, #ifndef __rtems__ SYSCTL_ROOT_NODE(CTL_MACHDEP, machdep, CTLFLAG_RW, 0, "machine dependent"); +SYSCTL_NODE(_machdep, OID_AUTO, mitigations, CTLFLAG_RW, 0, + "Machine dependent platform mitigations."); SYSCTL_ROOT_NODE(CTL_USER, user, CTLFLAG_RW, 0, "user-level"); SYSCTL_ROOT_NODE(CTL_P1003_1B, p1003_1b, CTLFLAG_RW, 0, @@ -148,7 +151,7 @@ SYSCTL_INT(_kern, KERN_SAVED_IDS, saved_ids, CTLFLAG_RD|CTLFLAG_CAPRD, SYSCTL_NULL_INT_PTR, 0, "Whether saved set-group/user ID is available"); #endif -char kernelname[MAXPATHLEN] = "/boot/kernel/kernel"; /* XXX bloat */ +char kernelname[MAXPATHLEN] = PATH_KERNEL; /* XXX bloat */ SYSCTL_STRING(_kern, KERN_BOOTFILE, bootfile, CTLFLAG_RW | CTLFLAG_MPSAFE, kernelname, sizeof kernelname, "Name of kernel file booted"); diff --git a/freebsd/sys/kern/kern_mtxpool.c b/freebsd/sys/kern/kern_mtxpool.c index bc47d826..a7fc1078 100644 --- a/freebsd/sys/kern/kern_mtxpool.c +++ b/freebsd/sys/kern/kern_mtxpool.c @@ -85,7 +85,7 @@ struct mtx_pool { #define mtx_pool_next mtx_pool_header.mtxpool_next #ifndef __rtems__ -struct mtx_pool *mtxpool_sleep; +struct mtx_pool __read_frequently *mtxpool_sleep; #endif /* __rtems__ */ #if UINTPTR_MAX == UINT64_MAX /* 64 bits */ diff --git a/freebsd/sys/kern/kern_sysctl.c b/freebsd/sys/kern/kern_sysctl.c index b7ba41ea..f529704a 100644 --- a/freebsd/sys/kern/kern_sysctl.c +++ b/freebsd/sys/kern/kern_sysctl.c @@ -940,13 +940,18 @@ SYSINIT(sysctl, SI_SUB_KMEM, SI_ORDER_FIRST, sysctl_register_all, NULL); * (be aware though, that the proper interface isn't as obvious as it * may seem, there are various conflicting requirements. * - * {0,0} printf the entire MIB-tree. - * {0,1,...} return the name of the "..." OID. - * {0,2,...} return the next OID. - * {0,3} return the OID of the name in "new" - * {0,4,...} return the kind & format info for the "..." OID. - * {0,5,...} return the description of the "..." OID. - * {0,6,...} return the aggregation label of the "..." OID. 
+ * {CTL_SYSCTL, CTL_SYSCTL_DEBUG} printf the entire MIB-tree. + * {CTL_SYSCTL, CTL_SYSCTL_NAME, ...} return the name of the "..." + * OID. + * {CTL_SYSCTL, CTL_SYSCTL_NEXT, ...} return the next OID. + * {CTL_SYSCTL, CTL_SYSCTL_NAME2OID} return the OID of the name in + * "new" + * {CTL_SYSCTL, CTL_SYSCTL_OIDFMT, ...} return the kind & format info + * for the "..." OID. + * {CTL_SYSCTL, CTL_SYSCTL_OIDDESCR, ...} return the description of the + * "..." OID. + * {CTL_SYSCTL, CTL_SYSCTL_OIDLABEL, ...} return the aggregation label of + * the "..." OID. */ #ifdef SYSCTL_DEBUG @@ -1014,8 +1019,8 @@ sysctl_sysctl_debug(SYSCTL_HANDLER_ARGS) return (ENOENT); } -SYSCTL_PROC(_sysctl, 0, debug, CTLTYPE_STRING|CTLFLAG_RD|CTLFLAG_MPSAFE, - 0, 0, sysctl_sysctl_debug, "-", ""); +SYSCTL_PROC(_sysctl, CTL_SYSCTL_DEBUG, debug, CTLTYPE_STRING | CTLFLAG_RD | + CTLFLAG_MPSAFE, 0, 0, sysctl_sysctl_debug, "-", ""); #endif static int @@ -1080,8 +1085,8 @@ sysctl_sysctl_name(SYSCTL_HANDLER_ARGS) * XXXRW/JA: Shouldn't return name data for nodes that we don't permit in * capability mode. */ -static SYSCTL_NODE(_sysctl, 1, name, CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_CAPRD, - sysctl_sysctl_name, ""); +static SYSCTL_NODE(_sysctl, CTL_SYSCTL_NAME, name, CTLFLAG_RD | + CTLFLAG_MPSAFE | CTLFLAG_CAPRD, sysctl_sysctl_name, ""); static int sysctl_sysctl_next_ls(struct sysctl_oid_list *lsp, int *name, u_int namelen, @@ -1167,8 +1172,8 @@ sysctl_sysctl_next(SYSCTL_HANDLER_ARGS) * XXXRW/JA: Shouldn't return next data for nodes that we don't permit in * capability mode. 
*/ -static SYSCTL_NODE(_sysctl, 2, next, CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_CAPRD, - sysctl_sysctl_next, ""); +static SYSCTL_NODE(_sysctl, CTL_SYSCTL_NEXT, next, CTLFLAG_RD | + CTLFLAG_MPSAFE | CTLFLAG_CAPRD, sysctl_sysctl_next, ""); static int name2oid(char *name, int *oid, int *len, struct sysctl_oid **oidpp) @@ -1254,9 +1259,9 @@ sysctl_sysctl_name2oid(SYSCTL_HANDLER_ARGS) * XXXRW/JA: Shouldn't return name2oid data for nodes that we don't permit in * capability mode. */ -SYSCTL_PROC(_sysctl, 3, name2oid, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_MPSAFE - | CTLFLAG_CAPRW, 0, 0, sysctl_sysctl_name2oid, "I", ""); +SYSCTL_PROC(_sysctl, CTL_SYSCTL_NAME2OID, name2oid, CTLTYPE_INT | CTLFLAG_RW | + CTLFLAG_ANYBODY | CTLFLAG_MPSAFE | CTLFLAG_CAPRW, 0, 0, + sysctl_sysctl_name2oid, "I", ""); static int sysctl_sysctl_oidfmt(SYSCTL_HANDLER_ARGS) @@ -1284,8 +1289,8 @@ sysctl_sysctl_oidfmt(SYSCTL_HANDLER_ARGS) } -static SYSCTL_NODE(_sysctl, 4, oidfmt, CTLFLAG_RD|CTLFLAG_MPSAFE|CTLFLAG_CAPRD, - sysctl_sysctl_oidfmt, ""); +static SYSCTL_NODE(_sysctl, CTL_SYSCTL_OIDFMT, oidfmt, CTLFLAG_RD | + CTLFLAG_MPSAFE | CTLFLAG_CAPRD, sysctl_sysctl_oidfmt, ""); static int sysctl_sysctl_oiddescr(SYSCTL_HANDLER_ARGS) @@ -1309,8 +1314,8 @@ sysctl_sysctl_oiddescr(SYSCTL_HANDLER_ARGS) return (error); } -static SYSCTL_NODE(_sysctl, 5, oiddescr, CTLFLAG_RD|CTLFLAG_MPSAFE|CTLFLAG_CAPRD, - sysctl_sysctl_oiddescr, ""); +static SYSCTL_NODE(_sysctl, CTL_SYSCTL_OIDDESCR, oiddescr, CTLFLAG_RD | + CTLFLAG_MPSAFE|CTLFLAG_CAPRD, sysctl_sysctl_oiddescr, ""); static int sysctl_sysctl_oidlabel(SYSCTL_HANDLER_ARGS) @@ -1334,8 +1339,8 @@ sysctl_sysctl_oidlabel(SYSCTL_HANDLER_ARGS) return (error); } -static SYSCTL_NODE(_sysctl, 6, oidlabel, - CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_CAPRD, sysctl_sysctl_oidlabel, ""); +static SYSCTL_NODE(_sysctl, CTL_SYSCTL_OIDLABEL, oidlabel, CTLFLAG_RD | + CTLFLAG_MPSAFE | CTLFLAG_CAPRD, sysctl_sysctl_oidlabel, ""); /* * Default "handler" functions. 
@@ -1830,8 +1835,8 @@ kernel_sysctlbyname(struct thread *td, char *name, void *old, size_t *oldlenp, size_t oidlen, plen; int error; - oid[0] = 0; /* sysctl internal magic */ - oid[1] = 3; /* name2oid */ + oid[0] = CTL_SYSCTL; + oid[1] = CTL_SYSCTL_NAME2OID; oidlen = sizeof(oid); error = kernel_sysctl(td, oid, 2, oid, &oidlen, diff --git a/freebsd/sys/kern/kern_timeout.c b/freebsd/sys/kern/kern_timeout.c index 2f478afc..b9162020 100644 --- a/freebsd/sys/kern/kern_timeout.c +++ b/freebsd/sys/kern/kern_timeout.c @@ -52,6 +52,7 @@ __FBSDID("$FreeBSD$"); #include <sys/systm.h> #include <sys/bus.h> #include <sys/callout.h> +#include <sys/domainset.h> #include <sys/file.h> #include <sys/interrupt.h> #include <sys/kernel.h> @@ -135,7 +136,8 @@ SYSCTL_INT(_kern, OID_AUTO, pin_pcpu_swi, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pin_ * TODO: * allocate more timeout table slots when table overflows. */ -u_int callwheelsize, callwheelmask; +static u_int __read_mostly callwheelsize; +static u_int __read_mostly callwheelmask; #else /* __rtems__ */ #define callwheelsize (2 * ncallout) #define callwheelmask (callwheelsize - 1) @@ -234,7 +236,7 @@ struct callout_cpu cc_cpu; #define CC_LOCK_ASSERT(cc) mtx_assert(&(cc)->cc_lock, MA_OWNED) #ifndef __rtems__ -static int timeout_cpu; +static int __read_mostly timeout_cpu; #else /* __rtems__ */ #define timeout_cpu 0 #endif /* __rtems__ */ @@ -426,8 +428,9 @@ callout_cpu_init(struct callout_cpu *cc, int cpu) SLIST_INIT(&cc->cc_callfree); cc->cc_inited = 1; #ifndef __rtems__ - cc->cc_callwheel = malloc(sizeof(struct callout_list) * callwheelsize, - M_CALLOUT, M_WAITOK); + cc->cc_callwheel = malloc_domainset(sizeof(struct callout_list) * + callwheelsize, M_CALLOUT, + DOMAINSET_PREF(pcpu_find(cpu)->pc_domain), M_WAITOK); #endif /* __rtems__ */ for (i = 0; i < callwheelsize; i++) LIST_INIT(&cc->cc_callwheel[i]); diff --git a/freebsd/sys/kern/subr_bus.c b/freebsd/sys/kern/subr_bus.c index bfeb1c34..e43d0030 100644 --- a/freebsd/sys/kern/subr_bus.c +++ 
b/freebsd/sys/kern/subr_bus.c @@ -36,6 +36,7 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/conf.h> +#include <sys/domainset.h> #include <sys/eventhandler.h> #include <sys/filio.h> #include <sys/lock.h> @@ -2564,7 +2565,7 @@ void device_set_softc(device_t dev, void *softc) { if (dev->softc && !(dev->flags & DF_EXTERNALSOFTC)) - free(dev->softc, M_BUS_SC); + free_domain(dev->softc, M_BUS_SC); dev->softc = softc; if (dev->softc) dev->flags |= DF_EXTERNALSOFTC; @@ -2581,7 +2582,7 @@ device_set_softc(device_t dev, void *softc) void device_free_softc(void *softc) { - free(softc, M_BUS_SC); + free_domain(softc, M_BUS_SC); } /** @@ -2830,6 +2831,11 @@ device_set_devclass_fixed(device_t dev, const char *classname) int device_set_driver(device_t dev, driver_t *driver) { +#ifndef __rtems__ + int domain; + struct domainset *policy; +#endif /* __rtems__ */ + if (dev->state >= DS_ATTACHED) return (EBUSY); @@ -2837,7 +2843,7 @@ device_set_driver(device_t dev, driver_t *driver) return (0); if (dev->softc && !(dev->flags & DF_EXTERNALSOFTC)) { - free(dev->softc, M_BUS_SC); + free_domain(dev->softc, M_BUS_SC); dev->softc = NULL; } device_set_desc(dev, NULL); @@ -2846,8 +2852,14 @@ device_set_driver(device_t dev, driver_t *driver) if (driver) { kobj_init((kobj_t) dev, (kobj_class_t) driver); if (!(dev->flags & DF_EXTERNALSOFTC) && driver->size > 0) { - dev->softc = malloc(driver->size, M_BUS_SC, - M_NOWAIT | M_ZERO); +#ifndef __rtems__ + if (bus_get_domain(dev, &domain) == 0) + policy = DOMAINSET_PREF(domain); + else + policy = DOMAINSET_RR(); +#endif /* __rtems__ */ + dev->softc = malloc_domainset(driver->size, M_BUS_SC, + policy, M_NOWAIT | M_ZERO); if (!dev->softc) { kobj_delete((kobj_t) dev, NULL); kobj_init((kobj_t) dev, &null_class); @@ -3771,6 +3783,22 @@ bus_generic_attach(device_t dev) } /** + * @brief Helper function for delaying attaching children + * + * Many buses can't run transactions on the bus which children need to probe and + * attach until after 
interrupts and/or timers are running. This function + * delays their attach until interrupts and timers are enabled. + */ +int +bus_delayed_attach_children(device_t dev) +{ + /* Probe and attach the bus children when interrupts are available */ + config_intrhook_oneshot((ich_func_t)bus_generic_attach, dev); + + return (0); +} + +/** * @brief Helper function for implementing DEVICE_DETACH() * * This function can be used to help implement the DEVICE_DETACH() for diff --git a/freebsd/sys/kern/subr_firmware.c b/freebsd/sys/kern/subr_firmware.c index cc8bb691..2780963c 100644 --- a/freebsd/sys/kern/subr_firmware.c +++ b/freebsd/sys/kern/subr_firmware.c @@ -260,7 +260,6 @@ firmware_unregister(const char *imagename) static void loadimage(void *arg, int npending) { - struct thread *td = curthread; char *imagename = arg; struct priv_fw *fp; linker_file_t result; @@ -270,11 +269,6 @@ loadimage(void *arg, int npending) mtx_lock(&firmware_mtx); mtx_unlock(&firmware_mtx); - if (td->td_proc->p_fd->fd_rdir == NULL) { - printf("%s: root not mounted yet, no way to load image\n", - imagename); - goto done; - } error = linker_reference_module(imagename, NULL, &result); if (error != 0) { printf("%s: could not load firmware image, error %d\n", diff --git a/freebsd/sys/kern/subr_gtaskqueue.c b/freebsd/sys/kern/subr_gtaskqueue.c index 173cfa08..c061c6b0 100644 --- a/freebsd/sys/kern/subr_gtaskqueue.c +++ b/freebsd/sys/kern/subr_gtaskqueue.c @@ -63,26 +63,26 @@ TASKQGROUP_DEFINE(softirq, mp_ncpus, 1); TASKQGROUP_DEFINE(config, 1, 1); struct gtaskqueue_busy { - struct gtask *tb_running; - TAILQ_ENTRY(gtaskqueue_busy) tb_link; + struct gtask *tb_running; + u_int tb_seq; + LIST_ENTRY(gtaskqueue_busy) tb_link; }; -static struct gtask * const TB_DRAIN_WAITER = (struct gtask *)0x1; - typedef void (*gtaskqueue_enqueue_fn)(void *context); struct gtaskqueue { STAILQ_HEAD(, gtask) tq_queue; + LIST_HEAD(, gtaskqueue_busy) tq_active; + u_int tq_seq; + int tq_callouts; + struct mtx_padalign tq_mutex; 
gtaskqueue_enqueue_fn tq_enqueue; void *tq_context; char *tq_name; - TAILQ_HEAD(, gtaskqueue_busy) tq_active; - struct mtx tq_mutex; struct thread **tq_threads; int tq_tcount; int tq_spin; int tq_flags; - int tq_callouts; taskqueue_callback_fn tq_callbacks[TASKQUEUE_NUM_CALLBACKS]; void *tq_cb_contexts[TASKQUEUE_NUM_CALLBACKS]; }; @@ -121,12 +121,11 @@ gtask_dump(struct gtask *gtask) #endif static __inline int -TQ_SLEEP(struct gtaskqueue *tq, void *p, struct mtx *m, int pri, const char *wm, - int t) +TQ_SLEEP(struct gtaskqueue *tq, void *p, const char *wm) { if (tq->tq_spin) - return (msleep_spin(p, m, wm, t)); - return (msleep(p, m, pri, wm, t)); + return (msleep_spin(p, (struct mtx *)&tq->tq_mutex, wm, 0)); + return (msleep(p, &tq->tq_mutex, 0, wm, 0)); } static struct gtaskqueue * @@ -150,7 +149,7 @@ _gtaskqueue_create(const char *name, int mflags, } STAILQ_INIT(&queue->tq_queue); - TAILQ_INIT(&queue->tq_active); + LIST_INIT(&queue->tq_active); queue->tq_enqueue = enqueue; queue->tq_context = context; queue->tq_name = tq_name; @@ -173,7 +172,7 @@ gtaskqueue_terminate(struct thread **pp, struct gtaskqueue *tq) while (tq->tq_tcount > 0 || tq->tq_callouts > 0) { wakeup(tq); - TQ_SLEEP(tq, pp, &tq->tq_mutex, PWAIT, "taskqueue_destroy", 0); + TQ_SLEEP(tq, pp, "gtq_destroy"); } } @@ -184,7 +183,7 @@ gtaskqueue_free(struct gtaskqueue *queue) TQ_LOCK(queue); queue->tq_flags &= ~TQ_FLAGS_ACTIVE; gtaskqueue_terminate(queue->tq_threads, queue); - KASSERT(TAILQ_EMPTY(&queue->tq_active), ("Tasks still running?")); + KASSERT(LIST_EMPTY(&queue->tq_active), ("Tasks still running?")); KASSERT(queue->tq_callouts == 0, ("Armed timeout tasks")); mtx_destroy(&queue->tq_mutex); free(queue->tq_threads, M_GTASKQUEUE); @@ -291,7 +290,7 @@ gtaskqueue_drain_tq_queue(struct gtaskqueue *queue) * have completed or are currently executing. 
*/ while (t_barrier.ta_flags & TASK_ENQUEUED) - TQ_SLEEP(queue, &t_barrier, &queue->tq_mutex, PWAIT, "-", 0); + TQ_SLEEP(queue, &t_barrier, "gtq_qdrain"); } /* @@ -302,31 +301,24 @@ gtaskqueue_drain_tq_queue(struct gtaskqueue *queue) static void gtaskqueue_drain_tq_active(struct gtaskqueue *queue) { - struct gtaskqueue_busy tb_marker, *tb_first; + struct gtaskqueue_busy *tb; + u_int seq; - if (TAILQ_EMPTY(&queue->tq_active)) + if (LIST_EMPTY(&queue->tq_active)) return; /* Block taskq_terminate().*/ queue->tq_callouts++; - /* - * Wait for all currently executing taskqueue threads - * to go idle. - */ - tb_marker.tb_running = TB_DRAIN_WAITER; - TAILQ_INSERT_TAIL(&queue->tq_active, &tb_marker, tb_link); - while (TAILQ_FIRST(&queue->tq_active) != &tb_marker) - TQ_SLEEP(queue, &tb_marker, &queue->tq_mutex, PWAIT, "-", 0); - TAILQ_REMOVE(&queue->tq_active, &tb_marker, tb_link); - - /* - * Wakeup any other drain waiter that happened to queue up - * without any intervening active thread. - */ - tb_first = TAILQ_FIRST(&queue->tq_active); - if (tb_first != NULL && tb_first->tb_running == TB_DRAIN_WAITER) - wakeup(tb_first); + /* Wait for any active task with sequence from the past. */ + seq = queue->tq_seq; +restart: + LIST_FOREACH(tb, &queue->tq_active, tb_link) { + if ((int)(tb->tb_seq - seq) <= 0) { + TQ_SLEEP(queue, tb->tb_running, "gtq_adrain"); + goto restart; + } + } /* Release taskqueue_terminate(). 
*/ queue->tq_callouts--; @@ -358,40 +350,27 @@ static void gtaskqueue_run_locked(struct gtaskqueue *queue) { struct gtaskqueue_busy tb; - struct gtaskqueue_busy *tb_first; struct gtask *gtask; KASSERT(queue != NULL, ("tq is NULL")); TQ_ASSERT_LOCKED(queue); tb.tb_running = NULL; + LIST_INSERT_HEAD(&queue->tq_active, &tb, tb_link); - while (STAILQ_FIRST(&queue->tq_queue)) { - TAILQ_INSERT_TAIL(&queue->tq_active, &tb, tb_link); - - /* - * Carefully remove the first task from the queue and - * clear its TASK_ENQUEUED flag - */ - gtask = STAILQ_FIRST(&queue->tq_queue); - KASSERT(gtask != NULL, ("task is NULL")); + while ((gtask = STAILQ_FIRST(&queue->tq_queue)) != NULL) { STAILQ_REMOVE_HEAD(&queue->tq_queue, ta_link); gtask->ta_flags &= ~TASK_ENQUEUED; tb.tb_running = gtask; + tb.tb_seq = ++queue->tq_seq; TQ_UNLOCK(queue); KASSERT(gtask->ta_func != NULL, ("task->ta_func is NULL")); gtask->ta_func(gtask->ta_context); TQ_LOCK(queue); - tb.tb_running = NULL; wakeup(gtask); - - TAILQ_REMOVE(&queue->tq_active, &tb, tb_link); - tb_first = TAILQ_FIRST(&queue->tq_active); - if (tb_first != NULL && - tb_first->tb_running == TB_DRAIN_WAITER) - wakeup(tb_first); } + LIST_REMOVE(&tb, tb_link); } static int @@ -400,7 +379,7 @@ task_is_running(struct gtaskqueue *queue, struct gtask *gtask) struct gtaskqueue_busy *tb; TQ_ASSERT_LOCKED(queue); - TAILQ_FOREACH(tb, &queue->tq_active, tb_link) { + LIST_FOREACH(tb, &queue->tq_active, tb_link) { if (tb->tb_running == gtask) return (1); } @@ -433,7 +412,7 @@ static void gtaskqueue_drain_locked(struct gtaskqueue *queue, struct gtask *gtask) { while ((gtask->ta_flags & TASK_ENQUEUED) || task_is_running(queue, gtask)) - TQ_SLEEP(queue, gtask, &queue->tq_mutex, PWAIT, "-", 0); + TQ_SLEEP(queue, gtask, "gtq_drain"); } void @@ -580,7 +559,7 @@ gtaskqueue_thread_loop(void *arg) */ if ((tq->tq_flags & TQ_FLAGS_ACTIVE) == 0) break; - TQ_SLEEP(tq, tq, &tq->tq_mutex, 0, "-", 0); + TQ_SLEEP(tq, tq, "-"); } gtaskqueue_run_locked(tq); /* @@ -606,7 +585,7 
@@ gtaskqueue_thread_enqueue(void *context) tqp = context; tq = *tqp; - wakeup_one(tq); + wakeup_any(tq); } diff --git a/freebsd/sys/kern/subr_taskqueue.c b/freebsd/sys/kern/subr_taskqueue.c index 67e62fc8..85912248 100644 --- a/freebsd/sys/kern/subr_taskqueue.c +++ b/freebsd/sys/kern/subr_taskqueue.c @@ -58,26 +58,27 @@ static void taskqueue_swi_enqueue(void *); static void taskqueue_swi_giant_enqueue(void *); struct taskqueue_busy { - struct task *tb_running; - TAILQ_ENTRY(taskqueue_busy) tb_link; + struct task *tb_running; + u_int tb_seq; + LIST_ENTRY(taskqueue_busy) tb_link; }; -struct task * const TB_DRAIN_WAITER = (struct task *)0x1; - struct taskqueue { STAILQ_HEAD(, task) tq_queue; + LIST_HEAD(, taskqueue_busy) tq_active; + struct task *tq_hint; + u_int tq_seq; + int tq_callouts; + struct mtx_padalign tq_mutex; taskqueue_enqueue_fn tq_enqueue; void *tq_context; char *tq_name; - TAILQ_HEAD(, taskqueue_busy) tq_active; - struct mtx tq_mutex; struct thread **tq_threads; int tq_tcount; #ifndef __rtems__ int tq_spin; #endif /* __rtems__ */ int tq_flags; - int tq_callouts; taskqueue_callback_fn tq_callbacks[TASKQUEUE_NUM_CALLBACKS]; void *tq_cb_contexts[TASKQUEUE_NUM_CALLBACKS]; }; @@ -134,14 +135,13 @@ _timeout_task_init(struct taskqueue *queue, struct timeout_task *timeout_task, } static __inline int -TQ_SLEEP(struct taskqueue *tq, void *p, struct mtx *m, int pri, const char *wm, - int t) +TQ_SLEEP(struct taskqueue *tq, void *p, const char *wm) { #ifndef __rtems__ if (tq->tq_spin) - return (msleep_spin(p, m, wm, t)); + return (msleep_spin(p, (struct mtx *)&tq->tq_mutex, wm, 0)); #endif /* __rtems__ */ - return (msleep(p, m, pri, wm, t)); + return (msleep(p, &tq->tq_mutex, 0, wm, 0)); } static struct taskqueue * @@ -165,7 +165,7 @@ _taskqueue_create(const char *name, int mflags, snprintf(tq_name, TASKQUEUE_NAMELEN, "%s", (name) ? 
name : "taskqueue"); STAILQ_INIT(&queue->tq_queue); - TAILQ_INIT(&queue->tq_active); + LIST_INIT(&queue->tq_active); queue->tq_enqueue = enqueue; queue->tq_context = context; queue->tq_name = tq_name; @@ -223,7 +223,7 @@ taskqueue_terminate(struct thread **pp, struct taskqueue *tq) while (tq->tq_tcount > 0 || tq->tq_callouts > 0) { wakeup(tq); - TQ_SLEEP(tq, pp, &tq->tq_mutex, PWAIT, "taskqueue_destroy", 0); + TQ_SLEEP(tq, pp, "tq_destroy"); } } @@ -234,7 +234,7 @@ taskqueue_free(struct taskqueue *queue) TQ_LOCK(queue); queue->tq_flags &= ~TQ_FLAGS_ACTIVE; taskqueue_terminate(queue->tq_threads, queue); - KASSERT(TAILQ_EMPTY(&queue->tq_active), ("Tasks still running?")); + KASSERT(LIST_EMPTY(&queue->tq_active), ("Tasks still running?")); KASSERT(queue->tq_callouts == 0, ("Armed timeout tasks")); mtx_destroy(&queue->tq_mutex); free(queue->tq_threads, M_TASKQUEUE); @@ -260,21 +260,30 @@ taskqueue_enqueue_locked(struct taskqueue *queue, struct task *task) } /* - * Optimise the case when all tasks have the same priority. + * Optimise cases when all tasks use small set of priorities. + * In case of only one priority we always insert at the end. + * In case of two tq_hint typically gives the insertion point. + * In case of more then two tq_hint should halve the search. 
*/ prev = STAILQ_LAST(&queue->tq_queue, task, ta_link); if (!prev || prev->ta_priority >= task->ta_priority) { STAILQ_INSERT_TAIL(&queue->tq_queue, task, ta_link); } else { - prev = NULL; - for (ins = STAILQ_FIRST(&queue->tq_queue); ins; - prev = ins, ins = STAILQ_NEXT(ins, ta_link)) + prev = queue->tq_hint; + if (prev && prev->ta_priority >= task->ta_priority) { + ins = STAILQ_NEXT(prev, ta_link); + } else { + prev = NULL; + ins = STAILQ_FIRST(&queue->tq_queue); + } + for (; ins; prev = ins, ins = STAILQ_NEXT(ins, ta_link)) if (ins->ta_priority < task->ta_priority) break; - if (prev) + if (prev) { STAILQ_INSERT_AFTER(&queue->tq_queue, prev, task, ta_link); - else + queue->tq_hint = task; + } else STAILQ_INSERT_HEAD(&queue->tq_queue, task, ta_link); } @@ -393,6 +402,7 @@ taskqueue_drain_tq_queue(struct taskqueue *queue) */ TASK_INIT(&t_barrier, USHRT_MAX, taskqueue_task_nop_fn, &t_barrier); STAILQ_INSERT_TAIL(&queue->tq_queue, &t_barrier, ta_link); + queue->tq_hint = &t_barrier; t_barrier.ta_pending = 1; /* @@ -400,7 +410,7 @@ taskqueue_drain_tq_queue(struct taskqueue *queue) * have completed or are currently executing. */ while (t_barrier.ta_pending != 0) - TQ_SLEEP(queue, &t_barrier, &queue->tq_mutex, PWAIT, "-", 0); + TQ_SLEEP(queue, &t_barrier, "tq_qdrain"); return (1); } @@ -412,31 +422,24 @@ taskqueue_drain_tq_queue(struct taskqueue *queue) static int taskqueue_drain_tq_active(struct taskqueue *queue) { - struct taskqueue_busy tb_marker, *tb_first; + struct taskqueue_busy *tb; + u_int seq; - if (TAILQ_EMPTY(&queue->tq_active)) + if (LIST_EMPTY(&queue->tq_active)) return (0); /* Block taskq_terminate().*/ queue->tq_callouts++; - /* - * Wait for all currently executing taskqueue threads - * to go idle. 
- */ - tb_marker.tb_running = TB_DRAIN_WAITER; - TAILQ_INSERT_TAIL(&queue->tq_active, &tb_marker, tb_link); - while (TAILQ_FIRST(&queue->tq_active) != &tb_marker) - TQ_SLEEP(queue, &tb_marker, &queue->tq_mutex, PWAIT, "-", 0); - TAILQ_REMOVE(&queue->tq_active, &tb_marker, tb_link); - - /* - * Wakeup any other drain waiter that happened to queue up - * without any intervening active thread. - */ - tb_first = TAILQ_FIRST(&queue->tq_active); - if (tb_first != NULL && tb_first->tb_running == TB_DRAIN_WAITER) - wakeup(tb_first); + /* Wait for any active task with sequence from the past. */ + seq = queue->tq_seq; +restart: + LIST_FOREACH(tb, &queue->tq_active, tb_link) { + if ((int)(tb->tb_seq - seq) <= 0) { + TQ_SLEEP(queue, tb->tb_running, "tq_adrain"); + goto restart; + } + } /* Release taskqueue_terminate(). */ queue->tq_callouts--; @@ -469,42 +472,31 @@ static void taskqueue_run_locked(struct taskqueue *queue) { struct taskqueue_busy tb; - struct taskqueue_busy *tb_first; struct task *task; int pending; KASSERT(queue != NULL, ("tq is NULL")); TQ_ASSERT_LOCKED(queue); tb.tb_running = NULL; + LIST_INSERT_HEAD(&queue->tq_active, &tb, tb_link); - while (STAILQ_FIRST(&queue->tq_queue)) { - TAILQ_INSERT_TAIL(&queue->tq_active, &tb, tb_link); - - /* - * Carefully remove the first task from the queue and - * zero its pending count. 
- */ - task = STAILQ_FIRST(&queue->tq_queue); - KASSERT(task != NULL, ("task is NULL")); + while ((task = STAILQ_FIRST(&queue->tq_queue)) != NULL) { STAILQ_REMOVE_HEAD(&queue->tq_queue, ta_link); + if (queue->tq_hint == task) + queue->tq_hint = NULL; pending = task->ta_pending; task->ta_pending = 0; tb.tb_running = task; + tb.tb_seq = ++queue->tq_seq; TQ_UNLOCK(queue); KASSERT(task->ta_func != NULL, ("task->ta_func is NULL")); task->ta_func(task->ta_context, pending); TQ_LOCK(queue); - tb.tb_running = NULL; wakeup(task); - - TAILQ_REMOVE(&queue->tq_active, &tb, tb_link); - tb_first = TAILQ_FIRST(&queue->tq_active); - if (tb_first != NULL && - tb_first->tb_running == TB_DRAIN_WAITER) - wakeup(tb_first); } + LIST_REMOVE(&tb, tb_link); } void @@ -522,7 +514,7 @@ task_is_running(struct taskqueue *queue, struct task *task) struct taskqueue_busy *tb; TQ_ASSERT_LOCKED(queue); - TAILQ_FOREACH(tb, &queue->tq_active, tb_link) { + LIST_FOREACH(tb, &queue->tq_active, tb_link) { if (tb->tb_running == task) return (1); } @@ -551,8 +543,11 @@ taskqueue_cancel_locked(struct taskqueue *queue, struct task *task, u_int *pendp) { - if (task->ta_pending > 0) + if (task->ta_pending > 0) { STAILQ_REMOVE(&queue->tq_queue, task, task, ta_link); + if (queue->tq_hint == task) + queue->tq_hint = NULL; + } if (pendp != NULL) *pendp = task->ta_pending; task->ta_pending = 0; @@ -603,7 +598,7 @@ taskqueue_drain(struct taskqueue *queue, struct task *task) TQ_LOCK(queue); while (task->ta_pending != 0 || task_is_running(queue, task)) - TQ_SLEEP(queue, task, &queue->tq_mutex, PWAIT, "-", 0); + TQ_SLEEP(queue, task, "tq_drain"); TQ_UNLOCK(queue); } @@ -687,7 +682,7 @@ taskqueue_swi_giant_run(void *dummy) static int _taskqueue_start_threads(struct taskqueue **tqp, int count, int pri, - cpuset_t *mask, const char *name, va_list ap) + cpuset_t *mask, struct proc *p, const char *name, va_list ap) { char ktname[MAXCOMLEN + 1]; struct thread *td; @@ -709,10 +704,10 @@ _taskqueue_start_threads(struct 
taskqueue **tqp, int count, int pri, for (i = 0; i < count; i++) { if (count == 1) - error = kthread_add(taskqueue_thread_loop, tqp, NULL, + error = kthread_add(taskqueue_thread_loop, tqp, p, &tq->tq_threads[i], RFSTOPPED, 0, "%s", ktname); else - error = kthread_add(taskqueue_thread_loop, tqp, NULL, + error = kthread_add(taskqueue_thread_loop, tqp, p, &tq->tq_threads[i], RFSTOPPED, 0, "%s_%d", ktname, i); if (error) { @@ -766,7 +761,20 @@ taskqueue_start_threads(struct taskqueue **tqp, int count, int pri, int error; va_start(ap, name); - error = _taskqueue_start_threads(tqp, count, pri, NULL, name, ap); + error = _taskqueue_start_threads(tqp, count, pri, NULL, NULL, name, ap); + va_end(ap); + return (error); +} + +int +taskqueue_start_threads_in_proc(struct taskqueue **tqp, int count, int pri, + struct proc *proc, const char *name, ...) +{ + va_list ap; + int error; + + va_start(ap, name); + error = _taskqueue_start_threads(tqp, count, pri, NULL, proc, name, ap); va_end(ap); return (error); } @@ -779,7 +787,7 @@ taskqueue_start_threads_cpuset(struct taskqueue **tqp, int count, int pri, int error; va_start(ap, name); - error = _taskqueue_start_threads(tqp, count, pri, mask, name, ap); + error = _taskqueue_start_threads(tqp, count, pri, mask, NULL, name, ap); va_end(ap); return (error); } @@ -815,7 +823,7 @@ taskqueue_thread_loop(void *arg) */ if ((tq->tq_flags & TQ_FLAGS_ACTIVE) == 0) break; - TQ_SLEEP(tq, tq, &tq->tq_mutex, 0, "-", 0); + TQ_SLEEP(tq, tq, "-"); } taskqueue_run_locked(tq); /* diff --git a/freebsd/sys/kern/sys_pipe.c b/freebsd/sys/kern/sys_pipe.c index e20c67ea..aef35fc1 100755 --- a/freebsd/sys/kern/sys_pipe.c +++ b/freebsd/sys/kern/sys_pipe.c @@ -1135,15 +1135,8 @@ retry: goto error1; } - while (wpipe->pipe_map.cnt != 0) { - if (wpipe->pipe_state & PIPE_EOF) { - wpipe->pipe_map.cnt = 0; - pipe_destroy_write_buffer(wpipe); - pipeselwakeup(wpipe); - pipeunlock(wpipe); - error = EPIPE; - goto error1; - } + while (wpipe->pipe_map.cnt != 0 && + 
(wpipe->pipe_state & PIPE_EOF) == 0) { if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); @@ -1158,12 +1151,16 @@ retry: break; } - if (wpipe->pipe_state & PIPE_EOF) + if ((wpipe->pipe_state & PIPE_EOF) != 0) { + wpipe->pipe_map.cnt = 0; + pipe_destroy_write_buffer(wpipe); + pipeselwakeup(wpipe); error = EPIPE; - if (error == EINTR || error == ERESTART) + } else if (error == EINTR || error == ERESTART) { pipe_clone_write_buffer(wpipe); - else + } else { pipe_destroy_write_buffer(wpipe); + } pipeunlock(wpipe); KASSERT((wpipe->pipe_state & PIPE_DIRECTW) == 0, ("pipe %p leaked PIPE_DIRECTW", wpipe)); diff --git a/freebsd/sys/kern/tty.c b/freebsd/sys/kern/tty.c index 88b928b9..d0b5633c 100644 --- a/freebsd/sys/kern/tty.c +++ b/freebsd/sys/kern/tty.c @@ -1184,6 +1184,7 @@ void tty_rel_gone(struct tty *tp) { + tty_lock_assert(tp, MA_OWNED); MPASS(!tty_gone(tp)); /* Simulate carrier removal. */ @@ -1198,6 +1199,73 @@ tty_rel_gone(struct tty *tp) tty_rel_free(tp); } +#ifndef __rtems__ +static int +tty_drop_ctty(struct tty *tp, struct proc *p) +{ + struct session *session; + struct vnode *vp; + + /* + * This looks terrible, but it's generally safe as long as the tty + * hasn't gone away while we had the lock dropped. All of our sanity + * checking that this operation is OK happens after we've picked it back + * up, so other state changes are generally not fatal and the potential + * for this particular operation to happen out-of-order in a + * multithreaded scenario is likely a non-issue. + */ + tty_unlock(tp); + sx_xlock(&proctree_lock); + tty_lock(tp); + if (tty_gone(tp)) { + sx_xunlock(&proctree_lock); + return (ENODEV); + } + + /* + * If the session doesn't have a controlling TTY, or if we weren't + * invoked on the controlling TTY, we'll return ENOIOCTL as we've + * historically done. 
+ */ + session = p->p_session; + if (session->s_ttyp == NULL || session->s_ttyp != tp) { + sx_xunlock(&proctree_lock); + return (ENOTTY); + } + + if (!SESS_LEADER(p)) { + sx_xunlock(&proctree_lock); + return (EPERM); + } + + PROC_LOCK(p); + SESS_LOCK(session); + vp = session->s_ttyvp; + session->s_ttyp = NULL; + session->s_ttyvp = NULL; + session->s_ttydp = NULL; + SESS_UNLOCK(session); + + tp->t_sessioncnt--; + p->p_flag &= ~P_CONTROLT; + PROC_UNLOCK(p); + sx_xunlock(&proctree_lock); + + /* + * If we did have a vnode, release our reference. Ordinarily we manage + * these at the devfs layer, but we can't necessarily know that we were + * invoked on the vnode referenced in the session (i.e. the vnode we + * hold a reference to). We explicitly don't check VBAD/VI_DOOMED here + * to avoid a vnode leak -- in circumstances elsewhere where we'd hit a + * VI_DOOMED vnode, release has been deferred until the controlling TTY + * is either changed or released. + */ + if (vp != NULL) + vrele(vp); + return (0); +} +#endif /* __rtems__ */ + /* * Exposing information about current TTY's through sysctl */ @@ -1738,6 +1806,10 @@ tty_generic_ioctl(struct tty *tp, u_long cmd, void *data, int fflag, *(int *)data = NO_PID; #endif /* __rtems__ */ return (0); +#ifndef __rtems__ + case TIOCNOTTY: + return (tty_drop_ctty(tp, td->td_proc)); +#endif /* __rtems__ */ case TIOCSCTTY: { #ifndef __rtems__ struct proc *p = td->td_proc; diff --git a/freebsd/sys/kern/uipc_mbuf2.c b/freebsd/sys/kern/uipc_mbuf2.c index 6f98b0a2..7a0b9cf0 100644 --- a/freebsd/sys/kern/uipc_mbuf2.c +++ b/freebsd/sys/kern/uipc_mbuf2.c @@ -103,8 +103,8 @@ m_pulldown(struct mbuf *m, int off, int len, int *offp) int writable; /* check invalid arguments. 
*/ - if (m == NULL) - panic("m == NULL in m_pulldown()"); + KASSERT(m != NULL, ("%s: fix caller: m is NULL off %d len %d offp %p\n", + __func__, off, len, offp)); if (len > MCLBYTES) { m_freem(m); return NULL; /* impossible */ diff --git a/freebsd/sys/kern/uipc_usrreq.c b/freebsd/sys/kern/uipc_usrreq.c index fc4ee85d..13fca66d 100644 --- a/freebsd/sys/kern/uipc_usrreq.c +++ b/freebsd/sys/kern/uipc_usrreq.c @@ -2504,7 +2504,8 @@ unp_internalize(struct mbuf **controlp, struct thread *td) goto out; } - controlp = &(*controlp)->m_next; + if (*controlp != NULL) + controlp = &(*controlp)->m_next; if (CMSG_SPACE(datalen) < clen) { clen -= CMSG_SPACE(datalen); cm = (struct cmsghdr *) diff --git a/freebsd/sys/net/dlt.h b/freebsd/sys/net/dlt.h index 639e5a7f..31ad4e01 100644 --- a/freebsd/sys/net/dlt.h +++ b/freebsd/sys/net/dlt.h @@ -769,8 +769,17 @@ * IPMB packet for IPMI, beginning with the I2C slave address, followed * by the netFn and LUN, etc.. Requested by Chanthy Toeung * <chanthy.toeung@ca.kontron.com>. + * + * XXX - this used to be called DLT_IPMB, back when we got the + * impression from the email thread requesting it that the packet + * had no extra 2-byte header. We've renamed it; if anybody used + * DLT_IPMB and assumed no 2-byte header, this will cause the compile + * to fail, at which point we'll have to figure out what to do about + * the two header types using the same DLT_/LINKTYPE_ value. If that + * doesn't happen, we'll assume nobody used it and that the redefinition + * is safe. */ -#define DLT_IPMB 199 +#define DLT_IPMB_KONTRON 199 /* * Juniper-private data link type, as per request from @@ -1365,6 +1374,11 @@ #define DLT_DISPLAYPORT_AUX 275 /* + * Linux cooked sockets v2. 
+ */ +#define DLT_LINUX_SLL2 276 + +/* * In case the code that includes this file (directly or indirectly) * has also included OS files that happen to define DLT_MATCHING_MAX, * with a different value (perhaps because that OS hasn't picked up @@ -1374,7 +1388,7 @@ #ifdef DLT_MATCHING_MAX #undef DLT_MATCHING_MAX #endif -#define DLT_MATCHING_MAX 275 /* highest value in the "matching" range */ +#define DLT_MATCHING_MAX 276 /* highest value in the "matching" range */ /* * DLT and savefile link type values are split into a class and diff --git a/freebsd/sys/net/if.c b/freebsd/sys/net/if.c index d57e6983..37e1581b 100644 --- a/freebsd/sys/net/if.c +++ b/freebsd/sys/net/if.c @@ -34,6 +34,7 @@ * $FreeBSD$ */ +#include <rtems/bsd/local/opt_bpf.h> #include <rtems/bsd/local/opt_inet6.h> #include <rtems/bsd/local/opt_inet.h> @@ -1253,16 +1254,20 @@ static void if_vmove(struct ifnet *ifp, struct vnet *new_vnet) { struct if_clone *ifc; +#ifdef DEV_BPF u_int bif_dlt, bif_hdrlen; +#endif void *old; int rc; +#ifdef DEV_BPF /* * if_detach_internal() will call the eventhandler to notify * interface departure. That will detach if_bpf. We need to * safe the dlt and hdrlen so we can re-attach it later. 
*/ bpf_get_bp_params(ifp->if_bpf, &bif_dlt, &bif_hdrlen); +#endif /* * Detach from current vnet, but preserve LLADDR info, do not @@ -1309,8 +1314,10 @@ if_vmove(struct ifnet *ifp, struct vnet *new_vnet) if_attach_internal(ifp, 1, ifc); +#ifdef DEV_BPF if (ifp->if_bpf == NULL) bpfattach(ifp, bif_dlt, bif_hdrlen); +#endif CURVNET_RESTORE(); } @@ -1447,14 +1454,12 @@ if_addgroup(struct ifnet *ifp, const char *groupname) return (EEXIST); } - if ((ifgl = (struct ifg_list *)malloc(sizeof(struct ifg_list), M_TEMP, - M_NOWAIT)) == NULL) { + if ((ifgl = malloc(sizeof(*ifgl), M_TEMP, M_NOWAIT)) == NULL) { IFNET_WUNLOCK(); return (ENOMEM); } - if ((ifgm = (struct ifg_member *)malloc(sizeof(struct ifg_member), - M_TEMP, M_NOWAIT)) == NULL) { + if ((ifgm = malloc(sizeof(*ifgm), M_TEMP, M_NOWAIT)) == NULL) { free(ifgl, M_TEMP); IFNET_WUNLOCK(); return (ENOMEM); @@ -1465,8 +1470,7 @@ if_addgroup(struct ifnet *ifp, const char *groupname) break; if (ifg == NULL) { - if ((ifg = (struct ifg_group *)malloc(sizeof(struct ifg_group), - M_TEMP, M_NOWAIT)) == NULL) { + if ((ifg = malloc(sizeof(*ifg), M_TEMP, M_NOWAIT)) == NULL) { free(ifgl, M_TEMP); free(ifgm, M_TEMP); IFNET_WUNLOCK(); @@ -1498,39 +1502,36 @@ if_addgroup(struct ifnet *ifp, const char *groupname) } /* - * Remove a group from an interface + * Helper function to remove a group out of an interface. Expects the global + * ifnet lock to be write-locked, and drops it before returning. 
*/ -int -if_delgroup(struct ifnet *ifp, const char *groupname) +static void +_if_delgroup_locked(struct ifnet *ifp, struct ifg_list *ifgl, + const char *groupname) { - struct ifg_list *ifgl; - struct ifg_member *ifgm; - int freeifgl; + struct ifg_member *ifgm; + bool freeifgl; - IFNET_WLOCK(); - CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) - if (!strcmp(ifgl->ifgl_group->ifg_group, groupname)) - break; - if (ifgl == NULL) { - IFNET_WUNLOCK(); - return (ENOENT); - } + IFNET_WLOCK_ASSERT(); - freeifgl = 0; IF_ADDR_WLOCK(ifp); CK_STAILQ_REMOVE(&ifp->if_groups, ifgl, ifg_list, ifgl_next); IF_ADDR_WUNLOCK(ifp); - CK_STAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next) - if (ifgm->ifgm_ifp == ifp) + CK_STAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next) { + if (ifgm->ifgm_ifp == ifp) { + CK_STAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, + ifg_member, ifgm_next); break; - - if (ifgm != NULL) - CK_STAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifg_member, ifgm_next); + } + } if (--ifgl->ifgl_group->ifg_refcnt == 0) { - CK_STAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_group, ifg_next); - freeifgl = 1; + CK_STAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_group, + ifg_next); + freeifgl = true; + } else { + freeifgl = false; } IFNET_WUNLOCK(); @@ -1543,6 +1544,26 @@ if_delgroup(struct ifnet *ifp, const char *groupname) free(ifgl, M_TEMP); EVENTHANDLER_INVOKE(group_change_event, groupname); +} + +/* + * Remove a group from an interface + */ +int +if_delgroup(struct ifnet *ifp, const char *groupname) +{ + struct ifg_list *ifgl; + + IFNET_WLOCK(); + CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) + if (strcmp(ifgl->ifgl_group->ifg_group, groupname) == 0) + break; + if (ifgl == NULL) { + IFNET_WUNLOCK(); + return (ENOENT); + } + + _if_delgroup_locked(ifp, ifgl, groupname); return (0); } @@ -1553,44 +1574,13 @@ if_delgroup(struct ifnet *ifp, const char *groupname) static void if_delgroups(struct ifnet *ifp) { - struct ifg_list *ifgl; 
- struct ifg_member *ifgm; + struct ifg_list *ifgl; char groupname[IFNAMSIZ]; - int ifglfree; IFNET_WLOCK(); - while (!CK_STAILQ_EMPTY(&ifp->if_groups)) { - ifgl = CK_STAILQ_FIRST(&ifp->if_groups); - + while ((ifgl = CK_STAILQ_FIRST(&ifp->if_groups)) != NULL) { strlcpy(groupname, ifgl->ifgl_group->ifg_group, IFNAMSIZ); - - IF_ADDR_WLOCK(ifp); - CK_STAILQ_REMOVE(&ifp->if_groups, ifgl, ifg_list, ifgl_next); - IF_ADDR_WUNLOCK(ifp); - - CK_STAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next) - if (ifgm->ifgm_ifp == ifp) - break; - - if (ifgm != NULL) - CK_STAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifg_member, - ifgm_next); - ifglfree = 0; - if (--ifgl->ifgl_group->ifg_refcnt == 0) { - CK_STAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_group, ifg_next); - ifglfree = 1; - } - - IFNET_WUNLOCK(); - epoch_wait_preempt(net_epoch_preempt); - free(ifgm, M_TEMP); - if (ifglfree) { - EVENTHANDLER_INVOKE(group_detach_event, - ifgl->ifgl_group); - free(ifgl->ifgl_group, M_TEMP); - } - EVENTHANDLER_INVOKE(group_change_event, groupname); - + _if_delgroup_locked(ifp, ifgl, groupname); IFNET_WLOCK(); } IFNET_WUNLOCK(); @@ -1678,7 +1668,7 @@ if_getgroupmembers(struct ifgroupreq *ifgr) IFNET_RLOCK(); CK_STAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) - if (!strcmp(ifg->ifg_group, ifgr->ifgr_name)) + if (strcmp(ifg->ifg_group, ifgr->ifgr_name) == 0) break; if (ifg == NULL) { IFNET_RUNLOCK(); @@ -1957,10 +1947,13 @@ ifa_maintain_loopback_route(int cmd, const char *otype, struct ifaddr *ifa, error = rtrequest1_fib(cmd, &info, NULL, ifp->if_fib); - if (error != 0 && - !(cmd == RTM_ADD && error == EEXIST) && - !(cmd == RTM_DELETE && error == ENOENT)) - if_printf(ifp, "%s failed: %d\n", otype, error); + if (error == 0 || + (cmd == RTM_ADD && error == EEXIST) || + (cmd == RTM_DELETE && (error == ENOENT || error == ESRCH))) + return (error); + + log(LOG_DEBUG, "%s: %s failed for interface %s: %u\n", + __func__, otype, if_name(ifp), error); return (error); } @@ -2951,6 +2944,7 @@ 
ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) case SIOCGIFGENERIC: case SIOCGIFRSSKEY: case SIOCGIFRSSHASH: + case SIOCGIFDOWNREASON: if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); diff --git a/freebsd/sys/net/if_bridge.c b/freebsd/sys/net/if_bridge.c index 4bfb67a8..2544c4f5 100644 --- a/freebsd/sys/net/if_bridge.c +++ b/freebsd/sys/net/if_bridge.c @@ -137,6 +137,14 @@ __FBSDID("$FreeBSD$"); #include <net/route.h> +#ifdef INET6 +/* + * XXX: declare here to avoid to include many inet6 related files.. + * should be more generalized? + */ +extern void nd6_setmtu(struct ifnet *); +#endif + /* * Size of the route hash table. Must be a power of two. */ @@ -774,7 +782,7 @@ bridge_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) } args; struct ifdrv *ifd = (struct ifdrv *) data; const struct bridge_control *bc; - int error = 0; + int error = 0, oldmtu; switch (cmd) { @@ -820,12 +828,24 @@ bridge_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) break; } + oldmtu = ifp->if_mtu; BRIDGE_LOCK(sc); error = (*bc->bc_func)(sc, &args); BRIDGE_UNLOCK(sc); if (error) break; + /* + * Bridge MTU may change during addition of the first port. + * If it did, do network layer specific procedure. 
+ */ + if (ifp->if_mtu != oldmtu) { +#ifdef INET6 + nd6_setmtu(ifp); +#endif + rt_updatemtu(ifp); + } + if (bc->bc_flags & BC_F_COPYOUT) error = copyout(&args, ifd->ifd_data, ifd->ifd_len); diff --git a/freebsd/sys/net/if_clone.c b/freebsd/sys/net/if_clone.c index 1fa79766..ac59b635 100644 --- a/freebsd/sys/net/if_clone.c +++ b/freebsd/sys/net/if_clone.c @@ -213,6 +213,18 @@ if_clone_create(char *name, size_t len, caddr_t params) return (if_clone_createif(ifc, name, len, params)); } +void +if_clone_addif(struct if_clone *ifc, struct ifnet *ifp) +{ + + if ((ifc->ifc_flags & IFC_NOGROUP) == 0) + if_addgroup(ifp, ifc->ifc_name); + + IF_CLONE_LOCK(ifc); + IFC_IFLIST_INSERT(ifc, ifp); + IF_CLONE_UNLOCK(ifc); +} + /* * Create a clone network interface. */ @@ -235,12 +247,7 @@ if_clone_createif(struct if_clone *ifc, char *name, size_t len, caddr_t params) if (ifp == NULL) panic("%s: lookup failed for %s", __func__, name); - if ((ifc->ifc_flags & IFC_NOGROUP) == 0) - if_addgroup(ifp, ifc->ifc_name); - - IF_CLONE_LOCK(ifc); - IFC_IFLIST_INSERT(ifc, ifp); - IF_CLONE_UNLOCK(ifc); + if_clone_addif(ifc, ifp); } return (err); diff --git a/freebsd/sys/net/if_clone.h b/freebsd/sys/net/if_clone.h index 5dceacf6..b721e294 100644 --- a/freebsd/sys/net/if_clone.h +++ b/freebsd/sys/net/if_clone.h @@ -79,7 +79,8 @@ int if_clone_list(struct if_clonereq *); struct if_clone *if_clone_findifc(struct ifnet *); void if_clone_addgroup(struct ifnet *, struct if_clone *); -/* The below interface used only by epair(4). */ +/* The below interfaces are used only by epair(4). 
*/ +void if_clone_addif(struct if_clone *, struct ifnet *); int if_clone_destroyif(struct if_clone *, struct ifnet *); #endif /* _KERNEL */ diff --git a/freebsd/sys/net/if_epair.c b/freebsd/sys/net/if_epair.c index 69ff3efc..f4a875b7 100644 --- a/freebsd/sys/net/if_epair.c +++ b/freebsd/sys/net/if_epair.c @@ -713,6 +713,21 @@ epair_clone_match(struct if_clone *ifc, const char *name) return (1); } +static void +epair_clone_add(struct if_clone *ifc, struct epair_softc *scb) +{ + struct ifnet *ifp; + uint8_t eaddr[ETHER_ADDR_LEN]; /* 00:00:00:00:00:00 */ + + ifp = scb->ifp; + /* Copy epairNa etheraddr and change the last byte. */ + memcpy(eaddr, scb->oifp->if_hw_addr, ETHER_ADDR_LEN); + eaddr[5] = 0x0b; + ether_ifattach(ifp, eaddr); + + if_clone_addif(ifc, ifp); +} + static int epair_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) { @@ -725,24 +740,6 @@ epair_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) uint32_t hash; uint8_t eaddr[ETHER_ADDR_LEN]; /* 00:00:00:00:00:00 */ - /* - * We are abusing params to create our second interface. - * Actually we already created it and called if_clone_create() - * for it to do the official insertion procedure the moment we knew - * it cannot fail anymore. So just do attach it here. - */ - if (params) { - scb = (struct epair_softc *)params; - ifp = scb->ifp; - /* Copy epairNa etheraddr and change the last byte. */ - memcpy(eaddr, scb->oifp->if_hw_addr, ETHER_ADDR_LEN); - eaddr[5] = 0x0b; - ether_ifattach(ifp, eaddr); - /* Correctly set the name for the cloner list. */ - strlcpy(name, ifp->if_xname, len); - return (0); - } - /* Try to see if a special unit was requested. */ error = ifc_name2unit(name, &unit); if (error != 0) @@ -893,10 +890,11 @@ epair_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) if_setsendqready(ifp); /* We need to play some tricks here for the second interface. 
*/ strlcpy(name, epairname, len); - error = if_clone_create(name, len, (caddr_t)scb); - if (error) - panic("%s: if_clone_create() for our 2nd iface failed: %d", - __func__, error); + + /* Correctly set the name for the cloner list. */ + strlcpy(name, scb->ifp->if_xname, len); + epair_clone_add(ifc, scb); + scb->if_qflush = ifp->if_qflush; ifp->if_qflush = epair_qflush; ifp->if_transmit = epair_transmit; diff --git a/freebsd/sys/net/if_lagg.c b/freebsd/sys/net/if_lagg.c index af6f1667..2d133ec4 100644 --- a/freebsd/sys/net/if_lagg.c +++ b/freebsd/sys/net/if_lagg.c @@ -54,6 +54,7 @@ __FBSDID("$FreeBSD$"); #include <net/if_types.h> #include <net/if_var.h> #include <net/bpf.h> +#include <net/route.h> #include <net/vnet.h> #if defined(INET) || defined(INET6) @@ -75,6 +76,14 @@ __FBSDID("$FreeBSD$"); #include <net/if_lagg.h> #include <net/ieee8023ad_lacp.h> +#ifdef INET6 +/* + * XXX: declare here to avoid to include many inet6 related files.. + * should be more generalized? + */ +extern void nd6_setmtu(struct ifnet *); +#endif + #define LAGG_RLOCK() struct epoch_tracker lagg_et; epoch_enter_preempt(net_epoch_preempt, &lagg_et) #define LAGG_RUNLOCK() epoch_exit_preempt(net_epoch_preempt, &lagg_et) #define LAGG_RLOCK_ASSERT() MPASS(in_epoch(net_epoch_preempt)) @@ -1154,7 +1163,7 @@ lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) struct ifnet *tpif; struct thread *td = curthread; char *buf, *outbuf; - int count, buflen, len, error = 0; + int count, buflen, len, error = 0, oldmtu; bzero(&rpbuf, sizeof(rpbuf)); @@ -1221,23 +1230,35 @@ lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) ro->ro_active += LAGG_PORTACTIVE(lp); } - ro->ro_bkt = sc->sc_bkt; + ro->ro_bkt = sc->sc_stride; ro->ro_flapping = sc->sc_flapping; ro->ro_flowid_shift = sc->flowid_shift; LAGG_XUNLOCK(sc); break; case SIOCSLAGGOPTS: - if (sc->sc_proto == LAGG_PROTO_ROUNDROBIN) { - if (ro->ro_bkt == 0) - sc->sc_bkt = 1; // Minimum 1 packet per iface. 
- else - sc->sc_bkt = ro->ro_bkt; - } error = priv_check(td, PRIV_NET_LAGG); if (error) break; - if (ro->ro_opts == 0) + + /* + * The stride option was added without defining a corresponding + * LAGG_OPT flag, so handle a non-zero value before checking + * anything else to preserve compatibility. + */ + LAGG_XLOCK(sc); + if (ro->ro_opts == 0 && ro->ro_bkt != 0) { + if (sc->sc_proto != LAGG_PROTO_ROUNDROBIN) { + LAGG_XUNLOCK(sc); + error = EINVAL; + break; + } + sc->sc_stride = ro->ro_bkt; + } + if (ro->ro_opts == 0) { + LAGG_XUNLOCK(sc); break; + } + /* * Set options. LACP options are stored in sc->sc_psc, * not in sc_opts. @@ -1248,6 +1269,7 @@ lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) case LAGG_OPT_USE_FLOWID: case -LAGG_OPT_USE_FLOWID: case LAGG_OPT_FLOWIDSHIFT: + case LAGG_OPT_RR_LIMIT: valid = 1; lacp = 0; break; @@ -1266,8 +1288,6 @@ lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) break; } - LAGG_XLOCK(sc); - if (valid == 0 || (lacp == 1 && sc->sc_proto != LAGG_PROTO_LACP)) { /* Invalid combination of options specified. */ @@ -1275,14 +1295,23 @@ lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) LAGG_XUNLOCK(sc); break; /* Return from SIOCSLAGGOPTS. */ } + /* * Store new options into sc->sc_opts except for - * FLOWIDSHIFT and LACP options. + * FLOWIDSHIFT, RR and LACP options. 
*/ if (lacp == 0) { if (ro->ro_opts == LAGG_OPT_FLOWIDSHIFT) sc->flowid_shift = ro->ro_flowid_shift; - else if (ro->ro_opts > 0) + else if (ro->ro_opts == LAGG_OPT_RR_LIMIT) { + if (sc->sc_proto != LAGG_PROTO_ROUNDROBIN || + ro->ro_bkt == 0) { + error = EINVAL; + LAGG_XUNLOCK(sc); + break; + } + sc->sc_stride = ro->ro_bkt; + } else if (ro->ro_opts > 0) sc->sc_opts |= ro->ro_opts; else sc->sc_opts &= ~ro->ro_opts; @@ -1407,10 +1436,23 @@ lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) tpif->if_xname); } #endif + oldmtu = ifp->if_mtu; LAGG_XLOCK(sc); error = lagg_port_create(sc, tpif); LAGG_XUNLOCK(sc); if_rele(tpif); + + /* + * LAGG MTU may change during addition of the first port. + * If it did, do network layer specific procedure. + */ + if (ifp->if_mtu != oldmtu) { +#ifdef INET6 + nd6_setmtu(ifp); +#endif + rt_updatemtu(ifp); + } + VLAN_CAPABILITIES(ifp); break; case SIOCSLAGGDELPORT: @@ -1904,7 +1946,7 @@ static void lagg_rr_attach(struct lagg_softc *sc) { sc->sc_seq = 0; - sc->sc_bkt_count = sc->sc_bkt; + sc->sc_stride = 1; } static int @@ -1913,18 +1955,8 @@ lagg_rr_start(struct lagg_softc *sc, struct mbuf *m) struct lagg_port *lp; uint32_t p; - if (sc->sc_bkt_count == 0 && sc->sc_bkt > 0) - sc->sc_bkt_count = sc->sc_bkt; - - if (sc->sc_bkt > 0) { - atomic_subtract_int(&sc->sc_bkt_count, 1); - if (atomic_cmpset_int(&sc->sc_bkt_count, 0, sc->sc_bkt)) - p = atomic_fetchadd_32(&sc->sc_seq, 1); - else - p = sc->sc_seq; - } else - p = atomic_fetchadd_32(&sc->sc_seq, 1); - + p = atomic_fetchadd_32(&sc->sc_seq, 1); + p /= sc->sc_stride; p %= sc->sc_count; lp = CK_SLIST_FIRST(&sc->sc_ports); diff --git a/freebsd/sys/net/if_lagg.h b/freebsd/sys/net/if_lagg.h index f1e2d8f4..c4256a45 100644 --- a/freebsd/sys/net/if_lagg.h +++ b/freebsd/sys/net/if_lagg.h @@ -63,11 +63,11 @@ struct lagg_protos { #define LAGG_PROTO_DEFAULT LAGG_PROTO_FAILOVER #define LAGG_PROTOS { \ - { "failover", LAGG_PROTO_FAILOVER }, \ + { "failover", LAGG_PROTO_FAILOVER }, \ { "lacp", 
LAGG_PROTO_LACP }, \ { "loadbalance", LAGG_PROTO_LOADBALANCE }, \ - { "roundrobin", LAGG_PROTO_ROUNDROBIN }, \ - { "broadcast", LAGG_PROTO_BROADCAST }, \ + { "roundrobin", LAGG_PROTO_ROUNDROBIN }, \ + { "broadcast", LAGG_PROTO_BROADCAST }, \ { "none", LAGG_PROTO_NONE }, \ { "default", LAGG_PROTO_DEFAULT } \ } @@ -148,11 +148,12 @@ struct lagg_reqopts { #define LAGG_OPT_LACP_TXTEST 0x20 /* LACP debug: txtest */ #define LAGG_OPT_LACP_RXTEST 0x40 /* LACP debug: rxtest */ #define LAGG_OPT_LACP_TIMEOUT 0x80 /* LACP timeout */ +#define LAGG_OPT_RR_LIMIT 0x100 /* RR stride */ u_int ro_count; /* number of ports */ u_int ro_active; /* active port count */ u_int ro_flapping; /* number of flapping */ int ro_flowid_shift; /* shift the flowid */ - uint32_t ro_bkt; /* packet bucket for roundrobin */ + uint32_t ro_bkt; /* stride for RR */ }; #define SIOCGLAGGOPTS _IOWR('i', 152, struct lagg_reqopts) @@ -214,6 +215,7 @@ struct lagg_softc { struct ifmedia sc_media; /* media config */ void *sc_psc; /* protocol data */ uint32_t sc_seq; /* sequence counter */ + uint32_t sc_stride; /* stride for RR */ uint32_t sc_flags; int sc_destroying; /* destroying lagg */ @@ -225,8 +227,6 @@ struct lagg_softc { struct callout sc_callout; u_int sc_opts; int flowid_shift; /* shift the flowid */ - uint32_t sc_bkt; /* packates bucket for roundrobin */ - uint32_t sc_bkt_count; /* packates bucket count for roundrobin */ struct lagg_counters detached_counters; /* detached ports sum */ }; diff --git a/freebsd/sys/net/if_llatbl.c b/freebsd/sys/net/if_llatbl.c index b220d7aa..7b5c3a91 100644 --- a/freebsd/sys/net/if_llatbl.c +++ b/freebsd/sys/net/if_llatbl.c @@ -81,11 +81,6 @@ RW_SYSINIT(lltable_list_lock, &lltable_list_lock, "lltable_list_lock"); static void lltable_unlink(struct lltable *llt); static void llentries_unlink(struct lltable *llt, struct llentries *head); -static void htable_unlink_entry(struct llentry *lle); -static void htable_link_entry(struct lltable *llt, struct llentry *lle); -static int 
htable_foreach_lle(struct lltable *llt, llt_foreach_cb_t *f, - void *farg); - /* * Dump lle state for a specific address family. */ @@ -182,15 +177,16 @@ static void htable_unlink_entry(struct llentry *lle) { - if ((lle->la_flags & LLE_LINKED) != 0) { - IF_AFDATA_WLOCK_ASSERT(lle->lle_tbl->llt_ifp); - CK_LIST_REMOVE(lle, lle_next); - lle->la_flags &= ~(LLE_VALID | LLE_LINKED); + if ((lle->la_flags & LLE_LINKED) == 0) + return; + + IF_AFDATA_WLOCK_ASSERT(lle->lle_tbl->llt_ifp); + CK_LIST_REMOVE(lle, lle_next); + lle->la_flags &= ~(LLE_VALID | LLE_LINKED); #if 0 - lle->lle_tbl = NULL; - lle->lle_head = NULL; + lle->lle_tbl = NULL; + lle->lle_head = NULL; #endif - } } struct prefix_match_data { diff --git a/freebsd/sys/net/if_tap.c b/freebsd/sys/net/if_tap.c deleted file mode 100644 index 4ca35b66..00000000 --- a/freebsd/sys/net/if_tap.c +++ /dev/null @@ -1,1153 +0,0 @@ -#include <machine/rtems-bsd-kernel-space.h> - -/*- - * SPDX-License-Identifier: BSD-2-Clause-FreeBSD - * - * Copyright (C) 1999-2000 by Maksim Yevmenkin <m_evmenkin@yahoo.com> - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * BASED ON: - * ------------------------------------------------------------------------- - * - * Copyright (c) 1988, Julian Onions <jpo@cs.nott.ac.uk> - * Nottingham University 1987. - */ - -/* - * $FreeBSD$ - * $Id: if_tap.c,v 0.21 2000/07/23 21:46:02 max Exp $ - */ - -#include <rtems/bsd/local/opt_inet.h> - -#include <sys/param.h> -#include <sys/conf.h> -#include <sys/lock.h> -#include <sys/fcntl.h> -#include <sys/filio.h> -#include <sys/jail.h> -#include <sys/kernel.h> -#include <sys/malloc.h> -#include <sys/mbuf.h> -#include <sys/module.h> -#include <sys/poll.h> -#include <sys/priv.h> -#include <sys/proc.h> -#include <sys/selinfo.h> -#include <sys/signalvar.h> -#include <sys/socket.h> -#include <sys/sockio.h> -#include <sys/sx.h> -#include <sys/sysctl.h> -#include <sys/systm.h> -#include <sys/ttycom.h> -#include <sys/uio.h> -#include <sys/queue.h> - -#include <net/bpf.h> -#include <net/ethernet.h> -#include <net/if.h> -#include <net/if_var.h> -#include <net/if_clone.h> -#include <net/if_dl.h> -#include <net/if_media.h> -#include <net/if_types.h> -#include <net/route.h> -#include <net/vnet.h> - -#include <netinet/in.h> - -#include <net/if_tapvar.h> -#include <net/if_tap.h> - -#define CDEV_NAME "tap" -#define TAPDEBUG if (tapdebug) printf - -static const char tapname[] = "tap"; -static const char vmnetname[] = "vmnet"; -#define TAPMAXUNIT 0x7fff -#define VMNET_DEV_MASK CLONE_FLAG0 - -/* module */ -static int tapmodevent(module_t, 
int, void *); - -/* device */ -static void tapclone(void *, struct ucred *, char *, int, - struct cdev **); -static void tapcreate(struct cdev *); - -/* network interface */ -static void tapifstart(struct ifnet *); -static int tapifioctl(struct ifnet *, u_long, caddr_t); -static void tapifinit(void *); - -static int tap_clone_create(struct if_clone *, int, caddr_t); -static void tap_clone_destroy(struct ifnet *); -static struct if_clone *tap_cloner; -static int vmnet_clone_create(struct if_clone *, int, caddr_t); -static void vmnet_clone_destroy(struct ifnet *); -static struct if_clone *vmnet_cloner; - -/* character device */ -static d_open_t tapopen; -static d_close_t tapclose; -static d_read_t tapread; -static d_write_t tapwrite; -static d_ioctl_t tapioctl; -static d_poll_t tappoll; -static d_kqfilter_t tapkqfilter; - -/* kqueue(2) */ -static int tapkqread(struct knote *, long); -static int tapkqwrite(struct knote *, long); -static void tapkqdetach(struct knote *); - -static struct filterops tap_read_filterops = { - .f_isfd = 1, - .f_attach = NULL, - .f_detach = tapkqdetach, - .f_event = tapkqread, -}; - -static struct filterops tap_write_filterops = { - .f_isfd = 1, - .f_attach = NULL, - .f_detach = tapkqdetach, - .f_event = tapkqwrite, -}; - -static struct cdevsw tap_cdevsw = { - .d_version = D_VERSION, - .d_flags = D_NEEDMINOR, - .d_open = tapopen, - .d_close = tapclose, - .d_read = tapread, - .d_write = tapwrite, - .d_ioctl = tapioctl, - .d_poll = tappoll, - .d_name = CDEV_NAME, - .d_kqfilter = tapkqfilter, -}; - -/* - * All global variables in if_tap.c are locked with tapmtx, with the - * exception of tapdebug, which is accessed unlocked; tapclones is - * static at runtime. 
- */ -static struct mtx tapmtx; -static int tapdebug = 0; /* debug flag */ -static int tapuopen = 0; /* allow user open() */ -static int tapuponopen = 0; /* IFF_UP on open() */ -static int tapdclone = 1; /* enable devfs cloning */ -static SLIST_HEAD(, tap_softc) taphead; /* first device */ -static struct clonedevs *tapclones; - -MALLOC_DECLARE(M_TAP); -MALLOC_DEFINE(M_TAP, CDEV_NAME, "Ethernet tunnel interface"); -SYSCTL_INT(_debug, OID_AUTO, if_tap_debug, CTLFLAG_RW, &tapdebug, 0, ""); - -static struct sx tap_ioctl_sx; -SX_SYSINIT(tap_ioctl_sx, &tap_ioctl_sx, "tap_ioctl"); - -SYSCTL_DECL(_net_link); -static SYSCTL_NODE(_net_link, OID_AUTO, tap, CTLFLAG_RW, 0, - "Ethernet tunnel software network interface"); -SYSCTL_INT(_net_link_tap, OID_AUTO, user_open, CTLFLAG_RW, &tapuopen, 0, - "Allow user to open /dev/tap (based on node permissions)"); -SYSCTL_INT(_net_link_tap, OID_AUTO, up_on_open, CTLFLAG_RW, &tapuponopen, 0, - "Bring interface up when /dev/tap is opened"); -SYSCTL_INT(_net_link_tap, OID_AUTO, devfs_cloning, CTLFLAG_RWTUN, &tapdclone, 0, - "Enable legacy devfs interface creation"); -SYSCTL_INT(_net_link_tap, OID_AUTO, debug, CTLFLAG_RW, &tapdebug, 0, ""); - -DEV_MODULE(if_tap, tapmodevent, NULL); -MODULE_VERSION(if_tap, 1); - -static int -tap_clone_create(struct if_clone *ifc, int unit, caddr_t params) -{ - struct cdev *dev; - int i; - - /* Find any existing device, or allocate new unit number. */ - i = clone_create(&tapclones, &tap_cdevsw, &unit, &dev, 0); - if (i) { - dev = make_dev(&tap_cdevsw, unit, UID_ROOT, GID_WHEEL, 0600, - "%s%d", tapname, unit); - } - - tapcreate(dev); - return (0); -} - -/* vmnet devices are tap devices in disguise */ -static int -vmnet_clone_create(struct if_clone *ifc, int unit, caddr_t params) -{ - struct cdev *dev; - int i; - - /* Find any existing device, or allocate new unit number. 
*/ - i = clone_create(&tapclones, &tap_cdevsw, &unit, &dev, VMNET_DEV_MASK); - if (i) { - dev = make_dev(&tap_cdevsw, unit | VMNET_DEV_MASK, UID_ROOT, - GID_WHEEL, 0600, "%s%d", vmnetname, unit); - } - - tapcreate(dev); - return (0); -} - -static void -tap_destroy(struct tap_softc *tp) -{ - struct ifnet *ifp = tp->tap_ifp; - - CURVNET_SET(ifp->if_vnet); - - destroy_dev(tp->tap_dev); - seldrain(&tp->tap_rsel); - knlist_clear(&tp->tap_rsel.si_note, 0); - knlist_destroy(&tp->tap_rsel.si_note); - ether_ifdetach(ifp); - - sx_xlock(&tap_ioctl_sx); - ifp->if_softc = NULL; - sx_xunlock(&tap_ioctl_sx); - - if_free(ifp); - - mtx_destroy(&tp->tap_mtx); - free(tp, M_TAP); - CURVNET_RESTORE(); -} - -static void -tap_clone_destroy(struct ifnet *ifp) -{ - struct tap_softc *tp = ifp->if_softc; - - mtx_lock(&tapmtx); - SLIST_REMOVE(&taphead, tp, tap_softc, tap_next); - mtx_unlock(&tapmtx); - tap_destroy(tp); -} - -/* vmnet devices are tap devices in disguise */ -static void -vmnet_clone_destroy(struct ifnet *ifp) -{ - tap_clone_destroy(ifp); -} - -/* - * tapmodevent - * - * module event handler - */ -static int -tapmodevent(module_t mod, int type, void *data) -{ - static eventhandler_tag eh_tag = NULL; - struct tap_softc *tp = NULL; - struct ifnet *ifp = NULL; - - switch (type) { - case MOD_LOAD: - - /* intitialize device */ - - mtx_init(&tapmtx, "tapmtx", NULL, MTX_DEF); - SLIST_INIT(&taphead); - - clone_setup(&tapclones); - eh_tag = EVENTHANDLER_REGISTER(dev_clone, tapclone, 0, 1000); - if (eh_tag == NULL) { - clone_cleanup(&tapclones); - mtx_destroy(&tapmtx); - return (ENOMEM); - } - tap_cloner = if_clone_simple(tapname, tap_clone_create, - tap_clone_destroy, 0); - vmnet_cloner = if_clone_simple(vmnetname, vmnet_clone_create, - vmnet_clone_destroy, 0); - return (0); - - case MOD_UNLOAD: - /* - * The EBUSY algorithm here can't quite atomically - * guarantee that this is race-free since we have to - * release the tap mtx to deregister the clone handler. 
- */ - mtx_lock(&tapmtx); - SLIST_FOREACH(tp, &taphead, tap_next) { - mtx_lock(&tp->tap_mtx); - if (tp->tap_flags & TAP_OPEN) { - mtx_unlock(&tp->tap_mtx); - mtx_unlock(&tapmtx); - return (EBUSY); - } - mtx_unlock(&tp->tap_mtx); - } - mtx_unlock(&tapmtx); - - EVENTHANDLER_DEREGISTER(dev_clone, eh_tag); - if_clone_detach(tap_cloner); - if_clone_detach(vmnet_cloner); - drain_dev_clone_events(); - - mtx_lock(&tapmtx); - while ((tp = SLIST_FIRST(&taphead)) != NULL) { - SLIST_REMOVE_HEAD(&taphead, tap_next); - mtx_unlock(&tapmtx); - - ifp = tp->tap_ifp; - - TAPDEBUG("detaching %s\n", ifp->if_xname); - - tap_destroy(tp); - mtx_lock(&tapmtx); - } - mtx_unlock(&tapmtx); - clone_cleanup(&tapclones); - - mtx_destroy(&tapmtx); - - break; - - default: - return (EOPNOTSUPP); - } - - return (0); -} /* tapmodevent */ - - -/* - * DEVFS handler - * - * We need to support two kind of devices - tap and vmnet - */ -static void -tapclone(void *arg, struct ucred *cred, char *name, int namelen, struct cdev **dev) -{ - char devname[SPECNAMELEN + 1]; - int i, unit, append_unit; - int extra; - - if (*dev != NULL) - return; - - if (!tapdclone || - (!tapuopen && priv_check_cred(cred, PRIV_NET_IFCREATE, 0) != 0)) - return; - - unit = 0; - append_unit = 0; - extra = 0; - - /* We're interested in only tap/vmnet devices. */ - if (strcmp(name, tapname) == 0) { - unit = -1; - } else if (strcmp(name, vmnetname) == 0) { - unit = -1; - extra = VMNET_DEV_MASK; - } else if (dev_stdclone(name, NULL, tapname, &unit) != 1) { - if (dev_stdclone(name, NULL, vmnetname, &unit) != 1) { - return; - } else { - extra = VMNET_DEV_MASK; - } - } - - if (unit == -1) - append_unit = 1; - - CURVNET_SET(CRED_TO_VNET(cred)); - /* find any existing device, or allocate new unit number */ - i = clone_create(&tapclones, &tap_cdevsw, &unit, dev, extra); - if (i) { - if (append_unit) { - /* - * We were passed 'tun' or 'tap', with no unit specified - * so we'll need to append it now. 
- */ - namelen = snprintf(devname, sizeof(devname), "%s%d", name, - unit); - name = devname; - } - - *dev = make_dev_credf(MAKEDEV_REF, &tap_cdevsw, unit | extra, - cred, UID_ROOT, GID_WHEEL, 0600, "%s", name); - } - - if_clone_create(name, namelen, NULL); - CURVNET_RESTORE(); -} /* tapclone */ - - -/* - * tapcreate - * - * to create interface - */ -static void -tapcreate(struct cdev *dev) -{ - struct ifnet *ifp = NULL; - struct tap_softc *tp = NULL; - unsigned short macaddr_hi; - uint32_t macaddr_mid; - int unit; - const char *name = NULL; - u_char eaddr[6]; - - /* allocate driver storage and create device */ - tp = malloc(sizeof(*tp), M_TAP, M_WAITOK | M_ZERO); - mtx_init(&tp->tap_mtx, "tap_mtx", NULL, MTX_DEF); - mtx_lock(&tapmtx); - SLIST_INSERT_HEAD(&taphead, tp, tap_next); - mtx_unlock(&tapmtx); - - unit = dev2unit(dev); - - /* select device: tap or vmnet */ - if (unit & VMNET_DEV_MASK) { - name = vmnetname; - tp->tap_flags |= TAP_VMNET; - } else - name = tapname; - - unit &= TAPMAXUNIT; - - TAPDEBUG("tapcreate(%s%d). 
minor = %#x\n", name, unit, dev2unit(dev)); - - /* generate fake MAC address: 00 bd xx xx xx unit_no */ - macaddr_hi = htons(0x00bd); - macaddr_mid = (uint32_t) ticks; - bcopy(&macaddr_hi, eaddr, sizeof(short)); - bcopy(&macaddr_mid, &eaddr[2], sizeof(uint32_t)); - eaddr[5] = (u_char)unit; - - /* fill the rest and attach interface */ - ifp = tp->tap_ifp = if_alloc(IFT_ETHER); - if (ifp == NULL) - panic("%s%d: can not if_alloc()", name, unit); - ifp->if_softc = tp; - if_initname(ifp, name, unit); - ifp->if_init = tapifinit; - ifp->if_start = tapifstart; - ifp->if_ioctl = tapifioctl; - ifp->if_mtu = ETHERMTU; - ifp->if_flags = (IFF_BROADCAST|IFF_SIMPLEX|IFF_MULTICAST); - IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen); - ifp->if_capabilities |= IFCAP_LINKSTATE; - ifp->if_capenable |= IFCAP_LINKSTATE; - - dev->si_drv1 = tp; - tp->tap_dev = dev; - - ether_ifattach(ifp, eaddr); - - mtx_lock(&tp->tap_mtx); - tp->tap_flags |= TAP_INITED; - mtx_unlock(&tp->tap_mtx); - - knlist_init_mtx(&tp->tap_rsel.si_note, &tp->tap_mtx); - - TAPDEBUG("interface %s is created. minor = %#x\n", - ifp->if_xname, dev2unit(dev)); -} /* tapcreate */ - - -/* - * tapopen - * - * to open tunnel. 
must be superuser - */ -static int -tapopen(struct cdev *dev, int flag, int mode, struct thread *td) -{ - struct tap_softc *tp = NULL; - struct ifnet *ifp = NULL; - int error; - - if (tapuopen == 0) { - error = priv_check(td, PRIV_NET_TAP); - if (error) - return (error); - } - - if ((dev2unit(dev) & CLONE_UNITMASK) > TAPMAXUNIT) - return (ENXIO); - - tp = dev->si_drv1; - - mtx_lock(&tp->tap_mtx); - if (tp->tap_flags & TAP_OPEN) { - mtx_unlock(&tp->tap_mtx); - return (EBUSY); - } - - bcopy(IF_LLADDR(tp->tap_ifp), tp->ether_addr, sizeof(tp->ether_addr)); -#ifndef __rtems__ - tp->tap_pid = td->td_proc->p_pid; -#else /* __rtems__ */ - tp->tap_pid = BSD_DEFAULT_PID; -#endif /* __rtems__ */ - tp->tap_flags |= TAP_OPEN; - ifp = tp->tap_ifp; - - ifp->if_drv_flags |= IFF_DRV_RUNNING; - ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; - if (tapuponopen) - ifp->if_flags |= IFF_UP; - if_link_state_change(ifp, LINK_STATE_UP); - mtx_unlock(&tp->tap_mtx); - - TAPDEBUG("%s is open. minor = %#x\n", ifp->if_xname, dev2unit(dev)); - - return (0); -} /* tapopen */ - - -/* - * tapclose - * - * close the device - mark i/f down & delete routing info - */ -static int -tapclose(struct cdev *dev, int foo, int bar, struct thread *td) -{ - struct ifaddr *ifa; - struct tap_softc *tp = dev->si_drv1; - struct ifnet *ifp = tp->tap_ifp; - - /* junk all pending output */ - mtx_lock(&tp->tap_mtx); - CURVNET_SET(ifp->if_vnet); - IF_DRAIN(&ifp->if_snd); - - /* - * Do not bring the interface down, and do not anything with - * interface, if we are in VMnet mode. Just close the device. 
- */ - if (((tp->tap_flags & TAP_VMNET) == 0) && - (ifp->if_flags & (IFF_UP | IFF_LINK0)) == IFF_UP) { - mtx_unlock(&tp->tap_mtx); - if_down(ifp); - mtx_lock(&tp->tap_mtx); - if (ifp->if_drv_flags & IFF_DRV_RUNNING) { - ifp->if_drv_flags &= ~IFF_DRV_RUNNING; - mtx_unlock(&tp->tap_mtx); - CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { - rtinit(ifa, (int)RTM_DELETE, 0); - } - if_purgeaddrs(ifp); - mtx_lock(&tp->tap_mtx); - } - } - - if_link_state_change(ifp, LINK_STATE_DOWN); - CURVNET_RESTORE(); - - funsetown(&tp->tap_sigio); - selwakeuppri(&tp->tap_rsel, PZERO+1); - KNOTE_LOCKED(&tp->tap_rsel.si_note, 0); - - tp->tap_flags &= ~TAP_OPEN; - tp->tap_pid = 0; - mtx_unlock(&tp->tap_mtx); - - TAPDEBUG("%s is closed. minor = %#x\n", - ifp->if_xname, dev2unit(dev)); - - return (0); -} /* tapclose */ - - -/* - * tapifinit - * - * network interface initialization function - */ -static void -tapifinit(void *xtp) -{ - struct tap_softc *tp = (struct tap_softc *)xtp; - struct ifnet *ifp = tp->tap_ifp; - - TAPDEBUG("initializing %s\n", ifp->if_xname); - - mtx_lock(&tp->tap_mtx); - ifp->if_drv_flags |= IFF_DRV_RUNNING; - ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; - mtx_unlock(&tp->tap_mtx); - - /* attempt to start output */ - tapifstart(ifp); -} /* tapifinit */ - - -/* - * tapifioctl - * - * Process an ioctl request on network interface - */ -static int -tapifioctl(struct ifnet *ifp, u_long cmd, caddr_t data) -{ - struct tap_softc *tp; - struct ifreq *ifr = (struct ifreq *)data; - struct ifstat *ifs = NULL; - struct ifmediareq *ifmr = NULL; - int dummy, error = 0; - - sx_xlock(&tap_ioctl_sx); - tp = ifp->if_softc; - if (tp == NULL) { - error = ENXIO; - goto bad; - } - switch (cmd) { - case SIOCSIFFLAGS: /* XXX -- just like vmnet does */ - case SIOCADDMULTI: - case SIOCDELMULTI: - break; - - case SIOCGIFMEDIA: - ifmr = (struct ifmediareq *)data; - dummy = ifmr->ifm_count; - ifmr->ifm_count = 1; - ifmr->ifm_status = IFM_AVALID; - ifmr->ifm_active = IFM_ETHER; - if (tp->tap_flags 
& TAP_OPEN) - ifmr->ifm_status |= IFM_ACTIVE; - ifmr->ifm_current = ifmr->ifm_active; - if (dummy >= 1) { - int media = IFM_ETHER; - error = copyout(&media, ifmr->ifm_ulist, - sizeof(int)); - } - break; - - case SIOCSIFMTU: - ifp->if_mtu = ifr->ifr_mtu; - break; - - case SIOCGIFSTATUS: - ifs = (struct ifstat *)data; - mtx_lock(&tp->tap_mtx); - if (tp->tap_pid != 0) - snprintf(ifs->ascii, sizeof(ifs->ascii), - "\tOpened by PID %d\n", tp->tap_pid); - else - ifs->ascii[0] = '\0'; - mtx_unlock(&tp->tap_mtx); - break; - - default: - error = ether_ioctl(ifp, cmd, data); - break; - } - -bad: - sx_xunlock(&tap_ioctl_sx); - return (error); -} /* tapifioctl */ - - -/* - * tapifstart - * - * queue packets from higher level ready to put out - */ -static void -tapifstart(struct ifnet *ifp) -{ - struct tap_softc *tp = ifp->if_softc; - - TAPDEBUG("%s starting\n", ifp->if_xname); - - /* - * do not junk pending output if we are in VMnet mode. - * XXX: can this do any harm because of queue overflow? - */ - - mtx_lock(&tp->tap_mtx); - if (((tp->tap_flags & TAP_VMNET) == 0) && - ((tp->tap_flags & TAP_READY) != TAP_READY)) { - struct mbuf *m; - - /* Unlocked read. 
*/ - TAPDEBUG("%s not ready, tap_flags = 0x%x\n", ifp->if_xname, - tp->tap_flags); - - for (;;) { - IF_DEQUEUE(&ifp->if_snd, m); - if (m != NULL) { - m_freem(m); - if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); - } else - break; - } - mtx_unlock(&tp->tap_mtx); - - return; - } - - ifp->if_drv_flags |= IFF_DRV_OACTIVE; - - if (!IFQ_IS_EMPTY(&ifp->if_snd)) { - if (tp->tap_flags & TAP_RWAIT) { - tp->tap_flags &= ~TAP_RWAIT; - wakeup(tp); - } - - if ((tp->tap_flags & TAP_ASYNC) && (tp->tap_sigio != NULL)) { - mtx_unlock(&tp->tap_mtx); - pgsigio(&tp->tap_sigio, SIGIO, 0); - mtx_lock(&tp->tap_mtx); - } - - selwakeuppri(&tp->tap_rsel, PZERO+1); - KNOTE_LOCKED(&tp->tap_rsel.si_note, 0); - if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); /* obytes are counted in ether_output */ - } - - ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; - mtx_unlock(&tp->tap_mtx); -} /* tapifstart */ - - -/* - * tapioctl - * - * the cdevsw interface is now pretty minimal - */ -static int -tapioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, struct thread *td) -{ - struct ifreq ifr; - struct tap_softc *tp = dev->si_drv1; - struct ifnet *ifp = tp->tap_ifp; - struct tapinfo *tapp = NULL; - int f; - int error; -#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \ - defined(COMPAT_FREEBSD4) - int ival; -#endif - - switch (cmd) { - case TAPSIFINFO: - tapp = (struct tapinfo *)data; - if (ifp->if_type != tapp->type) - return (EPROTOTYPE); - mtx_lock(&tp->tap_mtx); - if (ifp->if_mtu != tapp->mtu) { - strlcpy(ifr.ifr_name, if_name(ifp), IFNAMSIZ); - ifr.ifr_mtu = tapp->mtu; - CURVNET_SET(ifp->if_vnet); - error = ifhwioctl(SIOCSIFMTU, ifp, - (caddr_t)&ifr, td); - CURVNET_RESTORE(); - if (error) { - mtx_unlock(&tp->tap_mtx); - return (error); - } - } - ifp->if_baudrate = tapp->baudrate; - mtx_unlock(&tp->tap_mtx); - break; - - case TAPGIFINFO: - tapp = (struct tapinfo *)data; - mtx_lock(&tp->tap_mtx); - tapp->mtu = ifp->if_mtu; - tapp->type = ifp->if_type; - tapp->baudrate = ifp->if_baudrate; - 
mtx_unlock(&tp->tap_mtx); - break; - - case TAPSDEBUG: - tapdebug = *(int *)data; - break; - - case TAPGDEBUG: - *(int *)data = tapdebug; - break; - - case TAPGIFNAME: { - struct ifreq *ifr = (struct ifreq *) data; - - strlcpy(ifr->ifr_name, ifp->if_xname, IFNAMSIZ); - } break; - - case FIONBIO: - break; - - case FIOASYNC: - mtx_lock(&tp->tap_mtx); - if (*(int *)data) - tp->tap_flags |= TAP_ASYNC; - else - tp->tap_flags &= ~TAP_ASYNC; - mtx_unlock(&tp->tap_mtx); - break; - - case FIONREAD: - if (!IFQ_IS_EMPTY(&ifp->if_snd)) { - struct mbuf *mb; - - IFQ_LOCK(&ifp->if_snd); - IFQ_POLL_NOLOCK(&ifp->if_snd, mb); - for (*(int *)data = 0; mb != NULL; - mb = mb->m_next) - *(int *)data += mb->m_len; - IFQ_UNLOCK(&ifp->if_snd); - } else - *(int *)data = 0; - break; - - case FIOSETOWN: - return (fsetown(*(int *)data, &tp->tap_sigio)); - - case FIOGETOWN: - *(int *)data = fgetown(&tp->tap_sigio); - return (0); - - /* this is deprecated, FIOSETOWN should be used instead */ - case TIOCSPGRP: - return (fsetown(-(*(int *)data), &tp->tap_sigio)); - - /* this is deprecated, FIOGETOWN should be used instead */ - case TIOCGPGRP: - *(int *)data = -fgetown(&tp->tap_sigio); - return (0); - - /* VMware/VMnet port ioctl's */ - -#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \ - defined(COMPAT_FREEBSD4) - case _IO('V', 0): - ival = IOCPARM_IVAL(data); - data = (caddr_t)&ival; - /* FALLTHROUGH */ -#endif - case VMIO_SIOCSIFFLAGS: /* VMware/VMnet SIOCSIFFLAGS */ - f = *(int *)data; - f &= 0x0fff; - f &= ~IFF_CANTCHANGE; - f |= IFF_UP; - - mtx_lock(&tp->tap_mtx); - ifp->if_flags = f | (ifp->if_flags & IFF_CANTCHANGE); - mtx_unlock(&tp->tap_mtx); - break; - - case SIOCGIFADDR: /* get MAC address of the remote side */ - mtx_lock(&tp->tap_mtx); - bcopy(tp->ether_addr, data, sizeof(tp->ether_addr)); - mtx_unlock(&tp->tap_mtx); - break; - - case SIOCSIFADDR: /* set MAC address of the remote side */ - mtx_lock(&tp->tap_mtx); - bcopy(data, tp->ether_addr, sizeof(tp->ether_addr)); - 
mtx_unlock(&tp->tap_mtx); - break; - - default: - return (ENOTTY); - } - return (0); -} /* tapioctl */ - - -/* - * tapread - * - * the cdevsw read interface - reads a packet at a time, or at - * least as much of a packet as can be read - */ -static int -tapread(struct cdev *dev, struct uio *uio, int flag) -{ - struct tap_softc *tp = dev->si_drv1; - struct ifnet *ifp = tp->tap_ifp; - struct mbuf *m = NULL; - int error = 0, len; - - TAPDEBUG("%s reading, minor = %#x\n", ifp->if_xname, dev2unit(dev)); - - mtx_lock(&tp->tap_mtx); - if ((tp->tap_flags & TAP_READY) != TAP_READY) { - mtx_unlock(&tp->tap_mtx); - - /* Unlocked read. */ - TAPDEBUG("%s not ready. minor = %#x, tap_flags = 0x%x\n", - ifp->if_xname, dev2unit(dev), tp->tap_flags); - - return (EHOSTDOWN); - } - - tp->tap_flags &= ~TAP_RWAIT; - - /* sleep until we get a packet */ - do { - IF_DEQUEUE(&ifp->if_snd, m); - - if (m == NULL) { - if (flag & O_NONBLOCK) { - mtx_unlock(&tp->tap_mtx); - return (EWOULDBLOCK); - } - - tp->tap_flags |= TAP_RWAIT; - error = mtx_sleep(tp, &tp->tap_mtx, PCATCH | (PZERO + 1), - "taprd", 0); - if (error) { - mtx_unlock(&tp->tap_mtx); - return (error); - } - } - } while (m == NULL); - mtx_unlock(&tp->tap_mtx); - - /* feed packet to bpf */ - BPF_MTAP(ifp, m); - - /* xfer packet to user space */ - while ((m != NULL) && (uio->uio_resid > 0) && (error == 0)) { - len = min(uio->uio_resid, m->m_len); - if (len == 0) - break; - - error = uiomove(mtod(m, void *), len, uio); - m = m_free(m); - } - - if (m != NULL) { - TAPDEBUG("%s dropping mbuf, minor = %#x\n", ifp->if_xname, - dev2unit(dev)); - m_freem(m); - } - - return (error); -} /* tapread */ - - -/* - * tapwrite - * - * the cdevsw write interface - an atomic write is a packet - or else! 
- */ -static int -tapwrite(struct cdev *dev, struct uio *uio, int flag) -{ - struct ether_header *eh; - struct tap_softc *tp = dev->si_drv1; - struct ifnet *ifp = tp->tap_ifp; - struct mbuf *m; - - TAPDEBUG("%s writing, minor = %#x\n", - ifp->if_xname, dev2unit(dev)); - - if (uio->uio_resid == 0) - return (0); - - if ((uio->uio_resid < 0) || (uio->uio_resid > TAPMRU)) { - TAPDEBUG("%s invalid packet len = %zd, minor = %#x\n", - ifp->if_xname, uio->uio_resid, dev2unit(dev)); - - return (EIO); - } - - if ((m = m_uiotombuf(uio, M_NOWAIT, 0, ETHER_ALIGN, - M_PKTHDR)) == NULL) { - if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); - return (ENOBUFS); - } - - m->m_pkthdr.rcvif = ifp; - - /* - * Only pass a unicast frame to ether_input(), if it would actually - * have been received by non-virtual hardware. - */ - if (m->m_len < sizeof(struct ether_header)) { - m_freem(m); - return (0); - } - eh = mtod(m, struct ether_header *); - - if (eh && (ifp->if_flags & IFF_PROMISC) == 0 && - !ETHER_IS_MULTICAST(eh->ether_dhost) && - bcmp(eh->ether_dhost, IF_LLADDR(ifp), ETHER_ADDR_LEN) != 0) { - m_freem(m); - return (0); - } - - /* Pass packet up to parent. */ - CURVNET_SET(ifp->if_vnet); - (*ifp->if_input)(ifp, m); - CURVNET_RESTORE(); - if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); /* ibytes are counted in parent */ - - return (0); -} /* tapwrite */ - - -/* - * tappoll - * - * the poll interface, this is only useful on reads - * really. the write detect always returns true, write never blocks - * anyway, it either accepts the packet or drops it - */ -static int -tappoll(struct cdev *dev, int events, struct thread *td) -{ - struct tap_softc *tp = dev->si_drv1; - struct ifnet *ifp = tp->tap_ifp; - int revents = 0; - - TAPDEBUG("%s polling, minor = %#x\n", - ifp->if_xname, dev2unit(dev)); - - if (events & (POLLIN | POLLRDNORM)) { - IFQ_LOCK(&ifp->if_snd); - if (!IFQ_IS_EMPTY(&ifp->if_snd)) { - TAPDEBUG("%s have data in queue. 
len = %d, " \ - "minor = %#x\n", ifp->if_xname, - ifp->if_snd.ifq_len, dev2unit(dev)); - - revents |= (events & (POLLIN | POLLRDNORM)); - } else { - TAPDEBUG("%s waiting for data, minor = %#x\n", - ifp->if_xname, dev2unit(dev)); - - selrecord(td, &tp->tap_rsel); - } - IFQ_UNLOCK(&ifp->if_snd); - } - - if (events & (POLLOUT | POLLWRNORM)) - revents |= (events & (POLLOUT | POLLWRNORM)); - - return (revents); -} /* tappoll */ - - -/* - * tap_kqfilter - * - * support for kevent() system call - */ -static int -tapkqfilter(struct cdev *dev, struct knote *kn) -{ - struct tap_softc *tp = dev->si_drv1; - struct ifnet *ifp = tp->tap_ifp; - - switch (kn->kn_filter) { - case EVFILT_READ: - TAPDEBUG("%s kqfilter: EVFILT_READ, minor = %#x\n", - ifp->if_xname, dev2unit(dev)); - kn->kn_fop = &tap_read_filterops; - break; - - case EVFILT_WRITE: - TAPDEBUG("%s kqfilter: EVFILT_WRITE, minor = %#x\n", - ifp->if_xname, dev2unit(dev)); - kn->kn_fop = &tap_write_filterops; - break; - - default: - TAPDEBUG("%s kqfilter: invalid filter, minor = %#x\n", - ifp->if_xname, dev2unit(dev)); - return (EINVAL); - /* NOT REACHED */ - } - - kn->kn_hook = tp; - knlist_add(&tp->tap_rsel.si_note, kn, 0); - - return (0); -} /* tapkqfilter */ - - -/* - * tap_kqread - * - * Return true if there is data in the interface queue - */ -static int -tapkqread(struct knote *kn, long hint) -{ - int ret; - struct tap_softc *tp = kn->kn_hook; - struct cdev *dev = tp->tap_dev; - struct ifnet *ifp = tp->tap_ifp; - - if ((kn->kn_data = ifp->if_snd.ifq_len) > 0) { - TAPDEBUG("%s have data in queue. len = %d, minor = %#x\n", - ifp->if_xname, ifp->if_snd.ifq_len, dev2unit(dev)); - ret = 1; - } else { - TAPDEBUG("%s waiting for data, minor = %#x\n", - ifp->if_xname, dev2unit(dev)); - ret = 0; - } - - return (ret); -} /* tapkqread */ - - -/* - * tap_kqwrite - * - * Always can write. 
Return the MTU in kn->data - */ -static int -tapkqwrite(struct knote *kn, long hint) -{ - struct tap_softc *tp = kn->kn_hook; - struct ifnet *ifp = tp->tap_ifp; - - kn->kn_data = ifp->if_mtu; - - return (1); -} /* tapkqwrite */ - - -static void -tapkqdetach(struct knote *kn) -{ - struct tap_softc *tp = kn->kn_hook; - - knlist_remove(&tp->tap_rsel.si_note, kn, 0); -} /* tapkqdetach */ - diff --git a/freebsd/sys/net/if_tap.h b/freebsd/sys/net/if_tap.h index 34f44b38..9718cee4 100644 --- a/freebsd/sys/net/if_tap.h +++ b/freebsd/sys/net/if_tap.h @@ -40,24 +40,22 @@ #ifndef _NET_IF_TAP_H_ #define _NET_IF_TAP_H_ -/* refer to if_tapvar.h for the softc stuff */ +#include <net/if_tun.h> /* maximum receive packet size (hard limit) */ #define TAPMRU 16384 -struct tapinfo { - int baudrate; /* linespeed */ - short mtu; /* maximum transmission unit */ - u_char type; /* ethernet, tokenring, etc. */ - u_char dummy; /* place holder */ -}; +#define tapinfo tuninfo -/* ioctl's for get/set debug */ -#define TAPSDEBUG _IOW('t', 90, int) -#define TAPGDEBUG _IOR('t', 89, int) -#define TAPSIFINFO _IOW('t', 91, struct tapinfo) -#define TAPGIFINFO _IOR('t', 92, struct tapinfo) -#define TAPGIFNAME _IOR('t', 93, struct ifreq) +/* + * ioctl's for get/set debug; these are aliases of TUN* ioctls, see net/if_tun.h + * for details. + */ +#define TAPSDEBUG TUNSDEBUG +#define TAPGDEBUG TUNGDEBUG +#define TAPSIFINFO TUNSIFINFO +#define TAPGIFINFO TUNGIFINFO +#define TAPGIFNAME TUNGIFNAME /* VMware ioctl's */ #define VMIO_SIOCSIFFLAGS _IOWINT('V', 0) diff --git a/freebsd/sys/net/if_tapvar.h b/freebsd/sys/net/if_tapvar.h deleted file mode 100644 index f5cf9f3e..00000000 --- a/freebsd/sys/net/if_tapvar.h +++ /dev/null @@ -1,71 +0,0 @@ -/*- - * SPDX-License-Identifier: BSD-2-Clause-FreeBSD - * - * Copyright (C) 1999-2000 by Maksim Yevmenkin <m_evmenkin@yahoo.com> - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * BASED ON: - * ------------------------------------------------------------------------- - * - * Copyright (c) 1998 Brian Somers <brian@Awfulhak.org> - * All rights reserved. - * - * Copyright (c) 1988, Julian Onions <jpo@cs.nott.ac.uk> - * Nottingham University 1987. - */ - -/* - * $FreeBSD$ - * $Id: if_tapvar.h,v 0.6 2000/07/11 02:16:08 max Exp $ - */ - -#ifndef _NET_IF_TAPVAR_H_ -#define _NET_IF_TAPVAR_H_ - -/* - * tap_mtx locks tap_flags, tap_pid. tap_next locked with global tapmtx. - * Other fields locked by owning subsystems. 
- */ -struct tap_softc { - struct ifnet *tap_ifp; - u_short tap_flags; /* misc flags */ -#define TAP_OPEN (1 << 0) -#define TAP_INITED (1 << 1) -#define TAP_RWAIT (1 << 2) -#define TAP_ASYNC (1 << 3) -#define TAP_READY (TAP_OPEN|TAP_INITED) -#define TAP_VMNET (1 << 4) - - u_int8_t ether_addr[ETHER_ADDR_LEN]; /* ether addr of the remote side */ - - pid_t tap_pid; /* PID of process to open */ - struct sigio *tap_sigio; /* information for async I/O */ - struct selinfo tap_rsel; /* read select */ - - SLIST_ENTRY(tap_softc) tap_next; /* next device in chain */ - struct cdev *tap_dev; - struct mtx tap_mtx; /* per-softc mutex */ -}; - -#endif /* !_NET_IF_TAPVAR_H_ */ diff --git a/freebsd/sys/net/if_tun.c b/freebsd/sys/net/if_tun.c deleted file mode 100644 index c96b2163..00000000 --- a/freebsd/sys/net/if_tun.c +++ /dev/null @@ -1,1132 +0,0 @@ -#include <machine/rtems-bsd-kernel-space.h> - -/* $NetBSD: if_tun.c,v 1.14 1994/06/29 06:36:25 cgd Exp $ */ - -/*- - * Copyright (c) 1988, Julian Onions <jpo@cs.nott.ac.uk> - * Nottingham University 1987. - * - * This source may be freely distributed, however I would be interested - * in any changes that are made. - * - * This driver takes packets off the IP i/f and hands them up to a - * user process to have its wicked way with. This driver has it's - * roots in a similar driver written by Phil Cockcroft (formerly) at - * UCL. This driver is based much more on read/write/poll mode of - * operation though. 
- * - * $FreeBSD$ - */ - -#include <rtems/bsd/local/opt_inet.h> -#include <rtems/bsd/local/opt_inet6.h> - -#include <sys/param.h> -#include <sys/lock.h> -#include <sys/priv.h> -#include <sys/proc.h> -#include <sys/systm.h> -#include <sys/jail.h> -#include <sys/mbuf.h> -#include <sys/module.h> -#include <sys/socket.h> -#include <sys/fcntl.h> -#include <sys/filio.h> -#include <sys/sockio.h> -#include <sys/sx.h> -#include <sys/ttycom.h> -#include <sys/poll.h> -#include <sys/selinfo.h> -#include <sys/signalvar.h> -#include <sys/filedesc.h> -#include <sys/kernel.h> -#include <sys/sysctl.h> -#include <sys/conf.h> -#include <sys/uio.h> -#include <sys/malloc.h> -#include <sys/random.h> -#include <sys/ctype.h> - -#include <net/if.h> -#include <net/if_var.h> -#include <net/if_clone.h> -#include <net/if_types.h> -#include <net/netisr.h> -#include <net/route.h> -#include <net/vnet.h> -#ifdef INET -#include <netinet/in.h> -#endif -#include <net/bpf.h> -#include <net/if_tun.h> - -#include <sys/queue.h> -#include <sys/condvar.h> - -#include <security/mac/mac_framework.h> - -/* - * tun_list is protected by global tunmtx. Other mutable fields are - * protected by tun->tun_mtx, or by their owning subsystem. tun_dev is - * static for the duration of a tunnel interface. 
- */ -struct tun_softc { - TAILQ_ENTRY(tun_softc) tun_list; - struct cdev *tun_dev; - u_short tun_flags; /* misc flags */ -#define TUN_OPEN 0x0001 -#define TUN_INITED 0x0002 -#define TUN_RCOLL 0x0004 -#define TUN_IASET 0x0008 -#define TUN_DSTADDR 0x0010 -#define TUN_LMODE 0x0020 -#define TUN_RWAIT 0x0040 -#define TUN_ASYNC 0x0080 -#define TUN_IFHEAD 0x0100 -#define TUN_DYING 0x0200 - -#define TUN_READY (TUN_OPEN | TUN_INITED) - -#ifndef __rtems__ - pid_t tun_pid; /* owning pid */ -#endif /* __rtems__ */ - struct ifnet *tun_ifp; /* the interface */ - struct sigio *tun_sigio; /* information for async I/O */ - struct selinfo tun_rsel; /* read select */ - struct mtx tun_mtx; /* protect mutable softc fields */ - struct cv tun_cv; /* protect against ref'd dev destroy */ -}; -#define TUN2IFP(sc) ((sc)->tun_ifp) - -#define TUNDEBUG if (tundebug) if_printf - -/* - * All mutable global variables in if_tun are locked using tunmtx, with - * the exception of tundebug, which is used unlocked, and tunclones, - * which is static after setup. 
- */ -static struct mtx tunmtx; -static eventhandler_tag tag; -static const char tunname[] = "tun"; -static MALLOC_DEFINE(M_TUN, tunname, "Tunnel Interface"); -static int tundebug = 0; -static int tundclone = 1; -static struct clonedevs *tunclones; -static TAILQ_HEAD(,tun_softc) tunhead = TAILQ_HEAD_INITIALIZER(tunhead); -SYSCTL_INT(_debug, OID_AUTO, if_tun_debug, CTLFLAG_RW, &tundebug, 0, ""); - -static struct sx tun_ioctl_sx; -SX_SYSINIT(tun_ioctl_sx, &tun_ioctl_sx, "tun_ioctl"); - -SYSCTL_DECL(_net_link); -static SYSCTL_NODE(_net_link, OID_AUTO, tun, CTLFLAG_RW, 0, - "IP tunnel software network interface."); -SYSCTL_INT(_net_link_tun, OID_AUTO, devfs_cloning, CTLFLAG_RWTUN, &tundclone, 0, - "Enable legacy devfs interface creation."); - -static void tunclone(void *arg, struct ucred *cred, char *name, - int namelen, struct cdev **dev); -static void tuncreate(const char *name, struct cdev *dev); -static int tunifioctl(struct ifnet *, u_long, caddr_t); -static void tuninit(struct ifnet *); -static int tunmodevent(module_t, int, void *); -static int tunoutput(struct ifnet *, struct mbuf *, - const struct sockaddr *, struct route *ro); -static void tunstart(struct ifnet *); - -static int tun_clone_match(struct if_clone *ifc, const char *name); -static int tun_clone_create(struct if_clone *, char *, size_t, caddr_t); -static int tun_clone_destroy(struct if_clone *, struct ifnet *); -static struct unrhdr *tun_unrhdr; -VNET_DEFINE_STATIC(struct if_clone *, tun_cloner); -#define V_tun_cloner VNET(tun_cloner) - -static d_open_t tunopen; -static d_close_t tunclose; -static d_read_t tunread; -static d_write_t tunwrite; -static d_ioctl_t tunioctl; -static d_poll_t tunpoll; -static d_kqfilter_t tunkqfilter; - -static int tunkqread(struct knote *, long); -static int tunkqwrite(struct knote *, long); -static void tunkqdetach(struct knote *); - -static struct filterops tun_read_filterops = { - .f_isfd = 1, - .f_attach = NULL, - .f_detach = tunkqdetach, - .f_event = tunkqread, -}; 
- -static struct filterops tun_write_filterops = { - .f_isfd = 1, - .f_attach = NULL, - .f_detach = tunkqdetach, - .f_event = tunkqwrite, -}; - -static struct cdevsw tun_cdevsw = { - .d_version = D_VERSION, - .d_flags = D_NEEDMINOR, - .d_open = tunopen, - .d_close = tunclose, - .d_read = tunread, - .d_write = tunwrite, - .d_ioctl = tunioctl, - .d_poll = tunpoll, - .d_kqfilter = tunkqfilter, - .d_name = tunname, -}; - -static int -tun_clone_match(struct if_clone *ifc, const char *name) -{ - if (strncmp(tunname, name, 3) == 0 && - (name[3] == '\0' || isdigit(name[3]))) - return (1); - - return (0); -} - -static int -tun_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) -{ - struct cdev *dev; - int err, unit, i; - - err = ifc_name2unit(name, &unit); - if (err != 0) - return (err); - - if (unit != -1) { - /* If this unit number is still available that/s okay. */ - if (alloc_unr_specific(tun_unrhdr, unit) == -1) - return (EEXIST); - } else { - unit = alloc_unr(tun_unrhdr); - } - - snprintf(name, IFNAMSIZ, "%s%d", tunname, unit); - - /* find any existing device, or allocate new unit number */ - i = clone_create(&tunclones, &tun_cdevsw, &unit, &dev, 0); - if (i) { - /* No preexisting struct cdev *, create one */ - dev = make_dev(&tun_cdevsw, unit, - UID_UUCP, GID_DIALER, 0600, "%s%d", tunname, unit); - } - tuncreate(tunname, dev); - - return (0); -} - -static void -tunclone(void *arg, struct ucred *cred, char *name, int namelen, - struct cdev **dev) -{ - char devname[SPECNAMELEN + 1]; - int u, i, append_unit; - - if (*dev != NULL) - return; - - /* - * If tun cloning is enabled, only the superuser can create an - * interface. 
- */ - if (!tundclone || priv_check_cred(cred, PRIV_NET_IFCREATE, 0) != 0) - return; - - if (strcmp(name, tunname) == 0) { - u = -1; - } else if (dev_stdclone(name, NULL, tunname, &u) != 1) - return; /* Don't recognise the name */ - if (u != -1 && u > IF_MAXUNIT) - return; /* Unit number too high */ - - if (u == -1) - append_unit = 1; - else - append_unit = 0; - - CURVNET_SET(CRED_TO_VNET(cred)); - /* find any existing device, or allocate new unit number */ - i = clone_create(&tunclones, &tun_cdevsw, &u, dev, 0); - if (i) { - if (append_unit) { - namelen = snprintf(devname, sizeof(devname), "%s%d", - name, u); - name = devname; - } - /* No preexisting struct cdev *, create one */ - *dev = make_dev_credf(MAKEDEV_REF, &tun_cdevsw, u, cred, - UID_UUCP, GID_DIALER, 0600, "%s", name); - } - - if_clone_create(name, namelen, NULL); - CURVNET_RESTORE(); -} - -static void -tun_destroy(struct tun_softc *tp) -{ - struct cdev *dev; - - mtx_lock(&tp->tun_mtx); - tp->tun_flags |= TUN_DYING; - if ((tp->tun_flags & TUN_OPEN) != 0) - cv_wait_unlock(&tp->tun_cv, &tp->tun_mtx); - else - mtx_unlock(&tp->tun_mtx); - - CURVNET_SET(TUN2IFP(tp)->if_vnet); - - dev = tp->tun_dev; - bpfdetach(TUN2IFP(tp)); - if_detach(TUN2IFP(tp)); - - sx_xlock(&tun_ioctl_sx); - TUN2IFP(tp)->if_softc = NULL; - sx_xunlock(&tun_ioctl_sx); - - free_unr(tun_unrhdr, TUN2IFP(tp)->if_dunit); - if_free(TUN2IFP(tp)); - destroy_dev(dev); - seldrain(&tp->tun_rsel); - knlist_clear(&tp->tun_rsel.si_note, 0); - knlist_destroy(&tp->tun_rsel.si_note); - mtx_destroy(&tp->tun_mtx); - cv_destroy(&tp->tun_cv); - free(tp, M_TUN); - CURVNET_RESTORE(); -} - -static int -tun_clone_destroy(struct if_clone *ifc, struct ifnet *ifp) -{ - struct tun_softc *tp = ifp->if_softc; - - mtx_lock(&tunmtx); - TAILQ_REMOVE(&tunhead, tp, tun_list); - mtx_unlock(&tunmtx); - tun_destroy(tp); - - return (0); -} - -static void -vnet_tun_init(const void *unused __unused) -{ - V_tun_cloner = if_clone_advanced(tunname, 0, tun_clone_match, - 
tun_clone_create, tun_clone_destroy); -} -VNET_SYSINIT(vnet_tun_init, SI_SUB_PROTO_IF, SI_ORDER_ANY, - vnet_tun_init, NULL); - -static void -vnet_tun_uninit(const void *unused __unused) -{ - if_clone_detach(V_tun_cloner); -} -VNET_SYSUNINIT(vnet_tun_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY, - vnet_tun_uninit, NULL); - -static void -tun_uninit(const void *unused __unused) -{ - struct tun_softc *tp; - - EVENTHANDLER_DEREGISTER(dev_clone, tag); - drain_dev_clone_events(); - - mtx_lock(&tunmtx); - while ((tp = TAILQ_FIRST(&tunhead)) != NULL) { - TAILQ_REMOVE(&tunhead, tp, tun_list); - mtx_unlock(&tunmtx); - tun_destroy(tp); - mtx_lock(&tunmtx); - } - mtx_unlock(&tunmtx); - delete_unrhdr(tun_unrhdr); - clone_cleanup(&tunclones); - mtx_destroy(&tunmtx); -} -SYSUNINIT(tun_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY, tun_uninit, NULL); - -static int -tunmodevent(module_t mod, int type, void *data) -{ - - switch (type) { - case MOD_LOAD: - mtx_init(&tunmtx, "tunmtx", NULL, MTX_DEF); - clone_setup(&tunclones); - tun_unrhdr = new_unrhdr(0, IF_MAXUNIT, &tunmtx); - tag = EVENTHANDLER_REGISTER(dev_clone, tunclone, 0, 1000); - if (tag == NULL) - return (ENOMEM); - break; - case MOD_UNLOAD: - /* See tun_uninit, so it's done after the vnet_sysuninit() */ - break; - default: - return EOPNOTSUPP; - } - return 0; -} - -static moduledata_t tun_mod = { - "if_tun", - tunmodevent, - 0 -}; - -DECLARE_MODULE(if_tun, tun_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); -MODULE_VERSION(if_tun, 1); - -static void -tunstart(struct ifnet *ifp) -{ - struct tun_softc *tp = ifp->if_softc; - struct mbuf *m; - - TUNDEBUG(ifp,"%s starting\n", ifp->if_xname); - if (ALTQ_IS_ENABLED(&ifp->if_snd)) { - IFQ_LOCK(&ifp->if_snd); - IFQ_POLL_NOLOCK(&ifp->if_snd, m); - if (m == NULL) { - IFQ_UNLOCK(&ifp->if_snd); - return; - } - IFQ_UNLOCK(&ifp->if_snd); - } - - mtx_lock(&tp->tun_mtx); - if (tp->tun_flags & TUN_RWAIT) { - tp->tun_flags &= ~TUN_RWAIT; - wakeup(tp); - } - selwakeuppri(&tp->tun_rsel, PZERO + 1); - 
KNOTE_LOCKED(&tp->tun_rsel.si_note, 0); - if (tp->tun_flags & TUN_ASYNC && tp->tun_sigio) { - mtx_unlock(&tp->tun_mtx); - pgsigio(&tp->tun_sigio, SIGIO, 0); - } else - mtx_unlock(&tp->tun_mtx); -} - -/* XXX: should return an error code so it can fail. */ -static void -tuncreate(const char *name, struct cdev *dev) -{ - struct tun_softc *sc; - struct ifnet *ifp; - - sc = malloc(sizeof(*sc), M_TUN, M_WAITOK | M_ZERO); - mtx_init(&sc->tun_mtx, "tun_mtx", NULL, MTX_DEF); - cv_init(&sc->tun_cv, "tun_condvar"); - sc->tun_flags = TUN_INITED; - sc->tun_dev = dev; - mtx_lock(&tunmtx); - TAILQ_INSERT_TAIL(&tunhead, sc, tun_list); - mtx_unlock(&tunmtx); - - ifp = sc->tun_ifp = if_alloc(IFT_PPP); - if (ifp == NULL) - panic("%s%d: failed to if_alloc() interface.\n", - name, dev2unit(dev)); - if_initname(ifp, name, dev2unit(dev)); - ifp->if_mtu = TUNMTU; - ifp->if_ioctl = tunifioctl; - ifp->if_output = tunoutput; - ifp->if_start = tunstart; - ifp->if_flags = IFF_POINTOPOINT | IFF_MULTICAST; - ifp->if_softc = sc; - IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen); - ifp->if_snd.ifq_drv_maxlen = 0; - IFQ_SET_READY(&ifp->if_snd); - knlist_init_mtx(&sc->tun_rsel.si_note, &sc->tun_mtx); - ifp->if_capabilities |= IFCAP_LINKSTATE; - ifp->if_capenable |= IFCAP_LINKSTATE; - - if_attach(ifp); - bpfattach(ifp, DLT_NULL, sizeof(u_int32_t)); - dev->si_drv1 = sc; - TUNDEBUG(ifp, "interface %s is created, minor = %#x\n", - ifp->if_xname, dev2unit(dev)); -} - -static int -tunopen(struct cdev *dev, int flag, int mode, struct thread *td) -{ - struct ifnet *ifp; - struct tun_softc *tp; - - /* - * XXXRW: Non-atomic test and set of dev->si_drv1 requires - * synchronization. 
- */ - tp = dev->si_drv1; - if (!tp) { - tuncreate(tunname, dev); - tp = dev->si_drv1; - } - - mtx_lock(&tp->tun_mtx); - if ((tp->tun_flags & (TUN_OPEN | TUN_DYING)) != 0) { - mtx_unlock(&tp->tun_mtx); - return (EBUSY); - } - -#ifndef __rtems__ - tp->tun_pid = td->td_proc->p_pid; -#endif /* __rtems__ */ - tp->tun_flags |= TUN_OPEN; - ifp = TUN2IFP(tp); - if_link_state_change(ifp, LINK_STATE_UP); - TUNDEBUG(ifp, "open\n"); - mtx_unlock(&tp->tun_mtx); - - return (0); -} - -/* - * tunclose - close the device - mark i/f down & delete - * routing info - */ -static int -tunclose(struct cdev *dev, int foo, int bar, struct thread *td) -{ - struct tun_softc *tp; - struct ifnet *ifp; - - tp = dev->si_drv1; - ifp = TUN2IFP(tp); - - mtx_lock(&tp->tun_mtx); -#ifndef __rtems__ - /* - * Simply close the device if this isn't the controlling process. This - * may happen if, for instance, the tunnel has been handed off to - * another process. The original controller should be able to close it - * without putting us into an inconsistent state. - */ - if (td->td_proc->p_pid != tp->tun_pid) { - mtx_unlock(&tp->tun_mtx); - return (0); - } -#endif /* __rtems__ */ - - /* - * junk all pending output - */ - CURVNET_SET(ifp->if_vnet); - IFQ_PURGE(&ifp->if_snd); - - if (ifp->if_flags & IFF_UP) { - mtx_unlock(&tp->tun_mtx); - if_down(ifp); - mtx_lock(&tp->tun_mtx); - } - - /* Delete all addresses and routes which reference this interface. */ - if (ifp->if_drv_flags & IFF_DRV_RUNNING) { - struct ifaddr *ifa; - - ifp->if_drv_flags &= ~IFF_DRV_RUNNING; - mtx_unlock(&tp->tun_mtx); - CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { - /* deal w/IPv4 PtP destination; unlocked read */ - if (ifa->ifa_addr->sa_family == AF_INET) { - rtinit(ifa, (int)RTM_DELETE, - tp->tun_flags & TUN_DSTADDR ? 
RTF_HOST : 0); - } else { - rtinit(ifa, (int)RTM_DELETE, 0); - } - } - if_purgeaddrs(ifp); - mtx_lock(&tp->tun_mtx); - } - if_link_state_change(ifp, LINK_STATE_DOWN); - CURVNET_RESTORE(); - - funsetown(&tp->tun_sigio); - selwakeuppri(&tp->tun_rsel, PZERO + 1); - KNOTE_LOCKED(&tp->tun_rsel.si_note, 0); - TUNDEBUG (ifp, "closed\n"); - tp->tun_flags &= ~TUN_OPEN; -#ifndef __rtems__ - tp->tun_pid = 0; -#endif /* __rtems__ */ - - cv_broadcast(&tp->tun_cv); - mtx_unlock(&tp->tun_mtx); - return (0); -} - -static void -tuninit(struct ifnet *ifp) -{ - struct tun_softc *tp = ifp->if_softc; -#ifdef INET - struct ifaddr *ifa; -#endif - - TUNDEBUG(ifp, "tuninit\n"); - - mtx_lock(&tp->tun_mtx); - ifp->if_flags |= IFF_UP; - ifp->if_drv_flags |= IFF_DRV_RUNNING; - getmicrotime(&ifp->if_lastchange); - -#ifdef INET - if_addr_rlock(ifp); - CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { - if (ifa->ifa_addr->sa_family == AF_INET) { - struct sockaddr_in *si; - - si = (struct sockaddr_in *)ifa->ifa_addr; - if (si->sin_addr.s_addr) - tp->tun_flags |= TUN_IASET; - - si = (struct sockaddr_in *)ifa->ifa_dstaddr; - if (si && si->sin_addr.s_addr) - tp->tun_flags |= TUN_DSTADDR; - } - } - if_addr_runlock(ifp); -#endif - mtx_unlock(&tp->tun_mtx); -} - -/* - * Process an ioctl request. 
- */ -static int -tunifioctl(struct ifnet *ifp, u_long cmd, caddr_t data) -{ - struct ifreq *ifr = (struct ifreq *)data; - struct tun_softc *tp; - struct ifstat *ifs; - int error = 0; - - sx_xlock(&tun_ioctl_sx); - tp = ifp->if_softc; - if (tp == NULL) { - error = ENXIO; - goto bad; - } - switch(cmd) { - case SIOCGIFSTATUS: - ifs = (struct ifstat *)data; - mtx_lock(&tp->tun_mtx); -#ifndef __rtems__ - if (tp->tun_pid) - snprintf(ifs->ascii, sizeof(ifs->ascii), - "\tOpened by PID %d\n", tp->tun_pid); - else -#endif /* __rtems__ */ - ifs->ascii[0] = '\0'; - mtx_unlock(&tp->tun_mtx); - break; - case SIOCSIFADDR: - tuninit(ifp); - TUNDEBUG(ifp, "address set\n"); - break; - case SIOCSIFMTU: - ifp->if_mtu = ifr->ifr_mtu; - TUNDEBUG(ifp, "mtu set\n"); - break; - case SIOCSIFFLAGS: - case SIOCADDMULTI: - case SIOCDELMULTI: - break; - default: - error = EINVAL; - } -bad: - sx_xunlock(&tun_ioctl_sx); - return (error); -} - -/* - * tunoutput - queue packets from higher level ready to put out. - */ -static int -tunoutput(struct ifnet *ifp, struct mbuf *m0, const struct sockaddr *dst, - struct route *ro) -{ - struct tun_softc *tp = ifp->if_softc; - u_short cached_tun_flags; - int error; - u_int32_t af; - - TUNDEBUG (ifp, "tunoutput\n"); - -#ifdef MAC - error = mac_ifnet_check_transmit(ifp, m0); - if (error) { - m_freem(m0); - return (error); - } -#endif - - /* Could be unlocked read? */ - mtx_lock(&tp->tun_mtx); - cached_tun_flags = tp->tun_flags; - mtx_unlock(&tp->tun_mtx); - if ((cached_tun_flags & TUN_READY) != TUN_READY) { - TUNDEBUG (ifp, "not ready 0%o\n", tp->tun_flags); - m_freem (m0); - return (EHOSTDOWN); - } - - if ((ifp->if_flags & IFF_UP) != IFF_UP) { - m_freem (m0); - return (EHOSTDOWN); - } - - /* BPF writes need to be handled specially. */ - if (dst->sa_family == AF_UNSPEC) - bcopy(dst->sa_data, &af, sizeof(af)); - else - af = dst->sa_family; - - if (bpf_peers_present(ifp->if_bpf)) - bpf_mtap2(ifp->if_bpf, &af, sizeof(af), m0); - - /* prepend sockaddr? 
this may abort if the mbuf allocation fails */ - if (cached_tun_flags & TUN_LMODE) { - /* allocate space for sockaddr */ - M_PREPEND(m0, dst->sa_len, M_NOWAIT); - - /* if allocation failed drop packet */ - if (m0 == NULL) { - if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); - if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); - return (ENOBUFS); - } else { - bcopy(dst, m0->m_data, dst->sa_len); - } - } - - if (cached_tun_flags & TUN_IFHEAD) { - /* Prepend the address family */ - M_PREPEND(m0, 4, M_NOWAIT); - - /* if allocation failed drop packet */ - if (m0 == NULL) { - if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); - if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); - return (ENOBUFS); - } else - *(u_int32_t *)m0->m_data = htonl(af); - } else { -#ifdef INET - if (af != AF_INET) -#endif - { - m_freem(m0); - return (EAFNOSUPPORT); - } - } - - error = (ifp->if_transmit)(ifp, m0); - if (error) - return (ENOBUFS); - if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); - return (0); -} - -/* - * the cdevsw interface is now pretty minimal. 
- */ -static int -tunioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, - struct thread *td) -{ - struct ifreq ifr, *ifrp; - struct tun_softc *tp = dev->si_drv1; - struct tuninfo *tunp; - int error; - - switch (cmd) { - case TUNGIFNAME: - ifrp = (struct ifreq *)data; - strlcpy(ifrp->ifr_name, TUN2IFP(tp)->if_xname, IFNAMSIZ); - break; - case TUNSIFINFO: - tunp = (struct tuninfo *)data; - if (TUN2IFP(tp)->if_type != tunp->type) - return (EPROTOTYPE); - mtx_lock(&tp->tun_mtx); - if (TUN2IFP(tp)->if_mtu != tunp->mtu) { - strlcpy(ifr.ifr_name, if_name(TUN2IFP(tp)), IFNAMSIZ); - ifr.ifr_mtu = tunp->mtu; - CURVNET_SET(TUN2IFP(tp)->if_vnet); - error = ifhwioctl(SIOCSIFMTU, TUN2IFP(tp), - (caddr_t)&ifr, td); - CURVNET_RESTORE(); - if (error) { - mtx_unlock(&tp->tun_mtx); - return (error); - } - } - TUN2IFP(tp)->if_baudrate = tunp->baudrate; - mtx_unlock(&tp->tun_mtx); - break; - case TUNGIFINFO: - tunp = (struct tuninfo *)data; - mtx_lock(&tp->tun_mtx); - tunp->mtu = TUN2IFP(tp)->if_mtu; - tunp->type = TUN2IFP(tp)->if_type; - tunp->baudrate = TUN2IFP(tp)->if_baudrate; - mtx_unlock(&tp->tun_mtx); - break; - case TUNSDEBUG: - tundebug = *(int *)data; - break; - case TUNGDEBUG: - *(int *)data = tundebug; - break; - case TUNSLMODE: - mtx_lock(&tp->tun_mtx); - if (*(int *)data) { - tp->tun_flags |= TUN_LMODE; - tp->tun_flags &= ~TUN_IFHEAD; - } else - tp->tun_flags &= ~TUN_LMODE; - mtx_unlock(&tp->tun_mtx); - break; - case TUNSIFHEAD: - mtx_lock(&tp->tun_mtx); - if (*(int *)data) { - tp->tun_flags |= TUN_IFHEAD; - tp->tun_flags &= ~TUN_LMODE; - } else - tp->tun_flags &= ~TUN_IFHEAD; - mtx_unlock(&tp->tun_mtx); - break; - case TUNGIFHEAD: - mtx_lock(&tp->tun_mtx); - *(int *)data = (tp->tun_flags & TUN_IFHEAD) ? 
1 : 0; - mtx_unlock(&tp->tun_mtx); - break; - case TUNSIFMODE: - /* deny this if UP */ - if (TUN2IFP(tp)->if_flags & IFF_UP) - return(EBUSY); - - switch (*(int *)data & ~IFF_MULTICAST) { - case IFF_POINTOPOINT: - case IFF_BROADCAST: - mtx_lock(&tp->tun_mtx); - TUN2IFP(tp)->if_flags &= - ~(IFF_BROADCAST|IFF_POINTOPOINT|IFF_MULTICAST); - TUN2IFP(tp)->if_flags |= *(int *)data; - mtx_unlock(&tp->tun_mtx); - break; - default: - return(EINVAL); - } - break; - case TUNSIFPID: -#ifndef __rtems__ - mtx_lock(&tp->tun_mtx); - tp->tun_pid = curthread->td_proc->p_pid; - mtx_unlock(&tp->tun_mtx); -#endif /* __rtems__ */ - break; - case FIONBIO: - break; - case FIOASYNC: - mtx_lock(&tp->tun_mtx); - if (*(int *)data) - tp->tun_flags |= TUN_ASYNC; - else - tp->tun_flags &= ~TUN_ASYNC; - mtx_unlock(&tp->tun_mtx); - break; - case FIONREAD: - if (!IFQ_IS_EMPTY(&TUN2IFP(tp)->if_snd)) { - struct mbuf *mb; - IFQ_LOCK(&TUN2IFP(tp)->if_snd); - IFQ_POLL_NOLOCK(&TUN2IFP(tp)->if_snd, mb); - for (*(int *)data = 0; mb != NULL; mb = mb->m_next) - *(int *)data += mb->m_len; - IFQ_UNLOCK(&TUN2IFP(tp)->if_snd); - } else - *(int *)data = 0; - break; - case FIOSETOWN: - return (fsetown(*(int *)data, &tp->tun_sigio)); - - case FIOGETOWN: - *(int *)data = fgetown(&tp->tun_sigio); - return (0); - - /* This is deprecated, FIOSETOWN should be used instead. */ - case TIOCSPGRP: - return (fsetown(-(*(int *)data), &tp->tun_sigio)); - - /* This is deprecated, FIOGETOWN should be used instead. */ - case TIOCGPGRP: - *(int *)data = -fgetown(&tp->tun_sigio); - return (0); - - default: - return (ENOTTY); - } - return (0); -} - -/* - * The cdevsw read interface - reads a packet at a time, or at - * least as much of a packet as can be read. 
- */ -static int -tunread(struct cdev *dev, struct uio *uio, int flag) -{ - struct tun_softc *tp = dev->si_drv1; - struct ifnet *ifp = TUN2IFP(tp); - struct mbuf *m; - int error=0, len; - - TUNDEBUG (ifp, "read\n"); - mtx_lock(&tp->tun_mtx); - if ((tp->tun_flags & TUN_READY) != TUN_READY) { - mtx_unlock(&tp->tun_mtx); - TUNDEBUG (ifp, "not ready 0%o\n", tp->tun_flags); - return (EHOSTDOWN); - } - - tp->tun_flags &= ~TUN_RWAIT; - - do { - IFQ_DEQUEUE(&ifp->if_snd, m); - if (m == NULL) { - if (flag & O_NONBLOCK) { - mtx_unlock(&tp->tun_mtx); - return (EWOULDBLOCK); - } - tp->tun_flags |= TUN_RWAIT; - error = mtx_sleep(tp, &tp->tun_mtx, PCATCH | (PZERO + 1), - "tunread", 0); - if (error != 0) { - mtx_unlock(&tp->tun_mtx); - return (error); - } - } - } while (m == NULL); - mtx_unlock(&tp->tun_mtx); - - while (m && uio->uio_resid > 0 && error == 0) { - len = min(uio->uio_resid, m->m_len); - if (len != 0) - error = uiomove(mtod(m, void *), len, uio); - m = m_free(m); - } - - if (m) { - TUNDEBUG(ifp, "Dropping mbuf\n"); - m_freem(m); - } - return (error); -} - -/* - * the cdevsw write interface - an atomic write is a packet - or else! - */ -static int -tunwrite(struct cdev *dev, struct uio *uio, int flag) -{ - struct tun_softc *tp = dev->si_drv1; - struct ifnet *ifp = TUN2IFP(tp); - struct mbuf *m; - uint32_t family, mru; - int isr; - - TUNDEBUG(ifp, "tunwrite\n"); - - if ((ifp->if_flags & IFF_UP) != IFF_UP) - /* ignore silently */ - return (0); - - if (uio->uio_resid == 0) - return (0); - - mru = TUNMRU; - if (tp->tun_flags & TUN_IFHEAD) - mru += sizeof(family); - if (uio->uio_resid < 0 || uio->uio_resid > mru) { - TUNDEBUG(ifp, "len=%zd!\n", uio->uio_resid); - return (EIO); - } - - if ((m = m_uiotombuf(uio, M_NOWAIT, 0, 0, M_PKTHDR)) == NULL) { - if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); - return (ENOBUFS); - } - - m->m_pkthdr.rcvif = ifp; -#ifdef MAC - mac_ifnet_create_mbuf(ifp, m); -#endif - - /* Could be unlocked read? 
*/ - mtx_lock(&tp->tun_mtx); - if (tp->tun_flags & TUN_IFHEAD) { - mtx_unlock(&tp->tun_mtx); - if (m->m_len < sizeof(family) && - (m = m_pullup(m, sizeof(family))) == NULL) - return (ENOBUFS); - family = ntohl(*mtod(m, u_int32_t *)); - m_adj(m, sizeof(family)); - } else { - mtx_unlock(&tp->tun_mtx); - family = AF_INET; - } - - BPF_MTAP2(ifp, &family, sizeof(family), m); - - switch (family) { -#ifdef INET - case AF_INET: - isr = NETISR_IP; - break; -#endif -#ifdef INET6 - case AF_INET6: - isr = NETISR_IPV6; - break; -#endif - default: - m_freem(m); - return (EAFNOSUPPORT); - } - random_harvest_queue(m, sizeof(*m), RANDOM_NET_TUN); - if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); - if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); - CURVNET_SET(ifp->if_vnet); - M_SETFIB(m, ifp->if_fib); - netisr_dispatch(isr, m); - CURVNET_RESTORE(); - return (0); -} - -/* - * tunpoll - the poll interface, this is only useful on reads - * really. The write detect always returns true, write never blocks - * anyway, it either accepts the packet or drops it. - */ -static int -tunpoll(struct cdev *dev, int events, struct thread *td) -{ - struct tun_softc *tp = dev->si_drv1; - struct ifnet *ifp = TUN2IFP(tp); - int revents = 0; - struct mbuf *m; - - TUNDEBUG(ifp, "tunpoll\n"); - - if (events & (POLLIN | POLLRDNORM)) { - IFQ_LOCK(&ifp->if_snd); - IFQ_POLL_NOLOCK(&ifp->if_snd, m); - if (m != NULL) { - TUNDEBUG(ifp, "tunpoll q=%d\n", ifp->if_snd.ifq_len); - revents |= events & (POLLIN | POLLRDNORM); - } else { - TUNDEBUG(ifp, "tunpoll waiting\n"); - selrecord(td, &tp->tun_rsel); - } - IFQ_UNLOCK(&ifp->if_snd); - } - if (events & (POLLOUT | POLLWRNORM)) - revents |= events & (POLLOUT | POLLWRNORM); - - return (revents); -} - -/* - * tunkqfilter - support for the kevent() system call. 
- */ -static int -tunkqfilter(struct cdev *dev, struct knote *kn) -{ - struct tun_softc *tp = dev->si_drv1; - struct ifnet *ifp = TUN2IFP(tp); - - switch(kn->kn_filter) { - case EVFILT_READ: - TUNDEBUG(ifp, "%s kqfilter: EVFILT_READ, minor = %#x\n", - ifp->if_xname, dev2unit(dev)); - kn->kn_fop = &tun_read_filterops; - break; - - case EVFILT_WRITE: - TUNDEBUG(ifp, "%s kqfilter: EVFILT_WRITE, minor = %#x\n", - ifp->if_xname, dev2unit(dev)); - kn->kn_fop = &tun_write_filterops; - break; - - default: - TUNDEBUG(ifp, "%s kqfilter: invalid filter, minor = %#x\n", - ifp->if_xname, dev2unit(dev)); - return(EINVAL); - } - - kn->kn_hook = tp; - knlist_add(&tp->tun_rsel.si_note, kn, 0); - - return (0); -} - -/* - * Return true of there is data in the interface queue. - */ -static int -tunkqread(struct knote *kn, long hint) -{ - int ret; - struct tun_softc *tp = kn->kn_hook; - struct cdev *dev = tp->tun_dev; - struct ifnet *ifp = TUN2IFP(tp); - - if ((kn->kn_data = ifp->if_snd.ifq_len) > 0) { - TUNDEBUG(ifp, - "%s have data in the queue. Len = %d, minor = %#x\n", - ifp->if_xname, ifp->if_snd.ifq_len, dev2unit(dev)); - ret = 1; - } else { - TUNDEBUG(ifp, - "%s waiting for data, minor = %#x\n", ifp->if_xname, - dev2unit(dev)); - ret = 0; - } - - return (ret); -} - -/* - * Always can write, always return MTU in kn->data. 
- */ -static int -tunkqwrite(struct knote *kn, long hint) -{ - struct tun_softc *tp = kn->kn_hook; - struct ifnet *ifp = TUN2IFP(tp); - - kn->kn_data = ifp->if_mtu; - - return (1); -} - -static void -tunkqdetach(struct knote *kn) -{ - struct tun_softc *tp = kn->kn_hook; - - knlist_remove(&tp->tun_rsel.si_note, kn, 0); -} diff --git a/freebsd/sys/net/if_tuntap.c b/freebsd/sys/net/if_tuntap.c new file mode 100644 index 00000000..e366aac7 --- /dev/null +++ b/freebsd/sys/net/if_tuntap.c @@ -0,0 +1,1923 @@ +#include <machine/rtems-bsd-kernel-space.h> + +/* $NetBSD: if_tun.c,v 1.14 1994/06/29 06:36:25 cgd Exp $ */ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (C) 1999-2000 by Maksim Yevmenkin <m_evmenkin@yahoo.com> + * All rights reserved. + * Copyright (c) 2019 Kyle Evans <kevans@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * BASED ON: + * ------------------------------------------------------------------------- + * + * Copyright (c) 1988, Julian Onions <jpo@cs.nott.ac.uk> + * Nottingham University 1987. + * + * This source may be freely distributed, however I would be interested + * in any changes that are made. + * + * This driver takes packets off the IP i/f and hands them up to a + * user process to have its wicked way with. This driver has it's + * roots in a similar driver written by Phil Cockcroft (formerly) at + * UCL. This driver is based much more on read/write/poll mode of + * operation though. 
+ * + * $FreeBSD$ + */ + +#include <rtems/bsd/local/opt_inet.h> +#include <rtems/bsd/local/opt_inet6.h> + +#include <sys/param.h> +#include <sys/lock.h> +#include <sys/priv.h> +#include <sys/proc.h> +#include <sys/systm.h> +#include <sys/jail.h> +#include <sys/mbuf.h> +#include <sys/module.h> +#include <sys/socket.h> +#include <sys/fcntl.h> +#include <sys/filio.h> +#include <sys/sockio.h> +#include <sys/sx.h> +#include <sys/syslog.h> +#include <sys/ttycom.h> +#include <sys/poll.h> +#include <sys/selinfo.h> +#include <sys/signalvar.h> +#include <sys/filedesc.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/conf.h> +#include <sys/uio.h> +#include <sys/malloc.h> +#include <sys/random.h> +#include <sys/ctype.h> + +#include <net/ethernet.h> +#include <net/if.h> +#include <net/if_var.h> +#include <net/if_clone.h> +#include <net/if_dl.h> +#include <net/if_media.h> +#include <net/if_types.h> +#include <net/netisr.h> +#include <net/route.h> +#include <net/vnet.h> +#ifdef INET +#include <netinet/in.h> +#endif +#include <net/bpf.h> +#include <net/if_tap.h> +#include <net/if_tun.h> + +#include <sys/queue.h> +#include <sys/condvar.h> +#include <security/mac/mac_framework.h> + +struct tuntap_driver; + +/* + * tun_list is protected by global tunmtx. Other mutable fields are + * protected by tun->tun_mtx, or by their owning subsystem. tun_dev is + * static for the duration of a tunnel interface. 
+ */ +struct tuntap_softc { + TAILQ_ENTRY(tuntap_softc) tun_list; + struct cdev *tun_alias; + struct cdev *tun_dev; + u_short tun_flags; /* misc flags */ +#define TUN_OPEN 0x0001 +#define TUN_INITED 0x0002 +#define TUN_UNUSED1 0x0008 +#define TUN_DSTADDR 0x0010 +#define TUN_LMODE 0x0020 +#define TUN_RWAIT 0x0040 +#define TUN_ASYNC 0x0080 +#define TUN_IFHEAD 0x0100 +#define TUN_DYING 0x0200 +#define TUN_L2 0x0400 +#define TUN_VMNET 0x0800 + +#define TUN_DRIVER_IDENT_MASK (TUN_L2 | TUN_VMNET) +#define TUN_READY (TUN_OPEN | TUN_INITED) + +#ifndef __rtems__ + pid_t tun_pid; /* owning pid */ +#endif /* __rtems__ */ + struct ifnet *tun_ifp; /* the interface */ + struct sigio *tun_sigio; /* async I/O info */ + struct tuntap_driver *tun_drv; /* appropriate driver */ + struct selinfo tun_rsel; /* read select */ + struct mtx tun_mtx; /* softc field mutex */ + struct cv tun_cv; /* for ref'd dev destroy */ + struct ether_addr tun_ether; /* remote address */ + int tun_busy; /* busy count */ +}; +#define TUN2IFP(sc) ((sc)->tun_ifp) + +#define TUNDEBUG if (tundebug) if_printf + +#define TUN_LOCK(tp) mtx_lock(&(tp)->tun_mtx) +#define TUN_UNLOCK(tp) mtx_unlock(&(tp)->tun_mtx) +#define TUN_LOCK_ASSERT(tp) mtx_assert(&(tp)->tun_mtx, MA_OWNED); + +#define TUN_VMIO_FLAG_MASK 0x0fff + +/* + * All mutable global variables in if_tun are locked using tunmtx, with + * the exception of tundebug, which is used unlocked, and the drivers' *clones, + * which are static after setup. 
+ */ +static struct mtx tunmtx; +static eventhandler_tag arrival_tag; +static eventhandler_tag clone_tag; +static const char tunname[] = "tun"; +static const char tapname[] = "tap"; +static const char vmnetname[] = "vmnet"; +static MALLOC_DEFINE(M_TUN, tunname, "Tunnel Interface"); +static int tundebug = 0; +static int tundclone = 1; +static int tap_allow_uopen = 0; /* allow user open() */ +static int tapuponopen = 0; /* IFF_UP on open() */ +static int tapdclone = 1; /* enable devfs cloning */ + +static TAILQ_HEAD(,tuntap_softc) tunhead = TAILQ_HEAD_INITIALIZER(tunhead); +SYSCTL_INT(_debug, OID_AUTO, if_tun_debug, CTLFLAG_RW, &tundebug, 0, ""); + +static struct sx tun_ioctl_sx; +SX_SYSINIT(tun_ioctl_sx, &tun_ioctl_sx, "tun_ioctl"); + +SYSCTL_DECL(_net_link); +/* tun */ +static SYSCTL_NODE(_net_link, OID_AUTO, tun, CTLFLAG_RW, 0, + "IP tunnel software network interface"); +SYSCTL_INT(_net_link_tun, OID_AUTO, devfs_cloning, CTLFLAG_RWTUN, &tundclone, 0, + "Enable legacy devfs interface creation"); + +/* tap */ +static SYSCTL_NODE(_net_link, OID_AUTO, tap, CTLFLAG_RW, 0, + "Ethernet tunnel software network interface"); +SYSCTL_INT(_net_link_tap, OID_AUTO, user_open, CTLFLAG_RW, &tap_allow_uopen, 0, + "Allow user to open /dev/tap (based on node permissions)"); +SYSCTL_INT(_net_link_tap, OID_AUTO, up_on_open, CTLFLAG_RW, &tapuponopen, 0, + "Bring interface up when /dev/tap is opened"); +SYSCTL_INT(_net_link_tap, OID_AUTO, devfs_cloning, CTLFLAG_RWTUN, &tapdclone, 0, + "Enable legacy devfs interface creation"); +SYSCTL_INT(_net_link_tap, OID_AUTO, debug, CTLFLAG_RW, &tundebug, 0, ""); + +static int tun_create_device(struct tuntap_driver *drv, int unit, + struct ucred *cr, struct cdev **dev, const char *name); +static int tun_busy_locked(struct tuntap_softc *tp); +static void tun_unbusy_locked(struct tuntap_softc *tp); +static int tun_busy(struct tuntap_softc *tp); +static void tun_unbusy(struct tuntap_softc *tp); + +static int tuntap_name2info(const char *name, int 
*unit, int *flags); +static void tunclone(void *arg, struct ucred *cred, char *name, + int namelen, struct cdev **dev); +static void tuncreate(struct cdev *dev); +static void tundtor(void *data); +static void tunrename(void *arg, struct ifnet *ifp); +static int tunifioctl(struct ifnet *, u_long, caddr_t); +static void tuninit(struct ifnet *); +static void tunifinit(void *xtp); +static int tuntapmodevent(module_t, int, void *); +static int tunoutput(struct ifnet *, struct mbuf *, + const struct sockaddr *, struct route *ro); +static void tunstart(struct ifnet *); +static void tunstart_l2(struct ifnet *); + +static int tun_clone_match(struct if_clone *ifc, const char *name); +static int tap_clone_match(struct if_clone *ifc, const char *name); +static int vmnet_clone_match(struct if_clone *ifc, const char *name); +static int tun_clone_create(struct if_clone *, char *, size_t, caddr_t); +static int tun_clone_destroy(struct if_clone *, struct ifnet *); + +static d_open_t tunopen; +static d_read_t tunread; +static d_write_t tunwrite; +static d_ioctl_t tunioctl; +static d_poll_t tunpoll; +static d_kqfilter_t tunkqfilter; + +static int tunkqread(struct knote *, long); +static int tunkqwrite(struct knote *, long); +static void tunkqdetach(struct knote *); + +static struct filterops tun_read_filterops = { + .f_isfd = 1, + .f_attach = NULL, + .f_detach = tunkqdetach, + .f_event = tunkqread, +}; + +static struct filterops tun_write_filterops = { + .f_isfd = 1, + .f_attach = NULL, + .f_detach = tunkqdetach, + .f_event = tunkqwrite, +}; + +static struct tuntap_driver { + struct cdevsw cdevsw; + int ident_flags; + struct unrhdr *unrhdr; + struct clonedevs *clones; + ifc_match_t *clone_match_fn; + ifc_create_t *clone_create_fn; + ifc_destroy_t *clone_destroy_fn; +} tuntap_drivers[] = { + { + .ident_flags = 0, + .cdevsw = { + .d_version = D_VERSION, + .d_flags = D_NEEDMINOR, + .d_open = tunopen, + .d_read = tunread, + .d_write = tunwrite, + .d_ioctl = tunioctl, + .d_poll = tunpoll, 
+ .d_kqfilter = tunkqfilter, + .d_name = tunname, + }, + .clone_match_fn = tun_clone_match, + .clone_create_fn = tun_clone_create, + .clone_destroy_fn = tun_clone_destroy, + }, + { + .ident_flags = TUN_L2, + .cdevsw = { + .d_version = D_VERSION, + .d_flags = D_NEEDMINOR, + .d_open = tunopen, + .d_read = tunread, + .d_write = tunwrite, + .d_ioctl = tunioctl, + .d_poll = tunpoll, + .d_kqfilter = tunkqfilter, + .d_name = tapname, + }, + .clone_match_fn = tap_clone_match, + .clone_create_fn = tun_clone_create, + .clone_destroy_fn = tun_clone_destroy, + }, + { + .ident_flags = TUN_L2 | TUN_VMNET, + .cdevsw = { + .d_version = D_VERSION, + .d_flags = D_NEEDMINOR, + .d_open = tunopen, + .d_read = tunread, + .d_write = tunwrite, + .d_ioctl = tunioctl, + .d_poll = tunpoll, + .d_kqfilter = tunkqfilter, + .d_name = vmnetname, + }, + .clone_match_fn = vmnet_clone_match, + .clone_create_fn = tun_clone_create, + .clone_destroy_fn = tun_clone_destroy, + }, +}; + +struct tuntap_driver_cloner { + SLIST_ENTRY(tuntap_driver_cloner) link; + struct tuntap_driver *drv; + struct if_clone *cloner; +}; + +VNET_DEFINE_STATIC(SLIST_HEAD(, tuntap_driver_cloner), tuntap_driver_cloners) = + SLIST_HEAD_INITIALIZER(tuntap_driver_cloners); + +#define V_tuntap_driver_cloners VNET(tuntap_driver_cloners) + +/* + * Mechanism for marking a tunnel device as busy so that we can safely do some + * orthogonal operations (such as operations on devices) without racing against + * tun_destroy. tun_destroy will wait on the condvar if we're at all busy or + * open, to be woken up when the condition is alleviated. + */ +static int +tun_busy_locked(struct tuntap_softc *tp) +{ + + TUN_LOCK_ASSERT(tp); + if ((tp->tun_flags & TUN_DYING) != 0) { + /* + * Perhaps unintuitive, but the device is busy going away. + * Other interpretations of EBUSY from tun_busy make little + * sense, since making a busy device even more busy doesn't + * sound like a problem. 
+ */ + return (EBUSY); + } + + ++tp->tun_busy; + return (0); +} + +static void +tun_unbusy_locked(struct tuntap_softc *tp) +{ + + TUN_LOCK_ASSERT(tp); + KASSERT(tp->tun_busy != 0, ("tun_unbusy: called for non-busy tunnel")); + + --tp->tun_busy; + /* Wake up anything that may be waiting on our busy tunnel. */ + if (tp->tun_busy == 0) + cv_broadcast(&tp->tun_cv); +} + +static int +tun_busy(struct tuntap_softc *tp) +{ + int ret; + + TUN_LOCK(tp); + ret = tun_busy_locked(tp); + TUN_UNLOCK(tp); + return (ret); +} + + +static void +tun_unbusy(struct tuntap_softc *tp) +{ + + TUN_LOCK(tp); + tun_unbusy_locked(tp); + TUN_UNLOCK(tp); +} + +/* + * Sets unit and/or flags given the device name. Must be called with correct + * vnet context. + */ +static int +tuntap_name2info(const char *name, int *outunit, int *outflags) +{ + struct tuntap_driver *drv; + struct tuntap_driver_cloner *drvc; + char *dname; + int flags, unit; + bool found; + + if (name == NULL) + return (EINVAL); + + /* + * Needed for dev_stdclone, but dev_stdclone will not modify, it just + * wants to be able to pass back a char * through the second param. We + * will always set that as NULL here, so we'll fake it. + */ + dname = __DECONST(char *, name); + found = false; + + KASSERT(!SLIST_EMPTY(&V_tuntap_driver_cloners), + ("tuntap_driver_cloners failed to initialize")); + SLIST_FOREACH(drvc, &V_tuntap_driver_cloners, link) { + KASSERT(drvc->drv != NULL, + ("tuntap_driver_cloners entry not properly initialized")); + drv = drvc->drv; + + if (strcmp(name, drv->cdevsw.d_name) == 0) { + found = true; + unit = -1; + flags = drv->ident_flags; + break; + } + + if (dev_stdclone(dname, NULL, drv->cdevsw.d_name, &unit) == 1) { + found = true; + flags = drv->ident_flags; + break; + } + } + + if (!found) + return (ENXIO); + + if (outunit != NULL) + *outunit = unit; + if (outflags != NULL) + *outflags = flags; + return (0); +} + +/* + * Get driver information from a set of flags specified. 
Masks the identifying + * part of the flags and compares it against all of the available + * tuntap_drivers. Must be called with correct vnet context. + */ +static struct tuntap_driver * +tuntap_driver_from_flags(int tun_flags) +{ + struct tuntap_driver *drv; + struct tuntap_driver_cloner *drvc; + + KASSERT(!SLIST_EMPTY(&V_tuntap_driver_cloners), + ("tuntap_driver_cloners failed to initialize")); + SLIST_FOREACH(drvc, &V_tuntap_driver_cloners, link) { + KASSERT(drvc->drv != NULL, + ("tuntap_driver_cloners entry not properly initialized")); + drv = drvc->drv; + if ((tun_flags & TUN_DRIVER_IDENT_MASK) == drv->ident_flags) + return (drv); + } + + return (NULL); +} + + + +static int +tun_clone_match(struct if_clone *ifc, const char *name) +{ + int tunflags; + + if (tuntap_name2info(name, NULL, &tunflags) == 0) { + if ((tunflags & TUN_L2) == 0) + return (1); + } + + return (0); +} + +static int +tap_clone_match(struct if_clone *ifc, const char *name) +{ + int tunflags; + + if (tuntap_name2info(name, NULL, &tunflags) == 0) { + if ((tunflags & (TUN_L2 | TUN_VMNET)) == TUN_L2) + return (1); + } + + return (0); +} + +static int +vmnet_clone_match(struct if_clone *ifc, const char *name) +{ + int tunflags; + + if (tuntap_name2info(name, NULL, &tunflags) == 0) { + if ((tunflags & TUN_VMNET) != 0) + return (1); + } + + return (0); +} + +static int +tun_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) +{ + struct tuntap_driver *drv; + struct cdev *dev; + int err, i, tunflags, unit; + + tunflags = 0; + /* The name here tells us exactly what we're creating */ + err = tuntap_name2info(name, &unit, &tunflags); + if (err != 0) + return (err); + + drv = tuntap_driver_from_flags(tunflags); + if (drv == NULL) + return (ENXIO); + + if (unit != -1) { + /* If this unit number is still available that's okay. 
*/ + if (alloc_unr_specific(drv->unrhdr, unit) == -1) + return (EEXIST); + } else { + unit = alloc_unr(drv->unrhdr); + } + + snprintf(name, IFNAMSIZ, "%s%d", drv->cdevsw.d_name, unit); + + /* find any existing device, or allocate new unit number */ + dev = NULL; + i = clone_create(&drv->clones, &drv->cdevsw, &unit, &dev, 0); + /* No preexisting struct cdev *, create one */ + if (i != 0) + i = tun_create_device(drv, unit, NULL, &dev, name); + if (i == 0) + tuncreate(dev); + + return (i); +} + +static void +tunclone(void *arg, struct ucred *cred, char *name, int namelen, + struct cdev **dev) +{ + char devname[SPECNAMELEN + 1]; + struct tuntap_driver *drv; + int append_unit, i, u, tunflags; + bool mayclone; + + if (*dev != NULL) + return; + + tunflags = 0; + CURVNET_SET(CRED_TO_VNET(cred)); + if (tuntap_name2info(name, &u, &tunflags) != 0) + goto out; /* Not recognized */ + + if (u != -1 && u > IF_MAXUNIT) + goto out; /* Unit number too high */ + + mayclone = priv_check_cred(cred, PRIV_NET_IFCREATE, 0) == 0; + if ((tunflags & TUN_L2) != 0) { + /* tap/vmnet allow user open with a sysctl */ + mayclone = (mayclone || tap_allow_uopen) && tapdclone; + } else { + mayclone = mayclone && tundclone; + } + + /* + * If tun cloning is enabled, only the superuser can create an + * interface. 
+ */ + if (!mayclone) + goto out; + + if (u == -1) + append_unit = 1; + else + append_unit = 0; + + drv = tuntap_driver_from_flags(tunflags); + if (drv == NULL) + goto out; + + /* find any existing device, or allocate new unit number */ + i = clone_create(&drv->clones, &drv->cdevsw, &u, dev, 0); + if (i) { + if (append_unit) { + namelen = snprintf(devname, sizeof(devname), "%s%d", + name, u); + name = devname; + } + + i = tun_create_device(drv, u, cred, dev, name); + } + if (i == 0) + if_clone_create(name, namelen, NULL); +out: + CURVNET_RESTORE(); +} + +static void +tun_destroy(struct tuntap_softc *tp) +{ + + TUN_LOCK(tp); + tp->tun_flags |= TUN_DYING; + if (tp->tun_busy != 0) + cv_wait_unlock(&tp->tun_cv, &tp->tun_mtx); + else + TUN_UNLOCK(tp); + + CURVNET_SET(TUN2IFP(tp)->if_vnet); + + /* destroy_dev will take care of any alias. */ + destroy_dev(tp->tun_dev); + seldrain(&tp->tun_rsel); + knlist_clear(&tp->tun_rsel.si_note, 0); + knlist_destroy(&tp->tun_rsel.si_note); + if ((tp->tun_flags & TUN_L2) != 0) { + ether_ifdetach(TUN2IFP(tp)); + } else { + bpfdetach(TUN2IFP(tp)); + if_detach(TUN2IFP(tp)); + } + sx_xlock(&tun_ioctl_sx); + TUN2IFP(tp)->if_softc = NULL; + sx_xunlock(&tun_ioctl_sx); + free_unr(tp->tun_drv->unrhdr, TUN2IFP(tp)->if_dunit); + if_free(TUN2IFP(tp)); + mtx_destroy(&tp->tun_mtx); + cv_destroy(&tp->tun_cv); + free(tp, M_TUN); + CURVNET_RESTORE(); +} + +static int +tun_clone_destroy(struct if_clone *ifc __unused, struct ifnet *ifp) +{ + struct tuntap_softc *tp = ifp->if_softc; + + mtx_lock(&tunmtx); + TAILQ_REMOVE(&tunhead, tp, tun_list); + mtx_unlock(&tunmtx); + tun_destroy(tp); + + return (0); +} + +static void +vnet_tun_init(const void *unused __unused) +{ + struct tuntap_driver *drv; + struct tuntap_driver_cloner *drvc; + int i; + + for (i = 0; i < nitems(tuntap_drivers); ++i) { + drv = &tuntap_drivers[i]; + drvc = malloc(sizeof(*drvc), M_TUN, M_WAITOK | M_ZERO); + + drvc->drv = drv; + drvc->cloner = if_clone_advanced(drv->cdevsw.d_name, 0, + 
drv->clone_match_fn, drv->clone_create_fn, + drv->clone_destroy_fn); + SLIST_INSERT_HEAD(&V_tuntap_driver_cloners, drvc, link); + }; +} +VNET_SYSINIT(vnet_tun_init, SI_SUB_PROTO_IF, SI_ORDER_ANY, + vnet_tun_init, NULL); + +static void +vnet_tun_uninit(const void *unused __unused) +{ + struct tuntap_driver_cloner *drvc; + + while (!SLIST_EMPTY(&V_tuntap_driver_cloners)) { + drvc = SLIST_FIRST(&V_tuntap_driver_cloners); + SLIST_REMOVE_HEAD(&V_tuntap_driver_cloners, link); + + if_clone_detach(drvc->cloner); + free(drvc, M_TUN); + } +} +VNET_SYSUNINIT(vnet_tun_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY, + vnet_tun_uninit, NULL); + +static void +tun_uninit(const void *unused __unused) +{ + struct tuntap_driver *drv; + struct tuntap_softc *tp; + int i; + + EVENTHANDLER_DEREGISTER(ifnet_arrival_event, arrival_tag); + EVENTHANDLER_DEREGISTER(dev_clone, clone_tag); + drain_dev_clone_events(); + + mtx_lock(&tunmtx); + while ((tp = TAILQ_FIRST(&tunhead)) != NULL) { + TAILQ_REMOVE(&tunhead, tp, tun_list); + mtx_unlock(&tunmtx); + tun_destroy(tp); + mtx_lock(&tunmtx); + } + mtx_unlock(&tunmtx); + for (i = 0; i < nitems(tuntap_drivers); ++i) { + drv = &tuntap_drivers[i]; + delete_unrhdr(drv->unrhdr); + clone_cleanup(&drv->clones); + } + mtx_destroy(&tunmtx); +} +SYSUNINIT(tun_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY, tun_uninit, NULL); + +static struct tuntap_driver * +tuntap_driver_from_ifnet(const struct ifnet *ifp) +{ + struct tuntap_driver *drv; + int i; + + if (ifp == NULL) + return (NULL); + + for (i = 0; i < nitems(tuntap_drivers); ++i) { + drv = &tuntap_drivers[i]; + if (strcmp(ifp->if_dname, drv->cdevsw.d_name) == 0) + return (drv); + } + + return (NULL); +} + +static int +tuntapmodevent(module_t mod, int type, void *data) +{ + struct tuntap_driver *drv; + int i; + + switch (type) { + case MOD_LOAD: + mtx_init(&tunmtx, "tunmtx", NULL, MTX_DEF); + for (i = 0; i < nitems(tuntap_drivers); ++i) { + drv = &tuntap_drivers[i]; + clone_setup(&drv->clones); + drv->unrhdr = new_unrhdr(0, 
IF_MAXUNIT, &tunmtx); + } + arrival_tag = EVENTHANDLER_REGISTER(ifnet_arrival_event, + tunrename, 0, 1000); + if (arrival_tag == NULL) + return (ENOMEM); + clone_tag = EVENTHANDLER_REGISTER(dev_clone, tunclone, 0, 1000); + if (clone_tag == NULL) + return (ENOMEM); + break; + case MOD_UNLOAD: + /* See tun_uninit, so it's done after the vnet_sysuninit() */ + break; + default: + return EOPNOTSUPP; + } + return 0; +} + +static moduledata_t tuntap_mod = { + "if_tuntap", + tuntapmodevent, + 0 +}; + +/* We'll only ever have these two, so no need for a macro. */ +static moduledata_t tun_mod = { "if_tun", NULL, 0 }; +static moduledata_t tap_mod = { "if_tap", NULL, 0 }; + +DECLARE_MODULE(if_tuntap, tuntap_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); +MODULE_VERSION(if_tuntap, 1); +DECLARE_MODULE(if_tun, tun_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); +MODULE_VERSION(if_tun, 1); +DECLARE_MODULE(if_tap, tap_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); +MODULE_VERSION(if_tap, 1); + +static int +tun_create_device(struct tuntap_driver *drv, int unit, struct ucred *cr, + struct cdev **dev, const char *name) +{ + struct make_dev_args args; + struct tuntap_softc *tp; + int error; + + tp = malloc(sizeof(*tp), M_TUN, M_WAITOK | M_ZERO); + mtx_init(&tp->tun_mtx, "tun_mtx", NULL, MTX_DEF); + cv_init(&tp->tun_cv, "tun_condvar"); + tp->tun_flags = drv->ident_flags; + tp->tun_drv = drv; + + make_dev_args_init(&args); + if (cr != NULL) + args.mda_flags = MAKEDEV_REF; + args.mda_devsw = &drv->cdevsw; + args.mda_cr = cr; + args.mda_uid = UID_UUCP; + args.mda_gid = GID_DIALER; + args.mda_mode = 0600; + args.mda_unit = unit; + args.mda_si_drv1 = tp; + error = make_dev_s(&args, dev, "%s", name); + if (error != 0) { + free(tp, M_TUN); + return (error); + } + + KASSERT((*dev)->si_drv1 != NULL, + ("Failed to set si_drv1 at %s creation", name)); + tp->tun_dev = *dev; + knlist_init_mtx(&tp->tun_rsel.si_note, &tp->tun_mtx); + mtx_lock(&tunmtx); + TAILQ_INSERT_TAIL(&tunhead, tp, tun_list); + mtx_unlock(&tunmtx); + return (0); +} + 
+static void +tunstart(struct ifnet *ifp) +{ + struct tuntap_softc *tp = ifp->if_softc; + struct mbuf *m; + + TUNDEBUG(ifp, "starting\n"); + if (ALTQ_IS_ENABLED(&ifp->if_snd)) { + IFQ_LOCK(&ifp->if_snd); + IFQ_POLL_NOLOCK(&ifp->if_snd, m); + if (m == NULL) { + IFQ_UNLOCK(&ifp->if_snd); + return; + } + IFQ_UNLOCK(&ifp->if_snd); + } + + TUN_LOCK(tp); + if (tp->tun_flags & TUN_RWAIT) { + tp->tun_flags &= ~TUN_RWAIT; + wakeup(tp); + } + selwakeuppri(&tp->tun_rsel, PZERO + 1); + KNOTE_LOCKED(&tp->tun_rsel.si_note, 0); + if (tp->tun_flags & TUN_ASYNC && tp->tun_sigio) { + TUN_UNLOCK(tp); + pgsigio(&tp->tun_sigio, SIGIO, 0); + } else + TUN_UNLOCK(tp); +} + +/* + * tunstart_l2 + * + * queue packets from higher level ready to put out + */ +static void +tunstart_l2(struct ifnet *ifp) +{ + struct tuntap_softc *tp = ifp->if_softc; + + TUNDEBUG(ifp, "starting\n"); + + /* + * do not junk pending output if we are in VMnet mode. + * XXX: can this do any harm because of queue overflow? + */ + + TUN_LOCK(tp); + if (((tp->tun_flags & TUN_VMNET) == 0) && + ((tp->tun_flags & TUN_READY) != TUN_READY)) { + struct mbuf *m; + + /* Unlocked read. 
*/ + TUNDEBUG(ifp, "not ready, tun_flags = 0x%x\n", tp->tun_flags); + + for (;;) { + IF_DEQUEUE(&ifp->if_snd, m); + if (m != NULL) { + m_freem(m); + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); + } else + break; + } + TUN_UNLOCK(tp); + + return; + } + + ifp->if_drv_flags |= IFF_DRV_OACTIVE; + + if (!IFQ_IS_EMPTY(&ifp->if_snd)) { + if (tp->tun_flags & TUN_RWAIT) { + tp->tun_flags &= ~TUN_RWAIT; + wakeup(tp); + } + + if ((tp->tun_flags & TUN_ASYNC) && (tp->tun_sigio != NULL)) { + TUN_UNLOCK(tp); + pgsigio(&tp->tun_sigio, SIGIO, 0); + TUN_LOCK(tp); + } + + selwakeuppri(&tp->tun_rsel, PZERO+1); + KNOTE_LOCKED(&tp->tun_rsel.si_note, 0); + if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); /* obytes are counted in ether_output */ + } + + ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; + TUN_UNLOCK(tp); +} /* tunstart_l2 */ + +/* XXX: should return an error code so it can fail. */ +static void +tuncreate(struct cdev *dev) +{ + struct tuntap_driver *drv; + struct tuntap_softc *tp; + struct ifnet *ifp; + struct ether_addr eaddr; + int iflags; + u_char type; + + tp = dev->si_drv1; + KASSERT(tp != NULL, + ("si_drv1 should have been initialized at creation")); + + drv = tp->tun_drv; + iflags = IFF_MULTICAST; + if ((tp->tun_flags & TUN_L2) != 0) { + type = IFT_ETHER; + iflags |= IFF_BROADCAST | IFF_SIMPLEX; + } else { + type = IFT_PPP; + iflags |= IFF_POINTOPOINT; + } + ifp = tp->tun_ifp = if_alloc(type); + if (ifp == NULL) + panic("%s%d: failed to if_alloc() interface.\n", + drv->cdevsw.d_name, dev2unit(dev)); + ifp->if_softc = tp; + if_initname(ifp, drv->cdevsw.d_name, dev2unit(dev)); + ifp->if_ioctl = tunifioctl; + ifp->if_flags = iflags; + IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen); + ifp->if_capabilities |= IFCAP_LINKSTATE; + ifp->if_capenable |= IFCAP_LINKSTATE; + + if ((tp->tun_flags & TUN_L2) != 0) { + ifp->if_mtu = ETHERMTU; + ifp->if_init = tunifinit; + ifp->if_start = tunstart_l2; + + ether_gen_addr(ifp, &eaddr); + ether_ifattach(ifp, eaddr.octet); + } else { + ifp->if_mtu = TUNMTU; + 
ifp->if_start = tunstart; + ifp->if_output = tunoutput; + + ifp->if_snd.ifq_drv_maxlen = 0; + IFQ_SET_READY(&ifp->if_snd); + + if_attach(ifp); + bpfattach(ifp, DLT_NULL, sizeof(u_int32_t)); + } + + TUN_LOCK(tp); + tp->tun_flags |= TUN_INITED; + TUN_UNLOCK(tp); + + TUNDEBUG(ifp, "interface %s is created, minor = %#x\n", + ifp->if_xname, dev2unit(dev)); +} + +static void +tunrename(void *arg __unused, struct ifnet *ifp) +{ + struct tuntap_softc *tp; + int error; + + if ((ifp->if_flags & IFF_RENAMING) == 0) + return; + + if (tuntap_driver_from_ifnet(ifp) == NULL) + return; + + /* + * We need to grab the ioctl sx long enough to make sure the softc is + * still there. If it is, we can safely try to busy the tun device. + * The busy may fail if the device is currently dying, in which case + * we do nothing. If it doesn't fail, the busy count stops the device + * from dying until we've created the alias (that will then be + * subsequently destroyed). + */ + sx_xlock(&tun_ioctl_sx); + tp = ifp->if_softc; + if (tp == NULL) { + sx_xunlock(&tun_ioctl_sx); + return; + } + error = tun_busy(tp); + sx_xunlock(&tun_ioctl_sx); + if (error != 0) + return; + if (tp->tun_alias != NULL) { + destroy_dev(tp->tun_alias); + tp->tun_alias = NULL; + } + + if (strcmp(ifp->if_xname, tp->tun_dev->si_name) == 0) + goto out; + + /* + * Failure's ok, aliases are created on a best effort basis. If a + * tun user/consumer decides to rename the interface to conflict with + * another device (non-ifnet) on the system, we will assume they know + * what they are doing. make_dev_alias_p won't touch tun_alias on + * failure, so we use it but ignore the return value. 
+ */ + make_dev_alias_p(MAKEDEV_CHECKNAME, &tp->tun_alias, tp->tun_dev, "%s", + ifp->if_xname); +out: + tun_unbusy(tp); +} + +static int +tunopen(struct cdev *dev, int flag, int mode, struct thread *td) +{ + struct ifnet *ifp; + struct tuntap_softc *tp; + int error, tunflags; + + tunflags = 0; + CURVNET_SET(TD_TO_VNET(td)); + error = tuntap_name2info(dev->si_name, NULL, &tunflags); + if (error != 0) { + CURVNET_RESTORE(); + return (error); /* Shouldn't happen */ + } + + if ((tunflags & TUN_L2) != 0) { + /* Restrict? */ + if (tap_allow_uopen == 0) { + error = priv_check(td, PRIV_NET_TAP); + if (error != 0) { + CURVNET_RESTORE(); + return (error); + } + } + } + + tp = dev->si_drv1; + KASSERT(tp != NULL, + ("si_drv1 should have been initialized at creation")); + + TUN_LOCK(tp); + if ((tp->tun_flags & TUN_INITED) == 0) { + TUN_UNLOCK(tp); + CURVNET_RESTORE(); + return (ENXIO); + } + if ((tp->tun_flags & (TUN_OPEN | TUN_DYING)) != 0) { + TUN_UNLOCK(tp); + CURVNET_RESTORE(); + return (EBUSY); + } + + error = tun_busy_locked(tp); + KASSERT(error == 0, ("Must be able to busy an unopen tunnel")); + ifp = TUN2IFP(tp); + + if ((tp->tun_flags & TUN_L2) != 0) { + bcopy(IF_LLADDR(ifp), tp->tun_ether.octet, + sizeof(tp->tun_ether.octet)); + + ifp->if_drv_flags |= IFF_DRV_RUNNING; + ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; + + if (tapuponopen) + ifp->if_flags |= IFF_UP; + } + +#ifndef __rtems__ + tp->tun_pid = td->td_proc->p_pid; +#endif /* __rtems__ */ + tp->tun_flags |= TUN_OPEN; + + if_link_state_change(ifp, LINK_STATE_UP); + TUNDEBUG(ifp, "open\n"); + TUN_UNLOCK(tp); + + /* + * This can fail with either ENOENT or EBUSY. This is in the middle of + * d_open, so ENOENT should not be possible. EBUSY is possible, but + * the only cdevpriv dtor being set will be tundtor and the softc being + * passed is constant for a given cdev. We ignore the possible error + * because of this as either "unlikely" or "not actually a problem." 
+ */ + (void)devfs_set_cdevpriv(tp, tundtor); + CURVNET_RESTORE(); + return (0); +} + +/* + * tundtor - tear down the device - mark i/f down & delete + * routing info + */ +static void +tundtor(void *data) +{ +#ifndef __rtems__ + struct proc *p; +#endif /* __rtems__ */ + struct tuntap_softc *tp; + struct ifnet *ifp; + bool l2tun; + + tp = data; +#ifndef __rtems__ + p = curproc; +#endif /* __rtems__ */ + ifp = TUN2IFP(tp); + + TUN_LOCK(tp); + +#ifndef __rtems__ + /* + * Realistically, we can't be obstinate here. This only means that the + * tuntap device was closed out of order, and the last closer wasn't the + * controller. These are still good to know about, though, as software + * should avoid multiple processes with a tuntap device open and + * ill-defined transfer of control (e.g., handoff, TUNSIFPID, close in + * parent). + */ + if (p->p_pid != tp->tun_pid) { + log(LOG_INFO, + "pid %d (%s), %s: tun/tap protocol violation, non-controlling process closed last.\n", + p->p_pid, p->p_comm, tp->tun_dev->si_name); + } +#endif /* __rtems__ */ + + /* + * junk all pending output + */ + CURVNET_SET(ifp->if_vnet); + + l2tun = false; + if ((tp->tun_flags & TUN_L2) != 0) { + l2tun = true; + IF_DRAIN(&ifp->if_snd); + } else { + IFQ_PURGE(&ifp->if_snd); + } + + /* For vmnet, we won't do most of the address/route bits */ + if ((tp->tun_flags & TUN_VMNET) != 0 || + (l2tun && (ifp->if_flags & IFF_LINK0) != 0)) + goto out; + + if (ifp->if_flags & IFF_UP) { + TUN_UNLOCK(tp); + if_down(ifp); + TUN_LOCK(tp); + } + + /* Delete all addresses and routes which reference this interface. */ + if (ifp->if_drv_flags & IFF_DRV_RUNNING) { + struct ifaddr *ifa; + + ifp->if_drv_flags &= ~IFF_DRV_RUNNING; + TUN_UNLOCK(tp); + CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + /* deal w/IPv4 PtP destination; unlocked read */ + if (!l2tun && ifa->ifa_addr->sa_family == AF_INET) { + rtinit(ifa, (int)RTM_DELETE, + tp->tun_flags & TUN_DSTADDR ? 
RTF_HOST : 0); + } else { + rtinit(ifa, (int)RTM_DELETE, 0); + } + } + if_purgeaddrs(ifp); + TUN_LOCK(tp); + } + +out: + if_link_state_change(ifp, LINK_STATE_DOWN); + CURVNET_RESTORE(); + + funsetown(&tp->tun_sigio); + selwakeuppri(&tp->tun_rsel, PZERO + 1); + KNOTE_LOCKED(&tp->tun_rsel.si_note, 0); + TUNDEBUG (ifp, "closed\n"); + tp->tun_flags &= ~TUN_OPEN; +#ifndef __rtems__ + tp->tun_pid = 0; +#endif /* __rtems__ */ + + tun_unbusy_locked(tp); + TUN_UNLOCK(tp); +} + +static void +tuninit(struct ifnet *ifp) +{ + struct tuntap_softc *tp = ifp->if_softc; +#ifdef INET + struct ifaddr *ifa; +#endif + + TUNDEBUG(ifp, "tuninit\n"); + + TUN_LOCK(tp); + ifp->if_drv_flags |= IFF_DRV_RUNNING; + if ((tp->tun_flags & TUN_L2) == 0) { + ifp->if_flags |= IFF_UP; + getmicrotime(&ifp->if_lastchange); +#ifdef INET + if_addr_rlock(ifp); + CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + if (ifa->ifa_addr->sa_family == AF_INET) { + struct sockaddr_in *si; + + si = (struct sockaddr_in *)ifa->ifa_dstaddr; + if (si && si->sin_addr.s_addr) { + tp->tun_flags |= TUN_DSTADDR; + break; + } + } + } + if_addr_runlock(ifp); +#endif + TUN_UNLOCK(tp); + } else { + ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; + TUN_UNLOCK(tp); + /* attempt to start output */ + tunstart_l2(ifp); + } + +} + +/* + * Used only for l2 tunnel. + */ +static void +tunifinit(void *xtp) +{ + struct tuntap_softc *tp; + + tp = (struct tuntap_softc *)xtp; + tuninit(tp->tun_ifp); +} + +/* + * Process an ioctl request. 
+ */ +static int +tunifioctl(struct ifnet *ifp, u_long cmd, caddr_t data) +{ + struct ifreq *ifr = (struct ifreq *)data; + struct tuntap_softc *tp; + struct ifstat *ifs; + struct ifmediareq *ifmr; + int dummy, error = 0; + bool l2tun; + + ifmr = NULL; + sx_xlock(&tun_ioctl_sx); + tp = ifp->if_softc; + if (tp == NULL) { + error = ENXIO; + goto bad; + } + l2tun = (tp->tun_flags & TUN_L2) != 0; + switch(cmd) { + case SIOCGIFSTATUS: + ifs = (struct ifstat *)data; + TUN_LOCK(tp); +#ifndef __rtems__ + if (tp->tun_pid) + snprintf(ifs->ascii, sizeof(ifs->ascii), + "\tOpened by PID %d\n", tp->tun_pid); + else +#endif /* __rtems__ */ + ifs->ascii[0] = '\0'; + TUN_UNLOCK(tp); + break; + case SIOCSIFADDR: + if (l2tun) + error = ether_ioctl(ifp, cmd, data); + else + tuninit(ifp); + if (error == 0) + TUNDEBUG(ifp, "address set\n"); + break; + case SIOCSIFMTU: + ifp->if_mtu = ifr->ifr_mtu; + TUNDEBUG(ifp, "mtu set\n"); + break; + case SIOCSIFFLAGS: + case SIOCADDMULTI: + case SIOCDELMULTI: + break; + case SIOCGIFMEDIA: + if (!l2tun) { + error = EINVAL; + break; + } + + ifmr = (struct ifmediareq *)data; + dummy = ifmr->ifm_count; + ifmr->ifm_count = 1; + ifmr->ifm_status = IFM_AVALID; + ifmr->ifm_active = IFM_ETHER; + if (tp->tun_flags & TUN_OPEN) + ifmr->ifm_status |= IFM_ACTIVE; + ifmr->ifm_current = ifmr->ifm_active; + if (dummy >= 1) { + int media = IFM_ETHER; + error = copyout(&media, ifmr->ifm_ulist, sizeof(int)); + } + break; + default: + if (l2tun) { + error = ether_ioctl(ifp, cmd, data); + } else { + error = EINVAL; + } + } +bad: + sx_xunlock(&tun_ioctl_sx); + return (error); +} + +/* + * tunoutput - queue packets from higher level ready to put out. 
+ */ +static int +tunoutput(struct ifnet *ifp, struct mbuf *m0, const struct sockaddr *dst, + struct route *ro) +{ + struct tuntap_softc *tp = ifp->if_softc; + u_short cached_tun_flags; + int error; + u_int32_t af; + + TUNDEBUG (ifp, "tunoutput\n"); + +#ifdef MAC + error = mac_ifnet_check_transmit(ifp, m0); + if (error) { + m_freem(m0); + return (error); + } +#endif + + /* Could be unlocked read? */ + TUN_LOCK(tp); + cached_tun_flags = tp->tun_flags; + TUN_UNLOCK(tp); + if ((cached_tun_flags & TUN_READY) != TUN_READY) { + TUNDEBUG (ifp, "not ready 0%o\n", tp->tun_flags); + m_freem (m0); + return (EHOSTDOWN); + } + + if ((ifp->if_flags & IFF_UP) != IFF_UP) { + m_freem (m0); + return (EHOSTDOWN); + } + + /* BPF writes need to be handled specially. */ + if (dst->sa_family == AF_UNSPEC) + bcopy(dst->sa_data, &af, sizeof(af)); + else + af = dst->sa_family; + + if (bpf_peers_present(ifp->if_bpf)) + bpf_mtap2(ifp->if_bpf, &af, sizeof(af), m0); + + /* prepend sockaddr? this may abort if the mbuf allocation fails */ + if (cached_tun_flags & TUN_LMODE) { + /* allocate space for sockaddr */ + M_PREPEND(m0, dst->sa_len, M_NOWAIT); + + /* if allocation failed drop packet */ + if (m0 == NULL) { + if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); + return (ENOBUFS); + } else { + bcopy(dst, m0->m_data, dst->sa_len); + } + } + + if (cached_tun_flags & TUN_IFHEAD) { + /* Prepend the address family */ + M_PREPEND(m0, 4, M_NOWAIT); + + /* if allocation failed drop packet */ + if (m0 == NULL) { + if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); + return (ENOBUFS); + } else + *(u_int32_t *)m0->m_data = htonl(af); + } else { +#ifdef INET + if (af != AF_INET) +#endif + { + m_freem(m0); + return (EAFNOSUPPORT); + } + } + + error = (ifp->if_transmit)(ifp, m0); + if (error) + return (ENOBUFS); + if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); + return (0); +} + +/* + * the cdevsw interface is now pretty minimal. 
+ */ +static int +tunioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, + struct thread *td) +{ + struct ifreq ifr, *ifrp; + struct tuntap_softc *tp = dev->si_drv1; + struct tuninfo *tunp; + int error, iflags; +#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \ + defined(COMPAT_FREEBSD4) + int ival; +#endif + bool l2tun; + + l2tun = (tp->tun_flags & TUN_L2) != 0; + if (l2tun) { + /* tap specific ioctls */ + switch(cmd) { + /* VMware/VMnet port ioctl's */ +#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \ + defined(COMPAT_FREEBSD4) + case _IO('V', 0): + ival = IOCPARM_IVAL(data); + data = (caddr_t)&ival; + /* FALLTHROUGH */ +#endif + case VMIO_SIOCSIFFLAGS: /* VMware/VMnet SIOCSIFFLAGS */ + iflags = *(int *)data; + iflags &= TUN_VMIO_FLAG_MASK; + iflags &= ~IFF_CANTCHANGE; + iflags |= IFF_UP; + + TUN_LOCK(tp); + TUN2IFP(tp)->if_flags = iflags | + (TUN2IFP(tp)->if_flags & IFF_CANTCHANGE); + TUN_UNLOCK(tp); + + return (0); + case SIOCGIFADDR: /* get MAC address of the remote side */ + TUN_LOCK(tp); + bcopy(&tp->tun_ether.octet, data, + sizeof(tp->tun_ether.octet)); + TUN_UNLOCK(tp); + + return (0); + case SIOCSIFADDR: /* set MAC address of the remote side */ + TUN_LOCK(tp); + bcopy(data, &tp->tun_ether.octet, + sizeof(tp->tun_ether.octet)); + TUN_UNLOCK(tp); + + return (0); + } + + /* Fall through to the common ioctls if unhandled */ + } else { + switch (cmd) { + case TUNSLMODE: + TUN_LOCK(tp); + if (*(int *)data) { + tp->tun_flags |= TUN_LMODE; + tp->tun_flags &= ~TUN_IFHEAD; + } else + tp->tun_flags &= ~TUN_LMODE; + TUN_UNLOCK(tp); + + return (0); + case TUNSIFHEAD: + TUN_LOCK(tp); + if (*(int *)data) { + tp->tun_flags |= TUN_IFHEAD; + tp->tun_flags &= ~TUN_LMODE; + } else + tp->tun_flags &= ~TUN_IFHEAD; + TUN_UNLOCK(tp); + + return (0); + case TUNGIFHEAD: + TUN_LOCK(tp); + *(int *)data = (tp->tun_flags & TUN_IFHEAD) ? 
1 : 0; + TUN_UNLOCK(tp); + + return (0); + case TUNSIFMODE: + /* deny this if UP */ + if (TUN2IFP(tp)->if_flags & IFF_UP) + return (EBUSY); + + switch (*(int *)data & ~IFF_MULTICAST) { + case IFF_POINTOPOINT: + case IFF_BROADCAST: + TUN_LOCK(tp); + TUN2IFP(tp)->if_flags &= + ~(IFF_BROADCAST|IFF_POINTOPOINT|IFF_MULTICAST); + TUN2IFP(tp)->if_flags |= *(int *)data; + TUN_UNLOCK(tp); + + break; + default: + return (EINVAL); + } + + return (0); + case TUNSIFPID: +#ifndef __rtems__ + TUN_LOCK(tp); + tp->tun_pid = curthread->td_proc->p_pid; + TUN_UNLOCK(tp); +#endif /* __rtems__ */ + + return (0); + } + /* Fall through to the common ioctls if unhandled */ + } + + switch (cmd) { + case TUNGIFNAME: + ifrp = (struct ifreq *)data; + strlcpy(ifrp->ifr_name, TUN2IFP(tp)->if_xname, IFNAMSIZ); + + return (0); + case TUNSIFINFO: + tunp = (struct tuninfo *)data; + if (TUN2IFP(tp)->if_type != tunp->type) + return (EPROTOTYPE); + TUN_LOCK(tp); + if (TUN2IFP(tp)->if_mtu != tunp->mtu) { + strlcpy(ifr.ifr_name, if_name(TUN2IFP(tp)), IFNAMSIZ); + ifr.ifr_mtu = tunp->mtu; + CURVNET_SET(TUN2IFP(tp)->if_vnet); + error = ifhwioctl(SIOCSIFMTU, TUN2IFP(tp), + (caddr_t)&ifr, td); + CURVNET_RESTORE(); + if (error) { + TUN_UNLOCK(tp); + return (error); + } + } + TUN2IFP(tp)->if_baudrate = tunp->baudrate; + TUN_UNLOCK(tp); + break; + case TUNGIFINFO: + tunp = (struct tuninfo *)data; + TUN_LOCK(tp); + tunp->mtu = TUN2IFP(tp)->if_mtu; + tunp->type = TUN2IFP(tp)->if_type; + tunp->baudrate = TUN2IFP(tp)->if_baudrate; + TUN_UNLOCK(tp); + break; + case TUNSDEBUG: + tundebug = *(int *)data; + break; + case TUNGDEBUG: + *(int *)data = tundebug; + break; + case FIONBIO: + break; + case FIOASYNC: + TUN_LOCK(tp); + if (*(int *)data) + tp->tun_flags |= TUN_ASYNC; + else + tp->tun_flags &= ~TUN_ASYNC; + TUN_UNLOCK(tp); + break; + case FIONREAD: + if (!IFQ_IS_EMPTY(&TUN2IFP(tp)->if_snd)) { + struct mbuf *mb; + IFQ_LOCK(&TUN2IFP(tp)->if_snd); + IFQ_POLL_NOLOCK(&TUN2IFP(tp)->if_snd, mb); + for (*(int *)data = 0; 
mb != NULL; mb = mb->m_next) + *(int *)data += mb->m_len; + IFQ_UNLOCK(&TUN2IFP(tp)->if_snd); + } else + *(int *)data = 0; + break; + case FIOSETOWN: + return (fsetown(*(int *)data, &tp->tun_sigio)); + + case FIOGETOWN: + *(int *)data = fgetown(&tp->tun_sigio); + return (0); + + /* This is deprecated, FIOSETOWN should be used instead. */ + case TIOCSPGRP: + return (fsetown(-(*(int *)data), &tp->tun_sigio)); + + /* This is deprecated, FIOGETOWN should be used instead. */ + case TIOCGPGRP: + *(int *)data = -fgetown(&tp->tun_sigio); + return (0); + + default: + return (ENOTTY); + } + return (0); +} + +/* + * The cdevsw read interface - reads a packet at a time, or at + * least as much of a packet as can be read. + */ +static int +tunread(struct cdev *dev, struct uio *uio, int flag) +{ + struct tuntap_softc *tp = dev->si_drv1; + struct ifnet *ifp = TUN2IFP(tp); + struct mbuf *m; + int error=0, len; + + TUNDEBUG (ifp, "read\n"); + TUN_LOCK(tp); + if ((tp->tun_flags & TUN_READY) != TUN_READY) { + TUN_UNLOCK(tp); + TUNDEBUG (ifp, "not ready 0%o\n", tp->tun_flags); + return (EHOSTDOWN); + } + + tp->tun_flags &= ~TUN_RWAIT; + + for (;;) { + IFQ_DEQUEUE(&ifp->if_snd, m); + if (m != NULL) + break; + if (flag & O_NONBLOCK) { + TUN_UNLOCK(tp); + return (EWOULDBLOCK); + } + tp->tun_flags |= TUN_RWAIT; + error = mtx_sleep(tp, &tp->tun_mtx, PCATCH | (PZERO + 1), + "tunread", 0); + if (error != 0) { + TUN_UNLOCK(tp); + return (error); + } + } + TUN_UNLOCK(tp); + + if ((tp->tun_flags & TUN_L2) != 0) + BPF_MTAP(ifp, m); + + while (m && uio->uio_resid > 0 && error == 0) { + len = min(uio->uio_resid, m->m_len); + if (len != 0) + error = uiomove(mtod(m, void *), len, uio); + m = m_free(m); + } + + if (m) { + TUNDEBUG(ifp, "Dropping mbuf\n"); + m_freem(m); + } + return (error); +} + +static int +tunwrite_l2(struct tuntap_softc *tp, struct mbuf *m) +{ + struct ether_header *eh; + struct ifnet *ifp; + + ifp = TUN2IFP(tp); + + /* + * Only pass a unicast frame to ether_input(), if it would + 
* actually have been received by non-virtual hardware. + */ + if (m->m_len < sizeof(struct ether_header)) { + m_freem(m); + return (0); + } + + eh = mtod(m, struct ether_header *); + + if (eh && (ifp->if_flags & IFF_PROMISC) == 0 && + !ETHER_IS_MULTICAST(eh->ether_dhost) && + bcmp(eh->ether_dhost, IF_LLADDR(ifp), ETHER_ADDR_LEN) != 0) { + m_freem(m); + return (0); + } + + /* Pass packet up to parent. */ + CURVNET_SET(ifp->if_vnet); + (*ifp->if_input)(ifp, m); + CURVNET_RESTORE(); + /* ibytes are counted in parent */ + if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); + return (0); +} + +static int +tunwrite_l3(struct tuntap_softc *tp, struct mbuf *m) +{ + struct ifnet *ifp; + int family, isr; + + ifp = TUN2IFP(tp); + /* Could be unlocked read? */ + TUN_LOCK(tp); + if (tp->tun_flags & TUN_IFHEAD) { + TUN_UNLOCK(tp); + if (m->m_len < sizeof(family) && + (m = m_pullup(m, sizeof(family))) == NULL) + return (ENOBUFS); + family = ntohl(*mtod(m, u_int32_t *)); + m_adj(m, sizeof(family)); + } else { + TUN_UNLOCK(tp); + family = AF_INET; + } + + BPF_MTAP2(ifp, &family, sizeof(family), m); + + switch (family) { +#ifdef INET + case AF_INET: + isr = NETISR_IP; + break; +#endif +#ifdef INET6 + case AF_INET6: + isr = NETISR_IPV6; + break; +#endif + default: + m_freem(m); + return (EAFNOSUPPORT); + } + random_harvest_queue(m, sizeof(*m), RANDOM_NET_TUN); + if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); + if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); + CURVNET_SET(ifp->if_vnet); + M_SETFIB(m, ifp->if_fib); + netisr_dispatch(isr, m); + CURVNET_RESTORE(); + return (0); +} + +/* + * the cdevsw write interface - an atomic write is a packet - or else! 
+ */ +static int +tunwrite(struct cdev *dev, struct uio *uio, int flag) +{ + struct tuntap_softc *tp; + struct ifnet *ifp; + struct mbuf *m; + uint32_t mru; + int align; + bool l2tun; + + tp = dev->si_drv1; + ifp = TUN2IFP(tp); + TUNDEBUG(ifp, "tunwrite\n"); + if ((ifp->if_flags & IFF_UP) != IFF_UP) + /* ignore silently */ + return (0); + + if (uio->uio_resid == 0) + return (0); + + l2tun = (tp->tun_flags & TUN_L2) != 0; + align = 0; + mru = l2tun ? TAPMRU : TUNMRU; + if (l2tun) + align = ETHER_ALIGN; + else if ((tp->tun_flags & TUN_IFHEAD) != 0) + mru += sizeof(uint32_t); /* family */ + if (uio->uio_resid < 0 || uio->uio_resid > mru) { + TUNDEBUG(ifp, "len=%zd!\n", uio->uio_resid); + return (EIO); + } + + if ((m = m_uiotombuf(uio, M_NOWAIT, 0, align, M_PKTHDR)) == NULL) { + if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); + return (ENOBUFS); + } + + m->m_pkthdr.rcvif = ifp; +#ifdef MAC + mac_ifnet_create_mbuf(ifp, m); +#endif + + if (l2tun) + return (tunwrite_l2(tp, m)); + + return (tunwrite_l3(tp, m)); +} + +/* + * tunpoll - the poll interface, this is only useful on reads + * really. The write detect always returns true, write never blocks + * anyway, it either accepts the packet or drops it. + */ +static int +tunpoll(struct cdev *dev, int events, struct thread *td) +{ + struct tuntap_softc *tp = dev->si_drv1; + struct ifnet *ifp = TUN2IFP(tp); + int revents = 0; + + TUNDEBUG(ifp, "tunpoll\n"); + + if (events & (POLLIN | POLLRDNORM)) { + IFQ_LOCK(&ifp->if_snd); + if (!IFQ_IS_EMPTY(&ifp->if_snd)) { + TUNDEBUG(ifp, "tunpoll q=%d\n", ifp->if_snd.ifq_len); + revents |= events & (POLLIN | POLLRDNORM); + } else { + TUNDEBUG(ifp, "tunpoll waiting\n"); + selrecord(td, &tp->tun_rsel); + } + IFQ_UNLOCK(&ifp->if_snd); + } + revents |= events & (POLLOUT | POLLWRNORM); + + return (revents); +} + +/* + * tunkqfilter - support for the kevent() system call. 
+ */ +static int +tunkqfilter(struct cdev *dev, struct knote *kn) +{ + struct tuntap_softc *tp = dev->si_drv1; + struct ifnet *ifp = TUN2IFP(tp); + + switch(kn->kn_filter) { + case EVFILT_READ: + TUNDEBUG(ifp, "%s kqfilter: EVFILT_READ, minor = %#x\n", + ifp->if_xname, dev2unit(dev)); + kn->kn_fop = &tun_read_filterops; + break; + + case EVFILT_WRITE: + TUNDEBUG(ifp, "%s kqfilter: EVFILT_WRITE, minor = %#x\n", + ifp->if_xname, dev2unit(dev)); + kn->kn_fop = &tun_write_filterops; + break; + + default: + TUNDEBUG(ifp, "%s kqfilter: invalid filter, minor = %#x\n", + ifp->if_xname, dev2unit(dev)); + return(EINVAL); + } + + kn->kn_hook = tp; + knlist_add(&tp->tun_rsel.si_note, kn, 0); + + return (0); +} + +/* + * Return true of there is data in the interface queue. + */ +static int +tunkqread(struct knote *kn, long hint) +{ + int ret; + struct tuntap_softc *tp = kn->kn_hook; + struct cdev *dev = tp->tun_dev; + struct ifnet *ifp = TUN2IFP(tp); + + if ((kn->kn_data = ifp->if_snd.ifq_len) > 0) { + TUNDEBUG(ifp, + "%s have data in the queue. Len = %d, minor = %#x\n", + ifp->if_xname, ifp->if_snd.ifq_len, dev2unit(dev)); + ret = 1; + } else { + TUNDEBUG(ifp, + "%s waiting for data, minor = %#x\n", ifp->if_xname, + dev2unit(dev)); + ret = 0; + } + + return (ret); +} + +/* + * Always can write, always return MTU in kn->data. 
+ */ +static int +tunkqwrite(struct knote *kn, long hint) +{ + struct tuntap_softc *tp = kn->kn_hook; + struct ifnet *ifp = TUN2IFP(tp); + + kn->kn_data = ifp->if_mtu; + + return (1); +} + +static void +tunkqdetach(struct knote *kn) +{ + struct tuntap_softc *tp = kn->kn_hook; + + knlist_remove(&tp->tun_rsel.si_note, kn, 0); +} diff --git a/freebsd/sys/net/if_vlan.c b/freebsd/sys/net/if_vlan.c index 8f108b9d..2b5b3488 100644 --- a/freebsd/sys/net/if_vlan.c +++ b/freebsd/sys/net/if_vlan.c @@ -48,6 +48,7 @@ __FBSDID("$FreeBSD$"); #include <rtems/bsd/local/opt_inet.h> +#include <rtems/bsd/local/opt_inet6.h> #include <rtems/bsd/local/opt_vlan.h> #include <rtems/bsd/local/opt_ratelimit.h> @@ -76,6 +77,7 @@ __FBSDID("$FreeBSD$"); #include <net/if_dl.h> #include <net/if_types.h> #include <net/if_vlan_var.h> +#include <net/route.h> #include <net/vnet.h> #ifdef INET @@ -83,6 +85,14 @@ __FBSDID("$FreeBSD$"); #include <netinet/if_ether.h> #endif +#ifdef INET6 +/* + * XXX: declare here to avoid to include many inet6 related files.. + * should be more generalized? + */ +extern void nd6_setmtu(struct ifnet *); +#endif + #define VLAN_DEF_HWIDTH 4 #define VLAN_IFFLAGS (IFF_BROADCAST | IFF_MULTICAST) @@ -1410,11 +1420,19 @@ vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t vid) * Set up our interface address to reflect the underlying * physical interface's. */ - bcopy(IF_LLADDR(p), IF_LLADDR(ifp), p->if_addrlen); + TASK_INIT(&ifv->lladdr_task, 0, vlan_lladdr_fn, ifv); ((struct sockaddr_dl *)ifp->if_addr->ifa_addr)->sdl_alen = p->if_addrlen; - TASK_INIT(&ifv->lladdr_task, 0, vlan_lladdr_fn, ifv); + /* + * Do not schedule link address update if it was the same + * as previous parent's. This helps avoid updating for each + * associated llentry. + */ + if (memcmp(IF_LLADDR(p), IF_LLADDR(ifp), p->if_addrlen) != 0) { + bcopy(IF_LLADDR(p), IF_LLADDR(ifp), p->if_addrlen); + taskqueue_enqueue(taskqueue_thread, &ifv->lladdr_task); + } /* We are ready for operation now. 
*/ ifp->if_drv_flags |= IFF_DRV_RUNNING; @@ -1725,7 +1743,7 @@ vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) struct ifvlan *ifv; struct ifvlantrunk *trunk; struct vlanreq vlr; - int error = 0; + int error = 0, oldmtu; ifr = (struct ifreq *)data; ifa = (struct ifaddr *) data; @@ -1819,8 +1837,20 @@ vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) error = ENOENT; break; } + oldmtu = ifp->if_mtu; error = vlan_config(ifv, p, vlr.vlr_tag); if_rele(p); + + /* + * VLAN MTU may change during addition of the vlandev. + * If it did, do network layer specific procedure. + */ + if (ifp->if_mtu != oldmtu) { +#ifdef INET6 + nd6_setmtu(ifp); +#endif + rt_updatemtu(ifp); + } break; case SIOCGETVLAN: diff --git a/freebsd/sys/net/iflib.h b/freebsd/sys/net/iflib.h index 2395439a..b0215daf 100644 --- a/freebsd/sys/net/iflib.h +++ b/freebsd/sys/net/iflib.h @@ -361,6 +361,11 @@ typedef enum { * Interface needs admin task to ignore interface up/down status */ #define IFLIB_ADMIN_ALWAYS_RUN 0x10000 +/* + * When using a single hardware interrupt for the interface, only process RX + * interrupts instead of doing combined RX/TX processing. + */ +#define IFLIB_SINGLE_IRQ_RX_ONLY 0x40000 /* diff --git a/freebsd/sys/net/route.c b/freebsd/sys/net/route.c index 0933c3a8..adbf91bd 100644 --- a/freebsd/sys/net/route.c +++ b/freebsd/sys/net/route.c @@ -865,7 +865,7 @@ rtrequest_fib(int req, * to reflect size of the provided buffer. if no NHR_COPY is specified, * point dst,netmask and gw @info fields to appropriate @rt values. * - * if @flags contains NHR_REF, do refcouting on rt_ifp. + * if @flags contains NHR_REF, do refcouting on rt_ifp and rt_ifa. * * Returns 0 on success. 
*/ @@ -935,10 +935,9 @@ rt_exportinfo(struct rtentry *rt, struct rt_addrinfo *info, int flags) info->rti_flags = rt->rt_flags; info->rti_ifp = rt->rt_ifp; info->rti_ifa = rt->rt_ifa; - ifa_ref(info->rti_ifa); if (flags & NHR_REF) { - /* Do 'traditional' refcouting */ if_ref(info->rti_ifp); + ifa_ref(info->rti_ifa); } return (0); @@ -948,8 +947,8 @@ rt_exportinfo(struct rtentry *rt, struct rt_addrinfo *info, int flags) * Lookups up route entry for @dst in RIB database for fib @fibnum. * Exports entry data to @info using rt_exportinfo(). * - * if @flags contains NHR_REF, refcouting is performed on rt_ifp. - * All references can be released later by calling rib_free_info() + * If @flags contains NHR_REF, refcouting is performed on rt_ifp and rt_ifa. + * All references can be released later by calling rib_free_info(). * * Returns 0 on success. * Returns ENOENT for lookup failure, ENOMEM for export failure. @@ -995,6 +994,7 @@ void rib_free_info(struct rt_addrinfo *info) { + ifa_free(info->rti_ifa); if_rele(info->rti_ifp); } @@ -1627,9 +1627,12 @@ rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt, error = rt_getifa_fib(info, fibnum); if (error) return (error); + } else { + ifa_ref(info->rti_ifa); } rt = uma_zalloc(V_rtzone, M_NOWAIT); if (rt == NULL) { + ifa_free(info->rti_ifa); return (ENOBUFS); } rt->rt_flags = RTF_UP | flags; @@ -1638,6 +1641,7 @@ rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt, * Add the gateway. Possibly re-malloc-ing the storage for it. */ if ((error = rt_setgate(rt, dst, gateway)) != 0) { + ifa_free(info->rti_ifa); uma_zfree(V_rtzone, rt); return (error); } @@ -1661,7 +1665,6 @@ rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt, * examine the ifa and ifa->ifa_ifp if it so desires. 
*/ ifa = info->rti_ifa; - ifa_ref(ifa); rt->rt_ifa = ifa; rt->rt_ifp = ifa->ifa_ifp; rt->rt_weight = 1; @@ -2101,7 +2104,6 @@ rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum) * Do the actual request */ bzero((caddr_t)&info, sizeof(info)); - ifa_ref(ifa); info.rti_ifa = ifa; info.rti_flags = flags | (ifa->ifa_flags & ~IFA_RTSELF) | RTF_PINNED; @@ -2116,7 +2118,6 @@ rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum) info.rti_info[RTAX_GATEWAY] = ifa->ifa_addr; info.rti_info[RTAX_NETMASK] = netmask; error = rtrequest1_fib(cmd, &info, &rt, fibnum); - if (error == 0 && rt != NULL) { /* * notify any listening routing agents of the change diff --git a/freebsd/sys/net/sff8472.h b/freebsd/sys/net/sff8472.h index d38fcfc0..9fa465a1 100644 --- a/freebsd/sys/net/sff8472.h +++ b/freebsd/sys/net/sff8472.h @@ -379,7 +379,7 @@ enum { /* * Table 3.2 Identifier values. - * Identifier constants has taken from SFF-8024 rev 4.2 table 4.1 + * Identifier constants has taken from SFF-8024 rev 4.6 table 4.1 * (as referenced by table 3.2 footer) * */ enum { @@ -396,10 +396,10 @@ enum { SFF_8024_ID_X2 = 0xA, /* X2 */ SFF_8024_ID_DWDM_SFP = 0xB, /* DWDM-SFP */ SFF_8024_ID_QSFP = 0xC, /* QSFP */ - SFF_8024_ID_QSFPPLUS = 0xD, /* QSFP+ */ + SFF_8024_ID_QSFPPLUS = 0xD, /* QSFP+ or later */ SFF_8024_ID_CXP = 0xE, /* CXP */ - SFF_8024_ID_HD4X = 0xF, /* Shielded Mini Multilane HD 4X */ - SFF_8024_ID_HD8X = 0x10, /* Shielded Mini Multilane HD 8X */ + SFF_8024_ID_HD4X = 0xF, /* Shielded Mini Multilane HD 4X */ + SFF_8024_ID_HD8X = 0x10, /* Shielded Mini Multilane HD 8X */ SFF_8024_ID_QSFP28 = 0x11, /* QSFP28 or later */ SFF_8024_ID_CXP2 = 0x12, /* CXP2 (aka CXP28) */ SFF_8024_ID_CDFP = 0x13, /* CDFP (Style 1/Style 2) */ @@ -408,34 +408,49 @@ enum { SFF_8024_ID_CDFP3 = 0x16, /* CDFP (Style3) */ SFF_8024_ID_MICROQSFP = 0x17, /* microQSFP */ SFF_8024_ID_QSFP_DD = 0x18, /* QSFP-DD 8X Pluggable Transceiver */ - SFF_8024_ID_LAST = SFF_8024_ID_QSFP_DD - }; - -static const char 
*sff_8024_id[SFF_8024_ID_LAST + 1] = {"Unknown", - "GBIC", - "SFF", - "SFP/SFP+/SFP28", - "XBI", - "Xenpak", - "XFP", - "XFF", - "XFP-E", - "XPAK", - "X2", - "DWDM-SFP/SFP+", - "QSFP", - "QSFP+", - "CXP", - "HD4X", - "HD8X", - "QSFP28", - "CXP2", - "CDFP", - "SMM4", - "SMM8", - "CDFP3", - "microQSFP", - "QSFP-DD"}; + SFF_8024_ID_OSFP8X = 0x19, /* OSFP 8X Pluggable Transceiver */ + SFF_8024_ID_SFP_DD = 0x1A, /* SFP-DD 2X Pluggable Transceiver */ + SFF_8024_ID_DSFP = 0x1B, /* DSFP Dual SFF Pluggable Transceiver */ + SFF_8024_ID_X4ML = 0x1C, /* x4 MiniLink/OcuLink */ + SFF_8024_ID_X8ML = 0x1D, /* x8 MiniLink */ + SFF_8024_ID_QSFP_CMIS = 0x1E, /* QSFP+ or later w/ Common Management + Interface Specification */ + SFF_8024_ID_LAST = SFF_8024_ID_QSFP_CMIS +}; + +static const char *sff_8024_id[SFF_8024_ID_LAST + 1] = { + "Unknown", + "GBIC", + "SFF", + "SFP/SFP+/SFP28", + "XBI", + "Xenpak", + "XFP", + "XFF", + "XFP-E", + "XPAK", + "X2", + "DWDM-SFP/SFP+", + "QSFP", + "QSFP+", + "CXP", + "HD4X", + "HD8X", + "QSFP28", + "CXP2", + "CDFP", + "SMM4", + "SMM8", + "CDFP3", + "microQSFP", + "QSFP-DD", + "QSFP8X", + "SFP-DD", + "DSFP", + "x4MiniLink/OcuLink", + "x8MiniLink", + "QSFP+(CIMS)" +}; /* Keep compatibility with old definitions */ #define SFF_8472_ID_UNKNOWN SFF_8024_ID_UNKNOWN diff --git a/freebsd/sys/net/vnet.h b/freebsd/sys/net/vnet.h index b4168750..2d69a8a9 100644 --- a/freebsd/sys/net/vnet.h +++ b/freebsd/sys/net/vnet.h @@ -325,6 +325,8 @@ struct vnet_sysinit { }; #define VNET_SYSINIT(ident, subsystem, order, func, arg) \ + CTASSERT((subsystem) > SI_SUB_VNET && \ + (subsystem) <= SI_SUB_VNET_DONE); \ static struct vnet_sysinit ident ## _vnet_init = { \ subsystem, \ order, \ @@ -337,6 +339,8 @@ struct vnet_sysinit { vnet_deregister_sysinit, &ident ## _vnet_init) #define VNET_SYSUNINIT(ident, subsystem, order, func, arg) \ + CTASSERT((subsystem) > SI_SUB_VNET && \ + (subsystem) <= SI_SUB_VNET_DONE); \ static struct vnet_sysinit ident ## _vnet_uninit = { \ subsystem, \ 
order, \ diff --git a/freebsd/sys/net80211/ieee80211.c b/freebsd/sys/net80211/ieee80211.c index 927905bb..f003c769 100644 --- a/freebsd/sys/net80211/ieee80211.c +++ b/freebsd/sys/net80211/ieee80211.c @@ -1388,6 +1388,8 @@ getflags(const uint8_t bands[], uint32_t flags[], int ht40, int vht80) /* * Add one 20 MHz channel into specified channel list. + * You MUST NOT mix bands when calling this. It will not add 5ghz + * channels if you have any B/G/N band bit set. */ /* XXX VHT */ int diff --git a/freebsd/sys/netinet/in_mcast.c b/freebsd/sys/netinet/in_mcast.c index ff442399..cbb6c6d3 100644 --- a/freebsd/sys/netinet/in_mcast.c +++ b/freebsd/sys/netinet/in_mcast.c @@ -2207,7 +2207,11 @@ inp_join_group(struct inpcb *inp, struct sockopt *sopt) __func__); goto out_inp_locked; } - inm_acquire(imf->imf_inm); + /* + * NOTE: Refcount from in_joingroup_locked() + * is protecting membership. + */ + ip_mfilter_insert(&imo->imo_head, imf); } else { CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); IN_MULTI_LIST_LOCK(); @@ -2231,8 +2235,6 @@ inp_join_group(struct inpcb *inp, struct sockopt *sopt) goto out_inp_locked; } } - if (is_new) - ip_mfilter_insert(&imo->imo_head, imf); imf_commit(imf); imf = NULL; @@ -2401,6 +2403,12 @@ inp_leave_group(struct inpcb *inp, struct sockopt *sopt) if (is_final) { ip_mfilter_remove(&imo->imo_head, imf); imf_leave(imf); + + /* + * Give up the multicast address record to which + * the membership points. + */ + (void) in_leavegroup_locked(imf->imf_inm, imf); } else { if (imf->imf_st[0] == MCAST_EXCLUDE) { error = EADDRNOTAVAIL; @@ -2455,14 +2463,8 @@ inp_leave_group(struct inpcb *inp, struct sockopt *sopt) out_inp_locked: INP_WUNLOCK(inp); - if (is_final && imf) { - /* - * Give up the multicast address record to which - * the membership points. 
- */ - (void) in_leavegroup_locked(imf->imf_inm, imf); + if (is_final && imf) ip_mfilter_free(imf); - } IN_MULTI_UNLOCK(); return (error); diff --git a/freebsd/sys/netinet/ip_carp.c b/freebsd/sys/netinet/ip_carp.c index 02a24bb8..30b09198 100644 --- a/freebsd/sys/netinet/ip_carp.c +++ b/freebsd/sys/netinet/ip_carp.c @@ -568,13 +568,16 @@ carp6_input(struct mbuf **mp, int *offp, int proto) } /* verify that we have a complete carp packet */ - len = m->m_len; - IP6_EXTHDR_GET(ch, struct carp_header *, m, *offp, sizeof(*ch)); - if (ch == NULL) { - CARPSTATS_INC(carps_badlen); - CARP_DEBUG("%s: packet size %u too small\n", __func__, len); - return (IPPROTO_DONE); + if (m->m_len < *offp + sizeof(*ch)) { + len = m->m_len; + m = m_pullup(m, *offp + sizeof(*ch)); + if (m == NULL) { + CARPSTATS_INC(carps_badlen); + CARP_DEBUG("%s: packet size %u too small\n", __func__, len); + return (IPPROTO_DONE); + } } + ch = (struct carp_header *)(mtod(m, char *) + *offp); /* verify the CARP checksum */ @@ -1189,7 +1192,7 @@ carp_iamatch6(struct ifnet *ifp, struct in6_addr *taddr) return (ifa); } -caddr_t +char * carp_macmatch6(struct ifnet *ifp, struct mbuf *m, const struct in6_addr *taddr) { struct ifaddr *ifa; @@ -1231,14 +1234,15 @@ carp_forus(struct ifnet *ifp, u_char *dhost) CIF_LOCK(ifp->if_carp); IFNET_FOREACH_CARP(ifp, sc) { - CARP_LOCK(sc); + /* + * CARP_LOCK() is not here, since would protect nothing, but + * cause deadlock with if_bridge, calling this under its lock. 
+ */ if (sc->sc_state == MASTER && !bcmp(dhost, LLADDR(&sc->sc_addr), ETHER_ADDR_LEN)) { - CARP_UNLOCK(sc); CIF_UNLOCK(ifp->if_carp); return (1); } - CARP_UNLOCK(sc); } CIF_UNLOCK(ifp->if_carp); @@ -1848,7 +1852,7 @@ carp_ioctl(struct ifreq *ifr, u_long cmd, struct thread *td) carp_carprcp(&carpr, sc, priveleged); carpr.carpr_count = count; error = copyout(&carpr, - (caddr_t)ifr_data_get_ptr(ifr) + + (char *)ifr_data_get_ptr(ifr) + (i * sizeof(carpr)), sizeof(carpr)); if (error) { CIF_UNLOCK(ifp->if_carp); diff --git a/freebsd/sys/netinet/ip_carp.h b/freebsd/sys/netinet/ip_carp.h index fc591ac3..f8ee38dd 100644 --- a/freebsd/sys/netinet/ip_carp.h +++ b/freebsd/sys/netinet/ip_carp.h @@ -149,7 +149,7 @@ int carp_output (struct ifnet *, struct mbuf *, int carp_master(struct ifaddr *); int carp_iamatch(struct ifaddr *, uint8_t **); struct ifaddr *carp_iamatch6(struct ifnet *, struct in6_addr *); -caddr_t carp_macmatch6(struct ifnet *, struct mbuf *, const struct in6_addr *); +char * carp_macmatch6(struct ifnet *, struct mbuf *, const struct in6_addr *); int carp_forus(struct ifnet *, u_char *); /* These are external networking stack hooks for CARP */ @@ -174,7 +174,7 @@ extern int (*carp_iamatch_p)(struct ifaddr *, uint8_t **); #ifdef INET6 /* netinet6/nd6_nbr.c */ extern struct ifaddr *(*carp_iamatch6_p)(struct ifnet *, struct in6_addr *); -extern caddr_t (*carp_macmatch6_p)(struct ifnet *, struct mbuf *, +extern char * (*carp_macmatch6_p)(struct ifnet *, struct mbuf *, const struct in6_addr *); #endif #endif diff --git a/freebsd/sys/netinet/ip_mroute.c b/freebsd/sys/netinet/ip_mroute.c index 3dd887f3..3b27781e 100644 --- a/freebsd/sys/netinet/ip_mroute.c +++ b/freebsd/sys/netinet/ip_mroute.c @@ -181,10 +181,14 @@ static struct mtx mfc_mtx; VNET_DEFINE_STATIC(vifi_t, numvifs); #define V_numvifs VNET(numvifs) -VNET_DEFINE_STATIC(struct vif, viftable[MAXVIFS]); +VNET_DEFINE_STATIC(struct vif *, viftable); #define V_viftable VNET(viftable) +/* + * No one should be able 
to "query" this before initialisation happened in + * vnet_mroute_init(), so we should still be fine. + */ SYSCTL_OPAQUE(_net_inet_ip, OID_AUTO, viftable, CTLFLAG_VNET | CTLFLAG_RD, - &VNET_NAME(viftable), sizeof(V_viftable), "S,vif[MAXVIFS]", + &VNET_NAME(viftable), sizeof(*V_viftable) * MAXVIFS, "S,vif[MAXVIFS]", "IPv4 Multicast Interfaces (struct vif[MAXVIFS], netinet/ip_mroute.h)"); static struct mtx vif_mtx; @@ -212,7 +216,7 @@ static MALLOC_DEFINE(M_BWMETER, "bwmeter", "multicast upcall bw meters"); * expiration time. Periodically, the entries are analysed and processed. */ #define BW_METER_BUCKETS 1024 -VNET_DEFINE_STATIC(struct bw_meter*, bw_meter_timers[BW_METER_BUCKETS]); +VNET_DEFINE_STATIC(struct bw_meter **, bw_meter_timers); #define V_bw_meter_timers VNET(bw_meter_timers) VNET_DEFINE_STATIC(struct callout, bw_meter_ch); #define V_bw_meter_ch VNET(bw_meter_ch) @@ -222,7 +226,7 @@ VNET_DEFINE_STATIC(struct callout, bw_meter_ch); * Pending upcalls are stored in a vector which is flushed when * full, or periodically */ -VNET_DEFINE_STATIC(struct bw_upcall, bw_upcalls[BW_UPCALLS_MAX]); +VNET_DEFINE_STATIC(struct bw_upcall *, bw_upcalls); #define V_bw_upcalls VNET(bw_upcalls) VNET_DEFINE_STATIC(u_int, bw_upcalls_n); /* # of pending upcalls */ #define V_bw_upcalls_n VNET(bw_upcalls_n) @@ -766,7 +770,7 @@ X_ip_mrouter_done(void) bzero(V_nexpire, sizeof(V_nexpire[0]) * mfchashsize); V_bw_upcalls_n = 0; - bzero(V_bw_meter_timers, sizeof(V_bw_meter_timers)); + bzero(V_bw_meter_timers, BW_METER_BUCKETS * sizeof(*V_bw_meter_timers)); MFC_UNLOCK(); @@ -2807,7 +2811,14 @@ vnet_mroute_init(const void *unused __unused) { V_nexpire = malloc(mfchashsize, M_MRTABLE, M_WAITOK|M_ZERO); - bzero(V_bw_meter_timers, sizeof(V_bw_meter_timers)); + + V_viftable = mallocarray(MAXVIFS, sizeof(*V_viftable), + M_MRTABLE, M_WAITOK|M_ZERO); + V_bw_meter_timers = mallocarray(BW_METER_BUCKETS, + sizeof(*V_bw_meter_timers), M_MRTABLE, M_WAITOK|M_ZERO); + V_bw_upcalls = 
mallocarray(BW_UPCALLS_MAX, sizeof(*V_bw_upcalls), + M_MRTABLE, M_WAITOK|M_ZERO); + callout_init(&V_expire_upcalls_ch, 1); callout_init(&V_bw_upcalls_ch, 1); callout_init(&V_bw_meter_ch, 1); @@ -2820,6 +2831,9 @@ static void vnet_mroute_uninit(const void *unused __unused) { + free(V_bw_upcalls, M_MRTABLE); + free(V_bw_meter_timers, M_MRTABLE); + free(V_viftable, M_MRTABLE); free(V_nexpire, M_MRTABLE); V_nexpire = NULL; } diff --git a/freebsd/sys/netinet/ip_output.c b/freebsd/sys/netinet/ip_output.c index c9eb7aa3..343874e5 100644 --- a/freebsd/sys/netinet/ip_output.c +++ b/freebsd/sys/netinet/ip_output.c @@ -655,6 +655,7 @@ sendit: in_pcboutput_txrtlmt(inp, ifp, m); /* stamp send tag on mbuf */ m->m_pkthdr.snd_tag = inp->inp_snd_tag; + m->m_pkthdr.csum_flags |= CSUM_SND_TAG; } else { m->m_pkthdr.snd_tag = NULL; } @@ -707,6 +708,7 @@ sendit: in_pcboutput_txrtlmt(inp, ifp, m); /* stamp send tag on mbuf */ m->m_pkthdr.snd_tag = inp->inp_snd_tag; + m->m_pkthdr.csum_flags |= CSUM_SND_TAG; } else { m->m_pkthdr.snd_tag = NULL; } diff --git a/freebsd/sys/netinet/ip_reass.c b/freebsd/sys/netinet/ip_reass.c index 70a6edae..036d19fe 100644 --- a/freebsd/sys/netinet/ip_reass.c +++ b/freebsd/sys/netinet/ip_reass.c @@ -41,6 +41,7 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/systm.h> #include <sys/eventhandler.h> +#include <sys/kernel.h> #include <sys/hash.h> #include <sys/mbuf.h> #include <sys/malloc.h> @@ -48,7 +49,10 @@ __FBSDID("$FreeBSD$"); #include <sys/lock.h> #include <sys/mutex.h> #include <sys/sysctl.h> +#include <sys/socket.h> +#include <net/if.h> +#include <net/if_var.h> #include <net/rss_config.h> #include <net/netisr.h> #include <net/vnet.h> @@ -182,6 +186,7 @@ ip_reass(struct mbuf *m) struct ip *ip; struct mbuf *p, *q, *nq, *t; struct ipq *fp; + struct ifnet *srcifp; struct ipqhead *head; int i, hlen, next, tmpmax; u_int8_t ecn, ecn0; @@ -242,6 +247,11 @@ ip_reass(struct mbuf *m) } /* + * Store receive network interface pointer for later. 
+ */ + srcifp = m->m_pkthdr.rcvif; + + /* * Attempt reassembly; if it succeeds, proceed. * ip_reass() will return a different mbuf. */ @@ -491,8 +501,11 @@ ip_reass(struct mbuf *m) m->m_len += (ip->ip_hl << 2); m->m_data -= (ip->ip_hl << 2); /* some debugging cruft by sklower, below, will go away soon */ - if (m->m_flags & M_PKTHDR) /* XXX this should be done elsewhere */ + if (m->m_flags & M_PKTHDR) { /* XXX this should be done elsewhere */ m_fixhdr(m); + /* set valid receive interface pointer */ + m->m_pkthdr.rcvif = srcifp; + } IPSTAT_INC(ips_reassembled); IPQ_UNLOCK(hash); @@ -608,6 +621,46 @@ ipreass_drain(void) } } +/* + * Drain off all datagram fragments belonging to + * the given network interface. + */ +static void +ipreass_cleanup(void *arg __unused, struct ifnet *ifp) +{ + struct ipq *fp, *temp; + struct mbuf *m; + int i; + + KASSERT(ifp != NULL, ("%s: ifp is NULL", __func__)); + + CURVNET_SET_QUIET(ifp->if_vnet); + + /* + * Skip processing if IPv4 reassembly is not initialised or + * torn down by ipreass_destroy(). + */ + if (V_ipq_zone == NULL) { + CURVNET_RESTORE(); + return; + } + + for (i = 0; i < IPREASS_NHASH; i++) { + IPQ_LOCK(i); + /* Scan fragment list. */ + TAILQ_FOREACH_SAFE(fp, &V_ipq[i].head, ipq_list, temp) { + for (m = fp->ipq_frags; m != NULL; m = m->m_nextpkt) { + /* clear no longer valid rcvif pointer */ + if (m->m_pkthdr.rcvif == ifp) + m->m_pkthdr.rcvif = NULL; + } + } + IPQ_UNLOCK(i); + } + CURVNET_RESTORE(); +} +EVENTHANDLER_DEFINE(ifnet_departure_event, ipreass_cleanup, NULL, 0); + #ifdef VIMAGE /* * Destroy IP reassembly structures. 
@@ -618,6 +671,7 @@ ipreass_destroy(void) ipreass_drain(); uma_zdestroy(V_ipq_zone); + V_ipq_zone = NULL; for (int i = 0; i < IPREASS_NHASH; i++) mtx_destroy(&V_ipq[i].lock); } diff --git a/freebsd/sys/netinet/sctp_asconf.c b/freebsd/sys/netinet/sctp_asconf.c index 4de01ed7..a13f4040 100644 --- a/freebsd/sys/netinet/sctp_asconf.c +++ b/freebsd/sys/netinet/sctp_asconf.c @@ -107,42 +107,47 @@ sctp_asconf_error_response(uint32_t id, uint16_t cause, uint8_t *error_tlv, struct mbuf *m_reply = NULL; struct sctp_asconf_paramhdr *aph; struct sctp_error_cause *error; + size_t buf_len; + uint16_t i, param_length, cause_length, padding_length; uint8_t *tlv; - m_reply = sctp_get_mbuf_for_msg((sizeof(struct sctp_asconf_paramhdr) + - tlv_length + - sizeof(struct sctp_error_cause)), - 0, M_NOWAIT, 1, MT_DATA); + if (error_tlv == NULL) { + tlv_length = 0; + } + cause_length = sizeof(struct sctp_error_cause) + tlv_length; + param_length = sizeof(struct sctp_asconf_paramhdr) + cause_length; + padding_length = tlv_length % 4; + if (padding_length != 0) { + padding_length = 4 - padding_length; + } + buf_len = param_length + padding_length; + if (buf_len > MLEN) { + SCTPDBG(SCTP_DEBUG_ASCONF1, + "asconf_error_response: tlv_length (%xh) too big\n", + tlv_length); + return (NULL); + } + m_reply = sctp_get_mbuf_for_msg(buf_len, 0, M_NOWAIT, 1, MT_DATA); if (m_reply == NULL) { SCTPDBG(SCTP_DEBUG_ASCONF1, "asconf_error_response: couldn't get mbuf!\n"); return (NULL); } aph = mtod(m_reply, struct sctp_asconf_paramhdr *); - error = (struct sctp_error_cause *)(aph + 1); - - aph->correlation_id = id; aph->ph.param_type = htons(SCTP_ERROR_CAUSE_IND); + aph->ph.param_length = htons(param_length); + aph->correlation_id = id; + error = (struct sctp_error_cause *)(aph + 1); error->code = htons(cause); - error->length = tlv_length + sizeof(struct sctp_error_cause); - aph->ph.param_length = error->length + - sizeof(struct sctp_asconf_paramhdr); - - if (aph->ph.param_length > MLEN) { - 
SCTPDBG(SCTP_DEBUG_ASCONF1, - "asconf_error_response: tlv_length (%xh) too big\n", - tlv_length); - sctp_m_freem(m_reply); /* discard */ - return (NULL); - } + error->length = htons(cause_length); if (error_tlv != NULL) { tlv = (uint8_t *)(error + 1); memcpy(tlv, error_tlv, tlv_length); + for (i = 0; i < padding_length; i++) { + tlv[tlv_length + i] = 0; + } } - SCTP_BUF_LEN(m_reply) = aph->ph.param_length; - error->length = htons(error->length); - aph->ph.param_length = htons(aph->ph.param_length); - + SCTP_BUF_LEN(m_reply) = buf_len; return (m_reply); } @@ -171,10 +176,16 @@ sctp_process_asconf_add_ip(struct sockaddr *src, struct sctp_asconf_paramhdr *ap #endif aparam_length = ntohs(aph->ph.param_length); + if (aparam_length < sizeof(struct sctp_asconf_paramhdr) + sizeof(struct sctp_paramhdr)) { + return (NULL); + } ph = (struct sctp_paramhdr *)(aph + 1); param_type = ntohs(ph->param_type); #if defined(INET) || defined(INET6) param_length = ntohs(ph->param_length); + if (param_length + sizeof(struct sctp_asconf_paramhdr) != aparam_length) { + return (NULL); + } #endif sa = &store.sa; switch (param_type) { @@ -238,6 +249,7 @@ sctp_process_asconf_add_ip(struct sockaddr *src, struct sctp_asconf_paramhdr *ap "process_asconf_add_ip: using source addr "); SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, src); } + net = NULL; /* add the address */ if (bad_address) { m_reply = sctp_asconf_error_response(aph->correlation_id, @@ -252,17 +264,19 @@ sctp_process_asconf_add_ip(struct sockaddr *src, struct sctp_asconf_paramhdr *ap SCTP_CAUSE_RESOURCE_SHORTAGE, (uint8_t *)aph, aparam_length); } else { - /* notify upper layer */ - sctp_ulp_notify(SCTP_NOTIFY_ASCONF_ADD_IP, stcb, 0, sa, SCTP_SO_NOT_LOCKED); if (response_required) { m_reply = sctp_asconf_success_response(aph->correlation_id); } - sctp_timer_start(SCTP_TIMER_TYPE_PATHMTURAISE, stcb->sctp_ep, stcb, net); - sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, - stcb, net); - if (send_hb) { - sctp_send_hb(stcb, net, 
SCTP_SO_NOT_LOCKED); + if (net != NULL) { + /* notify upper layer */ + sctp_ulp_notify(SCTP_NOTIFY_ASCONF_ADD_IP, stcb, 0, sa, SCTP_SO_NOT_LOCKED); + sctp_timer_start(SCTP_TIMER_TYPE_PATHMTURAISE, stcb->sctp_ep, stcb, net); + sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, + stcb, net); + if (send_hb) { + sctp_send_hb(stcb, net, SCTP_SO_NOT_LOCKED); + } } } return (m_reply); @@ -271,7 +285,7 @@ sctp_process_asconf_add_ip(struct sockaddr *src, struct sctp_asconf_paramhdr *ap static int sctp_asconf_del_remote_addrs_except(struct sctp_tcb *stcb, struct sockaddr *src) { - struct sctp_nets *src_net, *net; + struct sctp_nets *src_net, *net, *nnet; /* make sure the source address exists as a destination net */ src_net = sctp_findnet(stcb, src); @@ -281,10 +295,9 @@ sctp_asconf_del_remote_addrs_except(struct sctp_tcb *stcb, struct sockaddr *src) } /* delete all destination addresses except the source */ - TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { + TAILQ_FOREACH_SAFE(net, &stcb->asoc.nets, sctp_next, nnet) { if (net != src_net) { /* delete this address */ - sctp_remove_net(stcb, net); SCTPDBG(SCTP_DEBUG_ASCONF1, "asconf_del_remote_addrs_except: deleting "); SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, @@ -292,6 +305,7 @@ sctp_asconf_del_remote_addrs_except(struct sctp_tcb *stcb, struct sockaddr *src) /* notify upper layer */ sctp_ulp_notify(SCTP_NOTIFY_ASCONF_DELETE_IP, stcb, 0, (struct sockaddr *)&net->ro._l_addr, SCTP_SO_NOT_LOCKED); + sctp_remove_net(stcb, net); } } return (0); @@ -322,10 +336,16 @@ sctp_process_asconf_delete_ip(struct sockaddr *src, #endif aparam_length = ntohs(aph->ph.param_length); + if (aparam_length < sizeof(struct sctp_asconf_paramhdr) + sizeof(struct sctp_paramhdr)) { + return (NULL); + } ph = (struct sctp_paramhdr *)(aph + 1); param_type = ntohs(ph->param_type); #if defined(INET) || defined(INET6) param_length = ntohs(ph->param_length); + if (param_length + sizeof(struct sctp_asconf_paramhdr) != aparam_length) { + return (NULL); + } 
#endif sa = &store.sa; switch (param_type) { @@ -453,10 +473,16 @@ sctp_process_asconf_set_primary(struct sockaddr *src, #endif aparam_length = ntohs(aph->ph.param_length); + if (aparam_length < sizeof(struct sctp_asconf_paramhdr) + sizeof(struct sctp_paramhdr)) { + return (NULL); + } ph = (struct sctp_paramhdr *)(aph + 1); param_type = ntohs(ph->param_type); #if defined(INET) || defined(INET6) param_length = ntohs(ph->param_length); + if (param_length + sizeof(struct sctp_asconf_paramhdr) != aparam_length) { + return (NULL); + } #endif sa = &store.sa; switch (param_type) { @@ -675,8 +701,8 @@ sctp_handle_asconf(struct mbuf *m, unsigned int offset, sctp_m_freem(m_ack); return; } - /* param_length is already validated in process_control... */ - offset += ntohs(p_addr->ph.param_length); /* skip lookup addr */ + /* skip lookup addr */ + offset += SCTP_SIZE32(ntohs(p_addr->ph.param_length)); /* get pointer to first asconf param in ASCONF */ aph = (struct sctp_asconf_paramhdr *)sctp_m_getptr(m, offset, sizeof(struct sctp_asconf_paramhdr), (uint8_t *)&aparam_buf); if (aph == NULL) { @@ -705,6 +731,7 @@ sctp_handle_asconf(struct mbuf *m, unsigned int offset, if (param_length <= sizeof(struct sctp_paramhdr)) { SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: param length (%u) too short\n", param_length); sctp_m_freem(m_ack); + return; } /* get the entire parameter */ aph = (struct sctp_asconf_paramhdr *)sctp_m_getptr(m, offset, param_length, aparam_buf); @@ -760,8 +787,6 @@ sctp_handle_asconf(struct mbuf *m, unsigned int offset, if (m_result != NULL) { SCTP_BUF_NEXT(m_tail) = m_result; m_tail = m_result; - /* update lengths, make sure it's aligned too */ - SCTP_BUF_LEN(m_result) = SCTP_SIZE32(SCTP_BUF_LEN(m_result)); ack_cp->ch.chunk_length += SCTP_BUF_LEN(m_result); /* set flag to force success reports */ error = 1; @@ -1956,12 +1981,10 @@ sctp_addr_mgmt_assoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb, case AF_INET: { struct sockaddr_in *sin; - struct in6pcb *inp6; - inp6 
= (struct in6pcb *)&inp->ip_inp.inp; /* invalid if we are a v6 only endpoint */ if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) && - SCTP_IPV6_V6ONLY(inp6)) + SCTP_IPV6_V6ONLY(&inp->ip_inp.inp)) return; sin = &ifa->address.sin; @@ -2034,11 +2057,9 @@ sctp_asconf_iterator_ep(struct sctp_inpcb *inp, void *ptr, uint32_t val SCTP_UNU case AF_INET: { /* invalid if we are a v6 only endpoint */ - struct in6pcb *inp6; - inp6 = (struct in6pcb *)&inp->ip_inp.inp; if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) && - SCTP_IPV6_V6ONLY(inp6)) { + SCTP_IPV6_V6ONLY(&inp->ip_inp.inp)) { cnt_invalid++; if (asc->cnt == cnt_invalid) return (1); @@ -2149,13 +2170,11 @@ sctp_asconf_iterator_stcb(struct sctp_inpcb *inp, struct sctp_tcb *stcb, case AF_INET: { /* invalid if we are a v6 only endpoint */ - struct in6pcb *inp6; struct sockaddr_in *sin; - inp6 = (struct in6pcb *)&inp->ip_inp.inp; /* invalid if we are a v6 only endpoint */ if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) && - SCTP_IPV6_V6ONLY(inp6)) + SCTP_IPV6_V6ONLY(&inp->ip_inp.inp)) continue; sin = &ifa->address.sin; @@ -2172,7 +2191,7 @@ sctp_asconf_iterator_stcb(struct sctp_inpcb *inp, struct sctp_tcb *stcb, continue; } if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) && - SCTP_IPV6_V6ONLY(inp6)) { + SCTP_IPV6_V6ONLY(&inp->ip_inp.inp)) { cnt_invalid++; if (asc->cnt == cnt_invalid) return; diff --git a/freebsd/sys/netinet/sctp_dtrace_define.h b/freebsd/sys/netinet/sctp_dtrace_define.h deleted file mode 100644 index ad7c8526..00000000 --- a/freebsd/sys/netinet/sctp_dtrace_define.h +++ /dev/null @@ -1,177 +0,0 @@ -/*- - * SPDX-License-Identifier: BSD-3-Clause - * - * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved. - * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * a) Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * b) Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the distribution. - * - * c) Neither the name of Cisco Systems, Inc. nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, - * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); - -#ifndef _NETINET_SCTP_DTRACE_DEFINE_H_ -#define _NETINET_SCTP_DTRACE_DEFINE_H_ - -#include <sys/kernel.h> -#include <sys/sdt.h> - -SDT_PROVIDER_DECLARE(sctp); - -/********************************************************/ -/* Cwnd probe - tracks changes in the congestion window on a netp */ -/********************************************************/ -/* Initial */ -SDT_PROBE_DEFINE5(sctp, cwnd, net, init, - "uint32_t", /* The Vtag for this end */ - "uint32_t", /* The port number of the local side << 16 | - * port number of remote in network byte - * order. */ - "uintptr_t", /* The pointer to the struct sctp_nets * - * changing */ - "int", /* The old value of the cwnd */ - "int"); /* The new value of the cwnd */ - -/* ACK-INCREASE */ -SDT_PROBE_DEFINE5(sctp, cwnd, net, ack, - "uint32_t", /* The Vtag for this end */ - "uint32_t", /* The port number of the local side << 16 | - * port number of remote in network byte - * order. */ - "uintptr_t", /* The pointer to the struct sctp_nets * - * changing */ - "int", /* The old value of the cwnd */ - "int"); /* The new value of the cwnd */ - -/* ACK-INCREASE */ -SDT_PROBE_DEFINE5(sctp, cwnd, net, rttvar, - "uint64_t", /* The Vtag << 32 | localport << 16 | - * remoteport */ - "uint64_t", /* obw | nbw */ - "uint64_t", /* bwrtt | newrtt */ - "uint64_t", /* flight */ - "uint64_t"); /* (cwnd << 32) | point << 16 | retval(0/1) */ - -SDT_PROBE_DEFINE5(sctp, cwnd, net, rttstep, - "uint64_t", /* The Vtag << 32 | localport << 16 | - * remoteport */ - "uint64_t", /* obw | nbw */ - "uint64_t", /* bwrtt | newrtt */ - "uint64_t", /* flight */ - "uint64_t"); /* (cwnd << 32) | point << 16 | retval(0/1) */ - -/* FastRetransmit-DECREASE */ -SDT_PROBE_DEFINE5(sctp, cwnd, net, fr, - "uint32_t", /* The Vtag for this end */ - "uint32_t", /* The port number of the local side << 16 | - * port number of remote in network byte - * order. 
*/ - "uintptr_t", /* The pointer to the struct sctp_nets * - * changing */ - "int", /* The old value of the cwnd */ - "int"); /* The new value of the cwnd */ - -/* TimeOut-DECREASE */ -SDT_PROBE_DEFINE5(sctp, cwnd, net, to, - "uint32_t", /* The Vtag for this end */ - "uint32_t", /* The port number of the local side << 16 | - * port number of remote in network byte - * order. */ - "uintptr_t", /* The pointer to the struct sctp_nets * - * changing */ - "int", /* The old value of the cwnd */ - "int"); /* The new value of the cwnd */ - -/* BurstLimit-DECREASE */ -SDT_PROBE_DEFINE5(sctp, cwnd, net, bl, - "uint32_t", /* The Vtag for this end */ - "uint32_t", /* The port number of the local side << 16 | - * port number of remote in network byte - * order. */ - "uintptr_t", /* The pointer to the struct sctp_nets * - * changing */ - "int", /* The old value of the cwnd */ - "int"); /* The new value of the cwnd */ - -/* ECN-DECREASE */ -SDT_PROBE_DEFINE5(sctp, cwnd, net, ecn, - "uint32_t", /* The Vtag for this end */ - "uint32_t", /* The port number of the local side << 16 | - * port number of remote in network byte - * order. */ - "uintptr_t", /* The pointer to the struct sctp_nets * - * changing */ - "int", /* The old value of the cwnd */ - "int"); /* The new value of the cwnd */ - -/* PacketDrop-DECREASE */ -SDT_PROBE_DEFINE5(sctp, cwnd, net, pd, - "uint32_t", /* The Vtag for this end */ - "uint32_t", /* The port number of the local side << 16 | - * port number of remote in network byte - * order. 
*/ - "uintptr_t", /* The pointer to the struct sctp_nets * - * changing */ - "int", /* The old value of the cwnd */ - "int"); /* The new value of the cwnd */ - -/********************************************************/ -/* Rwnd probe - tracks changes in the receiver window for an assoc */ -/********************************************************/ -SDT_PROBE_DEFINE4(sctp, rwnd, assoc, val, - "uint32_t", /* The Vtag for this end */ - "uint32_t", /* The port number of the local side << 16 | - * port number of remote in network byte - * order. */ - "int", /* The up/down amount */ - "int"); /* The new value of the cwnd */ - -/********************************************************/ -/* flight probe - tracks changes in the flight size on a net or assoc */ -/********************************************************/ -SDT_PROBE_DEFINE5(sctp, flightsize, net, val, - "uint32_t", /* The Vtag for this end */ - "uint32_t", /* The port number of the local side << 16 | - * port number of remote in network byte - * order. */ - "uintptr_t", /* The pointer to the struct sctp_nets * - * changing */ - "int", /* The up/down amount */ - "int"); /* The new value of the cwnd */ - -/********************************************************/ -/* The total flight version */ -/********************************************************/ -SDT_PROBE_DEFINE4(sctp, flightsize, assoc, val, - "uint32_t", /* The Vtag for this end */ - "uint32_t", /* The port number of the local side << 16 | - * port number of remote in network byte - * order. 
*/ - "int", /* The up/down amount */ - "int"); /* The new value of the cwnd */ - -#endif diff --git a/freebsd/sys/netinet/sctp_indata.c b/freebsd/sys/netinet/sctp_indata.c index c4a11fec..1b28cc38 100644 --- a/freebsd/sys/netinet/sctp_indata.c +++ b/freebsd/sys/netinet/sctp_indata.c @@ -474,6 +474,11 @@ sctp_clean_up_control(struct sctp_tcb *stcb, struct sctp_queued_to_read *control chk->data = NULL; sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); } + sctp_free_remote_addr(control->whoFrom); + if (control->data) { + sctp_m_freem(control->data); + control->data = NULL; + } sctp_free_a_readq(stcb, control); } @@ -713,6 +718,7 @@ sctp_add_to_tail_pointer(struct sctp_queued_to_read *control, struct mbuf *m, ui } if (control->tail_mbuf == NULL) { /* TSNH */ + sctp_m_freem(control->data); control->data = m; sctp_setup_tail_pointer(control); return; @@ -2116,10 +2122,13 @@ sctp_process_a_data_chunk(struct sctp_tcb *stcb, struct sctp_association *asoc, struct mbuf *mm; control->data = dmbuf; + control->tail_mbuf = NULL; for (mm = control->data; mm; mm = mm->m_next) { control->length += SCTP_BUF_LEN(mm); + if (SCTP_BUF_NEXT(mm) == NULL) { + control->tail_mbuf = mm; + } } - control->tail_mbuf = NULL; control->end_added = 1; control->last_frag_seen = 1; control->first_frag_seen = 1; @@ -3110,13 +3119,12 @@ sctp_process_segment_range(struct sctp_tcb *stcb, struct sctp_tmit_chunk **p_tp1 * update RTO too ? */ if (tp1->do_rtt) { - if (*rto_ok) { - tp1->whoTo->RTO = - sctp_calculate_rto(stcb, - &stcb->asoc, - tp1->whoTo, - &tp1->sent_rcv_time, - SCTP_RTT_FROM_DATA); + if (*rto_ok && + sctp_calculate_rto(stcb, + &stcb->asoc, + tp1->whoTo, + &tp1->sent_rcv_time, + SCTP_RTT_FROM_DATA)) { *rto_ok = 0; } if (tp1->whoTo->rto_needed == 0) { @@ -4088,16 +4096,12 @@ sctp_express_handle_sack(struct sctp_tcb *stcb, uint32_t cumack, /* update RTO too? 
*/ if (tp1->do_rtt) { - if (rto_ok) { - tp1->whoTo->RTO = - /* - * sa_ignore - * NO_NULL_CHK - */ - sctp_calculate_rto(stcb, - asoc, tp1->whoTo, - &tp1->sent_rcv_time, - SCTP_RTT_FROM_DATA); + if (rto_ok && + sctp_calculate_rto(stcb, + &stcb->asoc, + tp1->whoTo, + &tp1->sent_rcv_time, + SCTP_RTT_FROM_DATA)) { rto_ok = 0; } if (tp1->whoTo->rto_needed == 0) { @@ -4706,12 +4710,12 @@ hopeless_peer: /* update RTO too? */ if (tp1->do_rtt) { - if (rto_ok) { - tp1->whoTo->RTO = - sctp_calculate_rto(stcb, - asoc, tp1->whoTo, - &tp1->sent_rcv_time, - SCTP_RTT_FROM_DATA); + if (rto_ok && + sctp_calculate_rto(stcb, + &stcb->asoc, + tp1->whoTo, + &tp1->sent_rcv_time, + SCTP_RTT_FROM_DATA)) { rto_ok = 0; } if (tp1->whoTo->rto_needed == 0) { diff --git a/freebsd/sys/netinet/sctp_input.c b/freebsd/sys/netinet/sctp_input.c index 3f4e2f5f..4191d24c 100644 --- a/freebsd/sys/netinet/sctp_input.c +++ b/freebsd/sys/netinet/sctp_input.c @@ -467,6 +467,10 @@ sctp_process_init_ack(struct mbuf *m, int iphlen, int offset, if (!cookie_found) { uint16_t len; + /* Only report the missing cookie parameter */ + if (op_err != NULL) { + sctp_m_freem(op_err); + } len = (uint16_t)(sizeof(struct sctp_error_missing_param) + sizeof(uint16_t)); /* We abort with an error of missing mandatory param */ op_err = sctp_get_mbuf_for_msg(len, 0, M_NOWAIT, 1, MT_DATA); @@ -550,7 +554,7 @@ sctp_process_init_ack(struct mbuf *m, int iphlen, int offset, asoc->primary_destination, SCTP_FROM_SCTP_INPUT + SCTP_LOC_3); /* calculate the RTO */ - net->RTO = sctp_calculate_rto(stcb, asoc, net, &asoc->time_entered, + sctp_calculate_rto(stcb, asoc, net, &asoc->time_entered, SCTP_RTT_FROM_NON_DATA); retval = sctp_send_cookie_echo(m, offset, initack_limit, stcb, net); return (retval); @@ -650,7 +654,7 @@ sctp_handle_heartbeat_ack(struct sctp_heartbeat_chunk *cp, tv.tv_sec = cp->heartbeat.hb_info.time_value_1; tv.tv_usec = cp->heartbeat.hb_info.time_value_2; /* Now lets do a RTO with this */ - r_net->RTO = 
sctp_calculate_rto(stcb, &stcb->asoc, r_net, &tv, + sctp_calculate_rto(stcb, &stcb->asoc, r_net, &tv, SCTP_RTT_FROM_NON_DATA); if (!(r_net->dest_state & SCTP_ADDR_REACHABLE)) { r_net->dest_state |= SCTP_ADDR_REACHABLE; @@ -705,34 +709,37 @@ static int sctp_handle_nat_colliding_state(struct sctp_tcb *stcb) { /* - * return 0 means we want you to proceed with the abort non-zero - * means no abort processing + * Return 0 means we want you to proceed with the abort non-zero + * means no abort processing. */ + uint32_t new_vtag; struct sctpasochead *head; if ((SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_WAIT) || (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED)) { + new_vtag = sctp_select_a_tag(stcb->sctp_ep, stcb->sctp_ep->sctp_lport, stcb->rport, 1); atomic_add_int(&stcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(stcb); SCTP_INP_INFO_WLOCK(); SCTP_TCB_LOCK(stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); + } else { + return (0); } if (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_WAIT) { /* generate a new vtag and send init */ LIST_REMOVE(stcb, sctp_asocs); - stcb->asoc.my_vtag = sctp_select_a_tag(stcb->sctp_ep, stcb->sctp_ep->sctp_lport, stcb->rport, 1); + stcb->asoc.my_vtag = new_vtag; head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(stcb->asoc.my_vtag, SCTP_BASE_INFO(hashasocmark))]; /* * put it in the bucket in the vtag hash of assoc's for the * system */ LIST_INSERT_HEAD(head, stcb, sctp_asocs); - sctp_send_initiate(stcb->sctp_ep, stcb, SCTP_SO_NOT_LOCKED); SCTP_INP_INFO_WUNLOCK(); + sctp_send_initiate(stcb->sctp_ep, stcb, SCTP_SO_NOT_LOCKED); return (1); - } - if (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED) { + } else { /* * treat like a case where the cookie expired i.e.: - dump * current cookie. - generate a new vtag. - resend init. 
@@ -742,15 +749,15 @@ sctp_handle_nat_colliding_state(struct sctp_tcb *stcb) SCTP_SET_STATE(stcb, SCTP_STATE_COOKIE_WAIT); sctp_stop_all_cookie_timers(stcb); sctp_toss_old_cookies(stcb, &stcb->asoc); - stcb->asoc.my_vtag = sctp_select_a_tag(stcb->sctp_ep, stcb->sctp_ep->sctp_lport, stcb->rport, 1); + stcb->asoc.my_vtag = new_vtag; head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(stcb->asoc.my_vtag, SCTP_BASE_INFO(hashasocmark))]; /* * put it in the bucket in the vtag hash of assoc's for the * system */ LIST_INSERT_HEAD(head, stcb, sctp_asocs); - sctp_send_initiate(stcb->sctp_ep, stcb, SCTP_SO_NOT_LOCKED); SCTP_INP_INFO_WUNLOCK(); + sctp_send_initiate(stcb->sctp_ep, stcb, SCTP_SO_NOT_LOCKED); return (1); } return (0); @@ -1676,8 +1683,7 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, old.tv_sec = cookie->time_entered.tv_sec; old.tv_usec = cookie->time_entered.tv_usec; net->hb_responded = 1; - net->RTO = sctp_calculate_rto(stcb, asoc, net, - &old, + sctp_calculate_rto(stcb, asoc, net, &old, SCTP_RTT_FROM_NON_DATA); if (stcb->asoc.sctp_autoclose_ticks && @@ -2401,8 +2407,7 @@ sctp_process_cookie_new(struct mbuf *m, int iphlen, int offset, /* calculate the RTT and set the encaps port */ old.tv_sec = cookie->time_entered.tv_sec; old.tv_usec = cookie->time_entered.tv_usec; - (*netp)->RTO = sctp_calculate_rto(stcb, asoc, *netp, - &old, SCTP_RTT_FROM_NON_DATA); + sctp_calculate_rto(stcb, asoc, *netp, &old, SCTP_RTT_FROM_NON_DATA); } /* respond with a COOKIE-ACK */ sctp_send_cookie_ack(stcb); @@ -2978,8 +2983,7 @@ sctp_handle_cookie_ack(struct sctp_cookie_ack_chunk *cp SCTP_UNUSED, SCTP_STAT_INCR_COUNTER32(sctps_activeestab); SCTP_STAT_INCR_GAUGE32(sctps_currestab); if (asoc->overall_error_count == 0) { - net->RTO = sctp_calculate_rto(stcb, asoc, net, - &asoc->time_entered, + sctp_calculate_rto(stcb, asoc, net, &asoc->time_entered, SCTP_RTT_FROM_NON_DATA); } (void)SCTP_GETTIME_TIMEVAL(&asoc->time_entered); diff --git 
a/freebsd/sys/netinet/sctp_os_bsd.h b/freebsd/sys/netinet/sctp_os_bsd.h index abe8e2c9..3db2d5e2 100644 --- a/freebsd/sys/netinet/sctp_os_bsd.h +++ b/freebsd/sys/netinet/sctp_os_bsd.h @@ -97,9 +97,6 @@ __FBSDID("$FreeBSD$"); #include <crypto/sha1.h> #include <crypto/sha2/sha256.h> -#ifndef in6pcb -#define in6pcb inpcb -#endif /* Declare all the malloc names for all the various mallocs */ MALLOC_DECLARE(SCTP_M_MAP); MALLOC_DECLARE(SCTP_M_STRMI); @@ -368,7 +365,7 @@ typedef struct callout sctp_os_timer_t; */ /* get the v6 hop limit */ -#define SCTP_GET_HLIM(inp, ro) in6_selecthlim((struct in6pcb *)&inp->ip_inp.inp, (ro ? (ro->ro_rt ? (ro->ro_rt->rt_ifp) : (NULL)) : (NULL))); +#define SCTP_GET_HLIM(inp, ro) in6_selecthlim((struct inpcb *)&inp->ip_inp.inp, (ro ? (ro->ro_rt ? (ro->ro_rt->rt_ifp) : (NULL)) : (NULL))); /* is the endpoint v6only? */ #define SCTP_IPV6_V6ONLY(inp) (((struct inpcb *)inp)->inp_flags & IN6P_IPV6_V6ONLY) @@ -431,7 +428,7 @@ typedef struct rtentry sctp_rtentry_t; m_clrprotoflags(o_pak); \ if (local_stcb && local_stcb->sctp_ep) \ result = ip6_output(o_pak, \ - ((struct in6pcb *)(local_stcb->sctp_ep))->in6p_outputopts, \ + ((struct inpcb *)(local_stcb->sctp_ep))->in6p_outputopts, \ (ro), 0, 0, ifp, NULL); \ else \ result = ip6_output(o_pak, NULL, (ro), 0, 0, ifp, NULL); \ diff --git a/freebsd/sys/netinet/sctp_output.c b/freebsd/sys/netinet/sctp_output.c index 9221080d..522825da 100644 --- a/freebsd/sys/netinet/sctp_output.c +++ b/freebsd/sys/netinet/sctp_output.c @@ -4338,7 +4338,7 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp, * at the SCTP layer. So use the value from * the IP layer. */ - flowlabel = ntohl(((struct in6pcb *)inp)->in6p_flowinfo); + flowlabel = ntohl(((struct inpcb *)inp)->inp_flow); } flowlabel &= 0x000fffff; len = SCTP_MIN_OVERHEAD; @@ -4393,7 +4393,7 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp, * at the SCTP layer. So use the value from * the IP layer. 
*/ - tos_value = (ntohl(((struct in6pcb *)inp)->in6p_flowinfo) >> 20) & 0xff; + tos_value = (ntohl(((struct inpcb *)inp)->inp_flow) >> 20) & 0xff; } tos_value &= 0xfc; if (ecn_ok) { @@ -7874,8 +7874,8 @@ sctp_med_chunk_output(struct sctp_inpcb *inp, int bundle_at, ctl_cnt, no_data_chunks, eeor_mode; unsigned int mtu, r_mtu, omtu, mx_mtu, to_out; int tsns_sent = 0; - uint32_t auth_offset = 0; - struct sctp_auth_chunk *auth = NULL; + uint32_t auth_offset; + struct sctp_auth_chunk *auth; uint16_t auth_keyid; int override_ok = 1; int skip_fill_up = 0; @@ -8070,6 +8070,8 @@ again_one_more_time: } bundle_at = 0; endoutchain = outchain = NULL; + auth = NULL; + auth_offset = 0; no_fragmentflg = 1; one_chunk = 0; if (net->dest_state & SCTP_ADDR_UNCONFIRMED) { @@ -9061,8 +9063,7 @@ sctp_send_cookie_echo(struct mbuf *m, pad = 4 - pad; } if (pad > 0) { - cookie = sctp_pad_lastmbuf(cookie, pad, NULL); - if (cookie == NULL) { + if (sctp_pad_lastmbuf(cookie, pad, NULL) == NULL) { return (-8); } } diff --git a/freebsd/sys/netinet/sctp_pcb.c b/freebsd/sys/netinet/sctp_pcb.c index 10e4768e..c72cb5a9 100644 --- a/freebsd/sys/netinet/sctp_pcb.c +++ b/freebsd/sys/netinet/sctp_pcb.c @@ -49,7 +49,6 @@ __FBSDID("$FreeBSD$"); #include <netinet/sctp_output.h> #include <netinet/sctp_timer.h> #include <netinet/sctp_bsd_addr.h> -#include <netinet/sctp_dtrace_define.h> #if defined(INET) || defined(INET6) #include <netinet/udp.h> #endif @@ -3647,12 +3646,8 @@ sctp_inpcb_free(struct sctp_inpcb *inp, int immediate, int from) #ifdef INET6 - if (ip_pcb->inp_vflag & INP_IPV6) { - struct in6pcb *in6p; - - in6p = (struct in6pcb *)inp; - ip6_freepcbopts(in6p->in6p_outputopts); - } + if (ip_pcb->inp_vflag & INP_IPV6) + ip6_freepcbopts(((struct inpcb *)inp)->in6p_outputopts); #endif /* INET6 */ ip_pcb->inp_vflag = 0; /* free up authentication fields */ diff --git a/freebsd/sys/netinet/sctp_pcb.h b/freebsd/sys/netinet/sctp_pcb.h index 0f5aca88..cbe51c7d 100644 --- a/freebsd/sys/netinet/sctp_pcb.h +++ 
b/freebsd/sys/netinet/sctp_pcb.h @@ -362,7 +362,7 @@ struct sctp_inpcb { */ union { struct inpcb inp; - char align[(sizeof(struct in6pcb) + SCTP_ALIGNM1) & + char align[(sizeof(struct inpcb) + SCTP_ALIGNM1) & ~SCTP_ALIGNM1]; } ip_inp; diff --git a/freebsd/sys/netinet/sctp_usrreq.c b/freebsd/sys/netinet/sctp_usrreq.c index 01759156..0783462a 100644 --- a/freebsd/sys/netinet/sctp_usrreq.c +++ b/freebsd/sys/netinet/sctp_usrreq.c @@ -1414,10 +1414,8 @@ sctp_do_connect_x(struct socket *so, struct sctp_inpcb *inp, void *optval, } if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) && (num_v4 > 0)) { - struct in6pcb *inp6; - inp6 = (struct in6pcb *)inp; - if (SCTP_IPV6_V6ONLY(inp6)) { + if (SCTP_IPV6_V6ONLY(inp)) { /* * if IPV6_V6ONLY flag, ignore connections destined * to a v4 addr or v4-mapped addr @@ -6918,14 +6916,14 @@ sctp_connect(struct socket *so, struct sockaddr *addr, struct thread *p) #ifdef INET6 case AF_INET6: { - struct sockaddr_in6 *sin6p; + struct sockaddr_in6 *sin6; if (addr->sa_len != sizeof(struct sockaddr_in6)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (EINVAL); } - sin6p = (struct sockaddr_in6 *)addr; - if (p != NULL && (error = prison_remote_ip6(p->td_ucred, &sin6p->sin6_addr)) != 0) { + sin6 = (struct sockaddr_in6 *)addr; + if (p != NULL && (error = prison_remote_ip6(p->td_ucred, &sin6->sin6_addr)) != 0) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); return (error); } diff --git a/freebsd/sys/netinet/sctputil.c b/freebsd/sys/netinet/sctputil.c index c7d4499c..6ae999b0 100644 --- a/freebsd/sys/netinet/sctputil.c +++ b/freebsd/sys/netinet/sctputil.c @@ -2471,25 +2471,24 @@ sctp_mtu_size_reset(struct sctp_inpcb *inp, /* - * given an association and starting time of the current RTT period return - * RTO in number of msecs net should point to the current network + * Given an association and starting time of the current RTT period, update + * RTO in number of msecs. 
net should point to the current network. + * Return 1, if an RTO update was performed, return 0 if no update was + * performed due to invalid starting point. */ -uint32_t +int sctp_calculate_rto(struct sctp_tcb *stcb, struct sctp_association *asoc, struct sctp_nets *net, struct timeval *old, int rtt_from_sack) { - /*- - * given an association and the starting time of the current RTT - * period (in value1/value2) return RTO in number of msecs. - */ + struct timeval now; + uint64_t rtt_us; /* RTT in us */ int32_t rtt; /* RTT in ms */ uint32_t new_rto; int first_measure = 0; - struct timeval now; /************************/ /* 1. calculate new RTT */ @@ -2500,10 +2499,19 @@ sctp_calculate_rto(struct sctp_tcb *stcb, } else { (void)SCTP_GETTIME_TIMEVAL(&now); } + if ((old->tv_sec > now.tv_sec) || + ((old->tv_sec == now.tv_sec) && (old->tv_usec > now.tv_usec))) { + /* The starting point is in the future. */ + return (0); + } timevalsub(&now, old); + rtt_us = (uint64_t)1000000 * (uint64_t)now.tv_sec + (uint64_t)now.tv_usec; + if (rtt_us > SCTP_RTO_UPPER_BOUND * 1000) { + /* The RTT is larger than a sane value. */ + return (0); + } /* store the current RTT in us */ - net->rtt = (uint64_t)1000000 * (uint64_t)now.tv_sec + - (uint64_t)now.tv_usec; + net->rtt = rtt_us; /* compute rtt in ms */ rtt = (int32_t)(net->rtt / 1000); if ((asoc->cc_functions.sctp_rtt_calculated) && (rtt_from_sack == SCTP_RTT_FROM_DATA)) { @@ -2535,7 +2543,7 @@ sctp_calculate_rto(struct sctp_tcb *stcb, * Paper "Congestion Avoidance and Control", Annex A. 
* * (net->lastsa >> SCTP_RTT_SHIFT) is the srtt - * (net->lastsa >> SCTP_RTT_VAR_SHIFT) is the rttvar + * (net->lastsv >> SCTP_RTT_VAR_SHIFT) is the rttvar */ if (net->RTO_measured) { rtt -= (net->lastsa >> SCTP_RTT_SHIFT); @@ -2576,8 +2584,8 @@ sctp_calculate_rto(struct sctp_tcb *stcb, if (new_rto > stcb->asoc.maxrto) { new_rto = stcb->asoc.maxrto; } - /* we are now returning the RTO */ - return (new_rto); + net->RTO = new_rto; + return (1); } /* diff --git a/freebsd/sys/netinet/sctputil.h b/freebsd/sys/netinet/sctputil.h index 690e6125..c67c021f 100644 --- a/freebsd/sys/netinet/sctputil.h +++ b/freebsd/sys/netinet/sctputil.h @@ -133,7 +133,7 @@ uint32_t sctp_get_next_mtu(uint32_t); void sctp_timeout_handler(void *); -uint32_t +int sctp_calculate_rto(struct sctp_tcb *, struct sctp_association *, struct sctp_nets *, struct timeval *, int); diff --git a/freebsd/sys/netinet/tcp_input.c b/freebsd/sys/netinet/tcp_input.c index 05891306..fc111d9c 100644 --- a/freebsd/sys/netinet/tcp_input.c +++ b/freebsd/sys/netinet/tcp_input.c @@ -131,9 +131,9 @@ __FBSDID("$FreeBSD$"); const int tcprexmtthresh = 3; -int tcp_log_in_vain = 0; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW, - &tcp_log_in_vain, 0, +VNET_DEFINE(int, tcp_log_in_vain) = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_VNET | CTLFLAG_RW, + &VNET_NAME(tcp_log_in_vain), 0, "Log all incoming TCP segments to closed ports"); VNET_DEFINE(int, blackhole) = 0; @@ -536,11 +536,19 @@ cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, uint8_t iptos) int tcp6_input(struct mbuf **mp, int *offp, int proto) { - struct mbuf *m = *mp; + struct mbuf *m; struct in6_ifaddr *ia6; struct ip6_hdr *ip6; - IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE); + m = *mp; + if (m->m_len < *offp + sizeof(struct tcphdr)) { + m = m_pullup(m, *offp + sizeof(struct tcphdr)); + if (m == NULL) { + *mp = m; + TCPSTAT_INC(tcps_rcvshort); + return (IPPROTO_DONE); + } + } /* * 
draft-itojun-ipv6-tcp-to-anycast @@ -549,17 +557,17 @@ tcp6_input(struct mbuf **mp, int *offp, int proto) ip6 = mtod(m, struct ip6_hdr *); ia6 = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */); if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) { - struct ip6_hdr *ip6; ifa_free(&ia6->ia_ifa); - ip6 = mtod(m, struct ip6_hdr *); icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, (caddr_t)&ip6->ip6_dst - (caddr_t)ip6); + *mp = NULL; return (IPPROTO_DONE); } if (ia6) ifa_free(&ia6->ia_ifa); + *mp = m; return (tcp_input(mp, offp, proto)); } #endif /* INET6 */ @@ -618,15 +626,6 @@ tcp_input(struct mbuf **mp, int *offp, int proto) #ifdef INET6 if (isipv6) { - /* IP6_EXTHDR_CHECK() is already done at tcp6_input(). */ - - if (m->m_len < (sizeof(*ip6) + sizeof(*th))) { - m = m_pullup(m, sizeof(*ip6) + sizeof(*th)); - if (m == NULL) { - TCPSTAT_INC(tcps_rcvshort); - return (IPPROTO_DONE); - } - } ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)((caddr_t)ip6 + off0); @@ -735,7 +734,13 @@ tcp_input(struct mbuf **mp, int *offp, int proto) if (off > sizeof (struct tcphdr)) { #ifdef INET6 if (isipv6) { - IP6_EXTHDR_CHECK(m, off0, off, IPPROTO_DONE); + if (m->m_len < off0 + off) { + m = m_pullup(m, off0 + off); + if (m == NULL) { + TCPSTAT_INC(tcps_rcvshort); + return (IPPROTO_DONE); + } + } ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)((caddr_t)ip6 + off0); } @@ -883,8 +888,8 @@ findpcb: * Log communication attempts to ports that are not * in use. 
*/ - if ((tcp_log_in_vain == 1 && (thflags & TH_SYN)) || - tcp_log_in_vain == 2) { + if ((V_tcp_log_in_vain == 1 && (thflags & TH_SYN)) || + V_tcp_log_in_vain == 2) { if ((s = tcp_log_vain(NULL, th, (void *)ip, ip6))) log(LOG_INFO, "%s; %s: Connection attempt " "to closed port\n", s, __func__); diff --git a/freebsd/sys/netinet/tcp_output.c b/freebsd/sys/netinet/tcp_output.c index 3e024fdb..dc75c68d 100644 --- a/freebsd/sys/netinet/tcp_output.c +++ b/freebsd/sys/netinet/tcp_output.c @@ -933,6 +933,20 @@ send: if (tp->t_flags & TF_NEEDFIN) sendalot = 1; } else { + if (optlen + ipoptlen >= tp->t_maxseg) { + /* + * Since we don't have enough space to put + * the IP header chain and the TCP header in + * one packet as required by RFC 7112, don't + * send it. Also ensure that at least one + * byte of the payload can be put into the + * TCP segment. + */ + SOCKBUF_UNLOCK(&so->so_snd); + error = EMSGSIZE; + sack_rxmit = 0; + goto out; + } len = tp->t_maxseg - optlen - ipoptlen; sendalot = 1; if (dont_sendalot) diff --git a/freebsd/sys/netinet/tcp_subr.c b/freebsd/sys/netinet/tcp_subr.c index 44ec38c7..eae696c1 100644 --- a/freebsd/sys/netinet/tcp_subr.c +++ b/freebsd/sys/netinet/tcp_subr.c @@ -3114,7 +3114,7 @@ tcp_log_vain(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, { /* Is logging enabled? 
*/ - if (tcp_log_in_vain == 0) + if (V_tcp_log_in_vain == 0) return (NULL); return (tcp_log_addr(inc, th, ip4hdr, ip6hdr)); diff --git a/freebsd/sys/netinet/tcp_timer.c b/freebsd/sys/netinet/tcp_timer.c index cf6ceff5..e1b9ec59 100644 --- a/freebsd/sys/netinet/tcp_timer.c +++ b/freebsd/sys/netinet/tcp_timer.c @@ -127,9 +127,10 @@ SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT|CTLFLAG_RW, &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I", "Retransmission Timer Slop"); -int tcp_always_keepalive = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW, - &tcp_always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections"); +VNET_DEFINE(int, tcp_always_keepalive) = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_VNET|CTLFLAG_RW, + &VNET_NAME(tcp_always_keepalive) , 0, + "Assume SO_KEEPALIVE on all TCP connections"); int tcp_fast_finwait2_recycle = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW, @@ -433,7 +434,7 @@ tcp_timer_keep(void *xtp) TCPSTAT_INC(tcps_keeptimeo); if (tp->t_state < TCPS_ESTABLISHED) goto dropit; - if ((tcp_always_keepalive || + if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && tp->t_state <= TCPS_CLOSING) { if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) diff --git a/freebsd/sys/netinet/tcp_timer.h b/freebsd/sys/netinet/tcp_timer.h index 3e985bdf..fe3616c2 100644 --- a/freebsd/sys/netinet/tcp_timer.h +++ b/freebsd/sys/netinet/tcp_timer.h @@ -203,10 +203,11 @@ extern int tcp_backoff[]; extern int tcp_totbackoff; extern int tcp_rexmit_drop_options; -extern int tcp_always_keepalive; extern int tcp_finwait2_timeout; extern int tcp_fast_finwait2_recycle; +VNET_DECLARE(int, tcp_always_keepalive); +#define V_tcp_always_keepalive VNET(tcp_always_keepalive) VNET_DECLARE(int, tcp_pmtud_blackhole_detect); #define V_tcp_pmtud_blackhole_detect VNET(tcp_pmtud_blackhole_detect) VNET_DECLARE(int, tcp_pmtud_blackhole_mss); diff --git 
a/freebsd/sys/netinet/tcp_usrreq.c b/freebsd/sys/netinet/tcp_usrreq.c index 809ea35d..eab13eeb 100644 --- a/freebsd/sys/netinet/tcp_usrreq.c +++ b/freebsd/sys/netinet/tcp_usrreq.c @@ -346,23 +346,25 @@ tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; - struct sockaddr_in6 *sin6p; + struct sockaddr_in6 *sin6; + u_char vflagsav; - sin6p = (struct sockaddr_in6 *)nam; - if (nam->sa_len != sizeof (*sin6p)) + sin6 = (struct sockaddr_in6 *)nam; + if (nam->sa_len != sizeof (*sin6)) return (EINVAL); /* * Must check for multicast addresses and disallow binding * to them. */ - if (sin6p->sin6_family == AF_INET6 && - IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) + if (sin6->sin6_family == AF_INET6 && + IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) return (EAFNOSUPPORT); TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp6_usr_bind: inp == NULL")); INP_WLOCK(inp); + vflagsav = inp->inp_vflag; if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { error = EINVAL; goto out; @@ -374,12 +376,12 @@ tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) inp->inp_vflag |= INP_IPV6; #ifdef INET if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { - if (IN6_IS_ADDR_UNSPECIFIED(&sin6p->sin6_addr)) + if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) inp->inp_vflag |= INP_IPV4; - else if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) { + else if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { struct sockaddr_in sin; - in6_sin6_2_sin(&sin, sin6p); + in6_sin6_2_sin(&sin, sin6); if (IN_MULTICAST(ntohl(sin.sin_addr.s_addr))) { error = EAFNOSUPPORT; INP_HASH_WUNLOCK(&V_tcbinfo); @@ -397,6 +399,8 @@ tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) error = in6_pcbbind(inp, nam, td->td_ucred); INP_HASH_WUNLOCK(&V_tcbinfo); out: + if (error != 0) + inp->inp_vflag = vflagsav; TCPDEBUG2(PRU_BIND); TCP_PROBE2(debug__user, tp, PRU_BIND); INP_WUNLOCK(inp); @@ -459,6 +463,7 @@ tcp6_usr_listen(struct 
socket *so, int backlog, struct thread *td) int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; + u_char vflagsav; TCPDEBUG0; inp = sotoinpcb(so); @@ -468,6 +473,7 @@ tcp6_usr_listen(struct socket *so, int backlog, struct thread *td) error = EINVAL; goto out; } + vflagsav = inp->inp_vflag; tp = intotcpcb(inp); TCPDEBUG1(); SOCK_LOCK(so); @@ -493,6 +499,9 @@ tcp6_usr_listen(struct socket *so, int backlog, struct thread *td) if (IS_FASTOPEN(tp->t_flags)) tp->t_tfo_pending = tcp_fastopen_alloc_counter(); + if (error != 0) + inp->inp_vflag = vflagsav; + out: TCPDEBUG2(PRU_LISTEN); TCP_PROBE2(debug__user, tp, PRU_LISTEN); @@ -568,23 +577,27 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; - struct sockaddr_in6 *sin6p; + struct sockaddr_in6 *sin6; + u_int8_t incflagsav; + u_char vflagsav; TCPDEBUG0; - sin6p = (struct sockaddr_in6 *)nam; - if (nam->sa_len != sizeof (*sin6p)) + sin6 = (struct sockaddr_in6 *)nam; + if (nam->sa_len != sizeof (*sin6)) return (EINVAL); /* * Must disallow TCP ``connections'' to multicast addresses. */ - if (sin6p->sin6_family == AF_INET6 - && IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) + if (sin6->sin6_family == AF_INET6 + && IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) return (EAFNOSUPPORT); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp6_usr_connect: inp == NULL")); INP_WLOCK(inp); + vflagsav = inp->inp_vflag; + incflagsav = inp->inp_inc.inc_flags; if (inp->inp_flags & INP_TIMEWAIT) { error = EADDRINUSE; goto out; @@ -601,7 +614,7 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) * therefore probably require the hash lock, which isn't held here. * Is this a significant problem? 
*/ - if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) { + if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { struct sockaddr_in sin; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) { @@ -613,16 +626,16 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) goto out; } - in6_sin6_2_sin(&sin, sin6p); + in6_sin6_2_sin(&sin, sin6); if (IN_MULTICAST(ntohl(sin.sin_addr.s_addr))) { error = EAFNOSUPPORT; goto out; } - inp->inp_vflag |= INP_IPV4; - inp->inp_vflag &= ~INP_IPV6; if ((error = prison_remote_ip4(td->td_ucred, &sin.sin_addr)) != 0) goto out; + inp->inp_vflag |= INP_IPV4; + inp->inp_vflag &= ~INP_IPV6; if ((error = tcp_connect(tp, (struct sockaddr *)&sin, td)) != 0) goto out; #ifdef TCP_OFFLOAD @@ -640,11 +653,11 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) } } #endif + if ((error = prison_remote_ip6(td->td_ucred, &sin6->sin6_addr)) != 0) + goto out; inp->inp_vflag &= ~INP_IPV4; inp->inp_vflag |= INP_IPV6; inp->inp_inc.inc_flags |= INC_ISIPV6; - if ((error = prison_remote_ip6(td->td_ucred, &sin6p->sin6_addr)) != 0) - goto out; if ((error = tcp6_connect(tp, nam, td)) != 0) goto out; #ifdef TCP_OFFLOAD @@ -657,6 +670,15 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) error = tp->t_fb->tfb_tcp_output(tp); out: + /* + * If the implicit bind in the connect call fails, restore + * the flags we modified. 
+ */ + if (error != 0 && inp->inp_lport == 0) { + inp->inp_vflag = vflagsav; + inp->inp_inc.inc_flags = incflagsav; + } + TCPDEBUG2(PRU_CONNECT); TCP_PROBE2(debug__user, tp, PRU_CONNECT); INP_WUNLOCK(inp); @@ -912,6 +934,9 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, #ifdef INET6 int isipv6; #endif + u_int8_t incflagsav; + u_char vflagsav; + bool restoreflags; TCPDEBUG0; /* @@ -923,6 +948,9 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_send: inp == NULL")); INP_WLOCK(inp); + vflagsav = inp->inp_vflag; + incflagsav = inp->inp_inc.inc_flags; + restoreflags = false; if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { if (control) m_freem(control); @@ -974,22 +1002,22 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, #ifdef INET6 case AF_INET6: { - struct sockaddr_in6 *sin6p; + struct sockaddr_in6 *sin6; - sin6p = (struct sockaddr_in6 *)nam; - if (sin6p->sin6_len != sizeof(struct sockaddr_in6)) { + sin6 = (struct sockaddr_in6 *)nam; + if (sin6->sin6_len != sizeof(*sin6)) { if (m) m_freem(m); error = EINVAL; goto out; } - if (IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) { + if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) { if (m) m_freem(m); error = EAFNOSUPPORT; goto out; } - if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) { + if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { #ifdef INET if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) { error = EINVAL; @@ -1003,9 +1031,10 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, m_freem(m); goto out; } + restoreflags = true; inp->inp_vflag &= ~INP_IPV6; sinp = &sin; - in6_sin6_2_sin(sinp, sin6p); + in6_sin6_2_sin(sinp, sin6); if (IN_MULTICAST( ntohl(sinp->sin_addr.s_addr))) { error = EAFNOSUPPORT; @@ -1033,10 +1062,11 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, error = EAFNOSUPPORT; goto out; } + restoreflags = true; inp->inp_vflag &= ~INP_IPV4; inp->inp_inc.inc_flags |= INC_ISIPV6; if ((error = 
prison_remote_ip6(td->td_ucred, - &sin6p->sin6_addr))) { + &sin6->sin6_addr))) { if (m) m_freem(m); goto out; @@ -1083,6 +1113,14 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, error = tcp_connect(tp, (struct sockaddr *)sinp, td); #endif + /* + * The bind operation in tcp_connect succeeded. We + * no longer want to restore the flags if later + * operations fail. + */ + if (error == 0 || inp->inp_lport != 0) + restoreflags = false; + if (error) goto out; if (IS_FASTOPEN(tp->t_flags)) @@ -1153,6 +1191,14 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, error = tcp_connect(tp, (struct sockaddr *)sinp, td); #endif + /* + * The bind operation in tcp_connect succeeded. We + * no longer want to restore the flags if later + * operations fail. + */ + if (error == 0 || inp->inp_lport != 0) + restoreflags = false; + if (error) goto out; tp->snd_wnd = TTCP_CLIENT_SND_WND; @@ -1171,6 +1217,14 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, TCP_LOG_USERSEND, error, 0, NULL, false); out: + /* + * If the request was unsuccessful and we changed flags, + * restore the original flags. + */ + if (error != 0 && restoreflags) { + inp->inp_vflag = vflagsav; + inp->inp_inc.inc_flags = incflagsav; + } TCPDEBUG2((flags & PRUS_OOB) ? PRU_SENDOOB : ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND)); TCP_PROBE2(debug__user, tp, (flags & PRUS_OOB) ? PRU_SENDOOB : diff --git a/freebsd/sys/netinet/tcp_var.h b/freebsd/sys/netinet/tcp_var.h index cca8623e..13d20294 100644 --- a/freebsd/sys/netinet/tcp_var.h +++ b/freebsd/sys/netinet/tcp_var.h @@ -745,7 +745,8 @@ SYSCTL_DECL(_net_inet_tcp_sack); MALLOC_DECLARE(M_TCPLOG); #endif -extern int tcp_log_in_vain; +VNET_DECLARE(int, tcp_log_in_vain); +#define V_tcp_log_in_vain VNET(tcp_log_in_vain) /* * Global TCP tunables shared between different stacks. 
diff --git a/freebsd/sys/netinet/udp_usrreq.c b/freebsd/sys/netinet/udp_usrreq.c index f89660d6..8462d0ee 100644 --- a/freebsd/sys/netinet/udp_usrreq.c +++ b/freebsd/sys/netinet/udp_usrreq.c @@ -122,9 +122,9 @@ VNET_DEFINE(int, udp_cksum) = 1; SYSCTL_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(udp_cksum), 0, "compute udp checksum"); -int udp_log_in_vain = 0; -SYSCTL_INT(_net_inet_udp, OID_AUTO, log_in_vain, CTLFLAG_RW, - &udp_log_in_vain, 0, "Log all incoming UDP packets"); +VNET_DEFINE(int, udp_log_in_vain) = 0; +SYSCTL_INT(_net_inet_udp, OID_AUTO, log_in_vain, CTLFLAG_VNET | CTLFLAG_RW, + &VNET_NAME(udp_log_in_vain), 0, "Log all incoming UDP packets"); VNET_DEFINE(int, udp_blackhole) = 0; SYSCTL_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_VNET | CTLFLAG_RW, @@ -427,14 +427,13 @@ udp_input(struct mbuf **mp, int *offp, int proto) /* * Get IP and UDP header together in first mbuf. */ - ip = mtod(m, struct ip *); if (m->m_len < iphlen + sizeof(struct udphdr)) { if ((m = m_pullup(m, iphlen + sizeof(struct udphdr))) == NULL) { UDPSTAT_INC(udps_hdrops); return (IPPROTO_DONE); } - ip = mtod(m, struct ip *); } + ip = mtod(m, struct ip *); uh = (struct udphdr *)((caddr_t)ip + iphlen); cscov_partial = (proto == IPPROTO_UDPLITE) ? 
1 : 0; @@ -695,7 +694,7 @@ udp_input(struct mbuf **mp, int *offp, int proto) ip->ip_dst, uh->uh_dport, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, ifp, m); if (inp == NULL) { - if (udp_log_in_vain) { + if (V_udp_log_in_vain) { char src[INET_ADDRSTRLEN]; char dst[INET_ADDRSTRLEN]; diff --git a/freebsd/sys/netinet/udp_var.h b/freebsd/sys/netinet/udp_var.h index 01545582..ecca2a54 100644 --- a/freebsd/sys/netinet/udp_var.h +++ b/freebsd/sys/netinet/udp_var.h @@ -153,9 +153,10 @@ extern u_long udp_sendspace; extern u_long udp_recvspace; VNET_DECLARE(int, udp_cksum); VNET_DECLARE(int, udp_blackhole); +VNET_DECLARE(int, udp_log_in_vain); #define V_udp_cksum VNET(udp_cksum) #define V_udp_blackhole VNET(udp_blackhole) -extern int udp_log_in_vain; +#define V_udp_log_in_vain VNET(udp_log_in_vain) static __inline struct inpcbinfo * udp_get_inpcbinfo(int protocol) diff --git a/freebsd/sys/netinet6/dest6.c b/freebsd/sys/netinet6/dest6.c index 50a836ba..354457e2 100644 --- a/freebsd/sys/netinet6/dest6.c +++ b/freebsd/sys/netinet6/dest6.c @@ -66,30 +66,35 @@ __FBSDID("$FreeBSD$"); int dest6_input(struct mbuf **mp, int *offp, int proto) { - struct mbuf *m = *mp; - int off = *offp, dstoptlen, optlen; + struct mbuf *m; + int off, dstoptlen, optlen; struct ip6_dest *dstopts; u_int8_t *opt; - /* validation of the length of the header */ -#ifndef PULLDOWN_TEST - IP6_EXTHDR_CHECK(m, off, sizeof(*dstopts), IPPROTO_DONE); + m = *mp; + off = *offp; + + /* Validation of the length of the header. 
*/ + if (m->m_len < off + sizeof(*dstopts)) { + m = m_pullup(m, off + sizeof(*dstopts)); + if (m == NULL) { + IP6STAT_INC(ip6s_exthdrtoolong); + *mp = m; + return (IPPROTO_DONE); + } + } dstopts = (struct ip6_dest *)(mtod(m, caddr_t) + off); -#else - IP6_EXTHDR_GET(dstopts, struct ip6_dest *, m, off, sizeof(*dstopts)); - if (dstopts == NULL) - return IPPROTO_DONE; -#endif dstoptlen = (dstopts->ip6d_len + 1) << 3; -#ifndef PULLDOWN_TEST - IP6_EXTHDR_CHECK(m, off, dstoptlen, IPPROTO_DONE); + if (m->m_len < off + dstoptlen) { + m = m_pullup(m, off + dstoptlen); + if (m == NULL) { + IP6STAT_INC(ip6s_exthdrtoolong); + *mp = m; + return (IPPROTO_DONE); + } + } dstopts = (struct ip6_dest *)(mtod(m, caddr_t) + off); -#else - IP6_EXTHDR_GET(dstopts, struct ip6_dest *, m, off, dstoptlen); - if (dstopts == NULL) - return IPPROTO_DONE; -#endif off += dstoptlen; dstoptlen -= sizeof(struct ip6_dest); opt = (u_int8_t *)dstopts + sizeof(struct ip6_dest); @@ -112,17 +117,21 @@ dest6_input(struct mbuf **mp, int *offp, int proto) default: /* unknown option */ optlen = ip6_unknown_opt(opt, m, opt - mtod(m, u_int8_t *)); - if (optlen == -1) + if (optlen == -1) { + *mp = NULL; return (IPPROTO_DONE); + } optlen += 2; break; } } *offp = off; + *mp = m; return (dstopts->ip6d_nxt); bad: m_freem(m); + *mp = NULL; return (IPPROTO_DONE); } diff --git a/freebsd/sys/netinet6/frag6.c b/freebsd/sys/netinet6/frag6.c index 0b0c7b91..443c684a 100644 --- a/freebsd/sys/netinet6/frag6.c +++ b/freebsd/sys/netinet6/frag6.c @@ -5,6 +5,7 @@ * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. + * Copyright (c) 2019 Netflix, Inc. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -40,20 +41,18 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/systm.h> +#include <sys/domain.h> +#include <sys/eventhandler.h> #include <sys/hash.h> +#include <sys/kernel.h> #include <sys/malloc.h> #include <sys/mbuf.h> -#include <sys/domain.h> -#include <sys/eventhandler.h> #include <sys/protosw.h> +#include <sys/queue.h> #include <sys/socket.h> -#include <sys/errno.h> -#include <sys/time.h> -#include <sys/kernel.h> +#include <sys/sysctl.h> #include <sys/syslog.h> -#include <machine/atomic.h> - #include <net/if.h> #include <net/if_var.h> #include <net/netisr.h> @@ -65,48 +64,85 @@ __FBSDID("$FreeBSD$"); #include <netinet/ip6.h> #include <netinet6/ip6_var.h> #include <netinet/icmp6.h> -#include <netinet/in_systm.h> /* for ECN definitions */ -#include <netinet/ip.h> /* for ECN definitions */ +#include <netinet/in_systm.h> /* For ECN definitions. */ +#include <netinet/ip.h> /* For ECN definitions. */ +#ifdef MAC #include <security/mac/mac_framework.h> +#endif /* - * Reassembly headers are stored in hash buckets. + * A "big picture" of how IPv6 fragment queues are all linked together. + * + * struct ip6qbucket ip6qb[...]; hashed buckets + * |||||||| + * | + * +--- TAILQ(struct ip6q, packets) *q6; tailq entries holding + * |||||||| fragmented packets + * | (1 per original packet) + * | + * +--- TAILQ(struct ip6asfrag, ip6q_frags) *af6; tailq entries of IPv6 + * | *ip6af;fragment packets + * | for one original packet + * + *mbuf */ + +/* Reassembly headers are stored in hash buckets. 
*/ #define IP6REASS_NHASH_LOG2 10 #define IP6REASS_NHASH (1 << IP6REASS_NHASH_LOG2) #define IP6REASS_HMASK (IP6REASS_NHASH - 1) -static void frag6_enq(struct ip6asfrag *, struct ip6asfrag *, - uint32_t bucket __unused); -static void frag6_deq(struct ip6asfrag *, uint32_t bucket __unused); -static void frag6_insque_head(struct ip6q *, struct ip6q *, - uint32_t bucket); -static void frag6_remque(struct ip6q *, uint32_t bucket); -static void frag6_freef(struct ip6q *, uint32_t bucket); - +TAILQ_HEAD(ip6qhead, ip6q); struct ip6qbucket { - struct ip6q ip6q; + struct ip6qhead packets; struct mtx lock; int count; }; -VNET_DEFINE_STATIC(volatile u_int, frag6_nfragpackets); -volatile u_int frag6_nfrags = 0; -VNET_DEFINE_STATIC(struct ip6qbucket, ip6q[IP6REASS_NHASH]); -VNET_DEFINE_STATIC(uint32_t, ip6q_hashseed); +struct ip6asfrag { + TAILQ_ENTRY(ip6asfrag) ip6af_tq; + struct mbuf *ip6af_m; + int ip6af_offset; /* Offset in ip6af_m to next header. */ + int ip6af_frglen; /* Fragmentable part length. */ + int ip6af_off; /* Fragment offset. */ + bool ip6af_mff; /* More fragment bit in frag off. */ +}; + +static MALLOC_DEFINE(M_FRAG6, "frag6", "IPv6 fragment reassembly header"); + +#ifdef VIMAGE +/* A flag to indicate if IPv6 fragmentation is initialized. */ +VNET_DEFINE_STATIC(bool, frag6_on); +#define V_frag6_on VNET(frag6_on) +#endif + +/* System wide (global) maximum and count of packets in reassembly queues. */ +static int ip6_maxfrags; +static volatile u_int frag6_nfrags = 0; +/* Maximum and current packets in per-VNET reassembly queue. 
*/ +VNET_DEFINE_STATIC(int, ip6_maxfragpackets); +VNET_DEFINE_STATIC(volatile u_int, frag6_nfragpackets); +#define V_ip6_maxfragpackets VNET(ip6_maxfragpackets) #define V_frag6_nfragpackets VNET(frag6_nfragpackets) -#define V_ip6q VNET(ip6q) -#define V_ip6q_hashseed VNET(ip6q_hashseed) -#define IP6Q_LOCK(i) mtx_lock(&V_ip6q[(i)].lock) -#define IP6Q_TRYLOCK(i) mtx_trylock(&V_ip6q[(i)].lock) -#define IP6Q_LOCK_ASSERT(i) mtx_assert(&V_ip6q[(i)].lock, MA_OWNED) -#define IP6Q_UNLOCK(i) mtx_unlock(&V_ip6q[(i)].lock) -#define IP6Q_HEAD(i) (&V_ip6q[(i)].ip6q) +/* Maximum per-VNET reassembly queues per bucket and fragments per packet. */ +VNET_DEFINE_STATIC(int, ip6_maxfragbucketsize); +VNET_DEFINE_STATIC(int, ip6_maxfragsperpacket); +#define V_ip6_maxfragbucketsize VNET(ip6_maxfragbucketsize) +#define V_ip6_maxfragsperpacket VNET(ip6_maxfragsperpacket) + +/* Per-VNET reassembly queue buckets. */ +VNET_DEFINE_STATIC(struct ip6qbucket, ip6qb[IP6REASS_NHASH]); +VNET_DEFINE_STATIC(uint32_t, ip6qb_hashseed); +#define V_ip6qb VNET(ip6qb) +#define V_ip6qb_hashseed VNET(ip6qb_hashseed) -static MALLOC_DEFINE(M_FTABLE, "fragment", "fragment reassembly header"); +#define IP6QB_LOCK(_b) mtx_lock(&V_ip6qb[(_b)].lock) +#define IP6QB_TRYLOCK(_b) mtx_trylock(&V_ip6qb[(_b)].lock) +#define IP6QB_LOCK_ASSERT(_b) mtx_assert(&V_ip6qb[(_b)].lock, MA_OWNED) +#define IP6QB_UNLOCK(_b) mtx_unlock(&V_ip6qb[(_b)].lock) +#define IP6QB_HEAD(_b) (&V_ip6qb[(_b)].packets) /* * By default, limit the number of IP6 fragments across all reassembly @@ -124,11 +160,18 @@ static MALLOC_DEFINE(M_FTABLE, "fragment", "fragment reassembly header"); #define IP6_MAXFRAGS (nmbclusters / 32) #define IP6_MAXFRAGPACKETS (imin(IP6_MAXFRAGS, IP6REASS_NHASH * 50)) + /* - * Initialise reassembly queue and fragment identifier. + * Sysctls and helper function. 
*/ -void -frag6_set_bucketsize() +SYSCTL_DECL(_net_inet6_ip6); + +SYSCTL_UINT(_net_inet6_ip6, OID_AUTO, frag6_nfrags, + CTLFLAG_RD, __DEVOLATILE(u_int *, &frag6_nfrags), 0, + "Global number of IPv6 fragments across all reassembly queues."); + +static void +frag6_set_bucketsize(void) { int i; @@ -136,68 +179,180 @@ frag6_set_bucketsize() V_ip6_maxfragbucketsize = imax(i / (IP6REASS_NHASH / 2), 1); } +SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGS, maxfrags, + CTLFLAG_RW, &ip6_maxfrags, 0, + "Maximum allowed number of outstanding IPv6 packet fragments. " + "A value of 0 means no fragmented packets will be accepted, while a " + "a value of -1 means no limit"); + +static int +sysctl_ip6_maxfragpackets(SYSCTL_HANDLER_ARGS) +{ + int error, val; + + val = V_ip6_maxfragpackets; + error = sysctl_handle_int(oidp, &val, 0, req); + if (error != 0 || !req->newptr) + return (error); + V_ip6_maxfragpackets = val; + frag6_set_bucketsize(); + return (0); +} +SYSCTL_PROC(_net_inet6_ip6, IPV6CTL_MAXFRAGPACKETS, maxfragpackets, + CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, NULL, 0, + sysctl_ip6_maxfragpackets, "I", + "Default maximum number of outstanding fragmented IPv6 packets. " + "A value of 0 means no fragmented packets will be accepted, while a " + "a value of -1 means no limit"); +SYSCTL_UINT(_net_inet6_ip6, OID_AUTO, frag6_nfragpackets, + CTLFLAG_VNET | CTLFLAG_RD, + __DEVOLATILE(u_int *, &VNET_NAME(frag6_nfragpackets)), 0, + "Per-VNET number of IPv6 fragments across all reassembly queues."); +SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGSPERPACKET, maxfragsperpacket, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_maxfragsperpacket), 0, + "Maximum allowed number of fragments per packet"); +SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGBUCKETSIZE, maxfragbucketsize, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_maxfragbucketsize), 0, + "Maximum number of reassembly queues per hash bucket"); + + +/* + * Remove the IPv6 fragmentation header from the mbuf. 
+ */ +int +ip6_deletefraghdr(struct mbuf *m, int offset, int wait __unused) +{ + struct ip6_hdr *ip6; + + KASSERT(m->m_len >= offset + sizeof(struct ip6_frag), + ("%s: ext headers not contigous in mbuf %p m_len %d >= " + "offset %d + %zu\n", __func__, m, m->m_len, offset, + sizeof(struct ip6_frag))); + + /* Delete frag6 header. */ + ip6 = mtod(m, struct ip6_hdr *); + bcopy(ip6, (char *)ip6 + sizeof(struct ip6_frag), offset); + m->m_data += sizeof(struct ip6_frag); + m->m_len -= sizeof(struct ip6_frag); + m->m_flags |= M_FRAGMENTED; + + return (0); +} + +/* + * Free a fragment reassembly header and all associated datagrams. + */ static void -frag6_change(void *tag) +frag6_freef(struct ip6q *q6, uint32_t bucket) { - VNET_ITERATOR_DECL(vnet_iter); + struct ip6_hdr *ip6; + struct ip6asfrag *af6; + struct mbuf *m; - ip6_maxfrags = IP6_MAXFRAGS; - VNET_LIST_RLOCK_NOSLEEP(); - VNET_FOREACH(vnet_iter) { - CURVNET_SET(vnet_iter); - V_ip6_maxfragpackets = IP6_MAXFRAGPACKETS; - frag6_set_bucketsize(); - CURVNET_RESTORE(); + IP6QB_LOCK_ASSERT(bucket); + + while ((af6 = TAILQ_FIRST(&q6->ip6q_frags)) != NULL) { + + m = af6->ip6af_m; + TAILQ_REMOVE(&q6->ip6q_frags, af6, ip6af_tq); + + /* + * Return ICMP time exceeded error for the 1st fragment. + * Just free other fragments. + */ + if (af6->ip6af_off == 0 && m->m_pkthdr.rcvif != NULL) { + + /* Adjust pointer. */ + ip6 = mtod(m, struct ip6_hdr *); + + /* Restore source and destination addresses. 
*/ + ip6->ip6_src = q6->ip6q_src; + ip6->ip6_dst = q6->ip6q_dst; + + icmp6_error(m, ICMP6_TIME_EXCEEDED, + ICMP6_TIME_EXCEED_REASSEMBLY, 0); + } else + m_freem(m); + + free(af6, M_FRAG6); } - VNET_LIST_RUNLOCK_NOSLEEP(); + + TAILQ_REMOVE(IP6QB_HEAD(bucket), q6, ip6q_tq); + V_ip6qb[bucket].count--; + atomic_subtract_int(&frag6_nfrags, q6->ip6q_nfrag); +#ifdef MAC + mac_ip6q_destroy(q6); +#endif + free(q6, M_FRAG6); + atomic_subtract_int(&V_frag6_nfragpackets, 1); } -void -frag6_init(void) +/* + * Drain off all datagram fragments belonging to + * the given network interface. + */ +static void +frag6_cleanup(void *arg __unused, struct ifnet *ifp) { + struct ip6qhead *head; struct ip6q *q6; - int i; + struct ip6asfrag *af6; + uint32_t bucket; - V_ip6_maxfragpackets = IP6_MAXFRAGPACKETS; - frag6_set_bucketsize(); - for (i = 0; i < IP6REASS_NHASH; i++) { - q6 = IP6Q_HEAD(i); - q6->ip6q_next = q6->ip6q_prev = q6; - mtx_init(&V_ip6q[i].lock, "ip6qlock", NULL, MTX_DEF); - V_ip6q[i].count = 0; - } - V_ip6q_hashseed = arc4random(); - V_ip6_maxfragsperpacket = 64; - if (!IS_DEFAULT_VNET(curvnet)) + KASSERT(ifp != NULL, ("%s: ifp is NULL", __func__)); + + CURVNET_SET_QUIET(ifp->if_vnet); +#ifdef VIMAGE + /* + * Skip processing if IPv6 reassembly is not initialised or + * torn down by frag6_destroy(). + */ + if (!V_frag6_on) { + CURVNET_RESTORE(); return; + } +#endif - ip6_maxfrags = IP6_MAXFRAGS; - EVENTHANDLER_REGISTER(nmbclusters_change, - frag6_change, NULL, EVENTHANDLER_PRI_ANY); + for (bucket = 0; bucket < IP6REASS_NHASH; bucket++) { + IP6QB_LOCK(bucket); + head = IP6QB_HEAD(bucket); + /* Scan fragment list. */ + TAILQ_FOREACH(q6, head, ip6q_tq) { + TAILQ_FOREACH(af6, &q6->ip6q_frags, ip6af_tq) { + + /* Clear no longer valid rcvif pointer. 
*/ + if (af6->ip6af_m->m_pkthdr.rcvif == ifp) + af6->ip6af_m->m_pkthdr.rcvif = NULL; + } + } + IP6QB_UNLOCK(bucket); + } + CURVNET_RESTORE(); } +EVENTHANDLER_DEFINE(ifnet_departure_event, frag6_cleanup, NULL, 0); /* - * In RFC2460, fragment and reassembly rule do not agree with each other, - * in terms of next header field handling in fragment header. + * Like in RFC2460, in RFC8200, fragment and reassembly rules do not agree with + * each other, in terms of next header field handling in fragment header. * While the sender will use the same value for all of the fragmented packets, - * receiver is suggested not to check the consistency. + * receiver is suggested not to check for consistency. * - * fragment rule (p20): - * (2) A Fragment header containing: - * The Next Header value that identifies the first header of - * the Fragmentable Part of the original packet. + * Fragment rules (p18,p19): + * (2) A Fragment header containing: + * The Next Header value that identifies the first header + * after the Per-Fragment headers of the original packet. * -> next header field is same for all fragments * - * reassembly rule (p21): - * The Next Header field of the last header of the Unfragmentable - * Part is obtained from the Next Header field of the first + * Reassembly rule (p20): + * The Next Header field of the last header of the Per-Fragment + * headers is obtained from the Next Header field of the first * fragment's Fragment header. * -> should grab it from the first fragment only * * The following note also contradicts with fragment rule - no one is going to * send different fragment with different next header field. * - * additional note (p22): + * Additional note (p22) [not an error]: * The Next Header values in the Fragment headers of different * fragments of the same original packet may differ. Only the value * from the Offset zero fragment packet is used for reassembly. @@ -206,91 +361,111 @@ frag6_init(void) * There is no explicit reason given in the RFC. 
Historical reason maybe? */ /* - * Fragment input + * Fragment input. */ int frag6_input(struct mbuf **mp, int *offp, int proto) { - struct mbuf *m = *mp, *t; + struct mbuf *m, *t; struct ip6_hdr *ip6; struct ip6_frag *ip6f; - struct ip6q *head, *q6; - struct ip6asfrag *af6, *ip6af, *af6dwn; - struct in6_ifaddr *ia; - int offset = *offp, nxt, i, next; - int first_frag = 0; - int fragoff, frgpartlen; /* must be larger than u_int16_t */ + struct ip6qhead *head; + struct ip6q *q6; + struct ip6asfrag *af6, *ip6af, *af6tmp; + struct in6_ifaddr *ia6; + struct ifnet *dstifp, *srcifp; uint32_t hashkey[(sizeof(struct in6_addr) * 2 + sizeof(ip6f->ip6f_ident)) / sizeof(uint32_t)]; - uint32_t hash, *hashkeyp; - struct ifnet *dstifp; - u_int8_t ecn, ecn0; + uint32_t bucket, *hashkeyp; + int fragoff, frgpartlen; /* Must be larger than uint16_t. */ + int nxt, offset, plen; + uint8_t ecn, ecn0; + bool only_frag; #ifdef RSS - struct m_tag *mtag; struct ip6_direct_ctx *ip6dc; + struct m_tag *mtag; #endif -#if 0 - char ip6buf[INET6_ADDRSTRLEN]; -#endif + m = *mp; + offset = *offp; + + M_ASSERTPKTHDR(m); + if (m->m_len < offset + sizeof(struct ip6_frag)) { + m = m_pullup(m, offset + sizeof(struct ip6_frag)); + if (m == NULL) { + IP6STAT_INC(ip6s_exthdrtoolong); + *mp = NULL; + return (IPPROTO_DONE); + } + } ip6 = mtod(m, struct ip6_hdr *); -#ifndef PULLDOWN_TEST - IP6_EXTHDR_CHECK(m, offset, sizeof(struct ip6_frag), IPPROTO_DONE); - ip6f = (struct ip6_frag *)((caddr_t)ip6 + offset); -#else - IP6_EXTHDR_GET(ip6f, struct ip6_frag *, m, offset, sizeof(*ip6f)); - if (ip6f == NULL) - return (IPPROTO_DONE); -#endif dstifp = NULL; - /* find the destination interface of the packet. */ - ia = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */); - if (ia != NULL) { - dstifp = ia->ia_ifp; - ifa_free(&ia->ia_ifa); + /* Find the destination interface of the packet. 
*/ + ia6 = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */); + if (ia6 != NULL) { + dstifp = ia6->ia_ifp; + ifa_free(&ia6->ia_ifa); } - /* jumbo payload can't contain a fragment header */ + + /* Jumbo payload cannot contain a fragment header. */ if (ip6->ip6_plen == 0) { icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, offset); in6_ifstat_inc(dstifp, ifs6_reass_fail); - return IPPROTO_DONE; + *mp = NULL; + return (IPPROTO_DONE); } /* - * check whether fragment packet's fragment length is - * multiple of 8 octets. + * Check whether fragment packet's fragment length is a + * multiple of 8 octets (unless it is the last one). * sizeof(struct ip6_frag) == 8 * sizeof(struct ip6_hdr) = 40 */ + ip6f = (struct ip6_frag *)((caddr_t)ip6 + offset); if ((ip6f->ip6f_offlg & IP6F_MORE_FRAG) && (((ntohs(ip6->ip6_plen) - offset) & 0x7) != 0)) { icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, offsetof(struct ip6_hdr, ip6_plen)); in6_ifstat_inc(dstifp, ifs6_reass_fail); - return IPPROTO_DONE; + *mp = NULL; + return (IPPROTO_DONE); } IP6STAT_INC(ip6s_fragments); in6_ifstat_inc(dstifp, ifs6_reass_reqd); - /* offset now points to data portion */ - offset += sizeof(struct ip6_frag); - /* - * RFC 6946: Handle "atomic" fragments (offset and m bit set to 0) - * upfront, unrelated to any reassembly. Just skip the fragment header. + * Handle "atomic" fragments (offset and m bit set to 0) upfront, + * unrelated to any reassembly. We need to remove the frag hdr + * which is ugly. + * See RFC 6946 and section 4.5 of RFC 8200. */ if ((ip6f->ip6f_offlg & ~IP6F_RESERVED_MASK) == 0) { - /* XXX-BZ we want dedicated counters for this. */ - IP6STAT_INC(ip6s_reassembled); + IP6STAT_INC(ip6s_atomicfrags); + nxt = ip6f->ip6f_nxt; + /* + * Set nxt(-hdr field value) to the original value. + * We cannot just set ip6->ip6_nxt as there might be + * an unfragmentable part with extension headers and + * we must update the last one. 
+ */ + m_copyback(m, ip6_get_prevhdr(m, offset), sizeof(uint8_t), + (caddr_t)&nxt); + ip6->ip6_plen = htons(ntohs(ip6->ip6_plen) - + sizeof(struct ip6_frag)); + if (ip6_deletefraghdr(m, offset, M_NOWAIT) != 0) + goto dropfrag2; + m->m_pkthdr.len -= sizeof(struct ip6_frag); in6_ifstat_inc(dstifp, ifs6_reass_ok); - *offp = offset; - m->m_flags |= M_FRAGMENTED; - return (ip6f->ip6f_nxt); + *mp = m; + return (nxt); } + /* Offset now points to data portion. */ + offset += sizeof(struct ip6_frag); + /* Get fragment length and discard 0-byte fragments. */ frgpartlen = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen) - offset; if (frgpartlen == 0) { @@ -298,31 +473,48 @@ frag6_input(struct mbuf **mp, int *offp, int proto) offsetof(struct ip6_hdr, ip6_plen)); in6_ifstat_inc(dstifp, ifs6_reass_fail); IP6STAT_INC(ip6s_fragdropped); - return IPPROTO_DONE; + *mp = NULL; + return (IPPROTO_DONE); } - hashkeyp = hashkey; - memcpy(hashkeyp, &ip6->ip6_src, sizeof(struct in6_addr)); - hashkeyp += sizeof(struct in6_addr) / sizeof(*hashkeyp); - memcpy(hashkeyp, &ip6->ip6_dst, sizeof(struct in6_addr)); - hashkeyp += sizeof(struct in6_addr) / sizeof(*hashkeyp); - *hashkeyp = ip6f->ip6f_ident; - hash = jenkins_hash32(hashkey, nitems(hashkey), V_ip6q_hashseed); - hash &= IP6REASS_HMASK; - head = IP6Q_HEAD(hash); - IP6Q_LOCK(hash); - /* - * Enforce upper bound on number of fragments. + * Enforce upper bound on number of fragments for the entire system. * If maxfrag is 0, never accept fragments. * If maxfrag is -1, accept all fragments without limitation. */ if (ip6_maxfrags < 0) ; else if (atomic_load_int(&frag6_nfrags) >= (u_int)ip6_maxfrags) - goto dropfrag; + goto dropfrag2; + + /* + * Validate that a full header chain to the ULP is present in the + * packet containing the first fragment as per RFC RFC7112 and + * RFC 8200 pages 18,19: + * The first fragment packet is composed of: + * (3) Extension headers, if any, and the Upper-Layer header. These + * headers must be in the first fragment. 
... + */ + fragoff = ntohs(ip6f->ip6f_offlg & IP6F_OFF_MASK); + /* XXX TODO. thj has D16851 open for this. */ + /* Send ICMPv6 4,3 in case of violation. */ - for (q6 = head->ip6q_next; q6 != head; q6 = q6->ip6q_next) + /* Store receive network interface pointer for later. */ + srcifp = m->m_pkthdr.rcvif; + + /* Generate a hash value for fragment bucket selection. */ + hashkeyp = hashkey; + memcpy(hashkeyp, &ip6->ip6_src, sizeof(struct in6_addr)); + hashkeyp += sizeof(struct in6_addr) / sizeof(*hashkeyp); + memcpy(hashkeyp, &ip6->ip6_dst, sizeof(struct in6_addr)); + hashkeyp += sizeof(struct in6_addr) / sizeof(*hashkeyp); + *hashkeyp = ip6f->ip6f_ident; + bucket = jenkins_hash32(hashkey, nitems(hashkey), V_ip6qb_hashseed); + bucket &= IP6REASS_HMASK; + IP6QB_LOCK(bucket); + head = IP6QB_HEAD(bucket); + + TAILQ_FOREACH(q6, head, ip6q_tq) if (ip6f->ip6f_ident == q6->ip6q_ident && IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, &q6->ip6q_src) && IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &q6->ip6q_dst) @@ -332,11 +524,11 @@ frag6_input(struct mbuf **mp, int *offp, int proto) ) break; - if (q6 == head) { - /* - * the first fragment to arrive, create a reassembly queue. - */ - first_frag = 1; + only_frag = false; + if (q6 == NULL) { + + /* A first fragment to arrive creates a reassembly queue. */ + only_frag = true; /* * Enforce upper bound on number of fragmented packets @@ -347,30 +539,27 @@ frag6_input(struct mbuf **mp, int *offp, int proto) */ if (V_ip6_maxfragpackets < 0) ; - else if (V_ip6q[hash].count >= V_ip6_maxfragbucketsize || + else if (V_ip6qb[bucket].count >= V_ip6_maxfragbucketsize || atomic_load_int(&V_frag6_nfragpackets) >= (u_int)V_ip6_maxfragpackets) goto dropfrag; - atomic_add_int(&V_frag6_nfragpackets, 1); - q6 = (struct ip6q *)malloc(sizeof(struct ip6q), M_FTABLE, - M_NOWAIT); + + /* Allocate IPv6 fragement packet queue entry. 
*/ + q6 = (struct ip6q *)malloc(sizeof(struct ip6q), M_FRAG6, + M_NOWAIT | M_ZERO); if (q6 == NULL) goto dropfrag; - bzero(q6, sizeof(*q6)); #ifdef MAC if (mac_ip6q_init(q6, M_NOWAIT) != 0) { - free(q6, M_FTABLE); + free(q6, M_FRAG6); goto dropfrag; } mac_ip6q_create(m, q6); #endif - frag6_insque_head(q6, head, hash); + atomic_add_int(&V_frag6_nfragpackets, 1); - /* ip6q_nxt will be filled afterwards, from 1st fragment */ - q6->ip6q_down = q6->ip6q_up = (struct ip6asfrag *)q6; -#ifdef notyet - q6->ip6q_nxtp = (u_char *)nxtp; -#endif + /* ip6q_nxt will be filled afterwards, from 1st fragment. */ + TAILQ_INIT(&q6->ip6q_frags); q6->ip6q_ident = ip6f->ip6f_ident; q6->ip6q_ttl = IPV6_FRAGTTL; q6->ip6q_src = ip6->ip6_src; @@ -379,18 +568,24 @@ frag6_input(struct mbuf **mp, int *offp, int proto) (ntohl(ip6->ip6_flow) >> 20) & IPTOS_ECN_MASK; q6->ip6q_unfrglen = -1; /* The 1st fragment has not arrived. */ - q6->ip6q_nfrag = 0; + /* Add the fragemented packet to the bucket. */ + TAILQ_INSERT_HEAD(head, q6, ip6q_tq); + V_ip6qb[bucket].count++; } /* - * If it's the 1st fragment, record the length of the + * If it is the 1st fragment, record the length of the * unfragmentable part and the next header of the fragment header. + * Assume the first 1st fragement to arrive will be correct. + * We do not have any duplicate checks here yet so another packet + * with fragoff == 0 could come and overwrite the ip6q_unfrglen + * and worse, the next header, at any time. */ - fragoff = ntohs(ip6f->ip6f_offlg & IP6F_OFF_MASK); - if (fragoff == 0) { + if (fragoff == 0 && q6->ip6q_unfrglen == -1) { q6->ip6q_unfrglen = offset - sizeof(struct ip6_hdr) - sizeof(struct ip6_frag); q6->ip6q_nxt = ip6f->ip6f_nxt; + /* XXX ECN? */ } /* @@ -401,39 +596,66 @@ frag6_input(struct mbuf **mp, int *offp, int proto) if (q6->ip6q_unfrglen >= 0) { /* The 1st fragment has already arrived. 
*/ if (q6->ip6q_unfrglen + fragoff + frgpartlen > IPV6_MAXPACKET) { + if (only_frag) { + TAILQ_REMOVE(head, q6, ip6q_tq); + V_ip6qb[bucket].count--; + atomic_subtract_int(&V_frag6_nfragpackets, 1); +#ifdef MAC + mac_ip6q_destroy(q6); +#endif + free(q6, M_FRAG6); + } + IP6QB_UNLOCK(bucket); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, offset - sizeof(struct ip6_frag) + offsetof(struct ip6_frag, ip6f_offlg)); - IP6Q_UNLOCK(hash); + *mp = NULL; return (IPPROTO_DONE); } } else if (fragoff + frgpartlen > IPV6_MAXPACKET) { + if (only_frag) { + TAILQ_REMOVE(head, q6, ip6q_tq); + V_ip6qb[bucket].count--; + atomic_subtract_int(&V_frag6_nfragpackets, 1); +#ifdef MAC + mac_ip6q_destroy(q6); +#endif + free(q6, M_FRAG6); + } + IP6QB_UNLOCK(bucket); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, offset - sizeof(struct ip6_frag) + offsetof(struct ip6_frag, ip6f_offlg)); - IP6Q_UNLOCK(hash); + *mp = NULL; return (IPPROTO_DONE); } + /* - * If it's the first fragment, do the above check for each + * If it is the first fragment, do the above check for each * fragment already stored in the reassembly queue. */ - if (fragoff == 0) { - for (af6 = q6->ip6q_down; af6 != (struct ip6asfrag *)q6; - af6 = af6dwn) { - af6dwn = af6->ip6af_down; - - if (q6->ip6q_unfrglen + af6->ip6af_off + af6->ip6af_frglen > - IPV6_MAXPACKET) { - struct mbuf *merr = IP6_REASS_MBUF(af6); + if (fragoff == 0 && !only_frag) { + TAILQ_FOREACH_SAFE(af6, &q6->ip6q_frags, ip6af_tq, af6tmp) { + + if (q6->ip6q_unfrglen + af6->ip6af_off + + af6->ip6af_frglen > IPV6_MAXPACKET) { struct ip6_hdr *ip6err; - int erroff = af6->ip6af_offset; + struct mbuf *merr; + int erroff; + + merr = af6->ip6af_m; + erroff = af6->ip6af_offset; - /* dequeue the fragment. */ - frag6_deq(af6, hash); - free(af6, M_FTABLE); + /* Dequeue the fragment. */ + TAILQ_REMOVE(&q6->ip6q_frags, af6, ip6af_tq); + q6->ip6q_nfrag--; + atomic_subtract_int(&frag6_nfrags, 1); + free(af6, M_FRAG6); - /* adjust pointer. 
*/ + /* Set a valid receive interface pointer. */ + merr->m_pkthdr.rcvif = srcifp; + + /* Adjust pointer. */ ip6err = mtod(merr, struct ip6_hdr *); /* @@ -451,239 +673,182 @@ frag6_input(struct mbuf **mp, int *offp, int proto) } } - ip6af = (struct ip6asfrag *)malloc(sizeof(struct ip6asfrag), M_FTABLE, - M_NOWAIT); + /* Allocate an IPv6 fragement queue entry for this fragmented part. */ + ip6af = (struct ip6asfrag *)malloc(sizeof(struct ip6asfrag), M_FRAG6, + M_NOWAIT | M_ZERO); if (ip6af == NULL) goto dropfrag; - bzero(ip6af, sizeof(*ip6af)); - ip6af->ip6af_mff = ip6f->ip6f_offlg & IP6F_MORE_FRAG; + ip6af->ip6af_mff = (ip6f->ip6f_offlg & IP6F_MORE_FRAG) ? true : false; ip6af->ip6af_off = fragoff; ip6af->ip6af_frglen = frgpartlen; ip6af->ip6af_offset = offset; - IP6_REASS_MBUF(ip6af) = m; + ip6af->ip6af_m = m; - if (first_frag) { - af6 = (struct ip6asfrag *)q6; - goto insert; + if (only_frag) { + /* + * Do a manual insert rather than a hard-to-understand cast + * to a different type relying on data structure order to work. + */ + TAILQ_INSERT_HEAD(&q6->ip6q_frags, ip6af, ip6af_tq); + goto postinsert; } + /* Do duplicate, condition, and boundry checks. */ /* * Handle ECN by comparing this segment with the first one; * if CE is set, do not lose CE. - * drop if CE and not-ECT are mixed for the same packet. + * Drop if CE and not-ECT are mixed for the same packet. */ ecn = (ntohl(ip6->ip6_flow) >> 20) & IPTOS_ECN_MASK; ecn0 = q6->ip6q_ecn; if (ecn == IPTOS_ECN_CE) { if (ecn0 == IPTOS_ECN_NOTECT) { - free(ip6af, M_FTABLE); + free(ip6af, M_FRAG6); goto dropfrag; } if (ecn0 != IPTOS_ECN_CE) q6->ip6q_ecn = IPTOS_ECN_CE; } if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT) { - free(ip6af, M_FTABLE); + free(ip6af, M_FRAG6); goto dropfrag; } - /* - * Find a segment which begins after this one does. - */ - for (af6 = q6->ip6q_down; af6 != (struct ip6asfrag *)q6; - af6 = af6->ip6af_down) + /* Find a fragmented part which begins after this one does. 
*/ + TAILQ_FOREACH(af6, &q6->ip6q_frags, ip6af_tq) if (af6->ip6af_off > ip6af->ip6af_off) break; -#if 0 - /* - * If there is a preceding segment, it may provide some of - * our data already. If so, drop the data from the incoming - * segment. If it provides all of our data, drop us. - */ - if (af6->ip6af_up != (struct ip6asfrag *)q6) { - i = af6->ip6af_up->ip6af_off + af6->ip6af_up->ip6af_frglen - - ip6af->ip6af_off; - if (i > 0) { - if (i >= ip6af->ip6af_frglen) - goto dropfrag; - m_adj(IP6_REASS_MBUF(ip6af), i); - ip6af->ip6af_off += i; - ip6af->ip6af_frglen -= i; - } - } - - /* - * While we overlap succeeding segments trim them or, - * if they are completely covered, dequeue them. - */ - while (af6 != (struct ip6asfrag *)q6 && - ip6af->ip6af_off + ip6af->ip6af_frglen > af6->ip6af_off) { - i = (ip6af->ip6af_off + ip6af->ip6af_frglen) - af6->ip6af_off; - if (i < af6->ip6af_frglen) { - af6->ip6af_frglen -= i; - af6->ip6af_off += i; - m_adj(IP6_REASS_MBUF(af6), i); - break; - } - af6 = af6->ip6af_down; - m_freem(IP6_REASS_MBUF(af6->ip6af_up)); - frag6_deq(af6->ip6af_up, hash); - } -#else /* * If the incoming framgent overlaps some existing fragments in - * the reassembly queue, drop it, since it is dangerous to override - * existing fragments from a security point of view. - * We don't know which fragment is the bad guy - here we trust - * fragment that came in earlier, with no real reason. - * - * Note: due to changes after disabling this part, mbuf passed to - * m_adj() below now does not meet the requirement. + * the reassembly queue, drop both the new fragment and the + * entire reassembly queue. However, if the new fragment + * is an exact duplicate of an existing fragment, only silently + * drop the existing fragment and leave the fragmentation queue + * unchanged, as allowed by the RFC. 
(RFC 8200, 4.5) */ - if (af6->ip6af_up != (struct ip6asfrag *)q6) { - i = af6->ip6af_up->ip6af_off + af6->ip6af_up->ip6af_frglen - - ip6af->ip6af_off; - if (i > 0) { -#if 0 /* suppress the noisy log */ - log(LOG_ERR, "%d bytes of a fragment from %s " - "overlaps the previous fragment\n", - i, ip6_sprintf(ip6buf, &q6->ip6q_src)); -#endif - free(ip6af, M_FTABLE); + if (af6 != NULL) + af6tmp = TAILQ_PREV(af6, ip6fraghead, ip6af_tq); + else + af6tmp = TAILQ_LAST(&q6->ip6q_frags, ip6fraghead); + if (af6tmp != NULL) { + if (af6tmp->ip6af_off + af6tmp->ip6af_frglen - + ip6af->ip6af_off > 0) { + if (af6tmp->ip6af_off != ip6af->ip6af_off || + af6tmp->ip6af_frglen != ip6af->ip6af_frglen) + frag6_freef(q6, bucket); + free(ip6af, M_FRAG6); goto dropfrag; } } - if (af6 != (struct ip6asfrag *)q6) { - i = (ip6af->ip6af_off + ip6af->ip6af_frglen) - af6->ip6af_off; - if (i > 0) { -#if 0 /* suppress the noisy log */ - log(LOG_ERR, "%d bytes of a fragment from %s " - "overlaps the succeeding fragment", - i, ip6_sprintf(ip6buf, &q6->ip6q_src)); -#endif - free(ip6af, M_FTABLE); + if (af6 != NULL) { + if (ip6af->ip6af_off + ip6af->ip6af_frglen - + af6->ip6af_off > 0) { + if (af6->ip6af_off != ip6af->ip6af_off || + af6->ip6af_frglen != ip6af->ip6af_frglen) + frag6_freef(q6, bucket); + free(ip6af, M_FRAG6); goto dropfrag; } } -#endif -insert: #ifdef MAC - if (!first_frag) - mac_ip6q_update(m, q6); + mac_ip6q_update(m, q6); #endif /* - * Stick new segment in its place; - * check for complete reassembly. - * If not complete, check fragment limit. - * Move to front of packet queue, as we are - * the most recently active fragmented packet. + * Stick new segment in its place; check for complete reassembly. + * If not complete, check fragment limit. Move to front of packet + * queue, as we are the most recently active fragmented packet. 
*/ - frag6_enq(ip6af, af6->ip6af_up, hash); + if (af6 != NULL) + TAILQ_INSERT_BEFORE(af6, ip6af, ip6af_tq); + else + TAILQ_INSERT_TAIL(&q6->ip6q_frags, ip6af, ip6af_tq); +postinsert: atomic_add_int(&frag6_nfrags, 1); q6->ip6q_nfrag++; -#if 0 /* xxx */ - if (q6 != head->ip6q_next) { - frag6_remque(q6, hash); - frag6_insque_head(q6, head, hash); - } -#endif - next = 0; - for (af6 = q6->ip6q_down; af6 != (struct ip6asfrag *)q6; - af6 = af6->ip6af_down) { - if (af6->ip6af_off != next) { + + plen = 0; + TAILQ_FOREACH(af6, &q6->ip6q_frags, ip6af_tq) { + if (af6->ip6af_off != plen) { if (q6->ip6q_nfrag > V_ip6_maxfragsperpacket) { - IP6STAT_INC(ip6s_fragdropped); - frag6_freef(q6, hash); + IP6STAT_ADD(ip6s_fragdropped, q6->ip6q_nfrag); + frag6_freef(q6, bucket); } - IP6Q_UNLOCK(hash); - return IPPROTO_DONE; + IP6QB_UNLOCK(bucket); + *mp = NULL; + return (IPPROTO_DONE); } - next += af6->ip6af_frglen; + plen += af6->ip6af_frglen; } - if (af6->ip6af_up->ip6af_mff) { + af6 = TAILQ_LAST(&q6->ip6q_frags, ip6fraghead); + if (af6->ip6af_mff) { if (q6->ip6q_nfrag > V_ip6_maxfragsperpacket) { - IP6STAT_INC(ip6s_fragdropped); - frag6_freef(q6, hash); + IP6STAT_ADD(ip6s_fragdropped, q6->ip6q_nfrag); + frag6_freef(q6, bucket); } - IP6Q_UNLOCK(hash); - return IPPROTO_DONE; + IP6QB_UNLOCK(bucket); + *mp = NULL; + return (IPPROTO_DONE); } - /* - * Reassembly is complete; concatenate fragments. - */ - ip6af = q6->ip6q_down; - t = m = IP6_REASS_MBUF(ip6af); - af6 = ip6af->ip6af_down; - frag6_deq(ip6af, hash); - while (af6 != (struct ip6asfrag *)q6) { + /* Reassembly is complete; concatenate fragments. 
*/ + ip6af = TAILQ_FIRST(&q6->ip6q_frags); + t = m = ip6af->ip6af_m; + TAILQ_REMOVE(&q6->ip6q_frags, ip6af, ip6af_tq); + while ((af6 = TAILQ_FIRST(&q6->ip6q_frags)) != NULL) { m->m_pkthdr.csum_flags &= - IP6_REASS_MBUF(af6)->m_pkthdr.csum_flags; + af6->ip6af_m->m_pkthdr.csum_flags; m->m_pkthdr.csum_data += - IP6_REASS_MBUF(af6)->m_pkthdr.csum_data; - - af6dwn = af6->ip6af_down; - frag6_deq(af6, hash); - while (t->m_next) - t = t->m_next; - m_adj(IP6_REASS_MBUF(af6), af6->ip6af_offset); - m_demote_pkthdr(IP6_REASS_MBUF(af6)); - m_cat(t, IP6_REASS_MBUF(af6)); - free(af6, M_FTABLE); - af6 = af6dwn; + af6->ip6af_m->m_pkthdr.csum_data; + + TAILQ_REMOVE(&q6->ip6q_frags, af6, ip6af_tq); + t = m_last(t); + m_adj(af6->ip6af_m, af6->ip6af_offset); + m_demote_pkthdr(af6->ip6af_m); + m_cat(t, af6->ip6af_m); + free(af6, M_FRAG6); } while (m->m_pkthdr.csum_data & 0xffff0000) m->m_pkthdr.csum_data = (m->m_pkthdr.csum_data & 0xffff) + (m->m_pkthdr.csum_data >> 16); - /* adjust offset to point where the original next header starts */ + /* Adjust offset to point where the original next header starts. 
*/ offset = ip6af->ip6af_offset - sizeof(struct ip6_frag); - free(ip6af, M_FTABLE); + free(ip6af, M_FRAG6); ip6 = mtod(m, struct ip6_hdr *); - ip6->ip6_plen = htons((u_short)next + offset - sizeof(struct ip6_hdr)); + ip6->ip6_plen = htons((u_short)plen + offset - sizeof(struct ip6_hdr)); if (q6->ip6q_ecn == IPTOS_ECN_CE) ip6->ip6_flow |= htonl(IPTOS_ECN_CE << 20); nxt = q6->ip6q_nxt; -#ifdef notyet - *q6->ip6q_nxtp = (u_char)(nxt & 0xff); -#endif - if (ip6_deletefraghdr(m, offset, M_NOWAIT) != 0) { - frag6_remque(q6, hash); - atomic_subtract_int(&frag6_nfrags, q6->ip6q_nfrag); -#ifdef MAC - mac_ip6q_destroy(q6); -#endif - free(q6, M_FTABLE); - atomic_subtract_int(&V_frag6_nfragpackets, 1); + TAILQ_REMOVE(head, q6, ip6q_tq); + V_ip6qb[bucket].count--; + atomic_subtract_int(&frag6_nfrags, q6->ip6q_nfrag); - goto dropfrag; - } + ip6_deletefraghdr(m, offset, M_NOWAIT); - /* - * Store NXT to the original. - */ + /* Set nxt(-hdr field value) to the original value. */ m_copyback(m, ip6_get_prevhdr(m, offset), sizeof(uint8_t), (caddr_t)&nxt); - frag6_remque(q6, hash); - atomic_subtract_int(&frag6_nfrags, q6->ip6q_nfrag); #ifdef MAC mac_ip6q_reassemble(q6, m); mac_ip6q_destroy(q6); #endif - free(q6, M_FTABLE); + free(q6, M_FRAG6); atomic_subtract_int(&V_frag6_nfragpackets, 1); if (m->m_flags & M_PKTHDR) { /* Isn't it always true? */ - int plen = 0; + + plen = 0; for (t = m; t; t = t->m_next) plen += t->m_len; m->m_pkthdr.len = plen; + /* Set a valid receive interface pointer. */ + m->m_pkthdr.rcvif = srcifp; } #ifdef RSS @@ -699,211 +864,94 @@ insert: m_tag_prepend(m, mtag); #endif - IP6Q_UNLOCK(hash); + IP6QB_UNLOCK(bucket); IP6STAT_INC(ip6s_reassembled); in6_ifstat_inc(dstifp, ifs6_reass_ok); #ifdef RSS - /* - * Queue/dispatch for reprocessing. - */ + /* Queue/dispatch for reprocessing. 
*/ netisr_dispatch(NETISR_IPV6_DIRECT, m); - return IPPROTO_DONE; + *mp = NULL; + return (IPPROTO_DONE); #endif - /* - * Tell launch routine the next header - */ - + /* Tell launch routine the next header. */ *mp = m; *offp = offset; - return nxt; + return (nxt); - dropfrag: - IP6Q_UNLOCK(hash); +dropfrag: + IP6QB_UNLOCK(bucket); +dropfrag2: in6_ifstat_inc(dstifp, ifs6_reass_fail); IP6STAT_INC(ip6s_fragdropped); m_freem(m); - return IPPROTO_DONE; -} - -/* - * Free a fragment reassembly header and all - * associated datagrams. - */ -static void -frag6_freef(struct ip6q *q6, uint32_t bucket) -{ - struct ip6asfrag *af6, *down6; - - IP6Q_LOCK_ASSERT(bucket); - - for (af6 = q6->ip6q_down; af6 != (struct ip6asfrag *)q6; - af6 = down6) { - struct mbuf *m = IP6_REASS_MBUF(af6); - - down6 = af6->ip6af_down; - frag6_deq(af6, bucket); - - /* - * Return ICMP time exceeded error for the 1st fragment. - * Just free other fragments. - */ - if (af6->ip6af_off == 0) { - struct ip6_hdr *ip6; - - /* adjust pointer */ - ip6 = mtod(m, struct ip6_hdr *); - - /* restore source and destination addresses */ - ip6->ip6_src = q6->ip6q_src; - ip6->ip6_dst = q6->ip6q_dst; - - icmp6_error(m, ICMP6_TIME_EXCEEDED, - ICMP6_TIME_EXCEED_REASSEMBLY, 0); - } else - m_freem(m); - free(af6, M_FTABLE); - } - frag6_remque(q6, bucket); - atomic_subtract_int(&frag6_nfrags, q6->ip6q_nfrag); -#ifdef MAC - mac_ip6q_destroy(q6); -#endif - free(q6, M_FTABLE); - atomic_subtract_int(&V_frag6_nfragpackets, 1); -} - -/* - * Put an ip fragment on a reassembly chain. - * Like insque, but pointers in middle of structure. - */ -static void -frag6_enq(struct ip6asfrag *af6, struct ip6asfrag *up6, - uint32_t bucket __unused) -{ - - IP6Q_LOCK_ASSERT(bucket); - - af6->ip6af_up = up6; - af6->ip6af_down = up6->ip6af_down; - up6->ip6af_down->ip6af_up = af6; - up6->ip6af_down = af6; -} - -/* - * To frag6_enq as remque is to insque. 
- */ -static void -frag6_deq(struct ip6asfrag *af6, uint32_t bucket __unused) -{ - - IP6Q_LOCK_ASSERT(bucket); - - af6->ip6af_up->ip6af_down = af6->ip6af_down; - af6->ip6af_down->ip6af_up = af6->ip6af_up; -} - -static void -frag6_insque_head(struct ip6q *new, struct ip6q *old, uint32_t bucket) -{ - - IP6Q_LOCK_ASSERT(bucket); - KASSERT(IP6Q_HEAD(bucket) == old, - ("%s: attempt to insert at head of wrong bucket" - " (bucket=%u, old=%p)", __func__, bucket, old)); - - new->ip6q_prev = old; - new->ip6q_next = old->ip6q_next; - old->ip6q_next->ip6q_prev= new; - old->ip6q_next = new; - V_ip6q[bucket].count++; -} - -static void -frag6_remque(struct ip6q *p6, uint32_t bucket) -{ - - IP6Q_LOCK_ASSERT(bucket); - - p6->ip6q_prev->ip6q_next = p6->ip6q_next; - p6->ip6q_next->ip6q_prev = p6->ip6q_prev; - V_ip6q[bucket].count--; + *mp = NULL; + return (IPPROTO_DONE); } /* * IPv6 reassembling timer processing; - * if a timer expires on a reassembly - * queue, discard it. + * if a timer expires on a reassembly queue, discard it. */ void frag6_slowtimo(void) { VNET_ITERATOR_DECL(vnet_iter); - struct ip6q *head, *q6; - int i; + struct ip6qhead *head; + struct ip6q *q6, *q6tmp; + uint32_t bucket; VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); - for (i = 0; i < IP6REASS_NHASH; i++) { - IP6Q_LOCK(i); - head = IP6Q_HEAD(i); - q6 = head->ip6q_next; - if (q6 == NULL) { - /* - * XXXJTL: This should never happen. This - * should turn into an assertion. 
- */ - IP6Q_UNLOCK(i); - continue; - } - while (q6 != head) { - --q6->ip6q_ttl; - q6 = q6->ip6q_next; - if (q6->ip6q_prev->ip6q_ttl == 0) { - IP6STAT_INC(ip6s_fragtimeout); + for (bucket = 0; bucket < IP6REASS_NHASH; bucket++) { + IP6QB_LOCK(bucket); + head = IP6QB_HEAD(bucket); + TAILQ_FOREACH_SAFE(q6, head, ip6q_tq, q6tmp) + if (--q6->ip6q_ttl == 0) { + IP6STAT_ADD(ip6s_fragtimeout, + q6->ip6q_nfrag); /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */ - frag6_freef(q6->ip6q_prev, i); + frag6_freef(q6, bucket); } - } /* * If we are over the maximum number of fragments * (due to the limit being lowered), drain off * enough to get down to the new limit. * Note that we drain all reassembly queues if * maxfragpackets is 0 (fragmentation is disabled), - * and don't enforce a limit when maxfragpackets + * and do not enforce a limit when maxfragpackets * is negative. */ while ((V_ip6_maxfragpackets == 0 || (V_ip6_maxfragpackets > 0 && - V_ip6q[i].count > V_ip6_maxfragbucketsize)) && - head->ip6q_prev != head) { - IP6STAT_INC(ip6s_fragoverflow); + V_ip6qb[bucket].count > V_ip6_maxfragbucketsize)) && + (q6 = TAILQ_LAST(head, ip6qhead)) != NULL) { + IP6STAT_ADD(ip6s_fragoverflow, q6->ip6q_nfrag); /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */ - frag6_freef(head->ip6q_prev, i); + frag6_freef(q6, bucket); } - IP6Q_UNLOCK(i); + IP6QB_UNLOCK(bucket); } /* * If we are still over the maximum number of fragmented * packets, drain off enough to get down to the new limit. 
*/ - i = 0; + bucket = 0; while (V_ip6_maxfragpackets >= 0 && atomic_load_int(&V_frag6_nfragpackets) > (u_int)V_ip6_maxfragpackets) { - IP6Q_LOCK(i); - head = IP6Q_HEAD(i); - if (head->ip6q_prev != head) { - IP6STAT_INC(ip6s_fragoverflow); + IP6QB_LOCK(bucket); + q6 = TAILQ_LAST(IP6QB_HEAD(bucket), ip6qhead); + if (q6 != NULL) { + IP6STAT_ADD(ip6s_fragoverflow, q6->ip6q_nfrag); /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */ - frag6_freef(head->ip6q_prev, i); + frag6_freef(q6, bucket); } - IP6Q_UNLOCK(i); - i = (i + 1) % IP6REASS_NHASH; + IP6QB_UNLOCK(bucket); + bucket = (bucket + 1) % IP6REASS_NHASH; } CURVNET_RESTORE(); } @@ -911,55 +959,102 @@ frag6_slowtimo(void) } /* + * Eventhandler to adjust limits in case nmbclusters change. + */ +static void +frag6_change(void *tag) +{ + VNET_ITERATOR_DECL(vnet_iter); + + ip6_maxfrags = IP6_MAXFRAGS; + VNET_LIST_RLOCK_NOSLEEP(); + VNET_FOREACH(vnet_iter) { + CURVNET_SET(vnet_iter); + V_ip6_maxfragpackets = IP6_MAXFRAGPACKETS; + frag6_set_bucketsize(); + CURVNET_RESTORE(); + } + VNET_LIST_RUNLOCK_NOSLEEP(); +} + +/* + * Initialise reassembly queue and fragment identifier. + */ +void +frag6_init(void) +{ + uint32_t bucket; + + V_ip6_maxfragpackets = IP6_MAXFRAGPACKETS; + frag6_set_bucketsize(); + for (bucket = 0; bucket < IP6REASS_NHASH; bucket++) { + TAILQ_INIT(IP6QB_HEAD(bucket)); + mtx_init(&V_ip6qb[bucket].lock, "ip6qb", NULL, MTX_DEF); + V_ip6qb[bucket].count = 0; + } + V_ip6qb_hashseed = arc4random(); + V_ip6_maxfragsperpacket = 64; +#ifdef VIMAGE + V_frag6_on = true; +#endif + if (!IS_DEFAULT_VNET(curvnet)) + return; + + ip6_maxfrags = IP6_MAXFRAGS; + EVENTHANDLER_REGISTER(nmbclusters_change, + frag6_change, NULL, EVENTHANDLER_PRI_ANY); +} + +/* * Drain off all datagram fragments. 
*/ +static void +frag6_drain_one(void) +{ + struct ip6q *q6; + uint32_t bucket; + + for (bucket = 0; bucket < IP6REASS_NHASH; bucket++) { + IP6QB_LOCK(bucket); + while ((q6 = TAILQ_FIRST(IP6QB_HEAD(bucket))) != NULL) { + IP6STAT_INC(ip6s_fragdropped); + /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */ + frag6_freef(q6, bucket); + } + IP6QB_UNLOCK(bucket); + } +} + void frag6_drain(void) { VNET_ITERATOR_DECL(vnet_iter); - struct ip6q *head; - int i; VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); - for (i = 0; i < IP6REASS_NHASH; i++) { - if (IP6Q_TRYLOCK(i) == 0) - continue; - head = IP6Q_HEAD(i); - while (head->ip6q_next != head) { - IP6STAT_INC(ip6s_fragdropped); - /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */ - frag6_freef(head->ip6q_next, i); - } - IP6Q_UNLOCK(i); - } + frag6_drain_one(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } -int -ip6_deletefraghdr(struct mbuf *m, int offset, int wait) +#ifdef VIMAGE +/* + * Clear up IPv6 reassembly structures. + */ +void +frag6_destroy(void) { - struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); - struct mbuf *t; - - /* Delete frag6 header. */ - if (m->m_len >= offset + sizeof(struct ip6_frag)) { - /* This is the only possible case with !PULLDOWN_TEST. */ - bcopy(ip6, (char *)ip6 + sizeof(struct ip6_frag), - offset); - m->m_data += sizeof(struct ip6_frag); - m->m_len -= sizeof(struct ip6_frag); - } else { - /* This comes with no copy if the boundary is on cluster. 
*/ - if ((t = m_split(m, offset, wait)) == NULL) - return (ENOMEM); - m_adj(t, sizeof(struct ip6_frag)); - m_cat(m, t); + uint32_t bucket; + + frag6_drain_one(); + V_frag6_on = false; + for (bucket = 0; bucket < IP6REASS_NHASH; bucket++) { + KASSERT(V_ip6qb[bucket].count == 0, + ("%s: V_ip6qb[%d] (%p) count not 0 (%d)", __func__, + bucket, &V_ip6qb[bucket], V_ip6qb[bucket].count)); + mtx_destroy(&V_ip6qb[bucket].lock); } - - m->m_flags |= M_FRAGMENTED; - return (0); } +#endif diff --git a/freebsd/sys/netinet6/icmp6.c b/freebsd/sys/netinet6/icmp6.c index 6dd25e98..293ff85f 100644 --- a/freebsd/sys/netinet6/icmp6.c +++ b/freebsd/sys/netinet6/icmp6.c @@ -234,16 +234,13 @@ icmp6_error2(struct mbuf *m, int type, int code, int param, if (ifp == NULL) return; -#ifndef PULLDOWN_TEST - IP6_EXTHDR_CHECK(m, 0, sizeof(struct ip6_hdr), ); -#else if (m->m_len < sizeof(struct ip6_hdr)) { m = m_pullup(m, sizeof(struct ip6_hdr)); - if (m == NULL) + if (m == NULL) { + IP6STAT_INC(ip6s_exthdrtoolong); return; + } } -#endif - ip6 = mtod(m, struct ip6_hdr *); if (in6_setscope(&ip6->ip6_src, ifp, NULL) != 0) @@ -278,15 +275,13 @@ icmp6_error(struct mbuf *m, int type, int code, int param) } #endif -#ifndef PULLDOWN_TEST - IP6_EXTHDR_CHECK(m, 0, sizeof(struct ip6_hdr), ); -#else if (m->m_len < sizeof(struct ip6_hdr)) { m = m_pullup(m, sizeof(struct ip6_hdr)); - if (m == NULL) + if (m == NULL) { + IP6STAT_INC(ip6s_exthdrtoolong); return; + } } -#endif oip6 = mtod(m, struct ip6_hdr *); /* @@ -324,17 +319,16 @@ icmp6_error(struct mbuf *m, int type, int code, int param) if (off >= 0 && nxt == IPPROTO_ICMPV6) { struct icmp6_hdr *icp; -#ifndef PULLDOWN_TEST - IP6_EXTHDR_CHECK(m, 0, off + sizeof(struct icmp6_hdr), ); - icp = (struct icmp6_hdr *)(mtod(m, caddr_t) + off); -#else - IP6_EXTHDR_GET(icp, struct icmp6_hdr *, m, off, - sizeof(*icp)); - if (icp == NULL) { - ICMP6STAT_INC(icp6s_tooshort); - return; + if (m->m_len < off + sizeof(struct icmp6_hdr)) { + m = m_pullup(m, off + sizeof(struct 
icmp6_hdr)); + if (m == NULL) { + IP6STAT_INC(ip6s_exthdrtoolong); + return; + } } -#endif + oip6 = mtod(m, struct ip6_hdr *); + icp = (struct icmp6_hdr *)(mtod(m, caddr_t) + off); + if (icp->icmp6_type < ICMP6_ECHO_REQUEST || icp->icmp6_type == ND_REDIRECT) { /* @@ -351,8 +345,6 @@ icmp6_error(struct mbuf *m, int type, int code, int param) /* non-ICMPv6 - send the error */ } - oip6 = mtod(m, struct ip6_hdr *); /* adjust pointer */ - /* Finally, do rate limitation check. */ if (icmp6_ratelimit(&oip6->ip6_src, type, code)) { ICMP6STAT_INC(icp6s_toofreq); @@ -403,35 +395,38 @@ icmp6_error(struct mbuf *m, int type, int code, int param) int icmp6_input(struct mbuf **mp, int *offp, int proto) { - struct mbuf *m = *mp, *n; + struct mbuf *m, *n; struct ifnet *ifp; struct ip6_hdr *ip6, *nip6; struct icmp6_hdr *icmp6, *nicmp6; - int off = *offp; - int icmp6len = m->m_pkthdr.len - *offp; - int code, sum, noff; char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; - int ip6len, error; + int code, error, icmp6len, ip6len, noff, off, sum; - ifp = m->m_pkthdr.rcvif; + m = *mp; + off = *offp; -#ifndef PULLDOWN_TEST - IP6_EXTHDR_CHECK(m, off, sizeof(struct icmp6_hdr), IPPROTO_DONE); - /* m might change if M_LOOP. So, call mtod after this */ -#endif + if (m->m_len < off + sizeof(struct icmp6_hdr)) { + m = m_pullup(m, off + sizeof(struct icmp6_hdr)); + if (m == NULL) { + IP6STAT_INC(ip6s_exthdrtoolong); + *mp = m; + return (IPPROTO_DONE); + } + } /* * Locate icmp6 structure in mbuf, and check * that not corrupted and of at least minimum length */ - ip6 = mtod(m, struct ip6_hdr *); - ip6len = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen); + icmp6len = m->m_pkthdr.len - off; if (icmp6len < sizeof(struct icmp6_hdr)) { ICMP6STAT_INC(icp6s_tooshort); goto freeit; } + ip6 = mtod(m, struct ip6_hdr *); + ifp = m->m_pkthdr.rcvif; /* * Check multicast group membership. * Note: SSM filters are not applied for ICMPv6 traffic. 
@@ -447,20 +442,9 @@ icmp6_input(struct mbuf **mp, int *offp, int proto) } } - /* - * calculate the checksum - */ -#ifndef PULLDOWN_TEST + /* Calculate the checksum. */ icmp6 = (struct icmp6_hdr *)((caddr_t)ip6 + off); -#else - IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6)); - if (icmp6 == NULL) { - ICMP6STAT_INC(icp6s_tooshort); - return IPPROTO_DONE; - } -#endif code = icmp6->icmp6_code; - if ((sum = in6_cksum(m, IPPROTO_ICMPV6, off, icmp6len)) != 0) { nd6log((LOG_ERR, "ICMP6 checksum error(%d|%x) %s\n", @@ -475,6 +459,7 @@ icmp6_input(struct mbuf **mp, int *offp, int proto) if (icmp6->icmp6_type < ICMP6_INFOMSG_MASK) icmp6_ifstat_inc(ifp, ifs6_in_error); + ip6len = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen); switch (icmp6->icmp6_type) { case ICMP6_DST_UNREACH: icmp6_ifstat_inc(ifp, ifs6_in_dstunreach); @@ -587,8 +572,14 @@ icmp6_input(struct mbuf **mp, int *offp, int proto) n->m_pkthdr.len = n0len + (noff - off); n->m_next = n0; } else { - IP6_EXTHDR_GET(nicmp6, struct icmp6_hdr *, n, off, - sizeof(*nicmp6)); + if (n->m_len < off + sizeof(*nicmp6)) { + n = m_pullup(n, off + sizeof(*nicmp6)); + if (n == NULL) { + IP6STAT_INC(ip6s_exthdrtoolong); + break; + } + } + nicmp6 = (struct icmp6_hdr *)(mtod(n, caddr_t) + off); noff = off; } if (n) { @@ -621,8 +612,10 @@ icmp6_input(struct mbuf **mp, int *offp, int proto) */ if ((ip6->ip6_hlim != 1) || (m->m_flags & M_RTALERT_MLD) == 0) goto freeit; - if (mld_input(m, off, icmp6len) != 0) + if (mld_input(&m, off, icmp6len) != 0) { + *mp = NULL; return (IPPROTO_DONE); + } /* m stays. 
*/ break; @@ -641,10 +634,15 @@ icmp6_input(struct mbuf **mp, int *offp, int proto) goto badlen; if (mode == FQDN) { -#ifndef PULLDOWN_TEST - IP6_EXTHDR_CHECK(m, off, sizeof(struct icmp6_nodeinfo), - IPPROTO_DONE); -#endif + if (m->m_len < off + sizeof(struct icmp6_nodeinfo)) { + m = m_pullup(m, off + + sizeof(struct icmp6_nodeinfo)); + if (m == NULL) { + IP6STAT_INC(ip6s_exthdrtoolong); + *mp = m; + return (IPPROTO_DONE); + } + } n = m_copym(m, 0, M_COPYALL, M_NOWAIT); if (n) n = ni6_input(n, off); @@ -734,7 +732,14 @@ icmp6_input(struct mbuf **mp, int *offp, int proto) if (icmp6len < sizeof(struct nd_router_solicit)) goto badlen; if (send_sendso_input_hook != NULL) { - IP6_EXTHDR_CHECK(m, off, icmp6len, IPPROTO_DONE); + if (m->m_len < off + icmp6len) { + m = m_pullup(m, off + icmp6len); + if (m == NULL) { + IP6STAT_INC(ip6s_exthdrtoolong); + *mp = NULL; + return (IPPROTO_DONE); + } + } error = send_sendso_input_hook(m, ifp, SND_IN, ip6len); if (error == 0) { m = NULL; @@ -853,6 +858,7 @@ icmp6_input(struct mbuf **mp, int *offp, int proto) deliver: if (icmp6_notify_error(&m, off, icmp6len, code) != 0) { /* In this case, m should've been freed. 
*/ + *mp = NULL; return (IPPROTO_DONE); } break; @@ -869,38 +875,40 @@ icmp6_input(struct mbuf **mp, int *offp, int proto) /* deliver the packet to appropriate sockets */ icmp6_rip6_input(&m, *offp); - return IPPROTO_DONE; + *mp = m; + return (IPPROTO_DONE); freeit: m_freem(m); - return IPPROTO_DONE; + *mp = NULL; + return (IPPROTO_DONE); } static int icmp6_notify_error(struct mbuf **mp, int off, int icmp6len, int code) { - struct mbuf *m = *mp; + struct mbuf *m; struct icmp6_hdr *icmp6; struct ip6_hdr *eip6; u_int32_t notifymtu; struct sockaddr_in6 icmp6src, icmp6dst; + m = *mp; + if (icmp6len < sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr)) { ICMP6STAT_INC(icp6s_tooshort); goto freeit; } -#ifndef PULLDOWN_TEST - IP6_EXTHDR_CHECK(m, off, - sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr), -1); - icmp6 = (struct icmp6_hdr *)(mtod(m, caddr_t) + off); -#else - IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, - sizeof(*icmp6) + sizeof(struct ip6_hdr)); - if (icmp6 == NULL) { - ICMP6STAT_INC(icp6s_tooshort); - return (-1); + + if (m->m_len < off + sizeof(*icmp6) + sizeof(struct ip6_hdr)) { + m = m_pullup(m, off + sizeof(*icmp6) + sizeof(struct ip6_hdr)); + if (m == NULL) { + IP6STAT_INC(ip6s_exthdrtoolong); + *mp = m; + return (-1); + } } -#endif + icmp6 = (struct icmp6_hdr *)(mtod(m, caddr_t) + off); eip6 = (struct ip6_hdr *)(icmp6 + 1); /* Detect the upper level protocol */ @@ -924,19 +932,17 @@ icmp6_notify_error(struct mbuf **mp, int off, int icmp6len, int code) case IPPROTO_HOPOPTS: case IPPROTO_DSTOPTS: case IPPROTO_AH: -#ifndef PULLDOWN_TEST - IP6_EXTHDR_CHECK(m, 0, - eoff + sizeof(struct ip6_ext), -1); - eh = (struct ip6_ext *)(mtod(m, caddr_t) + eoff); -#else - IP6_EXTHDR_GET(eh, struct ip6_ext *, m, - eoff, sizeof(*eh)); - if (eh == NULL) { - ICMP6STAT_INC(icp6s_tooshort); - return (-1); + if (m->m_len < eoff + sizeof(struct ip6_ext)) { + m = m_pullup(m, eoff + + sizeof(struct ip6_ext)); + if (m == NULL) { + IP6STAT_INC(ip6s_exthdrtoolong); + *mp = m; + 
return (-1); + } } -#endif - + eh = (struct ip6_ext *) + (mtod(m, caddr_t) + eoff); if (nxt == IPPROTO_AH) eoff += (eh->ip6e_len + 2) << 2; else @@ -952,18 +958,16 @@ icmp6_notify_error(struct mbuf **mp, int off, int icmp6len, int code) * information that depends on the final * destination (e.g. path MTU). */ -#ifndef PULLDOWN_TEST - IP6_EXTHDR_CHECK(m, 0, eoff + sizeof(*rth), -1); + if (m->m_len < eoff + sizeof(*rth)) { + m = m_pullup(m, eoff + sizeof(*rth)); + if (m == NULL) { + IP6STAT_INC(ip6s_exthdrtoolong); + *mp = m; + return (-1); + } + } rth = (struct ip6_rthdr *) (mtod(m, caddr_t) + eoff); -#else - IP6_EXTHDR_GET(rth, struct ip6_rthdr *, m, - eoff, sizeof(*rth)); - if (rth == NULL) { - ICMP6STAT_INC(icp6s_tooshort); - return (-1); - } -#endif rthlen = (rth->ip6r_len + 1) << 3; /* * XXX: currently there is no @@ -977,19 +981,17 @@ icmp6_notify_error(struct mbuf **mp, int off, int icmp6len, int code) rth->ip6r_type == IPV6_RTHDR_TYPE_0) { int hops; -#ifndef PULLDOWN_TEST - IP6_EXTHDR_CHECK(m, 0, eoff + rthlen, -1); + if (m->m_len < eoff + rthlen) { + m = m_pullup(m, eoff + rthlen); + if (m == NULL) { + IP6STAT_INC( + ip6s_exthdrtoolong); + *mp = m; + return (-1); + } + } rth0 = (struct ip6_rthdr0 *) (mtod(m, caddr_t) + eoff); -#else - IP6_EXTHDR_GET(rth0, - struct ip6_rthdr0 *, m, - eoff, rthlen); - if (rth0 == NULL) { - ICMP6STAT_INC(icp6s_tooshort); - return (-1); - } -#endif /* just ignore a bogus header */ if ((rth0->ip6r0_len % 2) == 0 && (hops = rth0->ip6r0_len/2)) @@ -999,19 +1001,17 @@ icmp6_notify_error(struct mbuf **mp, int off, int icmp6len, int code) nxt = rth->ip6r_nxt; break; case IPPROTO_FRAGMENT: -#ifndef PULLDOWN_TEST - IP6_EXTHDR_CHECK(m, 0, eoff + - sizeof(struct ip6_frag), -1); + if (m->m_len < eoff + sizeof(struct ip6_frag)) { + m = m_pullup(m, eoff + + sizeof(struct ip6_frag)); + if (m == NULL) { + IP6STAT_INC(ip6s_exthdrtoolong); + *mp = m; + return (-1); + } + } fh = (struct ip6_frag *)(mtod(m, caddr_t) + eoff); -#else - 
IP6_EXTHDR_GET(fh, struct ip6_frag *, m, - eoff, sizeof(*fh)); - if (fh == NULL) { - ICMP6STAT_INC(icp6s_tooshort); - return (-1); - } -#endif /* * Data after a fragment header is meaningless * unless it is the first fragment, but @@ -1037,16 +1037,7 @@ icmp6_notify_error(struct mbuf **mp, int off, int icmp6len, int code) } } notify: -#ifndef PULLDOWN_TEST icmp6 = (struct icmp6_hdr *)(mtod(m, caddr_t) + off); -#else - IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, - sizeof(*icmp6) + sizeof(struct ip6_hdr)); - if (icmp6 == NULL) { - ICMP6STAT_INC(icp6s_tooshort); - return (-1); - } -#endif /* * retrieve parameters from the inner IPv6 header, and convert @@ -1104,6 +1095,7 @@ icmp6_notify_error(struct mbuf **mp, int off, int icmp6len, int code) freeit: m_freem(m); + *mp = NULL; return (-1); } @@ -1191,15 +1183,7 @@ ni6_input(struct mbuf *m, int off) struct in6_ifaddr *ia6 = NULL; ip6 = mtod(m, struct ip6_hdr *); -#ifndef PULLDOWN_TEST ni6 = (struct icmp6_nodeinfo *)(mtod(m, caddr_t) + off); -#else - IP6_EXTHDR_GET(ni6, struct icmp6_nodeinfo *, m, off, sizeof(*ni6)); - if (ni6 == NULL) { - /* m is already reclaimed */ - return (NULL); - } -#endif /* * Validate IPv6 source address. @@ -1296,7 +1280,6 @@ ni6_input(struct mbuf *m, int off) * * We do not do proxy at this moment. */ - /* m_pulldown instead of copy? */ m_copydata(m, off + sizeof(struct icmp6_nodeinfo), subjlen, (caddr_t)&in6_subj); if (in6_setscope(&in6_subj, m->m_pkthdr.rcvif, NULL)) @@ -1340,10 +1323,19 @@ ni6_input(struct mbuf *m, int off) mtx_unlock(&pr->pr_mtx); if (!n || n->m_next || n->m_len == 0) goto bad; - IP6_EXTHDR_GET(subj, char *, m, - off + sizeof(struct icmp6_nodeinfo), subjlen); - if (subj == NULL) - goto bad; + if (m->m_len < off + sizeof(struct icmp6_nodeinfo) + + subjlen) { + m = m_pullup(m, off + + sizeof(struct icmp6_nodeinfo) + subjlen); + if (m == NULL) { + IP6STAT_INC(ip6s_exthdrtoolong); + goto bad; + } + } + /* ip6 possibly invalid but not used after. 
*/ + ni6 = (struct icmp6_nodeinfo *)(mtod(m, caddr_t) + off); + subj = (char *)(mtod(m, caddr_t) + off + + sizeof(struct icmp6_nodeinfo)); if (!ni6_dnsmatch(subj, subjlen, mtod(n, const char *), n->m_len)) { goto bad; @@ -1906,23 +1898,15 @@ icmp6_rip6_input(struct mbuf **mp, int off) { struct mbuf *m = *mp; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); - struct inpcb *in6p; + struct inpcb *inp; struct inpcb *last = NULL; struct sockaddr_in6 fromsa; struct icmp6_hdr *icmp6; struct epoch_tracker et; struct mbuf *opts = NULL; -#ifndef PULLDOWN_TEST - /* this is assumed to be safe. */ + /* This is assumed to be safe; icmp6_input() does a pullup. */ icmp6 = (struct icmp6_hdr *)((caddr_t)ip6 + off); -#else - IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6)); - if (icmp6 == NULL) { - /* m is already reclaimed */ - return (IPPROTO_DONE); - } -#endif /* * XXX: the address may have embedded scope zone ID, which should be @@ -1934,29 +1918,30 @@ icmp6_rip6_input(struct mbuf **mp, int off) fromsa.sin6_addr = ip6->ip6_src; if (sa6_recoverscope(&fromsa)) { m_freem(m); + *mp = NULL; return (IPPROTO_DONE); } INP_INFO_RLOCK_ET(&V_ripcbinfo, et); - CK_LIST_FOREACH(in6p, &V_ripcb, inp_list) { - if ((in6p->inp_vflag & INP_IPV6) == 0) + CK_LIST_FOREACH(inp, &V_ripcb, inp_list) { + if ((inp->inp_vflag & INP_IPV6) == 0) continue; - if (in6p->inp_ip_p != IPPROTO_ICMPV6) + if (inp->inp_ip_p != IPPROTO_ICMPV6) continue; - if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr) && - !IN6_ARE_ADDR_EQUAL(&in6p->in6p_laddr, &ip6->ip6_dst)) + if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) && + !IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &ip6->ip6_dst)) continue; - if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr) && - !IN6_ARE_ADDR_EQUAL(&in6p->in6p_faddr, &ip6->ip6_src)) + if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) && + !IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &ip6->ip6_src)) continue; - INP_RLOCK(in6p); - if (__predict_false(in6p->inp_flags2 & INP_FREED)) { - INP_RUNLOCK(in6p); + 
INP_RLOCK(inp); + if (__predict_false(inp->inp_flags2 & INP_FREED)) { + INP_RUNLOCK(inp); continue; } if (ICMP6_FILTER_WILLBLOCK(icmp6->icmp6_type, - in6p->in6p_icmp6filt)) { - INP_RUNLOCK(in6p); + inp->in6p_icmp6filt)) { + INP_RUNLOCK(inp); continue; } if (last != NULL) { @@ -2017,7 +2002,7 @@ icmp6_rip6_input(struct mbuf **mp, int off) } INP_RUNLOCK(last); } - last = in6p; + last = inp; } INP_INFO_RUNLOCK_ET(&V_ripcbinfo, et); if (last != NULL) { @@ -2059,7 +2044,8 @@ icmp6_rip6_input(struct mbuf **mp, int off) m_freem(m); IP6STAT_DEC(ip6s_delivered); } - return IPPROTO_DONE; + *mp = NULL; + return (IPPROTO_DONE); } /* @@ -2237,24 +2223,17 @@ void icmp6_redirect_input(struct mbuf *m, int off) { struct ifnet *ifp; - struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); + struct ip6_hdr *ip6; struct nd_redirect *nd_rd; - int icmp6len = ntohs(ip6->ip6_plen); - char *lladdr = NULL; - int lladdrlen = 0; - int is_router; - int is_onlink; - struct in6_addr src6 = ip6->ip6_src; - struct in6_addr redtgt6; - struct in6_addr reddst6; + struct in6_addr src6, redtgt6, reddst6; union nd_opts ndopts; char ip6buf[INET6_ADDRSTRLEN]; + char *lladdr; + int icmp6len, is_onlink, is_router, lladdrlen; M_ASSERTPKTHDR(m); KASSERT(m->m_pkthdr.rcvif != NULL, ("%s: no rcvif", __func__)); - ifp = m->m_pkthdr.rcvif; - /* XXX if we are router, we don't update route by icmp6 redirect */ if (V_ip6_forwarding) goto freeit; @@ -2265,25 +2244,29 @@ icmp6_redirect_input(struct mbuf *m, int off) if(m->m_flags & M_FRAGMENTED) goto freeit; -#ifndef PULLDOWN_TEST - IP6_EXTHDR_CHECK(m, off, icmp6len,); - nd_rd = (struct nd_redirect *)((caddr_t)ip6 + off); -#else - IP6_EXTHDR_GET(nd_rd, struct nd_redirect *, m, off, icmp6len); - if (nd_rd == NULL) { - ICMP6STAT_INC(icp6s_tooshort); - return; + ip6 = mtod(m, struct ip6_hdr *); + icmp6len = ntohs(ip6->ip6_plen); + if (m->m_len < off + icmp6len) { + m = m_pullup(m, off + icmp6len); + if (m == NULL) { + IP6STAT_INC(ip6s_exthdrtoolong); + return; + } } -#endif + 
ip6 = mtod(m, struct ip6_hdr *); + nd_rd = (struct nd_redirect *)((caddr_t)ip6 + off); + + ifp = m->m_pkthdr.rcvif; redtgt6 = nd_rd->nd_rd_target; reddst6 = nd_rd->nd_rd_dst; - if (in6_setscope(&redtgt6, m->m_pkthdr.rcvif, NULL) || - in6_setscope(&reddst6, m->m_pkthdr.rcvif, NULL)) { + if (in6_setscope(&redtgt6, ifp, NULL) || + in6_setscope(&reddst6, ifp, NULL)) { goto freeit; } /* validation */ + src6 = ip6->ip6_src; if (!IN6_IS_ADDR_LINKLOCAL(&src6)) { nd6log((LOG_ERR, "ICMP6 redirect sent from %s rejected; " @@ -2369,6 +2352,8 @@ icmp6_redirect_input(struct mbuf *m, int off) goto freeit; } + lladdr = NULL; + lladdrlen = 0; if (ndopts.nd_opts_tgt_lladdr) { lladdr = (char *)(ndopts.nd_opts_tgt_lladdr + 1); lladdrlen = ndopts.nd_opts_tgt_lladdr->nd_opt_len << 3; diff --git a/freebsd/sys/netinet6/in6.c b/freebsd/sys/netinet6/in6.c index 078efe45..a42b7bf7 100644 --- a/freebsd/sys/netinet6/in6.c +++ b/freebsd/sys/netinet6/in6.c @@ -86,6 +86,7 @@ __FBSDID("$FreeBSD$"); #include <sys/kernel.h> #include <sys/lock.h> #include <sys/rmlock.h> +#include <sys/sysctl.h> #include <sys/syslog.h> #include <net/if.h> @@ -2027,8 +2028,6 @@ in6_if2idlen(struct ifnet *ifp) } } -#include <sys/sysctl.h> - struct in6_llentry { struct llentry base; }; diff --git a/freebsd/sys/netinet6/in6_mcast.c b/freebsd/sys/netinet6/in6_mcast.c index 1ac10633..cf7c7ff2 100644 --- a/freebsd/sys/netinet6/in6_mcast.c +++ b/freebsd/sys/netinet6/in6_mcast.c @@ -1830,7 +1830,7 @@ ip6_getmoptions(struct inpcb *inp, struct sockopt *sopt) * Returns NULL if no ifp could be found. 
*/ static struct ifnet * -in6p_lookup_mcast_ifp(const struct inpcb *in6p, +in6p_lookup_mcast_ifp(const struct inpcb *inp, const struct sockaddr_in6 *gsin6) { struct nhop6_basic nh6; @@ -1838,13 +1838,13 @@ in6p_lookup_mcast_ifp(const struct inpcb *in6p, uint32_t scopeid; uint32_t fibnum; - KASSERT(in6p->inp_vflag & INP_IPV6, + KASSERT(inp->inp_vflag & INP_IPV6, ("%s: not INP_IPV6 inpcb", __func__)); KASSERT(gsin6->sin6_family == AF_INET6, ("%s: not AF_INET6 group", __func__)); in6_splitscope(&gsin6->sin6_addr, &dst, &scopeid); - fibnum = in6p ? in6p->inp_inc.inc_fibnum : RT_DEFAULT_FIB; + fibnum = inp ? inp->inp_inc.inc_fibnum : RT_DEFAULT_FIB; if (fib6_lookup_nh_basic(fibnum, &dst, scopeid, 0, 0, &nh6) != 0) return (NULL); @@ -2111,6 +2111,7 @@ in6p_join_group(struct inpcb *inp, struct sockopt *sopt) * NOTE: Refcount from in6_joingroup_locked() * is protecting membership. */ + ip6_mfilter_insert(&imo->im6o_head, imf); } else { CTR1(KTR_MLD, "%s: merge inm state", __func__); IN6_MULTI_LIST_LOCK(); @@ -2136,9 +2137,6 @@ in6p_join_group(struct inpcb *inp, struct sockopt *sopt) } } - if (is_new) - ip6_mfilter_insert(&imo->im6o_head, imf); - im6f_commit(imf); imf = NULL; @@ -2330,6 +2328,12 @@ in6p_leave_group(struct inpcb *inp, struct sockopt *sopt) if (is_final) { ip6_mfilter_remove(&imo->im6o_head, imf); im6f_leave(imf); + + /* + * Give up the multicast address record to which + * the membership points. + */ + (void)in6_leavegroup_locked(inm, imf); } else { if (imf->im6f_st[0] == MCAST_EXCLUDE) { error = EADDRNOTAVAIL; @@ -2386,14 +2390,8 @@ in6p_leave_group(struct inpcb *inp, struct sockopt *sopt) out_in6p_locked: INP_WUNLOCK(inp); - if (is_final && imf) { - /* - * Give up the multicast address record to which - * the membership points. 
- */ - (void)in6_leavegroup_locked(inm, imf); + if (is_final && imf) ip6_mfilter_free(imf); - } IN6_MULTI_UNLOCK(); return (error); diff --git a/freebsd/sys/netinet6/in6_pcb.c b/freebsd/sys/netinet6/in6_pcb.c index 3c89cdf4..903bc09b 100644 --- a/freebsd/sys/netinet6/in6_pcb.c +++ b/freebsd/sys/netinet6/in6_pcb.c @@ -816,20 +816,20 @@ in6_pcblookup_local(struct inpcbinfo *pcbinfo, struct in6_addr *laddr, void in6_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) { - struct inpcb *in6p; + struct inpcb *inp; struct in6_multi *inm; struct in6_mfilter *imf; struct ip6_moptions *im6o; INP_INFO_WLOCK(pcbinfo); - CK_LIST_FOREACH(in6p, pcbinfo->ipi_listhead, inp_list) { - INP_WLOCK(in6p); - if (__predict_false(in6p->inp_flags2 & INP_FREED)) { - INP_WUNLOCK(in6p); + CK_LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) { + INP_WLOCK(inp); + if (__predict_false(inp->inp_flags2 & INP_FREED)) { + INP_WUNLOCK(inp); continue; } - im6o = in6p->in6p_moptions; - if ((in6p->inp_vflag & INP_IPV6) && im6o != NULL) { + im6o = inp->in6p_moptions; + if ((inp->inp_vflag & INP_IPV6) && im6o != NULL) { /* * Unselect the outgoing ifp for multicast if it * is being detached. 
@@ -853,7 +853,7 @@ restart: goto restart; } } - INP_WUNLOCK(in6p); + INP_WUNLOCK(inp); } INP_INFO_WUNLOCK(pcbinfo); } diff --git a/freebsd/sys/netinet6/in6_pcb.h b/freebsd/sys/netinet6/in6_pcb.h index 2c6bcdc6..56ea6eeb 100644 --- a/freebsd/sys/netinet6/in6_pcb.h +++ b/freebsd/sys/netinet6/in6_pcb.h @@ -113,7 +113,7 @@ int in6_getpeeraddr(struct socket *so, struct sockaddr **nam); int in6_getsockaddr(struct socket *so, struct sockaddr **nam); int in6_mapped_sockaddr(struct socket *so, struct sockaddr **nam); int in6_mapped_peeraddr(struct socket *so, struct sockaddr **nam); -int in6_selecthlim(struct in6pcb *, struct ifnet *); +int in6_selecthlim(struct inpcb *, struct ifnet *); int in6_pcbsetport(struct in6_addr *, struct inpcb *, struct ucred *); void init_sin6(struct sockaddr_in6 *sin6, struct mbuf *m, int); #endif /* _KERNEL */ diff --git a/freebsd/sys/netinet6/in6_proto.c b/freebsd/sys/netinet6/in6_proto.c index cf62e60c..a16818ce 100644 --- a/freebsd/sys/netinet6/in6_proto.c +++ b/freebsd/sys/netinet6/in6_proto.c @@ -386,10 +386,6 @@ VNET_DEFINE(int, ip6_accept_rtadv) = 0; VNET_DEFINE(int, ip6_no_radr) = 0; VNET_DEFINE(int, ip6_norbit_raif) = 0; VNET_DEFINE(int, ip6_rfc6204w3) = 0; -VNET_DEFINE(int, ip6_maxfragpackets); /* initialized in frag6.c:frag6_init() */ -int ip6_maxfrags; /* initialized in frag6.c:frag6_init() */ -VNET_DEFINE(int, ip6_maxfragbucketsize);/* initialized in frag6.c:frag6_init() */ -VNET_DEFINE(int, ip6_maxfragsperpacket); /* initialized in frag6.c:frag6_init() */ VNET_DEFINE(int, ip6_log_interval) = 5; VNET_DEFINE(int, ip6_hdrnestlimit) = 15;/* How many header options will we * process? 
*/ @@ -476,20 +472,6 @@ sysctl_ip6_tempvltime(SYSCTL_HANDLER_ARGS) return (0); } -static int -sysctl_ip6_maxfragpackets(SYSCTL_HANDLER_ARGS) -{ - int error, val; - - val = V_ip6_maxfragpackets; - error = sysctl_handle_int(oidp, &val, 0, req); - if (error != 0 || !req->newptr) - return (error); - V_ip6_maxfragpackets = val; - frag6_set_bucketsize(); - return (0); -} - SYSCTL_INT(_net_inet6_ip6, IPV6CTL_FORWARDING, forwarding, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_forwarding), 0, "Enable forwarding of IPv6 packets between interfaces"); @@ -502,12 +484,6 @@ SYSCTL_INT(_net_inet6_ip6, IPV6CTL_DEFHLIM, hlim, SYSCTL_VNET_PCPUSTAT(_net_inet6_ip6, IPV6CTL_STATS, stats, struct ip6stat, ip6stat, "IP6 statistics (struct ip6stat, netinet6/ip6_var.h)"); -SYSCTL_PROC(_net_inet6_ip6, IPV6CTL_MAXFRAGPACKETS, maxfragpackets, - CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, NULL, 0, - sysctl_ip6_maxfragpackets, "I", - "Default maximum number of outstanding fragmented IPv6 packets. " - "A value of 0 means no fragmented packets will be accepted, while a " - "a value of -1 means no limit"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_ACCEPT_RTADV, accept_rtadv, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_accept_rtadv), 0, "Default value of per-interface flag for accepting ICMPv6 RA messages"); @@ -577,17 +553,6 @@ SYSCTL_INT(_net_inet6_ip6, IPV6CTL_PREFER_TEMPADDR, prefer_tempaddr, SYSCTL_INT(_net_inet6_ip6, IPV6CTL_USE_DEFAULTZONE, use_defaultzone, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_use_defzone), 0, "Use the default scope zone when none is specified"); -SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGS, maxfrags, - CTLFLAG_RW, &ip6_maxfrags, 0, - "Maximum allowed number of outstanding IPv6 packet fragments. 
" - "A value of 0 means no fragmented packets will be accepted, while a " - "a value of -1 means no limit"); -SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGBUCKETSIZE, maxfragbucketsize, - CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_maxfragbucketsize), 0, - "Maximum number of reassembly queues per hash bucket"); -SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGSPERPACKET, maxfragsperpacket, - CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_maxfragsperpacket), 0, - "Maximum allowed number of fragments per packet"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MCAST_PMTU, mcast_pmtu, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_mcast_pmtu), 0, "Enable path MTU discovery for multicast packets"); diff --git a/freebsd/sys/netinet6/in6_src.c b/freebsd/sys/netinet6/in6_src.c index 170eaf18..0bd8bba4 100644 --- a/freebsd/sys/netinet6/in6_src.c +++ b/freebsd/sys/netinet6/in6_src.c @@ -933,21 +933,21 @@ in6_selectroute_fib(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, * 3. The system default hoplimit. */ int -in6_selecthlim(struct inpcb *in6p, struct ifnet *ifp) +in6_selecthlim(struct inpcb *inp, struct ifnet *ifp) { - if (in6p && in6p->in6p_hops >= 0) - return (in6p->in6p_hops); + if (inp && inp->in6p_hops >= 0) + return (inp->in6p_hops); else if (ifp) return (ND_IFINFO(ifp)->chlim); - else if (in6p && !IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr)) { + else if (inp && !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { struct nhop6_basic nh6; struct in6_addr dst; uint32_t fibnum, scopeid; int hlim; - fibnum = in6p->inp_inc.inc_fibnum; - in6_splitscope(&in6p->in6p_faddr, &dst, &scopeid); + fibnum = inp->inp_inc.inc_fibnum; + in6_splitscope(&inp->in6p_faddr, &dst, &scopeid); if (fib6_lookup_nh_basic(fibnum, &dst, scopeid, 0, 0, &nh6)==0){ hlim = ND_IFINFO(nh6.nh_ifp)->chlim; return (hlim); diff --git a/freebsd/sys/netinet6/ip6_forward.c b/freebsd/sys/netinet6/ip6_forward.c index 80535efe..97a7a6c6 100644 --- a/freebsd/sys/netinet6/ip6_forward.c +++ b/freebsd/sys/netinet6/ip6_forward.c @@ -40,6 +40,7 @@ 
__FBSDID("$FreeBSD$"); #include <rtems/bsd/local/opt_inet6.h> #include <rtems/bsd/local/opt_ipsec.h> #include <rtems/bsd/local/opt_ipstealth.h> +#include <rtems/bsd/local/opt_sctp.h> #include <sys/param.h> #include <sys/systm.h> diff --git a/freebsd/sys/netinet6/ip6_input.c b/freebsd/sys/netinet6/ip6_input.c index 25ab624c..6800d002 100644 --- a/freebsd/sys/netinet6/ip6_input.c +++ b/freebsd/sys/netinet6/ip6_input.c @@ -205,9 +205,6 @@ struct rmlock in6_ifaddr_lock; RM_SYSINIT(in6_ifaddr_lock, &in6_ifaddr_lock, "in6_ifaddr_lock"); static int ip6_hopopts_input(u_int32_t *, u_int32_t *, struct mbuf **, int *); -#ifdef PULLDOWN_TEST -static struct mbuf *ip6_pullexthdr(struct mbuf *, size_t, int); -#endif /* * IP6 initialization: fill in IP6 protocol switch table. @@ -396,6 +393,7 @@ ip6_destroy(void *unused __unused) } IFNET_RUNLOCK(); + frag6_destroy(); nd6_destroy(); in6_ifattach_destroy(); @@ -406,20 +404,22 @@ VNET_SYSUNINIT(inet6, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, ip6_destroy, NULL); #endif static int -ip6_input_hbh(struct mbuf *m, uint32_t *plen, uint32_t *rtalert, int *off, +ip6_input_hbh(struct mbuf **mp, uint32_t *plen, uint32_t *rtalert, int *off, int *nxt, int *ours) { + struct mbuf *m; struct ip6_hdr *ip6; struct ip6_hbh *hbh; - if (ip6_hopopts_input(plen, rtalert, &m, off)) { + if (ip6_hopopts_input(plen, rtalert, mp, off)) { #if 0 /*touches NULL pointer*/ - in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); + in6_ifstat_inc((*mp)->m_pkthdr.rcvif, ifs6_in_discard); #endif goto out; /* m have already been freed */ } /* adjust pointer */ + m = *mp; ip6 = mtod(m, struct ip6_hdr *); /* @@ -441,17 +441,8 @@ ip6_input_hbh(struct mbuf *m, uint32_t *plen, uint32_t *rtalert, int *off, (caddr_t)&ip6->ip6_plen - (caddr_t)ip6); goto out; } -#ifndef PULLDOWN_TEST /* ip6_hopopts_input() ensures that mbuf is contiguous */ hbh = (struct ip6_hbh *)(ip6 + 1); -#else - IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr), - sizeof(struct ip6_hbh)); - if (hbh 
== NULL) { - IP6STAT_INC(ip6s_tooshort); - goto out; - } -#endif *nxt = hbh->ip6h_nxt; /* @@ -602,7 +593,6 @@ ip6_input(struct mbuf *m) in6_ifstat_inc(rcvif, ifs6_in_receive); IP6STAT_INC(ip6s_total); -#ifndef PULLDOWN_TEST /* * L2 bridge code and some other code can return mbuf chain * that does not conform to KAME requirement. too bad. @@ -624,9 +614,6 @@ ip6_input(struct mbuf *m) m_freem(m); m = n; } - IP6_EXTHDR_CHECK(m, 0, sizeof(struct ip6_hdr), /* nothing */); -#endif - if (m->m_len < sizeof(struct ip6_hdr)) { if ((m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) { IP6STAT_INC(ip6s_toosmall); @@ -693,11 +680,10 @@ ip6_input(struct mbuf *m) * and bypass security checks (act as if it was from 127.0.0.1 by using * IPv6 src ::ffff:127.0.0.1). Be cautious. * - * This check chokes if we are in an SIIT cloud. As none of BSDs - * support IPv4-less kernel compilation, we cannot support SIIT - * environment at all. So, it makes more sense for us to reject any - * malicious packets for non-SIIT environment, than try to do a - * partial support for SIIT environment. + * We have supported IPv6-only kernels for a few years and this issue + * has not come up. The world seems to move mostly towards not using + * v4mapped on the wire, so it makes sense for us to keep rejecting + * any such packets. */ if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) || IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) { @@ -859,7 +845,7 @@ passin: */ plen = (u_int32_t)ntohs(ip6->ip6_plen); if (ip6->ip6_nxt == IPPROTO_HOPOPTS) { - if (ip6_input_hbh(m, &plen, &rtalert, &off, &nxt, &ours) != 0) + if (ip6_input_hbh(&m, &plen, &rtalert, &off, &nxt, &ours) != 0) return; } else nxt = ip6->ip6_nxt; @@ -915,24 +901,6 @@ passin: return; } - ip6 = mtod(m, struct ip6_hdr *); - - /* - * Malicious party may be able to use IPv4 mapped addr to confuse - * tcp/udp stack and bypass security checks (act as if it was from - * 127.0.0.1 by using IPv6 src ::ffff:127.0.0.1). Be cautious. 
- * - * For SIIT end node behavior, you may want to disable the check. - * However, you will become vulnerable to attacks using IPv4 mapped - * source. - */ - if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) || - IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) { - IP6STAT_INC(ip6s_badscope); - in6_ifstat_inc(rcvif, ifs6_in_addrerr); - goto bad; - } - /* * Tell launch routine the next header */ @@ -987,33 +955,33 @@ ip6_hopopts_input(u_int32_t *plenp, u_int32_t *rtalertp, struct ip6_hbh *hbh; /* validation of the length of the header */ -#ifndef PULLDOWN_TEST - IP6_EXTHDR_CHECK(m, off, sizeof(*hbh), -1); + if (m->m_len < off + sizeof(*hbh)) { + m = m_pullup(m, off + sizeof(*hbh)); + if (m == NULL) { + IP6STAT_INC(ip6s_exthdrtoolong); + *mp = NULL; + return (-1); + } + } hbh = (struct ip6_hbh *)(mtod(m, caddr_t) + off); hbhlen = (hbh->ip6h_len + 1) << 3; - IP6_EXTHDR_CHECK(m, off, hbhlen, -1); - hbh = (struct ip6_hbh *)(mtod(m, caddr_t) + off); -#else - IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, - sizeof(struct ip6_hdr), sizeof(struct ip6_hbh)); - if (hbh == NULL) { - IP6STAT_INC(ip6s_tooshort); - return -1; - } - hbhlen = (hbh->ip6h_len + 1) << 3; - IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr), - hbhlen); - if (hbh == NULL) { - IP6STAT_INC(ip6s_tooshort); - return -1; + if (m->m_len < off + hbhlen) { + m = m_pullup(m, off + hbhlen); + if (m == NULL) { + IP6STAT_INC(ip6s_exthdrtoolong); + *mp = NULL; + return (-1); + } } -#endif + hbh = (struct ip6_hbh *)(mtod(m, caddr_t) + off); off += hbhlen; hbhlen -= sizeof(struct ip6_hbh); if (ip6_process_hopopts(m, (u_int8_t *)hbh + sizeof(struct ip6_hbh), - hbhlen, rtalertp, plenp) < 0) + hbhlen, rtalertp, plenp) < 0) { + *mp = NULL; return (-1); + } *offp = off; *mp = m; @@ -1198,10 +1166,9 @@ ip6_unknown_opt(u_int8_t *optp, struct mbuf *m, int off) * Create the "control" list for this pcb. * These functions will not modify mbuf chain at all. 
* - * With KAME mbuf chain restriction: * The routine will be called from upper layer handlers like tcp6_input(). * Thus the routine assumes that the caller (tcp6_input) have already - * called IP6_EXTHDR_CHECK() and all the extension headers are located in the + * called m_pullup() and all the extension headers are located in the * very first mbuf on the mbuf chain. * * ip6_savecontrol_v4 will handle those options that are possible to be @@ -1409,15 +1376,16 @@ ip6_savecontrol_v4(struct inpcb *inp, struct mbuf *m, struct mbuf **mp, } void -ip6_savecontrol(struct inpcb *in6p, struct mbuf *m, struct mbuf **mp) +ip6_savecontrol(struct inpcb *inp, struct mbuf *m, struct mbuf **mp) { - struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); + struct ip6_hdr *ip6; int v4only = 0; - mp = ip6_savecontrol_v4(in6p, m, mp, &v4only); + mp = ip6_savecontrol_v4(inp, m, mp, &v4only); if (v4only) return; + ip6 = mtod(m, struct ip6_hdr *); /* * IPV6_HOPOPTS socket option. Recall that we required super-user * privilege for the option (see ip6_ctloutput), but it might be too @@ -1425,7 +1393,7 @@ ip6_savecontrol(struct inpcb *in6p, struct mbuf *m, struct mbuf **mp) * returned to normal user. * See also RFC 2292 section 6 (or RFC 3542 section 8). 
*/ - if ((in6p->inp_flags & IN6P_HOPOPTS) != 0) { + if ((inp->inp_flags & IN6P_HOPOPTS) != 0) { /* * Check if a hop-by-hop options header is contatined in the * received packet, and if so, store the options as ancillary @@ -1435,29 +1403,10 @@ ip6_savecontrol(struct inpcb *in6p, struct mbuf *m, struct mbuf **mp) */ if (ip6->ip6_nxt == IPPROTO_HOPOPTS) { struct ip6_hbh *hbh; - int hbhlen = 0; -#ifdef PULLDOWN_TEST - struct mbuf *ext; -#endif + int hbhlen; -#ifndef PULLDOWN_TEST hbh = (struct ip6_hbh *)(ip6 + 1); hbhlen = (hbh->ip6h_len + 1) << 3; -#else - ext = ip6_pullexthdr(m, sizeof(struct ip6_hdr), - ip6->ip6_nxt); - if (ext == NULL) { - IP6STAT_INC(ip6s_tooshort); - return; - } - hbh = mtod(ext, struct ip6_hbh *); - hbhlen = (hbh->ip6h_len + 1) << 3; - if (hbhlen != ext->m_len) { - m_freem(ext); - IP6STAT_INC(ip6s_tooshort); - return; - } -#endif /* * XXX: We copy the whole header even if a @@ -1467,17 +1416,14 @@ ip6_savecontrol(struct inpcb *in6p, struct mbuf *m, struct mbuf **mp) * Note: this constraint is removed in RFC3542 */ *mp = sbcreatecontrol((caddr_t)hbh, hbhlen, - IS2292(in6p, IPV6_2292HOPOPTS, IPV6_HOPOPTS), + IS2292(inp, IPV6_2292HOPOPTS, IPV6_HOPOPTS), IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; -#ifdef PULLDOWN_TEST - m_freem(ext); -#endif } } - if ((in6p->inp_flags & (IN6P_RTHDR | IN6P_DSTOPTS)) != 0) { + if ((inp->inp_flags & (IN6P_RTHDR | IN6P_DSTOPTS)) != 0) { int nxt = ip6->ip6_nxt, off = sizeof(struct ip6_hdr); /* @@ -1490,9 +1436,6 @@ ip6_savecontrol(struct inpcb *in6p, struct mbuf *m, struct mbuf **mp) while (1) { /* is explicit loop prevention necessary? 
*/ struct ip6_ext *ip6e = NULL; int elen; -#ifdef PULLDOWN_TEST - struct mbuf *ext = NULL; -#endif /* * if it is not an extension header, don't try to @@ -1508,7 +1451,6 @@ ip6_savecontrol(struct inpcb *in6p, struct mbuf *m, struct mbuf **mp) goto loopend; } -#ifndef PULLDOWN_TEST if (off + sizeof(*ip6e) > m->m_len) goto loopend; ip6e = (struct ip6_ext *)(mtod(m, caddr_t) + off); @@ -1518,42 +1460,25 @@ ip6_savecontrol(struct inpcb *in6p, struct mbuf *m, struct mbuf **mp) elen = (ip6e->ip6e_len + 1) << 3; if (off + elen > m->m_len) goto loopend; -#else - ext = ip6_pullexthdr(m, off, nxt); - if (ext == NULL) { - IP6STAT_INC(ip6s_tooshort); - return; - } - ip6e = mtod(ext, struct ip6_ext *); - if (nxt == IPPROTO_AH) - elen = (ip6e->ip6e_len + 2) << 2; - else - elen = (ip6e->ip6e_len + 1) << 3; - if (elen != ext->m_len) { - m_freem(ext); - IP6STAT_INC(ip6s_tooshort); - return; - } -#endif switch (nxt) { case IPPROTO_DSTOPTS: - if (!(in6p->inp_flags & IN6P_DSTOPTS)) + if (!(inp->inp_flags & IN6P_DSTOPTS)) break; *mp = sbcreatecontrol((caddr_t)ip6e, elen, - IS2292(in6p, + IS2292(inp, IPV6_2292DSTOPTS, IPV6_DSTOPTS), IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; break; case IPPROTO_ROUTING: - if (!(in6p->inp_flags & IN6P_RTHDR)) + if (!(inp->inp_flags & IN6P_RTHDR)) break; *mp = sbcreatecontrol((caddr_t)ip6e, elen, - IS2292(in6p, IPV6_2292RTHDR, IPV6_RTHDR), + IS2292(inp, IPV6_2292RTHDR, IPV6_RTHDR), IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; @@ -1569,9 +1494,6 @@ ip6_savecontrol(struct inpcb *in6p, struct mbuf *m, struct mbuf **mp) * the code just in case (nxt overwritten or * other cases). 
*/ -#ifdef PULLDOWN_TEST - m_freem(ext); -#endif goto loopend; } @@ -1580,16 +1502,12 @@ ip6_savecontrol(struct inpcb *in6p, struct mbuf *m, struct mbuf **mp) off += elen; nxt = ip6e->ip6e_nxt; ip6e = NULL; -#ifdef PULLDOWN_TEST - m_freem(ext); - ext = NULL; -#endif } loopend: ; } - if (in6p->inp_flags2 & INP_RECVFLOWID) { + if (inp->inp_flags2 & INP_RECVFLOWID) { uint32_t flowid, flow_type; flowid = m->m_pkthdr.flowid; @@ -1610,7 +1528,7 @@ ip6_savecontrol(struct inpcb *in6p, struct mbuf *m, struct mbuf **mp) } #ifdef RSS - if (in6p->inp_flags2 & INP_RECVRSSBUCKETID) { + if (inp->inp_flags2 & INP_RECVRSSBUCKETID) { uint32_t flowid, flow_type; uint32_t rss_bucketid; @@ -1669,49 +1587,6 @@ ip6_notify_pmtu(struct inpcb *inp, struct sockaddr_in6 *dst, u_int32_t mtu) sorwakeup(so); } -#ifdef PULLDOWN_TEST -/* - * pull single extension header from mbuf chain. returns single mbuf that - * contains the result, or NULL on error. - */ -static struct mbuf * -ip6_pullexthdr(struct mbuf *m, size_t off, int nxt) -{ - struct ip6_ext ip6e; - size_t elen; - struct mbuf *n; - -#ifdef DIAGNOSTIC - switch (nxt) { - case IPPROTO_DSTOPTS: - case IPPROTO_ROUTING: - case IPPROTO_HOPOPTS: - case IPPROTO_AH: /* is it possible? */ - break; - default: - printf("ip6_pullexthdr: invalid nxt=%d\n", nxt); - } -#endif - - m_copydata(m, off, sizeof(ip6e), (caddr_t)&ip6e); - if (nxt == IPPROTO_AH) - elen = (ip6e.ip6e_len + 2) << 2; - else - elen = (ip6e.ip6e_len + 1) << 3; - - if (elen > MLEN) - n = m_getcl(M_NOWAIT, MT_DATA, 0); - else - n = m_get(M_NOWAIT, MT_DATA); - if (n == NULL) - return NULL; - - m_copydata(m, off, elen, mtod(n, caddr_t)); - n->m_len = elen; - return n; -} -#endif - /* * Get pointer to the previous header followed by the header * currently processed. 
diff --git a/freebsd/sys/netinet6/ip6_mroute.c b/freebsd/sys/netinet6/ip6_mroute.c index 9dee53b0..437d6da7 100644 --- a/freebsd/sys/netinet6/ip6_mroute.c +++ b/freebsd/sys/netinet6/ip6_mroute.c @@ -1722,12 +1722,10 @@ pim6_input(struct mbuf *m, int off, int proto, void *arg __unused) PIM6STAT_INC(pim6s_rcv_total); - ip6 = mtod(m, struct ip6_hdr *); - pimlen = m->m_pkthdr.len - off; - /* * Validate lengths */ + pimlen = m->m_pkthdr.len - off; if (pimlen < PIM_MINLEN) { PIM6STAT_INC(pim6s_rcv_tooshort); MRT6_DLOG(DEBUG_PIM, "PIM packet too short"); @@ -1749,20 +1747,15 @@ pim6_input(struct mbuf *m, int off, int proto, void *arg __unused) * Make sure that the IP6 and PIM headers in contiguous memory, and * possibly the PIM REGISTER header */ -#ifndef PULLDOWN_TEST - IP6_EXTHDR_CHECK(m, off, minlen, IPPROTO_DONE); - /* adjust pointer */ + if (m->m_len < off + minlen) { + m = m_pullup(m, off + minlen); + if (m == NULL) { + IP6STAT_INC(ip6s_exthdrtoolong); + return (IPPROTO_DONE); + } + } ip6 = mtod(m, struct ip6_hdr *); - - /* adjust mbuf to point to the PIM header */ pim = (struct pim *)((caddr_t)ip6 + off); -#else - IP6_EXTHDR_GET(pim, struct pim *, m, off, minlen); - if (pim == NULL) { - PIM6STAT_INC(pim6s_rcv_tooshort); - return (IPPROTO_DONE); - } -#endif #define PIM6_CHECKSUM #ifdef PIM6_CHECKSUM diff --git a/freebsd/sys/netinet6/ip6_output.c b/freebsd/sys/netinet6/ip6_output.c index e941ac49..73312ca6 100644 --- a/freebsd/sys/netinet6/ip6_output.c +++ b/freebsd/sys/netinet6/ip6_output.c @@ -968,6 +968,7 @@ passout: in_pcboutput_txrtlmt(inp, ifp, m); /* stamp send tag on mbuf */ m->m_pkthdr.snd_tag = inp->inp_snd_tag; + m->m_pkthdr.csum_flags |= CSUM_SND_TAG; } else { m->m_pkthdr.snd_tag = NULL; } @@ -1083,6 +1084,7 @@ sendorfree: in_pcboutput_txrtlmt(inp, ifp, m); /* stamp send tag on mbuf */ m->m_pkthdr.snd_tag = inp->inp_snd_tag; + m->m_pkthdr.csum_flags |= CSUM_SND_TAG; } else { m->m_pkthdr.snd_tag = NULL; } @@ -1421,7 +1423,7 @@ ip6_ctloutput(struct socket 
*so, struct sockopt *sopt) { int optdatalen, uproto; void *optdata; - struct inpcb *in6p = sotoinpcb(so); + struct inpcb *inp = sotoinpcb(so); int error, optval; int level, op, optname; int optlen; @@ -1456,43 +1458,43 @@ ip6_ctloutput(struct socket *so, struct sockopt *sopt) sopt->sopt_dir == SOPT_SET) { switch (sopt->sopt_name) { case SO_REUSEADDR: - INP_WLOCK(in6p); + INP_WLOCK(inp); if ((so->so_options & SO_REUSEADDR) != 0) - in6p->inp_flags2 |= INP_REUSEADDR; + inp->inp_flags2 |= INP_REUSEADDR; else - in6p->inp_flags2 &= ~INP_REUSEADDR; - INP_WUNLOCK(in6p); + inp->inp_flags2 &= ~INP_REUSEADDR; + INP_WUNLOCK(inp); error = 0; break; case SO_REUSEPORT: - INP_WLOCK(in6p); + INP_WLOCK(inp); if ((so->so_options & SO_REUSEPORT) != 0) - in6p->inp_flags2 |= INP_REUSEPORT; + inp->inp_flags2 |= INP_REUSEPORT; else - in6p->inp_flags2 &= ~INP_REUSEPORT; - INP_WUNLOCK(in6p); + inp->inp_flags2 &= ~INP_REUSEPORT; + INP_WUNLOCK(inp); error = 0; break; case SO_REUSEPORT_LB: - INP_WLOCK(in6p); + INP_WLOCK(inp); if ((so->so_options & SO_REUSEPORT_LB) != 0) - in6p->inp_flags2 |= INP_REUSEPORT_LB; + inp->inp_flags2 |= INP_REUSEPORT_LB; else - in6p->inp_flags2 &= ~INP_REUSEPORT_LB; - INP_WUNLOCK(in6p); + inp->inp_flags2 &= ~INP_REUSEPORT_LB; + INP_WUNLOCK(inp); error = 0; break; case SO_SETFIB: - INP_WLOCK(in6p); - in6p->inp_inc.inc_fibnum = so->so_fibnum; - INP_WUNLOCK(in6p); + INP_WLOCK(inp); + inp->inp_inc.inc_fibnum = so->so_fibnum; + INP_WUNLOCK(inp); error = 0; break; case SO_MAX_PACING_RATE: #ifdef RATELIMIT - INP_WLOCK(in6p); - in6p->inp_flags2 |= INP_RATE_LIMIT_CHANGED; - INP_WUNLOCK(in6p); + INP_WLOCK(inp); + inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; + INP_WUNLOCK(inp); error = 0; #else error = EOPNOTSUPP; @@ -1526,7 +1528,7 @@ ip6_ctloutput(struct socket *so, struct sockopt *sopt) error = soopt_mcopyin(sopt, m); /* XXX */ if (error != 0) break; - error = ip6_pcbopts(&in6p->in6p_outputopts, + error = ip6_pcbopts(&inp->in6p_outputopts, m, so, sopt); m_freem(m); /* XXX */ 
break; @@ -1597,57 +1599,57 @@ ip6_ctloutput(struct socket *so, struct sockopt *sopt) error = EINVAL; else { /* -1 = kernel default */ - in6p->in6p_hops = optval; - if ((in6p->inp_vflag & + inp->in6p_hops = optval; + if ((inp->inp_vflag & INP_IPV4) != 0) - in6p->inp_ip_ttl = optval; + inp->inp_ip_ttl = optval; } break; #define OPTSET(bit) \ do { \ - INP_WLOCK(in6p); \ + INP_WLOCK(inp); \ if (optval) \ - in6p->inp_flags |= (bit); \ + inp->inp_flags |= (bit); \ else \ - in6p->inp_flags &= ~(bit); \ - INP_WUNLOCK(in6p); \ + inp->inp_flags &= ~(bit); \ + INP_WUNLOCK(inp); \ } while (/*CONSTCOND*/ 0) #define OPTSET2292(bit) \ do { \ - INP_WLOCK(in6p); \ - in6p->inp_flags |= IN6P_RFC2292; \ + INP_WLOCK(inp); \ + inp->inp_flags |= IN6P_RFC2292; \ if (optval) \ - in6p->inp_flags |= (bit); \ + inp->inp_flags |= (bit); \ else \ - in6p->inp_flags &= ~(bit); \ - INP_WUNLOCK(in6p); \ + inp->inp_flags &= ~(bit); \ + INP_WUNLOCK(inp); \ } while (/*CONSTCOND*/ 0) -#define OPTBIT(bit) (in6p->inp_flags & (bit) ? 1 : 0) +#define OPTBIT(bit) (inp->inp_flags & (bit) ? 1 : 0) #define OPTSET2_N(bit, val) do { \ if (val) \ - in6p->inp_flags2 |= bit; \ + inp->inp_flags2 |= bit; \ else \ - in6p->inp_flags2 &= ~bit; \ + inp->inp_flags2 &= ~bit; \ } while (0) #define OPTSET2(bit, val) do { \ - INP_WLOCK(in6p); \ + INP_WLOCK(inp); \ OPTSET2_N(bit, val); \ - INP_WUNLOCK(in6p); \ + INP_WUNLOCK(inp); \ } while (0) -#define OPTBIT2(bit) (in6p->inp_flags2 & (bit) ? 1 : 0) +#define OPTBIT2(bit) (inp->inp_flags2 & (bit) ? 
1 : 0) #define OPTSET2292_EXCLUSIVE(bit) \ do { \ - INP_WLOCK(in6p); \ + INP_WLOCK(inp); \ if (OPTBIT(IN6P_RFC2292)) { \ error = EINVAL; \ } else { \ if (optval) \ - in6p->inp_flags |= (bit); \ + inp->inp_flags |= (bit); \ else \ - in6p->inp_flags &= ~(bit); \ + inp->inp_flags &= ~(bit); \ } \ - INP_WUNLOCK(in6p); \ + INP_WUNLOCK(inp); \ } while (/*CONSTCOND*/ 0) case IPV6_RECVPKTINFO: @@ -1663,17 +1665,17 @@ do { \ error = EINVAL; break; } - INP_WLOCK(in6p); - if (in6p->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { - INP_WUNLOCK(in6p); + INP_WLOCK(inp); + if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { + INP_WUNLOCK(inp); return (ECONNRESET); } - optp = &in6p->in6p_outputopts; + optp = &inp->in6p_outputopts; error = ip6_pcbopt(IPV6_HOPLIMIT, (u_char *)&optval, sizeof(optval), optp, (td != NULL) ? td->td_ucred : NULL, uproto); - INP_WUNLOCK(in6p); + INP_WUNLOCK(inp); break; } @@ -1724,16 +1726,16 @@ do { \ * available only prior to bind(2). * see ipng mailing list, Jun 22 2001. */ - if (in6p->inp_lport || - !IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr)) { + if (inp->inp_lport || + !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { error = EINVAL; break; } OPTSET(IN6P_IPV6_V6ONLY); if (optval) - in6p->inp_vflag &= ~INP_IPV4; + inp->inp_vflag &= ~INP_IPV4; else - in6p->inp_vflag |= INP_IPV4; + inp->inp_vflag |= INP_IPV4; break; case IPV6_RECVTCLASS: /* cannot mix with RFC2292 XXX */ @@ -1757,10 +1759,10 @@ do { \ case IPV6_RSS_LISTEN_BUCKET: if ((optval >= 0) && (optval < rss_getnumbuckets())) { - INP_WLOCK(in6p); - in6p->inp_rss_listen_bucket = optval; + INP_WLOCK(inp); + inp->inp_rss_listen_bucket = optval; OPTSET2_N(INP_RSS_BUCKET_SET, 1); - INP_WUNLOCK(in6p); + INP_WUNLOCK(inp); } else { error = EINVAL; } @@ -1783,17 +1785,17 @@ do { \ break; { struct ip6_pktopts **optp; - INP_WLOCK(in6p); - if (in6p->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { - INP_WUNLOCK(in6p); + INP_WLOCK(inp); + if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { + INP_WUNLOCK(inp); return 
(ECONNRESET); } - optp = &in6p->in6p_outputopts; + optp = &inp->in6p_outputopts; error = ip6_pcbopt(optname, (u_char *)&optval, sizeof(optval), optp, (td != NULL) ? td->td_ucred : NULL, uproto); - INP_WUNLOCK(in6p); + INP_WUNLOCK(inp); break; } @@ -1875,16 +1877,16 @@ do { \ break; optlen = sopt->sopt_valsize; optbuf = optbuf_storage; - INP_WLOCK(in6p); - if (in6p->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { - INP_WUNLOCK(in6p); + INP_WLOCK(inp); + if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { + INP_WUNLOCK(inp); return (ECONNRESET); } - optp = &in6p->in6p_outputopts; + optp = &inp->in6p_outputopts; error = ip6_pcbopt(optname, optbuf, optlen, optp, (td != NULL) ? td->td_ucred : NULL, uproto); - INP_WUNLOCK(in6p); + INP_WUNLOCK(inp); break; } #undef OPTSET @@ -1901,7 +1903,7 @@ do { \ case MCAST_LEAVE_GROUP: case MCAST_JOIN_SOURCE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: - error = ip6_setmoptions(in6p, sopt); + error = ip6_setmoptions(inp, sopt); break; case IPV6_PORTRANGE: @@ -1910,34 +1912,34 @@ do { \ if (error) break; - INP_WLOCK(in6p); + INP_WLOCK(inp); switch (optval) { case IPV6_PORTRANGE_DEFAULT: - in6p->inp_flags &= ~(INP_LOWPORT); - in6p->inp_flags &= ~(INP_HIGHPORT); + inp->inp_flags &= ~(INP_LOWPORT); + inp->inp_flags &= ~(INP_HIGHPORT); break; case IPV6_PORTRANGE_HIGH: - in6p->inp_flags &= ~(INP_LOWPORT); - in6p->inp_flags |= INP_HIGHPORT; + inp->inp_flags &= ~(INP_LOWPORT); + inp->inp_flags |= INP_HIGHPORT; break; case IPV6_PORTRANGE_LOW: - in6p->inp_flags &= ~(INP_HIGHPORT); - in6p->inp_flags |= INP_LOWPORT; + inp->inp_flags &= ~(INP_HIGHPORT); + inp->inp_flags |= INP_LOWPORT; break; default: error = EINVAL; break; } - INP_WUNLOCK(in6p); + INP_WUNLOCK(inp); break; #if defined(IPSEC) || defined(IPSEC_SUPPORT) case IPV6_IPSEC_POLICY: if (IPSEC_ENABLED(ipv6)) { - error = IPSEC_PCBCTL(ipv6, in6p, sopt); + error = IPSEC_PCBCTL(ipv6, inp, sopt); break; } /* FALLTHROUGH */ @@ -2005,7 +2007,7 @@ do { \ break; case IPV6_UNICAST_HOPS: - optval = 
in6p->in6p_hops; + optval = inp->in6p_hops; break; case IPV6_RECVPKTINFO: @@ -2031,7 +2033,7 @@ do { \ case IPV6_PORTRANGE: { int flags; - flags = in6p->inp_flags; + flags = inp->inp_flags; if (flags & INP_HIGHPORT) optval = IPV6_PORTRANGE_HIGH; else if (flags & INP_LOWPORT) @@ -2057,11 +2059,11 @@ do { \ break; case IPV6_FLOWID: - optval = in6p->inp_flowid; + optval = inp->inp_flowid; break; case IPV6_FLOWTYPE: - optval = in6p->inp_flowtype; + optval = inp->inp_flowtype; break; case IPV6_RECVFLOWID: @@ -2070,8 +2072,8 @@ do { \ #ifdef RSS case IPV6_RSSBUCKETID: retval = - rss_hash2bucket(in6p->inp_flowid, - in6p->inp_flowtype, + rss_hash2bucket(inp->inp_flowid, + inp->inp_flowtype, &rss_bucket); if (retval == 0) optval = rss_bucket; @@ -2107,12 +2109,12 @@ do { \ * XXX: we dot not consider the case of source * routing, or optional information to specify * the outgoing interface. - * Copy faddr out of in6p to avoid holding lock + * Copy faddr out of inp to avoid holding lock * on inp during route lookup. 
*/ - INP_RLOCK(in6p); - bcopy(&in6p->in6p_faddr, &addr, sizeof(addr)); - INP_RUNLOCK(in6p); + INP_RLOCK(inp); + bcopy(&inp->in6p_faddr, &addr, sizeof(addr)); + INP_RUNLOCK(inp); error = ip6_getpmtu_ctl(so->so_fibnum, &addr, &pmtu); if (error) @@ -2164,20 +2166,20 @@ do { \ case IPV6_DONTFRAG: case IPV6_USE_MIN_MTU: case IPV6_PREFER_TEMPADDR: - error = ip6_getpcbopt(in6p, optname, sopt); + error = ip6_getpcbopt(inp, optname, sopt); break; case IPV6_MULTICAST_IF: case IPV6_MULTICAST_HOPS: case IPV6_MULTICAST_LOOP: case IPV6_MSFILTER: - error = ip6_getmoptions(in6p, sopt); + error = ip6_getmoptions(inp, sopt); break; #if defined(IPSEC) || defined(IPSEC_SUPPORT) case IPV6_IPSEC_POLICY: if (IPSEC_ENABLED(ipv6)) { - error = IPSEC_PCBCTL(ipv6, in6p, sopt); + error = IPSEC_PCBCTL(ipv6, inp, sopt); break; } /* FALLTHROUGH */ @@ -2197,7 +2199,7 @@ ip6_raw_ctloutput(struct socket *so, struct sockopt *sopt) { int error = 0, optval, optlen; const int icmp6off = offsetof(struct icmp6_hdr, icmp6_cksum); - struct inpcb *in6p = sotoinpcb(so); + struct inpcb *inp = sotoinpcb(so); int level, op, optname; level = sopt->sopt_level; @@ -2240,14 +2242,14 @@ ip6_raw_ctloutput(struct socket *so, struct sockopt *sopt) if (optval != icmp6off) error = EINVAL; } else - in6p->in6p_cksum = optval; + inp->in6p_cksum = optval; break; case SOPT_GET: if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) optval = icmp6off; else - optval = in6p->in6p_cksum; + optval = inp->in6p_cksum; error = sooptcopyout(sopt, &optval, sizeof(optval)); break; @@ -2346,16 +2348,16 @@ ip6_pcbopt(int optname, u_char *buf, int len, struct ip6_pktopts **pktopt, #define GET_PKTOPT_VAR(field, lenexpr) do { \ if (pktopt && pktopt->field) { \ - INP_RUNLOCK(in6p); \ + INP_RUNLOCK(inp); \ optdata = malloc(sopt->sopt_valsize, M_TEMP, M_WAITOK); \ malloc_optdata = true; \ - INP_RLOCK(in6p); \ - if (in6p->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { \ - INP_RUNLOCK(in6p); \ + INP_RLOCK(inp); \ + if (inp->inp_flags & (INP_TIMEWAIT | 
INP_DROPPED)) { \ + INP_RUNLOCK(inp); \ free(optdata, M_TEMP); \ return (ECONNRESET); \ } \ - pktopt = in6p->in6p_outputopts; \ + pktopt = inp->in6p_outputopts; \ if (pktopt && pktopt->field) { \ optdatalen = min(lenexpr, sopt->sopt_valsize); \ bcopy(&pktopt->field, optdata, optdatalen); \ @@ -2374,7 +2376,7 @@ ip6_pcbopt(int optname, u_char *buf, int len, struct ip6_pktopts **pktopt, pktopt->field->sa_len) static int -ip6_getpcbopt(struct inpcb *in6p, int optname, struct sockopt *sopt) +ip6_getpcbopt(struct inpcb *inp, int optname, struct sockopt *sopt) { void *optdata = NULL; bool malloc_optdata = false; @@ -2386,8 +2388,8 @@ ip6_getpcbopt(struct inpcb *in6p, int optname, struct sockopt *sopt) int defpreftemp = IP6PO_TEMPADDR_SYSTEM; struct ip6_pktopts *pktopt; - INP_RLOCK(in6p); - pktopt = in6p->in6p_outputopts; + INP_RLOCK(inp); + pktopt = inp->in6p_outputopts; switch (optname) { case IPV6_PKTINFO: @@ -2447,10 +2449,10 @@ ip6_getpcbopt(struct inpcb *in6p, int optname, struct sockopt *sopt) #ifdef DIAGNOSTIC panic("ip6_getpcbopt: unexpected option\n"); #endif - INP_RUNLOCK(in6p); + INP_RUNLOCK(inp); return (ENOPROTOOPT); } - INP_RUNLOCK(in6p); + INP_RUNLOCK(inp); error = sooptcopyout(sopt, optdata, optdatalen); if (malloc_optdata) @@ -3135,23 +3137,23 @@ ip6_splithdr(struct mbuf *m, struct ip6_exthdrs *exthdrs) * Compute IPv6 extension header length. */ int -ip6_optlen(struct inpcb *in6p) +ip6_optlen(struct inpcb *inp) { int len; - if (!in6p->in6p_outputopts) + if (!inp->in6p_outputopts) return 0; len = 0; #define elen(x) \ (((struct ip6_ext *)(x)) ? 
(((struct ip6_ext *)(x))->ip6e_len + 1) << 3 : 0) - len += elen(in6p->in6p_outputopts->ip6po_hbh); - if (in6p->in6p_outputopts->ip6po_rthdr) + len += elen(inp->in6p_outputopts->ip6po_hbh); + if (inp->in6p_outputopts->ip6po_rthdr) /* dest1 is valid with rthdr only */ - len += elen(in6p->in6p_outputopts->ip6po_dest1); - len += elen(in6p->in6p_outputopts->ip6po_rthdr); - len += elen(in6p->in6p_outputopts->ip6po_dest2); + len += elen(inp->in6p_outputopts->ip6po_dest1); + len += elen(inp->in6p_outputopts->ip6po_rthdr); + len += elen(inp->in6p_outputopts->ip6po_dest2); return len; #undef elen } diff --git a/freebsd/sys/netinet6/ip6_var.h b/freebsd/sys/netinet6/ip6_var.h index b66f5cfb..05881f08 100644 --- a/freebsd/sys/netinet6/ip6_var.h +++ b/freebsd/sys/netinet6/ip6_var.h @@ -68,39 +68,27 @@ #include <sys/epoch.h> +#ifdef _KERNEL +struct ip6asfrag; /* frag6.c */ +TAILQ_HEAD(ip6fraghead, ip6asfrag); + /* * IP6 reassembly queue structure. Each fragment * being reassembled is attached to one of these structures. */ struct ip6q { - struct ip6asfrag *ip6q_down; - struct ip6asfrag *ip6q_up; + struct ip6fraghead ip6q_frags; u_int32_t ip6q_ident; u_int8_t ip6q_nxt; u_int8_t ip6q_ecn; u_int8_t ip6q_ttl; struct in6_addr ip6q_src, ip6q_dst; - struct ip6q *ip6q_next; - struct ip6q *ip6q_prev; + TAILQ_ENTRY(ip6q) ip6q_tq; int ip6q_unfrglen; /* len of unfragmentable part */ -#ifdef notyet - u_char *ip6q_nxtp; -#endif int ip6q_nfrag; /* # of fragments */ struct label *ip6q_label; }; - -struct ip6asfrag { - struct ip6asfrag *ip6af_down; - struct ip6asfrag *ip6af_up; - struct mbuf *ip6af_m; - int ip6af_offset; /* offset in ip6af_m to next header */ - int ip6af_frglen; /* fragmentable part length */ - int ip6af_off; /* fragment offset */ - u_int16_t ip6af_mff; /* more fragment bit in frag off */ -}; - -#define IP6_REASS_MBUF(ip6af) (*(struct mbuf **)&((ip6af)->ip6af_m)) +#endif /* _KERNEL */ /* * IP6 reinjecting structure. 
@@ -207,6 +195,7 @@ struct ip6stat { uint64_t ip6s_localout; /* total ip packets generated here */ uint64_t ip6s_odropped; /* lost packets due to nobufs, etc. */ uint64_t ip6s_reassembled; /* total packets reassembled ok */ + uint64_t ip6s_atomicfrags; /* atomic fragments */ uint64_t ip6s_fragmented; /* datagrams successfully fragmented */ uint64_t ip6s_ofragments; /* output fragments created */ uint64_t ip6s_cantfrag; /* don't fragment flag was set, etc. */ @@ -298,12 +287,6 @@ VNET_DECLARE(int, ip6_v6only); VNET_DECLARE(struct socket *, ip6_mrouter); /* multicast routing daemon */ VNET_DECLARE(int, ip6_sendredirects); /* send IP redirects when forwarding? */ -VNET_DECLARE(int, ip6_maxfragpackets); /* Maximum packets in reassembly - * queue */ -extern int ip6_maxfrags; /* Maximum fragments in reassembly - * queue */ -VNET_DECLARE(int, ip6_maxfragbucketsize); /* Maximum reassembly queues per bucket */ -VNET_DECLARE(int, ip6_maxfragsperpacket); /* Maximum fragments per packet */ VNET_DECLARE(int, ip6_accept_rtadv); /* Acts as a host not a router */ VNET_DECLARE(int, ip6_no_radr); /* No defroute from RA */ VNET_DECLARE(int, ip6_norbit_raif); /* Disable R-bit in NA on RA @@ -317,9 +300,6 @@ VNET_DECLARE(int, ip6_hdrnestlimit); /* upper limit of # of extension VNET_DECLARE(int, ip6_dad_count); /* DupAddrDetectionTransmits */ #define V_ip6_mrouter VNET(ip6_mrouter) #define V_ip6_sendredirects VNET(ip6_sendredirects) -#define V_ip6_maxfragpackets VNET(ip6_maxfragpackets) -#define V_ip6_maxfragbucketsize VNET(ip6_maxfragbucketsize) -#define V_ip6_maxfragsperpacket VNET(ip6_maxfragsperpacket) #define V_ip6_accept_rtadv VNET(ip6_accept_rtadv) #define V_ip6_no_radr VNET(ip6_no_radr) #define V_ip6_norbit_raif VNET(ip6_norbit_raif) @@ -406,8 +386,8 @@ int ip6_fragment(struct ifnet *, struct mbuf *, int, u_char, int, int route6_input(struct mbuf **, int *, int); -void frag6_set_bucketsize(void); void frag6_init(void); +void frag6_destroy(void); int frag6_input(struct mbuf **, 
int *, int); void frag6_slowtimo(void); void frag6_drain(void); diff --git a/freebsd/sys/netinet6/mld6.c b/freebsd/sys/netinet6/mld6.c index a0d045d5..e7b400ae 100644 --- a/freebsd/sys/netinet6/mld6.c +++ b/freebsd/sys/netinet6/mld6.c @@ -1254,20 +1254,27 @@ out_locked: * Return IPPROTO_DONE if we freed m. Otherwise, return 0. */ int -mld_input(struct mbuf *m, int off, int icmp6len) +mld_input(struct mbuf **mp, int off, int icmp6len) { struct ifnet *ifp; struct ip6_hdr *ip6; + struct mbuf *m; struct mld_hdr *mld; int mldlen; + m = *mp; CTR3(KTR_MLD, "%s: called w/mbuf (%p,%d)", __func__, m, off); ifp = m->m_pkthdr.rcvif; - ip6 = mtod(m, struct ip6_hdr *); - /* Pullup to appropriate size. */ + if (m->m_len < off + sizeof(*mld)) { + m = m_pullup(m, off + sizeof(*mld)); + if (m == NULL) { + ICMP6STAT_INC(icp6s_badlen); + return (IPPROTO_DONE); + } + } mld = (struct mld_hdr *)(mtod(m, uint8_t *) + off); if (mld->mld_type == MLD_LISTENER_QUERY && icmp6len >= sizeof(struct mldv2_query)) { @@ -1275,11 +1282,16 @@ mld_input(struct mbuf *m, int off, int icmp6len) } else { mldlen = sizeof(struct mld_hdr); } - IP6_EXTHDR_GET(mld, struct mld_hdr *, m, off, mldlen); - if (mld == NULL) { - ICMP6STAT_INC(icp6s_badlen); - return (IPPROTO_DONE); + if (m->m_len < off + mldlen) { + m = m_pullup(m, off + mldlen); + if (m == NULL) { + ICMP6STAT_INC(icp6s_badlen); + return (IPPROTO_DONE); + } } + *mp = m; + ip6 = mtod(m, struct ip6_hdr *); + mld = (struct mld_hdr *)(mtod(m, uint8_t *) + off); /* * Userland needs to see all of this traffic for implementing diff --git a/freebsd/sys/netinet6/mld6_var.h b/freebsd/sys/netinet6/mld6_var.h index 8dc2ffa4..0aedde27 100644 --- a/freebsd/sys/netinet6/mld6_var.h +++ b/freebsd/sys/netinet6/mld6_var.h @@ -167,7 +167,7 @@ struct mld_ifsoftc * void mld_domifdetach(struct ifnet *); void mld_fasttimo(void); void mld_ifdetach(struct ifnet *, struct in6_multi_head *); -int mld_input(struct mbuf *, int, int); +int mld_input(struct mbuf **, int, int); void 
mld_slowtimo(void); #ifdef SYSCTL_DECL diff --git a/freebsd/sys/netinet6/nd6.c b/freebsd/sys/netinet6/nd6.c index 140dde59..aea8168e 100644 --- a/freebsd/sys/netinet6/nd6.c +++ b/freebsd/sys/netinet6/nd6.c @@ -117,7 +117,6 @@ VNET_DEFINE(int, nd6_debug) = 0; static eventhandler_tag lle_event_eh, iflladdr_event_eh; -VNET_DEFINE(struct nd_drhead, nd_defrouter); VNET_DEFINE(struct nd_prhead, nd_prefix); VNET_DEFINE(struct rwlock, nd6_lock); VNET_DEFINE(uint64_t, nd6_list_genid); @@ -147,9 +146,11 @@ static int nd6_need_cache(struct ifnet *); VNET_DEFINE_STATIC(struct callout, nd6_slowtimo_ch); #define V_nd6_slowtimo_ch VNET(nd6_slowtimo_ch) -VNET_DEFINE(struct callout, nd6_timer_ch); +VNET_DEFINE_STATIC(struct callout, nd6_timer_ch); #define V_nd6_timer_ch VNET(nd6_timer_ch) +SYSCTL_DECL(_net_inet6_icmp6); + static void nd6_lle_event(void *arg __unused, struct llentry *lle, int evt) { @@ -219,7 +220,7 @@ nd6_init(void) rw_init(&V_nd6_lock, "nd6 list"); LIST_INIT(&V_nd_prefix); - TAILQ_INIT(&V_nd_defrouter); + nd6_defrouter_init(); /* Start timers. */ callout_init(&V_nd6_slowtimo_ch, 0); @@ -894,27 +895,15 @@ void nd6_timer(void *arg) { CURVNET_SET((struct vnet *) arg); - struct nd_drhead drq; struct nd_prhead prl; - struct nd_defrouter *dr, *ndr; struct nd_prefix *pr, *npr; struct ifnet *ifp; struct in6_ifaddr *ia6, *nia6; uint64_t genid; - TAILQ_INIT(&drq); LIST_INIT(&prl); - ND6_WLOCK(); - TAILQ_FOREACH_SAFE(dr, &V_nd_defrouter, dr_entry, ndr) - if (dr->expire && dr->expire < time_uptime) - defrouter_unlink(dr, &drq); - ND6_WUNLOCK(); - - while ((dr = TAILQ_FIRST(&drq)) != NULL) { - TAILQ_REMOVE(&drq, dr, dr_entry); - defrouter_del(dr); - } + nd6_defrouter_timer(); /* * expire interface addresses. 
@@ -1137,34 +1126,15 @@ regen_tmpaddr(struct in6_ifaddr *ia6) void nd6_purge(struct ifnet *ifp) { - struct nd_drhead drq; struct nd_prhead prl; - struct nd_defrouter *dr, *ndr; struct nd_prefix *pr, *npr; - TAILQ_INIT(&drq); LIST_INIT(&prl); - /* - * Nuke default router list entries toward ifp. - * We defer removal of default router list entries that is installed - * in the routing table, in order to keep additional side effects as - * small as possible. - */ - ND6_WLOCK(); - TAILQ_FOREACH_SAFE(dr, &V_nd_defrouter, dr_entry, ndr) { - if (dr->installed) - continue; - if (dr->ifp == ifp) - defrouter_unlink(dr, &drq); - } - TAILQ_FOREACH_SAFE(dr, &V_nd_defrouter, dr_entry, ndr) { - if (!dr->installed) - continue; - if (dr->ifp == ifp) - defrouter_unlink(dr, &drq); - } + /* Purge default router list entries toward ifp. */ + nd6_defrouter_purge(ifp); + ND6_WLOCK(); /* * Remove prefixes on ifp. We should have already removed addresses on * this interface, so no addresses should be referencing these prefixes. @@ -1175,11 +1145,7 @@ nd6_purge(struct ifnet *ifp) } ND6_WUNLOCK(); - /* Delete the unlinked router and prefix objects. */ - while ((dr = TAILQ_FIRST(&drq)) != NULL) { - TAILQ_REMOVE(&drq, dr, dr_entry); - defrouter_del(dr); - } + /* Delete the unlinked prefix objects. */ while ((pr = LIST_FIRST(&prl)) != NULL) { LIST_REMOVE(pr, ndpr_entry); nd6_prefix_del(pr); @@ -1365,7 +1331,7 @@ restart: * as on-link, and thus, as a neighbor. 
*/ if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV && - TAILQ_EMPTY(&V_nd_defrouter) && + nd6_defrouter_list_empty() && V_nd6_defifindex == ifp->if_index) { return (1); } @@ -1808,22 +1774,9 @@ nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) case SIOCSRTRFLUSH_IN6: { /* flush all the default routers */ - struct nd_drhead drq; - struct nd_defrouter *dr; - - TAILQ_INIT(&drq); defrouter_reset(); - - ND6_WLOCK(); - while ((dr = TAILQ_FIRST(&V_nd_defrouter)) != NULL) - defrouter_unlink(dr, &drq); - ND6_WUNLOCK(); - while ((dr = TAILQ_FIRST(&drq)) != NULL) { - TAILQ_REMOVE(&drq, dr, dr_entry); - defrouter_del(dr); - } - + nd6_defrouter_flush_all(); defrouter_select(); break; } @@ -2367,13 +2320,7 @@ nd6_resolve_slow(struct ifnet *ifp, int flags, struct mbuf *m, } } if (lle == NULL) { - if (!(ND_IFINFO(ifp)->flags & ND6_IFF_PERFORMNUD)) { - m_freem(m); - return (ENOBUFS); - } - - if (m != NULL) - m_freem(m); + m_freem(m); return (ENOBUFS); } @@ -2616,59 +2563,6 @@ clear_llinfo_pqueue(struct llentry *ln) ln->la_hold = NULL; } -static int nd6_sysctl_drlist(SYSCTL_HANDLER_ARGS); -static int nd6_sysctl_prlist(SYSCTL_HANDLER_ARGS); - -SYSCTL_DECL(_net_inet6_icmp6); -SYSCTL_PROC(_net_inet6_icmp6, ICMPV6CTL_ND6_DRLIST, nd6_drlist, - CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, - NULL, 0, nd6_sysctl_drlist, "S,in6_defrouter", - "NDP default router list"); -SYSCTL_PROC(_net_inet6_icmp6, ICMPV6CTL_ND6_PRLIST, nd6_prlist, - CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, - NULL, 0, nd6_sysctl_prlist, "S,in6_prefix", - "NDP prefix list"); -SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_MAXQLEN, nd6_maxqueuelen, - CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_maxqueuelen), 1, ""); -SYSCTL_INT(_net_inet6_icmp6, OID_AUTO, nd6_gctimer, - CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_gctimer), (60 * 60 * 24), ""); - -static int -nd6_sysctl_drlist(SYSCTL_HANDLER_ARGS) -{ - struct in6_defrouter d; - struct nd_defrouter *dr; - int error; - - if (req->newptr != NULL) - return (EPERM); - - error = 
sysctl_wire_old_buffer(req, 0); - if (error != 0) - return (error); - - bzero(&d, sizeof(d)); - d.rtaddr.sin6_family = AF_INET6; - d.rtaddr.sin6_len = sizeof(d.rtaddr); - - ND6_RLOCK(); - TAILQ_FOREACH(dr, &V_nd_defrouter, dr_entry) { - d.rtaddr.sin6_addr = dr->rtaddr; - error = sa6_recoverscope(&d.rtaddr); - if (error != 0) - break; - d.flags = dr->raflags; - d.rtlifetime = dr->rtlifetime; - d.expire = dr->expire + (time_second - time_uptime); - d.if_index = dr->ifp->if_index; - error = SYSCTL_OUT(req, &d, sizeof(d)); - if (error != 0) - break; - } - ND6_RUNLOCK(); - return (error); -} - static int nd6_sysctl_prlist(SYSCTL_HANDLER_ARGS) { @@ -2742,3 +2636,11 @@ out: ND6_RUNLOCK(); return (error); } +SYSCTL_PROC(_net_inet6_icmp6, ICMPV6CTL_ND6_PRLIST, nd6_prlist, + CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, + NULL, 0, nd6_sysctl_prlist, "S,in6_prefix", + "NDP prefix list"); +SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_MAXQLEN, nd6_maxqueuelen, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_maxqueuelen), 1, ""); +SYSCTL_INT(_net_inet6_icmp6, OID_AUTO, nd6_gctimer, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_gctimer), (60 * 60 * 24), ""); diff --git a/freebsd/sys/netinet6/nd6.h b/freebsd/sys/netinet6/nd6.h index cabfeec0..71c99b1b 100644 --- a/freebsd/sys/netinet6/nd6.h +++ b/freebsd/sys/netinet6/nd6.h @@ -329,7 +329,6 @@ VNET_DECLARE(int, nd6_mmaxtries); VNET_DECLARE(int, nd6_useloopback); VNET_DECLARE(int, nd6_maxnudhint); VNET_DECLARE(int, nd6_gctimer); -VNET_DECLARE(struct nd_drhead, nd_defrouter); VNET_DECLARE(struct nd_prhead, nd_prefix); VNET_DECLARE(int, nd6_debug); VNET_DECLARE(int, nd6_onlink_ns_rfc4861); @@ -340,7 +339,6 @@ VNET_DECLARE(int, nd6_onlink_ns_rfc4861); #define V_nd6_useloopback VNET(nd6_useloopback) #define V_nd6_maxnudhint VNET(nd6_maxnudhint) #define V_nd6_gctimer VNET(nd6_gctimer) -#define V_nd_defrouter VNET(nd_defrouter) #define V_nd_prefix VNET(nd_prefix) #define V_nd6_debug VNET(nd6_debug) #define V_nd6_onlink_ns_rfc4861 
VNET(nd6_onlink_ns_rfc4861) @@ -470,6 +468,8 @@ void nd6_dad_stop(struct ifaddr *); /* nd6_rtr.c */ void nd6_rs_input(struct mbuf *, int, int); void nd6_ra_input(struct mbuf *, int, int); +struct nd_defrouter *defrouter_lookup(struct in6_addr *, struct ifnet *); +struct nd_defrouter *defrouter_lookup_locked(struct in6_addr *, struct ifnet *); void defrouter_reset(void); void defrouter_select_fib(int fibnum); void defrouter_select(void); @@ -478,6 +478,11 @@ void defrouter_rele(struct nd_defrouter *); bool defrouter_remove(struct in6_addr *, struct ifnet *); void defrouter_unlink(struct nd_defrouter *, struct nd_drhead *); void defrouter_del(struct nd_defrouter *); +bool nd6_defrouter_list_empty(void); +void nd6_defrouter_flush_all(void); +void nd6_defrouter_purge(struct ifnet *); +void nd6_defrouter_timer(void); +void nd6_defrouter_init(void); int nd6_prelist_add(struct nd_prefixctl *, struct nd_defrouter *, struct nd_prefix **); void nd6_prefix_unlink(struct nd_prefix *, struct nd_prhead *); @@ -487,8 +492,6 @@ void nd6_prefix_rele(struct nd_prefix *); int nd6_prefix_onlink(struct nd_prefix *); int nd6_prefix_offlink(struct nd_prefix *); void pfxlist_onlink_check(void); -struct nd_defrouter *defrouter_lookup(struct in6_addr *, struct ifnet *); -struct nd_defrouter *defrouter_lookup_locked(struct in6_addr *, struct ifnet *); struct nd_prefix *nd6_prefix_lookup(struct nd_prefixctl *); void rt6_flush(struct in6_addr *, struct ifnet *); int nd6_setdefaultiface(int); diff --git a/freebsd/sys/netinet6/nd6_nbr.c b/freebsd/sys/netinet6/nd6_nbr.c index 49810020..634eea06 100644 --- a/freebsd/sys/netinet6/nd6_nbr.c +++ b/freebsd/sys/netinet6/nd6_nbr.c @@ -122,53 +122,53 @@ VNET_DEFINE_STATIC(int, dad_maxtry) = 15; /* max # of *tries* to void nd6_ns_input(struct mbuf *m, int off, int icmp6len) { - struct ifnet *ifp = m->m_pkthdr.rcvif; - struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); + struct ifnet *ifp; + struct ip6_hdr *ip6; struct nd_neighbor_solicit *nd_ns; - struct 
in6_addr saddr6 = ip6->ip6_src; - struct in6_addr daddr6 = ip6->ip6_dst; - struct in6_addr taddr6; - struct in6_addr myaddr6; - char *lladdr = NULL; - struct ifaddr *ifa = NULL; - int lladdrlen = 0; - int anycast = 0, proxy = 0, tentative = 0; - int tlladdr; - int rflag; - union nd_opts ndopts; + struct in6_addr daddr6, myaddr6, saddr6, taddr6; + struct ifaddr *ifa; struct sockaddr_dl proxydl; + union nd_opts ndopts; char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; + char *lladdr; + int anycast, lladdrlen, proxy, rflag, tentative, tlladdr; + + ifa = NULL; /* RFC 6980: Nodes MUST silently ignore fragments */ if(m->m_flags & M_FRAGMENTED) goto freeit; - rflag = (V_ip6_forwarding) ? ND_NA_FLAG_ROUTER : 0; - if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV && V_ip6_norbit_raif) - rflag = 0; -#ifndef PULLDOWN_TEST - IP6_EXTHDR_CHECK(m, off, icmp6len,); - nd_ns = (struct nd_neighbor_solicit *)((caddr_t)ip6 + off); -#else - IP6_EXTHDR_GET(nd_ns, struct nd_neighbor_solicit *, m, off, icmp6len); - if (nd_ns == NULL) { - ICMP6STAT_INC(icp6s_tooshort); - return; - } -#endif - ip6 = mtod(m, struct ip6_hdr *); /* adjust pointer for safety */ - taddr6 = nd_ns->nd_ns_target; - if (in6_setscope(&taddr6, ifp, NULL) != 0) - goto bad; - + ifp = m->m_pkthdr.rcvif; + ip6 = mtod(m, struct ip6_hdr *); if (ip6->ip6_hlim != 255) { nd6log((LOG_ERR, "nd6_ns_input: invalid hlim (%d) from %s to %s on %s\n", ip6->ip6_hlim, ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst), if_name(ifp))); - goto bad; + goto bads; } + if (m->m_len < off + icmp6len) { + m = m_pullup(m, off + icmp6len); + if (m == NULL) { + IP6STAT_INC(ip6s_exthdrtoolong); + return; + } + } + ip6 = mtod(m, struct ip6_hdr *); + nd_ns = (struct nd_neighbor_solicit *)((caddr_t)ip6 + off); + + saddr6 = ip6->ip6_src; + daddr6 = ip6->ip6_dst; + taddr6 = nd_ns->nd_ns_target; + if (in6_setscope(&taddr6, ifp, NULL) != 0) + goto bad; + + rflag = (V_ip6_forwarding) ? 
ND_NA_FLAG_ROUTER : 0; + if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV && V_ip6_norbit_raif) + rflag = 0; + if (IN6_IS_ADDR_UNSPECIFIED(&saddr6)) { /* dst has to be a solicited node multicast address. */ if (daddr6.s6_addr16[0] == IPV6_ADDR_INT16_MLL && @@ -216,6 +216,8 @@ nd6_ns_input(struct mbuf *m, int off, int icmp6len) goto freeit; } + lladdr = NULL; + lladdrlen = 0; if (ndopts.nd_opts_src_lladdr) { lladdr = (char *)(ndopts.nd_opts_src_lladdr + 1); lladdrlen = ndopts.nd_opts_src_lladdr->nd_opt_len << 3; @@ -255,6 +257,7 @@ nd6_ns_input(struct mbuf *m, int off, int icmp6len) ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, &taddr6); /* (2) check. */ + proxy = 0; if (ifa == NULL) { struct sockaddr_dl rt_gateway; struct rt_addrinfo info; @@ -381,6 +384,7 @@ nd6_ns_input(struct mbuf *m, int off, int icmp6len) ip6_sprintf(ip6bufs, &daddr6))); nd6log((LOG_ERR, "nd6_ns_input: tgt=%s\n", ip6_sprintf(ip6bufs, &taddr6))); + bads: ICMP6STAT_INC(icp6s_badns); if (ifa != NULL) ifa_free(ifa); @@ -615,32 +619,32 @@ nd6_ns_output(struct ifnet *ifp, const struct in6_addr *saddr6, void nd6_na_input(struct mbuf *m, int off, int icmp6len) { - struct ifnet *ifp = m->m_pkthdr.rcvif; - struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); - struct nd_neighbor_advert *nd_na; - struct in6_addr daddr6 = ip6->ip6_dst; - struct in6_addr taddr6; - int flags; - int is_router; - int is_solicited; - int is_override; - char *lladdr = NULL; - int lladdrlen = 0; - int checklink = 0; + struct ifnet *ifp; + struct ip6_hdr *ip6; struct ifaddr *ifa; - struct llentry *ln = NULL; - union nd_opts ndopts; - struct mbuf *chain = NULL; + struct llentry *ln; + struct mbuf *chain; + struct nd_neighbor_advert *nd_na; + struct in6_addr daddr6, taddr6; struct sockaddr_in6 sin6; + union nd_opts ndopts; u_char linkhdr[LLE_MAX_LINKHDR]; - size_t linkhdrsize; - int lladdr_off; char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; + char *lladdr; + size_t linkhdrsize; + int flags, is_override, is_router, 
is_solicited; + int lladdr_off, lladdrlen, checklink; + + chain = NULL; + ln = NULL; + checklink = 0; /* RFC 6980: Nodes MUST silently ignore fragments */ if(m->m_flags & M_FRAGMENTED) goto freeit; + ifp = m->m_pkthdr.rcvif; + ip6 = mtod(m, struct ip6_hdr *); if (ip6->ip6_hlim != 255) { nd6log((LOG_ERR, "nd6_na_input: invalid hlim (%d) from %s to %s on %s\n", @@ -649,22 +653,20 @@ nd6_na_input(struct mbuf *m, int off, int icmp6len) goto bad; } -#ifndef PULLDOWN_TEST - IP6_EXTHDR_CHECK(m, off, icmp6len,); - nd_na = (struct nd_neighbor_advert *)((caddr_t)ip6 + off); -#else - IP6_EXTHDR_GET(nd_na, struct nd_neighbor_advert *, m, off, icmp6len); - if (nd_na == NULL) { - ICMP6STAT_INC(icp6s_tooshort); - return; + if (m->m_len < off + icmp6len) { + m = m_pullup(m, off + icmp6len); + if (m == NULL) { + IP6STAT_INC(ip6s_exthdrtoolong); + return; + } } -#endif + ip6 = mtod(m, struct ip6_hdr *); + nd_na = (struct nd_neighbor_advert *)((caddr_t)ip6 + off); flags = nd_na->nd_na_flags_reserved; is_router = ((flags & ND_NA_FLAG_ROUTER) != 0); is_solicited = ((flags & ND_NA_FLAG_SOLICITED) != 0); is_override = ((flags & ND_NA_FLAG_OVERRIDE) != 0); - memset(&sin6, 0, sizeof(sin6)); taddr6 = nd_na->nd_na_target; if (in6_setscope(&taddr6, ifp, NULL)) @@ -676,6 +678,8 @@ nd6_na_input(struct mbuf *m, int off, int icmp6len) ip6_sprintf(ip6bufs, &taddr6))); goto bad; } + + daddr6 = ip6->ip6_dst; if (IN6_IS_ADDR_MULTICAST(&daddr6)) if (is_solicited) { nd6log((LOG_ERR, @@ -692,6 +696,8 @@ nd6_na_input(struct mbuf *m, int off, int icmp6len) goto freeit; } + lladdr = NULL; + lladdrlen = 0; if (ndopts.nd_opts_tgt_lladdr) { lladdr = (char *)(ndopts.nd_opts_tgt_lladdr + 1); lladdrlen = ndopts.nd_opts_tgt_lladdr->nd_opt_len << 3; @@ -889,8 +895,10 @@ nd6_na_input(struct mbuf *m, int off, int icmp6len) * rt->rt_flags &= ~RTF_REJECT; */ ln->la_asked = 0; - if (ln->la_hold != NULL) + if (ln->la_hold != NULL) { + memset(&sin6, 0, sizeof(sin6)); nd6_grab_holdchain(ln, &chain, &sin6); + } freeit: if 
(ln != NULL) LLE_WUNLOCK(ln); diff --git a/freebsd/sys/netinet6/nd6_rtr.c b/freebsd/sys/netinet6/nd6_rtr.c index a60e7c66..9dddedf4 100644 --- a/freebsd/sys/netinet6/nd6_rtr.c +++ b/freebsd/sys/netinet6/nd6_rtr.c @@ -52,6 +52,7 @@ __FBSDID("$FreeBSD$"); #include <sys/errno.h> #include <sys/rmlock.h> #include <sys/rwlock.h> +#include <sys/sysctl.h> #include <sys/syslog.h> #include <sys/queue.h> @@ -74,24 +75,12 @@ __FBSDID("$FreeBSD$"); #include <netinet/icmp6.h> #include <netinet6/scope6_var.h> -static int rtpref(struct nd_defrouter *); static struct nd_defrouter *defrtrlist_update(struct nd_defrouter *); static int prelist_update(struct nd_prefixctl *, struct nd_defrouter *, struct mbuf *, int); -static struct in6_ifaddr *in6_ifadd(struct nd_prefixctl *, int); -static struct nd_pfxrouter *pfxrtr_lookup(struct nd_prefix *, - struct nd_defrouter *); -static void pfxrtr_add(struct nd_prefix *, struct nd_defrouter *); -static void pfxrtr_del(struct nd_pfxrouter *); -static struct nd_pfxrouter *find_pfxlist_reachable_router(struct nd_prefix *); -static void defrouter_delreq(struct nd_defrouter *); -static void nd6_rtmsg(int, struct rtentry *); -static int in6_init_prefix_ltimes(struct nd_prefix *); -static void in6_init_address_ltimes(struct nd_prefix *, - struct in6_addrlifetime *); - -static int rt6_deleteroute(const struct rtentry *, void *); +VNET_DEFINE_STATIC(struct nd_drhead, nd6_defrouter); +#define V_nd6_defrouter VNET(nd6_defrouter) VNET_DECLARE(int, nd6_recalc_reachtm_interval); #define V_nd6_recalc_reachtm_interval VNET(nd6_recalc_reachtm_interval) @@ -108,6 +97,8 @@ VNET_DEFINE(u_int32_t, ip6_temp_valid_lifetime) = DEF_TEMP_VALID_LIFETIME; VNET_DEFINE(int, ip6_temp_regen_advance) = TEMPADDR_REGEN_ADVANCE; +SYSCTL_DECL(_net_inet6_icmp6); + /* RTPREF_MEDIUM has to be 0! 
*/ #define RTPREF_HIGH 1 #define RTPREF_MEDIUM 0 @@ -115,6 +106,37 @@ VNET_DEFINE(int, ip6_temp_regen_advance) = TEMPADDR_REGEN_ADVANCE; #define RTPREF_RESERVED (-2) #define RTPREF_INVALID (-3) /* internal */ +void +defrouter_ref(struct nd_defrouter *dr) +{ + + refcount_acquire(&dr->refcnt); +} + +void +defrouter_rele(struct nd_defrouter *dr) +{ + + if (refcount_release(&dr->refcnt)) + free(dr, M_IP6NDP); +} + +/* + * Remove a router from the global list and optionally stash it in a + * caller-supplied queue. + */ +void +defrouter_unlink(struct nd_defrouter *dr, struct nd_drhead *drq) +{ + + ND6_WLOCK_ASSERT(); + + TAILQ_REMOVE(&V_nd6_defrouter, dr, dr_entry); + V_nd6_list_genid++; + if (drq != NULL) + TAILQ_INSERT_TAIL(drq, dr, dr_entry); +} + /* * Receive Router Solicitation Message - just for routers. * Router solicitation/advertisement is mostly managed by userland program @@ -125,14 +147,16 @@ VNET_DEFINE(int, ip6_temp_regen_advance) = TEMPADDR_REGEN_ADVANCE; void nd6_rs_input(struct mbuf *m, int off, int icmp6len) { - struct ifnet *ifp = m->m_pkthdr.rcvif; - struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); + struct ifnet *ifp; + struct ip6_hdr *ip6; struct nd_router_solicit *nd_rs; - struct in6_addr saddr6 = ip6->ip6_src; - char *lladdr = NULL; - int lladdrlen = 0; + struct in6_addr saddr6; union nd_opts ndopts; char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; + char *lladdr; + int lladdrlen; + + ifp = m->m_pkthdr.rcvif; /* * Accept RS only when V_ip6_forwarding=1 and the interface has @@ -146,9 +170,10 @@ nd6_rs_input(struct mbuf *m, int off, int icmp6len) goto freeit; /* Sanity checks */ + ip6 = mtod(m, struct ip6_hdr *); if (ip6->ip6_hlim != 255) { nd6log((LOG_ERR, - "nd6_rs_input: invalid hlim (%d) from %s to %s on %s\n", + "%s: invalid hlim (%d) from %s to %s on %s\n", __func__, ip6->ip6_hlim, ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst), if_name(ifp))); goto bad; @@ -158,29 +183,31 @@ nd6_rs_input(struct mbuf *m, 
int off, int icmp6len) * Don't update the neighbor cache, if src = ::. * This indicates that the src has no IP address assigned yet. */ + saddr6 = ip6->ip6_src; if (IN6_IS_ADDR_UNSPECIFIED(&saddr6)) goto freeit; -#ifndef PULLDOWN_TEST - IP6_EXTHDR_CHECK(m, off, icmp6len,); - nd_rs = (struct nd_router_solicit *)((caddr_t)ip6 + off); -#else - IP6_EXTHDR_GET(nd_rs, struct nd_router_solicit *, m, off, icmp6len); - if (nd_rs == NULL) { - ICMP6STAT_INC(icp6s_tooshort); - return; + if (m->m_len < off + icmp6len) { + m = m_pullup(m, off + icmp6len); + if (m == NULL) { + IP6STAT_INC(ip6s_exthdrtoolong); + return; + } } -#endif + ip6 = mtod(m, struct ip6_hdr *); + nd_rs = (struct nd_router_solicit *)((caddr_t)ip6 + off); icmp6len -= sizeof(*nd_rs); nd6_option_init(nd_rs + 1, icmp6len, &ndopts); if (nd6_options(&ndopts) < 0) { nd6log((LOG_INFO, - "nd6_rs_input: invalid ND option, ignored\n")); + "%s: invalid ND option, ignored\n", __func__)); /* nd6_options have incremented stats */ goto freeit; } + lladdr = NULL; + lladdrlen = 0; if (ndopts.nd_opts_src_lladdr) { lladdr = (char *)(ndopts.nd_opts_src_lladdr + 1); lladdrlen = ndopts.nd_opts_src_lladdr->nd_opt_len << 3; @@ -188,9 +215,8 @@ nd6_rs_input(struct mbuf *m, int off, int icmp6len) if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) { nd6log((LOG_INFO, - "nd6_rs_input: lladdrlen mismatch for %s " - "(if %d, RS packet %d)\n", - ip6_sprintf(ip6bufs, &saddr6), + "%s: lladdrlen mismatch for %s (if %d, RS packet %d)\n", + __func__, ip6_sprintf(ip6bufs, &saddr6), ifp->if_addrlen, lladdrlen - 2)); goto bad; } @@ -216,22 +242,22 @@ nd6_rs_input(struct mbuf *m, int off, int icmp6len) void nd6_ra_input(struct mbuf *m, int off, int icmp6len) { - struct ifnet *ifp = m->m_pkthdr.rcvif; - struct nd_ifinfo *ndi = ND_IFINFO(ifp); - struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); + struct ifnet *ifp; + struct nd_ifinfo *ndi; + struct ip6_hdr *ip6; struct nd_router_advert *nd_ra; - struct in6_addr saddr6 = ip6->ip6_src; - int 
mcast = 0; - union nd_opts ndopts; + struct in6_addr saddr6; struct nd_defrouter *dr; + union nd_opts ndopts; char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; - - dr = NULL; + int mcast; /* * We only accept RAs only when the per-interface flag * ND6_IFF_ACCEPT_RTADV is on the receiving interface. */ + ifp = m->m_pkthdr.rcvif; + ndi = ND_IFINFO(ifp); if (!(ndi->flags & ND6_IFF_ACCEPT_RTADV)) goto freeit; @@ -239,41 +265,44 @@ nd6_ra_input(struct mbuf *m, int off, int icmp6len) if(m->m_flags & M_FRAGMENTED) goto freeit; + ip6 = mtod(m, struct ip6_hdr *); if (ip6->ip6_hlim != 255) { nd6log((LOG_ERR, - "nd6_ra_input: invalid hlim (%d) from %s to %s on %s\n", + "%s: invalid hlim (%d) from %s to %s on %s\n", __func__, ip6->ip6_hlim, ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst), if_name(ifp))); goto bad; } + saddr6 = ip6->ip6_src; if (!IN6_IS_ADDR_LINKLOCAL(&saddr6)) { nd6log((LOG_ERR, - "nd6_ra_input: src %s is not link-local\n", + "%s: src %s is not link-local\n", __func__, ip6_sprintf(ip6bufs, &saddr6))); goto bad; } -#ifndef PULLDOWN_TEST - IP6_EXTHDR_CHECK(m, off, icmp6len,); - nd_ra = (struct nd_router_advert *)((caddr_t)ip6 + off); -#else - IP6_EXTHDR_GET(nd_ra, struct nd_router_advert *, m, off, icmp6len); - if (nd_ra == NULL) { - ICMP6STAT_INC(icp6s_tooshort); - return; + if (m->m_len < off + icmp6len) { + m = m_pullup(m, off + icmp6len); + if (m == NULL) { + IP6STAT_INC(ip6s_exthdrtoolong); + return; + } } -#endif + ip6 = mtod(m, struct ip6_hdr *); + nd_ra = (struct nd_router_advert *)((caddr_t)ip6 + off); icmp6len -= sizeof(*nd_ra); nd6_option_init(nd_ra + 1, icmp6len, &ndopts); if (nd6_options(&ndopts) < 0) { nd6log((LOG_INFO, - "nd6_ra_input: invalid ND option, ignored\n")); + "%s: invalid ND option, ignored\n", __func__)); /* nd6_options have incremented stats */ goto freeit; } + mcast = 0; + dr = NULL; { struct nd_defrouter dr0; u_int32_t advreachable = nd_ra->nd_ra_reachable; @@ -341,26 +370,25 @@ nd6_ra_input(struct 
mbuf *m, int off, int icmp6len) if (pi->nd_opt_pi_len != 4) { nd6log((LOG_INFO, - "nd6_ra_input: invalid option " - "len %d for prefix information option, " - "ignored\n", pi->nd_opt_pi_len)); + "%s: invalid option len %d for prefix " + "information option, ignored\n", __func__, + pi->nd_opt_pi_len)); continue; } if (128 < pi->nd_opt_pi_prefix_len) { nd6log((LOG_INFO, - "nd6_ra_input: invalid prefix " - "len %d for prefix information option, " - "ignored\n", pi->nd_opt_pi_prefix_len)); + "%s: invalid prefix len %d for prefix " + "information option, ignored\n", __func__, + pi->nd_opt_pi_prefix_len)); continue; } if (IN6_IS_ADDR_MULTICAST(&pi->nd_opt_pi_prefix) || IN6_IS_ADDR_LINKLOCAL(&pi->nd_opt_pi_prefix)) { nd6log((LOG_INFO, - "nd6_ra_input: invalid prefix " - "%s, ignored\n", - ip6_sprintf(ip6bufs, + "%s: invalid prefix %s, ignored\n", + __func__, ip6_sprintf(ip6bufs, &pi->nd_opt_pi_prefix))); continue; } @@ -397,8 +425,8 @@ nd6_ra_input(struct mbuf *m, int off, int icmp6len) /* lower bound */ if (mtu < IPV6_MMTU) { - nd6log((LOG_INFO, "nd6_ra_input: bogus mtu option " - "mtu=%lu sent from %s, ignoring\n", + nd6log((LOG_INFO, "%s: bogus mtu option mtu=%lu sent " + "from %s, ignoring\n", __func__, mtu, ip6_sprintf(ip6bufs, &ip6->ip6_src))); goto skip; } @@ -416,9 +444,8 @@ nd6_ra_input(struct mbuf *m, int off, int icmp6len) rt_updatemtu(ifp); } } else { - nd6log((LOG_INFO, "nd6_ra_input: bogus mtu " - "mtu=%lu sent from %s; " - "exceeds maxmtu %lu, ignoring\n", + nd6log((LOG_INFO, "%s: bogus mtu=%lu sent from %s; " + "exceeds maxmtu %lu, ignoring\n", __func__, mtu, ip6_sprintf(ip6bufs, &ip6->ip6_src), maxmtu)); } } @@ -439,8 +466,8 @@ nd6_ra_input(struct mbuf *m, int off, int icmp6len) if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) { nd6log((LOG_INFO, - "nd6_ra_input: lladdrlen mismatch for %s " - "(if %d, RA packet %d)\n", ip6_sprintf(ip6bufs, &saddr6), + "%s: lladdrlen mismatch for %s (if %d, RA packet %d)\n", + __func__, ip6_sprintf(ip6bufs, 
&saddr6), ifp->if_addrlen, lladdrlen - 2)); goto bad; } @@ -493,10 +520,71 @@ nd6_rtmsg(int cmd, struct rtentry *rt) ifa_free(ifa); } -/* - * default router list processing sub routines - */ +/* PFXRTR */ +static struct nd_pfxrouter * +pfxrtr_lookup(struct nd_prefix *pr, struct nd_defrouter *dr) +{ + struct nd_pfxrouter *search; + + ND6_LOCK_ASSERT(); + + LIST_FOREACH(search, &pr->ndpr_advrtrs, pfr_entry) { + if (search->router == dr) + break; + } + return (search); +} + +static void +pfxrtr_add(struct nd_prefix *pr, struct nd_defrouter *dr) +{ + struct nd_pfxrouter *new; + bool update; + + ND6_UNLOCK_ASSERT(); + + ND6_RLOCK(); + if (pfxrtr_lookup(pr, dr) != NULL) { + ND6_RUNLOCK(); + return; + } + ND6_RUNLOCK(); + + new = malloc(sizeof(*new), M_IP6NDP, M_NOWAIT | M_ZERO); + if (new == NULL) + return; + defrouter_ref(dr); + new->router = dr; + + ND6_WLOCK(); + if (pfxrtr_lookup(pr, dr) == NULL) { + LIST_INSERT_HEAD(&pr->ndpr_advrtrs, new, pfr_entry); + update = true; + } else { + /* We lost a race to add the reference. */ + defrouter_rele(dr); + free(new, M_IP6NDP); + update = false; + } + ND6_WUNLOCK(); + + if (update) + pfxlist_onlink_check(); +} + +static void +pfxrtr_del(struct nd_pfxrouter *pfr) +{ + + ND6_WLOCK_ASSERT(); + + LIST_REMOVE(pfr, pfr_entry); + defrouter_rele(pfr->router); + free(pfr, M_IP6NDP); +} + +/* Default router list processing sub routines. 
*/ static void defrouter_addreq(struct nd_defrouter *new) { @@ -524,46 +612,6 @@ defrouter_addreq(struct nd_defrouter *new) new->installed = 1; } -struct nd_defrouter * -defrouter_lookup_locked(struct in6_addr *addr, struct ifnet *ifp) -{ - struct nd_defrouter *dr; - - ND6_LOCK_ASSERT(); - TAILQ_FOREACH(dr, &V_nd_defrouter, dr_entry) - if (dr->ifp == ifp && IN6_ARE_ADDR_EQUAL(addr, &dr->rtaddr)) { - defrouter_ref(dr); - return (dr); - } - return (NULL); -} - -struct nd_defrouter * -defrouter_lookup(struct in6_addr *addr, struct ifnet *ifp) -{ - struct nd_defrouter *dr; - - ND6_RLOCK(); - dr = defrouter_lookup_locked(addr, ifp); - ND6_RUNLOCK(); - return (dr); -} - -void -defrouter_ref(struct nd_defrouter *dr) -{ - - refcount_acquire(&dr->refcnt); -} - -void -defrouter_rele(struct nd_defrouter *dr) -{ - - if (refcount_release(&dr->refcnt)) - free(dr, M_IP6NDP); -} - /* * Remove the default route for a given router. * This is just a subroutine function for defrouter_select_fib(), and @@ -595,6 +643,79 @@ defrouter_delreq(struct nd_defrouter *dr) dr->installed = 0; } +void +defrouter_del(struct nd_defrouter *dr) +{ + struct nd_defrouter *deldr = NULL; + struct nd_prefix *pr; + struct nd_pfxrouter *pfxrtr; + + ND6_UNLOCK_ASSERT(); + + /* + * Flush all the routing table entries that use the router + * as a next hop. + */ + if (ND_IFINFO(dr->ifp)->flags & ND6_IFF_ACCEPT_RTADV) + rt6_flush(&dr->rtaddr, dr->ifp); + + if (dr->installed) { + deldr = dr; + defrouter_delreq(dr); + } + + /* + * Also delete all the pointers to the router in each prefix lists. + */ + ND6_WLOCK(); + LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) { + if ((pfxrtr = pfxrtr_lookup(pr, dr)) != NULL) + pfxrtr_del(pfxrtr); + } + ND6_WUNLOCK(); + + pfxlist_onlink_check(); + + /* + * If the router is the primary one, choose a new one. + * Note that defrouter_select_fib() will remove the current + * gateway from the routing table. 
+ */ + if (deldr) + defrouter_select_fib(deldr->ifp->if_fib); + + /* + * Release the list reference. + */ + defrouter_rele(dr); +} + + +struct nd_defrouter * +defrouter_lookup_locked(struct in6_addr *addr, struct ifnet *ifp) +{ + struct nd_defrouter *dr; + + ND6_LOCK_ASSERT(); + TAILQ_FOREACH(dr, &V_nd6_defrouter, dr_entry) + if (dr->ifp == ifp && IN6_ARE_ADDR_EQUAL(addr, &dr->rtaddr)) { + defrouter_ref(dr); + return (dr); + } + return (NULL); +} + +struct nd_defrouter * +defrouter_lookup(struct in6_addr *addr, struct ifnet *ifp) +{ + struct nd_defrouter *dr; + + ND6_RLOCK(); + dr = defrouter_lookup_locked(addr, ifp); + ND6_RUNLOCK(); + return (dr); +} + /* * Remove all default routes from default router list. */ @@ -611,14 +732,14 @@ defrouter_reset(void) * current default router list and use that when deleting routes. */ ND6_RLOCK(); - TAILQ_FOREACH(dr, &V_nd_defrouter, dr_entry) + TAILQ_FOREACH(dr, &V_nd6_defrouter, dr_entry) count++; ND6_RUNLOCK(); dra = malloc(count * sizeof(*dra), M_TEMP, M_WAITOK | M_ZERO); ND6_RLOCK(); - TAILQ_FOREACH(dr, &V_nd_defrouter, dr_entry) { + TAILQ_FOREACH(dr, &V_nd6_defrouter, dr_entry) { if (i == count) break; defrouter_ref(dr); @@ -662,67 +783,30 @@ defrouter_remove(struct in6_addr *addr, struct ifnet *ifp) } /* - * Remove a router from the global list and optionally stash it in a - * caller-supplied queue. - * - * The ND lock must be held. 
+ * for default router selection + * regards router-preference field as a 2-bit signed integer */ -void -defrouter_unlink(struct nd_defrouter *dr, struct nd_drhead *drq) -{ - - ND6_WLOCK_ASSERT(); - TAILQ_REMOVE(&V_nd_defrouter, dr, dr_entry); - V_nd6_list_genid++; - if (drq != NULL) - TAILQ_INSERT_TAIL(drq, dr, dr_entry); -} - -void -defrouter_del(struct nd_defrouter *dr) +static int +rtpref(struct nd_defrouter *dr) { - struct nd_defrouter *deldr = NULL; - struct nd_prefix *pr; - struct nd_pfxrouter *pfxrtr; - - ND6_UNLOCK_ASSERT(); - - /* - * Flush all the routing table entries that use the router - * as a next hop. - */ - if (ND_IFINFO(dr->ifp)->flags & ND6_IFF_ACCEPT_RTADV) - rt6_flush(&dr->rtaddr, dr->ifp); - - if (dr->installed) { - deldr = dr; - defrouter_delreq(dr); - } - - /* - * Also delete all the pointers to the router in each prefix lists. - */ - ND6_WLOCK(); - LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) { - if ((pfxrtr = pfxrtr_lookup(pr, dr)) != NULL) - pfxrtr_del(pfxrtr); + switch (dr->raflags & ND_RA_FLAG_RTPREF_MASK) { + case ND_RA_FLAG_RTPREF_HIGH: + return (RTPREF_HIGH); + case ND_RA_FLAG_RTPREF_MEDIUM: + case ND_RA_FLAG_RTPREF_RSV: + return (RTPREF_MEDIUM); + case ND_RA_FLAG_RTPREF_LOW: + return (RTPREF_LOW); + default: + /* + * This case should never happen. If it did, it would mean a + * serious bug of kernel internal. We thus always bark here. + * Or, can we even panic? + */ + log(LOG_ERR, "rtpref: impossible RA flag %x\n", dr->raflags); + return (RTPREF_INVALID); } - ND6_WUNLOCK(); - - pfxlist_onlink_check(); - - /* - * If the router is the primary one, choose a new one. - * Note that defrouter_select_fib() will remove the current - * gateway from the routing table. - */ - if (deldr) - defrouter_select_fib(deldr->ifp->if_fib); - - /* - * Release the list reference. 
- */ - defrouter_rele(dr); + /* NOTREACHED */ } /* @@ -767,7 +851,7 @@ defrouter_select_fib(int fibnum) * Let's handle easy case (3) first: * If default router list is empty, there's nothing to be done. */ - if (TAILQ_EMPTY(&V_nd_defrouter)) { + if (TAILQ_EMPTY(&V_nd6_defrouter)) { ND6_RUNLOCK(); return; } @@ -778,7 +862,7 @@ defrouter_select_fib(int fibnum) * the ordering rule of the list described in defrtrlist_update(). */ selected_dr = installed_dr = NULL; - TAILQ_FOREACH(dr, &V_nd_defrouter, dr_entry) { + TAILQ_FOREACH(dr, &V_nd6_defrouter, dr_entry) { IF_AFDATA_RLOCK(dr->ifp); if (selected_dr == NULL && dr->ifp->if_fib == fibnum && (ln = nd6_lookup(&dr->rtaddr, 0, dr->ifp)) && @@ -817,12 +901,12 @@ defrouter_select_fib(int fibnum) if (selected_dr == NULL) { if (installed_dr == NULL || TAILQ_NEXT(installed_dr, dr_entry) == NULL) - dr = TAILQ_FIRST(&V_nd_defrouter); + dr = TAILQ_FIRST(&V_nd6_defrouter); else dr = TAILQ_NEXT(installed_dr, dr_entry); /* Ensure we select a router for this FIB. */ - TAILQ_FOREACH_FROM(dr, &V_nd_defrouter, dr_entry) { + TAILQ_FOREACH_FROM(dr, &V_nd6_defrouter, dr_entry) { if (dr->ifp->if_fib == fibnum) { selected_dr = dr; defrouter_ref(selected_dr); @@ -872,33 +956,6 @@ defrouter_select(void) defrouter_select_fib(RT_ALL_FIBS); } -/* - * for default router selection - * regards router-preference field as a 2-bit signed integer - */ -static int -rtpref(struct nd_defrouter *dr) -{ - switch (dr->raflags & ND_RA_FLAG_RTPREF_MASK) { - case ND_RA_FLAG_RTPREF_HIGH: - return (RTPREF_HIGH); - case ND_RA_FLAG_RTPREF_MEDIUM: - case ND_RA_FLAG_RTPREF_RSV: - return (RTPREF_MEDIUM); - case ND_RA_FLAG_RTPREF_LOW: - return (RTPREF_LOW); - default: - /* - * This case should never happen. If it did, it would mean a - * serious bug of kernel internal. We thus always bark here. - * Or, can we even panic? 
- */ - log(LOG_ERR, "rtpref: impossible RA flag %x\n", dr->raflags); - return (RTPREF_INVALID); - } - /* NOTREACHED */ -} - static struct nd_defrouter * defrtrlist_update(struct nd_defrouter *new) { @@ -960,7 +1017,7 @@ restart: * The preferred router may have changed, so relocate this * router. */ - TAILQ_REMOVE(&V_nd_defrouter, dr, dr_entry); + TAILQ_REMOVE(&V_nd6_defrouter, dr, dr_entry); n = dr; } else { n = malloc(sizeof(*n), M_IP6NDP, M_NOWAIT | M_ZERO); @@ -981,14 +1038,14 @@ restart: */ /* insert at the end of the group */ - TAILQ_FOREACH(dr, &V_nd_defrouter, dr_entry) { + TAILQ_FOREACH(dr, &V_nd6_defrouter, dr_entry) { if (rtpref(n) > rtpref(dr)) break; } if (dr != NULL) TAILQ_INSERT_BEFORE(dr, n, dr_entry); else - TAILQ_INSERT_TAIL(&V_nd_defrouter, n, dr_entry); + TAILQ_INSERT_TAIL(&V_nd6_defrouter, n, dr_entry); V_nd6_list_genid++; ND6_WUNLOCK(); @@ -997,66 +1054,154 @@ restart: return (n); } -static struct nd_pfxrouter * -pfxrtr_lookup(struct nd_prefix *pr, struct nd_defrouter *dr) +static int +in6_init_prefix_ltimes(struct nd_prefix *ndpr) { - struct nd_pfxrouter *search; + if (ndpr->ndpr_pltime == ND6_INFINITE_LIFETIME) + ndpr->ndpr_preferred = 0; + else + ndpr->ndpr_preferred = time_uptime + ndpr->ndpr_pltime; + if (ndpr->ndpr_vltime == ND6_INFINITE_LIFETIME) + ndpr->ndpr_expire = 0; + else + ndpr->ndpr_expire = time_uptime + ndpr->ndpr_vltime; - ND6_LOCK_ASSERT(); + return 0; +} - LIST_FOREACH(search, &pr->ndpr_advrtrs, pfr_entry) { - if (search->router == dr) - break; +static void +in6_init_address_ltimes(struct nd_prefix *new, struct in6_addrlifetime *lt6) +{ + /* init ia6t_expire */ + if (lt6->ia6t_vltime == ND6_INFINITE_LIFETIME) + lt6->ia6t_expire = 0; + else { + lt6->ia6t_expire = time_uptime; + lt6->ia6t_expire += lt6->ia6t_vltime; + } + + /* init ia6t_preferred */ + if (lt6->ia6t_pltime == ND6_INFINITE_LIFETIME) + lt6->ia6t_preferred = 0; + else { + lt6->ia6t_preferred = time_uptime; + lt6->ia6t_preferred += lt6->ia6t_pltime; } - return 
(search); } -static void -pfxrtr_add(struct nd_prefix *pr, struct nd_defrouter *dr) +static struct in6_ifaddr * +in6_ifadd(struct nd_prefixctl *pr, int mcast) { - struct nd_pfxrouter *new; - bool update; + struct ifnet *ifp = pr->ndpr_ifp; + struct ifaddr *ifa; + struct in6_aliasreq ifra; + struct in6_ifaddr *ia, *ib; + int error, plen0; + struct in6_addr mask; + int prefixlen = pr->ndpr_plen; + int updateflags; + char ip6buf[INET6_ADDRSTRLEN]; - ND6_UNLOCK_ASSERT(); + in6_prefixlen2mask(&mask, prefixlen); - ND6_RLOCK(); - if (pfxrtr_lookup(pr, dr) != NULL) { - ND6_RUNLOCK(); - return; + /* + * find a link-local address (will be interface ID). + * Is it really mandatory? Theoretically, a global or a site-local + * address can be configured without a link-local address, if we + * have a unique interface identifier... + * + * it is not mandatory to have a link-local address, we can generate + * interface identifier on the fly. we do this because: + * (1) it should be the easiest way to find interface identifier. + * (2) RFC2462 5.4 suggesting the use of the same interface identifier + * for multiple addresses on a single interface, and possible shortcut + * of DAD. we omitted DAD for this reason in the past. + * (3) a user can prevent autoconfiguration of global address + * by removing link-local address by hand (this is partly because we + * don't have other way to control the use of IPv6 on an interface. + * this has been our design choice - cf. NRL's "ifconfig auto"). + * (4) it is easier to manage when an interface has addresses + * with the same interface identifier, than to have multiple addresses + * with different interface identifiers. + */ + ifa = (struct ifaddr *)in6ifa_ifpforlinklocal(ifp, 0); /* 0 is OK? 
*/ + if (ifa) + ib = (struct in6_ifaddr *)ifa; + else + return NULL; + + /* prefixlen + ifidlen must be equal to 128 */ + plen0 = in6_mask2len(&ib->ia_prefixmask.sin6_addr, NULL); + if (prefixlen != plen0) { + ifa_free(ifa); + nd6log((LOG_INFO, + "%s: wrong prefixlen for %s (prefix=%d ifid=%d)\n", + __func__, if_name(ifp), prefixlen, 128 - plen0)); + return NULL; } - ND6_RUNLOCK(); - new = malloc(sizeof(*new), M_IP6NDP, M_NOWAIT | M_ZERO); - if (new == NULL) - return; - defrouter_ref(dr); - new->router = dr; + /* make ifaddr */ + in6_prepare_ifra(&ifra, &pr->ndpr_prefix.sin6_addr, &mask); - ND6_WLOCK(); - if (pfxrtr_lookup(pr, dr) == NULL) { - LIST_INSERT_HEAD(&pr->ndpr_advrtrs, new, pfr_entry); - update = true; - } else { - /* We lost a race to add the reference. */ - defrouter_rele(dr); - free(new, M_IP6NDP); - update = false; - } - ND6_WUNLOCK(); + IN6_MASK_ADDR(&ifra.ifra_addr.sin6_addr, &mask); + /* interface ID */ + ifra.ifra_addr.sin6_addr.s6_addr32[0] |= + (ib->ia_addr.sin6_addr.s6_addr32[0] & ~mask.s6_addr32[0]); + ifra.ifra_addr.sin6_addr.s6_addr32[1] |= + (ib->ia_addr.sin6_addr.s6_addr32[1] & ~mask.s6_addr32[1]); + ifra.ifra_addr.sin6_addr.s6_addr32[2] |= + (ib->ia_addr.sin6_addr.s6_addr32[2] & ~mask.s6_addr32[2]); + ifra.ifra_addr.sin6_addr.s6_addr32[3] |= + (ib->ia_addr.sin6_addr.s6_addr32[3] & ~mask.s6_addr32[3]); + ifa_free(ifa); - if (update) - pfxlist_onlink_check(); -} + /* lifetimes. */ + ifra.ifra_lifetime.ia6t_vltime = pr->ndpr_vltime; + ifra.ifra_lifetime.ia6t_pltime = pr->ndpr_pltime; -static void -pfxrtr_del(struct nd_pfxrouter *pfr) -{ + /* XXX: scope zone ID? */ - ND6_WLOCK_ASSERT(); + ifra.ifra_flags |= IN6_IFF_AUTOCONF; /* obey autoconf */ - LIST_REMOVE(pfr, pfr_entry); - defrouter_rele(pfr->router); - free(pfr, M_IP6NDP); + /* + * Make sure that we do not have this address already. This should + * usually not happen, but we can still see this case, e.g., if we + * have manually configured the exact address to be configured. 
+ */ + ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, + &ifra.ifra_addr.sin6_addr); + if (ifa != NULL) { + ifa_free(ifa); + /* this should be rare enough to make an explicit log */ + log(LOG_INFO, "in6_ifadd: %s is already configured\n", + ip6_sprintf(ip6buf, &ifra.ifra_addr.sin6_addr)); + return (NULL); + } + + /* + * Allocate ifaddr structure, link into chain, etc. + * If we are going to create a new address upon receiving a multicasted + * RA, we need to impose a random delay before starting DAD. + * [draft-ietf-ipv6-rfc2462bis-02.txt, Section 5.4.2] + */ + updateflags = 0; + if (mcast) + updateflags |= IN6_IFAUPDATE_DADDELAY; + if ((error = in6_update_ifa(ifp, &ifra, NULL, updateflags)) != 0) { + nd6log((LOG_ERR, + "%s: failed to make ifaddr %s on %s (errno=%d)\n", __func__, + ip6_sprintf(ip6buf, &ifra.ifra_addr.sin6_addr), + if_name(ifp), error)); + return (NULL); /* ifaddr must not have been allocated. */ + } + + ia = in6ifa_ifpwithaddr(ifp, &ifra.ifra_addr.sin6_addr); + /* + * XXXRW: Assumption of non-NULLness here might not be true with + * fine-grained locking -- should we validate it? Or just return + * earlier ifa rather than looking it up again? + */ + return (ia); /* this is always non-NULL and referenced. */ } static struct nd_prefix * @@ -1146,8 +1291,8 @@ nd6_prelist_add(struct nd_prefixctl *pr, struct nd_defrouter *dr, if (new->ndpr_raf_onlink) { ND6_ONLINK_LOCK(); if ((error = nd6_prefix_onlink(new)) != 0) { - nd6log((LOG_ERR, "nd6_prelist_add: failed to make " - "the prefix %s/%d on-link on %s (errno=%d)\n", + nd6log((LOG_ERR, "%s: failed to make the prefix %s/%d " + "on-link on %s (errno=%d)\n", __func__, ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, if_name(pr->ndpr_ifp), error)); /* proceed anyway. XXX: is it correct? 
*/ @@ -1203,8 +1348,8 @@ nd6_prefix_del(struct nd_prefix *pr) ND6_ONLINK_LOCK(); if ((e = nd6_prefix_offlink(pr)) != 0) { nd6log((LOG_ERR, - "nd6_prefix_del: failed to make %s/%d offlink " - "on %s, errno=%d\n", + "%s: failed to make the prefix %s/%d offlink on %s " + "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, if_name(pr->ndpr_ifp), e)); /* what should we do? */ @@ -1275,9 +1420,8 @@ prelist_update(struct nd_prefixctl *new, struct nd_defrouter *dr, ND6_ONLINK_LOCK(); if ((error = nd6_prefix_onlink(pr)) != 0) { nd6log((LOG_ERR, - "prelist_update: failed to make " - "the prefix %s/%d on-link on %s " - "(errno=%d)\n", + "%s: failed to make the prefix %s/%d " + "on-link on %s (errno=%d)\n", __func__, ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, if_name(pr->ndpr_ifp), @@ -1297,8 +1441,8 @@ prelist_update(struct nd_prefixctl *new, struct nd_defrouter *dr, error = nd6_prelist_add(new, dr, &pr); if (error != 0) { - nd6log((LOG_NOTICE, "prelist_update: " - "nd6_prelist_add failed for %s/%d on %s errno=%d\n", + nd6log((LOG_NOTICE, "%s: nd6_prelist_add() failed for " + "the prefix %s/%d on %s (errno=%d)\n", __func__, ip6_sprintf(ip6buf, &new->ndpr_prefix.sin6_addr), new->ndpr_plen, if_name(new->ndpr_ifp), error)); goto end; /* we should just give up in this case. 
*/ @@ -1498,9 +1642,8 @@ prelist_update(struct nd_prefixctl *new, struct nd_defrouter *dr, } if (ifidlen + pr->ndpr_plen != 128) { nd6log((LOG_INFO, - "prelist_update: invalid prefixlen " - "%d for %s, ignored\n", - pr->ndpr_plen, if_name(ifp))); + "%s: invalid prefixlen %d for %s, ignored\n", + __func__, pr->ndpr_plen, if_name(ifp))); goto end; } @@ -1526,10 +1669,9 @@ prelist_update(struct nd_prefixctl *new, struct nd_defrouter *dr, if (V_ip6_use_tempaddr) { int e; if ((e = in6_tmpifadd(ia6, 1, 1)) != 0) { - nd6log((LOG_NOTICE, "prelist_update: " - "failed to create a temporary " - "address, errno=%d\n", - e)); + nd6log((LOG_NOTICE, "%s: failed to " + "create a temporary address " + "(errno=%d)\n", __func__, e)); } } ifa_free(&ia6->ia_ifa); @@ -1621,7 +1763,7 @@ pfxlist_onlink_check(void) * that does not advertise any prefixes. */ if (pr == NULL) { - TAILQ_FOREACH(dr, &V_nd_defrouter, dr_entry) { + TAILQ_FOREACH(dr, &V_nd6_defrouter, dr_entry) { struct nd_prefix *pr0; LIST_FOREACH(pr0, &V_nd_prefix, ndpr_entry) { @@ -1632,7 +1774,7 @@ pfxlist_onlink_check(void) break; } } - if (pr != NULL || (!TAILQ_EMPTY(&V_nd_defrouter) && pfxrtr == NULL)) { + if (pr != NULL || (!TAILQ_EMPTY(&V_nd6_defrouter) && pfxrtr == NULL)) { /* * There is at least one prefix that has a reachable router, * or at least a router which probably does not advertise @@ -1692,16 +1834,16 @@ restart: if ((flags & NDPRF_ONLINK) != 0 && (e = nd6_prefix_offlink(pr)) != 0) { nd6log((LOG_ERR, - "pfxlist_onlink_check: failed to " - "make %s/%d offlink, errno=%d\n", + "%s: failed to make %s/%d offlink " + "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, e)); } else if ((flags & NDPRF_ONLINK) == 0 && (e = nd6_prefix_onlink(pr)) != 0) { nd6log((LOG_ERR, - "pfxlist_onlink_check: failed to " - "make %s/%d onlink, errno=%d\n", + "%s: failed to make %s/%d onlink " + "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, e)); @@ -1834,9 
+1976,9 @@ nd6_prefix_onlink_rtrequest(struct nd_prefix *pr, struct ifaddr *ifa) struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)ifa->ifa_addr; - nd6log((LOG_ERR, "nd6_prefix_onlink: failed to add " + nd6log((LOG_ERR, "%s: failed to add " "route for a prefix (%s/%d) on %s, gw=%s, mask=%s, " - "flags=%lx errno = %d\n", + "flags=%lx errno = %d\n", __func__, ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, if_name(pr->ndpr_ifp), ip6_sprintf(ip6bufg, &sin6->sin6_addr), @@ -1927,8 +2069,8 @@ nd6_prefix_onlink(struct nd_prefix *pr) * interface. This should, of course, be rare though. */ nd6log((LOG_NOTICE, - "nd6_prefix_onlink: failed to find any ifaddr" - " to add route for a prefix(%s/%d) on %s\n", + "%s: failed to find any ifaddr to add route for a " + "prefix(%s/%d) on %s\n", __func__, ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, if_name(ifp))); return (0); @@ -2027,10 +2169,9 @@ restart: ND6_RUNLOCK(); if ((e = nd6_prefix_onlink(opr)) != 0) { nd6log((LOG_ERR, - "nd6_prefix_offlink: failed to " - "recover a prefix %s/%d from %s " - "to %s (errno = %d)\n", - ip6_sprintf(ip6buf, + "%s: failed to recover a prefix " + "%s/%d from %s to %s (errno=%d)\n", + __func__, ip6_sprintf(ip6buf, &opr->ndpr_prefix.sin6_addr), opr->ndpr_plen, if_name(ifp), if_name(opr->ndpr_ifp), e)); @@ -2045,10 +2186,9 @@ restart: } else { /* XXX: can we still set the NDPRF_ONLINK flag? 
*/ nd6log((LOG_ERR, - "nd6_prefix_offlink: failed to delete route: " - "%s/%d on %s (errno = %d)\n", - ip6_sprintf(ip6buf, &sa6.sin6_addr), pr->ndpr_plen, - if_name(ifp), error)); + "%s: failed to delete route: %s/%d on %s (errno=%d)\n", + __func__, ip6_sprintf(ip6buf, &sa6.sin6_addr), + pr->ndpr_plen, if_name(ifp), error)); } if (a_failure) @@ -2058,121 +2198,6 @@ restart: return (error); } -static struct in6_ifaddr * -in6_ifadd(struct nd_prefixctl *pr, int mcast) -{ - struct ifnet *ifp = pr->ndpr_ifp; - struct ifaddr *ifa; - struct in6_aliasreq ifra; - struct in6_ifaddr *ia, *ib; - int error, plen0; - struct in6_addr mask; - int prefixlen = pr->ndpr_plen; - int updateflags; - char ip6buf[INET6_ADDRSTRLEN]; - - in6_prefixlen2mask(&mask, prefixlen); - - /* - * find a link-local address (will be interface ID). - * Is it really mandatory? Theoretically, a global or a site-local - * address can be configured without a link-local address, if we - * have a unique interface identifier... - * - * it is not mandatory to have a link-local address, we can generate - * interface identifier on the fly. we do this because: - * (1) it should be the easiest way to find interface identifier. - * (2) RFC2462 5.4 suggesting the use of the same interface identifier - * for multiple addresses on a single interface, and possible shortcut - * of DAD. we omitted DAD for this reason in the past. - * (3) a user can prevent autoconfiguration of global address - * by removing link-local address by hand (this is partly because we - * don't have other way to control the use of IPv6 on an interface. - * this has been our design choice - cf. NRL's "ifconfig auto"). - * (4) it is easier to manage when an interface has addresses - * with the same interface identifier, than to have multiple addresses - * with different interface identifiers. - */ - ifa = (struct ifaddr *)in6ifa_ifpforlinklocal(ifp, 0); /* 0 is OK? 
*/ - if (ifa) - ib = (struct in6_ifaddr *)ifa; - else - return NULL; - - /* prefixlen + ifidlen must be equal to 128 */ - plen0 = in6_mask2len(&ib->ia_prefixmask.sin6_addr, NULL); - if (prefixlen != plen0) { - ifa_free(ifa); - nd6log((LOG_INFO, "in6_ifadd: wrong prefixlen for %s " - "(prefix=%d ifid=%d)\n", - if_name(ifp), prefixlen, 128 - plen0)); - return NULL; - } - - /* make ifaddr */ - in6_prepare_ifra(&ifra, &pr->ndpr_prefix.sin6_addr, &mask); - - IN6_MASK_ADDR(&ifra.ifra_addr.sin6_addr, &mask); - /* interface ID */ - ifra.ifra_addr.sin6_addr.s6_addr32[0] |= - (ib->ia_addr.sin6_addr.s6_addr32[0] & ~mask.s6_addr32[0]); - ifra.ifra_addr.sin6_addr.s6_addr32[1] |= - (ib->ia_addr.sin6_addr.s6_addr32[1] & ~mask.s6_addr32[1]); - ifra.ifra_addr.sin6_addr.s6_addr32[2] |= - (ib->ia_addr.sin6_addr.s6_addr32[2] & ~mask.s6_addr32[2]); - ifra.ifra_addr.sin6_addr.s6_addr32[3] |= - (ib->ia_addr.sin6_addr.s6_addr32[3] & ~mask.s6_addr32[3]); - ifa_free(ifa); - - /* lifetimes. */ - ifra.ifra_lifetime.ia6t_vltime = pr->ndpr_vltime; - ifra.ifra_lifetime.ia6t_pltime = pr->ndpr_pltime; - - /* XXX: scope zone ID? */ - - ifra.ifra_flags |= IN6_IFF_AUTOCONF; /* obey autoconf */ - - /* - * Make sure that we do not have this address already. This should - * usually not happen, but we can still see this case, e.g., if we - * have manually configured the exact address to be configured. - */ - ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, - &ifra.ifra_addr.sin6_addr); - if (ifa != NULL) { - ifa_free(ifa); - /* this should be rare enough to make an explicit log */ - log(LOG_INFO, "in6_ifadd: %s is already configured\n", - ip6_sprintf(ip6buf, &ifra.ifra_addr.sin6_addr)); - return (NULL); - } - - /* - * Allocate ifaddr structure, link into chain, etc. - * If we are going to create a new address upon receiving a multicasted - * RA, we need to impose a random delay before starting DAD. 
- * [draft-ietf-ipv6-rfc2462bis-02.txt, Section 5.4.2] - */ - updateflags = 0; - if (mcast) - updateflags |= IN6_IFAUPDATE_DADDELAY; - if ((error = in6_update_ifa(ifp, &ifra, NULL, updateflags)) != 0) { - nd6log((LOG_ERR, - "in6_ifadd: failed to make ifaddr %s on %s (errno=%d)\n", - ip6_sprintf(ip6buf, &ifra.ifra_addr.sin6_addr), - if_name(ifp), error)); - return (NULL); /* ifaddr must not have been allocated. */ - } - - ia = in6ifa_ifpwithaddr(ifp, &ifra.ifra_addr.sin6_addr); - /* - * XXXRW: Assumption of non-NULLness here might not be true with - * fine-grained locking -- should we validate it? Or just return - * earlier ifa rather than looking it up again? - */ - return (ia); /* this is always non-NULL and referenced. */ -} - /* * ia0 - corresponding public address */ @@ -2199,8 +2224,8 @@ in6_tmpifadd(const struct in6_ifaddr *ia0, int forcegen, int delay) again: if (in6_get_tmpifid(ifp, (u_int8_t *)randid, (const u_int8_t *)&ia0->ia_addr.sin6_addr.s6_addr[8], forcegen)) { - nd6log((LOG_NOTICE, "in6_tmpifadd: failed to find a good " - "random IFID\n")); + nd6log((LOG_NOTICE, "%s: failed to find a good random IFID\n", + __func__)); return (EINVAL); } ifra.ifra_addr.sin6_addr.s6_addr32[2] |= @@ -2222,8 +2247,8 @@ in6_tmpifadd(const struct in6_ifaddr *ia0, int forcegen, int delay) } /* Give up. Something strange should have happened. */ - nd6log((LOG_NOTICE, "in6_tmpifadd: failed to " - "find a unique random IFID\n")); + nd6log((LOG_NOTICE, "%s: failed to find a unique random IFID\n", + __func__)); return (EEXIST); } @@ -2276,8 +2301,8 @@ in6_tmpifadd(const struct in6_ifaddr *ia0, int forcegen, int delay) newia = in6ifa_ifpwithaddr(ifp, &ifra.ifra_addr.sin6_addr); if (newia == NULL) { /* XXX: can it happen? 
*/ nd6log((LOG_ERR, - "in6_tmpifadd: ifa update succeeded, but we got " - "no ifaddr\n")); + "%s: ifa update succeeded, but we got no ifaddr\n", + __func__)); return (EINVAL); /* XXX */ } newia->ia6_ndpr = ia0->ia6_ndpr; @@ -2298,58 +2323,6 @@ in6_tmpifadd(const struct in6_ifaddr *ia0, int forcegen, int delay) } static int -in6_init_prefix_ltimes(struct nd_prefix *ndpr) -{ - if (ndpr->ndpr_pltime == ND6_INFINITE_LIFETIME) - ndpr->ndpr_preferred = 0; - else - ndpr->ndpr_preferred = time_uptime + ndpr->ndpr_pltime; - if (ndpr->ndpr_vltime == ND6_INFINITE_LIFETIME) - ndpr->ndpr_expire = 0; - else - ndpr->ndpr_expire = time_uptime + ndpr->ndpr_vltime; - - return 0; -} - -static void -in6_init_address_ltimes(struct nd_prefix *new, struct in6_addrlifetime *lt6) -{ - /* init ia6t_expire */ - if (lt6->ia6t_vltime == ND6_INFINITE_LIFETIME) - lt6->ia6t_expire = 0; - else { - lt6->ia6t_expire = time_uptime; - lt6->ia6t_expire += lt6->ia6t_vltime; - } - - /* init ia6t_preferred */ - if (lt6->ia6t_pltime == ND6_INFINITE_LIFETIME) - lt6->ia6t_preferred = 0; - else { - lt6->ia6t_preferred = time_uptime; - lt6->ia6t_preferred += lt6->ia6t_pltime; - } -} - -/* - * Delete all the routing table entries that use the specified gateway. - * XXX: this function causes search through all entries of routing table, so - * it shouldn't be called when acting as a router. - */ -void -rt6_flush(struct in6_addr *gateway, struct ifnet *ifp) -{ - - /* We'll care only link-local addresses */ - if (!IN6_IS_ADDR_LINKLOCAL(gateway)) - return; - - /* XXX Do we really need to walk any but the default FIB? */ - rt_foreach_fib_walk_del(AF_INET6, rt6_deleteroute, (void *)gateway); -} - -static int rt6_deleteroute(const struct rtentry *rt, void *arg) { #define SIN6(s) ((struct sockaddr_in6 *)s) @@ -2381,6 +2354,23 @@ rt6_deleteroute(const struct rtentry *rt, void *arg) #undef SIN6 } +/* + * Delete all the routing table entries that use the specified gateway. 
+ * XXX: this function causes search through all entries of routing table, so + * it shouldn't be called when acting as a router. + */ +void +rt6_flush(struct in6_addr *gateway, struct ifnet *ifp) +{ + + /* We'll care only link-local addresses */ + if (!IN6_IS_ADDR_LINKLOCAL(gateway)) + return; + + /* XXX Do we really need to walk any but the default FIB? */ + rt_foreach_fib_walk_del(AF_INET6, rt6_deleteroute, (void *)gateway); +} + int nd6_setdefaultiface(int ifindex) { @@ -2408,3 +2398,131 @@ nd6_setdefaultiface(int ifindex) return (error); } + +bool +nd6_defrouter_list_empty(void) +{ + + return (TAILQ_EMPTY(&V_nd6_defrouter)); +} + +void +nd6_defrouter_timer(void) +{ + struct nd_defrouter *dr, *ndr; + struct nd_drhead drq; + + TAILQ_INIT(&drq); + + ND6_WLOCK(); + TAILQ_FOREACH_SAFE(dr, &V_nd6_defrouter, dr_entry, ndr) + if (dr->expire && dr->expire < time_uptime) + defrouter_unlink(dr, &drq); + ND6_WUNLOCK(); + + while ((dr = TAILQ_FIRST(&drq)) != NULL) { + TAILQ_REMOVE(&drq, dr, dr_entry); + defrouter_del(dr); + } +} + +/* + * Nuke default router list entries toward ifp. + * We defer removal of default router list entries that is installed in the + * routing table, in order to keep additional side effects as small as possible. + */ +void +nd6_defrouter_purge(struct ifnet *ifp) +{ + struct nd_defrouter *dr, *ndr; + struct nd_drhead drq; + + TAILQ_INIT(&drq); + + ND6_WLOCK(); + TAILQ_FOREACH_SAFE(dr, &V_nd6_defrouter, dr_entry, ndr) { + if (dr->installed) + continue; + if (dr->ifp == ifp) + defrouter_unlink(dr, &drq); + } + TAILQ_FOREACH_SAFE(dr, &V_nd6_defrouter, dr_entry, ndr) { + if (!dr->installed) + continue; + if (dr->ifp == ifp) + defrouter_unlink(dr, &drq); + } + ND6_WUNLOCK(); + + /* Delete the unlinked router objects. 
*/ + while ((dr = TAILQ_FIRST(&drq)) != NULL) { + TAILQ_REMOVE(&drq, dr, dr_entry); + defrouter_del(dr); + } +} + +void +nd6_defrouter_flush_all(void) +{ + struct nd_defrouter *dr; + struct nd_drhead drq; + + TAILQ_INIT(&drq); + + ND6_WLOCK(); + while ((dr = TAILQ_FIRST(&V_nd6_defrouter)) != NULL) + defrouter_unlink(dr, &drq); + ND6_WUNLOCK(); + + while ((dr = TAILQ_FIRST(&drq)) != NULL) { + TAILQ_REMOVE(&drq, dr, dr_entry); + defrouter_del(dr); + } +} + +void +nd6_defrouter_init(void) +{ + + TAILQ_INIT(&V_nd6_defrouter); +} + +static int +nd6_sysctl_drlist(SYSCTL_HANDLER_ARGS) +{ + struct in6_defrouter d; + struct nd_defrouter *dr; + int error; + + if (req->newptr != NULL) + return (EPERM); + + error = sysctl_wire_old_buffer(req, 0); + if (error != 0) + return (error); + + bzero(&d, sizeof(d)); + d.rtaddr.sin6_family = AF_INET6; + d.rtaddr.sin6_len = sizeof(d.rtaddr); + + ND6_RLOCK(); + TAILQ_FOREACH(dr, &V_nd6_defrouter, dr_entry) { + d.rtaddr.sin6_addr = dr->rtaddr; + error = sa6_recoverscope(&d.rtaddr); + if (error != 0) + break; + d.flags = dr->raflags; + d.rtlifetime = dr->rtlifetime; + d.expire = dr->expire + (time_second - time_uptime); + d.if_index = dr->ifp->if_index; + error = SYSCTL_OUT(req, &d, sizeof(d)); + if (error != 0) + break; + } + ND6_RUNLOCK(); + return (error); +} +SYSCTL_PROC(_net_inet6_icmp6, ICMPV6CTL_ND6_DRLIST, nd6_drlist, + CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, + NULL, 0, nd6_sysctl_drlist, "S,in6_defrouter", + "NDP default router list"); diff --git a/freebsd/sys/netinet6/raw_ip6.c b/freebsd/sys/netinet6/raw_ip6.c index aa62b7e1..c33bca05 100644 --- a/freebsd/sys/netinet6/raw_ip6.c +++ b/freebsd/sys/netinet6/raw_ip6.c @@ -163,7 +163,7 @@ rip6_input(struct mbuf **mp, int *offp, int proto) struct ifnet *ifp; struct mbuf *m = *mp; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); - struct inpcb *in6p; + struct inpcb *inp; struct inpcb *last = NULL; struct mbuf *opts = NULL; struct sockaddr_in6 fromsa; @@ -176,18 +176,18 @@ 
rip6_input(struct mbuf **mp, int *offp, int proto) ifp = m->m_pkthdr.rcvif; INP_INFO_RLOCK_ET(&V_ripcbinfo, et); - CK_LIST_FOREACH(in6p, &V_ripcb, inp_list) { + CK_LIST_FOREACH(inp, &V_ripcb, inp_list) { /* XXX inp locking */ - if ((in6p->inp_vflag & INP_IPV6) == 0) + if ((inp->inp_vflag & INP_IPV6) == 0) continue; - if (in6p->inp_ip_p && - in6p->inp_ip_p != proto) + if (inp->inp_ip_p && + inp->inp_ip_p != proto) continue; - if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr) && - !IN6_ARE_ADDR_EQUAL(&in6p->in6p_laddr, &ip6->ip6_dst)) + if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) && + !IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &ip6->ip6_dst)) continue; - if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr) && - !IN6_ARE_ADDR_EQUAL(&in6p->in6p_faddr, &ip6->ip6_src)) + if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) && + !IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &ip6->ip6_src)) continue; if (last != NULL) { struct mbuf *n = m_copym(m, 0, M_COPYALL, M_NOWAIT); @@ -225,23 +225,23 @@ rip6_input(struct mbuf **mp, int *offp, int proto) INP_RUNLOCK(last); last = NULL; } - INP_RLOCK(in6p); - if (__predict_false(in6p->inp_flags2 & INP_FREED)) + INP_RLOCK(inp); + if (__predict_false(inp->inp_flags2 & INP_FREED)) goto skip_2; - if (jailed_without_vnet(in6p->inp_cred)) { + if (jailed_without_vnet(inp->inp_cred)) { /* * Allow raw socket in jail to receive multicast; * assume process had PRIV_NETINET_RAW at attach, * and fall through into normal filter path if so. 
*/ if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) && - prison_check_ip6(in6p->inp_cred, + prison_check_ip6(inp->inp_cred, &ip6->ip6_dst) != 0) goto skip_2; } - if (in6p->in6p_cksum != -1) { + if (inp->in6p_cksum != -1) { RIP6STAT_INC(rip6s_isum); - if (m->m_pkthdr.len - (*offp + in6p->in6p_cksum) < 2 || + if (m->m_pkthdr.len - (*offp + inp->in6p_cksum) < 2 || in6_cksum(m, proto, *offp, m->m_pkthdr.len - *offp)) { RIP6STAT_INC(rip6s_badsum); @@ -260,7 +260,7 @@ rip6_input(struct mbuf **mp, int *offp, int proto) * should receive it, as multicast filtering is now * the responsibility of the transport layer. */ - if (in6p->in6p_moptions && + if (inp->in6p_moptions && IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { /* * If the incoming datagram is for MLD, allow it @@ -290,7 +290,7 @@ rip6_input(struct mbuf **mp, int *offp, int proto) mcaddr.sin6_family = AF_INET6; mcaddr.sin6_addr = ip6->ip6_dst; - blocked = im6o_mc_filter(in6p->in6p_moptions, + blocked = im6o_mc_filter(inp->in6p_moptions, ifp, (struct sockaddr *)&mcaddr, (struct sockaddr *)&fromsa); @@ -300,10 +300,10 @@ rip6_input(struct mbuf **mp, int *offp, int proto) goto skip_2; } } - last = in6p; + last = inp; continue; skip_2: - INP_RUNLOCK(in6p); + INP_RUNLOCK(inp); } INP_INFO_RUNLOCK_ET(&V_ripcbinfo, et); #if defined(IPSEC) || defined(IPSEC_SUPPORT) @@ -396,7 +396,7 @@ rip6_output(struct mbuf *m, struct socket *so, ...) struct m_tag *mtag; struct sockaddr_in6 *dstsock; struct ip6_hdr *ip6; - struct inpcb *in6p; + struct inpcb *inp; u_int plen = m->m_pkthdr.len; int error = 0; struct ip6_pktopts opt, *optp; @@ -413,18 +413,18 @@ rip6_output(struct mbuf *m, struct socket *so, ...) 
control = va_arg(ap, struct mbuf *); va_end(ap); - in6p = sotoinpcb(so); - INP_WLOCK(in6p); + inp = sotoinpcb(so); + INP_WLOCK(inp); if (control != NULL) { if ((error = ip6_setpktopts(control, &opt, - in6p->in6p_outputopts, so->so_cred, + inp->in6p_outputopts, so->so_cred, so->so_proto->pr_protocol)) != 0) { goto bad; } optp = &opt; } else - optp = in6p->in6p_outputopts; + optp = inp->in6p_outputopts; /* * Check and convert scope zone ID into internal form. @@ -467,12 +467,12 @@ rip6_output(struct mbuf *m, struct socket *so, ...) /* * Source address selection. */ - error = in6_selectsrc_socket(dstsock, optp, in6p, so->so_cred, + error = in6_selectsrc_socket(dstsock, optp, inp, so->so_cred, scope_ambiguous, &in6a, &hlim); if (error) goto bad; - error = prison_check_ip6(in6p->inp_cred, &in6a); + error = prison_check_ip6(inp->inp_cred, &in6a); if (error != 0) goto bad; ip6->ip6_src = in6a; @@ -483,18 +483,18 @@ rip6_output(struct mbuf *m, struct socket *so, ...) * Fill in the rest of the IPv6 header fields. */ ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) | - (in6p->inp_flow & IPV6_FLOWINFO_MASK); + (inp->inp_flow & IPV6_FLOWINFO_MASK); ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) | (IPV6_VERSION & IPV6_VERSION_MASK); /* * ip6_plen will be filled in ip6_output, so not fill it here. */ - ip6->ip6_nxt = in6p->inp_ip_p; + ip6->ip6_nxt = inp->inp_ip_p; ip6->ip6_hlim = hlim; if (so->so_proto->pr_protocol == IPPROTO_ICMPV6 || - in6p->in6p_cksum != -1) { + inp->in6p_cksum != -1) { struct mbuf *n; int off; u_int16_t *p; @@ -503,7 +503,7 @@ rip6_output(struct mbuf *m, struct socket *so, ...) if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) off = offsetof(struct icmp6_hdr, icmp6_cksum); else - off = in6p->in6p_cksum; + off = inp->in6p_cksum; if (plen < off + 2) { error = EINVAL; goto bad; @@ -539,7 +539,7 @@ rip6_output(struct mbuf *m, struct socket *so, ...) 
} } - error = ip6_output(m, optp, NULL, 0, in6p->in6p_moptions, &oifp, in6p); + error = ip6_output(m, optp, NULL, 0, inp->in6p_moptions, &oifp, inp); if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) { if (oifp) icmp6_ifoutstat_inc(oifp, type, code); @@ -558,7 +558,7 @@ rip6_output(struct mbuf *m, struct socket *so, ...) ip6_clearpktopts(&opt, -1); m_freem(control); } - INP_WUNLOCK(in6p); + INP_WUNLOCK(inp); return (error); } diff --git a/freebsd/sys/netinet6/route6.c b/freebsd/sys/netinet6/route6.c index 7014daa6..908a5479 100644 --- a/freebsd/sys/netinet6/route6.c +++ b/freebsd/sys/netinet6/route6.c @@ -64,12 +64,16 @@ int route6_input(struct mbuf **mp, int *offp, int proto) { struct ip6_hdr *ip6; - struct mbuf *m = *mp; + struct mbuf *m; struct ip6_rthdr *rh; int off = *offp, rhlen; #ifdef __notyet__ struct ip6aux *ip6a; +#endif + + m = *mp; +#ifdef __notyet__ ip6a = ip6_findaux(m); if (ip6a) { /* XXX reject home-address option before rthdr */ @@ -81,18 +85,16 @@ route6_input(struct mbuf **mp, int *offp, int proto) } #endif -#ifndef PULLDOWN_TEST - IP6_EXTHDR_CHECK(m, off, sizeof(*rh), IPPROTO_DONE); + if (m->m_len < off + sizeof(*rh)) { + m = m_pullup(m, off + sizeof(*rh)); + if (m == NULL) { + IP6STAT_INC(ip6s_exthdrtoolong); + *mp = NULL; + return (IPPROTO_DONE); + } + } ip6 = mtod(m, struct ip6_hdr *); rh = (struct ip6_rthdr *)((caddr_t)ip6 + off); -#else - ip6 = mtod(m, struct ip6_hdr *); - IP6_EXTHDR_GET(rh, struct ip6_rthdr *, m, off, sizeof(*rh)); - if (rh == NULL) { - IP6STAT_INC(ip6s_tooshort); - return IPPROTO_DONE; - } -#endif /* * While this switch may look gratuitous, leave it in @@ -108,9 +110,11 @@ route6_input(struct mbuf **mp, int *offp, int proto) IP6STAT_INC(ip6s_badoptions); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, (caddr_t)&rh->ip6r_type - (caddr_t)ip6); + *mp = NULL; return (IPPROTO_DONE); } *offp += rhlen; + *mp = m; return (rh->ip6r_nxt); } diff --git a/freebsd/sys/netinet6/sctp6_usrreq.c 
b/freebsd/sys/netinet6/sctp6_usrreq.c index 3465f3c3..d3421894 100644 --- a/freebsd/sys/netinet6/sctp6_usrreq.c +++ b/freebsd/sys/netinet6/sctp6_usrreq.c @@ -107,13 +107,15 @@ sctp6_input_with_port(struct mbuf **i_pak, int *offp, uint16_t port) SCTP_STAT_INCR_COUNTER64(sctps_inpackets); /* Get IP, SCTP, and first chunk header together in the first mbuf. */ offset = iphlen + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr); - ip6 = mtod(m, struct ip6_hdr *); - IP6_EXTHDR_GET(sh, struct sctphdr *, m, iphlen, - (int)(sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr))); - if (sh == NULL) { - SCTP_STAT_INCR(sctps_hdrops); - return (IPPROTO_DONE); + if (m->m_len < offset) { + m = m_pullup(m, offset); + if (m == NULL) { + SCTP_STAT_INCR(sctps_hdrops); + return (IPPROTO_DONE); + } } + ip6 = mtod(m, struct ip6_hdr *); + sh = (struct sctphdr *)(mtod(m, caddr_t) + iphlen); ch = (struct sctp_chunkhdr *)((caddr_t)sh + sizeof(struct sctphdr)); offset -= sizeof(struct sctp_chunkhdr); memset(&src, 0, sizeof(struct sockaddr_in6)); @@ -522,7 +524,6 @@ sctp_must_try_again: static int sctp6_attach(struct socket *so, int proto SCTP_UNUSED, struct thread *p SCTP_UNUSED) { - struct in6pcb *inp6; int error; struct sctp_inpcb *inp; uint32_t vrf_id = SCTP_DEFAULT_VRFID; @@ -544,18 +545,17 @@ sctp6_attach(struct socket *so, int proto SCTP_UNUSED, struct thread *p SCTP_UNU inp = (struct sctp_inpcb *)so->so_pcb; SCTP_INP_WLOCK(inp); inp->sctp_flags |= SCTP_PCB_FLAGS_BOUND_V6; /* I'm v6! */ - inp6 = (struct in6pcb *)inp; - inp6->inp_vflag |= INP_IPV6; - inp6->in6p_hops = -1; /* use kernel default */ - inp6->in6p_cksum = -1; /* just to be sure */ + inp->ip_inp.inp.inp_vflag |= INP_IPV6; + inp->ip_inp.inp.in6p_hops = -1; /* use kernel default */ + inp->ip_inp.inp.in6p_cksum = -1; /* just to be sure */ #ifdef INET /* * XXX: ugly!! 
IPv4 TTL initialization is necessary for an IPv6 * socket as well, because the socket may be bound to an IPv6 * wildcard address, which may match an IPv4-mapped IPv6 address. */ - inp6->inp_ip_ttl = MODULE_GLOBAL(ip_defttl); + inp->ip_inp.inp.inp_ip_ttl = MODULE_GLOBAL(ip_defttl); #endif SCTP_INP_WUNLOCK(inp); return (0); @@ -565,8 +565,8 @@ static int sctp6_bind(struct socket *so, struct sockaddr *addr, struct thread *p) { struct sctp_inpcb *inp; - struct in6pcb *inp6; int error; + u_char vflagsav; inp = (struct sctp_inpcb *)so->so_pcb; if (inp == NULL) { @@ -597,16 +597,16 @@ sctp6_bind(struct socket *so, struct sockaddr *addr, struct thread *p) return (EINVAL); } } - inp6 = (struct in6pcb *)inp; - inp6->inp_vflag &= ~INP_IPV4; - inp6->inp_vflag |= INP_IPV6; - if ((addr != NULL) && (SCTP_IPV6_V6ONLY(inp6) == 0)) { + vflagsav = inp->ip_inp.inp.inp_vflag; + inp->ip_inp.inp.inp_vflag &= ~INP_IPV4; + inp->ip_inp.inp.inp_vflag |= INP_IPV6; + if ((addr != NULL) && (SCTP_IPV6_V6ONLY(&inp->ip_inp.inp) == 0)) { switch (addr->sa_family) { #ifdef INET case AF_INET: /* binding v4 addr to v6 socket, so reset flags */ - inp6->inp_vflag |= INP_IPV4; - inp6->inp_vflag &= ~INP_IPV6; + inp->ip_inp.inp.inp_vflag |= INP_IPV4; + inp->ip_inp.inp.inp_vflag &= ~INP_IPV6; break; #endif #ifdef INET6 @@ -617,17 +617,17 @@ sctp6_bind(struct socket *so, struct sockaddr *addr, struct thread *p) sin6_p = (struct sockaddr_in6 *)addr; if (IN6_IS_ADDR_UNSPECIFIED(&sin6_p->sin6_addr)) { - inp6->inp_vflag |= INP_IPV4; + inp->ip_inp.inp.inp_vflag |= INP_IPV4; } #ifdef INET if (IN6_IS_ADDR_V4MAPPED(&sin6_p->sin6_addr)) { struct sockaddr_in sin; in6_sin6_2_sin(&sin, sin6_p); - inp6->inp_vflag |= INP_IPV4; - inp6->inp_vflag &= ~INP_IPV6; + inp->ip_inp.inp.inp_vflag |= INP_IPV4; + inp->ip_inp.inp.inp_vflag &= ~INP_IPV6; error = sctp_inpcb_bind(so, (struct sockaddr *)&sin, NULL, p); - return (error); + goto out; } #endif break; @@ -644,7 +644,8 @@ sctp6_bind(struct socket *so, struct sockaddr *addr, 
struct thread *p) if (addr->sa_family == AF_INET) { /* can't bind v4 addr to v6 only socket! */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL); - return (EINVAL); + error = EINVAL; + goto out; } #endif sin6_p = (struct sockaddr_in6 *)addr; @@ -653,10 +654,14 @@ sctp6_bind(struct socket *so, struct sockaddr *addr, struct thread *p) /* can't bind v4-mapped addrs either! */ /* NOTE: we don't support SIIT */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL); - return (EINVAL); + error = EINVAL; + goto out; } } error = sctp_inpcb_bind(so, addr, NULL, p); +out: + if (error != 0) + inp->ip_inp.inp.inp_vflag = vflagsav; return (error); } @@ -687,7 +692,6 @@ sctp6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *p) { struct sctp_inpcb *inp; - struct in6pcb *inp6; #ifdef INET struct sockaddr_in6 *sin6; @@ -704,7 +708,6 @@ sctp6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL); return (EINVAL); } - inp6 = (struct in6pcb *)inp; /* * For the TCP model we may get a NULL addr, if we are a connected * socket thats ok. 
@@ -724,7 +727,7 @@ sctp6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, } #ifdef INET sin6 = (struct sockaddr_in6 *)addr; - if (SCTP_IPV6_V6ONLY(inp6)) { + if (SCTP_IPV6_V6ONLY(inp)) { /* * if IPV6_V6ONLY flag, we discard datagrams destined to a * v4 addr or v4-mapped addr @@ -793,14 +796,10 @@ sctp6_connect(struct socket *so, struct sockaddr *addr, struct thread *p) struct sctp_inpcb *inp; struct sctp_tcb *stcb; #ifdef INET - struct in6pcb *inp6; struct sockaddr_in6 *sin6; union sctp_sockstore store; #endif -#ifdef INET - inp6 = (struct in6pcb *)so->so_pcb; -#endif inp = (struct sctp_inpcb *)so->so_pcb; if (inp == NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, ECONNRESET); @@ -858,7 +857,7 @@ sctp6_connect(struct socket *so, struct sockaddr *addr, struct thread *p) } #ifdef INET sin6 = (struct sockaddr_in6 *)addr; - if (SCTP_IPV6_V6ONLY(inp6)) { + if (SCTP_IPV6_V6ONLY(inp)) { /* * if IPV6_V6ONLY flag, ignore connections destined to a v4 * addr or v4-mapped addr @@ -1100,10 +1099,10 @@ sctp6_peeraddr(struct socket *so, struct sockaddr **addr) static int sctp6_in6getaddr(struct socket *so, struct sockaddr **nam) { - struct in6pcb *inp6 = sotoin6pcb(so); + struct inpcb *inp = sotoinpcb(so); int error; - if (inp6 == NULL) { + if (inp == NULL) { SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL); return (EINVAL); } @@ -1136,10 +1135,10 @@ sctp6_in6getaddr(struct socket *so, struct sockaddr **nam) static int sctp6_getpeeraddr(struct socket *so, struct sockaddr **nam) { - struct in6pcb *inp6 = sotoin6pcb(so); + struct inpcb *inp = sotoinpcb(so); int error; - if (inp6 == NULL) { + if (inp == NULL) { SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL); return (EINVAL); } diff --git a/freebsd/sys/netinet6/udp6_usrreq.c b/freebsd/sys/netinet6/udp6_usrreq.c index 270b4880..845d0dc9 100644 --- a/freebsd/sys/netinet6/udp6_usrreq.c +++ b/freebsd/sys/netinet6/udp6_usrreq.c @@ -224,16 +224,16 @@ 
udp6_input(struct mbuf **mp, int *offp, int proto) ifp = m->m_pkthdr.rcvif; -#ifndef PULLDOWN_TEST - IP6_EXTHDR_CHECK(m, off, sizeof(struct udphdr), IPPROTO_DONE); + if (m->m_len < off + sizeof(struct udphdr)) { + m = m_pullup(m, off + sizeof(struct udphdr)); + if (m == NULL) { + IP6STAT_INC(ip6s_exthdrtoolong); + *mp = NULL; + return (IPPROTO_DONE); + } + } ip6 = mtod(m, struct ip6_hdr *); uh = (struct udphdr *)((caddr_t)ip6 + off); -#else - IP6_EXTHDR_GET(uh, struct udphdr *, m, off, sizeof(*uh)); - if (!uh) - return (IPPROTO_DONE); - ip6 = mtod(m, struct ip6_hdr *); -#endif UDPSTAT_INC(udps_ipackets); @@ -396,8 +396,11 @@ udp6_input(struct mbuf **mp, int *offp, int proto) else UDP_PROBE(receive, NULL, last, ip6, last, uh); - if (udp6_append(last, n, off, fromsa)) + if (udp6_append(last, n, off, fromsa)) { + /* XXX-BZ do we leak m here? */ + *mp = NULL; goto inp_lost; + } } INP_RUNLOCK(last); } @@ -438,6 +441,7 @@ udp6_input(struct mbuf **mp, int *offp, int proto) INP_RUNLOCK(last); inp_lost: INP_INFO_RUNLOCK_ET(pcbinfo, et); + *mp = NULL; return (IPPROTO_DONE); } /* @@ -481,7 +485,7 @@ udp6_input(struct mbuf **mp, int *offp, int proto) INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, m->m_pkthdr.rcvif, m); if (inp == NULL) { - if (udp_log_in_vain) { + if (V_udp_log_in_vain) { char ip6bufs[INET6_ADDRSTRLEN]; char ip6bufd[INET6_ADDRSTRLEN]; @@ -505,6 +509,7 @@ udp6_input(struct mbuf **mp, int *offp, int proto) if (V_udp_blackhole) goto badunlocked; icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOPORT, 0); + *mp = NULL; return (IPPROTO_DONE); } INP_RLOCK_ASSERT(inp); @@ -513,6 +518,7 @@ udp6_input(struct mbuf **mp, int *offp, int proto) if (up->u_rxcslen == 0 || up->u_rxcslen > ulen) { INP_RUNLOCK(inp); m_freem(m); + *mp = NULL; return (IPPROTO_DONE); } } @@ -522,6 +528,7 @@ udp6_input(struct mbuf **mp, int *offp, int proto) UDP_PROBE(receive, NULL, inp, ip6, inp, uh); if (udp6_append(inp, m, off, fromsa) == 0) INP_RUNLOCK(inp); + *mp = NULL; return (IPPROTO_DONE); 
badheadlocked: @@ -529,6 +536,7 @@ badheadlocked: badunlocked: if (m) m_freem(m); + *mp = NULL; return (IPPROTO_DONE); } @@ -1145,6 +1153,7 @@ udp6_bind(struct socket *so, struct sockaddr *nam, struct thread *td) struct inpcb *inp; struct inpcbinfo *pcbinfo; int error; + u_char vflagsav; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); @@ -1152,6 +1161,7 @@ udp6_bind(struct socket *so, struct sockaddr *nam, struct thread *td) INP_WLOCK(inp); INP_HASH_WLOCK(pcbinfo); + vflagsav = inp->inp_vflag; inp->inp_vflag &= ~INP_IPV4; inp->inp_vflag |= INP_IPV6; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { @@ -1179,6 +1189,8 @@ udp6_bind(struct socket *so, struct sockaddr *nam, struct thread *td) #ifdef INET out: #endif + if (error != 0) + inp->inp_vflag = vflagsav; INP_HASH_WUNLOCK(pcbinfo); INP_WUNLOCK(inp); return (error); @@ -1225,6 +1237,7 @@ udp6_connect(struct socket *so, struct sockaddr *nam, struct thread *td) struct inpcbinfo *pcbinfo; struct sockaddr_in6 *sin6; int error; + u_char vflagsav; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); @@ -1252,17 +1265,26 @@ udp6_connect(struct socket *so, struct sockaddr *nam, struct thread *td) goto out; } in6_sin6_2_sin(&sin, sin6); - inp->inp_vflag |= INP_IPV4; - inp->inp_vflag &= ~INP_IPV6; error = prison_remote_ip4(td->td_ucred, &sin.sin_addr); if (error != 0) goto out; + vflagsav = inp->inp_vflag; + inp->inp_vflag |= INP_IPV4; + inp->inp_vflag &= ~INP_IPV6; INP_HASH_WLOCK(pcbinfo); error = in_pcbconnect(inp, (struct sockaddr *)&sin, td->td_ucred); INP_HASH_WUNLOCK(pcbinfo); + /* + * If connect succeeds, mark socket as connected. If + * connect fails and socket is unbound, reset inp_vflag + * field. 
+ */ if (error == 0) soisconnected(so); + else if (inp->inp_laddr.s_addr == INADDR_ANY && + inp->inp_lport == 0) + inp->inp_vflag = vflagsav; goto out; } else { if ((inp->inp_vflag & INP_IPV6) == 0) { @@ -1275,16 +1297,25 @@ udp6_connect(struct socket *so, struct sockaddr *nam, struct thread *td) error = EISCONN; goto out; } - inp->inp_vflag &= ~INP_IPV4; - inp->inp_vflag |= INP_IPV6; error = prison_remote_ip6(td->td_ucred, &sin6->sin6_addr); if (error != 0) goto out; + vflagsav = inp->inp_vflag; + inp->inp_vflag &= ~INP_IPV4; + inp->inp_vflag |= INP_IPV6; INP_HASH_WLOCK(pcbinfo); error = in6_pcbconnect(inp, nam, td->td_ucred); INP_HASH_WUNLOCK(pcbinfo); + /* + * If connect succeeds, mark socket as connected. If + * connect fails and socket is unbound, reset inp_vflag + * field. + */ if (error == 0) soisconnected(so); + else if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) && + inp->inp_lport == 0) + inp->inp_vflag = vflagsav; out: INP_WUNLOCK(inp); return (error); diff --git a/freebsd/sys/netipsec/xform_ah.c b/freebsd/sys/netipsec/xform_ah.c index 618fbd9b..afe26445 100644 --- a/freebsd/sys/netipsec/xform_ah.c +++ b/freebsd/sys/netipsec/xform_ah.c @@ -577,14 +577,16 @@ ah_input(struct mbuf *m, struct secasvar *sav, int skip, int protoff) /* Figure out header size. */ rplen = HDRSIZE(sav); - /* XXX don't pullup, just copy header */ - IP6_EXTHDR_GET(ah, struct newah *, m, skip, rplen); - if (ah == NULL) { - DPRINTF(("ah_input: cannot pullup header\n")); - AHSTAT_INC(ahs_hdrops); /*XXX*/ - error = ENOBUFS; - goto bad; + if (m->m_len < skip + rplen) { + m = m_pullup(m, skip + rplen); + if (m == NULL) { + DPRINTF(("ah_input: cannot pullup header\n")); + AHSTAT_INC(ahs_hdrops); /*XXX*/ + error = ENOBUFS; + goto bad; + } } + ah = (struct newah *)(mtod(m, caddr_t) + skip); /* Check replay window, if applicable. 
*/ SECASVAR_LOCK(sav); diff --git a/freebsd/sys/netipsec/xform_esp.c b/freebsd/sys/netipsec/xform_esp.c index f5752a96..da64655d 100644 --- a/freebsd/sys/netipsec/xform_esp.c +++ b/freebsd/sys/netipsec/xform_esp.c @@ -309,8 +309,17 @@ esp_input(struct mbuf *m, struct secasvar *sav, int skip, int protoff) ESPSTAT_INC(esps_badilen); goto bad; } - /* XXX don't pullup, just copy header */ - IP6_EXTHDR_GET(esp, struct newesp *, m, skip, sizeof (struct newesp)); + + if (m->m_len < skip + sizeof(*esp)) { + m = m_pullup(m, skip + sizeof(*esp)); + if (m == NULL) { + DPRINTF(("%s: cannot pullup header\n", __func__)); + ESPSTAT_INC(esps_hdrops); /*XXX*/ + error = ENOBUFS; + goto bad; + } + } + esp = (struct newesp *)(mtod(m, caddr_t) + skip); esph = sav->tdb_authalgxform; espx = sav->tdb_encalgxform; @@ -609,6 +618,13 @@ esp_input_cb(struct cryptop *crp) } } + /* + * RFC4303 2.6: + * Silently drop packet if next header field is IPPROTO_NONE. + */ + if (lastthree[2] == IPPROTO_NONE) + goto bad; + /* Trim the mbuf chain to remove trailing authenticator and padding */ m_adj(m, -(lastthree[1] + 2)); diff --git a/freebsd/sys/netpfil/pf/pf.c b/freebsd/sys/netpfil/pf/pf.c index 4f9da55b..6bc1b8c8 100644 --- a/freebsd/sys/netpfil/pf/pf.c +++ b/freebsd/sys/netpfil/pf/pf.c @@ -46,6 +46,7 @@ __FBSDID("$FreeBSD$"); #include <rtems/bsd/local/opt_inet6.h> #include <rtems/bsd/local/opt_bpf.h> #include <rtems/bsd/local/opt_pf.h> +#include <rtems/bsd/local/opt_sctp.h> #include <sys/param.h> #include <sys/bus.h> @@ -105,6 +106,10 @@ __FBSDID("$FreeBSD$"); #include <netinet6/scope6_var.h> #endif /* INET6 */ +#ifdef SCTP +#include <netinet/sctp_crc32.h> +#endif + #include <machine/in_cksum.h> #include <security/mac/mac_framework.h> @@ -5601,7 +5606,7 @@ pf_route(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp, } #ifdef SCTP if (m0->m_pkthdr.csum_flags & CSUM_SCTP & ~ifp->if_hwassist) { - sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2)); + sctp_delayed_cksum(m0, 
(uint32_t)(ip->ip_hl << 2)); m0->m_pkthdr.csum_flags &= ~CSUM_SCTP; } #endif diff --git a/freebsd/sys/opencrypto/cryptodev.c b/freebsd/sys/opencrypto/cryptodev.c index 02a03034..d3f4ad1c 100644 --- a/freebsd/sys/opencrypto/cryptodev.c +++ b/freebsd/sys/opencrypto/cryptodev.c @@ -268,6 +268,7 @@ crypt_kop_to_32(const struct crypt_kop *from, struct crypt_kop32 *to) struct csession { TAILQ_ENTRY(csession) next; crypto_session_t cses; + volatile u_int refs; u_int32_t ses; struct mtx lock; /* for op submission */ @@ -294,6 +295,7 @@ struct cryptop_data { struct fcrypt { TAILQ_HEAD(csessionlist, csession) csessions; int sesn; + struct mtx lock; }; static struct timeval warninterval = { .tv_sec = 60, .tv_usec = 0 }; @@ -330,8 +332,7 @@ static const rtems_filesystem_file_handlers_r cryptofops; #endif /* __rtems__ */ static struct csession *csefind(struct fcrypt *, u_int); -static int csedelete(struct fcrypt *, struct csession *); -static struct csession *cseadd(struct fcrypt *, struct csession *); +static bool csedelete(struct fcrypt *, u_int); static struct csession *csecreate(struct fcrypt *, crypto_session_t, caddr_t, u_int64_t, caddr_t, u_int64_t, u_int32_t, u_int32_t, struct enc_xform *, struct auth_hash *); @@ -398,8 +399,6 @@ cryptof_ioctl( struct crypt_op copc; struct crypt_kop kopc; #endif - static struct timeval arc4warn, blfwarn, castwarn, deswarn, md5warn; - static struct timeval skipwarn, tdeswarn; switch (cmd) { case CIOCGSESSION: @@ -420,28 +419,18 @@ cryptof_ioctl( case 0: break; case CRYPTO_DES_CBC: - if (ratecheck(&deswarn, &warninterval)) - gone_in(13, "DES cipher via /dev/crypto"); txform = &enc_xform_des; break; case CRYPTO_3DES_CBC: - if (ratecheck(&tdeswarn, &warninterval)) - gone_in(13, "3DES cipher via /dev/crypto"); txform = &enc_xform_3des; break; case CRYPTO_BLF_CBC: - if (ratecheck(&blfwarn, &warninterval)) - gone_in(13, "Blowfish cipher via /dev/crypto"); txform = &enc_xform_blf; break; case CRYPTO_CAST_CBC: - if (ratecheck(&castwarn, 
&warninterval)) - gone_in(13, "CAST128 cipher via /dev/crypto"); txform = &enc_xform_cast5; break; case CRYPTO_SKIPJACK_CBC: - if (ratecheck(&skipwarn, &warninterval)) - gone_in(13, "Skipjack cipher via /dev/crypto"); txform = &enc_xform_skipjack; break; case CRYPTO_AES_CBC: @@ -454,8 +443,6 @@ cryptof_ioctl( txform = &enc_xform_null; break; case CRYPTO_ARC4: - if (ratecheck(&arc4warn, &warninterval)) - gone_in(13, "ARC4 cipher via /dev/crypto"); txform = &enc_xform_arc4; break; case CRYPTO_CAMELLIA_CBC: @@ -484,9 +471,6 @@ cryptof_ioctl( case 0: break; case CRYPTO_MD5_HMAC: - if (ratecheck(&md5warn, &warninterval)) - gone_in(13, - "MD5-HMAC authenticator via /dev/crypto"); thash = &auth_hash_hmac_md5; break; case CRYPTO_POLY1305: @@ -608,8 +592,8 @@ cryptof_ioctl( if (thash) { cria.cri_alg = thash->type; cria.cri_klen = sop->mackeylen * 8; - if (thash->keysize != 0 && - sop->mackeylen > thash->keysize) { + if (sop->mackeylen > thash->keysize || + sop->mackeylen < 0) { CRYPTDEB("invalid mac key length"); error = EINVAL; SDT_PROBE1(opencrypto, dev, ioctl, error, @@ -692,13 +676,10 @@ bail: break; case CIOCFSESSION: ses = *(u_int32_t *)data; - cse = csefind(fcr, ses); - if (cse == NULL) { + if (!csedelete(fcr, ses)) { SDT_PROBE1(opencrypto, dev, ioctl, error, __LINE__); return (EINVAL); } - csedelete(fcr, cse); - csefree(cse); break; case CIOCCRYPT: #ifdef COMPAT_FREEBSD32 @@ -715,6 +696,7 @@ bail: return (EINVAL); } error = cryptodev_op(cse, cop, active_cred, td); + csefree(cse); #ifdef COMPAT_FREEBSD32 if (error == 0 && cmd == CIOCCRYPT32) crypt_op_to_32(cop, data); @@ -781,6 +763,7 @@ bail: return (EINVAL); } error = cryptodev_aead(cse, caead, active_cred, td); + csefree(cse); break; default: error = EINVAL; @@ -843,6 +826,47 @@ cod_free(struct cryptop_data *cod) free(cod, M_XDATA); } +static void +cryptodev_warn(struct csession *cse) +{ + static struct timeval arc4warn, blfwarn, castwarn, deswarn, md5warn; + static struct timeval skipwarn, tdeswarn; + + switch 
(cse->cipher) { + case CRYPTO_DES_CBC: + if (ratecheck(&deswarn, &warninterval)) + gone_in(13, "DES cipher via /dev/crypto"); + break; + case CRYPTO_3DES_CBC: + if (ratecheck(&tdeswarn, &warninterval)) + gone_in(13, "3DES cipher via /dev/crypto"); + break; + case CRYPTO_BLF_CBC: + if (ratecheck(&blfwarn, &warninterval)) + gone_in(13, "Blowfish cipher via /dev/crypto"); + break; + case CRYPTO_CAST_CBC: + if (ratecheck(&castwarn, &warninterval)) + gone_in(13, "CAST128 cipher via /dev/crypto"); + break; + case CRYPTO_SKIPJACK_CBC: + if (ratecheck(&skipwarn, &warninterval)) + gone_in(13, "Skipjack cipher via /dev/crypto"); + break; + case CRYPTO_ARC4: + if (ratecheck(&arc4warn, &warninterval)) + gone_in(13, "ARC4 cipher via /dev/crypto"); + break; + } + + switch (cse->mac) { + case CRYPTO_MD5_HMAC: + if (ratecheck(&md5warn, &warninterval)) + gone_in(13, "MD5-HMAC authenticator via /dev/crypto"); + break; + } +} + static int cryptodev_op( struct csession *cse, @@ -965,6 +989,7 @@ cryptodev_op( error = EINVAL; goto bail; } + cryptodev_warn(cse); again: /* @@ -1134,6 +1159,7 @@ cryptodev_aead( SDT_PROBE1(opencrypto, dev, ioctl, error, __LINE__); goto bail; } + cryptodev_warn(cse); again: /* * Let the dispatch run unlocked, then, interlock against the @@ -1383,6 +1409,9 @@ cryptof_close(struct file *fp, struct thread *td) while ((cse = TAILQ_FIRST(&fcr->csessions))) { TAILQ_REMOVE(&fcr->csessions, cse, next); + KASSERT(cse->refs == 1, + ("%s: crypto session %p with %d refs", __func__, cse, + cse->refs)); csefree(cse); } free(fcr, M_XDATA); @@ -1425,34 +1454,36 @@ csefind(struct fcrypt *fcr, u_int ses) { struct csession *cse; - TAILQ_FOREACH(cse, &fcr->csessions, next) - if (cse->ses == ses) + mtx_lock(&fcr->lock); + TAILQ_FOREACH(cse, &fcr->csessions, next) { + if (cse->ses == ses) { + refcount_acquire(&cse->refs); + mtx_unlock(&fcr->lock); return (cse); + } + } + mtx_unlock(&fcr->lock); return (NULL); } -static int -csedelete(struct fcrypt *fcr, struct csession *cse_del) 
+static bool +csedelete(struct fcrypt *fcr, u_int ses) { struct csession *cse; + mtx_lock(&fcr->lock); TAILQ_FOREACH(cse, &fcr->csessions, next) { - if (cse == cse_del) { + if (cse->ses == ses) { TAILQ_REMOVE(&fcr->csessions, cse, next); - return (1); + mtx_unlock(&fcr->lock); + csefree(cse); + return (true); } } - return (0); + mtx_unlock(&fcr->lock); + return (false); } -static struct csession * -cseadd(struct fcrypt *fcr, struct csession *cse) -{ - TAILQ_INSERT_TAIL(&fcr->csessions, cse, next); - cse->ses = fcr->sesn++; - return (cse); -} - struct csession * csecreate(struct fcrypt *fcr, crypto_session_t cses, caddr_t key, u_int64_t keylen, caddr_t mackey, u_int64_t mackeylen, u_int32_t cipher, u_int32_t mac, @@ -1464,6 +1495,7 @@ csecreate(struct fcrypt *fcr, crypto_session_t cses, caddr_t key, u_int64_t keyl if (cse == NULL) return NULL; mtx_init(&cse->lock, "cryptodev", "crypto session lock", MTX_DEF); + refcount_init(&cse->refs, 1); cse->key = key; cse->keylen = keylen/8; cse->mackey = mackey; @@ -1473,7 +1505,10 @@ csecreate(struct fcrypt *fcr, crypto_session_t cses, caddr_t key, u_int64_t keyl cse->mac = mac; cse->txform = txform; cse->thash = thash; - cseadd(fcr, cse); + mtx_lock(&fcr->lock); + TAILQ_INSERT_TAIL(&fcr->csessions, cse, next); + cse->ses = fcr->sesn++; + mtx_unlock(&fcr->lock); return (cse); } @@ -1481,6 +1516,8 @@ static void csefree(struct csession *cse) { + if (!refcount_release(&cse->refs)) + return; crypto_freesession(cse->cses); mtx_destroy(&cse->lock); if (cse->key) @@ -1517,13 +1554,14 @@ cryptoioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, struct thread switch (cmd) { case CRIOGET: - fcr = malloc(sizeof(struct fcrypt), M_XDATA, M_WAITOK); + fcr = malloc(sizeof(struct fcrypt), M_XDATA, M_WAITOK | M_ZERO); TAILQ_INIT(&fcr->csessions); - fcr->sesn = 0; + mtx_init(&fcr->lock, "fcrypt", NULL, MTX_DEF); error = falloc(td, &f, &fd, 0); if (error) { + mtx_destroy(&fcr->lock); free(fcr, M_XDATA); return (error); } diff --git 
a/freebsd/sys/sys/buf.h b/freebsd/sys/sys/buf.h index a099a972..209174b4 100644 --- a/freebsd/sys/sys/buf.h +++ b/freebsd/sys/sys/buf.h @@ -450,7 +450,7 @@ buf_countdeps(struct buf *bp, int i) } static __inline void -buf_track(struct buf *bp, const char *location) +buf_track(struct buf *bp __unused, const char *location __unused) { #if defined(FULL_BUF_TRACKING) diff --git a/freebsd/sys/sys/bus.h b/freebsd/sys/sys/bus.h index 1ac476a4..48babb3a 100644 --- a/freebsd/sys/sys/bus.h +++ b/freebsd/sys/sys/bus.h @@ -563,6 +563,7 @@ int bus_child_present(device_t child); int bus_child_pnpinfo_str(device_t child, char *buf, size_t buflen); int bus_child_location_str(device_t child, char *buf, size_t buflen); void bus_enumerate_hinted_children(device_t bus); +int bus_delayed_attach_children(device_t bus); static __inline struct resource * bus_alloc_resource_any(device_t dev, int type, int *rid, u_int flags) diff --git a/freebsd/sys/sys/conf.h b/freebsd/sys/sys/conf.h index 4ace162f..d6215ba9 100644 --- a/freebsd/sys/sys/conf.h +++ b/freebsd/sys/sys/conf.h @@ -66,7 +66,7 @@ struct cdev { #define SI_ETERNAL 0x0001 /* never destroyed */ #define SI_ALIAS 0x0002 /* carrier of alias name */ #define SI_NAMED 0x0004 /* make_dev{_alias} has been called */ -#define SI_CHEAPCLONE 0x0008 /* can be removed_dev'ed when vnode reclaims */ +#define SI_UNUSED1 0x0008 /* unused */ #define SI_CHILD 0x0010 /* child of another struct cdev **/ #define SI_DUMPDEV 0x0080 /* is kernel dumpdev */ #define SI_CLONELIST 0x0200 /* on a clone list */ diff --git a/freebsd/sys/sys/kernel.h b/freebsd/sys/sys/kernel.h index 41a5233a..fb9ad6ac 100644 --- a/freebsd/sys/sys/kernel.h +++ b/freebsd/sys/sys/kernel.h @@ -475,6 +475,8 @@ struct tunable_str { #define TUNABLE_LONG_FETCH(path, var) #define TUNABLE_ULONG(path, var) #define TUNABLE_ULONG_FETCH(path, var) +#define TUNABLE_UINT64(path, var) +#define TUNABLE_UINT64_FETCH(path, var) #define TUNABLE_QUAD(path, var) #define TUNABLE_QUAD_FETCH(path, var) #define 
TUNABLE_STR(path, var, size) diff --git a/freebsd/sys/sys/linker.h b/freebsd/sys/sys/linker.h index 8aae31d9..10baaa03 100644 --- a/freebsd/sys/sys/linker.h +++ b/freebsd/sys/sys/linker.h @@ -97,6 +97,11 @@ struct linker_file { */ int nenabled; /* number of enabled probes. */ int fbt_nentries; /* number of fbt entries created. */ + +#ifdef __arm__ + caddr_t exidx_addr; /* Unwind data index table start */ + size_t exidx_size; /* Unwind data index table size */ +#endif }; /* diff --git a/freebsd/sys/sys/malloc.h b/freebsd/sys/sys/malloc.h index 83510329..56b17f36 100644 --- a/freebsd/sys/sys/malloc.h +++ b/freebsd/sys/sys/malloc.h @@ -185,7 +185,11 @@ void *contigmalloc_domainset(unsigned long size, struct malloc_type *type, unsigned long alignment, vm_paddr_t boundary) __malloc_like __result_use_check __alloc_size(1) __alloc_align(7); void free(void *addr, struct malloc_type *type); +#ifndef __rtems__ void free_domain(void *addr, struct malloc_type *type); +#else /* __rtems__ */ +#define free_domain(addr, type) free(addr, type) +#endif /* __rtems__ */ #ifndef __rtems__ void *malloc(size_t size, struct malloc_type *type, int flags) __malloc_like __result_use_check __alloc_size(1); @@ -250,9 +254,13 @@ void *_bsd_malloc(size_t size, struct malloc_type *type, int flags) _malloc_item; \ }) +#ifndef __rtems__ void *malloc_domainset(size_t size, struct malloc_type *type, struct domainset *ds, int flags) __malloc_like __result_use_check __alloc_size(1); +#else /* __rtems__ */ +#define malloc_domainset(size, type, ds, flags) malloc(size, type, flags) +#endif /* __rtems__ */ void *mallocarray(size_t nmemb, size_t size, struct malloc_type *type, int flags) __malloc_like __result_use_check __alloc_size2(1, 2); diff --git a/freebsd/sys/sys/mbuf.h b/freebsd/sys/sys/mbuf.h index 634f7d9e..badc7eef 100644 --- a/freebsd/sys/sys/mbuf.h +++ b/freebsd/sys/sys/mbuf.h @@ -521,6 +521,8 @@ struct mbuf { #define CSUM_L5_VALID 0x20000000 /* checksum is correct */ #define CSUM_COALESCED 
0x40000000 /* contains merged segments */ +#define CSUM_SND_TAG 0x80000000 /* Packet header has send tag */ + /* * CSUM flag description for use with printf(9) %b identifier. */ @@ -530,7 +532,7 @@ struct mbuf { "\12CSUM_IP6_UDP\13CSUM_IP6_TCP\14CSUM_IP6_SCTP\15CSUM_IP6_TSO" \ "\16CSUM_IP6_ISCSI" \ "\31CSUM_L3_CALC\32CSUM_L3_VALID\33CSUM_L4_CALC\34CSUM_L4_VALID" \ - "\35CSUM_L5_CALC\36CSUM_L5_VALID\37CSUM_COALESCED" + "\35CSUM_L5_CALC\36CSUM_L5_VALID\37CSUM_COALESCED\40CSUM_SND_TAG" /* CSUM flags compatibility mappings. */ #define CSUM_IP_CHECKED CSUM_L3_CALC diff --git a/freebsd/sys/sys/mount.h b/freebsd/sys/sys/mount.h index dabb506d..698716f5 100644 --- a/freebsd/sys/sys/mount.h +++ b/freebsd/sys/sys/mount.h @@ -396,6 +396,7 @@ void __mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *); #define MNTK_UNMAPPED_BUFS 0x00002000 #define MNTK_USES_BCACHE 0x00004000 /* FS uses the buffer cache. */ #define MNTK_TEXT_REFS 0x00008000 /* Keep use ref for text */ +#define MNTK_VMSETSIZE_BUG 0x00010000 #define MNTK_NOASYNC 0x00800000 /* disable async */ #define MNTK_UNMOUNT 0x01000000 /* unmount in progress */ #define MNTK_MWAIT 0x02000000 /* waiting for unmount to finish */ diff --git a/freebsd/sys/sys/pcpu.h b/freebsd/sys/sys/pcpu.h index 0ce30af7..7812c3d8 100644 --- a/freebsd/sys/sys/pcpu.h +++ b/freebsd/sys/sys/pcpu.h @@ -221,10 +221,6 @@ extern struct cpuhead cpuhead; extern struct pcpu *cpuid_to_pcpu[]; #define curcpu PCPU_GET(cpuid) -#define curproc (curthread->td_proc) -#ifndef curthread -#define curthread PCPU_GET(curthread) -#endif #define curvidata PCPU_GET(vidata) #ifndef __rtems__ @@ -233,20 +229,12 @@ extern struct pcpu *cpuid_to_pcpu[]; #define UMA_PCPU_ALLOC_SIZE (PAGE_SIZE / 32) #endif /* __rtems__ */ -#ifndef __rtems__ -#ifdef CTASSERT -#if defined(__i386__) || defined(__amd64__) -/* Required for counters(9) to work on x86. 
*/ -CTASSERT(sizeof(struct pcpu) == UMA_PCPU_ALLOC_SIZE); -#else -/* - * To minimize memory waste in per-cpu UMA zones, size of struct pcpu - * should be denominator of PAGE_SIZE. - */ -CTASSERT((PAGE_SIZE / sizeof(struct pcpu)) * sizeof(struct pcpu) == PAGE_SIZE); -#endif /* UMA_PCPU_ALLOC_SIZE && x86 */ -#endif /* CTASSERT */ -#endif /* __rtems__ */ +#include <machine/pcpu_aux.h> + +#ifndef curthread +#define curthread PCPU_GET(curthread) +#endif +#define curproc (curthread->td_proc) /* Accessor to elements allocated via UMA_ZONE_PCPU zone. */ static inline void * diff --git a/freebsd/sys/sys/proc.h b/freebsd/sys/sys/proc.h index 01cf3963..04a0e430 100644 --- a/freebsd/sys/sys/proc.h +++ b/freebsd/sys/sys/proc.h @@ -1057,6 +1057,8 @@ struct fork_req { int *fr_pd_fd; int fr_pd_flags; struct filecaps *fr_pd_fcaps; + int fr_flags2; +#define FR2_DROPSIG_CAUGHT 0x00001 /* Drop caught non-DFL signals */ }; /* @@ -1185,6 +1187,7 @@ void cpu_thread_swapin(struct thread *); void cpu_thread_swapout(struct thread *); struct thread *thread_alloc(int pages); int thread_alloc_stack(struct thread *, int pages); +int thread_check_susp(struct thread *td, bool sleep); void thread_cow_get_proc(struct thread *newtd, struct proc *p); void thread_cow_get(struct thread *newtd, struct thread *td); void thread_cow_free(struct thread *td); diff --git a/freebsd/sys/sys/signalvar.h b/freebsd/sys/sys/signalvar.h index aafbc0f8..70dd8fa3 100644 --- a/freebsd/sys/sys/signalvar.h +++ b/freebsd/sys/sys/signalvar.h @@ -382,6 +382,7 @@ void sigacts_copy(struct sigacts *dest, struct sigacts *src); void sigacts_free(struct sigacts *ps); struct sigacts *sigacts_hold(struct sigacts *ps); int sigacts_shared(struct sigacts *ps); +void sig_drop_caught(struct proc *p); void sigexit(struct thread *td, int sig) __dead2; int sigev_findtd(struct proc *p, struct sigevent *sigev, struct thread **); int sig_ffs(sigset_t *set); diff --git a/freebsd/sys/sys/smp.h b/freebsd/sys/sys/smp.h index aa0c3119..22b7dcd5 
100644 --- a/freebsd/sys/sys/smp.h +++ b/freebsd/sys/sys/smp.h @@ -168,8 +168,10 @@ extern cpuset_t logical_cpus_mask; #ifndef __rtems__ extern u_int mp_maxid; extern int mp_maxcpus; +extern int mp_ncores; extern int mp_ncpus; extern volatile int smp_started; +extern int smp_threads_per_core; extern cpuset_t all_cpus; extern cpuset_t cpuset_domain[MAXMEMDOM]; /* CPUs in each NUMA domain. */ diff --git a/freebsd/sys/sys/sysctl.h b/freebsd/sys/sys/sysctl.h index b2ae7f97..c21f19d3 100644 --- a/freebsd/sys/sys/sysctl.h +++ b/freebsd/sys/sys/sysctl.h @@ -888,7 +888,7 @@ TAILQ_HEAD(sysctl_ctx_list, sysctl_ctx_entry); /* * Top-level identifiers */ -#define CTL_UNSPEC 0 /* unused */ +#define CTL_SYSCTL 0 /* "magic" numbers */ #define CTL_KERN 1 /* "high kernel": proc, limits */ #define CTL_VM 2 /* virtual memory */ #define CTL_VFS 3 /* filesystem, mount type is next */ @@ -900,6 +900,17 @@ TAILQ_HEAD(sysctl_ctx_list, sysctl_ctx_entry); #define CTL_P1003_1B 9 /* POSIX 1003.1B */ /* + * CTL_SYSCTL identifiers + */ +#define CTL_SYSCTL_DEBUG 0 /* printf all nodes */ +#define CTL_SYSCTL_NAME 1 /* string name of OID */ +#define CTL_SYSCTL_NEXT 2 /* next OID */ +#define CTL_SYSCTL_NAME2OID 3 /* int array of name */ +#define CTL_SYSCTL_OIDFMT 4 /* OID's kind and format */ +#define CTL_SYSCTL_OIDDESCR 5 /* OID's description */ +#define CTL_SYSCTL_OIDLABEL 6 /* aggregation label */ + +/* * CTL_KERN identifiers */ #define KERN_OSTYPE 1 /* string: system version */ @@ -1085,6 +1096,7 @@ SYSCTL_DECL(_hw_bus); SYSCTL_DECL(_hw_bus_devices); SYSCTL_DECL(_hw_bus_info); SYSCTL_DECL(_machdep); +SYSCTL_DECL(_machdep_mitigations); SYSCTL_DECL(_user); SYSCTL_DECL(_compat); SYSCTL_DECL(_regression); diff --git a/freebsd/sys/sys/systm.h b/freebsd/sys/sys/systm.h index a52cde01..aae31704 100644 --- a/freebsd/sys/sys/systm.h +++ b/freebsd/sys/sys/systm.h @@ -634,9 +634,14 @@ int poll_no_poll(int events); void DELAY(int usec); /* Root mount holdback API */ -struct root_hold_token; +struct 
root_hold_token { + int flags; + const char *who; + TAILQ_ENTRY(root_hold_token) list; +}; struct root_hold_token *root_mount_hold(const char *identifier); +void root_mount_hold_token(const char *identifier, struct root_hold_token *h); void root_mount_rel(struct root_hold_token *h); int root_mounted(void); diff --git a/freebsd/sys/sys/taskqueue.h b/freebsd/sys/sys/taskqueue.h index 4af1e0a3..3f7ff1f5 100644 --- a/freebsd/sys/sys/taskqueue.h +++ b/freebsd/sys/sys/taskqueue.h @@ -42,6 +42,7 @@ struct taskqueue; struct taskqgroup; +struct proc; struct thread; struct timeout_task { @@ -75,7 +76,9 @@ struct taskqueue *taskqueue_create(const char *name, int mflags, taskqueue_enqueue_fn enqueue, void *context); int taskqueue_start_threads(struct taskqueue **tqp, int count, int pri, - const char *name, ...) __printflike(4, 5); + const char *name, ...) __printflike(4, 5); +int taskqueue_start_threads_in_proc(struct taskqueue **tqp, int count, + int pri, struct proc *p, const char *name, ...) __printflike(5, 6); int taskqueue_start_threads_cpuset(struct taskqueue **tqp, int count, int pri, cpuset_t *mask, const char *name, ...) 
__printflike(5, 6); int taskqueue_enqueue(struct taskqueue *queue, struct task *task); diff --git a/freebsd/sys/sys/unpcb.h b/freebsd/sys/sys/unpcb.h index 7d7a20ac..3ea20b1d 100644 --- a/freebsd/sys/sys/unpcb.h +++ b/freebsd/sys/sys/unpcb.h @@ -160,7 +160,7 @@ struct xunpcb { char xu_dummy2[256]; }; struct xsocket xu_socket; -} __aligned(8); +} __aligned(MAX(8, sizeof(void *))); struct xunpgen { ksize_t xug_len; diff --git a/freebsd/sys/sys/vnode.h b/freebsd/sys/sys/vnode.h index c1235a79..f3cdf8a5 100644 --- a/freebsd/sys/sys/vnode.h +++ b/freebsd/sys/sys/vnode.h @@ -247,6 +247,7 @@ struct xvnode { #define VV_NOSYNC 0x0004 /* unlinked, stop syncing */ #define VV_ETERNALDEV 0x0008 /* device that is never destroyed */ #define VV_CACHEDLABEL 0x0010 /* Vnode has valid cached MAC label */ +#define VV_VMSIZEVNLOCK 0x0020 /* object size check requires vnode lock */ #define VV_COPYONWRITE 0x0040 /* vnode is doing copy-on-write */ #define VV_SYSTEM 0x0080 /* vnode being used by kernel */ #define VV_PROCDEP 0x0100 /* vnode is process dependent */ @@ -577,6 +578,7 @@ typedef void vop_getpages_iodone_t(void *, vm_page_t *, int, int); #define VN_OPEN_NOAUDIT 0x00000001 #define VN_OPEN_NOCAPCHECK 0x00000002 #define VN_OPEN_NAMECACHE 0x00000004 +#define VN_OPEN_INVFS 0x00000008 /* * Public vnode manipulation functions. @@ -920,6 +922,8 @@ int vn_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, void vn_fsid(struct vnode *vp, struct vattr *va); +int vn_dir_check_exec(struct vnode *vp, struct componentname *cnp); + #endif /* _KERNEL */ #endif /* __rtems__ */ diff --git a/freebsd/sys/vm/uma_core.c b/freebsd/sys/vm/uma_core.c index 7738c5d2..8c3a84b4 100644 --- a/freebsd/sys/vm/uma_core.c +++ b/freebsd/sys/vm/uma_core.c @@ -189,8 +189,14 @@ SYSCTL_ULONG(_vm, OID_AUTO, uma_kmem_total, CTLFLAG_RD, &uma_kmem_total, 0, #ifndef __rtems__ /* Is the VM done starting up? 
*/ -static enum { BOOT_COLD = 0, BOOT_STRAPPED, BOOT_PAGEALLOC, BOOT_BUCKETS, - BOOT_RUNNING } booted = BOOT_COLD; +static enum { + BOOT_COLD, + BOOT_STRAPPED, + BOOT_PAGEALLOC, + BOOT_BUCKETS, + BOOT_RUNNING, + BOOT_SHUTDOWN, +} booted = BOOT_COLD; #endif /* __rtems__ */ /* @@ -311,6 +317,9 @@ static int hash_expand(struct uma_hash *, struct uma_hash *); static void hash_free(struct uma_hash *hash); static void uma_timeout(void *); static void uma_startup3(void); +#ifndef __rtems__ +static void uma_shutdown(void); +#endif /* __rtems__ */ static void *zone_alloc_item(uma_zone_t, void *, int, int); static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip); static void bucket_enable(void); @@ -1255,8 +1264,7 @@ startup_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag, case BOOT_PAGEALLOC: if (keg->uk_ppera > 1) break; - case BOOT_BUCKETS: - case BOOT_RUNNING: + default: #ifdef UMA_MD_SMALL_ALLOC keg->uk_allocf = (keg->uk_ppera > 1) ? page_alloc : uma_small_alloc; @@ -2259,10 +2267,6 @@ uma_startup2(void) } #endif /* __rtems__ */ -/* - * Initialize our callout handle - * - */ static void uma_startup3(void) { @@ -2278,9 +2282,21 @@ uma_startup3(void) callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL); #ifndef __rtems__ booted = BOOT_RUNNING; + + EVENTHANDLER_REGISTER(shutdown_post_sync, uma_shutdown, NULL, + EVENTHANDLER_PRI_FIRST); #endif /* __rtems__ */ } +#ifndef __rtems__ +static void +uma_shutdown(void) +{ + + booted = BOOT_SHUTDOWN; +} +#endif /* __rtems__ */ + static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini, int align, uint32_t flags) @@ -2518,6 +2534,16 @@ void uma_zdestroy(uma_zone_t zone) { +#ifndef __rtems__ + /* + * Large slabs are expensive to reclaim, so don't bother doing + * unnecessary work if we're shutting down. 
+ */ + if (booted == BOOT_SHUTDOWN && + zone->uz_fini == NULL && + zone->uz_release == (uma_release)zone_release) + return; +#endif /* __rtems__ */ sx_slock(&uma_drain_lock); zone_free_item(zones, zone, NULL, SKIP_NONE); sx_sunlock(&uma_drain_lock); diff --git a/freebsd/sys/vm/vm_extern.h b/freebsd/sys/vm/vm_extern.h index 52884357..0d17f8af 100644 --- a/freebsd/sys/vm/vm_extern.h +++ b/freebsd/sys/vm/vm_extern.h @@ -85,19 +85,18 @@ void kmeminit(void); int kernacc(void *, int, int); int useracc(void *, int, int); -int vm_fault(vm_map_t, vm_offset_t, vm_prot_t, int); +int vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, + int fault_flags, vm_page_t *m_hold); void vm_fault_copy_entry(vm_map_t, vm_map_t, vm_map_entry_t, vm_map_entry_t, vm_ooffset_t *); int vm_fault_disable_pagefaults(void); void vm_fault_enable_pagefaults(int save); #ifndef __rtems__ -int vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, - int fault_flags, vm_page_t *m_hold); int vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len, vm_prot_t prot, vm_page_t *ma, int max_count); +int vm_fault_trap(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, + int fault_flags, int *signo, int *ucode); #endif /* __rtems__ */ -void vm_fault_unwire(vm_map_t, vm_offset_t, vm_offset_t, boolean_t); -int vm_fault_wire(vm_map_t, vm_offset_t, vm_offset_t, boolean_t); int vm_forkproc(struct thread *, struct proc *, struct thread *, struct vmspace *, int); void vm_waitproc(struct proc *); |