diff options
Diffstat (limited to 'freebsd/sys/dev/e1000/if_igb.c')
-rw-r--r-- | freebsd/sys/dev/e1000/if_igb.c | 1466 |
1 files changed, 994 insertions, 472 deletions
diff --git a/freebsd/sys/dev/e1000/if_igb.c b/freebsd/sys/dev/e1000/if_igb.c index 27d9c92b..bf88f60f 100644 --- a/freebsd/sys/dev/e1000/if_igb.c +++ b/freebsd/sys/dev/e1000/if_igb.c @@ -2,7 +2,7 @@ /****************************************************************************** - Copyright (c) 2001-2010, Intel Corporation + Copyright (c) 2001-2013, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without @@ -38,6 +38,7 @@ #ifdef HAVE_KERNEL_OPTION_HEADERS #include <rtems/bsd/local/opt_device_polling.h> #include <rtems/bsd/local/opt_inet.h> +#include <rtems/bsd/local/opt_inet6.h> #include <rtems/bsd/local/opt_altq.h> #endif @@ -107,7 +108,7 @@ int igb_display_debug_stats = 0; /********************************************************************* * Driver version: *********************************************************************/ -char igb_driver_version[] = "version - 2.0.7"; +char igb_driver_version[] = "version - 2.3.9 - 8"; /********************************************************************* @@ -149,6 +150,22 @@ static igb_vendor_info_t igb_vendor_info_array[] = PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_DH89XXCC_SERDES, PCI_ANY_ID, PCI_ANY_ID, 0}, { 0x8086, E1000_DEV_ID_DH89XXCC_SGMII, PCI_ANY_ID, PCI_ANY_ID, 0}, + { 0x8086, E1000_DEV_ID_DH89XXCC_SFP, PCI_ANY_ID, PCI_ANY_ID, 0}, + { 0x8086, E1000_DEV_ID_DH89XXCC_BACKPLANE, + PCI_ANY_ID, PCI_ANY_ID, 0}, + { 0x8086, E1000_DEV_ID_I350_COPPER, PCI_ANY_ID, PCI_ANY_ID, 0}, + { 0x8086, E1000_DEV_ID_I350_FIBER, PCI_ANY_ID, PCI_ANY_ID, 0}, + { 0x8086, E1000_DEV_ID_I350_SERDES, PCI_ANY_ID, PCI_ANY_ID, 0}, + { 0x8086, E1000_DEV_ID_I350_SGMII, PCI_ANY_ID, PCI_ANY_ID, 0}, + { 0x8086, E1000_DEV_ID_I350_VF, PCI_ANY_ID, PCI_ANY_ID, 0}, + { 0x8086, E1000_DEV_ID_I210_COPPER, PCI_ANY_ID, PCI_ANY_ID, 0}, + { 0x8086, E1000_DEV_ID_I210_COPPER_IT, PCI_ANY_ID, PCI_ANY_ID, 0}, + { 0x8086, E1000_DEV_ID_I210_COPPER_OEM1, + PCI_ANY_ID, PCI_ANY_ID, 0}, + { 0x8086, E1000_DEV_ID_I210_FIBER, PCI_ANY_ID, PCI_ANY_ID, 0}, + { 0x8086, E1000_DEV_ID_I210_SERDES, PCI_ANY_ID, PCI_ANY_ID, 0}, + { 0x8086, E1000_DEV_ID_I210_SGMII, PCI_ANY_ID, PCI_ANY_ID, 0}, + { 0x8086, E1000_DEV_ID_I211_COPPER, PCI_ANY_ID, PCI_ANY_ID, 0}, /* required last entry */ { 0, 0, 0, 0, 0} }; @@ -170,13 +187,14 @@ static int igb_detach(device_t); static int igb_shutdown(device_t); static int igb_suspend(device_t); static int igb_resume(device_t); -static void igb_start(struct ifnet *); -static void igb_start_locked(struct tx_ring *, struct ifnet *ifp); #if __FreeBSD_version >= 800000 static int igb_mq_start(struct ifnet *, struct mbuf *); -static int igb_mq_start_locked(struct ifnet *, - struct tx_ring *, struct mbuf *); +static int igb_mq_start_locked(struct ifnet *, struct tx_ring *); static void igb_qflush(struct ifnet *); +static void igb_deferred_mq_start(void *, int); +#else +static void igb_start(struct ifnet *); +static void igb_start_locked(struct tx_ring *, struct ifnet *ifp); #endif static int igb_ioctl(struct ifnet *, u_long, caddr_t); static void igb_init(void *); @@ -220,19 +238,11 @@ static __inline void igb_rx_discard(struct rx_ring *, int); static __inline void igb_rx_input(struct rx_ring *, struct ifnet *, struct mbuf *, u32); -#ifdef __rtems__ -/* XXX report inconsistent proto/body to FreeBSD */ static bool igb_rxeof(struct igb_queue *, int, int *); -#else -static boolean_t igb_rxeof(struct igb_queue *, int, int *); -#endif static void igb_rx_checksum(u32, struct mbuf *, u32); -#ifdef __rtems__ -/* XXX report inconsistent proto/body to FreeBSD */ static bool igb_tx_ctx_setup(struct tx_ring *, struct mbuf *); -#else -#endif -static boolean_t igb_tso_setup(struct tx_ring *, struct mbuf *, u32 *); +static bool igb_tso_setup(struct tx_ring *, struct mbuf *, int, + struct ip *, struct tcphdr *); static void igb_set_promisc(struct adapter *); static void igb_disable_promisc(struct adapter *); static void igb_set_multi(struct adapter *); @@ -264,14 +274,17 @@ static void igb_enable_wakeup(device_t); static void igb_led_func(void *, int); static int igb_irq_fast(void *); -static void igb_add_rx_process_limit(struct adapter *, const char *, - const char *, int *, int); +static void igb_msix_que(void *); +static void igb_msix_link(void *); static void igb_handle_que(void *context, int pending); static void igb_handle_link(void *context, int pending); +static void igb_handle_link_locked(struct adapter *); -/* These are MSIX only irq handlers */ -static void igb_msix_que(void *); -static void igb_msix_link(void *); +static void igb_set_sysctl_value(struct adapter *, const char *, + const char *, int *, int); +static int igb_set_flowcntl(SYSCTL_HANDLER_ARGS); +static int igb_sysctl_dmac(SYSCTL_HANDLER_ARGS); +static int igb_sysctl_eee(SYSCTL_HANDLER_ARGS); #ifdef DEVICE_POLLING static poll_handler_t igb_poll; @@ -289,7 +302,7 @@ static device_method_t igb_methods[] = { DEVMETHOD(device_shutdown, igb_shutdown), DEVMETHOD(device_suspend, igb_suspend), DEVMETHOD(device_resume, igb_resume), - {0, 0} + DEVMETHOD_END }; static driver_t igb_driver = { @@ -305,11 +318,17 @@ MODULE_DEPEND(igb, ether, 1, 1, 1); * Tunable default values. *********************************************************************/ +static SYSCTL_NODE(_hw, OID_AUTO, igb, CTLFLAG_RD, 0, "IGB driver parameters"); + /* Descriptor defaults */ static int igb_rxd = IGB_DEFAULT_RXD; static int igb_txd = IGB_DEFAULT_TXD; TUNABLE_INT("hw.igb.rxd", &igb_rxd); TUNABLE_INT("hw.igb.txd", &igb_txd); +SYSCTL_INT(_hw_igb, OID_AUTO, rxd, CTLFLAG_RDTUN, &igb_rxd, 0, + "Number of receive descriptors per queue"); +SYSCTL_INT(_hw_igb, OID_AUTO, txd, CTLFLAG_RDTUN, &igb_txd, 0, + "Number of transmit descriptors per queue"); /* ** AIM: Adaptive Interrupt Moderation @@ -319,6 +338,8 @@ TUNABLE_INT("hw.igb.txd", &igb_txd); */ static int igb_enable_aim = TRUE; TUNABLE_INT("hw.igb.enable_aim", &igb_enable_aim); +SYSCTL_INT(_hw_igb, OID_AUTO, enable_aim, CTLFLAG_RW, &igb_enable_aim, 0, + "Enable adaptive interrupt moderation"); /* * MSIX should be the default for best performance, @@ -326,12 +347,26 @@ TUNABLE_INT("hw.igb.enable_aim", &igb_enable_aim); */ static int igb_enable_msix = 1; TUNABLE_INT("hw.igb.enable_msix", &igb_enable_msix); +SYSCTL_INT(_hw_igb, OID_AUTO, enable_msix, CTLFLAG_RDTUN, &igb_enable_msix, 0, + "Enable MSI-X interrupts"); /* ** Tuneable Interrupt rate */ static int igb_max_interrupt_rate = 8000; TUNABLE_INT("hw.igb.max_interrupt_rate", &igb_max_interrupt_rate); +SYSCTL_INT(_hw_igb, OID_AUTO, max_interrupt_rate, CTLFLAG_RDTUN, + &igb_max_interrupt_rate, 0, "Maximum interrupts per second"); + +#if __FreeBSD_version >= 800000 +/* +** Tuneable number of buffers in the buf-ring (drbr_xxx) +*/ +static int igb_buf_ring_size = IGB_BR_SIZE; +TUNABLE_INT("hw.igb.buf_ring_size", &igb_buf_ring_size); +SYSCTL_INT(_hw_igb, OID_AUTO, buf_ring_size, CTLFLAG_RDTUN, + &igb_buf_ring_size, 0, "Size of the bufring"); +#endif /* ** Header split causes the packet header to @@ -341,8 +376,10 @@ TUNABLE_INT("hw.igb.max_interrupt_rate", &igb_max_interrupt_rate); ** into the header and thus use no cluster. Its ** a very workload dependent type feature. */ -static bool igb_header_split = FALSE; +static int igb_header_split = FALSE; TUNABLE_INT("hw.igb.hdr_split", &igb_header_split); +SYSCTL_INT(_hw_igb, OID_AUTO, header_split, CTLFLAG_RDTUN, &igb_header_split, 0, + "Enable receive mbuf header split"); /* ** This will autoconfigure based on @@ -350,15 +387,26 @@ TUNABLE_INT("hw.igb.hdr_split", &igb_header_split); */ static int igb_num_queues = 0; TUNABLE_INT("hw.igb.num_queues", &igb_num_queues); +SYSCTL_INT(_hw_igb, OID_AUTO, num_queues, CTLFLAG_RDTUN, &igb_num_queues, 0, + "Number of queues to configure, 0 indicates autoconfigure"); + +/* +** Global variable to store last used CPU when binding queues +** to CPUs in igb_allocate_msix. Starts at CPU_FIRST and increments when a +** queue is bound to a cpu. +*/ +static int igb_last_bind_cpu = -1; /* How many packets rxeof tries to clean at a time */ static int igb_rx_process_limit = 100; TUNABLE_INT("hw.igb.rx_process_limit", &igb_rx_process_limit); +SYSCTL_INT(_hw_igb, OID_AUTO, rx_process_limit, CTLFLAG_RDTUN, + &igb_rx_process_limit, 0, + "Maximum number of received packets to process at a time, -1 means unlimited"); -/* Flow control setting - default to FULL */ -static int igb_fc_setting = e1000_fc_full; -TUNABLE_INT("hw.igb.fc_setting", &igb_fc_setting); - +#ifdef DEV_NETMAP /* see ixgbe.c for details */ +#include <dev/netmap/if_igb_netmap.h> +#endif /* DEV_NETMAP */ /********************************************************************* * Device identification routine * @@ -429,6 +477,11 @@ igb_attach(device_t dev) INIT_DEBUGOUT("igb_attach: begin"); + if (resource_disabled("igb", device_get_unit(dev))) { + device_printf(dev, "Disabled by device hint\n"); + return (ENXIO); + } + adapter = device_get_softc(dev); adapter->dev = adapter->osdep.dev = dev; IGB_CORE_LOCK_INIT(adapter, device_get_nameunit(dev)); @@ -439,15 +492,14 @@ igb_attach(device_t dev) OID_AUTO, "nvm", CTLTYPE_INT|CTLFLAG_RW, adapter, 0, igb_sysctl_nvm_info, "I", "NVM Information"); - SYSCTL_ADD_INT(device_get_sysctl_ctx(adapter->dev), - SYSCTL_CHILDREN(device_get_sysctl_tree(adapter->dev)), - OID_AUTO, "flow_control", CTLTYPE_INT|CTLFLAG_RW, - &igb_fc_setting, 0, "Flow Control"); + igb_set_sysctl_value(adapter, "enable_aim", + "Interrupt Moderation", &adapter->enable_aim, + igb_enable_aim); - SYSCTL_ADD_INT(device_get_sysctl_ctx(dev), + SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), - OID_AUTO, "enable_aim", CTLTYPE_INT|CTLFLAG_RW, - &igb_enable_aim, 1, "Interrupt Moderation"); + OID_AUTO, "fc", CTLTYPE_INT|CTLFLAG_RW, + adapter, 0, igb_set_flowcntl, "I", "Flow Control"); callout_init_mtx(&adapter->timer, &adapter->core_mtx, 0); @@ -470,10 +522,10 @@ igb_attach(device_t dev) e1000_get_bus_info(&adapter->hw); - /* Sysctls for limiting the amount of work done in the taskqueue */ - igb_add_rx_process_limit(adapter, "rx_processing_limit", - "max number of rx packets to process", &adapter->rx_process_limit, - igb_rx_process_limit); + /* Sysctl for limiting the amount of work done in the taskqueue */ + igb_set_sysctl_value(adapter, "rx_processing_limit", + "max number of rx packets to process", + &adapter->rx_process_limit, igb_rx_process_limit); /* * Validate number of transmit and receive descriptors. It @@ -522,7 +574,7 @@ igb_attach(device_t dev) } /* Allocate the appropriate stats memory */ - if (adapter->hw.mac.type == e1000_vfadapt) { + if (adapter->vf_ifp) { adapter->stats = (struct e1000_vf_stats *)malloc(sizeof \ (struct e1000_vf_stats), M_DEVBUF, M_NOWAIT | M_ZERO); @@ -546,6 +598,21 @@ igb_attach(device_t dev) goto err_late; } + /* Some adapter-specific advanced features */ + if (adapter->hw.mac.type >= e1000_i350) { + SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), + SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), + OID_AUTO, "dmac", CTLTYPE_INT|CTLFLAG_RW, + adapter, 0, igb_sysctl_dmac, "I", "DMA Coalesce"); + SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), + SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), + OID_AUTO, "eee_disabled", CTLTYPE_INT|CTLFLAG_RW, + adapter, 0, igb_sysctl_eee, "I", + "Disable Energy Efficient Ethernet"); + if (adapter->hw.phy.media_type == e1000_media_type_copper) + e1000_set_eee_i350(&adapter->hw); + } + /* ** Start from a known state, this is ** important in reading the nvm and @@ -554,7 +621,9 @@ igb_attach(device_t dev) e1000_reset_hw(&adapter->hw); /* Make sure we have a good EEPROM before we read from it */ - if (e1000_validate_nvm_checksum(&adapter->hw) < 0) { + if (((adapter->hw.mac.type != e1000_i210) && + (adapter->hw.mac.type != e1000_i211)) && + (e1000_validate_nvm_checksum(&adapter->hw) < 0)) { /* ** Some PCI-E parts fail the first check due to ** the link being in sleep state, call it again, @@ -584,16 +653,6 @@ igb_attach(device_t dev) goto err_late; } - /* - ** Configure Interrupts - */ - if ((adapter->msix > 1) && (igb_enable_msix)) - error = igb_allocate_msix(adapter); - else /* MSI or Legacy */ - error = igb_allocate_legacy(adapter); - if (error) - goto err_late; - /* Setup OS specific network interface */ if (igb_setup_interface(dev, adapter) != 0) goto err_late; @@ -632,23 +691,38 @@ igb_attach(device_t dev) igb_add_hw_stats(adapter); /* Tell the stack that the interface is not active */ - adapter->ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); + adapter->ifp->if_drv_flags &= ~IFF_DRV_RUNNING; + adapter->ifp->if_drv_flags |= IFF_DRV_OACTIVE; adapter->led_dev = led_create(igb_led_func, adapter, device_get_nameunit(dev)); + /* + ** Configure Interrupts + */ + if ((adapter->msix > 1) && (igb_enable_msix)) + error = igb_allocate_msix(adapter); + else /* MSI or Legacy */ + error = igb_allocate_legacy(adapter); + if (error) + goto err_late; + +#ifdef DEV_NETMAP + igb_netmap_attach(adapter); +#endif /* DEV_NETMAP */ INIT_DEBUGOUT("igb_attach: end"); return (0); err_late: + igb_detach(dev); igb_free_transmit_structures(adapter); igb_free_receive_structures(adapter); igb_release_hw_control(adapter); - if (adapter->ifp != NULL) - if_free(adapter->ifp); err_pci: igb_free_pci_resources(adapter); + if (adapter->ifp != NULL) + if_free(adapter->ifp); free(adapter->mta, M_DEVBUF); IGB_CORE_LOCK_DESTROY(adapter); @@ -679,6 +753,8 @@ igb_detach(device_t dev) return (EBUSY); } + ether_ifdetach(adapter->ifp); + if (adapter->led_dev != NULL) led_destroy(adapter->led_dev); @@ -710,17 +786,19 @@ igb_detach(device_t dev) if (adapter->vlan_detach != NULL) EVENTHANDLER_DEREGISTER(vlan_unconfig, adapter->vlan_detach); - ether_ifdetach(adapter->ifp); - callout_drain(&adapter->timer); +#ifdef DEV_NETMAP + netmap_detach(adapter->ifp); +#endif /* DEV_NETMAP */ igb_free_pci_resources(adapter); bus_generic_detach(dev); if_free(ifp); igb_free_transmit_structures(adapter); igb_free_receive_structures(adapter); - free(adapter->mta, M_DEVBUF); + if (adapter->mta != NULL) + free(adapter->mta, M_DEVBUF); IGB_CORE_LOCK_DESTROY(adapter); @@ -769,6 +847,7 @@ static int igb_resume(device_t dev) { struct adapter *adapter = device_get_softc(dev); + struct tx_ring *txr = adapter->tx_rings; struct ifnet *ifp = adapter->ifp; IGB_CORE_LOCK(adapter); @@ -776,15 +855,29 @@ igb_resume(device_t dev) igb_init_manageability(adapter); if ((ifp->if_flags & IFF_UP) && - (ifp->if_drv_flags & IFF_DRV_RUNNING)) - igb_start(ifp); - + (ifp->if_drv_flags & IFF_DRV_RUNNING) && adapter->link_active) { + for (int i = 0; i < adapter->num_queues; i++, txr++) { + IGB_TX_LOCK(txr); +#if __FreeBSD_version >= 800000 + /* Process the stack queue only if not depleted */ + if (((txr->queue_status & IGB_QUEUE_DEPLETED) == 0) && + !drbr_empty(ifp, txr->br)) + igb_mq_start_locked(ifp, txr); +#else + if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) + igb_start_locked(txr, ifp); +#endif + IGB_TX_UNLOCK(txr); + } + } IGB_CORE_UNLOCK(adapter); return bus_generic_resume(dev); } +#if __FreeBSD_version < 800000 + /********************************************************************* * Transmit entry point * @@ -814,8 +907,8 @@ igb_start_locked(struct tx_ring *txr, struct ifnet *ifp) igb_txeof(txr); while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { - if (txr->tx_avail <= IGB_TX_OP_THRESHOLD) { - ifp->if_drv_flags |= IFF_DRV_OACTIVE; + if (txr->tx_avail <= IGB_MAX_SCATTER) { + txr->queue_status |= IGB_QUEUE_DEPLETED; break; } IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head); @@ -826,10 +919,10 @@ igb_start_locked(struct tx_ring *txr, struct ifnet *ifp) * NULL on failure. In that event, we can't requeue. */ if (igb_xmit(txr, &m_head)) { - if (m_head == NULL) - break; - ifp->if_drv_flags |= IFF_DRV_OACTIVE; - IFQ_DRV_PREPEND(&ifp->if_snd, m_head); + if (m_head != NULL) + IFQ_DRV_PREPEND(&ifp->if_snd, m_head); + if (txr->tx_avail <= IGB_MAX_SCATTER) + txr->queue_status |= IGB_QUEUE_DEPLETED; break; } @@ -838,7 +931,7 @@ igb_start_locked(struct tx_ring *txr, struct ifnet *ifp) /* Set watchdog on */ txr->watchdog_time = ticks; - txr->queue_status = IGB_QUEUE_WORKING; + txr->queue_status |= IGB_QUEUE_WORKING; } } @@ -861,9 +954,11 @@ igb_start(struct ifnet *ifp) return; } -#if __FreeBSD_version >= 800000 +#else /* __FreeBSD_version >= 800000 */ + /* -** Multiqueue Transmit driver +** Multiqueue Transmit Entry: +** quick turnaround to the stack ** */ static int @@ -872,83 +967,81 @@ igb_mq_start(struct ifnet *ifp, struct mbuf *m) struct adapter *adapter = ifp->if_softc; struct igb_queue *que; struct tx_ring *txr; - int i = 0, err = 0; + int i, err = 0; /* Which queue to use */ if ((m->m_flags & M_FLOWID) != 0) i = m->m_pkthdr.flowid % adapter->num_queues; - + else + i = curcpu % adapter->num_queues; txr = &adapter->tx_rings[i]; que = &adapter->queues[i]; - if (IGB_TX_TRYLOCK(txr)) { - err = igb_mq_start_locked(ifp, txr, m); - IGB_TX_UNLOCK(txr); - } else { - err = drbr_enqueue(ifp, txr->br, m); - taskqueue_enqueue(que->tq, &que->que_task); - } + err = drbr_enqueue(ifp, txr->br, m); + taskqueue_enqueue(que->tq, &txr->txq_task); return (err); } static int -igb_mq_start_locked(struct ifnet *ifp, struct tx_ring *txr, struct mbuf *m) +igb_mq_start_locked(struct ifnet *ifp, struct tx_ring *txr) { struct adapter *adapter = txr->adapter; - struct mbuf *next; - int err = 0, enq; + struct mbuf *buf; + int err = 0, enq = 0; IGB_TX_LOCK_ASSERT(txr); - if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != - IFF_DRV_RUNNING || adapter->link_active == 0) { - if (m != NULL) - err = drbr_enqueue(ifp, txr->br, m); - return (err); - } - - /* Call cleanup if number of TX descriptors low */ - if (txr->tx_avail <= IGB_TX_CLEANUP_THRESHOLD) - igb_txeof(txr); - - enq = 0; - if (m == NULL) { - next = drbr_dequeue(ifp, txr->br); - } else if (drbr_needs_enqueue(ifp, txr->br)) { - if ((err = drbr_enqueue(ifp, txr->br, m)) != 0) - return (err); - next = drbr_dequeue(ifp, txr->br); - } else - next = m; + if (((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) || + adapter->link_active == 0) + return (ENETDOWN); /* Process the queue */ - while (next != NULL) { - if ((err = igb_xmit(txr, &next)) != 0) { - if (next != NULL) - err = drbr_enqueue(ifp, txr->br, next); + buf = drbr_dequeue(ifp, txr->br); + while (buf != NULL) { + if ((err = igb_xmit(txr, &buf)) != 0) { + if (buf != NULL) + err = drbr_enqueue(ifp, txr->br, buf); break; } enq++; - drbr_stats_update(ifp, next->m_pkthdr.len, next->m_flags); - ETHER_BPF_MTAP(ifp, next); + ifp->if_obytes += buf->m_pkthdr.len; + if (buf->m_flags & M_MCAST) + ifp->if_omcasts++; + ETHER_BPF_MTAP(ifp, buf); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) break; - if (txr->tx_avail <= IGB_TX_OP_THRESHOLD) { - ifp->if_drv_flags |= IFF_DRV_OACTIVE; - break; - } - next = drbr_dequeue(ifp, txr->br); + buf = drbr_dequeue(ifp, txr->br); } if (enq > 0) { /* Set the watchdog */ - txr->queue_status = IGB_QUEUE_WORKING; + txr->queue_status |= IGB_QUEUE_WORKING; txr->watchdog_time = ticks; } + if (txr->tx_avail <= IGB_TX_CLEANUP_THRESHOLD) + igb_txeof(txr); + if (txr->tx_avail <= IGB_MAX_SCATTER) + txr->queue_status |= IGB_QUEUE_DEPLETED; return (err); } /* + * Called from a taskqueue to drain queued transmit packets. + */ +static void +igb_deferred_mq_start(void *arg, int pending) +{ + struct tx_ring *txr = arg; + struct adapter *adapter = txr->adapter; + struct ifnet *ifp = adapter->ifp; + + IGB_TX_LOCK(txr); + if (!drbr_empty(ifp, txr->br)) + igb_mq_start_locked(ifp, txr); + IGB_TX_UNLOCK(txr); +} + +/* ** Flush all ring buffers */ static void @@ -981,11 +1074,12 @@ static int igb_ioctl(struct ifnet *ifp, u_long command, caddr_t data) { struct adapter *adapter = ifp->if_softc; - struct ifreq *ifr = (struct ifreq *)data; -#ifdef INET - struct ifaddr *ifa = (struct ifaddr *)data; + struct ifreq *ifr = (struct ifreq *)data; +#if defined(INET) || defined(INET6) + struct ifaddr *ifa = (struct ifaddr *)data; #endif - int error = 0; + bool avoid_reset = FALSE; + int error = 0; if (adapter->in_detach) return (error); @@ -993,24 +1087,26 @@ igb_ioctl(struct ifnet *ifp, u_long command, caddr_t data) switch (command) { case SIOCSIFADDR: #ifdef INET - if (ifa->ifa_addr->sa_family == AF_INET) { - /* - * XXX - * Since resetting hardware takes a very long time - * and results in link renegotiation we only - * initialize the hardware only when it is absolutely - * required. - */ + if (ifa->ifa_addr->sa_family == AF_INET) + avoid_reset = TRUE; +#endif +#ifdef INET6 + if (ifa->ifa_addr->sa_family == AF_INET6) + avoid_reset = TRUE; +#endif + /* + ** Calling init results in link renegotiation, + ** so we avoid doing it when possible. + */ + if (avoid_reset) { ifp->if_flags |= IFF_UP; - if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) { - IGB_CORE_LOCK(adapter); - igb_init_locked(adapter); - IGB_CORE_UNLOCK(adapter); - } + if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) + igb_init(adapter); +#ifdef INET if (!(ifp->if_flags & IFF_NOARP)) arp_ifinit(ifp, ifa); - } else #endif + } else error = ether_ioctl(ifp, command, data); break; case SIOCSIFMTU: @@ -1069,11 +1165,6 @@ igb_ioctl(struct ifnet *ifp, u_long command, caddr_t data) } break; case SIOCSIFMEDIA: - /* - ** As the speed/duplex settings are being - ** changed, we need toreset the PHY. - */ - adapter->hw.phy.reset_disable = FALSE; /* Check SOL/IDER usage */ IGB_CORE_LOCK(adapter); if (e1000_check_reset_block(&adapter->hw)) { @@ -1131,6 +1222,10 @@ igb_ioctl(struct ifnet *ifp, u_long command, caddr_t data) ifp->if_capenable ^= IFCAP_VLAN_HWFILTER; reinit = 1; } + if (mask & IFCAP_VLAN_HWTSO) { + ifp->if_capenable ^= IFCAP_VLAN_HWTSO; + reinit = 1; + } if (mask & IFCAP_LRO) { ifp->if_capenable ^= IFCAP_LRO; reinit = 1; @@ -1227,18 +1322,9 @@ igb_init_locked(struct adapter *adapter) } igb_initialize_receive_units(adapter); - /* Use real VLAN Filter support? */ - if (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) { - if (ifp->if_capenable & IFCAP_VLAN_HWFILTER) - /* Use real VLAN Filter support */ - igb_setup_vlan_hw_support(adapter); - else { - u32 ctrl; - ctrl = E1000_READ_REG(&adapter->hw, E1000_CTRL); - ctrl |= E1000_CTRL_VME; - E1000_WRITE_REG(&adapter->hw, E1000_CTRL, ctrl); - } - } + /* Enable VLAN support */ + if (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) + igb_setup_vlan_hw_support(adapter); /* Don't lose promiscuous settings */ igb_set_promisc(adapter); @@ -1264,12 +1350,13 @@ igb_init_locked(struct adapter *adapter) else #endif /* DEVICE_POLLING */ { - igb_enable_intr(adapter); - E1000_WRITE_REG(&adapter->hw, E1000_ICS, E1000_ICS_LSC); + igb_enable_intr(adapter); + E1000_WRITE_REG(&adapter->hw, E1000_ICS, E1000_ICS_LSC); } - /* Don't reset the phy next time init gets called */ - adapter->hw.phy.reset_disable = TRUE; + /* Set Energy Efficient Ethernet */ + if (adapter->hw.phy.media_type == e1000_media_type_copper) + e1000_set_eee_i350(&adapter->hw); } static void @@ -1294,19 +1381,21 @@ igb_handle_que(void *context, int pending) if (ifp->if_drv_flags & IFF_DRV_RUNNING) { bool more; - more = igb_rxeof(que, -1, NULL); + more = igb_rxeof(que, adapter->rx_process_limit, NULL); IGB_TX_LOCK(txr); - if (igb_txeof(txr)) - more = TRUE; + igb_txeof(txr); #if __FreeBSD_version >= 800000 - if (!drbr_empty(ifp, txr->br)) - igb_mq_start_locked(ifp, txr, NULL); + /* Process the stack queue only if not depleted */ + if (((txr->queue_status & IGB_QUEUE_DEPLETED) == 0) && + !drbr_empty(ifp, txr->br)) + igb_mq_start_locked(ifp, txr); #else if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) igb_start_locked(txr, ifp); #endif IGB_TX_UNLOCK(txr); + /* Do we need another? */ if (more) { taskqueue_enqueue(que->tq, &que->que_task); return; @@ -1330,8 +1419,35 @@ igb_handle_link(void *context, int pending) { struct adapter *adapter = context; + IGB_CORE_LOCK(adapter); + igb_handle_link_locked(adapter); + IGB_CORE_UNLOCK(adapter); +} + +static void +igb_handle_link_locked(struct adapter *adapter) +{ + struct tx_ring *txr = adapter->tx_rings; + struct ifnet *ifp = adapter->ifp; + + IGB_CORE_LOCK_ASSERT(adapter); adapter->hw.mac.get_link_status = 1; igb_update_link_status(adapter); + if ((ifp->if_drv_flags & IFF_DRV_RUNNING) && adapter->link_active) { + for (int i = 0; i < adapter->num_queues; i++, txr++) { + IGB_TX_LOCK(txr); +#if __FreeBSD_version >= 800000 + /* Process the stack queue only if not depleted */ + if (((txr->queue_status & IGB_QUEUE_DEPLETED) == 0) && + !drbr_empty(ifp, txr->br)) + igb_mq_start_locked(ifp, txr); +#else + if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) + igb_start_locked(txr, ifp); +#endif + IGB_TX_UNLOCK(txr); + } + } } /********************************************************************* @@ -1379,12 +1495,6 @@ igb_irq_fast(void *arg) } #ifdef DEVICE_POLLING -/********************************************************************* - * - * Legacy polling routine : if using this code you MUST be sure that - * multiqueue is not defined, ie, set igb_num_queues to 1. - * - *********************************************************************/ #if __FreeBSD_version >= 800000 #define POLL_RETURN_COUNT(a) (a) static int @@ -1395,8 +1505,8 @@ static void igb_poll(struct ifnet *ifp, enum poll_cmd cmd, int count) { struct adapter *adapter = ifp->if_softc; - struct igb_queue *que = adapter->queues; - struct tx_ring *txr = adapter->tx_rings; + struct igb_queue *que; + struct tx_ring *txr; u32 reg_icr, rx_done = 0; u32 loop = IGB_MAX_LOOP; bool more; @@ -1411,34 +1521,40 @@ igb_poll(struct ifnet *ifp, enum poll_cmd cmd, int count) reg_icr = E1000_READ_REG(&adapter->hw, E1000_ICR); /* Link status change */ if (reg_icr & (E1000_ICR_RXSEQ | E1000_ICR_LSC)) - igb_handle_link(adapter, 0); + igb_handle_link_locked(adapter); if (reg_icr & E1000_ICR_RXO) adapter->rx_overruns++; } IGB_CORE_UNLOCK(adapter); - igb_rxeof(que, count, &rx_done); + for (int i = 0; i < adapter->num_queues; i++) { + que = &adapter->queues[i]; + txr = que->txr; - IGB_TX_LOCK(txr); - do { - more = igb_txeof(txr); - } while (loop-- && more); + igb_rxeof(que, count, &rx_done); + + IGB_TX_LOCK(txr); + do { + more = igb_txeof(txr); + } while (loop-- && more); #if __FreeBSD_version >= 800000 - if (!drbr_empty(ifp, txr->br)) - igb_mq_start_locked(ifp, txr, NULL); + if (!drbr_empty(ifp, txr->br)) + igb_mq_start_locked(ifp, txr); #else - if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) - igb_start_locked(txr, ifp); + if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) + igb_start_locked(txr, ifp); #endif - IGB_TX_UNLOCK(txr); + IGB_TX_UNLOCK(txr); + } + return POLL_RETURN_COUNT(rx_done); } #endif /* DEVICE_POLLING */ /********************************************************************* * - * MSIX TX Interrupt Service routine + * MSIX Que Interrupt Service routine * **********************************************************************/ static void @@ -1446,21 +1562,31 @@ igb_msix_que(void *arg) { struct igb_queue *que = arg; struct adapter *adapter = que->adapter; + struct ifnet *ifp = adapter->ifp; struct tx_ring *txr = que->txr; struct rx_ring *rxr = que->rxr; u32 newitr = 0; - bool more_tx, more_rx; + bool more_rx; E1000_WRITE_REG(&adapter->hw, E1000_EIMC, que->eims); ++que->irqs; IGB_TX_LOCK(txr); - more_tx = igb_txeof(txr); + igb_txeof(txr); +#if __FreeBSD_version >= 800000 + /* Process the stack queue only if not depleted */ + if (((txr->queue_status & IGB_QUEUE_DEPLETED) == 0) && + !drbr_empty(ifp, txr->br)) + igb_mq_start_locked(ifp, txr); +#else + if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) + igb_start_locked(txr, ifp); +#endif IGB_TX_UNLOCK(txr); more_rx = igb_rxeof(que, adapter->rx_process_limit, NULL); - if (igb_enable_aim == FALSE) + if (adapter->enable_aim == FALSE) goto no_calc; /* ** Do Adaptive Interrupt Moderation: @@ -1513,7 +1639,7 @@ igb_msix_que(void *arg) no_calc: /* Schedule a clean task if needed*/ - if (more_tx || more_rx) + if (more_rx) taskqueue_enqueue(que->tq, &que->que_task); else /* Reenable this interrupt */ @@ -1560,7 +1686,6 @@ static void igb_media_status(struct ifnet *ifp, struct ifmediareq *ifmr) { struct adapter *adapter = ifp->if_softc; - u_char fiber_type = IFM_1000_SX; INIT_DEBUGOUT("igb_media_status: begin"); @@ -1577,26 +1702,31 @@ igb_media_status(struct ifnet *ifp, struct ifmediareq *ifmr) ifmr->ifm_status |= IFM_ACTIVE; - if ((adapter->hw.phy.media_type == e1000_media_type_fiber) || - (adapter->hw.phy.media_type == e1000_media_type_internal_serdes)) - ifmr->ifm_active |= fiber_type | IFM_FDX; - else { - switch (adapter->link_speed) { - case 10: - ifmr->ifm_active |= IFM_10_T; - break; - case 100: - ifmr->ifm_active |= IFM_100_TX; - break; - case 1000: - ifmr->ifm_active |= IFM_1000_T; - break; - } - if (adapter->link_duplex == FULL_DUPLEX) - ifmr->ifm_active |= IFM_FDX; + switch (adapter->link_speed) { + case 10: + ifmr->ifm_active |= IFM_10_T; + break; + case 100: + /* + ** Support for 100Mb SFP - these are Fiber + ** but the media type appears as serdes + */ + if (adapter->hw.phy.media_type == + e1000_media_type_internal_serdes) + ifmr->ifm_active |= IFM_100_FX; else - ifmr->ifm_active |= IFM_HDX; + ifmr->ifm_active |= IFM_100_TX; + break; + case 1000: + ifmr->ifm_active |= IFM_1000_T; + break; } + + if (adapter->link_duplex == FULL_DUPLEX) + ifmr->ifm_active |= IFM_FDX; + else + ifmr->ifm_active |= IFM_HDX; + IGB_CORE_UNLOCK(adapter); } @@ -1661,10 +1791,8 @@ igb_media_change(struct ifnet *ifp) /********************************************************************* * * This routine maps the mbufs to Advanced TX descriptors. - * used by the 82575 adapter. * **********************************************************************/ - static int igb_xmit(struct tx_ring *txr, struct mbuf **m_headp) { @@ -1673,35 +1801,123 @@ igb_xmit(struct tx_ring *txr, struct mbuf **m_headp) bus_dmamap_t map; struct igb_tx_buffer *tx_buffer, *tx_buffer_mapped; union e1000_adv_tx_desc *txd = NULL; - struct mbuf *m_head; - u32 olinfo_status = 0, cmd_type_len = 0; - int nsegs, i, j, error, first, last = 0; - u32 hdrlen = 0; - - m_head = *m_headp; - + struct mbuf *m_head = *m_headp; + struct ether_vlan_header *eh = NULL; + struct ip *ip = NULL; + struct tcphdr *th = NULL; + u32 hdrlen, cmd_type_len, olinfo_status = 0; + int ehdrlen, poff; + int nsegs, i, first, last = 0; + int error, do_tso, remap = 1; /* Set basic descriptor constants */ - cmd_type_len |= E1000_ADVTXD_DTYP_DATA; + cmd_type_len = E1000_ADVTXD_DTYP_DATA; cmd_type_len |= E1000_ADVTXD_DCMD_IFCS | E1000_ADVTXD_DCMD_DEXT; if (m_head->m_flags & M_VLANTAG) cmd_type_len |= E1000_ADVTXD_DCMD_VLE; - /* - * Force a cleanup if number of TX descriptors - * available hits the threshold - */ - if (txr->tx_avail <= IGB_TX_CLEANUP_THRESHOLD) { - igb_txeof(txr); - /* Now do we at least have a minimal? */ - if (txr->tx_avail <= IGB_TX_OP_THRESHOLD) { - txr->no_desc_avail++; +retry: + m_head = *m_headp; + do_tso = ((m_head->m_pkthdr.csum_flags & CSUM_TSO) != 0); + hdrlen = ehdrlen = poff = 0; + + /* + * Intel recommends entire IP/TCP header length reside in a single + * buffer. If multiple descriptors are used to describe the IP and + * TCP header, each descriptor should describe one or more + * complete headers; descriptors referencing only parts of headers + * are not supported. If all layer headers are not coalesced into + * a single buffer, each buffer should not cross a 4KB boundary, + * or be larger than the maximum read request size. + * Controller also requires modifing IP/TCP header to make TSO work + * so we firstly get a writable mbuf chain then coalesce ethernet/ + * IP/TCP header into a single buffer to meet the requirement of + * controller. This also simplifies IP/TCP/UDP checksum offloading + * which also has similiar restrictions. + */ + if (do_tso || m_head->m_pkthdr.csum_flags & CSUM_OFFLOAD) { + if (do_tso || (m_head->m_next != NULL && + m_head->m_pkthdr.csum_flags & CSUM_OFFLOAD)) { + if (M_WRITABLE(*m_headp) == 0) { + m_head = m_dup(*m_headp, M_NOWAIT); + m_freem(*m_headp); + if (m_head == NULL) { + *m_headp = NULL; + return (ENOBUFS); + } + *m_headp = m_head; + } + } + /* + * Assume IPv4, we don't have TSO/checksum offload support + * for IPv6 yet. + */ + ehdrlen = sizeof(struct ether_header); + m_head = m_pullup(m_head, ehdrlen); + if (m_head == NULL) { + *m_headp = NULL; return (ENOBUFS); } + eh = mtod(m_head, struct ether_vlan_header *); + if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { + ehdrlen = sizeof(struct ether_vlan_header); + m_head = m_pullup(m_head, ehdrlen); + if (m_head == NULL) { + *m_headp = NULL; + return (ENOBUFS); + } + } + m_head = m_pullup(m_head, ehdrlen + sizeof(struct ip)); + if (m_head == NULL) { + *m_headp = NULL; + return (ENOBUFS); + } + ip = (struct ip *)(mtod(m_head, char *) + ehdrlen); + poff = ehdrlen + (ip->ip_hl << 2); + if (do_tso) { + m_head = m_pullup(m_head, poff + sizeof(struct tcphdr)); + if (m_head == NULL) { + *m_headp = NULL; + return (ENOBUFS); + } + /* + * The pseudo TCP checksum does not include TCP payload + * length so driver should recompute the checksum here + * what hardware expect to see. This is adherence of + * Microsoft's Large Send specification. + */ + th = (struct tcphdr *)(mtod(m_head, char *) + poff); + th->th_sum = in_pseudo(ip->ip_src.s_addr, + ip->ip_dst.s_addr, htons(IPPROTO_TCP)); + /* Keep track of the full header length */ + hdrlen = poff + (th->th_off << 2); + } else if (m_head->m_pkthdr.csum_flags & CSUM_TCP) { + m_head = m_pullup(m_head, poff + sizeof(struct tcphdr)); + if (m_head == NULL) { + *m_headp = NULL; + return (ENOBUFS); + } + th = (struct tcphdr *)(mtod(m_head, char *) + poff); + m_head = m_pullup(m_head, poff + (th->th_off << 2)); + if (m_head == NULL) { + *m_headp = NULL; + return (ENOBUFS); + } + ip = (struct ip *)(mtod(m_head, char *) + ehdrlen); + th = (struct tcphdr *)(mtod(m_head, char *) + poff); + } else if (m_head->m_pkthdr.csum_flags & CSUM_UDP) { + m_head = m_pullup(m_head, poff + sizeof(struct udphdr)); + if (m_head == NULL) { + *m_headp = NULL; + return (ENOBUFS); + } + ip = (struct ip *)(mtod(m_head, char *) + ehdrlen); + } + *m_headp = m_head; } /* - * Map the packet for DMA. + * Map the packet for DMA * * Capture the first descriptor index, * this descriptor will have the index @@ -1716,10 +1932,19 @@ igb_xmit(struct tx_ring *txr, struct mbuf **m_headp) error = bus_dmamap_load_mbuf_sg(txr->txtag, map, *m_headp, segs, &nsegs, BUS_DMA_NOWAIT); - if (error == EFBIG) { + /* + * There are two types of errors we can (try) to handle: + * - EFBIG means the mbuf chain was too long and bus_dma ran + * out of segments. Defragment the mbuf chain and try again. + * - ENOMEM means bus_dma could not obtain enough bounce buffers + * at this point in time. Defer sending and try again later. + * All other errors, in particular EINVAL, are fatal and prevent the + * mbuf chain from ever going through. Drop it and report error. + */ + if (error == EFBIG && remap) { struct mbuf *m; - m = m_defrag(*m_headp, M_DONTWAIT); + m = m_defrag(*m_headp, M_NOWAIT); if (m == NULL) { adapter->mbuf_defrag_failed++; m_freem(*m_headp); @@ -1728,19 +1953,9 @@ igb_xmit(struct tx_ring *txr, struct mbuf **m_headp) } *m_headp = m; - /* Try it again */ - error = bus_dmamap_load_mbuf_sg(txr->txtag, map, - *m_headp, segs, &nsegs, BUS_DMA_NOWAIT); - - if (error == ENOMEM) { - adapter->no_tx_dma_setup++; - return (error); - } else if (error != 0) { - adapter->no_tx_dma_setup++; - m_freem(*m_headp); - *m_headp = NULL; - return (error); - } + /* Try it again, but only once */ + remap = 0; + goto retry; } else if (error == ENOMEM) { adapter->no_tx_dma_setup++; return (error); @@ -1751,29 +1966,35 @@ igb_xmit(struct tx_ring *txr, struct mbuf **m_headp) return (error); } - /* Check again to be sure we have enough descriptors */ - if (nsegs > (txr->tx_avail - 2)) { + /* + ** Make sure we don't overrun the ring, + ** we need nsegs descriptors and one for + ** the context descriptor used for the + ** offloads. + */ + if ((nsegs + 1) > (txr->tx_avail - 2)) { txr->no_desc_avail++; bus_dmamap_unload(txr->txtag, map); return (ENOBUFS); } m_head = *m_headp; - /* - * Set up the context descriptor: - * used when any hardware offload is done. - * This includes CSUM, VLAN, and TSO. It - * will use the first descriptor. + /* Do hardware assists: + * Set up the context descriptor, used + * when any hardware offload is done. + * This includes CSUM, VLAN, and TSO. + * It will use the first descriptor. */ - if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { - if (igb_tso_setup(txr, m_head, &hdrlen)) { + + if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { + if (igb_tso_setup(txr, m_head, ehdrlen, ip, th)) { cmd_type_len |= E1000_ADVTXD_DCMD_TSE; olinfo_status |= E1000_TXD_POPTS_IXSM << 8; olinfo_status |= E1000_TXD_POPTS_TXSM << 8; } else - return (ENXIO); + return (ENXIO); } else if (igb_tx_ctx_setup(txr, m_head)) - olinfo_status |= E1000_TXD_POPTS_TXSM << 8; + olinfo_status |= E1000_TXD_POPTS_TXSM << 8; /* Calculate payload length */ olinfo_status |= ((m_head->m_pkthdr.len - hdrlen) @@ -1785,7 +2006,7 @@ igb_xmit(struct tx_ring *txr, struct mbuf **m_headp) /* Set up our transmit descriptors */ i = txr->next_avail_desc; - for (j = 0; j < nsegs; j++) { + for (int j = 0; j < nsegs; j++) { bus_size_t seg_len; bus_addr_t seg_addr; @@ -1806,8 +2027,14 @@ igb_xmit(struct tx_ring *txr, struct mbuf **m_headp) txr->next_avail_desc = i; txr->tx_avail -= nsegs; - tx_buffer->m_head = m_head; + + /* + ** Here we swap the map so the last descriptor, + ** which gets the completion interrupt has the + ** real map, and the first descriptor gets the + ** unused map from this descriptor. + */ tx_buffer_mapped->map = tx_buffer->map; tx_buffer->map = map; bus_dmamap_sync(txr->txtag, map, BUS_DMASYNC_PREWRITE); @@ -1825,6 +2052,7 @@ igb_xmit(struct tx_ring *txr, struct mbuf **m_headp) */ tx_buffer = &txr->tx_buffers[first]; tx_buffer->next_eop = last; + /* Update the watchdog time early and often */ txr->watchdog_time = ticks; /* @@ -1837,9 +2065,7 @@ igb_xmit(struct tx_ring *txr, struct mbuf **m_headp) ++txr->tx_packets; return (0); - } - static void igb_set_promisc(struct adapter *adapter) { @@ -1847,7 +2073,7 @@ igb_set_promisc(struct adapter *adapter) struct e1000_hw *hw = &adapter->hw; u32 reg; - if (hw->mac.type == e1000_vfadapt) { + if (adapter->vf_ifp) { e1000_promisc_set_vf(hw, e1000_promisc_enabled); return; } @@ -1869,7 +2095,7 @@ igb_disable_promisc(struct adapter *adapter) struct e1000_hw *hw = &adapter->hw; u32 reg; - if (hw->mac.type == e1000_vfadapt) { + if (adapter->vf_ifp) { e1000_promisc_set_vf(hw, e1000_promisc_disabled); return; } @@ -1946,7 +2172,10 @@ igb_local_timer(void *arg) { struct adapter *adapter = arg; device_t dev = adapter->dev; + struct ifnet *ifp = adapter->ifp; struct tx_ring *txr = adapter->tx_rings; + struct igb_queue *que = adapter->queues; + int hung = 0, busy = 0; IGB_CORE_LOCK_ASSERT(adapter); @@ -1954,23 +2183,34 @@ igb_local_timer(void *arg) igb_update_link_status(adapter); igb_update_stats_counters(adapter); - /* - ** If flow control has paused us since last checking - ** it invalidates the watchdog timing, so dont run it. - */ - if (adapter->pause_frames) { - adapter->pause_frames = 0; - goto out; - } - /* - ** Watchdog: check for time since any descriptor was cleaned + ** Check the TX queues status + ** - central locked handling of OACTIVE + ** - watchdog only if all queues show hung */ - for (int i = 0; i < adapter->num_queues; i++, txr++) - if (txr->queue_status == IGB_QUEUE_HUNG) - goto timeout; -out: + for (int i = 0; i < adapter->num_queues; i++, que++, txr++) { + if ((txr->queue_status & IGB_QUEUE_HUNG) && + (adapter->pause_frames == 0)) + ++hung; + if (txr->queue_status & IGB_QUEUE_DEPLETED) + ++busy; + if ((txr->queue_status & IGB_QUEUE_IDLE) == 0) + taskqueue_enqueue(que->tq, &que->que_task); + } + if (hung == adapter->num_queues) + goto timeout; + if (busy == adapter->num_queues) + ifp->if_drv_flags |= IFF_DRV_OACTIVE; + else if ((ifp->if_drv_flags & IFF_DRV_OACTIVE) && + (busy < adapter->num_queues)) + ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; + + adapter->pause_frames = 0; callout_reset(&adapter->timer, hz, igb_local_timer, adapter); +#ifndef DEVICE_POLLING + /* Schedule all queue interrupts - deadlock protection */ + E1000_WRITE_REG(&adapter->hw, E1000_EICS, adapter->que_mask); +#endif return; timeout: @@ -1989,11 +2229,15 @@ timeout: static void igb_update_link_status(struct adapter *adapter) { - struct e1000_hw *hw = &adapter->hw; - struct ifnet *ifp = adapter->ifp; - device_t dev = adapter->dev; - struct tx_ring *txr = adapter->tx_rings; - u32 link_check = 0; + struct e1000_hw *hw = &adapter->hw; + struct e1000_fc_info *fc = &hw->fc; + struct ifnet *ifp = adapter->ifp; + device_t dev = adapter->dev; + struct tx_ring *txr = adapter->tx_rings; + u32 link_check, thstat, ctrl; + char *flowctl = NULL; + + link_check = thstat = ctrl = 0; /* Get the cached link value or read for real */ switch (hw->phy.media_type) { @@ -2023,17 +2267,44 @@ igb_update_link_status(struct adapter *adapter) break; } + /* Check for thermal downshift or shutdown */ + if (hw->mac.type == e1000_i350) { + thstat = E1000_READ_REG(hw, E1000_THSTAT); + ctrl = E1000_READ_REG(hw, E1000_CTRL_EXT); + } + + /* Get the flow control for display */ + switch (fc->current_mode) { + case e1000_fc_rx_pause: + flowctl = "RX"; + break; + case e1000_fc_tx_pause: + flowctl = "TX"; + break; + case e1000_fc_full: + flowctl = "Full"; + break; + case e1000_fc_none: + default: + flowctl = "None"; + break; + } + /* Now we check if a transition has happened */ if (link_check && (adapter->link_active == 0)) { e1000_get_speed_and_duplex(&adapter->hw, &adapter->link_speed, &adapter->link_duplex); if (bootverbose) - device_printf(dev, "Link is up %d Mbps %s\n", + device_printf(dev, "Link is up %d Mbps %s," + " Flow Control: %s\n", adapter->link_speed, ((adapter->link_duplex == FULL_DUPLEX) ? - "Full Duplex" : "Half Duplex")); + "Full Duplex" : "Half Duplex"), flowctl); adapter->link_active = 1; ifp->if_baudrate = adapter->link_speed * 1000000; + if ((ctrl & E1000_CTRL_EXT_LINK_MODE_GMII) && + (thstat & E1000_THSTAT_LINK_THROTTLE)) + device_printf(dev, "Link: thermal downshift\n"); /* This can sleep */ if_link_state_change(ifp, LINK_STATE_UP); } else if (!link_check && (adapter->link_active == 1)) { @@ -2041,10 +2312,13 @@ igb_update_link_status(struct adapter *adapter) adapter->link_duplex = 0; if (bootverbose) device_printf(dev, "Link is Down\n"); + if ((ctrl & E1000_CTRL_EXT_LINK_MODE_GMII) && + (thstat & E1000_THSTAT_PWR_DOWN)) + device_printf(dev, "Link: thermal shutdown\n"); adapter->link_active = 0; /* This can sleep */ if_link_state_change(ifp, LINK_STATE_DOWN); - /* Turn off watchdogs */ + /* Reset queue state */ for (int i = 0; i < adapter->num_queues; i++, txr++) txr->queue_status = IGB_QUEUE_IDLE; } @@ -2073,9 +2347,10 @@ igb_stop(void *arg) callout_stop(&adapter->timer); /* Tell the stack that the interface is no longer active */ - ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); + ifp->if_drv_flags &= ~IFF_DRV_RUNNING; + ifp->if_drv_flags |= IFF_DRV_OACTIVE; - /* Unarm watchdog timer. */ + /* Disarm watchdog timer. */ for (int i = 0; i < adapter->num_queues; i++, txr++) { IGB_TX_LOCK(txr); txr->queue_status = IGB_QUEUE_IDLE; @@ -2123,6 +2398,13 @@ igb_identify_hardware(struct adapter *adapter) /* Set MAC type early for PCI setup */ e1000_set_mac_type(&adapter->hw); + + /* Are we a VF device? */ + if ((adapter->hw.mac.type == e1000_vfadapt) || + (adapter->hw.mac.type == e1000_vfadapt_i350)) + adapter->vf_ifp = 1; + else + adapter->vf_ifp = 0; } static int @@ -2163,6 +2445,7 @@ igb_allocate_legacy(struct adapter *adapter) { device_t dev = adapter->dev; struct igb_queue *que = adapter->queues; + struct tx_ring *txr = adapter->tx_rings; int error, rid = 0; /* Turn off all interrupts */ @@ -2181,6 +2464,10 @@ igb_allocate_legacy(struct adapter *adapter) return (ENXIO); } +#if __FreeBSD_version >= 800000 + TASK_INIT(&txr->txq_task, 0, igb_deferred_mq_start, txr); +#endif + /* * Try allocating a fast interrupt and the associated deferred * processing contexts. @@ -2218,6 +2505,9 @@ igb_allocate_msix(struct adapter *adapter) struct igb_queue *que = adapter->queues; int error, rid, vector = 0; + /* Be sure to start with all interrupts disabled */ + E1000_WRITE_REG(&adapter->hw, E1000_IMC, ~0); + E1000_WRITE_FLUSH(&adapter->hw); for (int i = 0; i < adapter->num_queues; i++, vector++, que++) { rid = vector +1; @@ -2249,11 +2539,22 @@ igb_allocate_msix(struct adapter *adapter) ** Bind the msix vector, and thus the ** rings to the corresponding cpu. */ - if (adapter->num_queues > 1) - bus_bind_intr(dev, que->res, i); + if (adapter->num_queues > 1) { + if (igb_last_bind_cpu < 0) + igb_last_bind_cpu = CPU_FIRST(); + bus_bind_intr(dev, que->res, igb_last_bind_cpu); + device_printf(dev, + "Bound queue %d to cpu %d\n", + i,igb_last_bind_cpu); + igb_last_bind_cpu = CPU_NEXT(igb_last_bind_cpu); + } +#if __FreeBSD_version >= 800000 + TASK_INIT(&que->txr->txq_task, 0, igb_deferred_mq_start, + que->txr); +#endif /* Make tasklet for deferred handling */ TASK_INIT(&que->que_task, 0, igb_handle_que, que); - que->tq = taskqueue_create_fast("igb_que", M_NOWAIT, + que->tq = taskqueue_create("igb_que", M_NOWAIT, taskqueue_thread_enqueue, &que->tq); taskqueue_start_threads(&que->tq, 1, PI_NET, "%s que", device_get_nameunit(adapter->dev)); @@ -2292,7 +2593,7 @@ igb_configure_queues(struct adapter *adapter) u32 tmp, ivar = 0, newitr = 0; /* First turn on RSS capability */ - if (adapter->hw.mac.type > e1000_82575) + if (adapter->hw.mac.type != e1000_82575) E1000_WRITE_REG(hw, E1000_GPIE, E1000_GPIE_MSIX_MODE | E1000_GPIE_EIAME | E1000_GPIE_PBA | E1000_GPIE_NSICR); @@ -2300,7 +2601,11 @@ igb_configure_queues(struct adapter *adapter) /* Turn on MSIX */ switch (adapter->hw.mac.type) { case e1000_82580: + case e1000_i350: + case e1000_i210: + case e1000_i211: case e1000_vfadapt: + case e1000_vfadapt_i350: /* RX entries */ for (int i = 0; i < adapter->num_queues; i++) { u32 index = i >> 1; @@ -2328,13 +2633,12 @@ igb_configure_queues(struct adapter *adapter) ivar |= (que->msix | E1000_IVAR_VALID) << 8; } E1000_WRITE_REG_ARRAY(hw, E1000_IVAR0, index, ivar); - adapter->eims_mask |= que->eims; + adapter->que_mask |= que->eims; } /* And for the link interrupt */ ivar = (adapter->linkvec | E1000_IVAR_VALID) << 8; adapter->link_mask = 1 << adapter->linkvec; - adapter->eims_mask |= adapter->link_mask; E1000_WRITE_REG(hw, E1000_IVAR_MISC, ivar); break; case e1000_82576: @@ -2351,7 +2655,7 @@ igb_configure_queues(struct adapter *adapter) ivar |= (que->msix | E1000_IVAR_VALID) << 16; } E1000_WRITE_REG_ARRAY(hw, E1000_IVAR0, index, ivar); - adapter->eims_mask |= que->eims; + adapter->que_mask |= que->eims; } /* TX entries */ for (int i = 0; i < adapter->num_queues; i++) { @@ -2366,13 +2670,12 @@ igb_configure_queues(struct adapter *adapter) ivar |= (que->msix | E1000_IVAR_VALID) << 24; } E1000_WRITE_REG_ARRAY(hw, E1000_IVAR0, index, ivar); - adapter->eims_mask |= que->eims; + adapter->que_mask |= que->eims; } /* And for the link interrupt */ ivar = (adapter->linkvec | E1000_IVAR_VALID) << 8; adapter->link_mask = 1 << adapter->linkvec; - adapter->eims_mask |= adapter->link_mask; E1000_WRITE_REG(hw, E1000_IVAR_MISC, ivar); break; @@ -2393,14 +2696,13 @@ igb_configure_queues(struct adapter *adapter) que->eims = tmp; E1000_WRITE_REG_ARRAY(hw, E1000_MSIXBM(0), i, que->eims); - adapter->eims_mask |= que->eims; + adapter->que_mask |= que->eims; } /* Link */ E1000_WRITE_REG(hw, E1000_MSIXBM(adapter->linkvec), E1000_EIMS_OTHER); adapter->link_mask |= E1000_EIMS_OTHER; - adapter->eims_mask |= adapter->link_mask; default: break; } @@ -2461,13 +2763,24 @@ igb_free_pci_resources(struct adapter *adapter) else (adapter->msix != 0) ? (rid = 1):(rid = 0); + que = adapter->queues; if (adapter->tag != NULL) { + taskqueue_drain(que->tq, &adapter->link_task); bus_teardown_intr(dev, adapter->res, adapter->tag); adapter->tag = NULL; } if (adapter->res != NULL) bus_release_resource(dev, SYS_RES_IRQ, rid, adapter->res); + for (int i = 0; i < adapter->num_queues; i++, que++) { + if (que->tq != NULL) { +#if __FreeBSD_version >= 800000 + taskqueue_drain(que->tq, &que->txr->txq_task); +#endif + taskqueue_drain(que->tq, &que->que_task); + taskqueue_free(que->tq); + } + } mem: if (adapter->msix) pci_release_msi(dev); @@ -2489,7 +2802,7 @@ static int igb_setup_msix(struct adapter *adapter) { device_t dev = adapter->dev; - int rid, want, queues, msgs; + int rid, want, queues, msgs, maxqueues; /* tuneable override */ if (igb_enable_msix == 0) @@ -2520,16 +2833,29 @@ igb_setup_msix(struct adapter *adapter) /* Manual override */ if (igb_num_queues != 0) queues = igb_num_queues; - if (queues > 8) /* max queues */ - queues = 8; - /* Can have max of 4 queues on 82575 */ - if ((adapter->hw.mac.type == e1000_82575) && (queues > 4)) - queues = 4; - - /* Limit the VF adapter to one queue */ - if (adapter->hw.mac.type == e1000_vfadapt) - queues = 1; + /* Sanity check based on HW */ + switch (adapter->hw.mac.type) { + case e1000_82575: + maxqueues = 4; + break; + case e1000_82576: + case e1000_82580: + case e1000_i350: + maxqueues = 8; + break; + case e1000_i210: + maxqueues = 4; + break; + case e1000_i211: + maxqueues = 2; + break; + default: /* VF interfaces */ + maxqueues = 1; + break; + } + if (queues > maxqueues) + queues = maxqueues; /* ** One vector (RX/TX pair) per queue @@ -2543,7 +2869,7 @@ igb_setup_msix(struct adapter *adapter) "MSIX Configuration Problem, " "%d vectors configured, but %d queues wanted!\n", msgs, want); - return (ENXIO); + return (0); } if ((msgs) && pci_alloc_msix(dev, &msgs) == 0) { device_printf(adapter->dev, @@ -2553,9 +2879,11 @@ igb_setup_msix(struct adapter *adapter) } msi: msgs = pci_msi_count(dev); - if (msgs == 1 && pci_alloc_msi(dev, &msgs) == 0) - device_printf(adapter->dev,"Using MSI interrupt\n"); - return (msgs); + if (msgs == 1 && pci_alloc_msi(dev, &msgs) == 0) { + device_printf(adapter->dev," Using MSI interrupt\n"); + return (msgs); + } + return (0); } /********************************************************************* @@ -2589,10 +2917,18 @@ igb_reset(struct adapter *adapter) break; case e1000_82576: case e1000_vfadapt: - pba = E1000_PBA_64K; + pba = E1000_READ_REG(hw, E1000_RXPBS); + pba &= E1000_RXPBS_SIZE_MASK_82576; break; case e1000_82580: - pba = E1000_PBA_35K; + case e1000_i350: + case e1000_vfadapt_i350: + pba = E1000_READ_REG(hw, E1000_RXPBS); + pba = e1000_rxpbs_adjust_82580(pba); + break; + case e1000_i210: + case e1000_i211: + pba = E1000_PBA_34K; default: break; } @@ -2647,14 +2983,10 @@ igb_reset(struct adapter *adapter) fc->pause_time = IGB_FC_PAUSE_TIME; fc->send_xon = TRUE; - - /* Set Flow control, use the tunable location if sane */ - if ((igb_fc_setting >= 0) && (igb_fc_setting < 4)) - fc->requested_mode = igb_fc_setting; + if (adapter->fc) + fc->requested_mode = adapter->fc; else - fc->requested_mode = e1000_fc_none; - - fc->current_mode = fc->requested_mode; + fc->requested_mode = e1000_fc_default; /* Issue a global reset */ e1000_reset_hw(hw); @@ -2663,28 +2995,48 @@ igb_reset(struct adapter *adapter) if (e1000_init_hw(hw) < 0) device_printf(dev, "Hardware Initialization Failed\n"); - if (hw->mac.type == e1000_82580) { - u32 reg; + /* Setup DMA Coalescing */ + if ((hw->mac.type > e1000_82580) && + (hw->mac.type != e1000_i211)) { + u32 dmac; + u32 reg = ~E1000_DMACR_DMAC_EN; - hwm = (pba << 10) - (2 * adapter->max_frame_size); - /* - * 0x80000000 - enable DMA COAL - * 0x10000000 - use L0s as low power - * 0x20000000 - use L1 as low power - * X << 16 - exit dma coal when rx data exceeds X kB - * Y - upper limit to stay in dma coal in units of 32usecs - */ - E1000_WRITE_REG(hw, E1000_DMACR, - 0xA0000006 | ((hwm << 6) & 0x00FF0000)); + if (adapter->dmac == 0) { /* Disabling it */ + E1000_WRITE_REG(hw, E1000_DMACR, reg); + goto reset_out; + } - /* set hwm to PBA - 2 * max frame size */ - E1000_WRITE_REG(hw, E1000_FCRTC, hwm); - /* - * This sets the time to wait before requesting transition to - * low power state to number of usecs needed to receive 1 512 - * byte frame at gigabit line rate - */ - E1000_WRITE_REG(hw, E1000_DMCTLX, 4); + /* Set starting thresholds */ + E1000_WRITE_REG(hw, E1000_DMCTXTH, 0); + E1000_WRITE_REG(hw, E1000_DMCRTRH, 0); + + hwm = 64 * pba - adapter->max_frame_size / 16; + if (hwm < 64 * (pba - 6)) + hwm = 64 * (pba - 6); + reg = E1000_READ_REG(hw, E1000_FCRTC); + reg &= ~E1000_FCRTC_RTH_COAL_MASK; + reg |= ((hwm << E1000_FCRTC_RTH_COAL_SHIFT) + & E1000_FCRTC_RTH_COAL_MASK); + E1000_WRITE_REG(hw, E1000_FCRTC, reg); + + + dmac = pba - adapter->max_frame_size / 512; + if (dmac < pba - 10) + dmac = pba - 10; + reg = E1000_READ_REG(hw, E1000_DMACR); + reg &= ~E1000_DMACR_DMACTHR_MASK; + reg = ((dmac << E1000_DMACR_DMACTHR_SHIFT) + & E1000_DMACR_DMACTHR_MASK); + /* transition to L0x or L1 if available..*/ + reg |= (E1000_DMACR_DMAC_EN | E1000_DMACR_DMAC_LX_MASK); + /* timer = value in adapter->dmac in 32usec intervals */ + reg |= (adapter->dmac >> 5); + E1000_WRITE_REG(hw, E1000_DMACR, reg); + + /* Set the interval before transition */ + reg = E1000_READ_REG(hw, E1000_DMCTLX); + reg |= 0x80000004; + E1000_WRITE_REG(hw, E1000_DMCTLX, reg); /* free space in tx packet buffer to wake from DMA coal */ E1000_WRITE_REG(hw, E1000_DMCTXTH, @@ -2692,10 +3044,18 @@ igb_reset(struct adapter *adapter) /* make low power state decision controlled by DMA coal */ reg = E1000_READ_REG(hw, E1000_PCIEMISC); + reg &= ~E1000_PCIEMISC_LX_DECISION; + E1000_WRITE_REG(hw, E1000_PCIEMISC, reg); + device_printf(dev, "DMA Coalescing enabled\n"); + + } else if (hw->mac.type == e1000_82580) { + u32 reg = E1000_READ_REG(hw, E1000_PCIEMISC); + E1000_WRITE_REG(hw, E1000_DMACR, 0); E1000_WRITE_REG(hw, E1000_PCIEMISC, - reg | E1000_PCIEMISC_LX_DECISION); + reg & ~E1000_PCIEMISC_LX_DECISION); } +reset_out: E1000_WRITE_REG(&adapter->hw, E1000_VET, ETHERTYPE_VLAN); e1000_get_phy_info(hw); e1000_check_for_link(hw); @@ -2720,19 +3080,19 @@ igb_setup_interface(device_t dev, struct adapter *adapter) return (-1); } if_initname(ifp, device_get_name(dev), device_get_unit(dev)); - ifp->if_mtu = ETHERMTU; ifp->if_init = igb_init; ifp->if_softc = adapter; ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_ioctl = igb_ioctl; - ifp->if_start = igb_start; #if __FreeBSD_version >= 800000 ifp->if_transmit = igb_mq_start; ifp->if_qflush = igb_qflush; -#endif +#else + ifp->if_start = igb_start; IFQ_SET_MAXLEN(&ifp->if_snd, adapter->num_tx_desc - 1); ifp->if_snd.ifq_drv_maxlen = adapter->num_tx_desc - 1; IFQ_SET_READY(&ifp->if_snd); +#endif ether_ifattach(ifp, adapter->hw.mac.addr); @@ -2755,15 +3115,19 @@ igb_setup_interface(device_t dev, struct adapter *adapter) * support full VLAN capability. */ ifp->if_data.ifi_hdrlen = sizeof(struct ether_vlan_header); - ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU; - ifp->if_capenable |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU; + ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING + | IFCAP_VLAN_HWTSO + | IFCAP_VLAN_MTU; + ifp->if_capenable |= IFCAP_VLAN_HWTAGGING + | IFCAP_VLAN_HWTSO + | IFCAP_VLAN_MTU; /* - ** Dont turn this on by default, if vlans are + ** Don't turn this on by default, if vlans are ** created on another pseudo device (eg. lagg) ** then vlan events are not passed thru, breaking ** operation, but with HW FILTER off it works. If - ** using vlans directly on the em driver you can + ** using vlans directly on the igb driver you can ** enable this and get full hardware tag filtering. */ ifp->if_capabilities |= IFCAP_VLAN_HWFILTER; @@ -2837,7 +3201,7 @@ igb_dma_malloc(struct adapter *adapter, bus_size_t size, } error = bus_dmamem_alloc(dma->dma_tag, (void**) &dma->dma_vaddr, - BUS_DMA_NOWAIT, &dma->dma_map); + BUS_DMA_NOWAIT | BUS_DMA_COHERENT, &dma->dma_map); if (error) { device_printf(adapter->dev, "%s: bus_dmamem_alloc(%ju) failed: %d\n", @@ -2966,7 +3330,7 @@ igb_allocate_queues(struct adapter *adapter) } #if __FreeBSD_version >= 800000 /* Allocate a buf ring */ - txr->br = buf_ring_alloc(IGB_BR_SIZE, M_DEVBUF, + txr->br = buf_ring_alloc(igb_buf_ring_size, M_DEVBUF, M_WAITOK, &txr->tx_mtx); #endif } @@ -3105,9 +3469,16 @@ igb_setup_transmit_ring(struct tx_ring *txr) struct adapter *adapter = txr->adapter; struct igb_tx_buffer *txbuf; int i; +#ifdef DEV_NETMAP + struct netmap_adapter *na = NA(adapter->ifp); + struct netmap_slot *slot; +#endif /* DEV_NETMAP */ /* Clear the old descriptor contents */ IGB_TX_LOCK(txr); +#ifdef DEV_NETMAP + slot = netmap_reset(na, NR_TX, txr->me, 0); +#endif /* DEV_NETMAP */ bzero((void *)txr->tx_base, (sizeof(union e1000_adv_tx_desc)) * adapter->num_tx_desc); /* Reset indices */ @@ -3124,6 +3495,13 @@ igb_setup_transmit_ring(struct tx_ring *txr) m_freem(txbuf->m_head); txbuf->m_head = NULL; } +#ifdef DEV_NETMAP + if (slot) { + int si = netmap_idx_n2k(&na->tx_rings[txr->me], i); + /* no need to set the address */ + netmap_load_map(txr->txtag, txbuf->map, NMB(slot + si)); + } +#endif /* DEV_NETMAP */ /* clear the watch index */ txbuf->next_eop = -1; } @@ -3195,7 +3573,7 @@ igb_initialize_transmit_units(struct adapter *adapter) E1000_WRITE_REG(hw, E1000_TXDCTL(i), txdctl); } - if (adapter->hw.mac.type == e1000_vfadapt) + if (adapter->vf_ifp) return; e1000_config_collision_dist(hw); @@ -3289,8 +3667,9 @@ igb_free_transmit_buffers(struct tx_ring *txr) * Setup work for hardware segmentation offload (TSO) * **********************************************************************/ -static boolean_t -igb_tso_setup(struct tx_ring *txr, struct mbuf *mp, u32 *hdrlen) +static bool +igb_tso_setup(struct tx_ring *txr, struct mbuf *mp, int ehdrlen, + struct ip *ip, struct tcphdr *th) { struct adapter *adapter = txr->adapter; struct e1000_adv_tx_context_desc *TXD; @@ -3298,45 +3677,15 @@ igb_tso_setup(struct tx_ring *txr, struct mbuf *mp, u32 *hdrlen) u32 vlan_macip_lens = 0, type_tucmd_mlhl = 0; u32 mss_l4len_idx = 0; u16 vtag = 0; - int ctxd, ehdrlen, ip_hlen, tcp_hlen; - struct ether_vlan_header *eh; - struct ip *ip; - struct tcphdr *th; - - - /* - * Determine where frame payload starts. - * Jump over vlan headers if already present - */ - eh = mtod(mp, struct ether_vlan_header *); - if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) - ehdrlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; - else - ehdrlen = ETHER_HDR_LEN; - - /* Ensure we have at least the IP+TCP header in the first mbuf. */ - if (mp->m_len < ehdrlen + sizeof(struct ip) + sizeof(struct tcphdr)) - return FALSE; + int ctxd, ip_hlen, tcp_hlen; - /* Only supports IPV4 for now */ ctxd = txr->next_avail_desc; tx_buffer = &txr->tx_buffers[ctxd]; TXD = (struct e1000_adv_tx_context_desc *) &txr->tx_base[ctxd]; - ip = (struct ip *)(mp->m_data + ehdrlen); - if (ip->ip_p != IPPROTO_TCP) - return FALSE; /* 0 */ ip->ip_sum = 0; ip_hlen = ip->ip_hl << 2; - th = (struct tcphdr *)((caddr_t)ip + ip_hlen); - th->th_sum = in_pseudo(ip->ip_src.s_addr, - ip->ip_dst.s_addr, htons(IPPROTO_TCP)); tcp_hlen = th->th_off << 2; - /* - * Calculate header length, this is used - * in the transmit desc in igb_xmit - */ - *hdrlen = ehdrlen + ip_hlen + tcp_hlen; /* VLAN MACLEN IPLEN */ if (mp->m_flags & M_VLANTAG) { @@ -3519,6 +3868,19 @@ igb_txeof(struct tx_ring *txr) IGB_TX_LOCK_ASSERT(txr); +#ifdef DEV_NETMAP + if (ifp->if_capenable & IFCAP_NETMAP) { + struct netmap_adapter *na = NA(ifp); + + selwakeuppri(&na->tx_rings[txr->me].si, PI_NET); + IGB_TX_UNLOCK(txr); + IGB_CORE_LOCK(adapter); + selwakeuppri(&na->tx_si, PI_NET); + IGB_CORE_UNLOCK(adapter); + IGB_TX_LOCK(txr); + return FALSE; + } +#endif /* DEV_NETMAP */ if (txr->tx_avail == adapter->num_tx_desc) { txr->queue_status = IGB_QUEUE_IDLE; return FALSE; @@ -3598,25 +3960,23 @@ igb_txeof(struct tx_ring *txr) ** for too long indicates a hang. */ if ((!processed) && ((ticks - txr->watchdog_time) > IGB_WATCHDOG)) - txr->queue_status = IGB_QUEUE_HUNG; - + txr->queue_status |= IGB_QUEUE_HUNG; /* - * If we have enough room, clear IFF_DRV_OACTIVE - * to tell the stack that it is OK to send packets. + * If we have a minimum free, + * clear depleted state bit */ - if (txr->tx_avail > IGB_TX_CLEANUP_THRESHOLD) { - ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; - /* All clean, turn off the watchdog */ - if (txr->tx_avail == adapter->num_tx_desc) { - txr->queue_status = IGB_QUEUE_IDLE; - return (FALSE); - } + if (txr->tx_avail >= IGB_QUEUE_THRESHOLD) + txr->queue_status &= ~IGB_QUEUE_DEPLETED; + + /* All clean, turn off the watchdog */ + if (txr->tx_avail == adapter->num_tx_desc) { + txr->queue_status = IGB_QUEUE_IDLE; + return (FALSE); } return (TRUE); } - /********************************************************************* * * Refresh mbuf buffers for RX descriptor rings @@ -3634,17 +3994,25 @@ igb_refresh_mbufs(struct rx_ring *rxr, int limit) bus_dma_segment_t pseg[1]; struct igb_rx_buf *rxbuf; struct mbuf *mh, *mp; - int i, nsegs, error, cleaned; + int i, j, nsegs, error; + bool refreshed = FALSE; - i = rxr->next_to_refresh; - cleaned = -1; /* Signify no completions */ - while (i != limit) { + i = j = rxr->next_to_refresh; + /* + ** Get one descriptor beyond + ** our work mark to control + ** the loop. + */ + if (++j == adapter->num_rx_desc) + j = 0; + + while (j != limit) { rxbuf = &rxr->rx_buffers[i]; /* No hdr mbuf used with header split off */ if (rxr->hdr_split == FALSE) goto no_split; if (rxbuf->m_head == NULL) { - mh = m_gethdr(M_DONTWAIT, MT_DATA); + mh = m_gethdr(M_NOWAIT, MT_DATA); if (mh == NULL) goto update; } else @@ -3670,7 +4038,7 @@ igb_refresh_mbufs(struct rx_ring *rxr, int limit) htole64(hseg[0].ds_addr); no_split: if (rxbuf->m_pack == NULL) { - mp = m_getjcl(M_DONTWAIT, MT_DATA, + mp = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, adapter->rx_mbuf_sz); if (mp == NULL) goto update; @@ -3693,18 +4061,17 @@ no_split: BUS_DMASYNC_PREREAD); rxr->rx_base[i].read.pkt_addr = htole64(pseg[0].ds_addr); + refreshed = TRUE; /* I feel wefreshed :) */ - cleaned = i; - /* Calculate next index */ - if (++i == adapter->num_rx_desc) - i = 0; - /* This is the work marker for refresh */ + i = j; /* our next is precalculated */ rxr->next_to_refresh = i; + if (++j == adapter->num_rx_desc) + j = 0; } update: - if (cleaned != -1) /* If we refreshed some, bump tail */ + if (refreshed) /* update tail */ E1000_WRITE_REG(&adapter->hw, - E1000_RDT(rxr->me), cleaned); + E1000_RDT(rxr->me), rxr->next_to_refresh); return; } @@ -3796,12 +4163,11 @@ fail: static void igb_free_receive_ring(struct rx_ring *rxr) { - struct adapter *adapter; + struct adapter *adapter = rxr->adapter; struct igb_rx_buf *rxbuf; - int i; - adapter = rxr->adapter; - for (i = 0; i < adapter->num_rx_desc; i++) { + + for (int i = 0; i < adapter->num_rx_desc; i++) { rxbuf = &rxr->rx_buffers[i]; if (rxbuf->m_head != NULL) { bus_dmamap_sync(rxr->htag, rxbuf->hmap, @@ -3838,6 +4204,10 @@ igb_setup_receive_ring(struct rx_ring *rxr) bus_dma_segment_t pseg[1], hseg[1]; struct lro_ctrl *lro = &rxr->lro; int rsize, nsegs, error = 0; +#ifdef DEV_NETMAP + struct netmap_adapter *na = NA(rxr->adapter->ifp); + struct netmap_slot *slot; +#endif /* DEV_NETMAP */ adapter = rxr->adapter; dev = adapter->dev; @@ -3845,6 +4215,9 @@ igb_setup_receive_ring(struct rx_ring *rxr) /* Clear the ring contents */ IGB_RX_LOCK(rxr); +#ifdef DEV_NETMAP + slot = netmap_reset(na, NR_RX, rxr->me, 0); +#endif /* DEV_NETMAP */ rsize = roundup2(adapter->num_rx_desc * sizeof(union e1000_adv_rx_desc), IGB_DBA_ALIGN); bzero((void *)rxr->rx_base, rsize); @@ -3863,11 +4236,25 @@ igb_setup_receive_ring(struct rx_ring *rxr) struct mbuf *mh, *mp; rxbuf = &rxr->rx_buffers[j]; +#ifdef DEV_NETMAP + if (slot) { + /* slot sj is mapped to the i-th NIC-ring entry */ + int sj = netmap_idx_n2k(&na->rx_rings[rxr->me], j); + uint64_t paddr; + void *addr; + + addr = PNMB(slot + sj, &paddr); + netmap_load_map(rxr->ptag, rxbuf->pmap, addr); + /* Update descriptor */ + rxr->rx_base[j].read.pkt_addr = htole64(paddr); + continue; + } +#endif /* DEV_NETMAP */ if (rxr->hdr_split == FALSE) goto skip_head; /* First the header */ - rxbuf->m_head = m_gethdr(M_DONTWAIT, MT_DATA); + rxbuf->m_head = m_gethdr(M_NOWAIT, MT_DATA); if (rxbuf->m_head == NULL) { error = ENOBUFS; goto fail; @@ -3889,7 +4276,7 @@ igb_setup_receive_ring(struct rx_ring *rxr) skip_head: /* Now the payload cluster */ - rxbuf->m_pack = m_getjcl(M_DONTWAIT, MT_DATA, + rxbuf->m_pack = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, adapter->rx_mbuf_sz); if (rxbuf->m_pack == NULL) { error = ENOBUFS; @@ -3911,7 +4298,7 @@ skip_head: /* Setup our descriptor indices */ rxr->next_to_check = 0; - rxr->next_to_refresh = 0; + rxr->next_to_refresh = adapter->num_rx_desc - 1; rxr->lro_enabled = FALSE; rxr->rx_split_packets = 0; rxr->rx_bytes = 0; @@ -3949,6 +4336,7 @@ fail: return (error); } + /********************************************************************* * * Initialize all receive rings. @@ -3971,8 +4359,8 @@ fail: * the rings that completed, the failing case will have * cleaned up for itself. 'i' is the endpoint. */ - for (int j = 0; j > i; ++j) { - rxr = &adapter->rx_rings[i]; + for (int j = 0; j < i; ++j) { + rxr = &adapter->rx_rings[j]; IGB_RX_LOCK(rxr); igb_free_receive_ring(rxr); IGB_RX_UNLOCK(rxr); @@ -4006,7 +4394,7 @@ igb_initialize_receive_units(struct adapter *adapter) /* ** Set up for header split */ - if (rxr->hdr_split) { + if (igb_header_split) { /* Use a standard mbuf for the header */ srrctl |= IGB_HDR_BUF << E1000_SRRCTL_BSIZEHDRSIZE_SHIFT; srrctl |= E1000_SRRCTL_DESCTYPE_HDR_SPLIT_ALWAYS; @@ -4144,9 +4532,29 @@ igb_initialize_receive_units(struct adapter *adapter) * - needs to be after enable */ for (int i = 0; i < adapter->num_queues; i++) { - E1000_WRITE_REG(hw, E1000_RDH(i), 0); - E1000_WRITE_REG(hw, E1000_RDT(i), - adapter->num_rx_desc - 1); + rxr = &adapter->rx_rings[i]; + E1000_WRITE_REG(hw, E1000_RDH(i), rxr->next_to_check); +#ifdef DEV_NETMAP + /* + * an init() while a netmap client is active must + * preserve the rx buffers passed to userspace. + * In this driver it means we adjust RDT to + * somthing different from next_to_refresh + * (which is not used in netmap mode). + */ + if (ifp->if_capenable & IFCAP_NETMAP) { + struct netmap_adapter *na = NA(adapter->ifp); + struct netmap_kring *kring = &na->rx_rings[i]; + int t = rxr->next_to_refresh - kring->nr_hwavail; + + if (t >= adapter->num_rx_desc) + t -= adapter->num_rx_desc; + else if (t < 0) + t += adapter->num_rx_desc; + E1000_WRITE_REG(hw, E1000_RDT(i), t); + } else +#endif /* DEV_NETMAP */ + E1000_WRITE_REG(hw, E1000_RDT(i), rxr->next_to_refresh); } return; } @@ -4325,6 +4733,20 @@ igb_rxeof(struct igb_queue *que, int count, int *done) bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); +#ifdef DEV_NETMAP + if (ifp->if_capenable & IFCAP_NETMAP) { + struct netmap_adapter *na = NA(ifp); + + na->rx_rings[rxr->me].nr_kflags |= NKR_PENDINTR; + selwakeuppri(&na->rx_rings[rxr->me].si, PI_NET); + IGB_RX_UNLOCK(rxr); + IGB_CORE_LOCK(adapter); + selwakeuppri(&na->rx_si, PI_NET); + IGB_CORE_UNLOCK(adapter); + return (0); + } +#endif /* DEV_NETMAP */ + /* Main clean loop */ for (i = rxr->next_to_check; count != 0;) { struct mbuf *sendmp, *mh, *mp; @@ -4344,14 +4766,18 @@ igb_rxeof(struct igb_queue *que, int count, int *done) rxbuf = &rxr->rx_buffers[i]; plen = le16toh(cur->wb.upper.length); ptype = le32toh(cur->wb.lower.lo_dword.data) & IGB_PKTTYPE_MASK; - vtag = le16toh(cur->wb.upper.vlan); + if ((adapter->hw.mac.type == e1000_i350) && + (staterr & E1000_RXDEXT_STATERR_LB)) + vtag = be16toh(cur->wb.upper.vlan); + else + vtag = le16toh(cur->wb.upper.vlan); hdr = le16toh(cur->wb.lower.lo_dword.hs_rss.hdr_info); eop = ((staterr & E1000_RXD_STAT_EOP) == E1000_RXD_STAT_EOP); /* Make sure all segments of a bad packet are discarded */ if (((staterr & E1000_RXDEXT_ERR_FRAME_ERR_MASK) != 0) || (rxr->discard)) { - ifp->if_ierrors++; + adapter->dropped_pkts++; ++rxr->rx_discarded; if (!eop) /* Catch subsequent segs */ rxr->discard = TRUE; @@ -4479,10 +4905,8 @@ next_desc: } /* Catch any remainders */ - if (processed != 0) { + if (igb_rx_unrefreshed(rxr)) igb_refresh_mbufs(rxr, i); - processed = 0; - } rxr->next_to_check = i; @@ -4494,19 +4918,11 @@ next_desc: tcp_lro_flush(lro, queued); } - IGB_RX_UNLOCK(rxr); - if (done != NULL) - *done = rxdone; + *done += rxdone; - /* - ** We still have cleaning to do? - ** Schedule another interrupt if so. - */ - if ((staterr & E1000_RXD_STAT_DD) != 0) - return (TRUE); - - return (FALSE); + IGB_RX_UNLOCK(rxr); + return ((staterr & E1000_RXD_STAT_DD) ? TRUE : FALSE); } /********************************************************************* @@ -4581,9 +4997,9 @@ igb_register_vlan(void *arg, struct ifnet *ifp, u16 vtag) bit = vtag & 0x1F; adapter->shadow_vfta[index] |= (1 << bit); ++adapter->num_vlans; - /* Re-init to load the changes */ + /* Change hw filter setting */ if (ifp->if_capenable & IFCAP_VLAN_HWFILTER) - igb_init_locked(adapter); + igb_setup_vlan_hw_support(adapter); IGB_CORE_UNLOCK(adapter); } @@ -4608,9 +5024,9 @@ igb_unregister_vlan(void *arg, struct ifnet *ifp, u16 vtag) bit = vtag & 0x1F; adapter->shadow_vfta[index] &= ~(1 << bit); --adapter->num_vlans; - /* Re-init to load the changes */ + /* Change hw filter setting */ if (ifp->if_capenable & IFCAP_VLAN_HWFILTER) - igb_init_locked(adapter); + igb_setup_vlan_hw_support(adapter); IGB_CORE_UNLOCK(adapter); } @@ -4618,49 +5034,48 @@ static void igb_setup_vlan_hw_support(struct adapter *adapter) { struct e1000_hw *hw = &adapter->hw; + struct ifnet *ifp = adapter->ifp; u32 reg; - /* - ** We get here thru init_locked, meaning - ** a soft reset, this has already cleared - ** the VFTA and other state, so if there - ** have been no vlan's registered do nothing. - */ - if (adapter->num_vlans == 0) - return; + if (adapter->vf_ifp) { + e1000_rlpml_set_vf(hw, + adapter->max_frame_size + VLAN_TAG_SIZE); + return; + } + + reg = E1000_READ_REG(hw, E1000_CTRL); + reg |= E1000_CTRL_VME; + E1000_WRITE_REG(hw, E1000_CTRL, reg); + + /* Enable the Filter Table */ + if (ifp->if_capenable & IFCAP_VLAN_HWFILTER) { + reg = E1000_READ_REG(hw, E1000_RCTL); + reg &= ~E1000_RCTL_CFIEN; + reg |= E1000_RCTL_VFE; + E1000_WRITE_REG(hw, E1000_RCTL, reg); + } + /* Update the frame size */ + E1000_WRITE_REG(&adapter->hw, E1000_RLPML, + adapter->max_frame_size + VLAN_TAG_SIZE); + + /* Don't bother with table if no vlans */ + if ((adapter->num_vlans == 0) || + ((ifp->if_capenable & IFCAP_VLAN_HWFILTER) == 0)) + return; /* ** A soft reset zero's out the VFTA, so ** we need to repopulate it now. */ for (int i = 0; i < IGB_VFTA_SIZE; i++) if (adapter->shadow_vfta[i] != 0) { - if (hw->mac.type == e1000_vfadapt) + if (adapter->vf_ifp) e1000_vfta_set_vf(hw, adapter->shadow_vfta[i], TRUE); else - E1000_WRITE_REG_ARRAY(hw, E1000_VFTA, - i, adapter->shadow_vfta[i]); + e1000_write_vfta(hw, + i, adapter->shadow_vfta[i]); } - - if (hw->mac.type == e1000_vfadapt) - e1000_rlpml_set_vf(hw, - adapter->max_frame_size + VLAN_TAG_SIZE); - else { - reg = E1000_READ_REG(hw, E1000_CTRL); - reg |= E1000_CTRL_VME; - E1000_WRITE_REG(hw, E1000_CTRL, reg); - - /* Enable the Filter Table */ - reg = E1000_READ_REG(hw, E1000_RCTL); - reg &= ~E1000_RCTL_CFIEN; - reg |= E1000_RCTL_VFE; - E1000_WRITE_REG(hw, E1000_RCTL, reg); - - /* Update the frame size */ - E1000_WRITE_REG(&adapter->hw, E1000_RLPML, - adapter->max_frame_size + VLAN_TAG_SIZE); - } } static void @@ -4668,12 +5083,10 @@ igb_enable_intr(struct adapter *adapter) { /* With RSS set up what to auto clear */ if (adapter->msix_mem) { - E1000_WRITE_REG(&adapter->hw, E1000_EIAC, - adapter->eims_mask); - E1000_WRITE_REG(&adapter->hw, E1000_EIAM, - adapter->eims_mask); - E1000_WRITE_REG(&adapter->hw, E1000_EIMS, - adapter->eims_mask); + u32 mask = (adapter->que_mask | adapter->link_mask); + E1000_WRITE_REG(&adapter->hw, E1000_EIAC, mask); + E1000_WRITE_REG(&adapter->hw, E1000_EIAM, mask); + E1000_WRITE_REG(&adapter->hw, E1000_EIMS, mask); E1000_WRITE_REG(&adapter->hw, E1000_IMS, E1000_IMS_LSC); } else { @@ -4750,7 +5163,7 @@ igb_get_hw_control(struct adapter *adapter) { u32 ctrl_ext; - if (adapter->hw.mac.type == e1000_vfadapt) + if (adapter->vf_ifp) return; /* Let firmware know the driver has taken over */ @@ -4770,7 +5183,7 @@ igb_release_hw_control(struct adapter *adapter) { u32 ctrl_ext; - if (adapter->hw.mac.type == e1000_vfadapt) + if (adapter->vf_ifp) return; /* Let firmware taken over control of h/w */ @@ -4849,7 +5262,7 @@ igb_update_stats_counters(struct adapter *adapter) ** small controlled set of stats, do only ** those and return. */ - if (adapter->hw.mac.type == e1000_vfadapt) { + if (adapter->vf_ifp) { igb_update_vf_stats_counters(adapter); return; } @@ -5190,7 +5603,7 @@ igb_add_hw_stats(struct adapter *adapter) ** VF adapter has a very limited set of stats ** since its not managing the metal, so to speak. */ - if (adapter->hw.mac.type == e1000_vfadapt) { + if (adapter->vf_ifp) { SYSCTL_ADD_QUAD(ctx, stat_list, OID_AUTO, "good_pkts_recvd", CTLFLAG_RD, &stats->gprc, "Good Packets Received"); @@ -5507,7 +5920,7 @@ igb_print_nvm_info(struct adapter *adapter) } static void -igb_add_rx_process_limit(struct adapter *adapter, const char *name, +igb_set_sysctl_value(struct adapter *adapter, const char *name, const char *description, int *limit, int value) { *limit = value; @@ -5515,3 +5928,112 @@ igb_add_rx_process_limit(struct adapter *adapter, const char *name, SYSCTL_CHILDREN(device_get_sysctl_tree(adapter->dev)), OID_AUTO, name, CTLTYPE_INT|CTLFLAG_RW, limit, value, description); } + +/* +** Set flow control using sysctl: +** Flow control values: +** 0 - off +** 1 - rx pause +** 2 - tx pause +** 3 - full +*/ +static int +igb_set_flowcntl(SYSCTL_HANDLER_ARGS) +{ + int error; + static int input = 3; /* default is full */ + struct adapter *adapter = (struct adapter *) arg1; + + error = sysctl_handle_int(oidp, &input, 0, req); + + if ((error) || (req->newptr == NULL)) + return (error); + + switch (input) { + case e1000_fc_rx_pause: + case e1000_fc_tx_pause: + case e1000_fc_full: + case e1000_fc_none: + adapter->hw.fc.requested_mode = input; + adapter->fc = input; + break; + default: + /* Do nothing */ + return (error); + } + + adapter->hw.fc.current_mode = adapter->hw.fc.requested_mode; + e1000_force_mac_fc(&adapter->hw); + return (error); +} + +/* +** Manage DMA Coalesce: +** Control values: +** 0/1 - off/on +** Legal timer values are: +** 250,500,1000-10000 in thousands +*/ +static int +igb_sysctl_dmac(SYSCTL_HANDLER_ARGS) +{ + struct adapter *adapter = (struct adapter *) arg1; + int error; + + error = sysctl_handle_int(oidp, &adapter->dmac, 0, req); + + if ((error) || (req->newptr == NULL)) + return (error); + + switch (adapter->dmac) { + case 0: + /*Disabling */ + break; + case 1: /* Just enable and use default */ + adapter->dmac = 1000; + break; + case 250: + case 500: + case 1000: + case 2000: + case 3000: + case 4000: + case 5000: + case 6000: + case 7000: + case 8000: + case 9000: + case 10000: + /* Legal values - allow */ + break; + default: + /* Do nothing, illegal value */ + adapter->dmac = 0; + return (error); + } + /* Reinit the interface */ + igb_init(adapter); + return (error); +} + +/* +** Manage Energy Efficient Ethernet: +** Control values: +** 0/1 - enabled/disabled +*/ +static int +igb_sysctl_eee(SYSCTL_HANDLER_ARGS) +{ + struct adapter *adapter = (struct adapter *) arg1; + int error, value; + + value = adapter->hw.dev_spec._82575.eee_disable; + error = sysctl_handle_int(oidp, &value, 0, req); + if (error || req->newptr == NULL) + return (error); + IGB_CORE_LOCK(adapter); + adapter->hw.dev_spec._82575.eee_disable = (value != 0); + igb_init_locked(adapter); + IGB_CORE_UNLOCK(adapter); + return (0); +} |