diff options
Diffstat (limited to 'freebsd/sys/netinet/tcp_subr.c')
-rw-r--r-- | freebsd/sys/netinet/tcp_subr.c | 339 |
1 files changed, 289 insertions, 50 deletions
diff --git a/freebsd/sys/netinet/tcp_subr.c b/freebsd/sys/netinet/tcp_subr.c index 1b19aecb..787213b0 100644 --- a/freebsd/sys/netinet/tcp_subr.c +++ b/freebsd/sys/netinet/tcp_subr.c @@ -40,7 +40,6 @@ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); -#include <rtems/bsd/local/opt_compat.h> #include <rtems/bsd/local/opt_inet.h> #include <rtems/bsd/local/opt_inet6.h> #include <rtems/bsd/local/opt_ipsec.h> @@ -106,6 +105,7 @@ __FBSDID("$FreeBSD$"); #include <netinet/tcp_var.h> #include <netinet/tcp_log_buf.h> #include <netinet/tcp_syncache.h> +#include <netinet/tcp_hpts.h> #include <netinet/cc/cc.h> #ifdef INET6 #include <netinet6/tcp6_var.h> @@ -239,6 +239,9 @@ VNET_DEFINE(uma_zone_t, sack_hole_zone); VNET_DEFINE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST+1]); #endif +static int tcp_default_fb_init(struct tcpcb *tp); +static void tcp_default_fb_fini(struct tcpcb *tp, int tcb_is_purged); +static int tcp_default_handoff_ok(struct tcpcb *tp); static struct inpcb *tcp_notify(struct inpcb *, int); static struct inpcb *tcp_mtudisc_notify(struct inpcb *, int); static void tcp_mtudisc(struct inpcb *, int); @@ -247,21 +250,17 @@ static char * tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, static struct tcp_function_block tcp_def_funcblk = { - "default", - tcp_output, - tcp_do_segment, - tcp_default_ctloutput, - NULL, - NULL, - NULL, - NULL, - NULL, - NULL, - 0, - 0 + .tfb_tcp_block_name = "freebsd", + .tfb_tcp_output = tcp_output, + .tfb_tcp_do_segment = tcp_do_segment, + .tfb_tcp_ctloutput = tcp_default_ctloutput, + .tfb_tcp_handoff_ok = tcp_default_handoff_ok, + .tfb_tcp_fb_init = tcp_default_fb_init, + .tfb_tcp_fb_fini = tcp_default_fb_fini, }; int t_functions_inited = 0; +static int tcp_fb_cnt = 0; struct tcp_funchead t_functions; static struct tcp_function_block *tcp_func_set_ptr = &tcp_def_funcblk; @@ -334,6 +333,88 @@ find_and_ref_tcp_fb(struct tcp_function_block *blk) return(rblk); } +static struct tcp_function_block * +find_and_ref_tcp_default_fb(void) +{ + 
struct tcp_function_block *rblk; + + rw_rlock(&tcp_function_lock); + rblk = tcp_func_set_ptr; + refcount_acquire(&rblk->tfb_refcnt); + rw_runlock(&tcp_function_lock); + return (rblk); +} + +void +tcp_switch_back_to_default(struct tcpcb *tp) +{ + struct tcp_function_block *tfb; + + KASSERT(tp->t_fb != &tcp_def_funcblk, + ("%s: called by the built-in default stack", __func__)); + + /* + * Release the old stack. This function will either find a new one + * or panic. + */ + if (tp->t_fb->tfb_tcp_fb_fini != NULL) + (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0); + refcount_release(&tp->t_fb->tfb_refcnt); + + /* + * Now, we'll find a new function block to use. + * Start by trying the current user-selected + * default, unless this stack is the user-selected + * default. + */ + tfb = find_and_ref_tcp_default_fb(); + if (tfb == tp->t_fb) { + refcount_release(&tfb->tfb_refcnt); + tfb = NULL; + } + /* Does the stack accept this connection? */ + if (tfb != NULL && tfb->tfb_tcp_handoff_ok != NULL && + (*tfb->tfb_tcp_handoff_ok)(tp)) { + refcount_release(&tfb->tfb_refcnt); + tfb = NULL; + } + /* Try to use that stack. */ + if (tfb != NULL) { + /* Initialize the new stack. If it succeeds, we are done. */ + tp->t_fb = tfb; + if (tp->t_fb->tfb_tcp_fb_init == NULL || + (*tp->t_fb->tfb_tcp_fb_init)(tp) == 0) + return; + + /* + * Initialization failed. Release the reference count on + * the stack. + */ + refcount_release(&tfb->tfb_refcnt); + } + + /* + * If that wasn't feasible, use the built-in default + * stack which is not allowed to reject anyone. 
+ */ + tfb = find_and_ref_tcp_fb(&tcp_def_funcblk); + if (tfb == NULL) { + /* there always should be a default */ + panic("Can't refer to tcp_def_funcblk"); + } + if (tfb->tfb_tcp_handoff_ok != NULL) { + if ((*tfb->tfb_tcp_handoff_ok) (tp)) { + /* The default stack cannot say no */ + panic("Default stack rejects a new session?"); + } + } + tp->t_fb = tfb; + if (tp->t_fb->tfb_tcp_fb_init != NULL && + (*tp->t_fb->tfb_tcp_fb_init)(tp)) { + /* The default stack cannot fail */ + panic("Default stack initialization failed"); + } +} static int sysctl_net_inet_default_tcp_functions(SYSCTL_HANDLER_ARGS) @@ -433,14 +514,14 @@ SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_available, "list available TCP Function sets"); /* - * Exports one (struct tcp_function_id) for each non-alias. + * Exports one (struct tcp_function_info) for each alias/name. */ static int -sysctl_net_inet_list_func_ids(SYSCTL_HANDLER_ARGS) +sysctl_net_inet_list_func_info(SYSCTL_HANDLER_ARGS) { - int error, cnt; + int cnt, error; struct tcp_function *f; - struct tcp_function_id tfi; + struct tcp_function_info tfi; /* * We don't allow writes. @@ -459,20 +540,31 @@ sysctl_net_inet_list_func_ids(SYSCTL_HANDLER_ARGS) } /* - * Walk the list, comparing the name of the function entry and - * function block to determine which is an alias. - * If exporting the list, copy out matching entries. Otherwise, - * just record the total length. + * Walk the list and copy out matching entries. If INVARIANTS + * is compiled in, also walk the list to verify the length of + * the list matches what we have recorded. 
*/ - cnt = 0; rw_rlock(&tcp_function_lock); + + cnt = 0; +#ifndef INVARIANTS + if (req->oldptr == NULL) { + cnt = tcp_fb_cnt; + goto skip_loop; + } +#endif TAILQ_FOREACH(f, &t_functions, tf_next) { - if (strncmp(f->tf_name, f->tf_fb->tfb_tcp_block_name, - TCP_FUNCTION_NAME_LEN_MAX)) - continue; +#ifdef INVARIANTS + cnt++; +#endif if (req->oldptr != NULL) { + tfi.tfi_refcnt = f->tf_fb->tfb_refcnt; tfi.tfi_id = f->tf_fb->tfb_id; - (void)strncpy(tfi.tfi_name, f->tf_name, + (void)strncpy(tfi.tfi_alias, f->tf_name, + TCP_FUNCTION_NAME_LEN_MAX); + tfi.tfi_alias[TCP_FUNCTION_NAME_LEN_MAX - 1] = '\0'; + (void)strncpy(tfi.tfi_name, + f->tf_fb->tfb_tcp_block_name, TCP_FUNCTION_NAME_LEN_MAX); tfi.tfi_name[TCP_FUNCTION_NAME_LEN_MAX - 1] = '\0'; error = SYSCTL_OUT(req, &tfi, sizeof(tfi)); @@ -481,23 +573,110 @@ sysctl_net_inet_list_func_ids(SYSCTL_HANDLER_ARGS) * mechanism we use to accumulate length * information if the buffer was too short. */ - } else - cnt++; + } } + KASSERT(cnt == tcp_fb_cnt, + ("%s: cnt (%d) != tcp_fb_cnt (%d)", __func__, cnt, tcp_fb_cnt)); +#ifndef INVARIANTS +skip_loop: +#endif rw_runlock(&tcp_function_lock); if (req->oldptr == NULL) error = SYSCTL_OUT(req, NULL, - (cnt + 1) * sizeof(struct tcp_function_id)); + (cnt + 1) * sizeof(struct tcp_function_info)); return (error); } -SYSCTL_PROC(_net_inet_tcp, OID_AUTO, function_ids, +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, function_info, CTLTYPE_OPAQUE | CTLFLAG_SKIP | CTLFLAG_RD | CTLFLAG_MPSAFE, - NULL, 0, sysctl_net_inet_list_func_ids, "S,tcp_function_id", + NULL, 0, sysctl_net_inet_list_func_info, "S,tcp_function_info", "List TCP function block name-to-ID mappings"); /* + * tfb_tcp_handoff_ok() function for the default stack. + * Note that we'll basically try to take all comers. + */ +static int +tcp_default_handoff_ok(struct tcpcb *tp) +{ + + return (0); +} + +/* + * tfb_tcp_fb_init() function for the default stack. 
+ * + * This handles making sure we have appropriate timers set if you are + * transitioning a socket that has some amount of setup done. + * + * The init() function from the default can *never* return non-zero i.e. + * it is required to always succeed since it is the stack of last resort! + */ +static int +tcp_default_fb_init(struct tcpcb *tp) +{ + + struct socket *so; + + INP_WLOCK_ASSERT(tp->t_inpcb); + + KASSERT(tp->t_state >= 0 && tp->t_state < TCPS_TIME_WAIT, + ("%s: connection %p in unexpected state %d", __func__, tp, + tp->t_state)); + + /* + * Nothing to do for CLOSED or LISTEN states. And, we don't + * know what to do for unexpected states (which includes TIME_WAIT). + */ + if (tp->t_state <= TCPS_LISTEN || tp->t_state >= TCPS_TIME_WAIT) + return (0); + + /* + * Make sure some kind of transmission timer is set if there is + * outstanding data. + */ + so = tp->t_inpcb->inp_socket; + if ((!TCPS_HAVEESTABLISHED(tp->t_state) || sbavail(&so->so_snd) || + tp->snd_una != tp->snd_max) && !(tcp_timer_active(tp, TT_REXMT) || + tcp_timer_active(tp, TT_PERSIST))) { + /* + * If the session has established and it looks like it should + * be in the persist state, set the persist timer. Otherwise, + * set the retransmit timer. + */ + if (TCPS_HAVEESTABLISHED(tp->t_state) && tp->snd_wnd == 0 && + (int32_t)(tp->snd_nxt - tp->snd_una) < + (int32_t)sbavail(&so->so_snd)) + tcp_setpersist(tp); + else + tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); + } + + /* All non-embryonic sessions get a keepalive timer. */ + if (!tcp_timer_active(tp, TT_KEEP)) + tcp_timer_activate(tp, TT_KEEP, + TCPS_HAVEESTABLISHED(tp->t_state) ? TP_KEEPIDLE(tp) : + TP_KEEPINIT(tp)); + + return (0); +} + +/* + * tfb_tcp_fb_fini() function for the default stack. + * + * This changes state as necessary (or prudent) to prepare for another stack + * to assume responsibility for the connection. 
+ */ +static void +tcp_default_fb_fini(struct tcpcb *tp, int tcb_is_purged) +{ + + INP_WLOCK_ASSERT(tp->t_inpcb); + return; +} + +/* * Target size of TCP PCB hash tables. Must be a power of two. * * Note that this can be overridden by the kernel environment @@ -660,6 +839,7 @@ register_tcp_functions_as_names(struct tcp_function_block *blk, int wait, (void)strncpy(n->tf_name, names[i], TCP_FUNCTION_NAME_LEN_MAX); n->tf_name[TCP_FUNCTION_NAME_LEN_MAX - 1] = '\0'; TAILQ_INSERT_TAIL(&t_functions, n, tf_next); + tcp_fb_cnt++; rw_wunlock(&tcp_function_lock); } return(0); @@ -676,6 +856,7 @@ cleanup: if (!strncmp(n->tf_name, names[i], TCP_FUNCTION_NAME_LEN_MAX)) { TAILQ_REMOVE(&t_functions, n, tf_next); + tcp_fb_cnt--; n->tf_fb = NULL; free(n, M_TCPFUNCTIONS); break; @@ -721,11 +902,28 @@ register_tcp_functions(struct tcp_function_block *blk, int wait) return (register_tcp_functions_as_name(blk, NULL, wait)); } +/* + * Deregister all names associated with a function block. This + * functionally removes the function block from use within the system. + * + * When called with a true quiesce argument, mark the function block + * as being removed so no more stacks will use it and determine + * whether the removal would succeed. + * + * When called with a false quiesce argument, actually attempt the + * removal. + * + * When called with a force argument, attempt to switch all TCBs to + * use the default stack instead of returning EBUSY. + * + * Returns 0 on success (or if the removal would succeed), or an error + * code on failure. 
+ */ int -deregister_tcp_functions(struct tcp_function_block *blk) +deregister_tcp_functions(struct tcp_function_block *blk, bool quiesce, + bool force) { struct tcp_function *f; - int error=ENOENT; if (strcmp(blk->tfb_tcp_block_name, "default") == 0) { /* You can't un-register the default */ @@ -737,21 +935,64 @@ deregister_tcp_functions(struct tcp_function_block *blk) rw_wunlock(&tcp_function_lock); return (EBUSY); } + /* Mark the block so no more stacks can use it. */ + blk->tfb_flags |= TCP_FUNC_BEING_REMOVED; + /* + * If TCBs are still attached to the stack, attempt to switch them + * to the default stack. + */ + if (force && blk->tfb_refcnt) { + struct inpcb *inp; + struct tcpcb *tp; + VNET_ITERATOR_DECL(vnet_iter); + + rw_wunlock(&tcp_function_lock); + + VNET_LIST_RLOCK(); + /* XXX handle */ + VNET_FOREACH(vnet_iter) { + CURVNET_SET(vnet_iter); + INP_INFO_WLOCK(&V_tcbinfo); + LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) { + INP_WLOCK(inp); + if (inp->inp_flags & INP_TIMEWAIT) { + INP_WUNLOCK(inp); + continue; + } + tp = intotcpcb(inp); + if (tp == NULL || tp->t_fb != blk) { + INP_WUNLOCK(inp); + continue; + } + tcp_switch_back_to_default(tp); + INP_WUNLOCK(inp); + } + INP_INFO_WUNLOCK(&V_tcbinfo); + CURVNET_RESTORE(); + } + VNET_LIST_RUNLOCK(); + + rw_wlock(&tcp_function_lock); + } if (blk->tfb_refcnt) { - /* Still tcb attached, mark it. */ - blk->tfb_flags |= TCP_FUNC_BEING_REMOVED; - rw_wunlock(&tcp_function_lock); + /* TCBs still attached. */ + rw_wunlock(&tcp_function_lock); return (EBUSY); } + if (quiesce) { + /* Skip removal. */ + rw_wunlock(&tcp_function_lock); + return (0); + } + /* Remove any function names that map to this function block. 
*/ while (find_tcp_fb_locked(blk, &f) != NULL) { - /* Found */ TAILQ_REMOVE(&t_functions, f, tf_next); + tcp_fb_cnt--; f->tf_fb = NULL; free(f, M_TCPFUNCTIONS); - error = 0; } rw_wunlock(&tcp_function_lock); - return (error); + return (0); } void @@ -1498,6 +1739,7 @@ tcp_ccalgounload(struct cc_algo *unload_algo) tmpalgo = CC_ALGO(tp); /* NewReno does not require any init. */ CC_ALGO(tp) = &newreno_cc_algo; + /* XXX defer to epoch_call */ if (tmpalgo->cb_destroy != NULL) tmpalgo->cb_destroy(tp->ccv); } @@ -1545,7 +1787,7 @@ tcp_discardcb(struct tcpcb *tp) #ifdef INET6 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ - int released; + int released __unused; INP_WLOCK_ASSERT(inp); @@ -1868,6 +2110,7 @@ static int tcp_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, m, n, pcb_count; + struct in_pcblist *il; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; @@ -1914,7 +2157,8 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS) if (error) return (error); - inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); + il = malloc(sizeof(struct in_pcblist) + n * sizeof(struct inpcb *), M_TEMP, M_WAITOK|M_ZERO_INVARIANTS); + inp_list = il->il_inp_list; INP_INFO_WLOCK(&V_tcbinfo); for (inp = LIST_FIRST(V_tcbinfo.ipi_listhead), i = 0; @@ -1957,14 +2201,10 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS) } else INP_RUNLOCK(inp); } - INP_INFO_RLOCK(&V_tcbinfo); - for (i = 0; i < n; i++) { - inp = inp_list[i]; - INP_RLOCK(inp); - if (!in_pcbrele_rlocked(inp)) - INP_RUNLOCK(inp); - } - INP_INFO_RUNLOCK(&V_tcbinfo); + + il->il_count = n; + il->il_pcbinfo = &V_tcbinfo; + epoch_call(net_epoch_preempt, &il->il_epoch_ctx, in_pcblist_rele_rlocked); if (!error) { /* @@ -1981,7 +2221,6 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS) INP_LIST_RUNLOCK(&V_tcbinfo); error = SYSCTL_OUT(req, &xig, sizeof xig); } - free(inp_list, M_TEMP); return (error); } |