diff options
Diffstat (limited to 'freebsd/sys/net/route.c')
-rw-r--r-- | freebsd/sys/net/route.c | 1519 |
1 files changed, 994 insertions, 525 deletions
diff --git a/freebsd/sys/net/route.c b/freebsd/sys/net/route.c index 781d8bb9..3eb05b94 100644 --- a/freebsd/sys/net/route.c +++ b/freebsd/sys/net/route.c @@ -45,7 +45,6 @@ #include <rtems/bsd/sys/param.h> #include <sys/systm.h> -#include <sys/syslog.h> #include <sys/malloc.h> #include <sys/mbuf.h> #include <sys/socket.h> @@ -57,8 +56,10 @@ #include <sys/kernel.h> #include <net/if.h> +#include <net/if_var.h> #include <net/if_dl.h> #include <net/route.h> +#include <net/route_var.h> #include <net/vnet.h> #include <net/flowtable.h> @@ -75,8 +76,7 @@ #include <sys/file.h> #endif /* __rtems__ */ -/* We use 4 bits in the mbuf flags, thus we are limited to 16 FIBS. */ -#define RT_MAXFIBS 16 +#define RT_MAXFIBS UINT16_MAX /* Kernel config default option. */ #ifdef ROUTETABLES @@ -102,17 +102,7 @@ extern void sctp_addr_change(struct ifaddr *ifa, int cmd); /* This is read-only.. */ u_int rt_numfibs = RT_NUMFIBS; -SYSCTL_UINT(_net, OID_AUTO, fibs, CTLFLAG_RD, &rt_numfibs, 0, ""); -/* - * Allow the boot code to allow LESS than RT_MAXFIBS to be used. - * We can't do more because storage is statically allocated for now. - * (for compatibility reasons.. this will change. When this changes, code should - * be refactored to protocol independent parts and protocol dependent parts, - * probably hanging of domain(9) specific storage to not need the full - * fib * af RNH allocation etc. but allow tuning the number of tables per - * address family). - */ -TUNABLE_INT("net.fibs", &rt_numfibs); +SYSCTL_UINT(_net, OID_AUTO, fibs, CTLFLAG_RDTUN, &rt_numfibs, 0, ""); /* * By default add routes to all fibs for new interfaces. @@ -124,25 +114,20 @@ TUNABLE_INT("net.fibs", &rt_numfibs); * always work given the fib can be overridden and prefixes can be added * from the network stack context. */ -u_int rt_add_addr_allfibs = 1; -SYSCTL_UINT(_net, OID_AUTO, add_addr_allfibs, CTLFLAG_RW, - &rt_add_addr_allfibs, 0, ""); -TUNABLE_INT("net.add_addr_allfibs", &rt_add_addr_allfibs); +VNET_DEFINE(u_int, rt_add_addr_allfibs) = 1; +SYSCTL_UINT(_net, OID_AUTO, add_addr_allfibs, CTLFLAG_RWTUN | CTLFLAG_VNET, + &VNET_NAME(rt_add_addr_allfibs), 0, ""); VNET_DEFINE(struct rtstat, rtstat); #define V_rtstat VNET(rtstat) -VNET_DEFINE(struct radix_node_head *, rt_tables); +VNET_DEFINE(struct rib_head *, rt_tables); #define V_rt_tables VNET(rt_tables) VNET_DEFINE(int, rttrash); /* routes not in table but not freed */ #define V_rttrash VNET(rttrash) -/* compare two sockaddr structures */ -#define sa_equal(a1, a2) (((a1)->sa_len == (a2)->sa_len) && \ - (bcmp((a1), (a2), (a1)->sa_len) == 0)) - /* * Convert a 'struct radix_node *' to a 'struct rtentry *'. * The operation can be done safely (in this code) because a @@ -158,6 +143,28 @@ VNET_DEFINE(int, rttrash); /* routes not in table but not freed */ static VNET_DEFINE(uma_zone_t, rtzone); /* Routing table UMA zone. */ #define V_rtzone VNET(rtzone) +static int rtrequest1_fib_change(struct rib_head *, struct rt_addrinfo *, + struct rtentry **, u_int); +static void rt_setmetrics(const struct rt_addrinfo *, struct rtentry *); +static int rt_ifdelroute(const struct rtentry *rt, void *arg); +static struct rtentry *rt_unlinkrte(struct rib_head *rnh, + struct rt_addrinfo *info, int *perror); +static void rt_notifydelete(struct rtentry *rt, struct rt_addrinfo *info); +#ifdef RADIX_MPATH +static struct radix_node *rt_mpath_unlink(struct rib_head *rnh, + struct rt_addrinfo *info, struct rtentry *rto, int *perror); +#endif +static int rt_exportinfo(struct rtentry *rt, struct rt_addrinfo *info, + int flags); + +struct if_mtuinfo +{ + struct ifnet *ifp; + int mtu; +}; + +static int if_updatemtu_cb(struct radix_node *, void *); + /* * handler for net.my_fibnum */ @@ -179,10 +186,10 @@ sysctl_my_fibnum(SYSCTL_HANDLER_ARGS) SYSCTL_PROC(_net, OID_AUTO, my_fibnum, CTLTYPE_INT|CTLFLAG_RD, NULL, 0, &sysctl_my_fibnum, "I", "default FIB of caller"); -static __inline struct radix_node_head ** +static __inline struct rib_head ** rt_tables_get_rnh_ptr(int table, int fam) { - struct radix_node_head **rnh; + struct rib_head **rnh; KASSERT(table >= 0 && table < rt_numfibs, ("%s: table out of bounds.", __func__)); @@ -190,20 +197,32 @@ rt_tables_get_rnh_ptr(int table, int fam) __func__)); /* rnh is [fib=0][af=0]. */ - rnh = (struct radix_node_head **)V_rt_tables; + rnh = (struct rib_head **)V_rt_tables; /* Get the offset to the requested table and fam. */ rnh += table * (AF_MAX+1) + fam; return (rnh); } -struct radix_node_head * +struct rib_head * rt_tables_get_rnh(int table, int fam) { return (*rt_tables_get_rnh_ptr(table, fam)); } +u_int +rt_tables_get_gen(int table, int fam) +{ + struct rib_head *rnh; + + rnh = *rt_tables_get_rnh_ptr(table, fam); + KASSERT(rnh != NULL, ("%s: NULL rib_head pointer table %d fam %d", + __func__, table, fam)); + return (rnh->rnh_gen); +} + + /* * route initialization must occur before ip6_init2(), which happenas at * SI_ORDER_MIDDLE. @@ -211,36 +230,72 @@ rt_tables_get_rnh(int table, int fam) static void route_init(void) { - struct domain *dom; - int max_keylen = 0; /* whack the tunable ints into line. */ if (rt_numfibs > RT_MAXFIBS) rt_numfibs = RT_MAXFIBS; if (rt_numfibs == 0) rt_numfibs = 1; +} +SYSINIT(route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, route_init, 0); - for (dom = domains; dom; dom = dom->dom_next) - if (dom->dom_maxrtkey > max_keylen) - max_keylen = dom->dom_maxrtkey; +static int +rtentry_zinit(void *mem, int size, int how) +{ + struct rtentry *rt = mem; + + rt->rt_pksent = counter_u64_alloc(how); + if (rt->rt_pksent == NULL) + return (ENOMEM); - rn_init(max_keylen); /* init all zeroes, all ones, mask table */ + RT_LOCK_INIT(rt); + + return (0); +} + +static void +rtentry_zfini(void *mem, int size) +{ + struct rtentry *rt = mem; + + RT_LOCK_DESTROY(rt); + counter_u64_free(rt->rt_pksent); +} + +static int +rtentry_ctor(void *mem, int size, void *arg, int how) +{ + struct rtentry *rt = mem; + + bzero(rt, offsetof(struct rtentry, rt_endzero)); + counter_u64_zero(rt->rt_pksent); + rt->rt_chain = NULL; + + return (0); +} + +static void +rtentry_dtor(void *mem, int size, void *arg) +{ + struct rtentry *rt = mem; + + RT_UNLOCK_COND(rt); } -SYSINIT(route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, route_init, 0); static void vnet_route_init(const void *unused __unused) { struct domain *dom; - struct radix_node_head **rnh; + struct rib_head **rnh; int table; int fam; V_rt_tables = malloc(rt_numfibs * (AF_MAX+1) * - sizeof(struct radix_node_head *), M_RTABLE, M_WAITOK|M_ZERO); + sizeof(struct rib_head *), M_RTABLE, M_WAITOK|M_ZERO); - V_rtzone = uma_zcreate("rtentry", sizeof(struct rtentry), NULL, NULL, - NULL, NULL, UMA_ALIGN_PTR, 0); + V_rtzone = uma_zcreate("rtentry", sizeof(struct rtentry), + rtentry_ctor, rtentry_dtor, + rtentry_zinit, rtentry_zfini, UMA_ALIGN_PTR, 0); for (dom = domains; dom; dom = dom->dom_next) { if (dom->dom_rtattach == NULL) continue; @@ -250,15 +305,10 @@ vnet_route_init(const void *unused __unused) if (table != 0 && fam != AF_INET6 && fam != AF_INET) break; - /* - * XXX MRT rtattach will be also called from - * vfs_export.c but the offset will be 0 (only for - * AF_INET and AF_INET6 which don't need it anyhow). - */ rnh = rt_tables_get_rnh_ptr(table, fam); if (rnh == NULL) panic("%s: rnh NULL", __func__); - dom->dom_rtattach((void **)rnh, dom->dom_rtoffset); + dom->dom_rtattach((void **)rnh, 0); } } } @@ -272,7 +322,7 @@ vnet_route_uninit(const void *unused __unused) int table; int fam; struct domain *dom; - struct radix_node_head **rnh; + struct rib_head **rnh; for (dom = domains; dom; dom = dom->dom_next) { if (dom->dom_rtdetach == NULL) @@ -287,14 +337,68 @@ vnet_route_uninit(const void *unused __unused) rnh = rt_tables_get_rnh_ptr(table, fam); if (rnh == NULL) panic("%s: rnh NULL", __func__); - dom->dom_rtdetach((void **)rnh, dom->dom_rtoffset); + dom->dom_rtdetach((void **)rnh, 0); } } + + free(V_rt_tables, M_RTABLE); + uma_zdestroy(V_rtzone); } -VNET_SYSUNINIT(vnet_route_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, +VNET_SYSUNINIT(vnet_route_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST, vnet_route_uninit, 0); #endif +struct rib_head * +rt_table_init(int offset) +{ + struct rib_head *rh; + + rh = malloc(sizeof(struct rib_head), M_RTABLE, M_WAITOK | M_ZERO); + + /* TODO: These details should be hidded inside radix.c */ + /* Init masks tree */ + rn_inithead_internal(&rh->head, rh->rnh_nodes, offset); + rn_inithead_internal(&rh->rmhead.head, rh->rmhead.mask_nodes, 0); + rh->head.rnh_masks = &rh->rmhead; + + /* Init locks */ + rw_init(&rh->rib_lock, "rib head lock"); + + /* Finally, set base callbacks */ + rh->rnh_addaddr = rn_addroute; + rh->rnh_deladdr = rn_delete; + rh->rnh_matchaddr = rn_match; + rh->rnh_lookup = rn_lookup; + rh->rnh_walktree = rn_walktree; + rh->rnh_walktree_from = rn_walktree_from; + + return (rh); +} + +static int +rt_freeentry(struct radix_node *rn, void *arg) +{ + struct radix_head * const rnh = arg; + struct radix_node *x; + + x = (struct radix_node *)rn_delete(rn + 2, NULL, rnh); + if (x != NULL) + R_Free(x); + return (0); +} + +void +rt_table_destroy(struct rib_head *rh) +{ + + rn_walktree(&rh->rmhead.head, rt_freeentry, &rh->rmhead.head); + + /* Assume table is already empty */ + rw_destroy(&rh->rib_lock); + free(rh, M_RTABLE); +} + + #ifndef _SYS_SYSPROTO_H_ struct setfib_args { int fibnum; @@ -335,35 +439,6 @@ setfib(int fibnum) * Packet routing routines. */ void -rtalloc(struct route *ro) -{ - - rtalloc_ign_fib(ro, 0UL, RT_DEFAULT_FIB); -} - -void -rtalloc_fib(struct route *ro, u_int fibnum) -{ - rtalloc_ign_fib(ro, 0UL, fibnum); -} - -void -rtalloc_ign(struct route *ro, u_long ignore) -{ - struct rtentry *rt; - - if ((rt = ro->ro_rt) != NULL) { - if (rt->rt_ifp != NULL && rt->rt_flags & RTF_UP) - return; - RTFREE(rt); - ro->ro_rt = NULL; - } - ro->ro_rt = rtalloc1_fib(&ro->ro_dst, 1, ignore, RT_DEFAULT_FIB); - if (ro->ro_rt) - RT_UNLOCK(ro->ro_rt); -} - -void rtalloc_ign_fib(struct route *ro, u_long ignore, u_int fibnum) { struct rtentry *rt; @@ -396,49 +471,32 @@ struct rtentry * rtalloc1_fib(struct sockaddr *dst, int report, u_long ignflags, u_int fibnum) { - struct radix_node_head *rnh; + struct rib_head *rh; struct radix_node *rn; struct rtentry *newrt; struct rt_addrinfo info; int err = 0, msgtype = RTM_MISS; - int needlock; KASSERT((fibnum < rt_numfibs), ("rtalloc1_fib: bad fibnum")); - switch (dst->sa_family) { - case AF_INET6: - case AF_INET: - /* We support multiple FIBs. */ - break; - default: - fibnum = RT_DEFAULT_FIB; - break; - } - rnh = rt_tables_get_rnh(fibnum, dst->sa_family); + rh = rt_tables_get_rnh(fibnum, dst->sa_family); newrt = NULL; - if (rnh == NULL) + if (rh == NULL) goto miss; /* * Look up the address in the table for that Address Family */ - needlock = !(ignflags & RTF_RNH_LOCKED); - if (needlock) - RADIX_NODE_HEAD_RLOCK(rnh); -#ifdef INVARIANTS - else - RADIX_NODE_HEAD_LOCK_ASSERT(rnh); -#endif - rn = rnh->rnh_matchaddr(dst, rnh); + RIB_RLOCK(rh); + rn = rh->rnh_matchaddr(dst, &rh->head); if (rn && ((rn->rn_flags & RNF_ROOT) == 0)) { newrt = RNTORT(rn); RT_LOCK(newrt); RT_ADDREF(newrt); - if (needlock) - RADIX_NODE_HEAD_RUNLOCK(rnh); - goto done; + RIB_RUNLOCK(rh); + return (newrt); - } else if (needlock) - RADIX_NODE_HEAD_RUNLOCK(rnh); + } else + RIB_RUNLOCK(rh); /* * Either we hit the root or couldn't find any match, @@ -457,10 +515,7 @@ miss: bzero(&info, sizeof(info)); info.rti_info[RTAX_DST] = dst; rt_missmsg_fib(msgtype, &info, 0, err, fibnum); - } -done: - if (newrt) - RT_LOCK_ASSERT(newrt); + } return (newrt); } @@ -471,7 +526,7 @@ done: void rtfree(struct rtentry *rt) { - struct radix_node_head *rnh; + struct rib_head *rnh; KASSERT(rt != NULL,("%s: NULL rt", __func__)); rnh = rt_tables_get_rnh(rt->rt_fibnum, rt_key(rt)->sa_family); @@ -499,7 +554,7 @@ rtfree(struct rtentry *rt) * on the entry so that the code below reclaims the storage. */ if (rt->rt_refcnt == 0 && rnh->rnh_close) - rnh->rnh_close((struct radix_node *)rt, rnh); + rnh->rnh_close((struct radix_node *)rt, &rnh->head); /* * If we are no longer "up" (and ref == 0) @@ -531,12 +586,11 @@ rtfree(struct rtentry *rt) * This also frees the gateway, as they are always malloc'd * together. */ - Free(rt_key(rt)); + R_Free(rt_key(rt)); /* * and the rtentry itself of course */ - RT_LOCK_DESTROY(rt); uma_zfree(V_rtzone, rt); return; } @@ -552,17 +606,6 @@ done: * message from the network layer. */ void -rtredirect(struct sockaddr *dst, - struct sockaddr *gateway, - struct sockaddr *netmask, - int flags, - struct sockaddr *src) -{ - - rtredirect_fib(dst, gateway, netmask, flags, src, RT_DEFAULT_FIB); -} - -void rtredirect_fib(struct sockaddr *dst, struct sockaddr *gateway, struct sockaddr *netmask, @@ -570,12 +613,12 @@ rtredirect_fib(struct sockaddr *dst, struct sockaddr *src, u_int fibnum) { - struct rtentry *rt, *rt0 = NULL; + struct rtentry *rt; int error = 0; short *stat = NULL; struct rt_addrinfo info; struct ifaddr *ifa; - struct radix_node_head *rnh; + struct rib_head *rnh; ifa = NULL; rnh = rt_tables_get_rnh(fibnum, dst->sa_family); @@ -585,7 +628,7 @@ rtredirect_fib(struct sockaddr *dst, } /* verify the gateway is directly reachable */ - if ((ifa = ifa_ifwithnet_fib(gateway, 0, fibnum)) == NULL) { + if ((ifa = ifa_ifwithnet(gateway, 0, fibnum)) == NULL) { error = ENETUNREACH; goto out; } @@ -596,13 +639,20 @@ rtredirect_fib(struct sockaddr *dst, * we have a routing loop, perhaps as a result of an interface * going down recently. */ - if (!(flags & RTF_DONE) && rt && - (!sa_equal(src, rt->rt_gateway) || rt->rt_ifa != ifa)) - error = EINVAL; - else if (ifa_ifwithaddr_check(gateway)) + if (!(flags & RTF_DONE) && rt) { + if (!sa_equal(src, rt->rt_gateway)) { + error = EINVAL; + goto done; + } + if (rt->rt_ifa != ifa && ifa->ifa_addr->sa_family != AF_LINK) { + error = EINVAL; + goto done; + } + } + if ((flags & RTF_GATEWAY) && ifa_ifwithaddr_check(gateway)) { error = EHOSTUNREACH; - if (error) goto done; + } /* * Create a new entry if we just got back a wildcard entry * or the lookup failed. This is necessary for hosts @@ -622,36 +672,31 @@ rtredirect_fib(struct sockaddr *dst, * Create new route, rather than smashing route to net. */ create: - rt0 = rt; - rt = NULL; + if (rt != NULL) + RTFREE_LOCKED(rt); - flags |= RTF_GATEWAY | RTF_DYNAMIC; + flags |= RTF_DYNAMIC; bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_DST] = dst; info.rti_info[RTAX_GATEWAY] = gateway; info.rti_info[RTAX_NETMASK] = netmask; info.rti_ifa = ifa; info.rti_flags = flags; - if (rt0 != NULL) - RT_UNLOCK(rt0); /* drop lock to avoid LOR with RNH */ error = rtrequest1_fib(RTM_ADD, &info, &rt, fibnum); if (rt != NULL) { RT_LOCK(rt); - if (rt0 != NULL) - EVENTHANDLER_INVOKE(route_redirect_event, rt0, rt, dst); flags = rt->rt_flags; } - if (rt0 != NULL) - RTFREE(rt0); stat = &V_rtstat.rts_dynamic; } else { - struct rtentry *gwrt; /* * Smash the current notion of the gateway to * this destination. Should check about netmask!!! */ + if ((flags & RTF_GATEWAY) == 0) + rt->rt_flags &= ~RTF_GATEWAY; rt->rt_flags |= RTF_MODIFIED; flags |= RTF_MODIFIED; stat = &V_rtstat.rts_newgateway; @@ -659,13 +704,10 @@ rtredirect_fib(struct sockaddr *dst, * add the key and gateway (in one malloc'd chunk). */ RT_UNLOCK(rt); - RADIX_NODE_HEAD_LOCK(rnh); + RIB_WLOCK(rnh); RT_LOCK(rt); rt_setgate(rt, rt_key(rt), gateway); - gwrt = rtalloc1(gateway, 1, RTF_RNH_LOCKED); - RADIX_NODE_HEAD_UNLOCK(rnh); - EVENTHANDLER_INVOKE(route_redirect_event, rt, gwrt, dst); - RTFREE_LOCKED(gwrt); + RIB_WUNLOCK(rnh); } } else error = EHOSTUNREACH; @@ -687,13 +729,6 @@ out: ifa_free(ifa); } -int -rtioctl(u_long req, caddr_t data) -{ - - return (rtioctl_fib(req, data, RT_DEFAULT_FIB)); -} - /* * Routing table ioctl interface. */ @@ -715,21 +750,11 @@ rtioctl_fib(u_long req, caddr_t data, u_int fibnum) #endif /* INET */ } -/* - * For both ifa_ifwithroute() routines, 'ifa' is returned referenced. - */ struct ifaddr * -ifa_ifwithroute(int flags, struct sockaddr *dst, struct sockaddr *gateway) -{ - - return (ifa_ifwithroute_fib(flags, dst, gateway, RT_DEFAULT_FIB)); -} - -struct ifaddr * -ifa_ifwithroute_fib(int flags, struct sockaddr *dst, struct sockaddr *gateway, +ifa_ifwithroute(int flags, const struct sockaddr *dst, struct sockaddr *gateway, u_int fibnum) { - register struct ifaddr *ifa; + struct ifaddr *ifa; int not_found = 0; if ((flags & RTF_GATEWAY) == 0) { @@ -742,7 +767,7 @@ ifa_ifwithroute_fib(int flags, struct sockaddr *dst, struct sockaddr *gateway, */ ifa = NULL; if (flags & RTF_HOST) - ifa = ifa_ifwithdstaddr_fib(dst, fibnum); + ifa = ifa_ifwithdstaddr(dst, fibnum); if (ifa == NULL) ifa = ifa_ifwithaddr(gateway); } else { @@ -751,12 +776,12 @@ ifa_ifwithroute_fib(int flags, struct sockaddr *dst, struct sockaddr *gateway, * or host, the gateway may still be on the * other end of a pt to pt link. */ - ifa = ifa_ifwithdstaddr_fib(gateway, fibnum); + ifa = ifa_ifwithdstaddr(gateway, fibnum); } if (ifa == NULL) - ifa = ifa_ifwithnet_fib(gateway, 0, fibnum); + ifa = ifa_ifwithnet(gateway, 0, fibnum); if (ifa == NULL) { - struct rtentry *rt = rtalloc1_fib(gateway, 0, RTF_RNH_LOCKED, fibnum); + struct rtentry *rt = rtalloc1_fib(gateway, 0, 0, fibnum); if (rt == NULL) return (NULL); /* @@ -800,19 +825,6 @@ ifa_ifwithroute_fib(int flags, struct sockaddr *dst, struct sockaddr *gateway, * all the bits of info needed */ int -rtrequest(int req, - struct sockaddr *dst, - struct sockaddr *gateway, - struct sockaddr *netmask, - int flags, - struct rtentry **ret_nrt) -{ - - return (rtrequest_fib(req, dst, gateway, netmask, flags, ret_nrt, - RT_DEFAULT_FIB)); -} - -int rtrequest_fib(int req, struct sockaddr *dst, struct sockaddr *gateway, @@ -834,6 +846,443 @@ rtrequest_fib(int req, return rtrequest1_fib(req, &info, ret_nrt, fibnum); } + +/* + * Copy most of @rt data into @info. + * + * If @flags contains NHR_COPY, copies dst,netmask and gw to the + * pointers specified by @info structure. Assume such pointers + * are zeroed sockaddr-like structures with sa_len field initialized + * to reflect size of the provided buffer. if no NHR_COPY is specified, + * point dst,netmask and gw @info fields to appropriate @rt values. + * + * if @flags contains NHR_REF, do refcouting on rt_ifp. + * + * Returns 0 on success. + */ +int +rt_exportinfo(struct rtentry *rt, struct rt_addrinfo *info, int flags) +{ + struct rt_metrics *rmx; + struct sockaddr *src, *dst; + int sa_len; + + if (flags & NHR_COPY) { + /* Copy destination if dst is non-zero */ + src = rt_key(rt); + dst = info->rti_info[RTAX_DST]; + sa_len = src->sa_len; + if (dst != NULL) { + if (src->sa_len > dst->sa_len) + return (ENOMEM); + memcpy(dst, src, src->sa_len); + info->rti_addrs |= RTA_DST; + } + + /* Copy mask if set && dst is non-zero */ + src = rt_mask(rt); + dst = info->rti_info[RTAX_NETMASK]; + if (src != NULL && dst != NULL) { + + /* + * Radix stores different value in sa_len, + * assume rt_mask() to have the same length + * as rt_key() + */ + if (sa_len > dst->sa_len) + return (ENOMEM); + memcpy(dst, src, src->sa_len); + info->rti_addrs |= RTA_NETMASK; + } + + /* Copy gateway is set && dst is non-zero */ + src = rt->rt_gateway; + dst = info->rti_info[RTAX_GATEWAY]; + if ((rt->rt_flags & RTF_GATEWAY) && src != NULL && dst != NULL){ + if (src->sa_len > dst->sa_len) + return (ENOMEM); + memcpy(dst, src, src->sa_len); + info->rti_addrs |= RTA_GATEWAY; + } + } else { + info->rti_info[RTAX_DST] = rt_key(rt); + info->rti_addrs |= RTA_DST; + if (rt_mask(rt) != NULL) { + info->rti_info[RTAX_NETMASK] = rt_mask(rt); + info->rti_addrs |= RTA_NETMASK; + } + if (rt->rt_flags & RTF_GATEWAY) { + info->rti_info[RTAX_GATEWAY] = rt->rt_gateway; + info->rti_addrs |= RTA_GATEWAY; + } + } + + rmx = info->rti_rmx; + if (rmx != NULL) { + info->rti_mflags |= RTV_MTU; + rmx->rmx_mtu = rt->rt_mtu; + } + + info->rti_flags = rt->rt_flags; + info->rti_ifp = rt->rt_ifp; + info->rti_ifa = rt->rt_ifa; + + if (flags & NHR_REF) { + /* Do 'traditional' refcouting */ + if_ref(info->rti_ifp); + } + + return (0); +} + +/* + * Lookups up route entry for @dst in RIB database for fib @fibnum. + * Exports entry data to @info using rt_exportinfo(). + * + * if @flags contains NHR_REF, refcouting is performed on rt_ifp. + * All references can be released later by calling rib_free_info() + * + * Returns 0 on success. + * Returns ENOENT for lookup failure, ENOMEM for export failure. + */ +int +rib_lookup_info(uint32_t fibnum, const struct sockaddr *dst, uint32_t flags, + uint32_t flowid, struct rt_addrinfo *info) +{ + struct rib_head *rh; + struct radix_node *rn; + struct rtentry *rt; + int error; + + KASSERT((fibnum < rt_numfibs), ("rib_lookup_rte: bad fibnum")); + rh = rt_tables_get_rnh(fibnum, dst->sa_family); + if (rh == NULL) + return (ENOENT); + + RIB_RLOCK(rh); + rn = rh->rnh_matchaddr(__DECONST(void *, dst), &rh->head); + if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { + rt = RNTORT(rn); + /* Ensure route & ifp is UP */ + if (RT_LINK_IS_UP(rt->rt_ifp)) { + flags = (flags & NHR_REF) | NHR_COPY; + error = rt_exportinfo(rt, info, flags); + RIB_RUNLOCK(rh); + + return (error); + } + } + RIB_RUNLOCK(rh); + + return (ENOENT); +} + +/* + * Releases all references acquired by rib_lookup_info() when + * called with NHR_REF flags. + */ +void +rib_free_info(struct rt_addrinfo *info) +{ + + if_rele(info->rti_ifp); +} + +/* + * Iterates over all existing fibs in system calling + * @setwa_f function prior to traversing each fib. + * Calls @wa_f function for each element in current fib. + * If af is not AF_UNSPEC, iterates over fibs in particular + * address family. + */ +void +rt_foreach_fib_walk(int af, rt_setwarg_t *setwa_f, rt_walktree_f_t *wa_f, + void *arg) +{ + struct rib_head *rnh; + uint32_t fibnum; + int i; + + for (fibnum = 0; fibnum < rt_numfibs; fibnum++) { + /* Do we want some specific family? */ + if (af != AF_UNSPEC) { + rnh = rt_tables_get_rnh(fibnum, af); + if (rnh == NULL) + continue; + if (setwa_f != NULL) + setwa_f(rnh, fibnum, af, arg); + + RIB_WLOCK(rnh); + rnh->rnh_walktree(&rnh->head, (walktree_f_t *)wa_f,arg); + RIB_WUNLOCK(rnh); + continue; + } + + for (i = 1; i <= AF_MAX; i++) { + rnh = rt_tables_get_rnh(fibnum, i); + if (rnh == NULL) + continue; + if (setwa_f != NULL) + setwa_f(rnh, fibnum, i, arg); + + RIB_WLOCK(rnh); + rnh->rnh_walktree(&rnh->head, (walktree_f_t *)wa_f,arg); + RIB_WUNLOCK(rnh); + } + } +} + +struct rt_delinfo +{ + struct rt_addrinfo info; + struct rib_head *rnh; + struct rtentry *head; +}; + +/* + * Conditionally unlinks @rn from radix tree based + * on info data passed in @arg. + */ +static int +rt_checkdelroute(struct radix_node *rn, void *arg) +{ + struct rt_delinfo *di; + struct rt_addrinfo *info; + struct rtentry *rt; + int error; + + di = (struct rt_delinfo *)arg; + rt = (struct rtentry *)rn; + info = &di->info; + error = 0; + + info->rti_info[RTAX_DST] = rt_key(rt); + info->rti_info[RTAX_NETMASK] = rt_mask(rt); + info->rti_info[RTAX_GATEWAY] = rt->rt_gateway; + + rt = rt_unlinkrte(di->rnh, info, &error); + if (rt == NULL) { + /* Either not allowed or not matched. Skip entry */ + return (0); + } + + /* Entry was unlinked. Add to the list and return */ + rt->rt_chain = di->head; + di->head = rt; + + return (0); +} + +/* + * Iterates over all existing fibs in system. + * Deletes each element for which @filter_f function returned + * non-zero value. + * If @af is not AF_UNSPEC, iterates over fibs in particular + * address family. + */ +void +rt_foreach_fib_walk_del(int af, rt_filter_f_t *filter_f, void *arg) +{ + struct rib_head *rnh; + struct rt_delinfo di; + struct rtentry *rt; + uint32_t fibnum; + int i, start, end; + + bzero(&di, sizeof(di)); + di.info.rti_filter = filter_f; + di.info.rti_filterdata = arg; + + for (fibnum = 0; fibnum < rt_numfibs; fibnum++) { + /* Do we want some specific family? */ + if (af != AF_UNSPEC) { + start = af; + end = af; + } else { + start = 1; + end = AF_MAX; + } + + for (i = start; i <= end; i++) { + rnh = rt_tables_get_rnh(fibnum, i); + if (rnh == NULL) + continue; + di.rnh = rnh; + + RIB_WLOCK(rnh); + rnh->rnh_walktree(&rnh->head, rt_checkdelroute, &di); + RIB_WUNLOCK(rnh); + + if (di.head == NULL) + continue; + + /* We might have something to reclaim */ + while (di.head != NULL) { + rt = di.head; + di.head = rt->rt_chain; + rt->rt_chain = NULL; + + /* TODO std rt -> rt_addrinfo export */ + di.info.rti_info[RTAX_DST] = rt_key(rt); + di.info.rti_info[RTAX_NETMASK] = rt_mask(rt); + + rt_notifydelete(rt, &di.info); + RTFREE_LOCKED(rt); + } + + } + } +} + +/* + * Delete Routes for a Network Interface + * + * Called for each routing entry via the rnh->rnh_walktree() call above + * to delete all route entries referencing a detaching network interface. + * + * Arguments: + * rt pointer to rtentry + * arg argument passed to rnh->rnh_walktree() - detaching interface + * + * Returns: + * 0 successful + * errno failed - reason indicated + */ +static int +rt_ifdelroute(const struct rtentry *rt, void *arg) +{ + struct ifnet *ifp = arg; + + if (rt->rt_ifp != ifp) + return (0); + + /* + * Protect (sorta) against walktree recursion problems + * with cloned routes + */ + if ((rt->rt_flags & RTF_UP) == 0) + return (0); + + return (1); +} + +/* + * Delete all remaining routes using this interface + * Unfortuneatly the only way to do this is to slog through + * the entire routing table looking for routes which point + * to this interface...oh well... + */ +void +rt_flushifroutes_af(struct ifnet *ifp, int af) +{ + KASSERT((af >= 1 && af <= AF_MAX), ("%s: af %d not >= 1 and <= %d", + __func__, af, AF_MAX)); + + rt_foreach_fib_walk_del(af, rt_ifdelroute, ifp); +} + +void +rt_flushifroutes(struct ifnet *ifp) +{ + + rt_foreach_fib_walk_del(AF_UNSPEC, rt_ifdelroute, ifp); +} + +/* + * Conditionally unlinks rtentry matching data inside @info from @rnh. + * Returns unlinked, locked and referenced @rtentry on success, + * Returns NULL and sets @perror to: + * ESRCH - if prefix was not found, + * EADDRINUSE - if trying to delete PINNED route without appropriate flag. + * ENOENT - if supplied filter function returned 0 (not matched). + */ +static struct rtentry * +rt_unlinkrte(struct rib_head *rnh, struct rt_addrinfo *info, int *perror) +{ + struct sockaddr *dst, *netmask; + struct rtentry *rt; + struct radix_node *rn; + + dst = info->rti_info[RTAX_DST]; + netmask = info->rti_info[RTAX_NETMASK]; + + rt = (struct rtentry *)rnh->rnh_lookup(dst, netmask, &rnh->head); + if (rt == NULL) { + *perror = ESRCH; + return (NULL); + } + + if ((info->rti_flags & RTF_PINNED) == 0) { + /* Check if target route can be deleted */ + if (rt->rt_flags & RTF_PINNED) { + *perror = EADDRINUSE; + return (NULL); + } + } + + if (info->rti_filter != NULL) { + if (info->rti_filter(rt, info->rti_filterdata) == 0) { + /* Not matched */ + *perror = ENOENT; + return (NULL); + } + + /* + * Filter function requested rte deletion. + * Ease the caller work by filling in remaining info + * from that particular entry. + */ + info->rti_info[RTAX_GATEWAY] = rt->rt_gateway; + } + + /* + * Remove the item from the tree and return it. + * Complain if it is not there and do no more processing. + */ + *perror = ESRCH; +#ifdef RADIX_MPATH + if (rt_mpath_capable(rnh)) + rn = rt_mpath_unlink(rnh, info, rt, perror); + else +#endif + rn = rnh->rnh_deladdr(dst, netmask, &rnh->head); + if (rn == NULL) + return (NULL); + + if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT)) + panic ("rtrequest delete"); + + rt = RNTORT(rn); + RT_LOCK(rt); + RT_ADDREF(rt); + rt->rt_flags &= ~RTF_UP; + + *perror = 0; + + return (rt); +} + +static void +rt_notifydelete(struct rtentry *rt, struct rt_addrinfo *info) +{ + struct ifaddr *ifa; + + /* + * give the protocol a chance to keep things in sync. + */ + ifa = rt->rt_ifa; + if (ifa != NULL && ifa->ifa_rtrequest != NULL) + ifa->ifa_rtrequest(RTM_DELETE, rt, info); + + /* + * One more rtentry floating around that is not + * linked to the routing table. rttrash will be decremented + * when RTFREE(rt) is eventually called. + */ + V_rttrash++; +} + + /* * These (questionable) definitions of apparent local variables apply * to the next two functions. XXXXXX!!! @@ -845,13 +1294,6 @@ rtrequest_fib(int req, #define ifpaddr info->rti_info[RTAX_IFP] #define flags info->rti_flags -int -rt_getifa(struct rt_addrinfo *info) -{ - - return (rt_getifa_fib(info, RT_DEFAULT_FIB)); -} - /* * Look up rt_addrinfo for a specific fib. Note that if rti_ifa is defined, * it will be referenced so the caller must free it. @@ -868,7 +1310,7 @@ rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum) */ if (info->rti_ifp == NULL && ifpaddr != NULL && ifpaddr->sa_family == AF_LINK && - (ifa = ifa_ifwithnet_fib(ifpaddr, 0, fibnum)) != NULL) { + (ifa = ifa_ifwithnet(ifpaddr, 0, fibnum)) != NULL) { info->rti_ifp = ifa->ifa_ifp; ifa_free(ifa); } @@ -882,10 +1324,10 @@ rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum) if (sa != NULL && info->rti_ifp != NULL) info->rti_ifa = ifaof_ifpforaddr(sa, info->rti_ifp); else if (dst != NULL && gateway != NULL) - info->rti_ifa = ifa_ifwithroute_fib(flags, dst, gateway, + info->rti_ifa = ifa_ifwithroute(flags, dst, gateway, fibnum); else if (sa != NULL) - info->rti_ifa = ifa_ifwithroute_fib(flags, sa, sa, + info->rti_ifa = ifa_ifwithroute(flags, sa, sa, fibnum); } if ((ifa = info->rti_ifa) != NULL) { @@ -896,94 +1338,70 @@ rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum) return (error); } -/* - * Expunges references to a route that's about to be reclaimed. - * The route must be locked. - */ -int -rtexpunge(struct rtentry *rt) +static int +if_updatemtu_cb(struct radix_node *rn, void *arg) { -#if !defined(RADIX_MPATH) - struct radix_node *rn; -#else - struct rt_addrinfo info; - int fib; - struct rtentry *rt0; -#endif - struct radix_node_head *rnh; - struct ifaddr *ifa; - int error = 0; - - /* - * Find the correct routing tree to use for this Address Family - */ - rnh = rt_tables_get_rnh(rt->rt_fibnum, rt_key(rt)->sa_family); - RT_LOCK_ASSERT(rt); - if (rnh == NULL) - return (EAFNOSUPPORT); - RADIX_NODE_HEAD_LOCK_ASSERT(rnh); + struct rtentry *rt; + struct if_mtuinfo *ifmtu; -#ifdef RADIX_MPATH - fib = rt->rt_fibnum; - bzero(&info, sizeof(info)); - info.rti_ifp = rt->rt_ifp; - info.rti_flags = RTF_RNH_LOCKED; - info.rti_info[RTAX_DST] = rt_key(rt); - info.rti_info[RTAX_GATEWAY] = rt->rt_ifa->ifa_addr; + rt = (struct rtentry *)rn; + ifmtu = (struct if_mtuinfo *)arg; - RT_UNLOCK(rt); - error = rtrequest1_fib(RTM_DELETE, &info, &rt0, fib); + if (rt->rt_ifp != ifmtu->ifp) + return (0); - if (error == 0 && rt0 != NULL) { - rt = rt0; - RT_LOCK(rt); - } else if (error != 0) { - RT_LOCK(rt); - return (error); + if (rt->rt_mtu >= ifmtu->mtu) { + /* We have to decrease mtu regardless of flags */ + rt->rt_mtu = ifmtu->mtu; + return (0); } -#else + /* - * Remove the item from the tree; it should be there, - * but when callers invoke us blindly it may not (sigh). + * New MTU is bigger. Check if are allowed to alter it */ - rn = rnh->rnh_deladdr(rt_key(rt), rt_mask(rt), rnh); - if (rn == NULL) { - error = ESRCH; - goto bad; + if ((rt->rt_flags & (RTF_FIXEDMTU | RTF_GATEWAY | RTF_HOST)) != 0) { + + /* + * Skip routes with user-supplied MTU and + * non-interface routes + */ + return (0); } - KASSERT((rn->rn_flags & (RNF_ACTIVE | RNF_ROOT)) == 0, - ("unexpected flags 0x%x", rn->rn_flags)); - KASSERT(rt == RNTORT(rn), - ("lookup mismatch, rt %p rn %p", rt, rn)); -#endif /* RADIX_MPATH */ - rt->rt_flags &= ~RTF_UP; + /* We are safe to update route MTU */ + rt->rt_mtu = ifmtu->mtu; - /* - * Give the protocol a chance to keep things in sync. - */ - if ((ifa = rt->rt_ifa) && ifa->ifa_rtrequest) { - struct rt_addrinfo info; + return (0); +} - bzero((caddr_t)&info, sizeof(info)); - info.rti_flags = rt->rt_flags; - info.rti_info[RTAX_DST] = rt_key(rt); - info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; - info.rti_info[RTAX_NETMASK] = rt_mask(rt); - ifa->ifa_rtrequest(RTM_DELETE, rt, &info); - } +void +rt_updatemtu(struct ifnet *ifp) +{ + struct if_mtuinfo ifmtu; + struct rib_head *rnh; + int i, j; + + ifmtu.ifp = ifp; /* - * one more rtentry floating around that is not - * linked to the routing table. + * Try to update rt_mtu for all routes using this interface + * Unfortunately the only way to do this is to traverse all + * routing tables in all fibs/domains. */ - V_rttrash++; -#if !defined(RADIX_MPATH) -bad: -#endif - return (error); + for (i = 1; i <= AF_MAX; i++) { + ifmtu.mtu = if_getmtu_family(ifp, i); + for (j = 0; j < rt_numfibs; j++) { + rnh = rt_tables_get_rnh(j, i); + if (rnh == NULL) + continue; + RIB_WLOCK(rnh); + rnh->rnh_walktree(&rnh->head, if_updatemtu_cb, &ifmtu); + RIB_WUNLOCK(rnh); + } + } } + #if 0 int p_sockaddr(char *buf, int buflen, struct sockaddr *s); int rt_print(char *buf, int buflen, struct rtentry *rt); @@ -1036,26 +1454,32 @@ rt_print(char *buf, int buflen, struct rtentry *rt) #endif #ifdef RADIX_MPATH -static int -rn_mpath_update(int req, struct rt_addrinfo *info, - struct radix_node_head *rnh, struct rtentry **ret_nrt) +/* + * Deletes key for single-path routes, unlinks rtentry with + * gateway specified in @info from multi-path routes. + * + * Returnes unlinked entry. In case of failure, returns NULL + * and sets @perror to ESRCH. + */ +static struct radix_node * +rt_mpath_unlink(struct rib_head *rnh, struct rt_addrinfo *info, + struct rtentry *rto, int *perror) { /* * if we got multipath routes, we require users to specify * a matching RTAX_GATEWAY. */ - struct rtentry *rt, *rto = NULL; - register struct radix_node *rn; - int error = 0; + struct rtentry *rt; // *rto = NULL; + struct radix_node *rn; + struct sockaddr *gw; - rn = rnh->rnh_lookup(dst, netmask, rnh); - if (rn == NULL) - return (ESRCH); - rto = rt = RNTORT(rn); + gw = info->rti_info[RTAX_GATEWAY]; + rt = rt_mpath_matchgate(rto, gw); + if (rt == NULL) { + *perror = ESRCH; + return (NULL); + } - rt = rt_mpath_matchgate(rt, gateway); - if (rt == NULL) - return (ESRCH); /* * this is the first entry in the chain */ @@ -1078,67 +1502,95 @@ rn_mpath_update(int req, struct rt_addrinfo *info, * check the case when there is only * one route in the chain. */ - if (gateway && - (rt->rt_gateway->sa_len != gateway->sa_len || - memcmp(rt->rt_gateway, gateway, gateway->sa_len))) - error = ESRCH; - else { - /* - * remove from tree before returning it - * to the caller - */ - rn = rnh->rnh_deladdr(dst, netmask, rnh); - KASSERT(rt == RNTORT(rn), ("radix node disappeared")); - goto gwdelete; + if (gw && + (rt->rt_gateway->sa_len != gw->sa_len || + memcmp(rt->rt_gateway, gw, gw->sa_len))) { + *perror = ESRCH; + return (NULL); } - } + /* * use the normal delete code to remove * the first entry */ - if (req != RTM_DELETE) - goto nondelete; - - error = ENOENT; - goto done; + rn = rnh->rnh_deladdr(dst, netmask, &rnh->head); + *perror = 0; + return (rn); } /* * if the entry is 2nd and on up */ - if ((req == RTM_DELETE) && !rt_mpath_deldup(rto, rt)) + if (rt_mpath_deldup(rto, rt) == 0) panic ("rtrequest1: rt_mpath_deldup"); -gwdelete: - RT_LOCK(rt); - RT_ADDREF(rt); - if (req == RTM_DELETE) { - rt->rt_flags &= ~RTF_UP; - /* - * One more rtentry floating around that is not - * linked to the routing table. rttrash will be decremented - * when RTFREE(rt) is eventually called. - */ - V_rttrash++; + *perror = 0; + rn = (struct radix_node *)rt; + return (rn); +} +#endif + +#ifdef FLOWTABLE +static struct rtentry * +rt_flowtable_check_route(struct rib_head *rnh, struct rt_addrinfo *info) +{ +#if defined(INET6) || defined(INET) + struct radix_node *rn; +#endif + struct rtentry *rt0; + + rt0 = NULL; + /* "flow-table" only supports IPv6 and IPv4 at the moment. */ + switch (dst->sa_family) { +#ifdef INET6 + case AF_INET6: +#endif +#ifdef INET + case AF_INET: +#endif +#if defined(INET6) || defined(INET) + rn = rnh->rnh_matchaddr(dst, &rnh->head); + if (rn && ((rn->rn_flags & RNF_ROOT) == 0)) { + struct sockaddr *mask; + u_char *m, *n; + int len; + + /* + * compare mask to see if the new route is + * more specific than the existing one + */ + rt0 = RNTORT(rn); + RT_LOCK(rt0); + RT_ADDREF(rt0); + RT_UNLOCK(rt0); + /* + * A host route is already present, so + * leave the flow-table entries as is. + */ + if (rt0->rt_flags & RTF_HOST) { + RTFREE(rt0); + rt0 = NULL; + } else if (!(flags & RTF_HOST) && netmask) { + mask = rt_mask(rt0); + len = mask->sa_len; + m = (u_char *)mask; + n = (u_char *)netmask; + while (len-- > 0) { + if (*n != *m) + break; + n++; + m++; + } + if (len == 0 || (*n < *m)) { + RTFREE(rt0); + rt0 = NULL; + } + } + } +#endif/* INET6 || INET */ } - -nondelete: - if (req != RTM_DELETE) - panic("unrecognized request %d", req); - - /* - * If the caller wants it, then it can have it, - * but it's up to it to free the rtentry as we won't be - * doing it. - */ - if (ret_nrt) { - *ret_nrt = rt; - RT_UNLOCK(rt); - } else - RTFREE_LOCKED(rt); -done: - return (error); + return (rt0); } #endif @@ -1146,19 +1598,19 @@ int rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt, u_int fibnum) { - int error = 0, needlock = 0; - register struct rtentry *rt; + int error = 0; + struct rtentry *rt, *rt_old; #ifdef FLOWTABLE - register struct rtentry *rt0; + struct rtentry *rt0; #endif - register struct radix_node *rn; - register struct radix_node_head *rnh; + struct radix_node *rn; + struct rib_head *rnh; struct ifaddr *ifa; struct sockaddr *ndst; struct sockaddr_storage mdst; -#define senderr(x) { error = x ; goto bad; } KASSERT((fibnum < rt_numfibs), ("rtrequest1_fib: bad fibnum")); + KASSERT((flags & RTF_RNH_LOCKED) == 0, ("rtrequest1_fib: locked")); switch (dst->sa_family) { case AF_INET6: case AF_INET: @@ -1175,12 +1627,7 @@ rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt, rnh = rt_tables_get_rnh(fibnum, dst->sa_family); if (rnh == NULL) return (EAFNOSUPPORT); - needlock = ((flags & RTF_RNH_LOCKED) == 0); - flags &= ~RTF_RNH_LOCKED; - if (needlock) - RADIX_NODE_HEAD_LOCK(rnh); - else - RADIX_NODE_HEAD_LOCK_ASSERT(rnh); + /* * If we are adding a host route then we don't want to put * a netmask in the tree, nor do we want to clone it. @@ -1194,52 +1641,14 @@ rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt, rt_maskedcopy(dst, (struct sockaddr *)&mdst, netmask); dst = (struct sockaddr *)&mdst; } -#ifdef RADIX_MPATH - if (rn_mpath_capable(rnh)) { - error = rn_mpath_update(req, info, rnh, ret_nrt); - /* - * "bad" holds true for the success case - * as well - */ - if (error != ENOENT) - goto bad; - error = 0; - } -#endif - if ((flags & RTF_PINNED) == 0) { - /* Check if target route can be deleted */ - rt = (struct rtentry *)rnh->rnh_lookup(dst, - netmask, rnh); - if ((rt != NULL) && (rt->rt_flags & RTF_PINNED)) - senderr(EADDRINUSE); - } - /* - * Remove the item from the tree and return it. - * Complain if it is not there and do no more processing. - */ - rn = rnh->rnh_deladdr(dst, netmask, rnh); - if (rn == NULL) - senderr(ESRCH); - if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT)) - panic ("rtrequest delete"); - rt = RNTORT(rn); - RT_LOCK(rt); - RT_ADDREF(rt); - rt->rt_flags &= ~RTF_UP; + RIB_WLOCK(rnh); + rt = rt_unlinkrte(rnh, info, &error); + RIB_WUNLOCK(rnh); + if (error != 0) + return (error); - /* - * give the protocol a chance to keep things in sync. - */ - if ((ifa = rt->rt_ifa) && ifa->ifa_rtrequest) - ifa->ifa_rtrequest(RTM_DELETE, rt, info); - - /* - * One more rtentry floating around that is not - * linked to the routing table. rttrash will be decremented - * when RTFREE(rt) is eventually called. - */ - V_rttrash++; + rt_notifydelete(rt, info); /* * If the caller wants it, then it can have it, @@ -1260,37 +1669,32 @@ rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt, break; case RTM_ADD: if ((flags & RTF_GATEWAY) && !gateway) - senderr(EINVAL); + return (EINVAL); if (dst && gateway && (dst->sa_family != gateway->sa_family) && (gateway->sa_family != AF_UNSPEC) && (gateway->sa_family != AF_LINK)) - senderr(EINVAL); + return (EINVAL); if (info->rti_ifa == NULL) { error = rt_getifa_fib(info, fibnum); if (error) - senderr(error); + return (error); } else ifa_ref(info->rti_ifa); ifa = info->rti_ifa; - rt = uma_zalloc(V_rtzone, M_NOWAIT | M_ZERO); + rt = uma_zalloc(V_rtzone, M_NOWAIT); if (rt == NULL) { - if (ifa != NULL) - ifa_free(ifa); - senderr(ENOBUFS); + ifa_free(ifa); + return (ENOBUFS); } - RT_LOCK_INIT(rt); rt->rt_flags = RTF_UP | flags; rt->rt_fibnum = fibnum; /* * Add the gateway. Possibly re-malloc-ing the storage for it. */ - RT_LOCK(rt); if ((error = rt_setgate(rt, dst, gateway)) != 0) { - RT_LOCK_DESTROY(rt); - if (ifa != NULL) - ifa_free(ifa); + ifa_free(ifa); uma_zfree(V_rtzone, rt); - senderr(error); + return (error); } /* @@ -1313,111 +1717,81 @@ rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt, */ rt->rt_ifa = ifa; rt->rt_ifp = ifa->ifa_ifp; - rt->rt_rmx.rmx_weight = 1; + rt->rt_weight = 1; + rt_setmetrics(info, rt); + + RIB_WLOCK(rnh); + RT_LOCK(rt); #ifdef RADIX_MPATH /* do not permit exactly the same dst/mask/gw pair */ - if (rn_mpath_capable(rnh) && + if (rt_mpath_capable(rnh) && rt_mpath_conflict(rnh, rt, netmask)) { - if (rt->rt_ifa) { - ifa_free(rt->rt_ifa); - } - Free(rt_key(rt)); - RT_LOCK_DESTROY(rt); + RIB_WUNLOCK(rnh); + + ifa_free(rt->rt_ifa); + R_Free(rt_key(rt)); uma_zfree(V_rtzone, rt); - senderr(EEXIST); + return (EEXIST); } #endif #ifdef FLOWTABLE - rt0 = NULL; - /* "flow-table" only supports IPv6 and IPv4 at the moment. */ - switch (dst->sa_family) { -#ifdef INET6 - case AF_INET6: -#endif -#ifdef INET - case AF_INET: -#endif -#if defined(INET6) || defined(INET) - rn = rnh->rnh_matchaddr(dst, rnh); - if (rn && ((rn->rn_flags & RNF_ROOT) == 0)) { - struct sockaddr *mask; - u_char *m, *n; - int len; - - /* - * compare mask to see if the new route is - * more specific than the existing one - */ - rt0 = RNTORT(rn); - RT_LOCK(rt0); - RT_ADDREF(rt0); - RT_UNLOCK(rt0); - /* - * A host route is already present, so - * leave the flow-table entries as is. - */ - if (rt0->rt_flags & RTF_HOST) { - RTFREE(rt0); - rt0 = NULL; - } else if (!(flags & RTF_HOST) && netmask) { - mask = rt_mask(rt0); - len = mask->sa_len; - m = (u_char *)mask; - n = (u_char *)netmask; - while (len-- > 0) { - if (*n != *m) - break; - n++; - m++; - } - if (len == 0 || (*n < *m)) { - RTFREE(rt0); - rt0 = NULL; - } - } - } -#endif/* INET6 || INET */ - } + rt0 = rt_flowtable_check_route(rnh, info); #endif /* FLOWTABLE */ /* XXX mtu manipulation will be done in rnh_addaddr -- itojun */ - rn = rnh->rnh_addaddr(ndst, netmask, rnh, rt->rt_nodes); + rn = rnh->rnh_addaddr(ndst, netmask, &rnh->head, rt->rt_nodes); + + rt_old = NULL; + if (rn == NULL && (info->rti_flags & RTF_PINNED) != 0) { + + /* + * Force removal and re-try addition + * TODO: better multipath&pinned support + */ + struct sockaddr *info_dst = info->rti_info[RTAX_DST]; + info->rti_info[RTAX_DST] = ndst; + /* Do not delete existing PINNED(interface) routes */ + info->rti_flags &= ~RTF_PINNED; + rt_old = rt_unlinkrte(rnh, info, &error); + info->rti_flags |= RTF_PINNED; + info->rti_info[RTAX_DST] = info_dst; + if (rt_old != NULL) + rn = rnh->rnh_addaddr(ndst, netmask, &rnh->head, + rt->rt_nodes); + } + RIB_WUNLOCK(rnh); + + if (rt_old != NULL) + RT_UNLOCK(rt_old); + /* * If it still failed to go into the tree, * then un-make it (this should be a function) */ if (rn == NULL) { - if (rt->rt_ifa) - ifa_free(rt->rt_ifa); - Free(rt_key(rt)); - RT_LOCK_DESTROY(rt); + ifa_free(rt->rt_ifa); + R_Free(rt_key(rt)); uma_zfree(V_rtzone, rt); #ifdef FLOWTABLE if (rt0 != NULL) RTFREE(rt0); #endif - senderr(EEXIST); + return (EEXIST); } #ifdef FLOWTABLE else if (rt0 != NULL) { - switch (dst->sa_family) { -#ifdef INET6 - case AF_INET6: - flowtable_route_flush(V_ip6_ft, rt0); - break; -#endif -#ifdef INET - case AF_INET: - flowtable_route_flush(V_ip_ft, rt0); - break; -#endif - } + flowtable_route_flush(dst->sa_family, rt0); RTFREE(rt0); } #endif + if (rt_old != NULL) { + rt_notifydelete(rt_old, info); + RTFREE(rt_old); + } + /* * If this protocol has something to add to this then * allow it to do that as well. @@ -1433,16 +1807,19 @@ rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt, *ret_nrt = rt; RT_ADDREF(rt); } + rnh->rnh_gen++; /* Routing table updated */ RT_UNLOCK(rt); break; + case RTM_CHANGE: + RIB_WLOCK(rnh); + error = rtrequest1_fib_change(rnh, info, ret_nrt, fibnum); + RIB_WUNLOCK(rnh); + break; default: error = EOPNOTSUPP; } -bad: - if (needlock) - RADIX_NODE_HEAD_UNLOCK(rnh); + return (error); -#undef senderr } #undef dst @@ -1452,20 +1829,147 @@ bad: #undef ifpaddr #undef flags +static int +rtrequest1_fib_change(struct rib_head *rnh, struct rt_addrinfo *info, + struct rtentry **ret_nrt, u_int fibnum) +{ + struct rtentry *rt = NULL; + int error = 0; + int free_ifa = 0; + int family, mtu; + struct if_mtuinfo ifmtu; + + rt = (struct rtentry *)rnh->rnh_lookup(info->rti_info[RTAX_DST], + info->rti_info[RTAX_NETMASK], &rnh->head); + + if (rt == NULL) + return (ESRCH); + +#ifdef RADIX_MPATH + /* + * If we got multipath routes, + * we require users to specify a matching RTAX_GATEWAY. + */ + if (rt_mpath_capable(rnh)) { + rt = rt_mpath_matchgate(rt, info->rti_info[RTAX_GATEWAY]); + if (rt == NULL) + return (ESRCH); + } +#endif + + RT_LOCK(rt); + + rt_setmetrics(info, rt); + + /* + * New gateway could require new ifaddr, ifp; + * flags may also be different; ifp may be specified + * by ll sockaddr when protocol address is ambiguous + */ + if (((rt->rt_flags & RTF_GATEWAY) && + info->rti_info[RTAX_GATEWAY] != NULL) || + info->rti_info[RTAX_IFP] != NULL || + (info->rti_info[RTAX_IFA] != NULL && + !sa_equal(info->rti_info[RTAX_IFA], rt->rt_ifa->ifa_addr))) { + + error = rt_getifa_fib(info, fibnum); + if (info->rti_ifa != NULL) + free_ifa = 1; + + if (error != 0) + goto bad; + } + + /* Check if outgoing interface has changed */ + if (info->rti_ifa != NULL && info->rti_ifa != rt->rt_ifa && + rt->rt_ifa != NULL && rt->rt_ifa->ifa_rtrequest != NULL) { + rt->rt_ifa->ifa_rtrequest(RTM_DELETE, rt, info); + ifa_free(rt->rt_ifa); + } + /* Update gateway address */ + if (info->rti_info[RTAX_GATEWAY] != NULL) { + error = rt_setgate(rt, rt_key(rt), info->rti_info[RTAX_GATEWAY]); + if (error != 0) + goto bad; + + rt->rt_flags &= ~RTF_GATEWAY; + rt->rt_flags |= (RTF_GATEWAY & info->rti_flags); + } + + if (info->rti_ifa != NULL && info->rti_ifa != rt->rt_ifa) { + ifa_ref(info->rti_ifa); + rt->rt_ifa = info->rti_ifa; + rt->rt_ifp = info->rti_ifp; + } + /* Allow some flags to be toggled on change. */ + rt->rt_flags &= ~RTF_FMASK; + rt->rt_flags |= info->rti_flags & RTF_FMASK; + + if (rt->rt_ifa && rt->rt_ifa->ifa_rtrequest != NULL) + rt->rt_ifa->ifa_rtrequest(RTM_ADD, rt, info); + + /* Alter route MTU if necessary */ + if (rt->rt_ifp != NULL) { + family = info->rti_info[RTAX_DST]->sa_family; + mtu = if_getmtu_family(rt->rt_ifp, family); + /* Set default MTU */ + if (rt->rt_mtu == 0) + rt->rt_mtu = mtu; + if (rt->rt_mtu != mtu) { + /* Check if we really need to update */ + ifmtu.ifp = rt->rt_ifp; + ifmtu.mtu = mtu; + if_updatemtu_cb(rt->rt_nodes, &ifmtu); + } + } + + if (ret_nrt) { + *ret_nrt = rt; + RT_ADDREF(rt); + } +bad: + RT_UNLOCK(rt); + if (free_ifa != 0) + ifa_free(info->rti_ifa); + return (error); +} + +static void +rt_setmetrics(const struct rt_addrinfo *info, struct rtentry *rt) +{ + + if (info->rti_mflags & RTV_MTU) { + if (info->rti_rmx->rmx_mtu != 0) { + + /* + * MTU was explicitly provided by user. + * Keep it. + */ + rt->rt_flags |= RTF_FIXEDMTU; + } else { + + /* + * User explicitly sets MTU to 0. + * Assume rollback to default. + */ + rt->rt_flags &= ~RTF_FIXEDMTU; + } + rt->rt_mtu = info->rti_rmx->rmx_mtu; + } + if (info->rti_mflags & RTV_WEIGHT) + rt->rt_weight = info->rti_rmx->rmx_weight; + /* Kernel -> userland timebase conversion. */ + if (info->rti_mflags & RTV_EXPIRE) + rt->rt_expire = info->rti_rmx->rmx_expire ? + info->rti_rmx->rmx_expire - time_second + time_uptime : 0; +} + int rt_setgate(struct rtentry *rt, struct sockaddr *dst, struct sockaddr *gate) { /* XXX dst may be overwritten, can we move this to below */ int dlen = SA_SIZE(dst), glen = SA_SIZE(gate); -#ifdef INVARIANTS - struct radix_node_head *rnh; - rnh = rt_tables_get_rnh(rt->rt_fibnum, dst->sa_family); -#endif - - RT_LOCK_ASSERT(rt); - RADIX_NODE_HEAD_LOCK_ASSERT(rnh); - /* * Prepare to store the gateway in rt->rt_gateway. * Both dst and gateway are stored one after the other in the same @@ -1487,7 +1991,7 @@ rt_setgate(struct rtentry *rt, struct sockaddr *dst, struct sockaddr *gate) * Free()/free() handle a NULL argument just fine. */ bcopy(dst, new, dlen); - Free(rt_key(rt)); /* free old block, if any */ + R_Free(rt_key(rt)); /* free old block, if any */ rt_key(rt) = (struct sockaddr *)new; rt->rt_gateway = (struct sockaddr *)(new + dlen); } @@ -1503,9 +2007,9 @@ rt_setgate(struct rtentry *rt, struct sockaddr *dst, struct sockaddr *gate) void rt_maskedcopy(struct sockaddr *src, struct sockaddr *dst, struct sockaddr *netmask) { - register u_char *cp1 = (u_char *)src; - register u_char *cp2 = (u_char *)dst; - register u_char *cp3 = (u_char *)netmask; + u_char *cp1 = (u_char *)src; + u_char *cp2 = (u_char *)dst; + u_char *cp3 = (u_char *)netmask; u_char *cplim = cp2 + *cp3; u_char *cplim2 = cp2 + *cp1; @@ -1537,7 +2041,7 @@ rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum) int didwork = 0; int a_failure = 0; static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK}; - struct radix_node_head *rnh; + struct rib_head *rnh; if (flags & RTF_HOST) { dst = ifa->ifa_dstaddr; @@ -1558,13 +2062,13 @@ rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum) break; } if (fibnum == RT_ALL_FIBS) { - if (rt_add_addr_allfibs == 0 && cmd == (int)RTM_ADD) { + if (V_rt_add_addr_allfibs == 0 && cmd == (int)RTM_ADD) #ifndef __rtems__ startfib = endfib = ifa->ifa_ifp->if_fib; #else /* __rtems__ */ startfib = endfib = BSD_DEFAULT_FIB; #endif /* __rtems__ */ - } else { + else { startfib = 0; endfib = rt_numfibs - 1; } @@ -1609,10 +2113,10 @@ rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum) if (rnh == NULL) /* this table doesn't exist but others might */ continue; - RADIX_NODE_HEAD_RLOCK(rnh); - rn = rnh->rnh_lookup(dst, netmask, rnh); + RIB_RLOCK(rnh); + rn = rnh->rnh_lookup(dst, netmask, &rnh->head); #ifdef RADIX_MPATH - if (rn_mpath_capable(rnh)) { + if (rt_mpath_capable(rnh)) { if (rn == NULL) error = ESRCH; @@ -1635,7 +2139,7 @@ rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum) error = (rn == NULL || (rn->rn_flags & RNF_ROOT) || RNTORT(rn)->rt_ifa != ifa); - RADIX_NODE_HEAD_RUNLOCK(rnh); + RIB_RUNLOCK(rnh); if (error) { /* this is only an error if bad on ALL tables */ continue; @@ -1660,32 +2164,6 @@ rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum) info.rti_info[RTAX_NETMASK] = netmask; error = rtrequest1_fib(cmd, &info, &rt, fibnum); - if ((error == EEXIST) && (cmd == RTM_ADD)) { - /* - * Interface route addition failed. - * Atomically delete current prefix generating - * RTM_DELETE message, and retry adding - * interface prefix. - */ - rnh = rt_tables_get_rnh(fibnum, dst->sa_family); - RADIX_NODE_HEAD_LOCK(rnh); - - /* Delete old prefix */ - info.rti_ifa = NULL; - info.rti_flags = RTF_RNH_LOCKED; - - error = rtrequest1_fib(RTM_DELETE, &info, NULL, fibnum); - if (error == 0) { - info.rti_ifa = ifa; - info.rti_flags = flags | RTF_RNH_LOCKED | - (ifa->ifa_flags & ~IFA_RTSELF) | RTF_PINNED; - error = rtrequest1_fib(cmd, &info, &rt, fibnum); - } - - RADIX_NODE_HEAD_UNLOCK(rnh); - } - - if (error == 0 && rt != NULL) { /* * notify any listening routing agents of the change @@ -1760,15 +2238,6 @@ rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum) return (error); } -#ifndef BURN_BRIDGES -/* special one for inet internal use. may not use. */ -int -rtinit_fib(struct ifaddr *ifa, int cmd, int flags) -{ - return (rtinit1(ifa, cmd, flags, RT_ALL_FIBS)); -} -#endif - /* * Set up a routing table entry, normally * for an interface. |