diff --git a/sys/kern/kern_softint.c b/sys/kern/kern_softint.c index 782540f..75f67d6 100644 --- a/sys/kern/kern_softint.c +++ b/sys/kern/kern_softint.c @@ -442,8 +442,8 @@ softint_disestablish(void *arg) KASSERT(sh->sh_func != NULL); flags |= sh->sh_flags; } - /* Neither pending nor active on all CPUs? */ - if ((flags & (SOFTINT_PENDING | SOFTINT_ACTIVE)) == 0) { + /* Inactive on all CPUs? */ + if ((flags & SOFTINT_ACTIVE) == 0) { break; } /* Oops, still active. Wait for it to clear. */ diff --git a/sys/net/if_gif.c b/sys/net/if_gif.c index 0310969..d381cc0 100644 --- a/sys/net/if_gif.c +++ b/sys/net/if_gif.c @@ -53,6 +53,7 @@ __KERNEL_RCSID(0, "$NetBSD: if_gif.c,v 1.105 2016/01/18 06:08:26 knakahara Exp $ #include #include #include +#include #include #include @@ -100,6 +101,7 @@ static int gif_check_nesting(struct ifnet *, struct mbuf *); static int gif_encap_attach(struct gif_softc *); static int gif_encap_detach(struct gif_softc *); +static void gif_encap_pause(struct gif_softc *); static struct if_clone gif_cloner = IF_CLONE_INITIALIZER("gif", gif_clone_create, gif_clone_destroy); @@ -217,7 +219,8 @@ gif_encapcheck(struct mbuf *m, int off, int proto, void *arg) if (sc == NULL) return 0; - if ((sc->gif_if.if_flags & IFF_UP) == 0) + if ((sc->gif_if.if_flags & (IFF_UP|IFF_RUNNING)) + != (IFF_UP|IFF_RUNNING)) return 0; /* no physical address */ @@ -321,9 +324,8 @@ gif_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, } m->m_flags &= ~(M_BCAST|M_MCAST); - if (!(ifp->if_flags & IFF_UP) || - sc->gif_psrc == NULL || sc->gif_pdst == NULL || - sc->gif_si == NULL) { + if (!(ifp->if_flags & IFF_UP) || /* check IFF_RUNNING later */ + sc->gif_psrc == NULL || sc->gif_pdst == NULL) { m_freem(m); error = ENETDOWN; goto end; @@ -344,6 +346,17 @@ gif_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, m->m_pkthdr.csum_data = 0; s = splnet(); + /* + * This if_flags check, IFQ_ENQUEUE and softint_schedule() are required + * to be done 
atomically on the local CPU, because this local CPU must + * make gif_encap_pause() wait until softint_schedule() completes. + */ + if (!(ifp->if_flags & IFF_RUNNING)) { + splx(s); + m_freem(m); + error = ENETDOWN; + goto end; + } IFQ_ENQUEUE(&ifp->if_snd, m, &pktattr, error); if (error) { splx(s); @@ -376,15 +389,6 @@ gifintr(void *arg) sc = arg; ifp = &sc->gif_if; - /* - * other CPUs does {set,delete}_tunnel after curcpu have done - * softint_schedule(). - */ - if (sc->gif_pdst == NULL || sc->gif_psrc == NULL) { - IFQ_PURGE(&ifp->if_snd); - return; - } - /* output processing */ while (1) { s = splnet(); @@ -776,6 +780,26 @@ gif_encap_detach(struct gif_softc *sc) return error; } +static void +gif_encap_pause(struct gif_softc *sc) +{ + struct ifnet *ifp = &sc->gif_if; + uint64_t where; + + ifp->if_flags &= ~IFF_RUNNING; + membar_sync(); + + /* + * Wait until other CPUs that have already passed the if_flags + * check in gif_output() have finished softint_schedule(). + * In addition, wait until other CPUs that have already passed the + * if_flags check in in_gif_input() or in6_gif_input() have finished + * softint_execute() (ipintr() or ip6intr()). + */ + where = xc_broadcast(0, (xcfunc_t)nullop, NULL, NULL); + xc_wait(where); +} + int gif_set_tunnel(struct ifnet *ifp, struct sockaddr *src, struct sockaddr *dst) { @@ -783,11 +807,11 @@ gif_set_tunnel(struct ifnet *ifp, struct sockaddr *src, struct sockaddr *dst) struct gif_softc *sc2; struct sockaddr *osrc, *odst; struct sockaddr *nsrc, *ndst; - void *osi; int s; int error; s = splsoftnet(); + encap_lock_enter(); LIST_FOREACH(sc2, &gif_softc_list, gif_list) { if (sc2 == sc) @@ -798,6 +822,7 @@ gif_set_tunnel(struct ifnet *ifp, struct sockaddr *src, struct sockaddr *dst) if (sockaddr_cmp(sc2->gif_pdst, dst) == 0 && sockaddr_cmp(sc2->gif_psrc, src) == 0) { /* continue to use the old configureation. 
*/ + encap_lock_exit(); splx(s); return EADDRNOTAVAIL; } @@ -806,42 +831,29 @@ gif_set_tunnel(struct ifnet *ifp, struct sockaddr *src, struct sockaddr *dst) } if ((nsrc = sockaddr_dup(src, M_WAITOK)) == NULL) { + encap_lock_exit(); splx(s); return ENOMEM; } if ((ndst = sockaddr_dup(dst, M_WAITOK)) == NULL) { sockaddr_free(nsrc); + encap_lock_exit(); splx(s); return ENOMEM; } + gif_encap_pause(sc); + /* + * At this point, every in-flight gif_output() call has completed + * and no new call will get past the IFF_RUNNING check, so + * softint_schedule() cannot be called any more. It is therefore + * safe to call softint_disestablish() now. + */ + /* Firstly, clear old configurations. */ if (sc->gif_si) { - osrc = sc->gif_psrc; - odst = sc->gif_pdst; - osi = sc->gif_si; - sc->gif_psrc = NULL; - sc->gif_pdst = NULL; + softint_disestablish(sc->gif_si); sc->gif_si = NULL; - /* - * At this point, gif_output() does not softint_schedule() - * any more. However, there are below 2 fears of other CPUs - * which would cause panic because of the race between - * softint_execute() and softint_disestablish(). 
- * (a) gif_output() has done softint_schedule(), and softint - * (gifintr()) is waiting for execution - * => This pattern is avoided by waiting SOFTINT_PENDING - * CPUs in softint_disestablish() - * (b) gifintr() is already running - * => This pattern is avoided by waiting SOFTINT_ACTIVE - * CPUs in softint_disestablish() - */ - - softint_disestablish(osi); - sc->gif_psrc = osrc; - sc->gif_pdst = odst; - osrc = NULL; - odst = NULL; } /* XXX we can detach from both, but be polite just in case */ if (sc->gif_psrc) @@ -900,6 +912,7 @@ gif_set_tunnel(struct ifnet *ifp, struct sockaddr *src, struct sockaddr *dst) else ifp->if_flags &= ~IFF_RUNNING; + encap_lock_exit(); splx(s); return error; } @@ -908,24 +921,15 @@ void gif_delete_tunnel(struct ifnet *ifp) { struct gif_softc *sc = ifp->if_softc; - struct sockaddr *osrc, *odst; - void *osi; int s; s = splsoftnet(); + encap_lock_enter(); + gif_encap_pause(sc); if (sc->gif_si) { - osrc = sc->gif_psrc; - odst = sc->gif_pdst; - osi = sc->gif_si; - - sc->gif_psrc = NULL; - sc->gif_pdst = NULL; + softint_disestablish(sc->gif_si); sc->gif_si = NULL; - - softint_disestablish(osi); - sc->gif_psrc = osrc; - sc->gif_pdst = odst; } if (sc->gif_psrc) { sockaddr_free(sc->gif_psrc); @@ -947,5 +951,7 @@ gif_delete_tunnel(struct ifnet *ifp) ifp->if_flags |= IFF_RUNNING; else ifp->if_flags &= ~IFF_RUNNING; + + encap_lock_exit(); splx(s); } diff --git a/sys/netinet/in_gif.c b/sys/netinet/in_gif.c index 8cb54fa..86f25bb 100644 --- a/sys/netinet/in_gif.c +++ b/sys/netinet/in_gif.c @@ -216,7 +216,8 @@ in_gif_input(struct mbuf *m, ...) 
gifp = (struct ifnet *)encap_getarg(m); - if (gifp == NULL || (gifp->if_flags & IFF_UP) == 0) { + if (gifp == NULL || (gifp->if_flags & (IFF_UP|IFF_RUNNING)) + != (IFF_UP|IFF_RUNNING)) { m_freem(m); ip_statinc(IP_STAT_NOGIF); return; diff --git a/sys/netinet/ip_encap.c b/sys/netinet/ip_encap.c index b407ea3..a1dc2bf 100644 --- a/sys/netinet/ip_encap.c +++ b/sys/netinet/ip_encap.c @@ -57,15 +57,6 @@ */ /* XXX is M_NETADDR correct? */ -/* - * The code will use radix table for tunnel lookup, for - * tunnels registered with encap_attach() with a addr/mask pair. - * Faster on machines with thousands of tunnel registerations (= interfaces). - * - * The code assumes that radix table code can handle non-continuous netmask, - * as it will pass radix table memory region with (src + dst) sockaddr pair. - */ - #include __KERNEL_RCSID(0, "$NetBSD: ip_encap.c,v 1.48 2016/01/20 05:58:49 knakahara Exp $"); @@ -83,6 +74,9 @@ __KERNEL_RCSID(0, "$NetBSD: ip_encap.c,v 1.48 2016/01/20 05:58:49 knakahara Exp #include #include #include +#include +#include +#include #include #include @@ -118,13 +112,17 @@ static struct encaptab *encap6_lookup(struct mbuf *, int, int, enum direction); static int encap_add(struct encaptab *); static int encap_remove(struct encaptab *); static int encap_afcheck(int, const struct sockaddr *, const struct sockaddr *); -static struct radix_node_head *encap_rnh(int); -static int mask_matchlen(const struct sockaddr *); +static int mask_match(const struct encaptab *, const struct sockaddr *, + const struct sockaddr *); static void encap_fillarg(struct mbuf *, const struct encaptab *); LIST_HEAD(, encaptab) encaptab = LIST_HEAD_INITIALIZER(&encaptab); -struct radix_node_head *encap_head[2]; /* 0 for AF_INET, 1 for AF_INET6 */ +static ONCE_DECL(encap_init_control); +static kmutex_t encaptab_list_lock __cacheline_aligned; +static pserialize_t encap_psz __cacheline_aligned; + +static int encap_init_once(void); void encap_init(void) @@ -144,16 +142,6 @@ 
encap_init(void) */ LIST_INIT(&encaptab); #endif - - /* - * initialize radix lookup table when the radix subsystem is inited. - */ - rn_delayedinit((void *)&encap_head[0], - sizeof(struct sockaddr_pack) << 3); -#ifdef INET6 - rn_delayedinit((void *)&encap_head[1], - sizeof(struct sockaddr_pack) << 3); -#endif } #ifdef INET @@ -164,8 +152,6 @@ encap4_lookup(struct mbuf *m, int off, int proto, enum direction dir) struct ip_pack4 pack; struct encaptab *ep, *match; int prio, matchprio; - struct radix_node_head *rnh = encap_rnh(AF_INET); - struct radix_node *rn; KASSERT(m->m_len >= sizeof(*ip)); @@ -186,13 +172,11 @@ encap4_lookup(struct mbuf *m, int off, int proto, enum direction dir) match = NULL; matchprio = 0; - rn = rnh->rnh_matchaddr((void *)&pack, rnh); - if (rn && (rn->rn_flags & RNF_ROOT) == 0) { - match = (struct encaptab *)rn; - matchprio = mask_matchlen(match->srcmask) + - mask_matchlen(match->dstmask); - } - + /* + * TODO: + * make it quick to search in many tunnel interfaces somehow + * other than radix tree. + */ LIST_FOREACH(ep, &encaptab, chain) { if (ep->af != AF_INET) continue; @@ -201,7 +185,8 @@ encap4_lookup(struct mbuf *m, int off, int proto, enum direction dir) if (ep->func) prio = (*ep->func)(m, off, proto, ep->arg); else - continue; + prio = mask_match(ep, (struct sockaddr *)&pack.mine, + (struct sockaddr *)&pack.yours); /* * We prioritize the matches by using bit length of the @@ -220,9 +205,6 @@ encap4_lookup(struct mbuf *m, int off, int proto, enum direction dir) * We need to loop through all the possible candidates * to get the best match - the search takes O(n) for * n attachments (i.e. interfaces). - * - * For radix-based lookup, I guess source takes precedence. - * See rn_{refines,lexobetter} for the correct answer. */ if (prio <= 0) continue; @@ -242,12 +224,14 @@ encap4_input(struct mbuf *m, ...) 
va_list ap; const struct protosw *psw; struct encaptab *match; + int s; va_start(ap, m); off = va_arg(ap, int); proto = va_arg(ap, int); va_end(ap); + s = pserialize_read_enter(); match = encap4_lookup(m, off, proto, INBOUND); if (match) { @@ -255,11 +239,15 @@ encap4_input(struct mbuf *m, ...) psw = match->psw; if (psw && psw->pr_input) { encap_fillarg(m, match); + pserialize_read_exit(s); (*psw->pr_input)(m, off, proto); - } else + } else { + pserialize_read_exit(s); m_freem(m); + } return; } + pserialize_read_exit(s); /* last resort: inject to raw socket */ rip_input(m, off, proto); @@ -274,8 +262,6 @@ encap6_lookup(struct mbuf *m, int off, int proto, enum direction dir) struct ip_pack6 pack; int prio, matchprio; struct encaptab *ep, *match; - struct radix_node_head *rnh = encap_rnh(AF_INET6); - struct radix_node *rn; KASSERT(m->m_len >= sizeof(*ip6)); @@ -296,13 +282,6 @@ encap6_lookup(struct mbuf *m, int off, int proto, enum direction dir) match = NULL; matchprio = 0; - rn = rnh->rnh_matchaddr((void *)&pack, rnh); - if (rn && (rn->rn_flags & RNF_ROOT) == 0) { - match = (struct encaptab *)rn; - matchprio = mask_matchlen(match->srcmask) + - mask_matchlen(match->dstmask); - } - LIST_FOREACH(ep, &encaptab, chain) { if (ep->af != AF_INET6) continue; @@ -311,7 +290,8 @@ encap6_lookup(struct mbuf *m, int off, int proto, enum direction dir) if (ep->func) prio = (*ep->func)(m, off, proto, ep->arg); else - continue; + prio = mask_match(ep, (struct sockaddr *)&pack.mine, + (struct sockaddr *)&pack.yours); /* see encap4_lookup() for issues here */ if (prio <= 0) @@ -331,7 +311,9 @@ encap6_input(struct mbuf **mp, int *offp, int proto) struct mbuf *m = *mp; const struct ip6protosw *psw; struct encaptab *match; + int s; + s = pserialize_read_enter(); match = encap6_lookup(m, *offp, proto, INBOUND); if (match) { @@ -339,12 +321,15 @@ encap6_input(struct mbuf **mp, int *offp, int proto) psw = (const struct ip6protosw *)match->psw; if (psw && psw->pr_input) { encap_fillarg(m, 
match); + pserialize_read_exit(s); return (*psw->pr_input)(mp, offp, proto); } else { + pserialize_read_exit(s); m_freem(m); return IPPROTO_DONE; } } + pserialize_read_exit(s); /* last resort: inject to raw socket */ return rip6_input(mp, offp, proto); @@ -354,37 +339,19 @@ encap6_input(struct mbuf **mp, int *offp, int proto) static int encap_add(struct encaptab *ep) { - struct radix_node_head *rnh = encap_rnh(ep->af); - int error = 0; LIST_INSERT_HEAD(&encaptab, ep, chain); - if (!ep->func && rnh) { - if (!rnh->rnh_addaddr((void *)ep->addrpack, - (void *)ep->maskpack, rnh, ep->nodes)) { - error = EEXIST; - goto fail; - } - } - return error; - fail: - LIST_REMOVE(ep, chain); - return error; + return 0; } static int encap_remove(struct encaptab *ep) { - struct radix_node_head *rnh = encap_rnh(ep->af); - int error = 0; LIST_REMOVE(ep, chain); - if (!ep->func && rnh) { - if (!rnh->rnh_deladdr((void *)ep->addrpack, - (void *)ep->maskpack, rnh)) - error = ESRCH; - } - return error; + + return 0; } static int @@ -422,6 +389,16 @@ encap_afcheck(int af, const struct sockaddr *sp, const struct sockaddr *dp) return 0; } +static int +encap_init_once(void) +{ + + encap_psz = pserialize_create(); + mutex_init(&encaptab_list_lock, MUTEX_DEFAULT, IPL_SOFTNET); + + return 0; +} + /* * sp (src ptr) is always my side, and dp (dst ptr) is always remote side. * length of mask (sm and dm) is assumed to be same as sp/dp. 
@@ -442,6 +419,8 @@ encap_attach(int af, int proto, struct ip_pack6 *pack6; #endif + RUN_ONCE(&encap_init_control, encap_init_once); + s = splsoftnet(); /* sanity check on args */ error = encap_afcheck(af, sp, dp); @@ -537,9 +516,14 @@ encap_attach(int af, int proto, ep->psw = psw; ep->arg = arg; + mutex_enter(&encaptab_list_lock); error = encap_add(ep); - if (error) + if (error) { + mutex_exit(&encaptab_list_lock); goto gc; + } + pserialize_perform(encap_psz); + mutex_exit(&encaptab_list_lock); error = 0; splx(s); @@ -566,6 +550,8 @@ encap_attach_func(int af, int proto, int error; int s; + RUN_ONCE(&encap_init_control, encap_init_once); + s = splsoftnet(); /* sanity check on args */ if (!func) { @@ -590,9 +576,14 @@ encap_attach_func(int af, int proto, ep->psw = psw; ep->arg = arg; + mutex_enter(&encaptab_list_lock); error = encap_add(ep); - if (error) + if (error) { + mutex_exit(&encaptab_list_lock); goto fail; + } + pserialize_perform(encap_psz); + mutex_exit(&encaptab_list_lock); error = 0; splx(s); @@ -617,6 +608,7 @@ encap6_ctlinput(int cmd, const struct sockaddr *sa, void *d0) int nxt; struct encaptab *ep; const struct ip6protosw *psw; + int s; if (sa->sa_family != AF_INET6 || sa->sa_len != sizeof(struct sockaddr_in6)) @@ -646,9 +638,11 @@ encap6_ctlinput(int cmd, const struct sockaddr *sa, void *d0) /* * Check to see if we have a valid encap configuration. 
*/ + s = pserialize_read_enter(); match = encap6_lookup(m, off, nxt, OUTBOUND); if (match) valid++; + pserialize_read_exit(s); /* * Depending on the value of "valid" and routing table @@ -666,6 +660,7 @@ encap6_ctlinput(int cmd, const struct sockaddr *sa, void *d0) } /* inform all listeners */ + s = pserialize_read_enter(); LIST_FOREACH(ep, &encaptab, chain) { if (ep->af != AF_INET6) continue; @@ -679,6 +674,7 @@ encap6_ctlinput(int cmd, const struct sockaddr *sa, void *d0) if (psw && psw->pr_ctlinput) (*psw->pr_ctlinput)(cmd, sa, d); } + pserialize_read_exit(s); rip6_ctlinput(cmd, sa, d0); return NULL; @@ -692,55 +688,81 @@ encap_detach(const struct encaptab *cookie) struct encaptab *p, *np; int error; + mutex_enter(&encaptab_list_lock); LIST_FOREACH_SAFE(p, &encaptab, chain, np) { if (p == ep) { error = encap_remove(p); - if (error) + if (error) { + mutex_exit(&encaptab_list_lock); return error; + } + pserialize_perform(encap_psz); + if (!ep->func) { kmem_free(p->addrpack, ep->addrpack->sa_len); kmem_free(p->maskpack, ep->maskpack->sa_len); } kmem_free(p, sizeof(*p)); /*XXX*/ + mutex_exit(&encaptab_list_lock); return 0; } } + mutex_exit(&encaptab_list_lock); return ENOENT; } -static struct radix_node_head * -encap_rnh(int af) -{ - - switch (af) { - case AF_INET: - return encap_head[0]; -#ifdef INET6 - case AF_INET6: - return encap_head[1]; -#endif - default: - return NULL; - } -} - static int -mask_matchlen(const struct sockaddr *sa) +mask_match(const struct encaptab *ep, const struct sockaddr *sp, + const struct sockaddr *dp) { - const char *p, *ep; - int l; - - p = (const char *)sa; - ep = p + sa->sa_len; - p += 2; /* sa_len + sa_family */ + struct sockaddr_storage s; + struct sockaddr_storage d; + int i; + const u_int8_t *p, *q; + u_int8_t *r; + int matchlen; + + KASSERTMSG(ep->func == NULL, "wrong encaptab passed to mask_match"); + + if (sp->sa_len > sizeof(s) || dp->sa_len > sizeof(d)) + return 0; + if (sp->sa_family != ep->af || dp->sa_family != ep->af) + 
return 0; + if (sp->sa_len != ep->src->sa_len || dp->sa_len != ep->dst->sa_len) + return 0; + + matchlen = 0; + + p = (const u_int8_t *)sp; + q = (const u_int8_t *)ep->srcmask; + r = (u_int8_t *)&s; + for (i = 0 ; i < sp->sa_len; i++) { + r[i] = p[i] & q[i]; + /* XXX estimate */ + matchlen += (q[i] ? 8 : 0); + } - l = 0; - while (p < ep) { - l += (*p ? 8 : 0); /* estimate */ - p++; + p = (const u_int8_t *)dp; + q = (const u_int8_t *)ep->dstmask; + r = (u_int8_t *)&d; + for (i = 0 ; i < dp->sa_len; i++) { + r[i] = p[i] & q[i]; + /* XXX rough estimate */ + matchlen += (q[i] ? 8 : 0); } - return l; + + /* need to overwrite len/family portion as we don't compare them */ + s.ss_len = sp->sa_len; + s.ss_family = sp->sa_family; + d.ss_len = dp->sa_len; + d.ss_family = dp->sa_family; + + if (memcmp(&s, ep->src, ep->src->sa_len) == 0 && + memcmp(&d, ep->dst, ep->dst->sa_len) == 0) { + return matchlen; + } else + return 0; } static void @@ -769,3 +791,21 @@ encap_getarg(struct mbuf *m) } return p; } + +/* + * TODO: + * make interruptible lock using mutex and cv_wait. 
+ */ +void +encap_lock_enter(void) +{ + + KERNEL_LOCK(1, NULL); +} + +void +encap_lock_exit(void) +{ + + KERNEL_UNLOCK_ONE(NULL); +} diff --git a/sys/netinet/ip_encap.h b/sys/netinet/ip_encap.h index f6f9fc6..8411aae 100644 --- a/sys/netinet/ip_encap.h +++ b/sys/netinet/ip_encap.h @@ -85,6 +85,9 @@ const struct encaptab *encap_attach_func(int, int, void *encap6_ctlinput(int, const struct sockaddr *, void *); int encap_detach(const struct encaptab *); void *encap_getarg(struct mbuf *); + +void encap_lock_enter(void); +void encap_lock_exit(void); #endif #endif /* !_NETINET_IP_ENCAP_H_ */ diff --git a/sys/netinet6/in6_gif.c b/sys/netinet6/in6_gif.c index 8922547..347e773 100644 --- a/sys/netinet6/in6_gif.c +++ b/sys/netinet6/in6_gif.c @@ -218,7 +218,8 @@ in6_gif_input(struct mbuf **mp, int *offp, int proto) gifp = (struct ifnet *)encap_getarg(m); - if (gifp == NULL || (gifp->if_flags & IFF_UP) == 0) { + if (gifp == NULL || (gifp->if_flags & (IFF_UP|IFF_RUNNING)) + != (IFF_UP|IFF_RUNNING)) { m_freem(m); IP6_STATINC(IP6_STAT_NOGIF); return IPPROTO_DONE;