From 7da7e984681d77f62e85dc79d14a35b900c8a66e Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Wed, 12 Jan 2022 00:46:40 +0000 Subject: [PATCH 01/30] driver(9): devsw_detach never fails. Make it return void. Prune a whole lotta dead branches as a result of this. (Some logic calling this is also wrong for other reasons; devsw_detach is final -- you should never have any reason to decide to roll it back. To be cleaned up in subsequent commits...) --- external/cddl/osnet/dev/dtrace/dtrace_modevent.c | 4 +--- external/cddl/osnet/dev/fbt/fbt.c | 3 ++- external/cddl/osnet/dev/sdt/sdt.c | 3 ++- .../cddl/osnet/dist/uts/common/fs/zfs/zfs_ioctl.c | 2 +- share/man/man9/devsw_attach.9 | 15 +++++++++------ sys/coda/coda_psdev.c | 2 +- sys/dev/ccd.c | 2 +- sys/dev/clockctl.c | 8 +++----- sys/dev/hdaudio/hdaudio.c | 6 +----- sys/dev/i2c/i2c.c | 7 ++----- sys/dev/pad/pad.c | 4 +--- sys/dev/raidframe/rf_netbsdkintf.c | 11 +---------- sys/dev/sysmon/sysmon.c | 2 +- sys/dev/tprof/tprof.c | 8 +------- sys/dist/pf/net/pf_ioctl.c | 3 ++- sys/external/bsd/ipf/netinet/ip_fil_netbsd.c | 2 +- sys/fs/autofs/autofs_vfsops.c | 4 +--- sys/kern/kern_drvctl.c | 9 ++------- sys/kern/subr_devsw.c | 3 +-- sys/modules/examples/pollpal/pollpal.c | 3 ++- sys/net/if_tap.c | 5 +---- sys/net/if_tun.c | 9 +-------- sys/rump/dev/lib/libbpf/bpf_component.c | 3 +-- sys/rump/dev/lib/libdrvctl/drvctl_component.c | 4 +--- sys/sys/conf.h | 2 +- 25 files changed, 41 insertions(+), 83 deletions(-) diff --git a/external/cddl/osnet/dev/dtrace/dtrace_modevent.c b/external/cddl/osnet/dev/dtrace/dtrace_modevent.c index cc0f8103fb98..f1338840125a 100644 --- a/external/cddl/osnet/dev/dtrace/dtrace_modevent.c +++ b/external/cddl/osnet/dev/dtrace/dtrace_modevent.c @@ -42,9 +42,7 @@ dtrace_modcmd(modcmd_t cmd, void *data) return error; case MODULE_CMD_FINI: - error = devsw_detach(NULL, &dtrace_cdevsw); - if (error != 0) - return error; + devsw_detach(NULL, &dtrace_cdevsw); error = dtrace_unload(); if (error != 0) { 
diff --git a/external/cddl/osnet/dev/fbt/fbt.c b/external/cddl/osnet/dev/fbt/fbt.c index b367c2155292..46dd7c1f7f06 100644 --- a/external/cddl/osnet/dev/fbt/fbt.c +++ b/external/cddl/osnet/dev/fbt/fbt.c @@ -1329,7 +1329,8 @@ dtrace_fbt_modcmd(modcmd_t cmd, void *data) error = fbt_unload(); if (error != 0) return error; - return devsw_detach(NULL, &fbt_cdevsw); + devsw_detach(NULL, &fbt_cdevsw); + return 0; case MODULE_CMD_AUTOUNLOAD: return EBUSY; default: diff --git a/external/cddl/osnet/dev/sdt/sdt.c b/external/cddl/osnet/dev/sdt/sdt.c index c3ad129f8284..5a41270a2917 100644 --- a/external/cddl/osnet/dev/sdt/sdt.c +++ b/external/cddl/osnet/dev/sdt/sdt.c @@ -562,7 +562,8 @@ dtrace_sdt_modcmd(modcmd_t cmd, void *data) error = sdt_unload(); if (error != 0) return error; - return devsw_detach(NULL, &sdt_cdevsw); + devsw_detach(NULL, &sdt_cdevsw); + return 0; case MODULE_CMD_AUTOUNLOAD: return EBUSY; default: diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_ioctl.c b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_ioctl.c index 9e19cd1dc0c3..d74d8c71e54d 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_ioctl.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_ioctl.c @@ -7231,7 +7231,7 @@ zfs_modcmd(modcmd_t cmd, void *arg) if (error) return error; - (void) devsw_detach(&zfs_bdevsw, &zfs_cdevsw); + devsw_detach(&zfs_bdevsw, &zfs_cdevsw); attacherr: zfs_sysctl_fini(); diff --git a/share/man/man9/devsw_attach.9 b/share/man/man9/devsw_attach.9 index cf862be5846a..6ffc51957a3f 100644 --- a/share/man/man9/devsw_attach.9 +++ b/share/man/man9/devsw_attach.9 @@ -27,7 +27,7 @@ .\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE .\" POSSIBILITY OF SUCH DAMAGE. 
.\" -.Dd April 30, 2017 +.Dd January 11, 2022 .Dt DEVSW 9 .Os .Sh NAME @@ -49,7 +49,7 @@ .Fa "const struct cdevsw *cdev" .Fa "devmajor_t *cmajor" .Fc -.Ft int +.Ft void .Fo devsw_detach .Fa "const struct bdevsw *bdev" .Fa "const struct cdevsw *cdev" @@ -130,6 +130,11 @@ and structures. .Fn devsw_detach should be called before a loaded device driver is unloaded. +The caller must ensure that there are no open instances of the device, +and that the device's +.Fn d_open +function will fail, before calling. +Fn devsw_detach . .Pp The .Fn bdevsw_lookup @@ -155,10 +160,8 @@ or .Sh RETURN VALUES Upon successful completion, .Fn devsw_attach -and -.Fn devsw_detach -return 0. -Otherwise they return an error value. +returns 0. +Otherwise it returns an error value. .Pp In case of failure, .Fn bdevsw_lookup diff --git a/sys/coda/coda_psdev.c b/sys/coda/coda_psdev.c index cede16da3f53..7f531f03fe56 100644 --- a/sys/coda/coda_psdev.c +++ b/sys/coda/coda_psdev.c @@ -758,7 +758,7 @@ vcoda_modcmd(modcmd_t cmd, void *arg) if (VC_OPEN(vcp)) return EBUSY; } - return devsw_detach(NULL, &vcoda_cdevsw); + devsw_detach(NULL, &vcoda_cdevsw); } #endif break; diff --git a/sys/dev/ccd.c b/sys/dev/ccd.c index 05945f9a67ba..2283bc0346da 100644 --- a/sys/dev/ccd.c +++ b/sys/dev/ccd.c @@ -1710,7 +1710,7 @@ ccd_modcmd(modcmd_t cmd, void *arg) error = EBUSY; } else { mutex_exit(&ccd_lock); - error = devsw_detach(&ccd_bdevsw, &ccd_cdevsw); + devsw_detach(&ccd_bdevsw, &ccd_cdevsw); ccddetach(); } #endif diff --git a/sys/dev/clockctl.c b/sys/dev/clockctl.c index 0da5e7765fe8..9685c0f129f6 100644 --- a/sys/dev/clockctl.c +++ b/sys/dev/clockctl.c @@ -182,14 +182,12 @@ clockctl_modcmd(modcmd_t cmd, void *data) return EBUSY; } #ifdef _MODULE - error = devsw_detach(NULL, &clockctl_cdevsw); + devsw_detach(NULL, &clockctl_cdevsw); #endif mutex_exit(&clockctl_mtx); - if (error == 0) { - kauth_unlisten_scope(clockctl_listener); - mutex_destroy(&clockctl_mtx); - } + kauth_unlisten_scope(clockctl_listener); + 
mutex_destroy(&clockctl_mtx); break; default: diff --git a/sys/dev/hdaudio/hdaudio.c b/sys/dev/hdaudio/hdaudio.c index d39ff2db6cde..5c7874778e22 100644 --- a/sys/dev/hdaudio/hdaudio.c +++ b/sys/dev/hdaudio/hdaudio.c @@ -1636,11 +1636,7 @@ hdaudio_modcmd(modcmd_t cmd, void *opaque) error = config_cfdriver_detach(&hdaudio_cd); if (error) break; - error = devsw_detach(NULL, &hdaudio_cdevsw); - if (error) { - config_cfdriver_attach(&hdaudio_cd); - break; - } + devsw_detach(NULL, &hdaudio_cdevsw); #endif break; default: diff --git a/sys/dev/i2c/i2c.c b/sys/dev/i2c/i2c.c index 322f6ad3f199..b56b3e235112 100644 --- a/sys/dev/i2c/i2c.c +++ b/sys/dev/i2c/i2c.c @@ -904,7 +904,7 @@ iic_modcmd(modcmd_t cmd, void *opaque) if (error) { aprint_error("%s: unable to init component\n", iic_cd.cd_name); - (void)devsw_detach(NULL, &iic_cdevsw); + devsw_detach(NULL, &iic_cdevsw); } mutex_exit(&iic_mtx); #endif @@ -922,10 +922,7 @@ iic_modcmd(modcmd_t cmd, void *opaque) mutex_exit(&iic_mtx); break; } - error = devsw_detach(NULL, &iic_cdevsw); - if (error != 0) - config_init_component(cfdriver_ioconf_iic, - cfattach_ioconf_iic, cfdata_ioconf_iic); + devsw_detach(NULL, &iic_cdevsw); #endif mutex_exit(&iic_mtx); break; diff --git a/sys/dev/pad/pad.c b/sys/dev/pad/pad.c index fe0b429cf386..a779f1f71b8d 100644 --- a/sys/dev/pad/pad.c +++ b/sys/dev/pad/pad.c @@ -777,9 +777,7 @@ pad_modcmd(modcmd_t cmd, void *arg) case MODULE_CMD_FINI: #ifdef _MODULE - error = devsw_detach(NULL, &pad_cdevsw); - if (error) - break; + devsw_detach(NULL, &pad_cdevsw); error = config_fini_component(cfdriver_ioconf_pad, pad_cfattach, cfdata_ioconf_pad); diff --git a/sys/dev/raidframe/rf_netbsdkintf.c b/sys/dev/raidframe/rf_netbsdkintf.c index d1bda3553e03..87439aa70bfb 100644 --- a/sys/dev/raidframe/rf_netbsdkintf.c +++ b/sys/dev/raidframe/rf_netbsdkintf.c @@ -4088,16 +4088,7 @@ raid_modcmd_fini(void) return error; } #endif - error = devsw_detach(&raid_bdevsw, &raid_cdevsw); - if (error != 0) { - aprint_error("%s: 
cannot detach devsw\n",__func__); -#ifdef _MODULE - config_cfdriver_attach(&raid_cd); -#endif - config_cfattach_attach(raid_cd.cd_name, &raid_ca); - mutex_exit(&raid_lock); - return error; - } + devsw_detach(&raid_bdevsw, &raid_cdevsw); rf_BootRaidframe(false); #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0) rf_destroy_mutex2(rf_sparet_wait_mutex); diff --git a/sys/dev/sysmon/sysmon.c b/sys/dev/sysmon/sysmon.c index 46aedaad7337..fd2993f0c180 100644 --- a/sys/dev/sysmon/sysmon.c +++ b/sys/dev/sysmon/sysmon.c @@ -356,7 +356,7 @@ sysmon_fini(void) if (error == 0) { mutex_enter(&sysmon_minor_mtx); sm_is_attached = false; - error = devsw_detach(NULL, &sysmon_cdevsw); + devsw_detach(NULL, &sysmon_cdevsw); mutex_exit(&sysmon_minor_mtx); } #endif diff --git a/sys/dev/tprof/tprof.c b/sys/dev/tprof/tprof.c index b069a5b7df5d..136fd190ad14 100644 --- a/sys/dev/tprof/tprof.c +++ b/sys/dev/tprof/tprof.c @@ -768,13 +768,7 @@ tprof_modcmd(modcmd_t cmd, void *arg) case MODULE_CMD_FINI: #if defined(_MODULE) - { - int error; - error = devsw_detach(NULL, &tprof_cdevsw); - if (error) { - return error; - } - } + devsw_detach(NULL, &tprof_cdevsw); #endif /* defined(_MODULE) */ tprof_driver_fini(); return 0; diff --git a/sys/dist/pf/net/pf_ioctl.c b/sys/dist/pf/net/pf_ioctl.c index 94bfb70a411d..e4c13be698f8 100644 --- a/sys/dist/pf/net/pf_ioctl.c +++ b/sys/dist/pf/net/pf_ioctl.c @@ -3459,7 +3459,8 @@ pf_modcmd(modcmd_t cmd, void *opaque) } else { pfdetach(); pflogdetach(); - return devsw_detach(NULL, &pf_cdevsw); + devsw_detach(NULL, &pf_cdevsw); + return 0; } default: return ENOTTY; diff --git a/sys/external/bsd/ipf/netinet/ip_fil_netbsd.c b/sys/external/bsd/ipf/netinet/ip_fil_netbsd.c index d0c4ca95097c..bb4e0706cc9b 100644 --- a/sys/external/bsd/ipf/netinet/ip_fil_netbsd.c +++ b/sys/external/bsd/ipf/netinet/ip_fil_netbsd.c @@ -2256,7 +2256,7 @@ ipl_fini(void *opaque) { #ifdef _MODULE - (void)devsw_detach(NULL, &ipl_cdevsw); + devsw_detach(NULL, &ipl_cdevsw); #endif /* diff --git 
a/sys/fs/autofs/autofs_vfsops.c b/sys/fs/autofs/autofs_vfsops.c index fbd6eafe6532..1204d1f9b6d3 100644 --- a/sys/fs/autofs/autofs_vfsops.c +++ b/sys/fs/autofs/autofs_vfsops.c @@ -496,9 +496,7 @@ autofs_modcmd(modcmd_t cmd, void *arg) } mutex_exit(&autofs_softc->sc_lock); - error = devsw_detach(NULL, &autofs_cdevsw); - if (error) - break; + devsw_detach(NULL, &autofs_cdevsw); #endif error = vfs_detach(&autofs_vfsops); if (error) diff --git a/sys/kern/kern_drvctl.c b/sys/kern/kern_drvctl.c index 37f4730b2512..8a4156f8a0aa 100644 --- a/sys/kern/kern_drvctl.c +++ b/sys/kern/kern_drvctl.c @@ -665,15 +665,10 @@ drvctl_modcmd(modcmd_t cmd, void *arg) devmon_insert_vec = saved_insert_vec; saved_insert_vec = NULL; #ifdef _MODULE - error = devsw_detach(NULL, &drvctl_cdevsw); - if (error != 0) { - saved_insert_vec = devmon_insert_vec; - devmon_insert_vec = devmon_insert; - } + devsw_detach(NULL, &drvctl_cdevsw); #endif mutex_exit(&drvctl_lock); - if (error == 0) - drvctl_fini(); + drvctl_fini(); break; default: diff --git a/sys/kern/subr_devsw.c b/sys/kern/subr_devsw.c index 1a0f721fdd65..b95a09d3e6b6 100644 --- a/sys/kern/subr_devsw.c +++ b/sys/kern/subr_devsw.c @@ -343,14 +343,13 @@ devsw_detach_locked(const struct bdevsw *bdev, const struct cdevsw *cdev) } } -int +void devsw_detach(const struct bdevsw *bdev, const struct cdevsw *cdev) { mutex_enter(&device_lock); devsw_detach_locked(bdev, cdev); mutex_exit(&device_lock); - return 0; } /* diff --git a/sys/modules/examples/pollpal/pollpal.c b/sys/modules/examples/pollpal/pollpal.c index b76e0733699b..d8ddc73450e0 100644 --- a/sys/modules/examples/pollpal/pollpal.c +++ b/sys/modules/examples/pollpal/pollpal.c @@ -311,7 +311,8 @@ pollpal_modcmd(modcmd_t cmd, void *arg __unused) case MODULE_CMD_FINI: if (pollpal_nopen != 0) return EBUSY; - return devsw_detach(NULL, &pollpal_cdevsw); + devsw_detach(NULL, &pollpal_cdevsw); + return 0; default: return ENOTTY; } diff --git a/sys/net/if_tap.c b/sys/net/if_tap.c index 
0b57ad4a711c..314f4647707c 100644 --- a/sys/net/if_tap.c +++ b/sys/net/if_tap.c @@ -256,9 +256,7 @@ tapdetach(void) if_clone_detach(&tap_cloners); #ifdef _MODULE - error = devsw_detach(NULL, &tap_cdevsw); - if (error != 0) - goto out2; + devsw_detach(NULL, &tap_cdevsw); #endif if (tap_count != 0) { @@ -277,7 +275,6 @@ tapdetach(void) out1: #ifdef _MODULE devsw_attach("tap", NULL, &tap_bmajor, &tap_cdevsw, &tap_cmajor); - out2: #endif if_clone_attach(&tap_cloners); diff --git a/sys/net/if_tun.c b/sys/net/if_tun.c index 4f533a8f08d1..f4e5b6d86d43 100644 --- a/sys/net/if_tun.c +++ b/sys/net/if_tun.c @@ -142,17 +142,10 @@ tuninit(void) static int tundetach(void) { -#ifdef _MODULE - int error; -#endif if_clone_detach(&tun_cloner); #ifdef _MODULE - error = devsw_detach(NULL, &tun_cdevsw); - if (error != 0) { - if_clone_attach(&tun_cloner); - return error; - } + devsw_detach(NULL, &tun_cdevsw); #endif if (!LIST_EMPTY(&tun_softc_list) || !LIST_EMPTY(&tunz_softc_list)) { diff --git a/sys/rump/dev/lib/libbpf/bpf_component.c b/sys/rump/dev/lib/libbpf/bpf_component.c index 05807d371d40..d41d1987afe8 100644 --- a/sys/rump/dev/lib/libbpf/bpf_component.c +++ b/sys/rump/dev/lib/libbpf/bpf_component.c @@ -50,6 +50,5 @@ RUMP_COMPONENT(RUMP_COMPONENT_NET) panic("bpf devsw attach failed: %d", error); if ((error = rump_vfs_makeonedevnode(S_IFCHR, "/dev/bpf", cmaj, 0)) !=0) panic("cannot create bpf device nodes: %d", error); - if ((error = devsw_detach(NULL, &bpf_cdevsw)) != 0) - panic("cannot detach bpf devsw: %d", error); + devsw_detach(NULL, &bpf_cdevsw); } diff --git a/sys/rump/dev/lib/libdrvctl/drvctl_component.c b/sys/rump/dev/lib/libdrvctl/drvctl_component.c index e2e79f45f9de..ac4e103fdb9c 100644 --- a/sys/rump/dev/lib/libdrvctl/drvctl_component.c +++ b/sys/rump/dev/lib/libdrvctl/drvctl_component.c @@ -51,7 +51,5 @@ RUMP_COMPONENT(RUMP_COMPONENT_DEV) if ( error !=0) panic("cannot create drvctl device node: %d", error); - error = devsw_detach(NULL, &drvctl_cdevsw); - if (error != 
0) - panic("cannot detach drvctl devsw: %d", error); + devsw_detach(NULL, &drvctl_cdevsw); } diff --git a/sys/sys/conf.h b/sys/sys/conf.h index 081631d2111f..ba46976a924b 100644 --- a/sys/sys/conf.h +++ b/sys/sys/conf.h @@ -104,7 +104,7 @@ extern kmutex_t device_lock; int devsw_attach(const char *, const struct bdevsw *, devmajor_t *, const struct cdevsw *, devmajor_t *); -int devsw_detach(const struct bdevsw *, const struct cdevsw *); +void devsw_detach(const struct bdevsw *, const struct cdevsw *); const struct bdevsw *bdevsw_lookup(dev_t); const struct cdevsw *cdevsw_lookup(dev_t); devmajor_t bdevsw_lookup_major(const struct bdevsw *); From 42bc5693680c9c8d1e85cbcbe390b31f12d5d48c Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Wed, 12 Jan 2022 01:05:24 +0000 Subject: [PATCH 02/30] driver(9): Fix synchronization of devsw_attach/lookup/detach. (`dev' means either `bdev' or `cdev' for brevity here, e.g. in `devsw_lookup' (bdevsw_lookup, cdevsw_lookup), `dev_open' (bdev_open, cdev_open), `maxdevsws', &c., except for `devsw_attach' and `devsw_detach' which are taken literally.) - Use atomic_store_release and atomic_load_consume for devsw and tables and their entries, which are read unlocked and thus require memory barriers to ensure ordering between initialization in devsw_attach and use in dev_lookup. - Use pserialize(9) and localcount(9) to synchronize dev_open and devsw_detach. => Driver must ensure d_open fails and all open instances have been closed by the time it calls devsw_detach. => Bonus: dev_open is no longer globally serialized through device_lock. - Use atomic_store_release and atomic_load_acquire for max_devsws, which is used in conditionals in the new devsw_lookup_acquire. => It is safe to use atomic_load_relaxed in devsw_lookup because the caller must guarantee the entry is stable, so any increase of max_devsws must have already happened. => devsw_lookup and devsw_lookup_acquire assume that max_devsws never goes down. 
If you change this you must find some way to adapt the users, preferably without adding much overhead so that devsw operations are cheap. This change introduces an auxiliary table devswref mapping device majors to localcounts of opens in progress. The auxiliary table only occupies one pointer's worth of memory in a monolithic kernel, and is allocated on the fly for dynamically loaded modules. We could ask the module itself to reserve storage for it, but I don't see much value in that, and it would require some changes to the ABI and to config(8). --- sys/kern/subr_devsw.c | 300 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 246 insertions(+), 54 deletions(-) diff --git a/sys/kern/subr_devsw.c b/sys/kern/subr_devsw.c index b95a09d3e6b6..d22b088826dd 100644 --- a/sys/kern/subr_devsw.c +++ b/sys/kern/subr_devsw.c @@ -85,6 +85,9 @@ __KERNEL_RCSID(0, "$NetBSD: subr_devsw.c,v 1.38 2017/11/07 18:35:57 christos Exp #include #include #include +#include +#include +#include #ifdef DEVSW_DEBUG #define DPRINTF(x) printf x @@ -97,12 +100,23 @@ __KERNEL_RCSID(0, "$NetBSD: subr_devsw.c,v 1.38 2017/11/07 18:35:57 christos Exp #define CDEVSW_SIZE (sizeof(struct cdevsw *)) #define DEVSWCONV_SIZE (sizeof(struct devsw_conv)) +struct devswref { + struct localcount *dr_lc; + bool dr_dynamic; +}; + +/* XXX bdevsw, cdevsw, max_bdevsws, and max_cdevsws should be volatile */ extern const struct bdevsw **bdevsw, *bdevsw0[]; extern const struct cdevsw **cdevsw, *cdevsw0[]; extern struct devsw_conv *devsw_conv, devsw_conv0[]; extern const int sys_bdevsws, sys_cdevsws; extern int max_bdevsws, max_cdevsws, max_devsw_convs; +static struct devswref *cdevswref; +static struct devswref *bdevswref; +static struct pserialize *devsw_psz; +static kcondvar_t devsw_cv; + static int bdevsw_attach(const struct bdevsw *, devmajor_t *); static int cdevsw_attach(const struct cdevsw *, devmajor_t *); static void devsw_detach_locked(const struct bdevsw *, const struct cdevsw *); @@ -118,6 +132,9 @@ 
devsw_init(void) KASSERT(sys_bdevsws < MAXDEVSW - 1); KASSERT(sys_cdevsws < MAXDEVSW - 1); mutex_init(&device_lock, MUTEX_DEFAULT, IPL_NONE); + + devsw_psz = pserialize_create(); + cv_init(&devsw_cv, "devsw"); } int @@ -158,15 +175,12 @@ devsw_attach(const char *devname, error = EEXIST; goto fail; } - - if (bdev != NULL) - bdevsw[*bmajor] = bdev; - cdevsw[*cmajor] = cdev; - - mutex_exit(&device_lock); - return (0); } + /* + * XXX This should allocate what it needs up front so we never + * need to flail around trying to unwind. + */ error = bdevsw_attach(bdev, bmajor); if (error != 0) goto fail; @@ -176,6 +190,13 @@ devsw_attach(const char *devname, goto fail; } + /* + * If we already found a conv, we're done. Otherwise, find an + * empty slot or extend the table. + */ + if (i == max_devsw_convs) + goto fail; + for (i = 0 ; i < max_devsw_convs ; i++) { if (devsw_conv[i].d_name == NULL) break; @@ -224,7 +245,9 @@ devsw_attach(const char *devname, static int bdevsw_attach(const struct bdevsw *devsw, devmajor_t *devmajor) { - const struct bdevsw **newptr; + const struct bdevsw **newbdevsw = NULL; + struct devswref *newbdevswref = NULL; + struct localcount *lc; devmajor_t bmajor; int i; @@ -253,20 +276,35 @@ bdevsw_attach(const struct bdevsw *devsw, devmajor_t *devmajor) return (ENOMEM); } + if (bdevswref == NULL) { + newbdevswref = kmem_zalloc(MAXDEVSW * sizeof(newbdevswref[0]), + KM_NOSLEEP); + if (newbdevswref == NULL) + return ENOMEM; + atomic_store_release(&bdevswref, newbdevswref); + } + if (*devmajor >= max_bdevsws) { KASSERT(bdevsw == bdevsw0); - newptr = kmem_zalloc(MAXDEVSW * BDEVSW_SIZE, KM_NOSLEEP); - if (newptr == NULL) - return (ENOMEM); - memcpy(newptr, bdevsw, max_bdevsws * BDEVSW_SIZE); - bdevsw = newptr; - max_bdevsws = MAXDEVSW; + newbdevsw = kmem_zalloc(MAXDEVSW * sizeof(newbdevsw[0]), + KM_NOSLEEP); + if (newbdevsw == NULL) + return ENOMEM; + memcpy(newbdevsw, bdevsw, max_bdevsws * sizeof(bdevsw[0])); + atomic_store_release(&bdevsw, newbdevsw); + 
atomic_store_release(&max_bdevsws, MAXDEVSW); } if (bdevsw[*devmajor] != NULL) return (EEXIST); - bdevsw[*devmajor] = devsw; + KASSERT(bdevswref[*devmajor].dr_lc == NULL); + lc = kmem_zalloc(sizeof(*lc), KM_SLEEP); + localcount_init(lc); + bdevswref[*devmajor].dr_lc = lc; + bdevswref[*devmajor].dr_dynamic = true; + + atomic_store_release(&bdevsw[*devmajor], devsw); return (0); } @@ -274,7 +312,9 @@ bdevsw_attach(const struct bdevsw *devsw, devmajor_t *devmajor) static int cdevsw_attach(const struct cdevsw *devsw, devmajor_t *devmajor) { - const struct cdevsw **newptr; + const struct cdevsw **newcdevsw = NULL; + struct devswref *newcdevswref = NULL; + struct localcount *lc; devmajor_t cmajor; int i; @@ -300,20 +340,35 @@ cdevsw_attach(const struct cdevsw *devsw, devmajor_t *devmajor) return (ENOMEM); } + if (cdevswref == NULL) { + newcdevswref = kmem_zalloc(MAXDEVSW * sizeof(newcdevswref[0]), + KM_NOSLEEP); + if (newcdevswref == NULL) + return ENOMEM; + atomic_store_release(&cdevswref, newcdevswref); + } + if (*devmajor >= max_cdevsws) { KASSERT(cdevsw == cdevsw0); - newptr = kmem_zalloc(MAXDEVSW * CDEVSW_SIZE, KM_NOSLEEP); - if (newptr == NULL) - return (ENOMEM); - memcpy(newptr, cdevsw, max_cdevsws * CDEVSW_SIZE); - cdevsw = newptr; - max_cdevsws = MAXDEVSW; + newcdevsw = kmem_zalloc(MAXDEVSW * sizeof(newcdevsw[0]), + KM_NOSLEEP); + if (newcdevsw == NULL) + return ENOMEM; + memcpy(newcdevsw, cdevsw, max_cdevsws * sizeof(cdevsw[0])); + atomic_store_release(&cdevsw, newcdevsw); + atomic_store_release(&max_cdevsws, MAXDEVSW); } if (cdevsw[*devmajor] != NULL) return (EEXIST); - cdevsw[*devmajor] = devsw; + KASSERT(cdevswref[*devmajor].dr_lc == NULL); + lc = kmem_zalloc(sizeof(*lc), KM_SLEEP); + localcount_init(lc); + cdevswref[*devmajor].dr_lc = lc; + cdevswref[*devmajor].dr_dynamic = true; + + atomic_store_release(&cdevsw[*devmajor], devsw); return (0); } @@ -321,25 +376,62 @@ cdevsw_attach(const struct cdevsw *devsw, devmajor_t *devmajor) static void 
devsw_detach_locked(const struct bdevsw *bdev, const struct cdevsw *cdev) { - int i; + int bi, ci; KASSERT(mutex_owned(&device_lock)); + /* Prevent new references. */ if (bdev != NULL) { - for (i = 0 ; i < max_bdevsws ; i++) { - if (bdevsw[i] != bdev) + for (bi = 0; bi < max_bdevsws; bi++) { + if (bdevsw[bi] != bdev) continue; - bdevsw[i] = NULL; + atomic_store_relaxed(&bdevsw[bi], NULL); break; } + KASSERT(bi < max_bdevsws); } if (cdev != NULL) { - for (i = 0 ; i < max_cdevsws ; i++) { - if (cdevsw[i] != cdev) + for (ci = 0; ci < max_cdevsws; ci++) { + if (cdevsw[ci] != cdev) continue; - cdevsw[i] = NULL; + atomic_store_relaxed(&cdevsw[ci], NULL); break; } + KASSERT(ci < max_cdevsws); + } + + if (bdev == NULL && cdev == NULL) /* XXX possible? */ + return; + + /* + * Wait for all bdevsw_lookup_acquire, cdevsw_lookup_acquire + * calls to notice that the devsw is gone. + */ + pserialize_perform(devsw_psz); + + /* + * Wait for all references to drain. It is the caller's + * responsibility to ensure that at this point, there are no + * extant open instances and all new d_open calls will fail. + * + * Note that localcount_drain may release and reacquire + * device_lock. 
+ */ + if (bdev != NULL) { + localcount_drain(bdevswref[bi].dr_lc, + &devsw_cv, &device_lock); + localcount_fini(bdevswref[bi].dr_lc); + kmem_free(bdevswref[bi].dr_lc, sizeof(*bdevswref[bi].dr_lc)); + bdevswref[bi].dr_lc = NULL; + bdevswref[bi].dr_dynamic = false; + } + if (cdev != NULL) { + localcount_drain(cdevswref[ci].dr_lc, + &devsw_cv, &device_lock); + localcount_fini(cdevswref[ci].dr_lc); + kmem_free(cdevswref[ci].dr_lc, sizeof(*cdevswref[ci].dr_lc)); + cdevswref[ci].dr_lc = NULL; + cdevswref[ci].dr_dynamic = false; } } @@ -365,10 +457,60 @@ bdevsw_lookup(dev_t dev) if (dev == NODEV) return (NULL); bmajor = major(dev); - if (bmajor < 0 || bmajor >= max_bdevsws) + if (bmajor < 0 || bmajor >= atomic_load_relaxed(&max_bdevsws)) return (NULL); - return (bdevsw[bmajor]); + return atomic_load_consume(&bdevsw)[bmajor]; +} + +static const struct bdevsw * +bdevsw_lookup_acquire(dev_t dev, struct localcount **lcp) +{ + devmajor_t bmajor; + const struct bdevsw *bdev = NULL, *const *curbdevsw; + struct devswref *curbdevswref; + int s; + + if (dev == NODEV) + return NULL; + bmajor = major(dev); + if (bmajor < 0) + return NULL; + + s = pserialize_read_enter(); + + /* + * max_bdevsws never goes down, so it is safe to rely on this + * condition without any locking for the array access below. + * Test sys_bdevsws first so we can avoid the memory barrier in + * that case. 
+ */ + if (bmajor >= sys_bdevsws && + bmajor >= atomic_load_acquire(&max_bdevsws)) + goto out; + curbdevsw = atomic_load_consume(&bdevsw); + if ((bdev = atomic_load_consume(&curbdevsw[bmajor])) == NULL) + goto out; + + curbdevswref = atomic_load_consume(&bdevswref); + if (curbdevswref == NULL || !curbdevswref[bmajor].dr_dynamic) { + *lcp = NULL; + } else { + *lcp = curbdevswref[bmajor].dr_lc; + localcount_acquire(*lcp); + } + +out: pserialize_read_exit(s); + return bdev; +} + +static void +bdevsw_release(const struct bdevsw *bdev, struct localcount *lc) +{ + + if (lc == NULL) + return; + localcount_release(lc, &devsw_cv, &device_lock); } /* @@ -384,10 +526,60 @@ cdevsw_lookup(dev_t dev) if (dev == NODEV) return (NULL); cmajor = major(dev); - if (cmajor < 0 || cmajor >= max_cdevsws) + if (cmajor < 0 || cmajor >= atomic_load_relaxed(&max_cdevsws)) return (NULL); - return (cdevsw[cmajor]); + return atomic_load_consume(&cdevsw)[cmajor]; +} + +static const struct cdevsw * +cdevsw_lookup_acquire(dev_t dev, struct localcount **lcp) +{ + devmajor_t cmajor; + const struct cdevsw *cdev = NULL, *const *curcdevsw; + struct devswref *curcdevswref; + int s; + + if (dev == NODEV) + return NULL; + cmajor = major(dev); + if (cmajor < 0) + return NULL; + + s = pserialize_read_enter(); + + /* + * max_cdevsws never goes down, so it is safe to rely on this + * condition without any locking for the array access below. + * Test sys_cdevsws first so we can avoid the memory barrier in + * that case. 
+ */ + if (cmajor >= sys_cdevsws && + cmajor >= atomic_load_acquire(&max_cdevsws)) + goto out; + curcdevsw = atomic_load_consume(&cdevsw); + if ((cdev = atomic_load_consume(&curcdevsw[cmajor])) == NULL) + goto out; + + curcdevswref = atomic_load_consume(&cdevswref); + if (curcdevswref == NULL || !curcdevswref[cmajor].dr_dynamic) { + *lcp = NULL; + } else { + *lcp = curcdevswref[cmajor].dr_lc; + localcount_acquire(*lcp); + } + +out: pserialize_read_exit(s); + return cdev; +} + +static void +cdevsw_release(const struct cdevsw *cdev, struct localcount *lc) +{ + + if (lc == NULL) + return; + localcount_release(lc, &devsw_cv, &device_lock); } /* @@ -399,10 +591,13 @@ cdevsw_lookup(dev_t dev) devmajor_t bdevsw_lookup_major(const struct bdevsw *bdev) { - devmajor_t bmajor; + const struct bdevsw *const *curbdevsw; + devmajor_t bmajor, bmax; - for (bmajor = 0 ; bmajor < max_bdevsws ; bmajor++) { - if (bdevsw[bmajor] == bdev) + bmax = atomic_load_acquire(&max_bdevsws); + curbdevsw = atomic_load_consume(&bdevsw); + for (bmajor = 0; bmajor < bmax; bmajor++) { + if (atomic_load_relaxed(&curbdevsw[bmajor]) == bdev) return (bmajor); } @@ -418,10 +613,13 @@ bdevsw_lookup_major(const struct bdevsw *bdev) devmajor_t cdevsw_lookup_major(const struct cdevsw *cdev) { - devmajor_t cmajor; + const struct cdevsw *const *curcdevsw; + devmajor_t cmajor, cmax; - for (cmajor = 0 ; cmajor < max_cdevsws ; cmajor++) { - if (cdevsw[cmajor] == cdev) + cmax = atomic_load_acquire(&max_cdevsws); + curcdevsw = atomic_load_consume(&cdevsw); + for (cmajor = 0; cmajor < cmax; cmajor++) { + if (atomic_load_relaxed(&curcdevsw[cmajor]) == cdev) return (cmajor); } @@ -696,15 +894,10 @@ int bdev_open(dev_t dev, int flag, int devtype, lwp_t *l) { const struct bdevsw *d; + struct localcount *lc; int rv, mpflag; - /* - * For open we need to lock, in order to synchronize - * with attach/detach. 
- */ - mutex_enter(&device_lock); - d = bdevsw_lookup(dev); - mutex_exit(&device_lock); + d = bdevsw_lookup_acquire(dev, &lc); if (d == NULL) return ENXIO; @@ -712,6 +905,8 @@ bdev_open(dev_t dev, int flag, int devtype, lwp_t *l) rv = (*d->d_open)(dev, flag, devtype, l); DEV_UNLOCK(d); + bdevsw_release(d, lc); + return rv; } @@ -854,15 +1049,10 @@ int cdev_open(dev_t dev, int flag, int devtype, lwp_t *l) { const struct cdevsw *d; + struct localcount *lc; int rv, mpflag; - /* - * For open we need to lock, in order to synchronize - * with attach/detach. - */ - mutex_enter(&device_lock); - d = cdevsw_lookup(dev); - mutex_exit(&device_lock); + d = cdevsw_lookup_acquire(dev, &lc); if (d == NULL) return ENXIO; @@ -870,6 +1060,8 @@ cdev_open(dev_t dev, int flag, int devtype, lwp_t *l) rv = (*d->d_open)(dev, flag, devtype, l); DEV_UNLOCK(d); + cdevsw_release(d, lc); + return rv; } From cf7834ba86cc792ac7c0bd4985fdee4b53b4543e Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Sat, 15 Jan 2022 20:01:53 +0000 Subject: [PATCH 03/30] driver(9): Use xcall(9) rather than pserialize(9) for devsw detach. devsw_init is called too early for the kmem(9) in pserialize_create, but pserialize_create doesn't do anything useful anyway. 
--- sys/kern/subr_devsw.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sys/kern/subr_devsw.c b/sys/kern/subr_devsw.c index d22b088826dd..776fed5ce706 100644 --- a/sys/kern/subr_devsw.c +++ b/sys/kern/subr_devsw.c @@ -88,6 +88,7 @@ __KERNEL_RCSID(0, "$NetBSD: subr_devsw.c,v 1.38 2017/11/07 18:35:57 christos Exp #include #include #include +#include #ifdef DEVSW_DEBUG #define DPRINTF(x) printf x @@ -114,7 +115,6 @@ extern int max_bdevsws, max_cdevsws, max_devsw_convs; static struct devswref *cdevswref; static struct devswref *bdevswref; -static struct pserialize *devsw_psz; static kcondvar_t devsw_cv; static int bdevsw_attach(const struct bdevsw *, devmajor_t *); @@ -133,7 +133,6 @@ devsw_init(void) KASSERT(sys_cdevsws < MAXDEVSW - 1); mutex_init(&device_lock, MUTEX_DEFAULT, IPL_NONE); - devsw_psz = pserialize_create(); cv_init(&devsw_cv, "devsw"); } @@ -406,8 +405,11 @@ devsw_detach_locked(const struct bdevsw *bdev, const struct cdevsw *cdev) /* * Wait for all bdevsw_lookup_acquire, cdevsw_lookup_acquire * calls to notice that the devsw is gone. + * + * XXX Can't use pserialize_perform here because devsw_init is + * too early for pserialize_create(). */ - pserialize_perform(devsw_psz); + xc_barrier(0); /* * Wait for all references to drain. It is the caller's From cb51d9a3df20535a675940049acf53450a247390 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Wed, 12 Jan 2022 14:07:35 +0000 Subject: [PATCH 04/30] autoconf(9): New localcount-based device instance references. device_lookup_acquire looks up an autoconf device instance, if found, and acquires a reference the caller must release with device_release. If attach or detach is in progress, device_lookup_acquire waits until it completes; while references are held, detach is blocked. 
The reference is meant to be held across opening a device in the short term, and then to be passed off to a longer-term reference that can be broken explicitly by detach -- usually a device special vnode, which is broken by vdevgone in the driver's detach function *_detach. Sleeping while holding a reference is allowed, but the caller shouldn't sleep on anything that is only cancelled by *_detach, because *_detach doesn't run until the reference is released. The purpose of device_lookup_acquire is to let *_detach reliably determine whether there are any open instances of the device before it commits to detaching, without needing the kernel lock. Subsequent changes to subr_devsw.c will make bdev_open and cdev_open take a reference to an autoconf instance automatically so there are no logic changes needed in the *_open devsw functions. --- sys/kern/subr_autoconf.c | 180 ++++++++++++++++++++++++++++++++++++--- sys/sys/device.h | 6 ++ 2 files changed, 174 insertions(+), 12 deletions(-) diff --git a/sys/kern/subr_autoconf.c b/sys/kern/subr_autoconf.c index 8439ae8c6a00..073ce6f52059 100644 --- a/sys/kern/subr_autoconf.c +++ b/sys/kern/subr_autoconf.c @@ -108,6 +108,7 @@ __KERNEL_RCSID(0, "$NetBSD: subr_autoconf.c,v 1.291 2021/12/31 14:19:57 riastrad #include #include #include +#include #include @@ -1453,6 +1454,9 @@ config_devdelete(device_t dev) if (dg->dg_devs != NULL) kmem_free(dg->dg_devs, sizeof(device_t) * dg->dg_ndevs); + localcount_fini(dev->dv_localcount); + kmem_free(dev->dv_localcount, sizeof(*dev->dv_localcount)); + cv_destroy(&dvl->dvl_cv); mutex_destroy(&dvl->dvl_mtx); @@ -1556,6 +1560,7 @@ config_devalloc(const device_t parent, const cfdata_t cf, dev->dv_activity_handlers = NULL; dev->dv_private = dev_private; dev->dv_flags = ca->ca_flags; /* inherit flags from class */ + dev->dv_attaching = curlwp; myunit = config_unit_alloc(dev, cd, cf); if (myunit == -1) { @@ -1604,6 +1609,10 @@ config_devalloc(const device_t parent, const cfdata_t cf, "device-parent", 
device_xname(parent)); } + dev->dv_localcount = kmem_zalloc(sizeof(*dev->dv_localcount), + KM_SLEEP); + localcount_init(dev->dv_localcount); + if (dev->dv_cfdriver->cd_attrs != NULL) config_add_attrib_dict(dev); @@ -1755,8 +1764,29 @@ config_attach_internal(device_t parent, cfdata_t cf, void *aux, cfprint_t print, /* Let userland know */ devmon_report_device(dev, true); + /* + * Prevent detach until the driver's attach function, and all + * deferred actions, have finished. + */ config_pending_incr(dev); + + /* Call the driver's attach function. */ (*dev->dv_cfattach->ca_attach)(parent, dev, aux); + + /* + * Allow other threads to acquire references to the device now + * that the driver's attach function is done. + */ + mutex_enter(&config_misc_lock); + KASSERT(dev->dv_attaching == curlwp); + dev->dv_attaching = NULL; + cv_broadcast(&config_misc_cv); + mutex_exit(&config_misc_lock); + + /* + * Synchronous parts of attach are done. Allow detach, unless + * the driver's attach function scheduled deferred actions. + */ config_pending_decr(dev); mutex_enter(&config_misc_lock); @@ -1822,8 +1852,29 @@ config_attach_pseudo(cfdata_t cf) /* Let userland know */ devmon_report_device(dev, true); + /* + * Prevent detach until the driver's attach function, and all + * deferred actions, have finished. + */ config_pending_incr(dev); + + /* Call the driver's attach function. */ (*dev->dv_cfattach->ca_attach)(ROOT, dev, NULL); + + /* + * Allow other threads to acquire references to the device now + * that the driver's attach function is done. + */ + mutex_enter(&config_misc_lock); + KASSERT(dev->dv_attaching == curlwp); + dev->dv_attaching = NULL; + cv_broadcast(&config_misc_cv); + mutex_exit(&config_misc_lock); + + /* + * Synchronous parts of attach are done. Allow detach, unless + * the driver's attach function scheduled deferred actions. 
+ */ config_pending_decr(dev); config_process_deferred(&deferred_config_queue, dev); @@ -1872,24 +1923,49 @@ config_dump_garbage(struct devicelist *garbage) static int config_detach_enter(device_t dev) { - int error; + int error = 0; mutex_enter(&config_misc_lock); - for (;;) { - if (dev->dv_pending == 0 && dev->dv_detaching == NULL) { - dev->dv_detaching = curlwp; - error = 0; - break; - } + + /* + * Wait until attach has fully completed, and until any + * concurrent detach (e.g., drvctl racing with USB event + * thread) has completed. + * + * Caller must hold alldevs_nread or alldevs_nwrite (e.g., via + * deviter) to ensure the winner of the race doesn't free the + * device leading the loser of the race into use-after-free. + * + * XXX Not all callers do this! + */ + while (dev->dv_pending || dev->dv_detaching) { KASSERTMSG(dev->dv_detaching != curlwp, "recursively detaching %s", device_xname(dev)); error = cv_wait_sig(&config_misc_cv, &config_misc_lock); if (error) - break; + goto out; } - KASSERT(error || dev->dv_detaching == curlwp); - mutex_exit(&config_misc_lock); + /* + * Attach has completed, and no other concurrent detach is + * running. Claim the device for detaching. This will cause + * all new attempts to acquire references to block. + */ + KASSERT(dev->dv_attaching == NULL); + KASSERT(dev->dv_detaching == NULL); + dev->dv_detaching = curlwp; + + /* + * Wait for all device_lookup_acquire references -- mostly, for + * all attempts to open the device -- to drain. There may + * still be open instances of the device after this point, but + * all new attempts to acquire references will block until + * dv_detaching clears. 
+ */ + localcount_drain(dev->dv_localcount, + &config_misc_cv, &config_misc_lock); + +out: mutex_exit(&config_misc_lock); return error; } @@ -1980,9 +2056,18 @@ config_detach(device_t dev, int flags) */ if (rv == 0) dev->dv_flags &= ~DVF_ACTIVE; - else if ((flags & DETACH_FORCE) == 0) + else if ((flags & DETACH_FORCE) == 0) { + /* + * Detach failed -- likely EBUSY. Reset the + * localcount. At this point there can be no + * references held, and no new references acquired -- + * calls to device_lookup_acquire are held up on + * dv_detaching until config_detach_exit. + */ + localcount_fini(dev->dv_localcount); + localcount_init(dev->dv_localcount); goto out; - else { + } else { panic("config_detach: forced detach of %s failed (%d)", device_xname(dev), rv); } @@ -2498,6 +2583,14 @@ config_alldevs_exit(struct alldevs_foray *af) * device_lookup: * * Look up a device instance for a given driver. + * + * Caller is responsible for ensuring the device's state is + * stable, either by holding a reference already obtained with + * device_lookup_acquire or by otherwise ensuring the device is + * attached and can't be detached (e.g., holding an open device + * node and ensuring *_detach calls vdevgone). + * + * XXX Find a way to assert this. */ device_t device_lookup(cfdriver_t cd, int unit) @@ -2526,6 +2619,69 @@ device_lookup_private(cfdriver_t cd, int unit) return device_private(device_lookup(cd, unit)); } +/* + * device_lookup_acquire: + * + * Look up a device instance for a given driver, and return a + * reference to it that must be released by device_release. + * + * => If the device is still attaching, blocks until *_attach has + * returned. + * + * => If the device is detaching, blocks until *_detach has + * returned. May succeed or fail in that case, depending on + * whether *_detach has backed out (EBUSY) or committed to + * detaching. 
+ */ +device_t +device_lookup_acquire(cfdriver_t cd, int unit) +{ + device_t dv; + + /* XXX This should have a pserialized fast path -- TBD. */ + mutex_enter(&config_misc_lock); + mutex_enter(&alldevs_lock); +retry: if (unit < 0 || unit >= cd->cd_ndevs || + (dv = cd->cd_devs[unit]) == NULL || + dv->dv_del_gen != 0) { + dv = NULL; + } else { + /* + * Wait for the device to stabilize, if attaching or + * detaching. Either way we must wait for *_attach or + * *_detach to complete, and either way we must retry: + * even if detaching, *_detach might fail (EBUSY) so + * the device may still be there. + */ + if ((dv->dv_attaching != NULL && dv->dv_attaching != curlwp) || + dv->dv_detaching != NULL) { + mutex_exit(&alldevs_lock); + cv_wait(&config_misc_cv, &config_misc_lock); + mutex_enter(&alldevs_lock); + goto retry; + } + localcount_acquire(dv->dv_localcount); + } + mutex_exit(&alldevs_lock); + mutex_exit(&config_misc_lock); + + return dv; +} + +/* + * device_release: + * + * Release a reference to a device acquired with + * device_lookup_acquire. 
+ */ +void +device_release(device_t dv) +{ + + localcount_release(dv->dv_localcount, + &config_misc_cv, &config_misc_lock); +} + /* * device_find_by_xname: * diff --git a/sys/sys/device.h b/sys/sys/device.h index 99940ab1563e..76d9f8be5ad6 100644 --- a/sys/sys/device.h +++ b/sys/sys/device.h @@ -274,10 +274,12 @@ struct device { void *dv_private; /* this device's private storage */ int *dv_locators; /* our actual locators (optional) */ prop_dictionary_t dv_properties;/* properties dictionary */ + struct localcount *dv_localcount;/* reference count */ int dv_pending; /* config_pending count */ TAILQ_ENTRY(device) dv_pending_list; + struct lwp *dv_attaching; /* thread not yet finished in attach */ struct lwp *dv_detaching; /* detach lock (config_misc_lock/cv) */ size_t dv_activity_count; @@ -651,6 +653,10 @@ void null_childdetached(device_t, device_t); device_t device_lookup(cfdriver_t, int); void *device_lookup_private(cfdriver_t, int); + +device_t device_lookup_acquire(cfdriver_t, int); +void device_release(device_t); + void device_register(device_t, void *); void device_register_post_config(device_t, void *); From f3efbbc29f17103e573522cfd826daf5ac73d9e1 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Wed, 12 Jan 2022 14:18:57 +0000 Subject: [PATCH 05/30] driver(9): New devsw members d_cfdriver, d_devtounit. If set, then bdev_open/cdev_open will use d_devtounit to map the dev_t to an autoconf instance (e.g., /dev/wd0a -> wd0) and hold a reference with device_lookup_acquire across the call to d_open. This guarantees that the autoconf instance cannot be detached while the devsw's d_open function is trying to open it (and also that the autoconf instance has finished *_attach before anyone can open it). Of course, if the underlying hardware has gone away, there will be I/O errors, but this avoids software synchronization bugs between open and detach for drivers that opt into it. 
It's up to the driver and bus to figure out how to deal with I/O errors from operations on hardware that has gone away while the software hasn't finished notifying everything that it's gone yet. --- sys/kern/subr_devsw.c | 49 +++++++++++++++++++++++++++++++++++++++++-- sys/sys/conf.h | 4 ++++ 2 files changed, 51 insertions(+), 2 deletions(-) diff --git a/sys/kern/subr_devsw.c b/sys/kern/subr_devsw.c index 776fed5ce706..83aae0af3488 100644 --- a/sys/kern/subr_devsw.c +++ b/sys/kern/subr_devsw.c @@ -89,6 +89,7 @@ __KERNEL_RCSID(0, "$NetBSD: subr_devsw.c,v 1.38 2017/11/07 18:35:57 christos Exp #include #include #include +#include #ifdef DEVSW_DEBUG #define DPRINTF(x) printf x @@ -897,16 +898,38 @@ bdev_open(dev_t dev, int flag, int devtype, lwp_t *l) { const struct bdevsw *d; struct localcount *lc; - int rv, mpflag; + device_t dv = NULL/*XXXGCC*/; + int unit, rv, mpflag; d = bdevsw_lookup_acquire(dev, &lc); if (d == NULL) return ENXIO; + if (d->d_devtounit) { + /* + * If the device node corresponds to an autoconf device + * instance, acquire a reference to it so that during + * d_open, device_lookup is stable. + * + * XXX This should also arrange to instantiate cloning + * pseudo-devices if appropriate, but that requires + * reviewing them all to find and verify a common + * pattern. 
+ */ + if ((unit = (*d->d_devtounit)(dev)) == -1) + return ENXIO; + if ((dv = device_lookup_acquire(d->d_cfdriver, unit)) == NULL) + return ENXIO; + } + DEV_LOCK(d); rv = (*d->d_open)(dev, flag, devtype, l); DEV_UNLOCK(d); + if (d->d_devtounit) { + device_release(dv); + } + bdevsw_release(d, lc); return rv; @@ -1052,16 +1075,38 @@ cdev_open(dev_t dev, int flag, int devtype, lwp_t *l) { const struct cdevsw *d; struct localcount *lc; - int rv, mpflag; + device_t dv = NULL/*XXXGCC*/; + int unit, rv, mpflag; d = cdevsw_lookup_acquire(dev, &lc); if (d == NULL) return ENXIO; + if (d->d_devtounit) { + /* + * If the device node corresponds to an autoconf device + * instance, acquire a reference to it so that during + * d_open, device_lookup is stable. + * + * XXX This should also arrange to instantiate cloning + * pseudo-devices if appropriate, but that requires + * reviewing them all to find and verify a common + * pattern. + */ + if ((unit = (*d->d_devtounit)(dev)) == -1) + return ENXIO; + if ((dv = device_lookup_acquire(d->d_cfdriver, unit)) == NULL) + return ENXIO; + } + DEV_LOCK(d); rv = (*d->d_open)(dev, flag, devtype, l); DEV_UNLOCK(d); + if (d->d_devtounit) { + device_release(dv); + } + cdevsw_release(d, lc); return rv; diff --git a/sys/sys/conf.h b/sys/sys/conf.h index ba46976a924b..12cfd3bf89a6 100644 --- a/sys/sys/conf.h +++ b/sys/sys/conf.h @@ -76,6 +76,8 @@ struct bdevsw { int (*d_dump)(dev_t, daddr_t, void *, size_t); int (*d_psize)(dev_t); int (*d_discard)(dev_t, off_t, off_t); + int (*d_devtounit)(dev_t); + struct cfdriver *d_cfdriver; int d_flag; }; @@ -94,6 +96,8 @@ struct cdevsw { paddr_t (*d_mmap)(dev_t, off_t, int); int (*d_kqfilter)(dev_t, struct knote *); int (*d_discard)(dev_t, off_t, off_t); + int (*d_devtounit)(dev_t); + struct cfdriver *d_cfdriver; int d_flag; }; From 03735bc6b6d3e512045a2431a9be33567e3489a8 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Sat, 15 Jan 2022 19:11:10 +0000 Subject: [PATCH 06/30] disk(9): New function 
disklabel_dev_unit. Maps a dev_t like wd3e to an autoconf instance number like 3, with no partition. Same as DISKUNIT macro, but is a symbol whose pointer can be taken. Meant for use with struct bdevsw, cdevsw::d_devtounit. --- sys/kern/subr_disk.c | 7 +++++++ sys/sys/disklabel.h | 1 + 2 files changed, 8 insertions(+) diff --git a/sys/kern/subr_disk.c b/sys/kern/subr_disk.c index da664f920382..41218421db57 100644 --- a/sys/kern/subr_disk.c +++ b/sys/kern/subr_disk.c @@ -728,3 +728,10 @@ disk_set_info(device_t dev, struct disk *dk, const char *type) if (odisk_info) prop_object_release(odisk_info); } + +int +disklabel_dev_unit(dev_t dev) +{ + + return DISKUNIT(dev); +} diff --git a/sys/sys/disklabel.h b/sys/sys/disklabel.h index 4e94b8671332..853cdbe668a3 100644 --- a/sys/sys/disklabel.h +++ b/sys/sys/disklabel.h @@ -509,6 +509,7 @@ const char *convertdisklabel(struct disklabel *, void (*)(struct buf *), int bounds_check_with_label(struct disk *, struct buf *, int); int bounds_check_with_mediasize(struct buf *, int, uint64_t); const char *getfstypename(int); +int disklabel_dev_unit(dev_t); #endif #endif /* _LOCORE */ From f331194bf011699d975e5b9d9d995b01f36e2948 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Sat, 15 Jan 2022 19:18:09 +0000 Subject: [PATCH 07/30] driver(9): New function dev_minor_unit. --- sys/kern/subr_devsw.c | 15 +++++++++++++++ sys/sys/conf.h | 1 + 2 files changed, 16 insertions(+) diff --git a/sys/kern/subr_devsw.c b/sys/kern/subr_devsw.c index 83aae0af3488..42bf25c4e354 100644 --- a/sys/kern/subr_devsw.c +++ b/sys/kern/subr_devsw.c @@ -1301,3 +1301,18 @@ nommap(dev_t dev, off_t off, int prot) return (paddr_t)-1; } + +/* + * dev_minor_unit(dev) + * + * Returns minor(dev) as an int. Intended for use with struct + * bdevsw, cdevsw::d_devtounit for drivers whose /dev nodes are + * implemented by reference to an autoconf instance with the minor + * number. 
+ */ +int +dev_minor_unit(dev_t dev) +{ + + return minor(dev); +} diff --git a/sys/sys/conf.h b/sys/sys/conf.h index 12cfd3bf89a6..cb4c287d1982 100644 --- a/sys/sys/conf.h +++ b/sys/sys/conf.h @@ -280,6 +280,7 @@ devmajor_t devsw_name2blk(const char *, char *, size_t); devmajor_t devsw_name2chr(const char *, char *, size_t); dev_t devsw_chr2blk(dev_t); dev_t devsw_blk2chr(dev_t); +int dev_minor_unit(dev_t); void mm_init(void); #endif /* _KERNEL */ From 10c376008cf37301f119f65c98f5a9f2f9b8824e Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Wed, 19 Jan 2022 00:03:49 +0000 Subject: [PATCH 08/30] specfs: Call bdev_open without the vnode lock. There is no need for it to serialize opens, because they are already serialized by sd_opencnt which for block devices is always either 0 or 1. There's not obviously any other reason why the vnode lock should be held across bdev_open, other than that it might be nice to avoid dropping it if not necessary. For character devices we always have to drop the vnode lock because open might hang indefinitely, when opening a tty, which is not allowed while holding the vnode lock. 
--- sys/miscfs/specfs/spec_vnops.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c index b4bc4c34ab03..67354edf89d3 100644 --- a/sys/miscfs/specfs/spec_vnops.c +++ b/sys/miscfs/specfs/spec_vnops.c @@ -609,6 +609,7 @@ spec_open(void *v) sd->sd_opencnt = 1; sd->sd_bdevvp = vp; mutex_exit(&device_lock); + VOP_UNLOCK(vp); do { const struct bdevsw *bdev; @@ -628,13 +629,10 @@ spec_open(void *v) if ((name = bdevsw_getname(major(dev))) == NULL) break; - VOP_UNLOCK(vp); - /* Try to autoload device module */ (void) module_autoload(name, MODULE_CLASS_DRIVER); - - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } while (gen != module_gen); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); break; From e709e3b5cacfb2dd2c63d9f6912538bd059721d4 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Wed, 19 Jan 2022 00:10:19 +0000 Subject: [PATCH 09/30] specfs: Assert v_type is VBLK or VCHR in spec_open. Nothing else makes sense. Prune dead branches (and replace default case by panic). --- sys/miscfs/specfs/spec_vnops.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c index 67354edf89d3..b238aa30c067 100644 --- a/sys/miscfs/specfs/spec_vnops.c +++ b/sys/miscfs/specfs/spec_vnops.c @@ -517,7 +517,10 @@ spec_open(void *v) sd = sn->sn_dev; name = NULL; gen = 0; - + + KASSERTMSG(vp->v_type == VBLK || vp->v_type == VCHR, "type=%d", + vp->v_type); + /* * Don't allow open if fs is mounted -nodev. 
*/ @@ -636,15 +639,8 @@ spec_open(void *v) break; - case VNON: - case VLNK: - case VDIR: - case VREG: - case VBAD: - case VFIFO: - case VSOCK: default: - return 0; + panic("invalid specfs vnode type: %d", vp->v_type); } mutex_enter(&device_lock); From 68cd201df93bd26ec3e0a2479cf5df135033f4e4 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Wed, 19 Jan 2022 00:11:49 +0000 Subject: [PATCH 10/30] specfs: Factor common kauth check out of switch in spec_open. No functional change. --- sys/miscfs/specfs/spec_vnops.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c index b238aa30c067..3b2baf2a3388 100644 --- a/sys/miscfs/specfs/spec_vnops.c +++ b/sys/miscfs/specfs/spec_vnops.c @@ -538,13 +538,12 @@ spec_open(void *v) req = KAUTH_REQ_DEVICE_RAWIO_SPEC_READ; break; } + error = kauth_authorize_device_spec(ap->a_cred, req, vp); + if (error != 0) + return (error); switch (vp->v_type) { case VCHR: - error = kauth_authorize_device_spec(ap->a_cred, req, vp); - if (error != 0) - return (error); - /* * Character devices can accept opens from multiple * vnodes. @@ -587,10 +586,6 @@ spec_open(void *v) break; case VBLK: - error = kauth_authorize_device_spec(ap->a_cred, req, vp); - if (error != 0) - return (error); - /* * For block devices, permit only one open. The buffer * cache cannot remain self-consistent with multiple From 487c721970b7fc803e46cb8b0f9c853e5a286ac4 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Wed, 19 Jan 2022 00:34:59 +0000 Subject: [PATCH 11/30] specfs: Split spec_open switch into three sections. The sections are now: 1. Acquire open reference. 1a (intermezzo). Set VV_ISTTY. 2. Drop the vnode lock to call .d_open and autoload modules if necessary. 3. Handle concurrent revoke if it happened, or release open reference if .d_open failed. No functional change. Sprinkle comments about problems. 
--- sys/miscfs/specfs/spec_vnops.c | 133 +++++++++++++++++++++++++++------ 1 file changed, 111 insertions(+), 22 deletions(-) diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c index 3b2baf2a3388..9c613d7f5a22 100644 --- a/sys/miscfs/specfs/spec_vnops.c +++ b/sys/miscfs/specfs/spec_vnops.c @@ -542,6 +542,21 @@ spec_open(void *v) if (error != 0) return (error); + /* + * Acquire an open reference -- as long as we hold onto it, and + * the vnode isn't revoked, it can't be closed (XXX except for + * D_MCLOSE devices, which don't exist in-tree). + * + * But first check whether it has been revoked -- if so, we + * can't acquire more open references and we must fail + * immediately with EBADF. + * + * XXX This races with revoke: once we release the vnode lock, + * the vnode may be revoked, and the .d_close callback run, at + * the same time as we're calling .d_open here. Drivers + * shouldn't have to contemplate this scenario; .d_open and + * .d_close should be prevented from running concurrently. + */ switch (vp->v_type) { case VCHR: /* @@ -556,8 +571,73 @@ spec_open(void *v) sd->sd_opencnt++; sn->sn_opencnt++; mutex_exit(&device_lock); + break; + case VBLK: + /* + * For block devices, permit only one open. The buffer + * cache cannot remain self-consistent with multiple + * vnodes holding a block device open. + * + * Treat zero opencnt with non-NULL mountpoint as open. + * This may happen after forced detach of a mounted device. + */ + mutex_enter(&device_lock); + if (sn->sn_gone) { + mutex_exit(&device_lock); + return (EBADF); + } + if (sd->sd_opencnt != 0 || sd->sd_mountpoint != NULL) { + mutex_exit(&device_lock); + return EBUSY; + } + sn->sn_opencnt = 1; + sd->sd_opencnt = 1; + sd->sd_bdevvp = vp; + mutex_exit(&device_lock); + break; + default: + panic("invalid specfs vnode type: %d", vp->v_type); + } + + /* + * Set VV_ISTTY if this is a tty cdev. + * + * XXX This does the wrong thing if the module has to be + * autoloaded. 
We should maybe set this after autoloading + * modules and calling .d_open successfully, except (a) we need + * the vnode lock to touch it, and (b) once we acquire the + * vnode lock again, the vnode may have been revoked, and + * deadfs's dead_read needs VV_ISTTY to be already set in order + * to return the right answer. So this needs some additional + * synchronization to be made to work correctly with tty driver + * module autoload. For now, let's just hope it doesn't cause + * too much trouble for a tty from an autoloaded driver module + * to fail with EIO instead of returning EOF. + */ + if (vp->v_type == VCHR) { if (cdev_type(dev) == D_TTY) vp->v_vflag |= VV_ISTTY; + } + + /* + * Open the device. If .d_open returns ENXIO (device not + * configured), the driver may not be loaded, so try + * autoloading a module and then try .d_open again if anything + * got loaded. + * + * Because opening the device may block indefinitely, + * e.g. when opening a tty, and loading a module may cross into + * many other subsystems, we must not hold the vnode lock while + * calling .d_open, so release it now and reacquire it when + * done. + * + * XXX The vnode may be revoked concurrently while we have the + * vnode lock released. If this happens, the sn and sd + * pointers may be invalidated, but we don't do anything to + * avoid touching them after we're done here. + */ + switch (vp->v_type) { + case VCHR: VOP_UNLOCK(vp); do { const struct cdevsw *cdev; @@ -586,27 +666,6 @@ spec_open(void *v) break; case VBLK: - /* - * For block devices, permit only one open. The buffer - * cache cannot remain self-consistent with multiple - * vnodes holding a block device open. - * - * Treat zero opencnt with non-NULL mountpoint as open. - * This may happen after forced detach of a mounted device. 
- */ - mutex_enter(&device_lock); - if (sn->sn_gone) { - mutex_exit(&device_lock); - return (EBADF); - } - if (sd->sd_opencnt != 0 || sd->sd_mountpoint != NULL) { - mutex_exit(&device_lock); - return EBUSY; - } - sn->sn_opencnt = 1; - sd->sd_opencnt = 1; - sd->sd_bdevvp = vp; - mutex_exit(&device_lock); VOP_UNLOCK(vp); do { const struct bdevsw *bdev; @@ -635,9 +694,39 @@ spec_open(void *v) break; default: - panic("invalid specfs vnode type: %d", vp->v_type); + __unreachable(); } + /* + * If it has been revoked since we released the vnode lock and + * reacquired it, then spec_node_revoke has closed it, and we + * must fail with EBADF. + * + * Otherwise, if opening it failed, back out and release the + * open reference. + * + * XXX This is wrong -- we might release the last open + * reference here, but we don't close the device. If only this + * thread's call to open failed, that's fine, but we might + * have: + * + * Thread 1 Thread 2 + * VOP_OPEN + * ... + * .d_open -> 0 (success) + * acquire vnode lock + * do stuff VOP_OPEN + * release vnode lock ... + * .d_open -> EBUSY + * VOP_CLOSE + * acquire vnode lock + * --sd_opencnt != 0 + * => no .d_close + * release vnode lock + * acquire vnode lock + * --sd_opencnt == 0 + * but no .d_close (***) + */ mutex_enter(&device_lock); if (sn->sn_gone) { if (error == 0) From 033ad99dbb7c31e129fc3fab6d065baae206b3f7 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Wed, 19 Jan 2022 00:38:12 +0000 Subject: [PATCH 12/30] specfs: Factor common device_lock out of switch for clarity. No functional change. 
--- sys/miscfs/specfs/spec_vnops.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c index 9c613d7f5a22..2ba7f08c88be 100644 --- a/sys/miscfs/specfs/spec_vnops.c +++ b/sys/miscfs/specfs/spec_vnops.c @@ -557,20 +557,19 @@ spec_open(void *v) * shouldn't have to contemplate this scenario; .d_open and * .d_close should be prevented from running concurrently. */ + mutex_enter(&device_lock); switch (vp->v_type) { case VCHR: /* * Character devices can accept opens from multiple * vnodes. */ - mutex_enter(&device_lock); if (sn->sn_gone) { - mutex_exit(&device_lock); - return (EBADF); + error = EBADF; + break; } sd->sd_opencnt++; sn->sn_opencnt++; - mutex_exit(&device_lock); break; case VBLK: /* @@ -581,23 +580,24 @@ spec_open(void *v) * Treat zero opencnt with non-NULL mountpoint as open. * This may happen after forced detach of a mounted device. */ - mutex_enter(&device_lock); if (sn->sn_gone) { - mutex_exit(&device_lock); - return (EBADF); + error = EBADF; + break; } if (sd->sd_opencnt != 0 || sd->sd_mountpoint != NULL) { - mutex_exit(&device_lock); - return EBUSY; + error = EBUSY; + break; } sn->sn_opencnt = 1; sd->sd_opencnt = 1; sd->sd_bdevvp = vp; - mutex_exit(&device_lock); break; default: panic("invalid specfs vnode type: %d", vp->v_type); } + mutex_exit(&device_lock); + if (error) + return error; /* * Set VV_ISTTY if this is a tty cdev. From d7960996c7c84f825922fab519c46095b5ab3906 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Wed, 19 Jan 2022 00:42:24 +0000 Subject: [PATCH 13/30] specfs: Factor VOP_UNLOCK/vn_lock out of switch for clarity. No functional change. 
--- sys/miscfs/specfs/spec_vnops.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c index 2ba7f08c88be..97ae74e64660 100644 --- a/sys/miscfs/specfs/spec_vnops.c +++ b/sys/miscfs/specfs/spec_vnops.c @@ -636,9 +636,9 @@ spec_open(void *v) * pointers may be invalidated, but we don't do anything to * avoid touching them after we're done here. */ + VOP_UNLOCK(vp); switch (vp->v_type) { case VCHR: - VOP_UNLOCK(vp); do { const struct cdevsw *cdev; @@ -661,12 +661,9 @@ spec_open(void *v) /* Try to autoload device module */ (void) module_autoload(name, MODULE_CLASS_DRIVER); } while (gen != module_gen); - - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); break; case VBLK: - VOP_UNLOCK(vp); do { const struct bdevsw *bdev; @@ -689,13 +686,12 @@ spec_open(void *v) /* Try to autoload device module */ (void) module_autoload(name, MODULE_CLASS_DRIVER); } while (gen != module_gen); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); - break; default: __unreachable(); } + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); /* * If it has been revoked since we released the vnode lock and From ec95a800b6c4c6df83a4c63a65024a29f4ab47b1 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Wed, 19 Jan 2022 00:55:03 +0000 Subject: [PATCH 14/30] specfs: Reorganize D_DISK tail of spec_open and explain what's up. --- sys/miscfs/specfs/spec_vnops.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c index 97ae74e64660..4214aafef5ae 100644 --- a/sys/miscfs/specfs/spec_vnops.c +++ b/sys/miscfs/specfs/spec_vnops.c @@ -736,15 +736,27 @@ spec_open(void *v) } mutex_exit(&device_lock); - if (cdev_type(dev) != D_DISK || error != 0) + /* If anything went wrong, we're done. */ + if (error) return error; - - ioctl = vp->v_type == VCHR ? 
cdev_ioctl : bdev_ioctl; - error = (*ioctl)(vp->v_rdev, DIOCGPARTINFO, &pi, FREAD, curlwp); - if (error == 0) - uvm_vnp_setsize(vp, (voff_t)pi.pi_secsize * pi.pi_size); + /* + * For disk devices, automagically set the vnode size to the + * partition size, if we can. This applies to block devices + * and character devices alike -- every block device must have + * a corresponding character device. And if the module is + * loaded it will remain loaded until we're done here (it is + * forbidden to devsw_detach until closed). So it is safe to + * query cdev_type unconditionally here. + */ + if (cdev_type(dev) == D_DISK) { + ioctl = vp->v_type == VCHR ? cdev_ioctl : bdev_ioctl; + if ((*ioctl)(dev, DIOCGPARTINFO, &pi, FREAD, curlwp) == 0) + uvm_vnp_setsize(vp, + (voff_t)pi.pi_secsize * pi.pi_size); + } + /* Success! */ return 0; } From 26ca8cbc9d0ded4355505c1d07e1d1915ef8578e Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Wed, 19 Jan 2022 01:06:36 +0000 Subject: [PATCH 15/30] specfs: Exclude .d_open and .d_close, and serialize .d_close. See comments for details. 
--- sys/miscfs/specfs/spec_vnops.c | 111 +++++++++++++++++++++------------ sys/miscfs/specfs/specdev.h | 3 +- 2 files changed, 72 insertions(+), 42 deletions(-) diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c index 4214aafef5ae..59757cb7b8eb 100644 --- a/sys/miscfs/specfs/spec_vnops.c +++ b/sys/miscfs/specfs/spec_vnops.c @@ -81,10 +81,21 @@ __KERNEL_RCSID(0, "$NetBSD: spec_vnops.c,v 1.183 2021/07/18 23:57:14 dholland Ex #include #include #include +#include +#include #include #include +/* + * Lock order: + * + * vnode lock + * -> struct specdev::sd_opencloselock + * -> device_lock + * -> struct vnode::v_interlock + */ + /* symbolic sleep message strings for devices */ const char devopn[] = "devopn"; const char devio[] = "devio"; @@ -250,6 +261,7 @@ spec_node_init(vnode_t *vp, dev_t rdev) sd->sd_refcnt = 1; sd->sd_opencnt = 0; sd->sd_bdevvp = NULL; + rw_init(&sd->sd_opencloselock); sn->sn_dev = sd; sd = NULL; } else { @@ -463,6 +475,7 @@ spec_node_destroy(vnode_t *vp) if (refcnt == 1) { KASSERT(sd->sd_opencnt == 0); KASSERT(sd->sd_bdevvp == NULL); + rw_destroy(&sd->sd_opencloselock); kmem_free(sd, sizeof(*sd)); } kmem_free(sn, sizeof(*sn)); @@ -542,6 +555,17 @@ spec_open(void *v) if (error != 0) return (error); + /* + * Prevent concurrent .d_close while we are trying to open the + * device with the vnode unlocked later on. Concurrent opens + * by other threads are OK -- it's up to the device driver to + * reject those if need be. Because concurrent .d_close is + * blocked, once we acquire an open reference, the vnode cannot + * be revoked until we allow .d_close again -- spec_node_revoke 
+ */ + rw_enter(&sd->sd_opencloselock, RW_READER); + /* * Acquire an open reference -- as long as we hold onto it, and * the vnode isn't revoked, it can't be closed (XXX except for @@ -550,12 +574,6 @@ spec_open(void *v) * But first check whether it has been revoked -- if so, we * can't acquire more open references and we must fail * immediately with EBADF. - * - * XXX This races with revoke: once we release the vnode lock, - * the vnode may be revoked, and the .d_close callback run, at - * the same time as we're calling .d_open here. Drivers - * shouldn't have to contemplate this scenario; .d_open and - * .d_close should be prevented from running concurrently. */ mutex_enter(&device_lock); switch (vp->v_type) { @@ -596,8 +614,10 @@ spec_open(void *v) panic("invalid specfs vnode type: %d", vp->v_type); } mutex_exit(&device_lock); - if (error) + if (error) { + rw_exit(&sd->sd_opencloselock); return error; + } /* * Set VV_ISTTY if this is a tty cdev. @@ -630,11 +650,6 @@ spec_open(void *v) * many other subsystems, we must not hold the vnode lock while * calling .d_open, so release it now and reacquire it when * done. - * - * XXX The vnode may be revoked concurrently while we have the - * vnode lock released. If this happens, the sn and sd - * pointers may be invalidated, but we don't do anything to - * avoid touching them after we're done here. */ VOP_UNLOCK(vp); switch (vp->v_type) { @@ -691,39 +706,22 @@ spec_open(void *v) default: __unreachable(); } - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); /* * If it has been revoked since we released the vnode lock and - * reacquired it, then spec_node_revoke has closed it, and we - * must fail with EBADF. + * reacquired it, then spec_node_revoke will close it as soon + * as we release sd->sd_opencloselock, and we must fail with + * EBADF. * * Otherwise, if opening it failed, back out and release the - * open reference. - * - * XXX This is wrong -- we might release the last open - * reference here, but we don't close the device. 
If only this - * thread's call to open failed, that's fine, but we might - * have: - * - * Thread 1 Thread 2 - * VOP_OPEN - * ... - * .d_open -> 0 (success) - * acquire vnode lock - * do stuff VOP_OPEN - * release vnode lock ... - * .d_open -> EBUSY - * VOP_CLOSE - * acquire vnode lock - * --sd_opencnt != 0 - * => no .d_close - * release vnode lock - * acquire vnode lock - * --sd_opencnt == 0 - * but no .d_close (***) + * open reference. This can only release the last reference if + * it was not previously open at all, because during the whole + * time we held an open reference we blocked out .d_close with + * sd->sd_opencloselock, so we need not close it even if + * sd->sd_opencnt drops to zero. */ mutex_enter(&device_lock); + KASSERT(vp->v_specnode == sn); if (sn->sn_gone) { if (error == 0) error = EBADF; @@ -736,6 +734,19 @@ spec_open(void *v) } mutex_exit(&device_lock); + /* + * Allow .d_close again and reacquire the vnode lock before we + * return. At this point, the vnode may be revoked. Once we + * lock it again, it may be dead, so we must not touch sn or sd + * after this point. + * + * XXX This should do vdead_check before .d_ioctl -- otherwise + * the devsw might be unloaded altogether by the time we reach + * it. + */ + rw_exit(&sd->sd_opencloselock); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + /* If anything went wrong, we're done. */ if (error) return error; @@ -1355,6 +1366,20 @@ spec_close(void *v) panic("spec_close: not special"); } + /* + * Before we examine the open reference count and, if + * appropriate, call .d_close, wait for any concurrent .d_open + * calls to complete, and block new ones. That way, drivers + * need not worry about .d_open and .d_close running at the + * same time. + * + * This also avoids a race where .d_open fails in spec_open but + * spec_close concurrently thinks the device is still open + * before spec_open has had a chance to drop sd_opencnt back to + * 1, so nobody calls .d_close in the end. 
+ */ + rw_enter(&sd->sd_opencloselock, RW_WRITER); + mutex_enter(&device_lock); sn->sn_opencnt--; count = --sd->sd_opencnt; @@ -1362,8 +1387,11 @@ spec_close(void *v) sd->sd_bdevvp = NULL; mutex_exit(&device_lock); - if (count != 0 && (vp->v_type != VCHR || !(cdev_flags(dev) & D_MCLOSE))) - return 0; + if (count != 0 && + (vp->v_type != VCHR || !(cdev_flags(dev) & D_MCLOSE))) { + error = 0; + goto out; + } /* * If we're able to block, release the vnode lock & reacquire. We @@ -1381,7 +1409,8 @@ spec_close(void *v) if (!(flags & FNONBLOCK)) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); - return (error); +out: rw_exit(&sd->sd_opencloselock); + return error; } /* diff --git a/sys/miscfs/specfs/specdev.h b/sys/miscfs/specfs/specdev.h index 621c103e80b1..7917c3aea594 100644 --- a/sys/miscfs/specfs/specdev.h +++ b/sys/miscfs/specfs/specdev.h @@ -60,7 +60,7 @@ #ifndef _MISCFS_SPECFS_SPECDEV_H_ #define _MISCFS_SPECFS_SPECDEV_H_ -#include +#include #include typedef struct specnode { @@ -78,6 +78,7 @@ typedef struct specdev { u_int sd_opencnt; u_int sd_refcnt; dev_t sd_rdev; + krwlock_t sd_opencloselock; } specdev_t; /* From f52d08411664401b25770bff4fbe7aafda0c04d9 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Wed, 19 Jan 2022 01:11:45 +0000 Subject: [PATCH 16/30] specfs: sn_gone cannot be set while we hold the vnode lock. vrevoke suspends the file system, which waits for the vnode lock to be released, before it sets sn_gone and changes v_op so nothing can re-enter spec_open with this vnode. 
--- sys/miscfs/specfs/spec_vnops.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c index 59757cb7b8eb..e2e89bdabddf 100644 --- a/sys/miscfs/specfs/spec_vnops.c +++ b/sys/miscfs/specfs/spec_vnops.c @@ -570,10 +570,6 @@ spec_open(void *v) * Acquire an open reference -- as long as we hold onto it, and * the vnode isn't revoked, it can't be closed (XXX except for * D_MCLOSE devices, which don't exist in-tree). - * - * But first check whether it has been revoked -- if so, we - * can't acquire more open references and we must fail - * immediately with EBADF. */ mutex_enter(&device_lock); switch (vp->v_type) { @@ -582,10 +578,7 @@ spec_open(void *v) * Character devices can accept opens from multiple * vnodes. */ - if (sn->sn_gone) { - error = EBADF; - break; - } + KASSERT(!sn->sn_gone); sd->sd_opencnt++; sn->sn_opencnt++; break; @@ -598,10 +591,7 @@ spec_open(void *v) * Treat zero opencnt with non-NULL mountpoint as open. * This may happen after forced detach of a mounted device. */ - if (sn->sn_gone) { - error = EBADF; - break; - } + KASSERT(!sn->sn_gone); if (sd->sd_opencnt != 0 || sd->sd_mountpoint != NULL) { error = EBUSY; break; From 730bdad19d562d4958991494be4cd3fd440a9aa1 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Wed, 19 Jan 2022 01:12:46 +0000 Subject: [PATCH 17/30] specfs: Factor KASSERT out of switch in spec_open. No functional change. --- sys/miscfs/specfs/spec_vnops.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c index e2e89bdabddf..5c74570c8687 100644 --- a/sys/miscfs/specfs/spec_vnops.c +++ b/sys/miscfs/specfs/spec_vnops.c @@ -572,13 +572,13 @@ spec_open(void *v) * D_MCLOSE devices, which don't exist in-tree). */ mutex_enter(&device_lock); + KASSERT(!sn->sn_gone); switch (vp->v_type) { case VCHR: /* * Character devices can accept opens from multiple * vnodes. 
*/ - KASSERT(!sn->sn_gone); sd->sd_opencnt++; sn->sn_opencnt++; break; @@ -591,7 +591,6 @@ spec_open(void *v) * Treat zero opencnt with non-NULL mountpoint as open. * This may happen after forced detach of a mounted device. */ - KASSERT(!sn->sn_gone); if (sd->sd_opencnt != 0 || sd->sd_mountpoint != NULL) { error = EBUSY; break; From f48454cd29f917b1dbd9623d5471e432d265152d Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Wed, 19 Jan 2022 01:16:23 +0000 Subject: [PATCH 18/30] specfs: If sd_opencnt is zero, sn_opencnt had better be zero. --- sys/miscfs/specfs/spec_vnops.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c index 5c74570c8687..e489802029c1 100644 --- a/sys/miscfs/specfs/spec_vnops.c +++ b/sys/miscfs/specfs/spec_vnops.c @@ -595,6 +595,7 @@ spec_open(void *v) error = EBUSY; break; } + KASSERTMSG(sn->sn_opencnt == 0, "%u", sn->sn_opencnt); sn->sn_opencnt = 1; sd->sd_opencnt = 1; sd->sd_bdevvp = vp; From 34ccef043b44a50015eebb92ac686a2dd9a16210 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Wed, 19 Jan 2022 01:52:30 +0000 Subject: [PATCH 19/30] specfs: Make sure not to call .d_ioctl if revoked beat spec_open. --- sys/miscfs/specfs/spec_vnops.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c index e489802029c1..d4a7f9bd464d 100644 --- a/sys/miscfs/specfs/spec_vnops.c +++ b/sys/miscfs/specfs/spec_vnops.c @@ -729,10 +729,6 @@ spec_open(void *v) * return. At this point, the vnode may be revoked. Once we * lock it again, it may be dead, so we must not touch sn or sd * after this point. - * - * XXX This should do vdead_check before .d_ioctl -- otherwise - * the devsw might be unloaded altogether by the time we reach - * it. 
*/ rw_exit(&sd->sd_opencloselock); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); @@ -741,6 +737,20 @@ spec_open(void *v) if (error) return error; + /* + * The vnode may have been revoked between releasing + * sd_opencloselock and reacquiring the vnode lock, while we + * dropped the vnode lock. If that has happened, the device + * has been closed, so we must not call .d_ioctl; we must + * immediately fail with EBADF. + */ + mutex_enter(vp->v_interlock); + if (vdead_check(vp, VDEAD_NOWAIT) != 0) + error = EBADF; + mutex_exit(vp->v_interlock); + if (error) + return error; + /* * For disk devices, automagically set the vnode size to the * partition size, if we can. This applies to block devices From c81abf623ef9d6491887070e9034196892e4229f Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Wed, 19 Jan 2022 01:56:05 +0000 Subject: [PATCH 20/30] specfs: Omit needless vdead_check in spec_fdiscard. The vnode lock is held, so the vnode cannot be revoked without also changing v_op so subsequent uses under the vnode lock will go to deadfs's VOP_FDISCARD instead (which is genfs_eopnotsupp). --- sys/miscfs/specfs/spec_vnops.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c index d4a7f9bd464d..1a35b42726e4 100644 --- a/sys/miscfs/specfs/spec_vnops.c +++ b/sys/miscfs/specfs/spec_vnops.c @@ -967,17 +967,7 @@ spec_fdiscard(void *v) dev_t dev; vp = ap->a_vp; - dev = NODEV; - - mutex_enter(vp->v_interlock); - if (vdead_check(vp, VDEAD_NOWAIT) == 0 && vp->v_specnode != NULL) { - dev = vp->v_rdev; - } - mutex_exit(vp->v_interlock); - - if (dev == NODEV) { - return ENXIO; - } + dev = vp->v_rdev; switch (vp->v_type) { case VCHR: From 804f5efe4f9ef5c4fa7f3bd0106780fa85153491 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Wed, 19 Jan 2022 02:03:31 +0000 Subject: [PATCH 21/30] specfs: Add a comment and assertion to spec_close about refcnts. 
--- sys/miscfs/specfs/spec_vnops.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c index 1a35b42726e4..b60f4b5a7fb9 100644 --- a/sys/miscfs/specfs/spec_vnops.c +++ b/sys/miscfs/specfs/spec_vnops.c @@ -1370,11 +1370,20 @@ spec_close(void *v) */ rw_enter(&sd->sd_opencloselock, RW_WRITER); + /* + * Decrement the open reference count of this node and the + * device. For block devices, the open reference count must be + * 1 at this point. If the device's open reference count goes + * to zero, we're the last one out so get the lights. + */ mutex_enter(&device_lock); sn->sn_opencnt--; count = --sd->sd_opencnt; - if (vp->v_type == VBLK) + if (vp->v_type == VBLK) { + KASSERTMSG(count == 0, "block device with %u opens", + count + 1); sd->sd_bdevvp = NULL; + } mutex_exit(&device_lock); if (count != 0 && From bb53041be1359096ebfc583950c22cda2c47669f Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Wed, 19 Jan 2022 02:16:40 +0000 Subject: [PATCH 22/30] driver(9): Eliminate D_MCLOSE. D_MCLOSE was introduced a few years ago by mistake for audio(4), which should have used -- and now does use -- fd_clone to create per-open state. The semantics was originally to call close once every time the device node is closed, not only for the last close. Nothing uses it any more, and it complicates reasoning about the system, so let's simplify it away. --- sys/miscfs/specfs/spec_vnops.c | 7 ++----- sys/sys/conf.h | 2 +- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c index b60f4b5a7fb9..549913e430bc 100644 --- a/sys/miscfs/specfs/spec_vnops.c +++ b/sys/miscfs/specfs/spec_vnops.c @@ -568,8 +568,7 @@ spec_open(void *v) /* * Acquire an open reference -- as long as we hold onto it, and - * the vnode isn't revoked, it can't be closed (XXX except for - * D_MCLOSE devices, which don't exist in-tree). 
+ * the vnode isn't revoked, it can't be closed. */ mutex_enter(&device_lock); KASSERT(!sn->sn_gone); @@ -1385,9 +1384,7 @@ spec_close(void *v) sd->sd_bdevvp = NULL; } mutex_exit(&device_lock); - - if (count != 0 && - (vp->v_type != VCHR || !(cdev_flags(dev) & D_MCLOSE))) { + if (count != 0) { error = 0; goto out; } diff --git a/sys/sys/conf.h b/sys/sys/conf.h index cb4c287d1982..16dd87e5480c 100644 --- a/sys/sys/conf.h +++ b/sys/sys/conf.h @@ -63,7 +63,7 @@ struct vnode; #define D_TYPEMASK 0x00ff #define D_MPSAFE 0x0100 #define D_NEGOFFSAFE 0x0200 -#define D_MCLOSE 0x0400 +#define D_UNUSED0 0x0400 /* was D_MCLOSE */ /* * Block device switch table From b1899df3dbb2aab16dfafd693d2d406bfc15f7f5 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Wed, 19 Jan 2022 12:10:55 +0000 Subject: [PATCH 23/30] specfs: Assert specnode is open before we bdev_ioctl. spec_node_setmountedfs should only be called while the vnode is open; otherwise nothing can be mounted from it. --- sys/miscfs/specfs/spec_vnops.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c index 549913e430bc..78ca09519957 100644 --- a/sys/miscfs/specfs/spec_vnops.c +++ b/sys/miscfs/specfs/spec_vnops.c @@ -385,6 +385,8 @@ spec_node_setmountedfs(vnode_t *devvp, struct mount *mp) KASSERT(devvp->v_type == VBLK); KASSERT(devvp->v_specnode->sn_dev->sd_mountpoint == NULL || mp == NULL); + KASSERT(devvp->v_specnode->sn_opencnt); + devvp->v_specnode->sn_dev->sd_mountpoint = mp; if (mp == NULL) return; From 4cddb17a158402c7b276f0afc9c9fa313e3ae682 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Wed, 19 Jan 2022 12:52:23 +0000 Subject: [PATCH 24/30] specfs: Drain all I/O operations after last .d_close call. New kind of I/O reference on specnodes, sn_iocnt. 
This could be done with psref instead; I chose a reference count instead for now
+ */ +static int +spec_io_enter(struct vnode *vp, struct specnode **snp, dev_t *devp) +{ + dev_t dev; + struct specnode *sn; + unsigned iocnt; + int error = 0; + + mutex_enter(vp->v_interlock); + + /* + * Extract all the info we need from the vnode, unless the + * vnode has already been reclaimed. This can happen if the + * underlying device has been removed and all the device nodes + * for it have been revoked. The caller may not hold a vnode + * lock or fstrans to prevent this from happening before it has + * had an opportunity to notice the vnode is dead. + */ + if (vdead_check(vp, VDEAD_NOWAIT) != 0 || + (sn = vp->v_specnode) == NULL || + (dev = vp->v_rdev) == NODEV) { + error = ENXIO; + goto out; + } + + /* + * Notify spec_node_revoke that we are doing an I/O operation + * which may not be not bracketed by fstrans(9) and thus is not + * blocked by vfs suspension. + * + * We could hold this reference with psref(9) instead, but we + * already have to take the interlock for vdead_check, so + * there's not much more cost here to another atomic operation. + */ + iocnt = atomic_inc_uint_nv(&sn->sn_iocnt); + CTASSERT(MAXLWP < UINT_MAX); + KASSERT(iocnt < UINT_MAX); + + /* Success! */ + *snp = sn; + *devp = dev; + error = 0; + +out: mutex_exit(vp->v_interlock); + return error; +} + +/* + * spec_io_exit(vp, sn) + * + * Exit an operation entered with a successful spec_io_enter -- + * allow concurrent spec_node_revoke to proceed. The argument sn + * must match the struct specnode pointer returned by spec_io_exit + * for vp. + */ +static void +spec_io_exit(struct vnode *vp, struct specnode *sn) +{ + unsigned iocnt; + + KASSERT(vp->v_specnode == sn); + + /* + * We are done. Notify spec_node_revoke if appropriate. The + * transition of 1 -> 0 must happen under device_lock so + * spec_node_revoke doesn't miss a wakeup. 
+ */ + do { + iocnt = atomic_load_relaxed(&sn->sn_iocnt); + if (iocnt == 1) { + mutex_enter(&device_lock); + if (atomic_dec_uint_nv(&sn->sn_iocnt) == 0) + cv_broadcast(&specfs_iocv); + mutex_exit(&device_lock); + break; + } + } while (atomic_cas_uint(&sn->sn_iocnt, iocnt, iocnt - 1) != iocnt); +} + +/* + * spec_io_drain(vp, sn) + * + * Wait for all existing spec_io_enter/exit sections to complete. + * Caller must ensure spec_io_enter will fail at this point. + */ +static void +spec_io_drain(struct vnode *vp, struct specnode *sn) +{ + + /* + * I/O at the same time as closing is unlikely -- it often + * indicates an application bug. + */ + if (__predict_true(atomic_load_relaxed(&sn->sn_iocnt) == 0)) + return; + + mutex_enter(&device_lock); + while (atomic_load_relaxed(&sn->sn_iocnt) != 0) + cv_wait(&specfs_iocv, &device_lock); + mutex_exit(&device_lock); } /* @@ -513,29 +631,27 @@ spec_open(void *v) int a_mode; kauth_cred_t a_cred; } */ *ap = v; - struct lwp *l; - struct vnode *vp; + struct lwp *l = curlwp; + struct vnode *vp = ap->a_vp; dev_t dev; int error; enum kauth_device_req req; specnode_t *sn; specdev_t *sd; spec_ioctl_t ioctl; - u_int gen; - const char *name; + u_int gen = 0; + const char *name = NULL; struct partinfo pi; - - l = curlwp; - vp = ap->a_vp; - dev = vp->v_rdev; - sn = vp->v_specnode; - sd = sn->sn_dev; - name = NULL; - gen = 0; + KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE || + (vp->v_vflag & VV_LOCKSWORK) == 0); KASSERTMSG(vp->v_type == VBLK || vp->v_type == VCHR, "type=%d", vp->v_type); + dev = vp->v_rdev; + sn = vp->v_specnode; + sd = sn->sn_dev; + /* * Don't allow open if fs is mounted -nodev. 
*/ @@ -788,6 +904,8 @@ spec_read(void *v) struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct lwp *l = curlwp; + struct specnode *sn; + dev_t dev; struct buf *bp; daddr_t bn; int bsize, bscale; @@ -807,20 +925,26 @@ spec_read(void *v) if (uio->uio_resid == 0) return (0); + error = spec_io_enter(vp, &sn, &dev); + if (error) + return error; + switch (vp->v_type) { case VCHR: VOP_UNLOCK(vp); - error = cdev_read(vp->v_rdev, uio, ap->a_ioflag); + error = cdev_read(dev, uio, ap->a_ioflag); vn_lock(vp, LK_SHARED | LK_RETRY); - return (error); + break; case VBLK: KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp); - if (uio->uio_offset < 0) - return (EINVAL); + if (uio->uio_offset < 0) { + error = EINVAL; + break; + } - if (bdev_ioctl(vp->v_rdev, DIOCGPARTINFO, &pi, FREAD, l) == 0) + if (bdev_ioctl(dev, DIOCGPARTINFO, &pi, FREAD, l) == 0) bsize = imin(imax(pi.pi_bsize, DEV_BSIZE), MAXBSIZE); else bsize = BLKDEV_IOSIZE; @@ -865,12 +989,14 @@ spec_read(void *v) kmem_free(rablks, nra * sizeof(*rablks)); kmem_free(rasizes, nra * sizeof(*rasizes)); - return (error); + break; default: panic("spec_read type"); } - /* NOTREACHED */ + + spec_io_exit(vp, sn); + return error; } /* @@ -889,6 +1015,8 @@ spec_write(void *v) struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct lwp *l = curlwp; + struct specnode *sn; + dev_t dev; struct buf *bp; daddr_t bn; int bsize, bscale; @@ -901,22 +1029,30 @@ spec_write(void *v) uio->uio_vmspace == curproc->p_vmspace, "vmspace belongs to neither kernel nor curproc"); + error = spec_io_enter(vp, &sn, &dev); + if (error) + return error; + switch (vp->v_type) { case VCHR: VOP_UNLOCK(vp); - error = cdev_write(vp->v_rdev, uio, ap->a_ioflag); + error = cdev_write(dev, uio, ap->a_ioflag); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); - return (error); + break; case VBLK: - KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp); - if (uio->uio_resid == 0) - return (0); - if (uio->uio_offset < 0) - return (EINVAL); + KASSERT(vp == 
sn->sn_dev->sd_bdevvp); + if (uio->uio_resid == 0) { + error = 0; + break; + } + if (uio->uio_offset < 0) { + error = EINVAL; + break; + } - if (bdev_ioctl(vp->v_rdev, DIOCGPARTINFO, &pi, FREAD, l) == 0) + if (bdev_ioctl(dev, DIOCGPARTINFO, &pi, FREAD, l) == 0) bsize = imin(imax(pi.pi_bsize, DEV_BSIZE), MAXBSIZE); else bsize = BLKDEV_IOSIZE; @@ -930,9 +1066,8 @@ spec_write(void *v) bp = getblk(vp, bn, bsize, 0, 0); else error = bread(vp, bn, bsize, B_MODIFY, &bp); - if (error) { - return (error); - } + if (error) + break; n = uimin(n, bsize - bp->b_resid); error = uiomove((char *)bp->b_data + on, n, uio); if (error) @@ -945,12 +1080,14 @@ spec_write(void *v) error = bp->b_error; } } while (error == 0 && uio->uio_resid > 0 && n != 0); - return (error); + break; default: panic("spec_write type"); } - /* NOTREACHED */ + + spec_io_exit(vp, sn); + return error; } /* @@ -964,10 +1101,12 @@ spec_fdiscard(void *v) off_t a_pos; off_t a_len; } */ *ap = v; - struct vnode *vp; + struct vnode *vp = ap->a_vp; dev_t dev; - vp = ap->a_vp; + KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE || + (vp->v_vflag & VV_LOCKSWORK) == 0); + dev = vp->v_rdev; switch (vp->v_type) { @@ -997,40 +1136,32 @@ spec_ioctl(void *v) int a_fflag; kauth_cred_t a_cred; } */ *ap = v; - struct vnode *vp; + struct vnode *vp = ap->a_vp; + struct specnode *sn; dev_t dev; + int error; - /* - * Extract all the info we need from the vnode, taking care to - * avoid a race with VOP_REVOKE(). 
- */ - - vp = ap->a_vp; - dev = NODEV; - mutex_enter(vp->v_interlock); - if (vdead_check(vp, VDEAD_NOWAIT) == 0 && vp->v_specnode) { - dev = vp->v_rdev; - } - mutex_exit(vp->v_interlock); - if (dev == NODEV) { - return ENXIO; - } + error = spec_io_enter(vp, &sn, &dev); + if (error) + return error; switch (vp->v_type) { - case VCHR: - return cdev_ioctl(dev, ap->a_command, ap->a_data, + error = cdev_ioctl(dev, ap->a_command, ap->a_data, ap->a_fflag, curlwp); - + break; case VBLK: KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp); - return bdev_ioctl(dev, ap->a_command, ap->a_data, + error = bdev_ioctl(dev, ap->a_command, ap->a_data, ap->a_fflag, curlwp); - + break; default: panic("spec_ioctl"); /* NOTREACHED */ } + + spec_io_exit(vp, sn); + return error; } /* ARGSUSED */ @@ -1041,33 +1172,25 @@ spec_poll(void *v) struct vnode *a_vp; int a_events; } */ *ap = v; - struct vnode *vp; + struct vnode *vp = ap->a_vp; + struct specnode *sn; dev_t dev; + int revents; - /* - * Extract all the info we need from the vnode, taking care to - * avoid a race with VOP_REVOKE(). 
- */ - - vp = ap->a_vp; - dev = NODEV; - mutex_enter(vp->v_interlock); - if (vdead_check(vp, VDEAD_NOWAIT) == 0 && vp->v_specnode) { - dev = vp->v_rdev; - } - mutex_exit(vp->v_interlock); - if (dev == NODEV) { + if (spec_io_enter(vp, &sn, &dev) != 0) return POLLERR; - } switch (vp->v_type) { - case VCHR: - return cdev_poll(dev, ap->a_events, curlwp); - + revents = cdev_poll(dev, ap->a_events, curlwp); + break; default: - return (genfs_poll(v)); + revents = genfs_poll(v); + break; } + + spec_io_exit(vp, sn); + return revents; } /* ARGSUSED */ @@ -1078,20 +1201,30 @@ spec_kqfilter(void *v) struct vnode *a_vp; struct proc *a_kn; } */ *ap = v; + struct vnode *vp = ap->a_vp; + struct specnode *sn; dev_t dev; + int error; - switch (ap->a_vp->v_type) { + error = spec_io_enter(vp, &sn, &dev); + if (error) + return error; + switch (vp->v_type) { case VCHR: - dev = ap->a_vp->v_rdev; - return cdev_kqfilter(dev, ap->a_kn); + error = cdev_kqfilter(dev, ap->a_kn); + break; default: /* * Block devices don't support kqfilter, and refuse it * for any other files (like those vflush()ed) too. */ - return (EOPNOTSUPP); + error = EOPNOTSUPP; + break; } + + spec_io_exit(vp, sn); + return error; } /* @@ -1106,11 +1239,19 @@ spec_mmap(void *v) kauth_cred_t a_cred; } */ *ap = v; struct vnode *vp = ap->a_vp; + struct specnode *sn; + dev_t dev; + int error; KASSERT(vp->v_type == VBLK); - if (bdev_type(vp->v_rdev) != D_DISK) - return EINVAL; + error = spec_io_enter(vp, &sn, &dev); + if (error) + return error; + + error = bdev_type(dev) == D_DISK ? 0 : EINVAL; + + spec_io_exit(vp, sn); return 0; } @@ -1155,27 +1296,14 @@ spec_strategy(void *v) } */ *ap = v; struct vnode *vp = ap->a_vp; struct buf *bp = ap->a_bp; + struct specnode *sn = NULL; dev_t dev; int error; - dev = NODEV; - - /* - * Extract all the info we need from the vnode, taking care to - * avoid a race with VOP_REVOKE(). 
- */ - - mutex_enter(vp->v_interlock); - if (vdead_check(vp, VDEAD_NOWAIT) == 0 && vp->v_specnode != NULL) { - KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp); - dev = vp->v_rdev; - } - mutex_exit(vp->v_interlock); - - if (dev == NODEV) { - error = ENXIO; + error = spec_io_enter(vp, &sn, &dev); + if (error) goto out; - } + bp->b_dev = dev; if (!(bp->b_flags & B_READ)) { @@ -1195,13 +1323,15 @@ spec_strategy(void *v) } bdev_strategy(bp); - return 0; - -out: - bp->b_error = error; - bp->b_resid = bp->b_bcount; - biodone(bp); + error = 0; +out: if (sn) + spec_io_exit(vp, sn); + if (error) { + bp->b_error = error; + bp->b_resid = bp->b_bcount; + biodone(bp); + } return error; } @@ -1270,14 +1400,18 @@ spec_close(void *v) } */ *ap = v; struct vnode *vp = ap->a_vp; struct session *sess; - dev_t dev = vp->v_rdev; + dev_t dev; int flags = ap->a_fflag; int mode, error, count; specnode_t *sn; specdev_t *sd; + KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE || + (vp->v_vflag & VV_LOCKSWORK) == 0); + mutex_enter(vp->v_interlock); sn = vp->v_specnode; + dev = vp->v_rdev; sd = sn->sn_dev; /* * If we're going away soon, make this non-blocking. @@ -1404,6 +1538,12 @@ spec_close(void *v) else error = cdev_close(dev, flags, mode, curlwp); + /* + * Wait for all other devsw operations to drain. After this + * point, no bdev/cdev_* can be active for this specnode. 
+ */ + spec_io_drain(vp, sn); + if (!(flags & FNONBLOCK)) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); diff --git a/sys/miscfs/specfs/specdev.h b/sys/miscfs/specfs/specdev.h index 7917c3aea594..9d5a7e3fc6bb 100644 --- a/sys/miscfs/specfs/specdev.h +++ b/sys/miscfs/specfs/specdev.h @@ -69,6 +69,7 @@ typedef struct specnode { u_int sn_opencnt; dev_t sn_rdev; bool sn_gone; + volatile u_int sn_iocnt; } specnode_t; typedef struct specdev { From 50547efa87de2a8e03cbd9ac9000978948ab67ff Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Tue, 18 Jan 2022 01:11:50 +0000 Subject: [PATCH 25/30] driver(9): New devsw d_cancel op to interrupt I/O before close. If specified, when revoking a device node or closing its last open node, specfs will: 1. Call d_cancel, which should return promptly without blocking. 2. Wait for all concurrent d_read/write/ioctl/&c. to drain. 3. Call d_close. Otherwise, specfs will: 1. Call d_close. 2. Wait for all concurrent d_read/write/ioctl/&c. to drain. This fallback is problematic because often parts of d_close rely on concurrent devsw operations to have completed already, so it is up to each driver to have its own mechanism for waiting, and the extra step in (2) is almost redundant. But it is still important to ensure that devsw operations are not active by the time a module tries to invoke devsw_detach, because only d_open is protected against that. The signature of d_cancel matches d_close, mostly so we don't raise questions about `why is this different?'; the lwp argument is not useful but we should remove it from open/cancel/close all at the same time. The only way d_cancel should fail, if it does at all, is with ENODEV, meaning the driver doesn't support cancelling outstanding I/O, and will take responsibility for that in d_close. I would make it return void and only have bdev_cancel and cdev_cancel possibly return ENODEV so specfs can detect whether a driver supports it, but this would break the pattern around devsw operation types. 
Drivers are allowed to omit it from struct bdevsw, struct cdevsw -- if so, it is as if they used a function that just returns ENODEV. --- sys/kern/subr_devsw.c | 36 ++++++++++++++++++++++++++++++++++ sys/miscfs/specfs/spec_vnops.c | 16 +++++++++++++++ sys/sys/conf.h | 5 +++++ 3 files changed, 57 insertions(+) diff --git a/sys/kern/subr_devsw.c b/sys/kern/subr_devsw.c index 42bf25c4e354..8a79e43e8628 100644 --- a/sys/kern/subr_devsw.c +++ b/sys/kern/subr_devsw.c @@ -935,6 +935,24 @@ bdev_open(dev_t dev, int flag, int devtype, lwp_t *l) return rv; } +int +bdev_cancel(dev_t dev, int flag, int devtype, struct lwp *l) +{ + const struct bdevsw *d; + int rv, mpflag; + + if ((d = bdevsw_lookup(dev)) == NULL) + return ENXIO; + if (d->d_cancel == NULL) + return ENODEV; + + DEV_LOCK(d); + rv = (*d->d_cancel)(dev, flag, devtype, l); + DEV_UNLOCK(d); + + return rv; +} + int bdev_close(dev_t dev, int flag, int devtype, lwp_t *l) { @@ -1112,6 +1130,24 @@ cdev_open(dev_t dev, int flag, int devtype, lwp_t *l) return rv; } +int +cdev_cancel(dev_t dev, int flag, int devtype, struct lwp *l) +{ + const struct cdevsw *d; + int rv, mpflag; + + if ((d = cdevsw_lookup(dev)) == NULL) + return ENXIO; + if (d->d_cancel == NULL) + return ENODEV; + + DEV_LOCK(d); + rv = (*d->d_cancel)(dev, flag, devtype, l); + DEV_UNLOCK(d); + + return rv; +} + int cdev_close(dev_t dev, int flag, int devtype, lwp_t *l) { diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c index 663f23eb74ef..5a92e1dccc91 100644 --- a/sys/miscfs/specfs/spec_vnops.c +++ b/sys/miscfs/specfs/spec_vnops.c @@ -1533,6 +1533,22 @@ spec_close(void *v) if (!(flags & FNONBLOCK)) VOP_UNLOCK(vp); + /* + * If we can cancel all outstanding I/O, then wait for it to + * drain before we call .d_close. Drivers that split up + * .d_cancel and .d_close this way need not have any internal + * mechanism for waiting in .d_close for I/O to drain. 
+ */ + if (vp->v_type == VBLK) + error = bdev_cancel(dev, flags, mode, curlwp); + else + error = cdev_cancel(dev, flags, mode, curlwp); + if (error == 0) + spec_io_drain(vp, sn); + else + KASSERTMSG(error == ENODEV, "cancel dev=0x%lx failed with %d", + (unsigned long)dev, error); + if (vp->v_type == VBLK) error = bdev_close(dev, flags, mode, curlwp); else diff --git a/sys/sys/conf.h b/sys/sys/conf.h index 16dd87e5480c..469eaa8ec29d 100644 --- a/sys/sys/conf.h +++ b/sys/sys/conf.h @@ -70,6 +70,7 @@ struct vnode; */ struct bdevsw { int (*d_open)(dev_t, int, int, struct lwp *); + int (*d_cancel)(dev_t, int, int, struct lwp *); int (*d_close)(dev_t, int, int, struct lwp *); void (*d_strategy)(struct buf *); int (*d_ioctl)(dev_t, u_long, void *, int, struct lwp *); @@ -86,6 +87,7 @@ struct bdevsw { */ struct cdevsw { int (*d_open)(dev_t, int, int, struct lwp *); + int (*d_cancel)(dev_t, int, int, struct lwp *); int (*d_close)(dev_t, int, int, struct lwp *); int (*d_read)(dev_t, struct uio *, int); int (*d_write)(dev_t, struct uio *, int); @@ -115,6 +117,7 @@ devmajor_t bdevsw_lookup_major(const struct bdevsw *); devmajor_t cdevsw_lookup_major(const struct cdevsw *); #define dev_type_open(n) int n (dev_t, int, int, struct lwp *) +#define dev_type_cancel(n) int n (dev_t, int, int, struct lwp *) #define dev_type_close(n) int n (dev_t, int, int, struct lwp *) #define dev_type_read(n) int n (dev_t, struct uio *, int) #define dev_type_write(n) int n (dev_t, struct uio *, int) @@ -165,6 +168,7 @@ paddr_t nommap(dev_t, off_t, int); /* device access wrappers. 
*/ dev_type_open(bdev_open); +dev_type_cancel(bdev_cancel); dev_type_close(bdev_close); dev_type_strategy(bdev_strategy); dev_type_ioctl(bdev_ioctl); @@ -173,6 +177,7 @@ dev_type_size(bdev_size); dev_type_discard(bdev_discard); dev_type_open(cdev_open); +dev_type_cancel(cdev_cancel); dev_type_close(cdev_close); dev_type_read(cdev_read); dev_type_write(cdev_write); From 4b95a9acb6fa9a93eb82c3a451eb5d8d514a5ad8 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Wed, 19 Jan 2022 03:23:08 +0000 Subject: [PATCH 26/30] autoconf(9): New function config_detach_commit. When a driver's .ca_detach function has committed to detaching -- it definitely won't back out with EBUSY, for instance -- it can call this to wake all pending calls to device_lookup_acquire and make them fail immediately. This is necessary to break a deadlock if the device_lookup_acquire calls happen inside I/O operations which the driver's .ca_detach function waits for the completion of -- without config_detach_commit, I/O operations would be stuck in device_lookup_acquire waiting for .ca_detach and .ca_detach would be stuck waiting for I/O operations to return. Most drivers won't need to call this: for autoconf drivers used the traditional way by devsw for userland device nodes, the .ca_detach routine uses vdevgone, and we will arrange to make vdevgone call config_detach_commit automagically in such drivers anyway. --- sys/kern/subr_autoconf.c | 40 +++++++++++++++++++++++++++++++++++----- sys/sys/device.h | 2 ++ 2 files changed, 37 insertions(+), 5 deletions(-) diff --git a/sys/kern/subr_autoconf.c b/sys/kern/subr_autoconf.c index 073ce6f52059..43899b77c1b2 100644 --- a/sys/kern/subr_autoconf.c +++ b/sys/kern/subr_autoconf.c @@ -2054,16 +2054,21 @@ config_detach(device_t dev, int flags) * If it was possible to detach the device, ensure that the * device is deactivated. 
*/ - if (rv == 0) - dev->dv_flags &= ~DVF_ACTIVE; - else if ((flags & DETACH_FORCE) == 0) { + if (rv == 0) { + config_detach_commit(dev); + dev->dv_flags &= ~DVF_ACTIVE; /* XXXSMP */ + } else if ((flags & DETACH_FORCE) == 0) { /* * Detach failed -- likely EBUSY. Reset the * localcount. At this point there can be no * references held, and no new references acquired -- * calls to device_lookup_acquire are held up on - * dv_detaching until config_detach_exit. + * dv_detaching until config_detach_exit. Driver must + * not have called config_detach_commit. */ + KASSERTMSG(!dev->dv_detached, + "%s committed to detaching and then backed out", + device_xname(dev)); localcount_fini(dev->dv_localcount); localcount_init(dev->dv_localcount); goto out; @@ -2143,6 +2148,30 @@ out: return rv; } +/* + * config_detach_commit(dev) + * + * Issued by a driver's .ca_detach routine to notify anyone + * waiting in device_lookup_acquire that the driver is committed + * to detaching the device, which allows device_lookup_acquire to + * wake up and fail immediately. + * + * Safe to call multiple times -- idempotent. Must be called + * during config_detach_enter/exit. Safe to use with + * device_lookup because the device is not actually removed from + * the table until after config_detach_exit. 
+ */ +void +config_detach_commit(device_t dev) +{ + + mutex_enter(&config_misc_lock); + KASSERT(dev->dv_detaching == curlwp); + dev->dv_detached = true; + cv_broadcast(&config_misc_cv); + mutex_exit(&config_misc_lock); +} + int config_detach_children(device_t parent, int flags) { @@ -2643,7 +2672,8 @@ device_lookup_acquire(cfdriver_t cd, int unit) mutex_enter(&alldevs_lock); retry: if (unit < 0 || unit >= cd->cd_ndevs || (dv = cd->cd_devs[unit]) == NULL || - dv->dv_del_gen != 0) { + dv->dv_del_gen != 0 || + dv->dv_detached) { dv = NULL; } else { /* diff --git a/sys/sys/device.h b/sys/sys/device.h index 76d9f8be5ad6..eceb98f6c423 100644 --- a/sys/sys/device.h +++ b/sys/sys/device.h @@ -281,6 +281,7 @@ struct device { struct lwp *dv_attaching; /* thread not yet finished in attach */ struct lwp *dv_detaching; /* detach lock (config_misc_lock/cv) */ + bool dv_detached; /* config_misc_lock */ size_t dv_activity_count; void (**dv_activity_handlers)(device_t, devactive_t); @@ -631,6 +632,7 @@ device_t config_attach_pseudo(cfdata_t); int config_detach(device_t, int); int config_detach_children(device_t, int flags); +void config_detach_commit(device_t); bool config_detach_all(int); int config_deactivate(device_t); void config_defer(device_t, void (*)(device_t)); From 654ceb80737062f850a6dbaccb7c0169f1806706 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Fri, 14 Jan 2022 21:33:14 +0000 Subject: [PATCH 27/30] uhid(4): Use d_cfdriver/devtounit to avoid open/detach races. 
--- sys/dev/usb/uhid.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/sys/dev/usb/uhid.c b/sys/dev/usb/uhid.c index 81292fd5130e..6b22f5b21a01 100644 --- a/sys/dev/usb/uhid.c +++ b/sys/dev/usb/uhid.c @@ -134,6 +134,8 @@ const struct cdevsw uhid_cdevsw = { .d_mmap = nommap, .d_kqfilter = uhidkqfilter, .d_discard = nodiscard, + .d_cfdriver = &uhid_cd, + .d_devtounit = dev_minor_unit, .d_flag = D_OTHER }; @@ -244,6 +246,14 @@ uhid_detach(device_t self, int flags) pmf_device_deregister(self); + /* + * We are committed to detaching and have arranged any + * subsequent uhidopen to fail. Wake anyone waiting to open + * the device, blocked since detach began. vdevgone will wait + * for any still pending opens to drain. + */ + config_detach_commit(self); + /* locate the major number */ maj = cdevsw_lookup_major(&uhid_cdevsw); From 6583f042e40b8c0132074fdd274135a7e36d0b59 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Sat, 15 Jan 2022 19:29:39 +0000 Subject: [PATCH 28/30] wd(4): Use d_cfdriver/devtounit to avoid open/detach races. 
--- sys/dev/ata/wd.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sys/dev/ata/wd.c b/sys/dev/ata/wd.c index b5d7476d1e6c..e05f648eb5f2 100644 --- a/sys/dev/ata/wd.c +++ b/sys/dev/ata/wd.c @@ -152,6 +152,8 @@ const struct bdevsw wd_bdevsw = { .d_dump = wddump, .d_psize = wdsize, .d_discard = wddiscard, + .d_cfdriver = &wd_cd, + .d_devtounit = disklabel_dev_unit, .d_flag = D_DISK }; @@ -167,6 +169,8 @@ const struct cdevsw wd_cdevsw = { .d_mmap = nommap, .d_kqfilter = nokqfilter, .d_discard = wddiscard, + .d_cfdriver = &wd_cd, + .d_devtounit = disklabel_dev_unit, .d_flag = D_DISK }; @@ -530,6 +534,8 @@ wddetach(device_t self, int flags) if ((rc = disk_begindetach(&dksc->sc_dkdev, wd_lastclose, self, flags)) != 0) return rc; + config_detach_commit(self); + /* locate the major number */ bmaj = bdevsw_lookup_major(&wd_bdevsw); cmaj = cdevsw_lookup_major(&wd_cdevsw); From 7b8a22883f54cd79aea0ce9d4c2e21c1af672437 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Sat, 15 Jan 2022 19:30:21 +0000 Subject: [PATCH 29/30] sd(4): Use d_cfdriver/devtounit to avoid open/detach races. 
--- sys/dev/scsipi/sd.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sys/dev/scsipi/sd.c b/sys/dev/scsipi/sd.c index 7bd0d12c070b..91fa43e8df49 100644 --- a/sys/dev/scsipi/sd.c +++ b/sys/dev/scsipi/sd.c @@ -167,6 +167,8 @@ const struct bdevsw sd_bdevsw = { .d_dump = sddump, .d_psize = sdsize, .d_discard = nodiscard, + .d_cfdriver = &sd_cd, + .d_devtounit = disklabel_dev_unit, .d_flag = D_DISK | D_MPSAFE }; @@ -182,6 +184,8 @@ const struct cdevsw sd_cdevsw = { .d_mmap = nommap, .d_kqfilter = nokqfilter, .d_discard = nodiscard, + .d_cfdriver = &sd_cd, + .d_devtounit = disklabel_dev_unit, .d_flag = D_DISK | D_MPSAFE }; @@ -375,6 +379,8 @@ sddetach(device_t self, int flags) if ((rc = disk_begindetach(&dksc->sc_dkdev, sd_lastclose, self, flags)) != 0) return rc; + config_detach_commit(self); + /* locate the major number */ bmaj = bdevsw_lookup_major(&sd_bdevsw); cmaj = cdevsw_lookup_major(&sd_cdevsw); From 3fe235b69c27af07fb7b7009b06424c6851259ba Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Sat, 15 Jan 2022 19:55:02 +0000 Subject: [PATCH 30/30] audio(4): Use d_cfdriver/devtounit to avoid open/detach races. 
--- sys/dev/audio/audio.c | 81 +++++++++++++++++-------------------------- 1 file changed, 32 insertions(+), 49 deletions(-) diff --git a/sys/dev/audio/audio.c b/sys/dev/audio/audio.c index d22ea989a180..176749ac449f 100644 --- a/sys/dev/audio/audio.c +++ b/sys/dev/audio/audio.c @@ -521,7 +521,6 @@ static int audio_exlock_mutex_enter(struct audio_softc *); static void audio_exlock_mutex_exit(struct audio_softc *); static int audio_exlock_enter(struct audio_softc *); static void audio_exlock_exit(struct audio_softc *); -static void audio_sc_acquire_foropen(struct audio_softc *, struct psref *); static struct audio_softc *audio_sc_acquire_fromfile(audio_file_t *, struct psref *); static void audio_sc_release(struct audio_softc *, struct psref *); @@ -732,6 +731,8 @@ const struct cdevsw audio_cdevsw = { .d_mmap = nommap, .d_kqfilter = nokqfilter, .d_discard = nodiscard, + .d_cfdriver = &audio_cd, + .d_devtounit = dev_minor_unit, .d_flag = D_OTHER | D_MPSAFE }; @@ -1543,31 +1544,6 @@ audio_exlock_exit(struct audio_softc *sc) audio_exlock_mutex_exit(sc); } -/* - * Increment reference counter for this sc. - * This is intended to be used for open. - */ -void -audio_sc_acquire_foropen(struct audio_softc *sc, struct psref *refp) -{ - int s; - - /* Block audiodetach while we acquire a reference */ - s = pserialize_read_enter(); - - /* - * We don't examine sc_dying here. However, all open methods - * call audio_exlock_enter() right after this, so we can examine - * sc_dying in it. - */ - - /* Acquire a reference */ - psref_acquire(refp, &sc->sc_psref, audio_psref_class); - - /* Now sc won't go away until we drop the reference count */ - pserialize_read_exit(s); -} - /* * Get sc from file, and increment reference counter for this sc. * This is intended to be used for methods other than open. 
@@ -1680,21 +1656,20 @@ static int audioopen(dev_t dev, int flags, int ifmt, struct lwp *l) { struct audio_softc *sc; - struct psref sc_ref; - int bound; int error; - /* Find the device */ + /* + * Find the device. Because we wired the cdevsw to the audio + * autoconf instance, the system ensures it will not go away + * until after we return. + */ sc = device_lookup_private(&audio_cd, AUDIOUNIT(dev)); if (sc == NULL || sc->hw_if == NULL) return ENXIO; - bound = curlwp_bind(); - audio_sc_acquire_foropen(sc, &sc_ref); - error = audio_exlock_enter(sc); if (error) - goto done; + return error; device_active(sc->sc_dev, DVA_SYSTEM); switch (AUDIODEV(dev)) { @@ -1714,9 +1689,6 @@ audioopen(dev_t dev, int flags, int ifmt, struct lwp *l) } audio_exlock_exit(sc); -done: - audio_sc_release(sc, &sc_ref); - curlwp_bindx(bound); return error; } @@ -2098,30 +2070,41 @@ done: int audiobellopen(dev_t dev, audio_file_t **filep) { + device_t audiodev = NULL; struct audio_softc *sc; - struct psref sc_ref; - int bound; + bool exlock = false; int error; - /* Find the device */ - sc = device_lookup_private(&audio_cd, AUDIOUNIT(dev)); - if (sc == NULL || sc->hw_if == NULL) - return ENXIO; + /* + * Find the autoconf instance and make sure it doesn't go away + * while we are opening it. + */ + audiodev = device_lookup_acquire(&audio_cd, AUDIOUNIT(dev)); + if (audiodev == NULL) { + error = ENXIO; + goto out; + } - bound = curlwp_bind(); - audio_sc_acquire_foropen(sc, &sc_ref); + /* If attach failed, it's hopeless -- give up. */ + sc = device_private(audiodev); + if (sc->hw_if == NULL) { + error = ENXIO; + goto out; + } + /* Take the exclusive configuration lock. */ error = audio_exlock_enter(sc); if (error) - goto done; + goto out; + /* Open the audio device. 
*/ device_active(sc->sc_dev, DVA_SYSTEM); error = audio_open(dev, sc, FWRITE, 0, curlwp, filep); - audio_exlock_exit(sc); -done: - audio_sc_release(sc, &sc_ref); - curlwp_bindx(bound); +out: if (exlock) + audio_exlock_exit(sc); + if (audiodev) + device_release(audiodev); return error; }