From dd21dd60a3e320940684b660c80fad898c1102fd Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Wed, 12 Jan 2022 00:46:40 +0000 Subject: [PATCH 01/37] driver(9): devsw_detach never fails. Make it return void. Prune a whole lotta dead branches as a result of this. (Some logic calling this is also wrong for other reasons; devsw_detach is final -- you should never have any reason to decide to roll it back. To be cleaned up in subsequent commits...) XXX kernel ABI change to devsw_detach signature requires bump --- external/cddl/osnet/dev/dtrace/dtrace_modevent.c | 4 +--- external/cddl/osnet/dev/fbt/fbt.c | 3 ++- external/cddl/osnet/dev/sdt/sdt.c | 3 ++- .../cddl/osnet/dist/uts/common/fs/zfs/zfs_ioctl.c | 2 +- share/man/man9/devsw_attach.9 | 15 +++++++++------ sys/coda/coda_psdev.c | 2 +- sys/dev/ccd.c | 2 +- sys/dev/clockctl.c | 8 +++----- sys/dev/hdaudio/hdaudio.c | 6 +----- sys/dev/i2c/i2c.c | 7 ++----- sys/dev/pad/pad.c | 4 +--- sys/dev/raidframe/rf_netbsdkintf.c | 11 +---------- sys/dev/sysmon/sysmon.c | 2 +- sys/dev/tprof/tprof.c | 8 +------- sys/dist/pf/net/pf_ioctl.c | 3 ++- sys/external/bsd/ipf/netinet/ip_fil_netbsd.c | 2 +- sys/fs/autofs/autofs_vfsops.c | 4 +--- sys/kern/kern_drvctl.c | 9 ++------- sys/kern/subr_devsw.c | 3 +-- sys/modules/examples/pollpal/pollpal.c | 3 ++- sys/net/if_tap.c | 5 +---- sys/net/if_tun.c | 9 +-------- sys/rump/dev/lib/libbpf/bpf_component.c | 3 +-- sys/rump/dev/lib/libdrvctl/drvctl_component.c | 4 +--- sys/sys/conf.h | 2 +- 25 files changed, 41 insertions(+), 83 deletions(-) diff --git a/external/cddl/osnet/dev/dtrace/dtrace_modevent.c b/external/cddl/osnet/dev/dtrace/dtrace_modevent.c index cc0f8103fb98..f1338840125a 100644 --- a/external/cddl/osnet/dev/dtrace/dtrace_modevent.c +++ b/external/cddl/osnet/dev/dtrace/dtrace_modevent.c @@ -42,9 +42,7 @@ dtrace_modcmd(modcmd_t cmd, void *data) return error; case MODULE_CMD_FINI: - error = devsw_detach(NULL, &dtrace_cdevsw); - if (error != 0) - return error; + devsw_detach(NULL, 
&dtrace_cdevsw); error = dtrace_unload(); if (error != 0) { diff --git a/external/cddl/osnet/dev/fbt/fbt.c b/external/cddl/osnet/dev/fbt/fbt.c index b367c2155292..46dd7c1f7f06 100644 --- a/external/cddl/osnet/dev/fbt/fbt.c +++ b/external/cddl/osnet/dev/fbt/fbt.c @@ -1329,7 +1329,8 @@ dtrace_fbt_modcmd(modcmd_t cmd, void *data) error = fbt_unload(); if (error != 0) return error; - return devsw_detach(NULL, &fbt_cdevsw); + devsw_detach(NULL, &fbt_cdevsw); + return 0; case MODULE_CMD_AUTOUNLOAD: return EBUSY; default: diff --git a/external/cddl/osnet/dev/sdt/sdt.c b/external/cddl/osnet/dev/sdt/sdt.c index c3ad129f8284..5a41270a2917 100644 --- a/external/cddl/osnet/dev/sdt/sdt.c +++ b/external/cddl/osnet/dev/sdt/sdt.c @@ -562,7 +562,8 @@ dtrace_sdt_modcmd(modcmd_t cmd, void *data) error = sdt_unload(); if (error != 0) return error; - return devsw_detach(NULL, &sdt_cdevsw); + devsw_detach(NULL, &sdt_cdevsw); + return 0; case MODULE_CMD_AUTOUNLOAD: return EBUSY; default: diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_ioctl.c b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_ioctl.c index 9e19cd1dc0c3..d74d8c71e54d 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_ioctl.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_ioctl.c @@ -7231,7 +7231,7 @@ zfs_modcmd(modcmd_t cmd, void *arg) if (error) return error; - (void) devsw_detach(&zfs_bdevsw, &zfs_cdevsw); + devsw_detach(&zfs_bdevsw, &zfs_cdevsw); attacherr: zfs_sysctl_fini(); diff --git a/share/man/man9/devsw_attach.9 b/share/man/man9/devsw_attach.9 index cf862be5846a..6ffc51957a3f 100644 --- a/share/man/man9/devsw_attach.9 +++ b/share/man/man9/devsw_attach.9 @@ -27,7 +27,7 @@ .\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE .\" POSSIBILITY OF SUCH DAMAGE. 
.\" -.Dd April 30, 2017 +.Dd January 11, 2022 .Dt DEVSW 9 .Os .Sh NAME @@ -49,7 +49,7 @@ .Fa "const struct cdevsw *cdev" .Fa "devmajor_t *cmajor" .Fc -.Ft int +.Ft void .Fo devsw_detach .Fa "const struct bdevsw *bdev" .Fa "const struct cdevsw *cdev" @@ -130,6 +130,11 @@ and structures. .Fn devsw_detach should be called before a loaded device driver is unloaded. +The caller must ensure that there are no open instances of the device, +and that the device's +.Fn d_open +function will fail, before calling. +Fn devsw_detach . .Pp The .Fn bdevsw_lookup @@ -155,10 +160,8 @@ or .Sh RETURN VALUES Upon successful completion, .Fn devsw_attach -and -.Fn devsw_detach -return 0. -Otherwise they return an error value. +returns 0. +Otherwise it returns an error value. .Pp In case of failure, .Fn bdevsw_lookup diff --git a/sys/coda/coda_psdev.c b/sys/coda/coda_psdev.c index cede16da3f53..7f531f03fe56 100644 --- a/sys/coda/coda_psdev.c +++ b/sys/coda/coda_psdev.c @@ -758,7 +758,7 @@ vcoda_modcmd(modcmd_t cmd, void *arg) if (VC_OPEN(vcp)) return EBUSY; } - return devsw_detach(NULL, &vcoda_cdevsw); + devsw_detach(NULL, &vcoda_cdevsw); } #endif break; diff --git a/sys/dev/ccd.c b/sys/dev/ccd.c index ab3f94bd422b..3af4f23bcf45 100644 --- a/sys/dev/ccd.c +++ b/sys/dev/ccd.c @@ -1710,7 +1710,7 @@ ccd_modcmd(modcmd_t cmd, void *arg) error = EBUSY; } else { mutex_exit(&ccd_lock); - error = devsw_detach(&ccd_bdevsw, &ccd_cdevsw); + devsw_detach(&ccd_bdevsw, &ccd_cdevsw); ccddetach(); } #endif diff --git a/sys/dev/clockctl.c b/sys/dev/clockctl.c index 0da5e7765fe8..9685c0f129f6 100644 --- a/sys/dev/clockctl.c +++ b/sys/dev/clockctl.c @@ -182,14 +182,12 @@ clockctl_modcmd(modcmd_t cmd, void *data) return EBUSY; } #ifdef _MODULE - error = devsw_detach(NULL, &clockctl_cdevsw); + devsw_detach(NULL, &clockctl_cdevsw); #endif mutex_exit(&clockctl_mtx); - if (error == 0) { - kauth_unlisten_scope(clockctl_listener); - mutex_destroy(&clockctl_mtx); - } + kauth_unlisten_scope(clockctl_listener); + 
mutex_destroy(&clockctl_mtx); break; default: diff --git a/sys/dev/hdaudio/hdaudio.c b/sys/dev/hdaudio/hdaudio.c index d39ff2db6cde..5c7874778e22 100644 --- a/sys/dev/hdaudio/hdaudio.c +++ b/sys/dev/hdaudio/hdaudio.c @@ -1636,11 +1636,7 @@ hdaudio_modcmd(modcmd_t cmd, void *opaque) error = config_cfdriver_detach(&hdaudio_cd); if (error) break; - error = devsw_detach(NULL, &hdaudio_cdevsw); - if (error) { - config_cfdriver_attach(&hdaudio_cd); - break; - } + devsw_detach(NULL, &hdaudio_cdevsw); #endif break; default: diff --git a/sys/dev/i2c/i2c.c b/sys/dev/i2c/i2c.c index 6f2c0c6a9698..36a3e87d5316 100644 --- a/sys/dev/i2c/i2c.c +++ b/sys/dev/i2c/i2c.c @@ -942,7 +942,7 @@ iic_modcmd(modcmd_t cmd, void *opaque) if (error) { aprint_error("%s: unable to init component\n", iic_cd.cd_name); - (void)devsw_detach(NULL, &iic_cdevsw); + devsw_detach(NULL, &iic_cdevsw); } mutex_exit(&iic_mtx); #endif @@ -960,10 +960,7 @@ iic_modcmd(modcmd_t cmd, void *opaque) mutex_exit(&iic_mtx); break; } - error = devsw_detach(NULL, &iic_cdevsw); - if (error != 0) - config_init_component(cfdriver_ioconf_iic, - cfattach_ioconf_iic, cfdata_ioconf_iic); + devsw_detach(NULL, &iic_cdevsw); #endif mutex_exit(&iic_mtx); break; diff --git a/sys/dev/pad/pad.c b/sys/dev/pad/pad.c index 464f033dd209..3f311bc7f928 100644 --- a/sys/dev/pad/pad.c +++ b/sys/dev/pad/pad.c @@ -777,9 +777,7 @@ pad_modcmd(modcmd_t cmd, void *arg) case MODULE_CMD_FINI: #ifdef _MODULE - error = devsw_detach(NULL, &pad_cdevsw); - if (error) - break; + devsw_detach(NULL, &pad_cdevsw); error = config_fini_component(cfdriver_ioconf_pad, pad_cfattach, cfdata_ioconf_pad); diff --git a/sys/dev/raidframe/rf_netbsdkintf.c b/sys/dev/raidframe/rf_netbsdkintf.c index caeadcb4e739..8aa89841adef 100644 --- a/sys/dev/raidframe/rf_netbsdkintf.c +++ b/sys/dev/raidframe/rf_netbsdkintf.c @@ -4082,16 +4082,7 @@ raid_modcmd_fini(void) return error; } #endif - error = devsw_detach(&raid_bdevsw, &raid_cdevsw); - if (error != 0) { - aprint_error("%s: 
cannot detach devsw\n",__func__); -#ifdef _MODULE - config_cfdriver_attach(&raid_cd); -#endif - config_cfattach_attach(raid_cd.cd_name, &raid_ca); - mutex_exit(&raid_lock); - return error; - } + devsw_detach(&raid_bdevsw, &raid_cdevsw); rf_BootRaidframe(false); #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0) rf_destroy_mutex2(rf_sparet_wait_mutex); diff --git a/sys/dev/sysmon/sysmon.c b/sys/dev/sysmon/sysmon.c index 46aedaad7337..fd2993f0c180 100644 --- a/sys/dev/sysmon/sysmon.c +++ b/sys/dev/sysmon/sysmon.c @@ -356,7 +356,7 @@ sysmon_fini(void) if (error == 0) { mutex_enter(&sysmon_minor_mtx); sm_is_attached = false; - error = devsw_detach(NULL, &sysmon_cdevsw); + devsw_detach(NULL, &sysmon_cdevsw); mutex_exit(&sysmon_minor_mtx); } #endif diff --git a/sys/dev/tprof/tprof.c b/sys/dev/tprof/tprof.c index b069a5b7df5d..136fd190ad14 100644 --- a/sys/dev/tprof/tprof.c +++ b/sys/dev/tprof/tprof.c @@ -768,13 +768,7 @@ tprof_modcmd(modcmd_t cmd, void *arg) case MODULE_CMD_FINI: #if defined(_MODULE) - { - int error; - error = devsw_detach(NULL, &tprof_cdevsw); - if (error) { - return error; - } - } + devsw_detach(NULL, &tprof_cdevsw); #endif /* defined(_MODULE) */ tprof_driver_fini(); return 0; diff --git a/sys/dist/pf/net/pf_ioctl.c b/sys/dist/pf/net/pf_ioctl.c index 94bfb70a411d..e4c13be698f8 100644 --- a/sys/dist/pf/net/pf_ioctl.c +++ b/sys/dist/pf/net/pf_ioctl.c @@ -3459,7 +3459,8 @@ pf_modcmd(modcmd_t cmd, void *opaque) } else { pfdetach(); pflogdetach(); - return devsw_detach(NULL, &pf_cdevsw); + devsw_detach(NULL, &pf_cdevsw); + return 0; } default: return ENOTTY; diff --git a/sys/external/bsd/ipf/netinet/ip_fil_netbsd.c b/sys/external/bsd/ipf/netinet/ip_fil_netbsd.c index d0c4ca95097c..bb4e0706cc9b 100644 --- a/sys/external/bsd/ipf/netinet/ip_fil_netbsd.c +++ b/sys/external/bsd/ipf/netinet/ip_fil_netbsd.c @@ -2256,7 +2256,7 @@ ipl_fini(void *opaque) { #ifdef _MODULE - (void)devsw_detach(NULL, &ipl_cdevsw); + devsw_detach(NULL, &ipl_cdevsw); #endif /* diff --git 
a/sys/fs/autofs/autofs_vfsops.c b/sys/fs/autofs/autofs_vfsops.c index fbd6eafe6532..1204d1f9b6d3 100644 --- a/sys/fs/autofs/autofs_vfsops.c +++ b/sys/fs/autofs/autofs_vfsops.c @@ -496,9 +496,7 @@ autofs_modcmd(modcmd_t cmd, void *arg) } mutex_exit(&autofs_softc->sc_lock); - error = devsw_detach(NULL, &autofs_cdevsw); - if (error) - break; + devsw_detach(NULL, &autofs_cdevsw); #endif error = vfs_detach(&autofs_vfsops); if (error) diff --git a/sys/kern/kern_drvctl.c b/sys/kern/kern_drvctl.c index 912493dc8608..8a3c24f69b6e 100644 --- a/sys/kern/kern_drvctl.c +++ b/sys/kern/kern_drvctl.c @@ -666,15 +666,10 @@ drvctl_modcmd(modcmd_t cmd, void *arg) devmon_insert_vec = saved_insert_vec; saved_insert_vec = NULL; #ifdef _MODULE - error = devsw_detach(NULL, &drvctl_cdevsw); - if (error != 0) { - saved_insert_vec = devmon_insert_vec; - devmon_insert_vec = devmon_insert; - } + devsw_detach(NULL, &drvctl_cdevsw); #endif mutex_exit(&drvctl_lock); - if (error == 0) - drvctl_fini(); + drvctl_fini(); break; default: diff --git a/sys/kern/subr_devsw.c b/sys/kern/subr_devsw.c index 1a0f721fdd65..b95a09d3e6b6 100644 --- a/sys/kern/subr_devsw.c +++ b/sys/kern/subr_devsw.c @@ -343,14 +343,13 @@ devsw_detach_locked(const struct bdevsw *bdev, const struct cdevsw *cdev) } } -int +void devsw_detach(const struct bdevsw *bdev, const struct cdevsw *cdev) { mutex_enter(&device_lock); devsw_detach_locked(bdev, cdev); mutex_exit(&device_lock); - return 0; } /* diff --git a/sys/modules/examples/pollpal/pollpal.c b/sys/modules/examples/pollpal/pollpal.c index b76e0733699b..d8ddc73450e0 100644 --- a/sys/modules/examples/pollpal/pollpal.c +++ b/sys/modules/examples/pollpal/pollpal.c @@ -311,7 +311,8 @@ pollpal_modcmd(modcmd_t cmd, void *arg __unused) case MODULE_CMD_FINI: if (pollpal_nopen != 0) return EBUSY; - return devsw_detach(NULL, &pollpal_cdevsw); + devsw_detach(NULL, &pollpal_cdevsw); + return 0; default: return ENOTTY; } diff --git a/sys/net/if_tap.c b/sys/net/if_tap.c index 
0b57ad4a711c..314f4647707c 100644 --- a/sys/net/if_tap.c +++ b/sys/net/if_tap.c @@ -256,9 +256,7 @@ tapdetach(void) if_clone_detach(&tap_cloners); #ifdef _MODULE - error = devsw_detach(NULL, &tap_cdevsw); - if (error != 0) - goto out2; + devsw_detach(NULL, &tap_cdevsw); #endif if (tap_count != 0) { @@ -277,7 +275,6 @@ tapdetach(void) out1: #ifdef _MODULE devsw_attach("tap", NULL, &tap_bmajor, &tap_cdevsw, &tap_cmajor); - out2: #endif if_clone_attach(&tap_cloners); diff --git a/sys/net/if_tun.c b/sys/net/if_tun.c index 59071ddc7ad2..73622837719e 100644 --- a/sys/net/if_tun.c +++ b/sys/net/if_tun.c @@ -142,17 +142,10 @@ tuninit(void) static int tundetach(void) { -#ifdef _MODULE - int error; -#endif if_clone_detach(&tun_cloner); #ifdef _MODULE - error = devsw_detach(NULL, &tun_cdevsw); - if (error != 0) { - if_clone_attach(&tun_cloner); - return error; - } + devsw_detach(NULL, &tun_cdevsw); #endif if (!LIST_EMPTY(&tun_softc_list) || !LIST_EMPTY(&tunz_softc_list)) { diff --git a/sys/rump/dev/lib/libbpf/bpf_component.c b/sys/rump/dev/lib/libbpf/bpf_component.c index 05807d371d40..d41d1987afe8 100644 --- a/sys/rump/dev/lib/libbpf/bpf_component.c +++ b/sys/rump/dev/lib/libbpf/bpf_component.c @@ -50,6 +50,5 @@ RUMP_COMPONENT(RUMP_COMPONENT_NET) panic("bpf devsw attach failed: %d", error); if ((error = rump_vfs_makeonedevnode(S_IFCHR, "/dev/bpf", cmaj, 0)) !=0) panic("cannot create bpf device nodes: %d", error); - if ((error = devsw_detach(NULL, &bpf_cdevsw)) != 0) - panic("cannot detach bpf devsw: %d", error); + devsw_detach(NULL, &bpf_cdevsw); } diff --git a/sys/rump/dev/lib/libdrvctl/drvctl_component.c b/sys/rump/dev/lib/libdrvctl/drvctl_component.c index e2e79f45f9de..ac4e103fdb9c 100644 --- a/sys/rump/dev/lib/libdrvctl/drvctl_component.c +++ b/sys/rump/dev/lib/libdrvctl/drvctl_component.c @@ -51,7 +51,5 @@ RUMP_COMPONENT(RUMP_COMPONENT_DEV) if ( error !=0) panic("cannot create drvctl device node: %d", error); - error = devsw_detach(NULL, &drvctl_cdevsw); - if (error != 
0) - panic("cannot detach drvctl devsw: %d", error); + devsw_detach(NULL, &drvctl_cdevsw); } diff --git a/sys/sys/conf.h b/sys/sys/conf.h index 081631d2111f..ba46976a924b 100644 --- a/sys/sys/conf.h +++ b/sys/sys/conf.h @@ -104,7 +104,7 @@ extern kmutex_t device_lock; int devsw_attach(const char *, const struct bdevsw *, devmajor_t *, const struct cdevsw *, devmajor_t *); -int devsw_detach(const struct bdevsw *, const struct cdevsw *); +void devsw_detach(const struct bdevsw *, const struct cdevsw *); const struct bdevsw *bdevsw_lookup(dev_t); const struct cdevsw *cdevsw_lookup(dev_t); devmajor_t bdevsw_lookup_major(const struct bdevsw *); From f7767b38e56ba46985fbb151cde89157721c402b Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Wed, 12 Jan 2022 01:05:24 +0000 Subject: [PATCH 02/37] driver(9): Fix synchronization of devsw_attach/lookup/detach. (`dev' means either `bdev' or `cdev' for brevity here, e.g. in `devsw_lookup' (bdevsw_lookup, cdevsw_lookup), `dev_open' (bdev_open, cdev_open), `maxdevsws', &c., except for `devsw_attach' and `devsw_detach' which are taken literally.) - Use atomic_store_release and atomic_load_consume for devsw and tables and their entries, which are read unlocked and thus require memory barriers to ensure ordering between initialization in devsw_attach and use in dev_lookup. - Use pserialize(9) and localcount(9) to synchronize dev_open and devsw_detach. => Driver must ensure d_open fails and all open instances have been closed by the time it calls devsw_detach. => Bonus: dev_open is no longer globally serialized through device_lock. - Use atomic_store_release and atomic_load_acquire for max_devsws, which is used in conditionals in the new devsw_lookup_acquire. => It is safe to use atomic_load_relaxed in devsw_lookup because the caller must guarantee the entry is stable, so any increase of max_devsws must have already happened. => devsw_lookup and devsw_lookup_acquire assume that max_devsws never goes down. 
If you change this you must find some way to adapt the users, preferably without adding much overhead so that devsw operations are cheap. This change introduces an auxiliary table devswref mapping device majors to localcounts of opens in progress. The auxiliary table only occupies one pointer's worth of memory in a monolithic kernel, and is allocated on the fly for dynamically loaded modules. We could ask the module itself to reserve storage for it, but I don't see much value in that, and it would require some changes to the ABI and to config(8). - Omit needless boolean indirection. --- sys/kern/subr_devsw.c | 294 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 240 insertions(+), 54 deletions(-) diff --git a/sys/kern/subr_devsw.c b/sys/kern/subr_devsw.c index b95a09d3e6b6..9a30046e9550 100644 --- a/sys/kern/subr_devsw.c +++ b/sys/kern/subr_devsw.c @@ -85,6 +85,9 @@ __KERNEL_RCSID(0, "$NetBSD: subr_devsw.c,v 1.38 2017/11/07 18:35:57 christos Exp #include #include #include +#include +#include +#include #ifdef DEVSW_DEBUG #define DPRINTF(x) printf x @@ -97,12 +100,22 @@ __KERNEL_RCSID(0, "$NetBSD: subr_devsw.c,v 1.38 2017/11/07 18:35:57 christos Exp #define CDEVSW_SIZE (sizeof(struct cdevsw *)) #define DEVSWCONV_SIZE (sizeof(struct devsw_conv)) +struct devswref { + struct localcount *dr_lc; +}; + +/* XXX bdevsw, cdevsw, max_bdevsws, and max_cdevsws should be volatile */ extern const struct bdevsw **bdevsw, *bdevsw0[]; extern const struct cdevsw **cdevsw, *cdevsw0[]; extern struct devsw_conv *devsw_conv, devsw_conv0[]; extern const int sys_bdevsws, sys_cdevsws; extern int max_bdevsws, max_cdevsws, max_devsw_convs; +static struct devswref *cdevswref; +static struct devswref *bdevswref; +static struct pserialize *devsw_psz; +static kcondvar_t devsw_cv; + static int bdevsw_attach(const struct bdevsw *, devmajor_t *); static int cdevsw_attach(const struct cdevsw *, devmajor_t *); static void devsw_detach_locked(const struct bdevsw *, const struct cdevsw *); @@ 
-118,6 +131,9 @@ devsw_init(void) KASSERT(sys_bdevsws < MAXDEVSW - 1); KASSERT(sys_cdevsws < MAXDEVSW - 1); mutex_init(&device_lock, MUTEX_DEFAULT, IPL_NONE); + + devsw_psz = pserialize_create(); + cv_init(&devsw_cv, "devsw"); } int @@ -158,15 +174,13 @@ devsw_attach(const char *devname, error = EEXIST; goto fail; } - - if (bdev != NULL) - bdevsw[*bmajor] = bdev; - cdevsw[*cmajor] = cdev; - - mutex_exit(&device_lock); - return (0); + break; } + /* + * XXX This should allocate what it needs up front so we never + * need to flail around trying to unwind. + */ error = bdevsw_attach(bdev, bmajor); if (error != 0) goto fail; @@ -176,6 +190,13 @@ devsw_attach(const char *devname, goto fail; } + /* + * If we already found a conv, we're done. Otherwise, find an + * empty slot or extend the table. + */ + if (i == max_devsw_convs) + goto fail; + for (i = 0 ; i < max_devsw_convs ; i++) { if (devsw_conv[i].d_name == NULL) break; @@ -224,7 +245,9 @@ devsw_attach(const char *devname, static int bdevsw_attach(const struct bdevsw *devsw, devmajor_t *devmajor) { - const struct bdevsw **newptr; + const struct bdevsw **newbdevsw = NULL; + struct devswref *newbdevswref = NULL; + struct localcount *lc; devmajor_t bmajor; int i; @@ -253,20 +276,34 @@ bdevsw_attach(const struct bdevsw *devsw, devmajor_t *devmajor) return (ENOMEM); } + if (bdevswref == NULL) { + newbdevswref = kmem_zalloc(MAXDEVSW * sizeof(newbdevswref[0]), + KM_NOSLEEP); + if (newbdevswref == NULL) + return ENOMEM; + atomic_store_release(&bdevswref, newbdevswref); + } + if (*devmajor >= max_bdevsws) { KASSERT(bdevsw == bdevsw0); - newptr = kmem_zalloc(MAXDEVSW * BDEVSW_SIZE, KM_NOSLEEP); - if (newptr == NULL) - return (ENOMEM); - memcpy(newptr, bdevsw, max_bdevsws * BDEVSW_SIZE); - bdevsw = newptr; - max_bdevsws = MAXDEVSW; + newbdevsw = kmem_zalloc(MAXDEVSW * sizeof(newbdevsw[0]), + KM_NOSLEEP); + if (newbdevsw == NULL) + return ENOMEM; + memcpy(newbdevsw, bdevsw, max_bdevsws * sizeof(bdevsw[0])); + 
atomic_store_release(&bdevsw, newbdevsw); + atomic_store_release(&max_bdevsws, MAXDEVSW); } if (bdevsw[*devmajor] != NULL) return (EEXIST); - bdevsw[*devmajor] = devsw; + KASSERT(bdevswref[*devmajor].dr_lc == NULL); + lc = kmem_zalloc(sizeof(*lc), KM_SLEEP); + localcount_init(lc); + bdevswref[*devmajor].dr_lc = lc; + + atomic_store_release(&bdevsw[*devmajor], devsw); return (0); } @@ -274,7 +311,9 @@ bdevsw_attach(const struct bdevsw *devsw, devmajor_t *devmajor) static int cdevsw_attach(const struct cdevsw *devsw, devmajor_t *devmajor) { - const struct cdevsw **newptr; + const struct cdevsw **newcdevsw = NULL; + struct devswref *newcdevswref = NULL; + struct localcount *lc; devmajor_t cmajor; int i; @@ -300,20 +339,34 @@ cdevsw_attach(const struct cdevsw *devsw, devmajor_t *devmajor) return (ENOMEM); } + if (cdevswref == NULL) { + newcdevswref = kmem_zalloc(MAXDEVSW * sizeof(newcdevswref[0]), + KM_NOSLEEP); + if (newcdevswref == NULL) + return ENOMEM; + atomic_store_release(&cdevswref, newcdevswref); + } + if (*devmajor >= max_cdevsws) { KASSERT(cdevsw == cdevsw0); - newptr = kmem_zalloc(MAXDEVSW * CDEVSW_SIZE, KM_NOSLEEP); - if (newptr == NULL) - return (ENOMEM); - memcpy(newptr, cdevsw, max_cdevsws * CDEVSW_SIZE); - cdevsw = newptr; - max_cdevsws = MAXDEVSW; + newcdevsw = kmem_zalloc(MAXDEVSW * sizeof(newcdevsw[0]), + KM_NOSLEEP); + if (newcdevsw == NULL) + return ENOMEM; + memcpy(newcdevsw, cdevsw, max_cdevsws * sizeof(cdevsw[0])); + atomic_store_release(&cdevsw, newcdevsw); + atomic_store_release(&max_cdevsws, MAXDEVSW); } if (cdevsw[*devmajor] != NULL) return (EEXIST); - cdevsw[*devmajor] = devsw; + KASSERT(cdevswref[*devmajor].dr_lc == NULL); + lc = kmem_zalloc(sizeof(*lc), KM_SLEEP); + localcount_init(lc); + cdevswref[*devmajor].dr_lc = lc; + + atomic_store_release(&cdevsw[*devmajor], devsw); return (0); } @@ -321,25 +374,60 @@ cdevsw_attach(const struct cdevsw *devsw, devmajor_t *devmajor) static void devsw_detach_locked(const struct bdevsw *bdev, const 
struct cdevsw *cdev) { - int i; + int bi, ci = -1/*XXXGCC*/; KASSERT(mutex_owned(&device_lock)); + /* Prevent new references. */ if (bdev != NULL) { - for (i = 0 ; i < max_bdevsws ; i++) { - if (bdevsw[i] != bdev) + for (bi = 0; bi < max_bdevsws; bi++) { + if (bdevsw[bi] != bdev) continue; - bdevsw[i] = NULL; + atomic_store_relaxed(&bdevsw[bi], NULL); break; } + KASSERT(bi < max_bdevsws); } if (cdev != NULL) { - for (i = 0 ; i < max_cdevsws ; i++) { - if (cdevsw[i] != cdev) + for (ci = 0; ci < max_cdevsws; ci++) { + if (cdevsw[ci] != cdev) continue; - cdevsw[i] = NULL; + atomic_store_relaxed(&cdevsw[ci], NULL); break; } + KASSERT(ci < max_cdevsws); + } + + if (bdev == NULL && cdev == NULL) /* XXX possible? */ + return; + + /* + * Wait for all bdevsw_lookup_acquire, cdevsw_lookup_acquire + * calls to notice that the devsw is gone. + */ + pserialize_perform(devsw_psz); + + /* + * Wait for all references to drain. It is the caller's + * responsibility to ensure that at this point, there are no + * extant open instances and all new d_open calls will fail. + * + * Note that localcount_drain may release and reacquire + * device_lock. 
+ */ + if (bdev != NULL) { + localcount_drain(bdevswref[bi].dr_lc, + &devsw_cv, &device_lock); + localcount_fini(bdevswref[bi].dr_lc); + kmem_free(bdevswref[bi].dr_lc, sizeof(*bdevswref[bi].dr_lc)); + bdevswref[bi].dr_lc = NULL; + } + if (cdev != NULL) { + localcount_drain(cdevswref[ci].dr_lc, + &devsw_cv, &device_lock); + localcount_fini(cdevswref[ci].dr_lc); + kmem_free(cdevswref[ci].dr_lc, sizeof(*cdevswref[ci].dr_lc)); + cdevswref[ci].dr_lc = NULL; } } @@ -365,10 +453,59 @@ bdevsw_lookup(dev_t dev) if (dev == NODEV) return (NULL); bmajor = major(dev); - if (bmajor < 0 || bmajor >= max_bdevsws) + if (bmajor < 0 || bmajor >= atomic_load_relaxed(&max_bdevsws)) return (NULL); - return (bdevsw[bmajor]); + return atomic_load_consume(&bdevsw)[bmajor]; +} + +static const struct bdevsw * +bdevsw_lookup_acquire(dev_t dev, struct localcount **lcp) +{ + devmajor_t bmajor; + const struct bdevsw *bdev = NULL, *const *curbdevsw; + struct devswref *curbdevswref; + int s; + + if (dev == NODEV) + return NULL; + bmajor = major(dev); + if (bmajor < 0) + return NULL; + + s = pserialize_read_enter(); + + /* + * max_bdevsws never goes down, so it is safe to rely on this + * condition without any locking for the array access below. + * Test sys_bdevsws first so we can avoid the memory barrier in + * that case. 
+ */ + if (bmajor >= sys_bdevsws && + bmajor >= atomic_load_acquire(&max_bdevsws)) + goto out; + curbdevsw = atomic_load_consume(&bdevsw); + if ((bdev = atomic_load_consume(&curbdevsw[bmajor])) == NULL) + goto out; + + curbdevswref = atomic_load_consume(&bdevswref); + if (curbdevswref == NULL) { + *lcp = NULL; + } else if ((*lcp = curbdevswref[bmajor].dr_lc) != NULL) { + localcount_acquire(*lcp); + } +out: + pserialize_read_exit(s); + return bdev; +} + +static void +bdevsw_release(const struct bdevsw *bdev, struct localcount *lc) +{ + + if (lc == NULL) + return; + localcount_release(lc, &devsw_cv, &device_lock); } /* @@ -384,10 +521,59 @@ cdevsw_lookup(dev_t dev) if (dev == NODEV) return (NULL); cmajor = major(dev); - if (cmajor < 0 || cmajor >= max_cdevsws) + if (cmajor < 0 || cmajor >= atomic_load_relaxed(&max_cdevsws)) return (NULL); - return (cdevsw[cmajor]); + return atomic_load_consume(&cdevsw)[cmajor]; +} + +static const struct cdevsw * +cdevsw_lookup_acquire(dev_t dev, struct localcount **lcp) +{ + devmajor_t cmajor; + const struct cdevsw *cdev = NULL, *const *curcdevsw; + struct devswref *curcdevswref; + int s; + + if (dev == NODEV) + return NULL; + cmajor = major(dev); + if (cmajor < 0) + return NULL; + + s = pserialize_read_enter(); + + /* + * max_cdevsws never goes down, so it is safe to rely on this + * condition without any locking for the array access below. + * Test sys_cdevsws first so we can avoid the memory barrier in + * that case. 
+ */ + if (cmajor >= sys_cdevsws && + cmajor >= atomic_load_acquire(&max_cdevsws)) + goto out; + curcdevsw = atomic_load_consume(&cdevsw); + if ((cdev = atomic_load_consume(&curcdevsw[cmajor])) == NULL) + goto out; + + curcdevswref = atomic_load_consume(&cdevswref); + if (curcdevswref == NULL) { + *lcp = NULL; + } else if ((*lcp = curcdevswref[cmajor].dr_lc) != NULL) { + localcount_acquire(*lcp); + } +out: + pserialize_read_exit(s); + return cdev; +} + +static void +cdevsw_release(const struct cdevsw *cdev, struct localcount *lc) +{ + + if (lc == NULL) + return; + localcount_release(lc, &devsw_cv, &device_lock); } /* @@ -399,10 +585,13 @@ cdevsw_lookup(dev_t dev) devmajor_t bdevsw_lookup_major(const struct bdevsw *bdev) { - devmajor_t bmajor; + const struct bdevsw *const *curbdevsw; + devmajor_t bmajor, bmax; - for (bmajor = 0 ; bmajor < max_bdevsws ; bmajor++) { - if (bdevsw[bmajor] == bdev) + bmax = atomic_load_acquire(&max_bdevsws); + curbdevsw = atomic_load_consume(&bdevsw); + for (bmajor = 0; bmajor < bmax; bmajor++) { + if (atomic_load_relaxed(&curbdevsw[bmajor]) == bdev) return (bmajor); } @@ -418,10 +607,13 @@ bdevsw_lookup_major(const struct bdevsw *bdev) devmajor_t cdevsw_lookup_major(const struct cdevsw *cdev) { - devmajor_t cmajor; + const struct cdevsw *const *curcdevsw; + devmajor_t cmajor, cmax; - for (cmajor = 0 ; cmajor < max_cdevsws ; cmajor++) { - if (cdevsw[cmajor] == cdev) + cmax = atomic_load_acquire(&max_cdevsws); + curcdevsw = atomic_load_consume(&cdevsw); + for (cmajor = 0; cmajor < cmax; cmajor++) { + if (atomic_load_relaxed(&curcdevsw[cmajor]) == cdev) return (cmajor); } @@ -696,15 +888,10 @@ int bdev_open(dev_t dev, int flag, int devtype, lwp_t *l) { const struct bdevsw *d; + struct localcount *lc; int rv, mpflag; - /* - * For open we need to lock, in order to synchronize - * with attach/detach. 
- */ - mutex_enter(&device_lock); - d = bdevsw_lookup(dev); - mutex_exit(&device_lock); + d = bdevsw_lookup_acquire(dev, &lc); if (d == NULL) return ENXIO; @@ -712,6 +899,8 @@ bdev_open(dev_t dev, int flag, int devtype, lwp_t *l) rv = (*d->d_open)(dev, flag, devtype, l); DEV_UNLOCK(d); + bdevsw_release(d, lc); + return rv; } @@ -854,15 +1043,10 @@ int cdev_open(dev_t dev, int flag, int devtype, lwp_t *l) { const struct cdevsw *d; + struct localcount *lc; int rv, mpflag; - /* - * For open we need to lock, in order to synchronize - * with attach/detach. - */ - mutex_enter(&device_lock); - d = cdevsw_lookup(dev); - mutex_exit(&device_lock); + d = cdevsw_lookup_acquire(dev, &lc); if (d == NULL) return ENXIO; @@ -870,6 +1054,8 @@ cdev_open(dev_t dev, int flag, int devtype, lwp_t *l) rv = (*d->d_open)(dev, flag, devtype, l); DEV_UNLOCK(d); + cdevsw_release(d, lc); + return rv; } From 69a052b6696f26d483bd31dbdb345832112f3cb6 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Sat, 15 Jan 2022 20:01:53 +0000 Subject: [PATCH 03/37] driver(9): Use xcall(9) rather than pserialize(9) for devsw detach. devsw_init is called too early for the kmem(9) in pserialize_create, but pserialize_create doesn't do anything useful anyway. 
--- sys/kern/subr_devsw.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/sys/kern/subr_devsw.c b/sys/kern/subr_devsw.c index 9a30046e9550..334328484a1a 100644 --- a/sys/kern/subr_devsw.c +++ b/sys/kern/subr_devsw.c @@ -88,6 +88,7 @@ __KERNEL_RCSID(0, "$NetBSD: subr_devsw.c,v 1.38 2017/11/07 18:35:57 christos Exp #include #include #include +#include #ifdef DEVSW_DEBUG #define DPRINTF(x) printf x @@ -113,7 +114,6 @@ extern int max_bdevsws, max_cdevsws, max_devsw_convs; static struct devswref *cdevswref; static struct devswref *bdevswref; -static struct pserialize *devsw_psz; static kcondvar_t devsw_cv; static int bdevsw_attach(const struct bdevsw *, devmajor_t *); @@ -132,7 +132,6 @@ devsw_init(void) KASSERT(sys_cdevsws < MAXDEVSW - 1); mutex_init(&device_lock, MUTEX_DEFAULT, IPL_NONE); - devsw_psz = pserialize_create(); cv_init(&devsw_cv, "devsw"); } @@ -404,8 +403,15 @@ devsw_detach_locked(const struct bdevsw *bdev, const struct cdevsw *cdev) /* * Wait for all bdevsw_lookup_acquire, cdevsw_lookup_acquire * calls to notice that the devsw is gone. + * + * XXX Despite the use of the pserialize_read_enter/exit API + * elsewhere in this file, we use xc_barrier here instead of + * pserialize_perform -- because devsw_init is too early for + * pserialize_create. Either pserialize_create should be made + * to work earlier, or it should be nixed altogether. Until + * that is fixed, xc_barrier will serve the same purpose. */ - pserialize_perform(devsw_psz); + xc_barrier(0); /* * Wait for all references to drain. It is the caller's From 9347370975eb2de536fb62a4ae410f30a38b7a8a Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Wed, 12 Jan 2022 14:07:35 +0000 Subject: [PATCH 04/37] autoconf(9): New localcount-based device instance references. device_lookup_acquire looks up an autoconf device instance, if found, and acquires a reference the caller must release with device_release. 
If attach or detach is still in progress, device_lookup_acquire waits until it completes. While references are held, the device's softc will not be freed or reused until the last reference is released. The reference is meant to be held while opening a device in the short term, and then to be passed off to a longer-term reference that can be broken explicitly by detach -- usually a device special vnode, which is broken by vdevgone in the driver's *_detach function. Sleeping while holding a reference is allowed, e.g. waiting to open a tty. A driver must arrange that its *_detach function will interrupt any threads sleeping while holding references and cause them to back out so that detach can complete promptly. Subsequent changes to subr_devsw.c will make bdev_open and cdev_open automatically take a reference to an autoconf instance for drivers that opt into this, so there will be no logic changes needed in most drivers other than to connect the autoconf cfdriver to the bdevsw/cdevsw I/O operation tables. The effect will be that *_detach may run while d_open is in progress, but no new d_open can begin until *_detach has backed out from or committed to detaching. 
XXX kernel ABI change to struct device requires bump -- later change will make struct device opaque to ABI, but we're not there yet --- sys/kern/subr_autoconf.c | 182 ++++++++++++++++++++++++++++++++++++--- sys/sys/device.h | 6 ++ 2 files changed, 176 insertions(+), 12 deletions(-) diff --git a/sys/kern/subr_autoconf.c b/sys/kern/subr_autoconf.c index ccb1875f8ba9..74f71dca0bcf 100644 --- a/sys/kern/subr_autoconf.c +++ b/sys/kern/subr_autoconf.c @@ -108,6 +108,7 @@ __KERNEL_RCSID(0, "$NetBSD: subr_autoconf.c,v 1.296 2022/03/12 19:26:33 riastrad #include #include #include +#include #include @@ -1453,6 +1454,9 @@ config_devdelete(device_t dev) if (dg->dg_devs != NULL) kmem_free(dg->dg_devs, sizeof(device_t) * dg->dg_ndevs); + localcount_fini(dev->dv_localcount); + kmem_free(dev->dv_localcount, sizeof(*dev->dv_localcount)); + cv_destroy(&dvl->dvl_cv); mutex_destroy(&dvl->dvl_mtx); @@ -1557,6 +1561,7 @@ config_devalloc(const device_t parent, const cfdata_t cf, dev->dv_activity_handlers = NULL; dev->dv_private = dev_private; dev->dv_flags = ca->ca_flags; /* inherit flags from class */ + dev->dv_attaching = curlwp; myunit = config_unit_alloc(dev, cd, cf); if (myunit == -1) { @@ -1605,6 +1610,10 @@ config_devalloc(const device_t parent, const cfdata_t cf, "device-parent", device_xname(parent)); } + dev->dv_localcount = kmem_zalloc(sizeof(*dev->dv_localcount), + KM_SLEEP); + localcount_init(dev->dv_localcount); + if (dev->dv_cfdriver->cd_attrs != NULL) config_add_attrib_dict(dev); @@ -1756,8 +1765,29 @@ config_attach_internal(device_t parent, cfdata_t cf, void *aux, cfprint_t print, /* Let userland know */ devmon_report_device(dev, true); + /* + * Prevent detach until the driver's attach function, and all + * deferred actions, have finished. + */ config_pending_incr(dev); + + /* Call the driver's attach function. 
*/ (*dev->dv_cfattach->ca_attach)(parent, dev, aux); + + /* + * Allow other threads to acquire references to the device now + * that the driver's attach function is done. + */ + mutex_enter(&config_misc_lock); + KASSERT(dev->dv_attaching == curlwp); + dev->dv_attaching = NULL; + cv_broadcast(&config_misc_cv); + mutex_exit(&config_misc_lock); + + /* + * Synchronous parts of attach are done. Allow detach, unless + * the driver's attach function scheduled deferred actions. + */ config_pending_decr(dev); mutex_enter(&config_misc_lock); @@ -1823,8 +1853,29 @@ config_attach_pseudo(cfdata_t cf) /* Let userland know */ devmon_report_device(dev, true); + /* + * Prevent detach until the driver's attach function, and all + * deferred actions, have finished. + */ config_pending_incr(dev); + + /* Call the driver's attach function. */ (*dev->dv_cfattach->ca_attach)(ROOT, dev, NULL); + + /* + * Allow other threads to acquire references to the device now + * that the driver's attach function is done. + */ + mutex_enter(&config_misc_lock); + KASSERT(dev->dv_attaching == curlwp); + dev->dv_attaching = NULL; + cv_broadcast(&config_misc_cv); + mutex_exit(&config_misc_lock); + + /* + * Synchronous parts of attach are done. Allow detach, unless + * the driver's attach function scheduled deferred actions. + */ config_pending_decr(dev); config_process_deferred(&deferred_config_queue, dev); @@ -1873,24 +1924,39 @@ config_dump_garbage(struct devicelist *garbage) static int config_detach_enter(device_t dev) { - int error; + int error = 0; mutex_enter(&config_misc_lock); - for (;;) { - if (dev->dv_pending == 0 && dev->dv_detaching == NULL) { - dev->dv_detaching = curlwp; - error = 0; - break; - } + + /* + * Wait until attach has fully completed, and until any + * concurrent detach (e.g., drvctl racing with USB event + * thread) has completed. 
+ * + * Caller must hold alldevs_nread or alldevs_nwrite (e.g., via + * deviter) to ensure the winner of the race doesn't free the + * device leading the loser of the race into use-after-free. + * + * XXX Not all callers do this! + */ + while (dev->dv_pending || dev->dv_detaching) { KASSERTMSG(dev->dv_detaching != curlwp, "recursively detaching %s", device_xname(dev)); error = cv_wait_sig(&config_misc_cv, &config_misc_lock); if (error) - break; + goto out; } - KASSERT(error || dev->dv_detaching == curlwp); - mutex_exit(&config_misc_lock); + /* + * Attach has completed, and no other concurrent detach is + * running. Claim the device for detaching. This will cause + * all new attempts to acquire references to block. + */ + KASSERT(dev->dv_attaching == NULL); + KASSERT(dev->dv_detaching == NULL); + dev->dv_detaching = curlwp; + +out: mutex_exit(&config_misc_lock); return error; } @@ -1981,9 +2047,10 @@ config_detach(device_t dev, int flags) */ if (rv == 0) dev->dv_flags &= ~DVF_ACTIVE; - else if ((flags & DETACH_FORCE) == 0) + else if ((flags & DETACH_FORCE) == 0) { + /* Detach failed -- likely EBUSY. */ goto out; - else { + } else { panic("config_detach: forced detach of %s failed (%d)", device_xname(dev), rv); } @@ -1992,6 +2059,19 @@ config_detach(device_t dev, int flags) * The device has now been successfully detached. */ + /* + * Wait for all device_lookup_acquire references -- mostly, for + * all attempts to open the device -- to drain. It is the + * responsibility of .ca_detach to ensure anything with open + * references will be interrupted and release them promptly, + * not block indefinitely. All new attempts to acquire + * references will block until dv_detaching clears. 
+ */ + mutex_enter(&config_misc_lock); + localcount_drain(dev->dv_localcount, + &config_misc_cv, &config_misc_lock); + mutex_exit(&config_misc_lock); + /* Let userland know */ devmon_report_device(dev, false); @@ -2499,6 +2579,17 @@ config_alldevs_exit(struct alldevs_foray *af) * device_lookup: * * Look up a device instance for a given driver. + * + * Caller is responsible for ensuring the device's state is + * stable, either by holding a reference already obtained with + * device_lookup_acquire or by otherwise ensuring the device is + * attached and can't be detached (e.g., holding an open device + * node and ensuring *_detach calls vdevgone). + * + * XXX Find a way to assert this. + * + * Safe for use up to and including interrupt context at IPL_VM. + * Never sleeps. */ device_t device_lookup(cfdriver_t cd, int unit) @@ -2527,6 +2618,73 @@ device_lookup_private(cfdriver_t cd, int unit) return device_private(device_lookup(cd, unit)); } +/* + * device_lookup_acquire: + * + * Look up a device instance for a given driver, and return a + * reference to it that must be released by device_release. + * + * => If the device is still attaching, blocks until *_attach has + * returned. + * + * => If the device is detaching, blocks until *_detach has + * returned. May succeed or fail in that case, depending on + * whether *_detach has backed out (EBUSY) or committed to + * detaching. + * + * May sleep. + */ +device_t +device_lookup_acquire(cfdriver_t cd, int unit) +{ + device_t dv; + + ASSERT_SLEEPABLE(); + + /* XXX This should have a pserialized fast path -- TBD. */ + mutex_enter(&config_misc_lock); + mutex_enter(&alldevs_lock); +retry: if (unit < 0 || unit >= cd->cd_ndevs || + (dv = cd->cd_devs[unit]) == NULL || + dv->dv_del_gen != 0) { + dv = NULL; + } else { + /* + * Wait for the device to stabilize, if attaching or + * detaching. 
Either way we must wait for *_attach or + * *_detach to complete, and either way we must retry: + * even if detaching, *_detach might fail (EBUSY) so + * the device may still be there. + */ + if ((dv->dv_attaching != NULL && dv->dv_attaching != curlwp) || + dv->dv_detaching != NULL) { + mutex_exit(&alldevs_lock); + cv_wait(&config_misc_cv, &config_misc_lock); + mutex_enter(&alldevs_lock); + goto retry; + } + localcount_acquire(dv->dv_localcount); + } + mutex_exit(&alldevs_lock); + mutex_exit(&config_misc_lock); + + return dv; +} + +/* + * device_release: + * + * Release a reference to a device acquired with + * device_lookup_acquire. + */ +void +device_release(device_t dv) +{ + + localcount_release(dv->dv_localcount, + &config_misc_cv, &config_misc_lock); +} + /* * device_find_by_xname: * diff --git a/sys/sys/device.h b/sys/sys/device.h index a443878d9810..9f95c98ffa42 100644 --- a/sys/sys/device.h +++ b/sys/sys/device.h @@ -274,10 +274,12 @@ struct device { void *dv_private; /* this device's private storage */ int *dv_locators; /* our actual locators (optional) */ prop_dictionary_t dv_properties;/* properties dictionary */ + struct localcount *dv_localcount;/* reference count */ int dv_pending; /* config_pending count */ TAILQ_ENTRY(device) dv_pending_list; + struct lwp *dv_attaching; /* thread not yet finished in attach */ struct lwp *dv_detaching; /* detach lock (config_misc_lock/cv) */ size_t dv_activity_count; @@ -651,6 +653,10 @@ void null_childdetached(device_t, device_t); device_t device_lookup(cfdriver_t, int); void *device_lookup_private(cfdriver_t, int); + +device_t device_lookup_acquire(cfdriver_t, int); +void device_release(device_t); + void device_register(device_t, void *); void device_register_post_config(device_t, void *); From b70c435dd84064852eb0594afa3514a7e262a3b4 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Wed, 12 Jan 2022 14:18:57 +0000 Subject: [PATCH 05/37] driver(9): New devsw members d_cfdriver, d_devtounit. 
If set, then bdev_open/cdev_open will use d_devtounit to map the dev_t to an autoconf instance (e.g., /dev/wd0a -> wd0) and hold a reference with device_lookup_acquire across the call to d_open. This guarantees that the autoconf instance cannot be detached while the devsw's d_open function is trying to open it (and also that the autoconf instance has finished *_attach before anyone can open it). Of course, if the underlying hardware has gone away, there will be I/O errors, but this avoids software synchronization bugs between open and detach for drivers that opt into it. It's up to the driver and bus to figure out how to deal with I/O errors from operations on hardware that has gone away while the software hasn't finished notifying everything that it's gone yet. XXX kernel ABI change to struct bdevsw/cdevsw requires bump --- sys/kern/subr_devsw.c | 49 +++++++++++++++++++++++++++++++++++++++++-- sys/sys/conf.h | 4 ++++ 2 files changed, 51 insertions(+), 2 deletions(-) diff --git a/sys/kern/subr_devsw.c b/sys/kern/subr_devsw.c index 334328484a1a..d0451363a041 100644 --- a/sys/kern/subr_devsw.c +++ b/sys/kern/subr_devsw.c @@ -89,6 +89,7 @@ __KERNEL_RCSID(0, "$NetBSD: subr_devsw.c,v 1.38 2017/11/07 18:35:57 christos Exp #include #include #include +#include #ifdef DEVSW_DEBUG #define DPRINTF(x) printf x @@ -895,16 +896,38 @@ bdev_open(dev_t dev, int flag, int devtype, lwp_t *l) { const struct bdevsw *d; struct localcount *lc; - int rv, mpflag; + device_t dv = NULL/*XXXGCC*/; + int unit, rv, mpflag; d = bdevsw_lookup_acquire(dev, &lc); if (d == NULL) return ENXIO; + if (d->d_devtounit) { + /* + * If the device node corresponds to an autoconf device + * instance, acquire a reference to it so that during + * d_open, device_lookup is stable. + * + * XXX This should also arrange to instantiate cloning + * pseudo-devices if appropriate, but that requires + * reviewing them all to find and verify a common + * pattern. 
+ */ + if ((unit = (*d->d_devtounit)(dev)) == -1) + return ENXIO; + if ((dv = device_lookup_acquire(d->d_cfdriver, unit)) == NULL) + return ENXIO; + } + DEV_LOCK(d); rv = (*d->d_open)(dev, flag, devtype, l); DEV_UNLOCK(d); + if (d->d_devtounit) { + device_release(dv); + } + bdevsw_release(d, lc); return rv; @@ -1050,16 +1073,38 @@ cdev_open(dev_t dev, int flag, int devtype, lwp_t *l) { const struct cdevsw *d; struct localcount *lc; - int rv, mpflag; + device_t dv = NULL/*XXXGCC*/; + int unit, rv, mpflag; d = cdevsw_lookup_acquire(dev, &lc); if (d == NULL) return ENXIO; + if (d->d_devtounit) { + /* + * If the device node corresponds to an autoconf device + * instance, acquire a reference to it so that during + * d_open, device_lookup is stable. + * + * XXX This should also arrange to instantiate cloning + * pseudo-devices if appropriate, but that requires + * reviewing them all to find and verify a common + * pattern. + */ + if ((unit = (*d->d_devtounit)(dev)) == -1) + return ENXIO; + if ((dv = device_lookup_acquire(d->d_cfdriver, unit)) == NULL) + return ENXIO; + } + DEV_LOCK(d); rv = (*d->d_open)(dev, flag, devtype, l); DEV_UNLOCK(d); + if (d->d_devtounit) { + device_release(dv); + } + cdevsw_release(d, lc); return rv; diff --git a/sys/sys/conf.h b/sys/sys/conf.h index ba46976a924b..12cfd3bf89a6 100644 --- a/sys/sys/conf.h +++ b/sys/sys/conf.h @@ -76,6 +76,8 @@ struct bdevsw { int (*d_dump)(dev_t, daddr_t, void *, size_t); int (*d_psize)(dev_t); int (*d_discard)(dev_t, off_t, off_t); + int (*d_devtounit)(dev_t); + struct cfdriver *d_cfdriver; int d_flag; }; @@ -94,6 +96,8 @@ struct cdevsw { paddr_t (*d_mmap)(dev_t, off_t, int); int (*d_kqfilter)(dev_t, struct knote *); int (*d_discard)(dev_t, off_t, off_t); + int (*d_devtounit)(dev_t); + struct cfdriver *d_cfdriver; int d_flag; }; From c7178e4cddfc2a7ea7af97368ad481c12b8981d5 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Sat, 15 Jan 2022 19:11:10 +0000 Subject: [PATCH 06/37] disk(9): New function 
disklabel_dev_unit. Maps a dev_t like wd3e to an autoconf instance number like 3, with no partition. Same as DISKUNIT macro, but is a symbol whose pointer can be taken. Meant for use with struct bdevsw, cdevsw::d_devtounit. --- sys/kern/subr_disk.c | 7 +++++++ sys/sys/disklabel.h | 1 + 2 files changed, 8 insertions(+) diff --git a/sys/kern/subr_disk.c b/sys/kern/subr_disk.c index da664f920382..41218421db57 100644 --- a/sys/kern/subr_disk.c +++ b/sys/kern/subr_disk.c @@ -728,3 +728,10 @@ disk_set_info(device_t dev, struct disk *dk, const char *type) if (odisk_info) prop_object_release(odisk_info); } + +int +disklabel_dev_unit(dev_t dev) +{ + + return DISKUNIT(dev); +} diff --git a/sys/sys/disklabel.h b/sys/sys/disklabel.h index 4e94b8671332..853cdbe668a3 100644 --- a/sys/sys/disklabel.h +++ b/sys/sys/disklabel.h @@ -509,6 +509,7 @@ const char *convertdisklabel(struct disklabel *, void (*)(struct buf *), int bounds_check_with_label(struct disk *, struct buf *, int); int bounds_check_with_mediasize(struct buf *, int, uint64_t); const char *getfstypename(int); +int disklabel_dev_unit(dev_t); #endif #endif /* _LOCORE */ From 548616f3ead1e1e9b90a540cdd16b27d2ad9e008 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Sat, 15 Jan 2022 19:18:09 +0000 Subject: [PATCH 07/37] driver(9): New function dev_minor_unit. --- sys/kern/subr_devsw.c | 15 +++++++++++++++ sys/sys/conf.h | 1 + 2 files changed, 16 insertions(+) diff --git a/sys/kern/subr_devsw.c b/sys/kern/subr_devsw.c index d0451363a041..d24480114f94 100644 --- a/sys/kern/subr_devsw.c +++ b/sys/kern/subr_devsw.c @@ -1299,3 +1299,18 @@ nommap(dev_t dev, off_t off, int prot) return (paddr_t)-1; } + +/* + * dev_minor_unit(dev) + * + * Returns minor(dev) as an int. Intended for use with struct + * bdevsw, cdevsw::d_devtounit for drivers whose /dev nodes are + * implemented by reference to an autoconf instance with the minor + * number. 
+ */ +int +dev_minor_unit(dev_t dev) +{ + + return minor(dev); +} diff --git a/sys/sys/conf.h b/sys/sys/conf.h index 12cfd3bf89a6..cb4c287d1982 100644 --- a/sys/sys/conf.h +++ b/sys/sys/conf.h @@ -280,6 +280,7 @@ devmajor_t devsw_name2blk(const char *, char *, size_t); devmajor_t devsw_name2chr(const char *, char *, size_t); dev_t devsw_chr2blk(dev_t); dev_t devsw_blk2chr(dev_t); +int dev_minor_unit(dev_t); void mm_init(void); #endif /* _KERNEL */ From 16c222583b2facfd5bc13964edca506b3d8c22f7 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Fri, 21 Jan 2022 03:07:36 +0000 Subject: [PATCH 08/37] driver(9): Eliminate D_MCLOSE. D_MCLOSE was introduced a few years ago by mistake for audio(4), which should have used -- and now does use -- fd_clone to create per-open state. The semantics was originally to call close once every time the device node is closed, not only for the last close. Nothing uses it any more, and it complicates reasoning about the system, so let's simplify it away. --- sys/miscfs/specfs/spec_vnops.c | 2 +- sys/sys/conf.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c index b4bc4c34ab03..7c51f9ef854d 100644 --- a/sys/miscfs/specfs/spec_vnops.c +++ b/sys/miscfs/specfs/spec_vnops.c @@ -1276,7 +1276,7 @@ spec_close(void *v) sd->sd_bdevvp = NULL; mutex_exit(&device_lock); - if (count != 0 && (vp->v_type != VCHR || !(cdev_flags(dev) & D_MCLOSE))) + if (count != 0) return 0; /* diff --git a/sys/sys/conf.h b/sys/sys/conf.h index cb4c287d1982..16dd87e5480c 100644 --- a/sys/sys/conf.h +++ b/sys/sys/conf.h @@ -63,7 +63,7 @@ struct vnode; #define D_TYPEMASK 0x00ff #define D_MPSAFE 0x0100 #define D_NEGOFFSAFE 0x0200 -#define D_MCLOSE 0x0400 +#define D_UNUSED0 0x0400 /* was D_MCLOSE */ /* * Block device switch table From 844e00c8c95e428182c5da569226760f634c4113 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Fri, 21 Jan 2022 12:13:49 +0000 Subject: [PATCH 09/37] 
specfs: Note lock order for vnode lock, device_lock, v_interlock. --- sys/miscfs/specfs/spec_vnops.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c index 7c51f9ef854d..df60b05c1cf9 100644 --- a/sys/miscfs/specfs/spec_vnops.c +++ b/sys/miscfs/specfs/spec_vnops.c @@ -85,6 +85,14 @@ __KERNEL_RCSID(0, "$NetBSD: spec_vnops.c,v 1.183 2021/07/18 23:57:14 dholland Ex #include #include +/* + * Lock order: + * + * vnode lock + * -> device_lock + * -> struct vnode::v_interlock + */ + /* symbolic sleep message strings for devices */ const char devopn[] = "devopn"; const char devio[] = "devio"; From 15cde892dba786ff4c6c675c1069f138cbcd5cce Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Wed, 19 Jan 2022 00:03:49 +0000 Subject: [PATCH 10/37] specfs: Call bdev_open without the vnode lock. There is no need for it to serialize opens, because they are already serialized by sd_opencnt which for block devices is always either 0 or 1. There's not obviously any other reason why the vnode lock should be held across bdev_open, other than that it might be nice to avoid dropping it if not necessary. For character devices we always have to drop the vnode lock because open might hang indefinitely, when opening a tty, which is not allowed while holding the vnode lock. 
--- sys/miscfs/specfs/spec_vnops.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c index df60b05c1cf9..c3b592741d02 100644 --- a/sys/miscfs/specfs/spec_vnops.c +++ b/sys/miscfs/specfs/spec_vnops.c @@ -617,6 +617,7 @@ spec_open(void *v) sd->sd_opencnt = 1; sd->sd_bdevvp = vp; mutex_exit(&device_lock); + VOP_UNLOCK(vp); do { const struct bdevsw *bdev; @@ -636,13 +637,10 @@ spec_open(void *v) if ((name = bdevsw_getname(major(dev))) == NULL) break; - VOP_UNLOCK(vp); - /* Try to autoload device module */ (void) module_autoload(name, MODULE_CLASS_DRIVER); - - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } while (gen != module_gen); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); break; From 4a0683fa953ccdd1336e9f4fc7e9e5f876a0f607 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Wed, 19 Jan 2022 00:10:19 +0000 Subject: [PATCH 11/37] specfs: Assert v_type is VBLK or VCHR in spec_open. Nothing else makes sense. Prune dead branches (and replace default case by panic). --- sys/miscfs/specfs/spec_vnops.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c index c3b592741d02..1dfbde683bdc 100644 --- a/sys/miscfs/specfs/spec_vnops.c +++ b/sys/miscfs/specfs/spec_vnops.c @@ -525,7 +525,10 @@ spec_open(void *v) sd = sn->sn_dev; name = NULL; gen = 0; - + + KASSERTMSG(vp->v_type == VBLK || vp->v_type == VCHR, "type=%d", + vp->v_type); + /* * Don't allow open if fs is mounted -nodev. 
*/ @@ -644,15 +647,8 @@ spec_open(void *v) break; - case VNON: - case VLNK: - case VDIR: - case VREG: - case VBAD: - case VFIFO: - case VSOCK: default: - return 0; + panic("invalid specfs vnode type: %d", vp->v_type); } mutex_enter(&device_lock); From fe6b9a020b0fd69d057d67e19096e20fb2de01e5 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Wed, 19 Jan 2022 00:11:49 +0000 Subject: [PATCH 12/37] specfs: Factor common kauth check out of switch in spec_open. No functional change. --- sys/miscfs/specfs/spec_vnops.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c index 1dfbde683bdc..e9a818b1eac8 100644 --- a/sys/miscfs/specfs/spec_vnops.c +++ b/sys/miscfs/specfs/spec_vnops.c @@ -546,13 +546,12 @@ spec_open(void *v) req = KAUTH_REQ_DEVICE_RAWIO_SPEC_READ; break; } + error = kauth_authorize_device_spec(ap->a_cred, req, vp); + if (error != 0) + return (error); switch (vp->v_type) { case VCHR: - error = kauth_authorize_device_spec(ap->a_cred, req, vp); - if (error != 0) - return (error); - /* * Character devices can accept opens from multiple * vnodes. @@ -595,10 +594,6 @@ spec_open(void *v) break; case VBLK: - error = kauth_authorize_device_spec(ap->a_cred, req, vp); - if (error != 0) - return (error); - /* * For block devices, permit only one open. The buffer * cache cannot remain self-consistent with multiple From a265cd27fb6915ae2704865f401e1a82c3b2a16f Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Wed, 19 Jan 2022 00:34:59 +0000 Subject: [PATCH 13/37] specfs: Split spec_open switch into three sections. The sections are now: 1. Acquire open reference. 1a (intermezzo). Set VV_ISTTY. 2. Drop the vnode lock to call .d_open and autoload modules if necessary. 3. Handle concurrent revoke if it happened, or release open reference if .d_open failed. No functional change. Sprinkle comments about problems. 
--- sys/miscfs/specfs/spec_vnops.c | 127 +++++++++++++++++++++++++++------ 1 file changed, 105 insertions(+), 22 deletions(-) diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c index e9a818b1eac8..7027ba1bbeb4 100644 --- a/sys/miscfs/specfs/spec_vnops.c +++ b/sys/miscfs/specfs/spec_vnops.c @@ -550,6 +550,20 @@ spec_open(void *v) if (error != 0) return (error); + /* + * Acquire an open reference -- as long as we hold onto it, and + * the vnode isn't revoked, it can't be closed. + * + * But first check whether it has been revoked -- if so, we + * can't acquire more open references and we must fail + * immediately with EBADF. + * + * XXX This races with revoke: once we release the vnode lock, + * the vnode may be revoked, and the .d_close callback run, at + * the same time as we're calling .d_open here. Drivers + * shouldn't have to contemplate this scenario; .d_open and + * .d_close should be prevented from running concurrently. + */ switch (vp->v_type) { case VCHR: /* @@ -564,8 +578,68 @@ spec_open(void *v) sd->sd_opencnt++; sn->sn_opencnt++; mutex_exit(&device_lock); + break; + case VBLK: + /* + * For block devices, permit only one open. The buffer + * cache cannot remain self-consistent with multiple + * vnodes holding a block device open. + * + * Treat zero opencnt with non-NULL mountpoint as open. + * This may happen after forced detach of a mounted device. + */ + mutex_enter(&device_lock); + if (sn->sn_gone) { + mutex_exit(&device_lock); + return (EBADF); + } + if (sd->sd_opencnt != 0 || sd->sd_mountpoint != NULL) { + mutex_exit(&device_lock); + return EBUSY; + } + sn->sn_opencnt = 1; + sd->sd_opencnt = 1; + sd->sd_bdevvp = vp; + mutex_exit(&device_lock); + break; + default: + panic("invalid specfs vnode type: %d", vp->v_type); + } + + /* + * Set VV_ISTTY if this is a tty cdev. + * + * XXX This does the wrong thing if the module has to be + * autoloaded. 
We should maybe set this after autoloading + * modules and calling .d_open successfully, except (a) we need + * the vnode lock to touch it, and (b) once we acquire the + * vnode lock again, the vnode may have been revoked, and + * deadfs's dead_read needs VV_ISTTY to be already set in order + * to return the right answer. So this needs some additional + * synchronization to be made to work correctly with tty driver + * module autoload. For now, let's just hope it doesn't cause + * too much trouble for a tty from an autoloaded driver module + * to fail with EIO instead of returning EOF. + */ + if (vp->v_type == VCHR) { if (cdev_type(dev) == D_TTY) vp->v_vflag |= VV_ISTTY; + } + + /* + * Open the device. If .d_open returns ENXIO (device not + * configured), the driver may not be loaded, so try + * autoloading a module and then try .d_open again if anything + * got loaded. + * + * Because opening the device may block indefinitely, e.g. when + * opening a tty, and loading a module may cross into many + * other subsystems, we must not hold the vnode lock while + * calling .d_open, so release it now and reacquire it when + * done. + */ + switch (vp->v_type) { + case VCHR: VOP_UNLOCK(vp); do { const struct cdevsw *cdev; @@ -594,27 +668,6 @@ spec_open(void *v) break; case VBLK: - /* - * For block devices, permit only one open. The buffer - * cache cannot remain self-consistent with multiple - * vnodes holding a block device open. - * - * Treat zero opencnt with non-NULL mountpoint as open. - * This may happen after forced detach of a mounted device. 
- */ - mutex_enter(&device_lock); - if (sn->sn_gone) { - mutex_exit(&device_lock); - return (EBADF); - } - if (sd->sd_opencnt != 0 || sd->sd_mountpoint != NULL) { - mutex_exit(&device_lock); - return EBUSY; - } - sn->sn_opencnt = 1; - sd->sd_opencnt = 1; - sd->sd_bdevvp = vp; - mutex_exit(&device_lock); VOP_UNLOCK(vp); do { const struct bdevsw *bdev; @@ -643,9 +696,39 @@ spec_open(void *v) break; default: - panic("invalid specfs vnode type: %d", vp->v_type); + __unreachable(); } + /* + * If it has been revoked since we released the vnode lock and + * reacquired it, then spec_node_revoke has closed it, and we + * must fail with EBADF. + * + * Otherwise, if opening it failed, back out and release the + * open reference. + * + * XXX This is wrong -- we might release the last open + * reference here, but we don't close the device. If only this + * thread's call to open failed, that's fine, but we might + * have: + * + * Thread 1 Thread 2 + * VOP_OPEN + * ... + * .d_open -> 0 (success) + * acquire vnode lock + * do stuff VOP_OPEN + * release vnode lock ... + * .d_open -> EBUSY + * VOP_CLOSE + * acquire vnode lock + * --sd_opencnt != 0 + * => no .d_close + * release vnode lock + * acquire vnode lock + * --sd_opencnt == 0 + * but no .d_close (***) + */ mutex_enter(&device_lock); if (sn->sn_gone) { if (error == 0) From 34aab6fc93f57f3b9c5560417fb8b1efe9327592 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Fri, 21 Jan 2022 14:44:00 +0000 Subject: [PATCH 14/37] specfs: Delete bogus comment about .d_open/.d_close at same time. Annoying as it is that .d_open and .d_close can run at the same time, it is also necessary for tty semantics, where open can block indefinitely, and it is the responsibility of close (called via revoke) necessary to interrupt it. 
--- sys/miscfs/specfs/spec_vnops.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c index 7027ba1bbeb4..291db6292cf0 100644 --- a/sys/miscfs/specfs/spec_vnops.c +++ b/sys/miscfs/specfs/spec_vnops.c @@ -557,12 +557,6 @@ spec_open(void *v) * But first check whether it has been revoked -- if so, we * can't acquire more open references and we must fail * immediately with EBADF. - * - * XXX This races with revoke: once we release the vnode lock, - * the vnode may be revoked, and the .d_close callback run, at - * the same time as we're calling .d_open here. Drivers - * shouldn't have to contemplate this scenario; .d_open and - * .d_close should be prevented from running concurrently. */ switch (vp->v_type) { case VCHR: From ccf25f225a68c62ad8dcc23fb85a541805d0bb82 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Wed, 19 Jan 2022 00:38:12 +0000 Subject: [PATCH 15/37] specfs: Factor common device_lock out of switch for clarity. No functional change. --- sys/miscfs/specfs/spec_vnops.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c index 291db6292cf0..fec2ff330bdc 100644 --- a/sys/miscfs/specfs/spec_vnops.c +++ b/sys/miscfs/specfs/spec_vnops.c @@ -558,20 +558,19 @@ spec_open(void *v) * can't acquire more open references and we must fail * immediately with EBADF. */ + mutex_enter(&device_lock); switch (vp->v_type) { case VCHR: /* * Character devices can accept opens from multiple * vnodes. */ - mutex_enter(&device_lock); if (sn->sn_gone) { - mutex_exit(&device_lock); - return (EBADF); + error = EBADF; + break; } sd->sd_opencnt++; sn->sn_opencnt++; - mutex_exit(&device_lock); break; case VBLK: /* @@ -582,23 +581,24 @@ spec_open(void *v) * Treat zero opencnt with non-NULL mountpoint as open. * This may happen after forced detach of a mounted device. 
*/ - mutex_enter(&device_lock); if (sn->sn_gone) { - mutex_exit(&device_lock); - return (EBADF); + error = EBADF; + break; } if (sd->sd_opencnt != 0 || sd->sd_mountpoint != NULL) { - mutex_exit(&device_lock); - return EBUSY; + error = EBUSY; + break; } sn->sn_opencnt = 1; sd->sd_opencnt = 1; sd->sd_bdevvp = vp; - mutex_exit(&device_lock); break; default: panic("invalid specfs vnode type: %d", vp->v_type); } + mutex_exit(&device_lock); + if (error) + return error; /* * Set VV_ISTTY if this is a tty cdev. From f0b14db3dd29c543ddcb35ceac60e7a1b21174a6 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Wed, 19 Jan 2022 00:42:24 +0000 Subject: [PATCH 16/37] specfs: Factor VOP_UNLOCK/vn_lock out of switch for clarity. No functional change. --- sys/miscfs/specfs/spec_vnops.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c index fec2ff330bdc..8aa044698028 100644 --- a/sys/miscfs/specfs/spec_vnops.c +++ b/sys/miscfs/specfs/spec_vnops.c @@ -632,9 +632,9 @@ spec_open(void *v) * calling .d_open, so release it now and reacquire it when * done. 
*/ + VOP_UNLOCK(vp); switch (vp->v_type) { case VCHR: - VOP_UNLOCK(vp); do { const struct cdevsw *cdev; @@ -657,12 +657,9 @@ spec_open(void *v) /* Try to autoload device module */ (void) module_autoload(name, MODULE_CLASS_DRIVER); } while (gen != module_gen); - - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); break; case VBLK: - VOP_UNLOCK(vp); do { const struct bdevsw *bdev; @@ -685,13 +682,12 @@ spec_open(void *v) /* Try to autoload device module */ (void) module_autoload(name, MODULE_CLASS_DRIVER); } while (gen != module_gen); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); - break; default: __unreachable(); } + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); /* * If it has been revoked since we released the vnode lock and From 1705c400fbed19c16a8387392a9b0ddc5797f170 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Wed, 19 Jan 2022 00:55:03 +0000 Subject: [PATCH 17/37] specfs: Reorganize D_DISK tail of spec_open and explain what's up. No functional change intended. --- sys/miscfs/specfs/spec_vnops.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c index 8aa044698028..54232be1ef98 100644 --- a/sys/miscfs/specfs/spec_vnops.c +++ b/sys/miscfs/specfs/spec_vnops.c @@ -732,15 +732,27 @@ spec_open(void *v) } mutex_exit(&device_lock); - if (cdev_type(dev) != D_DISK || error != 0) + /* If anything went wrong, we're done. */ + if (error) return error; - - ioctl = vp->v_type == VCHR ? cdev_ioctl : bdev_ioctl; - error = (*ioctl)(vp->v_rdev, DIOCGPARTINFO, &pi, FREAD, curlwp); - if (error == 0) - uvm_vnp_setsize(vp, (voff_t)pi.pi_secsize * pi.pi_size); + /* + * For disk devices, automagically set the vnode size to the + * partition size, if we can. This applies to block devices + * and character devices alike -- every block device must have + * a corresponding character device. 
And if the module is + * loaded it will remain loaded until we're done here (it is + * forbidden to devsw_detach until closed). So it is safe to + * query cdev_type unconditionally here. + */ + if (cdev_type(dev) == D_DISK) { + ioctl = vp->v_type == VCHR ? cdev_ioctl : bdev_ioctl; + if ((*ioctl)(dev, DIOCGPARTINFO, &pi, FREAD, curlwp) == 0) + uvm_vnp_setsize(vp, + (voff_t)pi.pi_secsize * pi.pi_size); + } + /* Success! */ return 0; } From a251a9a8bc3081b2e7e8a8a4cdc53e28aeaf463b Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Wed, 19 Jan 2022 01:11:45 +0000 Subject: [PATCH 18/37] specfs: sn_gone cannot be set while we hold the vnode lock. Revoke runs with the vnode lock too, which is exclusive. Add an assertion to this effect in spec_node_revoke to make it clear. XXX Exception: For file systems with nonworking locks, this doesn't work. But I'm not sure suspending such file systems works either. Another reason to get rid of file systems with nonworking locks -- which is only deadfs as used by bdevvp/cdevvp. --- sys/miscfs/specfs/spec_vnops.c | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c index 54232be1ef98..8bfc08edf20f 100644 --- a/sys/miscfs/specfs/spec_vnops.c +++ b/sys/miscfs/specfs/spec_vnops.c @@ -402,6 +402,9 @@ spec_node_revoke(vnode_t *vp) specnode_t *sn; specdev_t *sd; + KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE || + (vp->v_vflag & VV_LOCKSWORK) == 0); + sn = vp->v_specnode; sd = sn->sn_dev; @@ -552,11 +555,8 @@ spec_open(void *v) /* * Acquire an open reference -- as long as we hold onto it, and - * the vnode isn't revoked, it can't be closed. - * - * But first check whether it has been revoked -- if so, we - * can't acquire more open references and we must fail - * immediately with EBADF. + * the vnode isn't revoked, it can't be closed, and the vnode + * can't be revoked until we release the vnode lock. 
*/ mutex_enter(&device_lock); switch (vp->v_type) { @@ -565,10 +565,7 @@ spec_open(void *v) * Character devices can accept opens from multiple * vnodes. */ - if (sn->sn_gone) { - error = EBADF; - break; - } + KASSERT(!sn->sn_gone); sd->sd_opencnt++; sn->sn_opencnt++; break; @@ -581,10 +578,7 @@ spec_open(void *v) * Treat zero opencnt with non-NULL mountpoint as open. * This may happen after forced detach of a mounted device. */ - if (sn->sn_gone) { - error = EBADF; - break; - } + KASSERT(!sn->sn_gone); if (sd->sd_opencnt != 0 || sd->sd_mountpoint != NULL) { error = EBUSY; break; From 43cc66336cd2b9812279376c98659e0ae5e7fb9a Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Wed, 19 Jan 2022 01:12:46 +0000 Subject: [PATCH 19/37] specfs: Factor KASSERT out of switch in spec_open. No functional change. --- sys/miscfs/specfs/spec_vnops.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c index 8bfc08edf20f..1cb45f98d485 100644 --- a/sys/miscfs/specfs/spec_vnops.c +++ b/sys/miscfs/specfs/spec_vnops.c @@ -559,13 +559,13 @@ spec_open(void *v) * can't be revoked until we release the vnode lock. */ mutex_enter(&device_lock); + KASSERT(!sn->sn_gone); switch (vp->v_type) { case VCHR: /* * Character devices can accept opens from multiple * vnodes. */ - KASSERT(!sn->sn_gone); sd->sd_opencnt++; sn->sn_opencnt++; break; @@ -578,7 +578,6 @@ spec_open(void *v) * Treat zero opencnt with non-NULL mountpoint as open. * This may happen after forced detach of a mounted device. */ - KASSERT(!sn->sn_gone); if (sd->sd_opencnt != 0 || sd->sd_mountpoint != NULL) { error = EBUSY; break; From 7666729f4fe4745bbee530d68e50a156317814dc Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Wed, 19 Jan 2022 01:16:23 +0000 Subject: [PATCH 20/37] specfs: If sd_opencnt is zero, sn_opencnt had better be zero. 
--- sys/miscfs/specfs/spec_vnops.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c index 1cb45f98d485..0dec841b64cb 100644 --- a/sys/miscfs/specfs/spec_vnops.c +++ b/sys/miscfs/specfs/spec_vnops.c @@ -582,6 +582,7 @@ spec_open(void *v) error = EBUSY; break; } + KASSERTMSG(sn->sn_opencnt == 0, "%u", sn->sn_opencnt); sn->sn_opencnt = 1; sd->sd_opencnt = 1; sd->sd_bdevvp = vp; From c11452d9bd39c456bd434eaa040f2e93c33542b3 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Wed, 19 Jan 2022 02:03:31 +0000 Subject: [PATCH 21/37] specfs: Add a comment and assertion to spec_close about refcnts. --- sys/miscfs/specfs/spec_vnops.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c index 0dec841b64cb..d3e848e60e2c 100644 --- a/sys/miscfs/specfs/spec_vnops.c +++ b/sys/miscfs/specfs/spec_vnops.c @@ -1345,11 +1345,20 @@ spec_close(void *v) panic("spec_close: not special"); } + /* + * Decrement the open reference count of this node and the + * device. For block devices, the open reference count must be + * 1 at this point. If the device's open reference count goes + * to zero, we're the last one out so get the lights. + */ mutex_enter(&device_lock); sn->sn_opencnt--; count = --sd->sd_opencnt; - if (vp->v_type == VBLK) + if (vp->v_type == VBLK) { + KASSERTMSG(count == 0, "block device with %u opens", + count + 1); sd->sd_bdevvp = NULL; + } mutex_exit(&device_lock); if (count != 0) From 10f4a9f3c9138dcce37fdcab0c32ac619a0f86c9 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Wed, 19 Jan 2022 01:56:05 +0000 Subject: [PATCH 22/37] specfs: Omit needless vdead_check in spec_fdiscard. The vnode lock is held, so the vnode cannot be revoked without also changing v_op so subsequent uses under the vnode lock will go to deadfs's VOP_FDISCARD instead (which is genfs_eopnotsupp). 
--- sys/miscfs/specfs/spec_vnops.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c index d3e848e60e2c..480e0ec606e9 100644 --- a/sys/miscfs/specfs/spec_vnops.c +++ b/sys/miscfs/specfs/spec_vnops.c @@ -946,17 +946,7 @@ spec_fdiscard(void *v) dev_t dev; vp = ap->a_vp; - dev = NODEV; - - mutex_enter(vp->v_interlock); - if (vdead_check(vp, VDEAD_NOWAIT) == 0 && vp->v_specnode != NULL) { - dev = vp->v_rdev; - } - mutex_exit(vp->v_interlock); - - if (dev == NODEV) { - return ENXIO; - } + dev = vp->v_rdev; switch (vp->v_type) { case VCHR: From 1abac447587d9ed62fd22d803d791f251e3c76f7 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Mon, 24 Jan 2022 10:06:17 +0000 Subject: [PATCH 23/37] specfs: Paranoia: Assert opencnt is zero on reclaim. --- sys/miscfs/specfs/spec_vnops.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c index 480e0ec606e9..05b897dc5a13 100644 --- a/sys/miscfs/specfs/spec_vnops.c +++ b/sys/miscfs/specfs/spec_vnops.c @@ -1205,6 +1205,8 @@ spec_reclaim(void *v) } */ *ap = v; struct vnode *vp = ap->a_vp; + KASSERT(vp->v_specnode->sn_opencnt == 0); + VOP_UNLOCK(vp); KASSERT(vp->v_mount == dead_rootmount); From cd27c7e73576184861c3ede7e3d03d63aab0bee5 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Mon, 31 Jan 2022 17:03:31 +0000 Subject: [PATCH 24/37] specfs: Document sn_opencnt, sd_opencnt, sd_refcnt. 
--- sys/miscfs/specfs/specdev.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sys/miscfs/specfs/specdev.h b/sys/miscfs/specfs/specdev.h index 621c103e80b1..7e1dd84fe330 100644 --- a/sys/miscfs/specfs/specdev.h +++ b/sys/miscfs/specfs/specdev.h @@ -66,7 +66,7 @@ typedef struct specnode { vnode_t *sn_next; struct specdev *sn_dev; - u_int sn_opencnt; + u_int sn_opencnt; /* # of opens, share of sd_opencnt */ dev_t sn_rdev; bool sn_gone; } specnode_t; @@ -75,8 +75,8 @@ typedef struct specdev { struct mount *sd_mountpoint; struct lockf *sd_lockf; vnode_t *sd_bdevvp; - u_int sd_opencnt; - u_int sd_refcnt; + u_int sd_opencnt; /* # of opens; close when ->0 */ + u_int sd_refcnt; /* # of specnodes referencing this */ dev_t sd_rdev; } specdev_t; From 0a683110ce01e9ba2ac4cbe9577486d61fb90555 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Fri, 21 Jan 2022 12:09:14 +0000 Subject: [PATCH 25/37] specfs: Resolve a race between close and a failing reopen. --- sys/miscfs/specfs/spec_vnops.c | 71 +++++++++++++++++++++++++++++----- sys/miscfs/specfs/specdev.h | 1 + 2 files changed, 63 insertions(+), 9 deletions(-) diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c index 05b897dc5a13..b6e83ddefb15 100644 --- a/sys/miscfs/specfs/spec_vnops.c +++ b/sys/miscfs/specfs/spec_vnops.c @@ -258,6 +258,7 @@ spec_node_init(vnode_t *vp, dev_t rdev) sd->sd_refcnt = 1; sd->sd_opencnt = 0; sd->sd_bdevvp = NULL; + sd->sd_opened = false; sn->sn_dev = sd; sd = NULL; } else { @@ -519,6 +520,7 @@ spec_open(void *v) spec_ioctl_t ioctl; u_int gen; const char *name; + bool needclose = false; struct partinfo pi; l = curlwp; @@ -689,12 +691,9 @@ spec_open(void *v) * must fail with EBADF. * * Otherwise, if opening it failed, back out and release the - * open reference. - * - * XXX This is wrong -- we might release the last open - * reference here, but we don't close the device. 
If only this - * thread's call to open failed, that's fine, but we might - * have: + * open reference. If it was ever successfully opened and we + * got the last reference this way, it's now our job to close + * it. This might happen in the following scenario: * * Thread 1 Thread 2 * VOP_OPEN @@ -711,21 +710,54 @@ spec_open(void *v) * release vnode lock * acquire vnode lock * --sd_opencnt == 0 - * but no .d_close (***) + * + * We can't resolve this by making spec_close wait for .d_open + * to complete before examining sd_opencnt, because .d_open can + * hang indefinitely, e.g. for a tty. */ mutex_enter(&device_lock); if (sn->sn_gone) { if (error == 0) error = EBADF; - } else if (error != 0) { + } else if (error == 0) { + sd->sd_opened = true; + } else if (sd->sd_opencnt == 1 && sd->sd_opened) { + /* + * We're the last reference to a _previous_ open even + * though this one failed, so we have to close it. + * Don't decrement the reference count here -- + * spec_close will do that. + */ + KASSERT(sn->sn_opencnt == 1); + needclose = true; + } else { sd->sd_opencnt--; sn->sn_opencnt--; if (vp->v_type == VBLK) sd->sd_bdevvp = NULL; - } mutex_exit(&device_lock); + /* + * If this open failed, but the device was previously opened, + * and another thread concurrently closed the vnode while we + * were in the middle of reopening it, the other thread will + * see sd_opencnt > 0 and thus decide not to call .d_close -- + * it is now our responsibility to do so. + * + * XXX The flags passed to VOP_CLOSE here are wrong, but + * drivers can't rely on FREAD|FWRITE anyway -- e.g., consider + * a device opened by thread 0 with O_READ, then opened by + * thread 1 with O_WRITE, then closed by thread 0, and finally + * closed by thread 1; the last .d_close call will have FWRITE + * but not FREAD. We should just eliminate the FREAD/FWRITE + * parameter to .d_close altogether. 
+ */ + if (needclose) { + KASSERT(error); + VOP_CLOSE(vp, FNONBLOCK, NOCRED); + } + /* If anything went wrong, we're done. */ if (error) return error; @@ -1342,6 +1374,25 @@ spec_close(void *v) * device. For block devices, the open reference count must be * 1 at this point. If the device's open reference count goes * to zero, we're the last one out so get the lights. + * + * We may find --sd->sd_opencnt gives zero, and yet + * sd->sd_opened is false. This happens if the vnode is + * revoked at the same time as it is being opened, which can + * happen when opening a tty blocks indefinitely. In that + * case, we still must call close -- it is the job of close to + * interrupt the open. Either way, the device will be no + * longer opened, so we have to clear sd->sd_opened; subsequent + * opens will have responsibility for issuing close. + * + * This has the side effect that the sequence of opens might + * happen out of order -- we might end up doing open, open, + * close, close, instead of open, close, open, close. This is + * unavoidable with the current devsw API, where open is + * allowed to block and close must be able to run concurrently + * to interrupt it. It is the driver's responsibility to + * ensure that close is idempotent so that this works. Drivers + * requiring per-open state and exact 1:1 correspondence + * between open and close can use fd_clone. 
*/ mutex_enter(&device_lock); sn->sn_opencnt--; @@ -1351,6 +1402,8 @@ spec_close(void *v) count + 1); sd->sd_bdevvp = NULL; } + if (count == 0) + sd->sd_opened = false; mutex_exit(&device_lock); if (count != 0) diff --git a/sys/miscfs/specfs/specdev.h b/sys/miscfs/specfs/specdev.h index 7e1dd84fe330..018679ffaef2 100644 --- a/sys/miscfs/specfs/specdev.h +++ b/sys/miscfs/specfs/specdev.h @@ -78,6 +78,7 @@ typedef struct specdev { u_int sd_opencnt; /* # of opens; close when ->0 */ u_int sd_refcnt; /* # of specnodes referencing this */ dev_t sd_rdev; + bool sd_opened; /* true if successfully opened */ } specdev_t; /* From c5541664e1488fdee06803e7942ddbb728cbeda9 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Wed, 19 Jan 2022 12:52:23 +0000 Subject: [PATCH 26/37] specfs: Drain all I/O operations after last .d_close call. New kind of I/O reference on specdevs, sd_iocnt. This could be done with psref instead; I chose a reference count instead for now because we already have to take a per-object lock anyway, v_interlock, for vdead_check, so another atomic is not likely to hurt much more. We can always change the mechanism inside spec_io_enter/exit/drain later on. Make sure every access to vp->v_rdev or vp->v_specnode and every call to a devsw operation is protected either: - by the vnode lock (with vdead_check if we unlocked/relocked), - by positive sd_opencnt, - by spec_io_enter/exit, or - by sd_opencnt management in open/close. 
--- sys/miscfs/specfs/spec_vnops.c | 356 +++++++++++++++++++++++++-------- sys/miscfs/specfs/specdev.h | 1 + 2 files changed, 269 insertions(+), 88 deletions(-) diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c index b6e83ddefb15..e5b46a4a4be1 100644 --- a/sys/miscfs/specfs/spec_vnops.c +++ b/sys/miscfs/specfs/spec_vnops.c @@ -81,6 +81,7 @@ __KERNEL_RCSID(0, "$NetBSD: spec_vnops.c,v 1.183 2021/07/18 23:57:14 dholland Ex #include #include #include +#include #include #include @@ -173,6 +174,7 @@ const struct vnodeopv_desc spec_vnodeop_opv_desc = { &spec_vnodeop_p, spec_vnodeop_entries }; static kauth_listener_t rawio_listener; +static struct kcondvar specfs_iocv; /* Returns true if vnode is /dev/mem or /dev/kmem. */ bool @@ -218,6 +220,141 @@ spec_init(void) rawio_listener = kauth_listen_scope(KAUTH_SCOPE_DEVICE, rawio_listener_cb, NULL); + cv_init(&specfs_iocv, "specio"); +} + +/* + * spec_io_enter(vp, &sn, &dev) + * + * Enter an operation that may not hold vp's vnode lock or an + * fstrans on vp's mount. Until spec_io_exit, the vnode will not + * be revoked. + * + * On success, set sn to the specnode pointer and dev to the dev_t + * number and return zero. Caller must later call spec_io_exit + * when done. + * + * On failure, return ENXIO -- the device has been revoked and no + * longer exists. + */ +static int +spec_io_enter(struct vnode *vp, struct specnode **snp, dev_t *devp) +{ + dev_t dev; + struct specnode *sn; + unsigned iocnt; + int error = 0; + + mutex_enter(vp->v_interlock); + + /* + * Extract all the info we need from the vnode, unless the + * vnode has already been reclaimed. This can happen if the + * underlying device has been removed and all the device nodes + * for it have been revoked. The caller may not hold a vnode + * lock or fstrans to prevent this from happening before it has + * had an opportunity to notice the vnode is dead. 
+ */ + if (vdead_check(vp, VDEAD_NOWAIT) != 0 || + (sn = vp->v_specnode) == NULL || + (dev = vp->v_rdev) == NODEV) { + error = ENXIO; + goto out; + } + + /* + * Notify spec_close that we are doing an I/O operation which + * may not be not bracketed by fstrans(9) and thus is not + * blocked by vfs suspension. + * + * We could hold this reference with psref(9) instead, but we + * already have to take the interlock for vdead_check, so + * there's not much more cost here to another atomic operation. + */ + do { + iocnt = atomic_load_relaxed(&sn->sn_dev->sd_iocnt); + if (__predict_false(iocnt == UINT_MAX)) { + /* + * The I/O count is limited by the number of + * LWPs (which will never overflow this) -- + * unless one driver uses another driver via + * specfs, which is rather unusual, but which + * could happen via pud(4) userspace drivers. + * We could use a 64-bit count, but can't use + * atomics for that on all platforms. + * (Probably better to switch to psref or + * localcount instead.) + */ + error = EBUSY; + goto out; + } + } while (atomic_cas_uint(&sn->sn_dev->sd_iocnt, iocnt, iocnt + 1) + != iocnt); + + /* Success! */ + *snp = sn; + *devp = dev; + error = 0; + +out: mutex_exit(vp->v_interlock); + return error; +} + +/* + * spec_io_exit(vp, sn) + * + * Exit an operation entered with a successful spec_io_enter -- + * allow concurrent spec_node_revoke to proceed. The argument sn + * must match the struct specnode pointer returned by spec_io_exit + * for vp. + */ +static void +spec_io_exit(struct vnode *vp, struct specnode *sn) +{ + struct specdev *sd = sn->sn_dev; + unsigned iocnt; + + KASSERT(vp->v_specnode == sn); + + /* + * We are done. Notify spec_close if appropriate. The + * transition of 1 -> 0 must happen under device_lock so + * spec_close doesn't miss a wakeup. 
+ */ + do { + iocnt = atomic_load_relaxed(&sd->sd_iocnt); + KASSERT(iocnt > 0); + if (iocnt == 1) { + mutex_enter(&device_lock); + if (atomic_dec_uint_nv(&sd->sd_iocnt) == 0) + cv_broadcast(&specfs_iocv); + mutex_exit(&device_lock); + break; + } + } while (atomic_cas_uint(&sd->sd_iocnt, iocnt, iocnt - 1) != iocnt); +} + +/* + * spec_io_drain(sd) + * + * Wait for all existing spec_io_enter/exit sections to complete. + * Caller must ensure spec_io_enter will fail at this point. + */ +static void +spec_io_drain(struct specdev *sd) +{ + + /* + * I/O at the same time as closing is unlikely -- it often + * indicates an application bug. + */ + if (__predict_true(atomic_load_relaxed(&sd->sd_iocnt) == 0)) + return; + + mutex_enter(&device_lock); + while (atomic_load_relaxed(&sd->sd_iocnt) > 0) + cv_wait(&specfs_iocv, &device_lock); + mutex_exit(&device_lock); } /* @@ -258,6 +395,7 @@ spec_node_init(vnode_t *vp, dev_t rdev) sd->sd_refcnt = 1; sd->sd_opencnt = 0; sd->sd_bdevvp = NULL; + sd->sd_iocnt = 0; sd->sd_opened = false; sn->sn_dev = sd; sd = NULL; @@ -473,6 +611,7 @@ spec_node_destroy(vnode_t *vp) /* If the device is no longer in use, destroy our record. 
*/ if (refcnt == 1) { + KASSERT(sd->sd_iocnt == 0); KASSERT(sd->sd_opencnt == 0); KASSERT(sd->sd_bdevvp == NULL); kmem_free(sd, sizeof(*sd)); @@ -510,30 +649,28 @@ spec_open(void *v) int a_mode; kauth_cred_t a_cred; } */ *ap = v; - struct lwp *l; - struct vnode *vp; + struct lwp *l = curlwp; + struct vnode *vp = ap->a_vp; dev_t dev; int error; enum kauth_device_req req; specnode_t *sn; specdev_t *sd; spec_ioctl_t ioctl; - u_int gen; - const char *name; + u_int gen = 0; + const char *name = NULL; bool needclose = false; struct partinfo pi; - - l = curlwp; - vp = ap->a_vp; - dev = vp->v_rdev; - sn = vp->v_specnode; - sd = sn->sn_dev; - name = NULL; - gen = 0; + KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE || + (vp->v_vflag & VV_LOCKSWORK) == 0); KASSERTMSG(vp->v_type == VBLK || vp->v_type == VCHR, "type=%d", vp->v_type); + dev = vp->v_rdev; + sn = vp->v_specnode; + sd = sn->sn_dev; + /* * Don't allow open if fs is mounted -nodev. */ @@ -798,6 +935,8 @@ spec_read(void *v) struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct lwp *l = curlwp; + struct specnode *sn; + dev_t dev; struct buf *bp; daddr_t bn; int bsize, bscale; @@ -820,9 +959,27 @@ spec_read(void *v) switch (vp->v_type) { case VCHR: + /* + * Release the lock while we sleep -- possibly + * indefinitely, if this is, e.g., a tty -- in + * cdev_read, so we don't hold up everything else that + * might want access to the vnode. + * + * But before we issue the read, take an I/O reference + * to the specnode so close will know when we're done + * reading. Note that the moment we release the lock, + * the vnode's identity may change; hence spec_io_enter + * may fail, and the caller may have a dead vnode on + * their hands, if the file system on which vp lived + * has been unmounted. 
+ */ VOP_UNLOCK(vp); - error = cdev_read(vp->v_rdev, uio, ap->a_ioflag); - vn_lock(vp, LK_SHARED | LK_RETRY); + error = spec_io_enter(vp, &sn, &dev); + if (error) + goto out; + error = cdev_read(dev, uio, ap->a_ioflag); + spec_io_exit(vp, sn); +out: vn_lock(vp, LK_SHARED | LK_RETRY); return (error); case VBLK: @@ -899,6 +1056,8 @@ spec_write(void *v) struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct lwp *l = curlwp; + struct specnode *sn; + dev_t dev; struct buf *bp; daddr_t bn; int bsize, bscale; @@ -914,9 +1073,27 @@ spec_write(void *v) switch (vp->v_type) { case VCHR: + /* + * Release the lock while we sleep -- possibly + * indefinitely, if this is, e.g., a tty -- in + * cdev_write, so we don't hold up everything else that + * might want access to the vnode. + * + * But before we issue the write, take an I/O reference + * to the specnode so close will know when we're done + * writing. Note that the moment we release the lock, + * the vnode's identity may change; hence spec_io_enter + * may fail, and the caller may have a dead vnode on + * their hands, if the file system on which vp lived + * has been unmounted. 
+ */ VOP_UNLOCK(vp); - error = cdev_write(vp->v_rdev, uio, ap->a_ioflag); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + error = spec_io_enter(vp, &sn, &dev); + if (error) + goto out; + error = cdev_write(dev, uio, ap->a_ioflag); + spec_io_exit(vp, sn); +out: vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); return (error); case VBLK: @@ -974,10 +1151,12 @@ spec_fdiscard(void *v) off_t a_pos; off_t a_len; } */ *ap = v; - struct vnode *vp; + struct vnode *vp = ap->a_vp; dev_t dev; - vp = ap->a_vp; + KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE || + (vp->v_vflag & VV_LOCKSWORK) == 0); + dev = vp->v_rdev; switch (vp->v_type) { @@ -1007,40 +1186,32 @@ spec_ioctl(void *v) int a_fflag; kauth_cred_t a_cred; } */ *ap = v; - struct vnode *vp; + struct vnode *vp = ap->a_vp; + struct specnode *sn; dev_t dev; + int error; - /* - * Extract all the info we need from the vnode, taking care to - * avoid a race with VOP_REVOKE(). - */ - - vp = ap->a_vp; - dev = NODEV; - mutex_enter(vp->v_interlock); - if (vdead_check(vp, VDEAD_NOWAIT) == 0 && vp->v_specnode) { - dev = vp->v_rdev; - } - mutex_exit(vp->v_interlock); - if (dev == NODEV) { - return ENXIO; - } + error = spec_io_enter(vp, &sn, &dev); + if (error) + return error; switch (vp->v_type) { - case VCHR: - return cdev_ioctl(dev, ap->a_command, ap->a_data, + error = cdev_ioctl(dev, ap->a_command, ap->a_data, ap->a_fflag, curlwp); - + break; case VBLK: KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp); - return bdev_ioctl(dev, ap->a_command, ap->a_data, + error = bdev_ioctl(dev, ap->a_command, ap->a_data, ap->a_fflag, curlwp); - + break; default: panic("spec_ioctl"); /* NOTREACHED */ } + + spec_io_exit(vp, sn); + return error; } /* ARGSUSED */ @@ -1051,33 +1222,25 @@ spec_poll(void *v) struct vnode *a_vp; int a_events; } */ *ap = v; - struct vnode *vp; + struct vnode *vp = ap->a_vp; + struct specnode *sn; dev_t dev; + int revents; - /* - * Extract all the info we need from the vnode, taking care to - * avoid a race with VOP_REVOKE(). 
- */ - - vp = ap->a_vp; - dev = NODEV; - mutex_enter(vp->v_interlock); - if (vdead_check(vp, VDEAD_NOWAIT) == 0 && vp->v_specnode) { - dev = vp->v_rdev; - } - mutex_exit(vp->v_interlock); - if (dev == NODEV) { + if (spec_io_enter(vp, &sn, &dev) != 0) return POLLERR; - } switch (vp->v_type) { - case VCHR: - return cdev_poll(dev, ap->a_events, curlwp); - + revents = cdev_poll(dev, ap->a_events, curlwp); + break; default: - return (genfs_poll(v)); + revents = genfs_poll(v); + break; } + + spec_io_exit(vp, sn); + return revents; } /* ARGSUSED */ @@ -1088,20 +1251,30 @@ spec_kqfilter(void *v) struct vnode *a_vp; struct proc *a_kn; } */ *ap = v; + struct vnode *vp = ap->a_vp; + struct specnode *sn; dev_t dev; + int error; - switch (ap->a_vp->v_type) { + error = spec_io_enter(vp, &sn, &dev); + if (error) + return error; + switch (vp->v_type) { case VCHR: - dev = ap->a_vp->v_rdev; - return cdev_kqfilter(dev, ap->a_kn); + error = cdev_kqfilter(dev, ap->a_kn); + break; default: /* * Block devices don't support kqfilter, and refuse it * for any other files (like those vflush()ed) too. */ - return (EOPNOTSUPP); + error = EOPNOTSUPP; + break; } + + spec_io_exit(vp, sn); + return error; } /* @@ -1116,11 +1289,19 @@ spec_mmap(void *v) kauth_cred_t a_cred; } */ *ap = v; struct vnode *vp = ap->a_vp; + struct specnode *sn; + dev_t dev; + int error; KASSERT(vp->v_type == VBLK); - if (bdev_type(vp->v_rdev) != D_DISK) - return EINVAL; + error = spec_io_enter(vp, &sn, &dev); + if (error) + return error; + + error = bdev_type(dev) == D_DISK ? 0 : EINVAL; + + spec_io_exit(vp, sn); return 0; } @@ -1165,27 +1346,14 @@ spec_strategy(void *v) } */ *ap = v; struct vnode *vp = ap->a_vp; struct buf *bp = ap->a_bp; + struct specnode *sn = NULL; dev_t dev; int error; - dev = NODEV; - - /* - * Extract all the info we need from the vnode, taking care to - * avoid a race with VOP_REVOKE(). 
- */ - - mutex_enter(vp->v_interlock); - if (vdead_check(vp, VDEAD_NOWAIT) == 0 && vp->v_specnode != NULL) { - KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp); - dev = vp->v_rdev; - } - mutex_exit(vp->v_interlock); - - if (dev == NODEV) { - error = ENXIO; + error = spec_io_enter(vp, &sn, &dev); + if (error) goto out; - } + bp->b_dev = dev; if (!(bp->b_flags & B_READ)) { @@ -1205,13 +1373,15 @@ spec_strategy(void *v) } bdev_strategy(bp); - return 0; - -out: - bp->b_error = error; - bp->b_resid = bp->b_bcount; - biodone(bp); + error = 0; +out: if (sn) + spec_io_exit(vp, sn); + if (error) { + bp->b_error = error; + bp->b_resid = bp->b_bcount; + biodone(bp); + } return error; } @@ -1282,14 +1452,18 @@ spec_close(void *v) } */ *ap = v; struct vnode *vp = ap->a_vp; struct session *sess; - dev_t dev = vp->v_rdev; + dev_t dev; int flags = ap->a_fflag; int mode, error, count; specnode_t *sn; specdev_t *sd; + KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE || + (vp->v_vflag & VV_LOCKSWORK) == 0); + mutex_enter(vp->v_interlock); sn = vp->v_specnode; + dev = vp->v_rdev; sd = sn->sn_dev; /* * If we're going away soon, make this non-blocking. @@ -1422,6 +1596,12 @@ spec_close(void *v) else error = cdev_close(dev, flags, mode, curlwp); + /* + * Wait for all other devsw operations to drain. After this + * point, no bdev/cdev_* can be active for this specdev. 
+ */ + spec_io_drain(sd); + if (!(flags & FNONBLOCK)) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); diff --git a/sys/miscfs/specfs/specdev.h b/sys/miscfs/specfs/specdev.h index 018679ffaef2..91e7af9bf09b 100644 --- a/sys/miscfs/specfs/specdev.h +++ b/sys/miscfs/specfs/specdev.h @@ -78,6 +78,7 @@ typedef struct specdev { u_int sd_opencnt; /* # of opens; close when ->0 */ u_int sd_refcnt; /* # of specnodes referencing this */ dev_t sd_rdev; + volatile u_int sd_iocnt; /* # bdev/cdev_* operations active */ bool sd_opened; /* true if successfully opened */ } specdev_t; From b980ff0ff3a64116ca118c1477961434403e4a65 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Wed, 19 Jan 2022 12:10:55 +0000 Subject: [PATCH 27/37] specfs: Take an I/O reference in spec_node_setmountedfs. This is not quite correct. We _should_ require the caller to hold a vnode lock around spec_node_getmountedfs, and an exclusive vnode lock around spec_node_setmountedfs, so that it is only necessary to check whether revoke has already happened, not hold an I/O reference. Unfortunately, various callers in various file systems don't follow this sensible rule. So let's at least make sure the vnode can't be revoked in spec_node_setmountedfs, while we're in bdev_ioctl, and leave a comment explaining what the sorry state of affairs is and how to fix it later. --- sys/miscfs/specfs/spec_vnops.c | 43 ++++++++++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 5 deletions(-) diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c index e5b46a4a4be1..2a9523c57cee 100644 --- a/sys/miscfs/specfs/spec_vnops.c +++ b/sys/miscfs/specfs/spec_vnops.c @@ -498,6 +498,11 @@ spec_node_lookup_by_mount(struct mount *mp, vnode_t **vpp) /* * Get the file system mounted on this block device. + * + * XXX Caller should hold the vnode lock -- shared or exclusive -- so + * that this can't changed, and the vnode can't be revoked while we + * examine it. 
But not all callers do, and they're scattered through a + * lot of file systems, so we can't assert this yet. */ struct mount * spec_node_getmountedfs(vnode_t *devvp) @@ -512,23 +517,51 @@ spec_node_getmountedfs(vnode_t *devvp) /* * Set the file system mounted on this block device. + * + * XXX Caller should hold the vnode lock exclusively so this can't be + * changed or assumed by spec_node_getmountedfs while we change it, and + * the vnode can't be revoked while we handle it. But not all callers + * do, and they're scattered through a lot of file systems, so we can't + * assert this yet. Instead, for now, we'll take an I/O reference so + * at least the ioctl doesn't race with revoke/detach. + * + * If you do change this to assert an exclusive vnode lock, you must + * also do vdead_check before trying bdev_ioctl, because the vnode may + * have been revoked by the time the caller locked it, and this is + * _not_ a vop -- calls to spec_node_setmountedfs don't go through + * v_op, so revoking the vnode doesn't prevent further calls. + * + * XXX Caller should additionally have the vnode open, at least if mp + * is nonnull, but I'm not sure all callers do that -- need to audit. + * Currently udf closes the vnode before clearing the mount. 
*/ void spec_node_setmountedfs(vnode_t *devvp, struct mount *mp) { struct dkwedge_info dkw; + struct specnode *sn; + dev_t dev; + int error; KASSERT(devvp->v_type == VBLK); - KASSERT(devvp->v_specnode->sn_dev->sd_mountpoint == NULL || mp == NULL); - devvp->v_specnode->sn_dev->sd_mountpoint = mp; - if (mp == NULL) - return; - if (bdev_ioctl(devvp->v_rdev, DIOCGWEDGEINFO, &dkw, FREAD, curlwp) != 0) + error = spec_io_enter(devvp, &sn, &dev); + if (error) return; + KASSERT(sn->sn_dev->sd_mountpoint == NULL || mp == NULL); + sn->sn_dev->sd_mountpoint = mp; + if (mp == NULL) + goto out; + + error = bdev_ioctl(dev, DIOCGWEDGEINFO, &dkw, FREAD, curlwp); + if (error) + goto out; + strlcpy(mp->mnt_stat.f_mntfromlabel, dkw.dkw_wname, sizeof(mp->mnt_stat.f_mntfromlabel)); + +out: spec_io_exit(devvp, sn); } /* From 29001f4ccb79135d416082bc30ad40ae3080e394 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Sun, 23 Jan 2022 19:41:12 +0000 Subject: [PATCH 28/37] specfs: Prevent new opens while close is waiting to drain. Otherwise, bdev/cdev_close could have cancelled all _existing_ opens, and waited for them to complete (and freed resources used by them) -- but a new one could start, and hang (e.g., a tty), at the same time spec_close tries to drain all pending I/O operations, one of which (the new open) is now hanging indefinitely. Preventing the new open from even starting until bdev/cdev_close is finished and all I/O operations have drained avoids this deadlock. 
--- sys/miscfs/specfs/spec_vnops.c | 28 ++++++++++++++++++++++++++-- sys/miscfs/specfs/specdev.h | 1 + 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c index 2a9523c57cee..e14ad6e61609 100644 --- a/sys/miscfs/specfs/spec_vnops.c +++ b/sys/miscfs/specfs/spec_vnops.c @@ -397,6 +397,7 @@ spec_node_init(vnode_t *vp, dev_t rdev) sd->sd_bdevvp = NULL; sd->sd_iocnt = 0; sd->sd_opened = false; + sd->sd_closing = false; sn->sn_dev = sd; sd = NULL; } else { @@ -736,8 +737,17 @@ spec_open(void *v) case VCHR: /* * Character devices can accept opens from multiple - * vnodes. + * vnodes. But first, wait for any close to finish. + * Wait under the vnode lock so we don't have to worry + * about the vnode being revoked while we wait. */ + while (sd->sd_closing) { + error = cv_wait_sig(&specfs_iocv, &device_lock); + if (error) + break; + } + if (error) + break; sd->sd_opencnt++; sn->sn_opencnt++; break; @@ -1609,8 +1619,10 @@ spec_close(void *v) count + 1); sd->sd_bdevvp = NULL; } - if (count == 0) + if (count == 0) { sd->sd_opened = false; + sd->sd_closing = true; + } mutex_exit(&device_lock); if (count != 0) @@ -1635,6 +1647,18 @@ spec_close(void *v) */ spec_io_drain(sd); + /* + * Wake any spec_open calls waiting for close to finish -- do + * this before reacquiring the vnode lock, because spec_open + * holds the vnode lock while waiting, so doing this after + * reacquiring the lock would deadlock. 
+ */ + mutex_enter(&device_lock); + KASSERT(sd->sd_closing); + sd->sd_closing = false; + cv_broadcast(&specfs_iocv); + mutex_exit(&device_lock); + if (!(flags & FNONBLOCK)) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); diff --git a/sys/miscfs/specfs/specdev.h b/sys/miscfs/specfs/specdev.h index 91e7af9bf09b..8b44cf4339f4 100644 --- a/sys/miscfs/specfs/specdev.h +++ b/sys/miscfs/specfs/specdev.h @@ -80,6 +80,7 @@ typedef struct specdev { dev_t sd_rdev; volatile u_int sd_iocnt; /* # bdev/cdev_* operations active */ bool sd_opened; /* true if successfully opened */ + bool sd_closing; /* true when bdev/cdev_close ongoing */ } specdev_t; /* From c092cf7f3fd4da117345517f84ef6c3e373b914a Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Mon, 31 Jan 2022 16:28:10 +0000 Subject: [PATCH 29/37] specfs: Wait for last close in spec_node_revoke. Otherwise, revoke -- and vdevgone, in the detach path of removable devices -- may complete while I/O operations are still running concurrently. --- sys/miscfs/specfs/spec_vnops.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c index e14ad6e61609..dd25f68c52ea 100644 --- a/sys/miscfs/specfs/spec_vnops.c +++ b/sys/miscfs/specfs/spec_vnops.c @@ -598,6 +598,16 @@ spec_node_revoke(vnode_t *vp) mutex_enter(&device_lock); KASSERT(sn->sn_opencnt == 0); } + + /* + * We may have revoked the vnode in this thread while another + * thread was in the middle of spec_close, in the window when + * spec_close releases the vnode lock to call .d_close for the + * last close. In that case, wait for the concurrent + * spec_close to complete. + */ + while (sd->sd_closing) + cv_wait(&specfs_iocv, &device_lock); mutex_exit(&device_lock); } From 136faa862d9e94e04cab41b0f699daf4f867cdbd Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Sun, 23 Jan 2022 19:30:41 +0000 Subject: [PATCH 30/37] specfs: Take an I/O reference across bdev/cdev_open. 
- Revoke is used to invalidate all prior access control checks when device permissions are changing, so it must wait for .d_open to exit so any new access must go through new access control checks. - Revoke is used by vdevgone in xyz_detach to wait until all use of the driver's data structures have completed before xyz_detach frees them. So we need to make sure spec_close waits for .d_open too. --- sys/miscfs/specfs/spec_vnops.c | 39 ++++++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 7 deletions(-) diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c index dd25f68c52ea..0a4fcc145a84 100644 --- a/sys/miscfs/specfs/spec_vnops.c +++ b/sys/miscfs/specfs/spec_vnops.c @@ -695,10 +695,10 @@ spec_open(void *v) } */ *ap = v; struct lwp *l = curlwp; struct vnode *vp = ap->a_vp; - dev_t dev; + dev_t dev, dev1; int error; enum kauth_device_req req; - specnode_t *sn; + specnode_t *sn, *sn1; specdev_t *sd; spec_ioctl_t ioctl; u_int gen = 0; @@ -807,18 +807,34 @@ spec_open(void *v) } /* - * Open the device. If .d_open returns ENXIO (device not - * configured), the driver may not be loaded, so try - * autoloading a module and then try .d_open again if anything - * got loaded. - * * Because opening the device may block indefinitely, e.g. when * opening a tty, and loading a module may cross into many * other subsystems, we must not hold the vnode lock while * calling .d_open, so release it now and reacquire it when * done. + * + * Take an I/O reference so that any concurrent spec_close via + * spec_node_revoke will wait for us to finish calling .d_open. + * The vnode can't be dead at this point because we have it + * locked. Note that if revoked, the driver must interrupt + * .d_open before spec_close starts waiting for I/O to drain so + * this doesn't deadlock. 
*/ VOP_UNLOCK(vp); + error = spec_io_enter(vp, &sn1, &dev1); + if (error) { + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + return error; + } + KASSERT(sn1 == sn); + KASSERT(dev1 == dev); + + /* + * Open the device. If .d_open returns ENXIO (device not + * configured), the driver may not be loaded, so try + * autoloading a module and then try .d_open again if anything + * got loaded. + */ switch (vp->v_type) { case VCHR: do { @@ -873,7 +889,16 @@ spec_open(void *v) default: __unreachable(); } + + /* + * Release the I/O reference now that we have called .d_open, + * and reacquire the vnode lock. At this point, the device may + * have been revoked, so we must tread carefully. However, sn + * and sd remain valid pointers until we drop our reference. + */ + spec_io_exit(vp, sn); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + KASSERT(vp->v_specnode == sn); /* * If it has been revoked since we released the vnode lock and From fe980700acdc888d397276846321ee122f5d8a0c Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Mon, 31 Jan 2022 01:22:08 +0000 Subject: [PATCH 31/37] specfs: Assert opencnt is nonzero before decrementing. --- sys/miscfs/specfs/spec_vnops.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c index 0a4fcc145a84..b4aea28ff948 100644 --- a/sys/miscfs/specfs/spec_vnops.c +++ b/sys/miscfs/specfs/spec_vnops.c @@ -946,6 +946,8 @@ spec_open(void *v) KASSERT(sn->sn_opencnt == 1); needclose = true; } else { + KASSERT(sd->sd_opencnt); + KASSERT(sn->sn_opencnt); sd->sd_opencnt--; sn->sn_opencnt--; if (vp->v_type == VBLK) @@ -1647,6 +1649,8 @@ spec_close(void *v) * between open and close can use fd_clone. 
*/ mutex_enter(&device_lock); + KASSERT(sn->sn_opencnt); + KASSERT(sd->sd_opencnt); sn->sn_opencnt--; count = --sd->sd_opencnt; if (vp->v_type == VBLK) { From 3aca82d6045dcfe8d512026556f52230a3875ca6 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Sat, 29 Jan 2022 18:13:17 +0000 Subject: [PATCH 32/37] specfs: Let spec_node_lookup_by_dev wait for reclaim to finish. vdevgone relies on this to ensure that if there is a concurrent revoke in progress, it will wait for that revoke to finish -- that way, it can guarantee all I/O operations have completed and the device is closed. --- sys/coda/coda_vfsops.c | 2 +- sys/kern/vfs_mount.c | 3 ++- sys/kern/vfs_subr.c | 9 +++++++-- sys/kern/vfs_vnode.c | 3 ++- sys/miscfs/specfs/spec_vnops.c | 21 +++++++++++++++++++-- sys/miscfs/specfs/specdev.h | 2 +- 6 files changed, 32 insertions(+), 8 deletions(-) diff --git a/sys/coda/coda_vfsops.c b/sys/coda/coda_vfsops.c index 9e25704b6b2f..c0a1b98a9331 100644 --- a/sys/coda/coda_vfsops.c +++ b/sys/coda/coda_vfsops.c @@ -636,7 +636,7 @@ struct mount *devtomp(dev_t dev) struct mount *mp; struct vnode *vp; - if (spec_node_lookup_by_dev(VBLK, dev, &vp) == 0) { + if (spec_node_lookup_by_dev(VBLK, dev, VDEAD_NOWAIT, &vp) == 0) { mp = spec_node_getmountedfs(vp); vrele(vp); } else { diff --git a/sys/kern/vfs_mount.c b/sys/kern/vfs_mount.c index 2d4d6c28c292..58d3dff41d57 100644 --- a/sys/kern/vfs_mount.c +++ b/sys/kern/vfs_mount.c @@ -1372,7 +1372,8 @@ vfs_mountedon(vnode_t *vp) return ENOTBLK; if (spec_node_getmountedfs(vp) != NULL) return EBUSY; - if (spec_node_lookup_by_dev(vp->v_type, vp->v_rdev, &vq) == 0) { + if (spec_node_lookup_by_dev(vp->v_type, vp->v_rdev, VDEAD_NOWAIT, &vq) + == 0) { if (spec_node_getmountedfs(vq) != NULL) error = EBUSY; vrele(vq); diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index 10cb4e57e129..a64cffb97054 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -495,7 +495,7 @@ int vfinddev(dev_t dev, enum vtype type, vnode_t **vpp) { - return 
(spec_node_lookup_by_dev(type, dev, vpp) == 0); + return (spec_node_lookup_by_dev(type, dev, VDEAD_NOWAIT, vpp) == 0); } /* @@ -511,7 +511,12 @@ vdevgone(int maj, int minl, int minh, enum vtype type) for (mn = minl; mn <= minh; mn++) { dev = makedev(maj, mn); - while (spec_node_lookup_by_dev(type, dev, &vp) == 0) { + /* + * Passing 0 as flags, instead of VDEAD_NOWAIT, means + * spec_node_lookup_by_dev will wait for vnodes it + * finds concurrently being revoked before returning. + */ + while (spec_node_lookup_by_dev(type, dev, 0, &vp) == 0) { VOP_REVOKE(vp, REVOKEALL); vrele(vp); } diff --git a/sys/kern/vfs_vnode.c b/sys/kern/vfs_vnode.c index 48065852a6ac..58db8e0e97e7 100644 --- a/sys/kern/vfs_vnode.c +++ b/sys/kern/vfs_vnode.c @@ -1234,7 +1234,8 @@ vrevoke(vnode_t *vp) type = vp->v_type; mutex_exit(vp->v_interlock); - while (spec_node_lookup_by_dev(type, dev, &vq) == 0) { + while (spec_node_lookup_by_dev(type, dev, VDEAD_NOWAIT, &vq) + == 0) { mp = vrevoke_suspend_next(mp, vq->v_mount); vgone(vq); } diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c index b4aea28ff948..c727a42de614 100644 --- a/sys/miscfs/specfs/spec_vnops.c +++ b/sys/miscfs/specfs/spec_vnops.c @@ -424,18 +424,35 @@ spec_node_init(vnode_t *vp, dev_t rdev) * Lookup a vnode by device number and return it referenced. */ int -spec_node_lookup_by_dev(enum vtype type, dev_t dev, vnode_t **vpp) +spec_node_lookup_by_dev(enum vtype type, dev_t dev, int flags, vnode_t **vpp) { int error; vnode_t *vp; - mutex_enter(&device_lock); +top: mutex_enter(&device_lock); for (vp = specfs_hash[SPECHASH(dev)]; vp; vp = vp->v_specnext) { if (type == vp->v_type && dev == vp->v_rdev) { mutex_enter(vp->v_interlock); /* If clean or being cleaned, then ignore it. */ if (vdead_check(vp, VDEAD_NOWAIT) == 0) break; + if ((flags & VDEAD_NOWAIT) == 0) { + mutex_exit(&device_lock); + /* + * It may be being revoked as we speak, + * and the caller wants to wait until + * all revocation has completed. 
Let + * vcache_vget wait for it to finish + * dying; as a side effect, vcache_vget + * releases vp->v_interlock. Note that + * vcache_vget cannot succeed at this + * point because vdead_check already + * failed. + */ + error = vcache_vget(vp); + KASSERT(error); + goto top; + } mutex_exit(vp->v_interlock); } } diff --git a/sys/miscfs/specfs/specdev.h b/sys/miscfs/specfs/specdev.h index 8b44cf4339f4..c095cf24d2d5 100644 --- a/sys/miscfs/specfs/specdev.h +++ b/sys/miscfs/specfs/specdev.h @@ -95,7 +95,7 @@ typedef struct specdev { */ void spec_node_init(vnode_t *, dev_t); void spec_node_destroy(vnode_t *); -int spec_node_lookup_by_dev(enum vtype, dev_t, vnode_t **); +int spec_node_lookup_by_dev(enum vtype, dev_t, int, vnode_t **); int spec_node_lookup_by_mount(struct mount *, vnode_t **); struct mount *spec_node_getmountedfs(vnode_t *); void spec_node_setmountedfs(vnode_t *, struct mount *); From 77643b5190193c7039bfb9b47e62571453104708 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Mon, 31 Jan 2022 01:07:56 +0000 Subject: [PATCH 33/37] specfs: Remove specnode from hash table in spec_node_revoke. Previously, it was possible for spec_node_lookup_by_dev to handle a specnode that a concurrent spec_node_destroy is about to remove from the hash table and then free, as soon as spec_node_lookup_by_dev releases device_lock. Now, the ordering is: 1. Remove specnode from hash table in spec_node_revoke. At this point, no _new_ vnode references are possible (other than possibly one acquired by vcache_vget under v_interlock), but there may be existing ones. 2. Mark vnode reclaimed so vcache_vget will fail. 3. The last vrele (or equivalent logic in vcache_vget) will then free the specnode in spec_node_destroy. This way, _if_ a thread in spec_node_lookup_by_dev finds a specnode in the hash table under device_lock/v_interlock, _then_ it will not be freed until the thread completes vcache_vget. 
This change requires calling spec_node_revoke unconditionally for device special nodes, not just for active ones. Might introduce slightly more contention on device_lock but not much because we already have to take it in this path anyway a little later in spec_node_destroy. --- sys/kern/vfs_vnode.c | 7 ++----- sys/miscfs/specfs/spec_vnops.c | 36 +++++++++++++++++----------------- 2 files changed, 20 insertions(+), 23 deletions(-) diff --git a/sys/kern/vfs_vnode.c b/sys/kern/vfs_vnode.c index 58db8e0e97e7..ee365aaaba08 100644 --- a/sys/kern/vfs_vnode.c +++ b/sys/kern/vfs_vnode.c @@ -1814,7 +1814,7 @@ vcache_reclaim(vnode_t *vp) uint32_t hash; uint8_t temp_buf[64], *temp_key; size_t temp_key_len; - bool recycle, active; + bool recycle; int error; KASSERT((vp->v_vflag & VV_LOCKSWORK) == 0 || @@ -1822,7 +1822,6 @@ vcache_reclaim(vnode_t *vp) KASSERT(mutex_owned(vp->v_interlock)); KASSERT(vrefcnt(vp) != 0); - active = (vrefcnt(vp) > 1); temp_key_len = vip->vi_key.vk_key_len; /* * Prevent the vnode from being recycled or brought into use @@ -1865,8 +1864,6 @@ vcache_reclaim(vnode_t *vp) /* * Clean out any cached data associated with the vnode. - * If purging an active vnode, it must be closed and - * deactivated before being reclaimed. 
*/ error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0); if (error != 0) { @@ -1876,7 +1873,7 @@ vcache_reclaim(vnode_t *vp) } KASSERTMSG((error == 0), "vinvalbuf failed: %d", error); KASSERT((vp->v_iflag & VI_ONWORKLST) == 0); - if (active && (vp->v_type == VBLK || vp->v_type == VCHR)) { + if (vp->v_type == VBLK || vp->v_type == VCHR) { spec_node_revoke(vp); } diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c index c727a42de614..dbfe885ddd88 100644 --- a/sys/miscfs/specfs/spec_vnops.c +++ b/sys/miscfs/specfs/spec_vnops.c @@ -591,6 +591,7 @@ spec_node_revoke(vnode_t *vp) { specnode_t *sn; specdev_t *sd; + struct vnode **vpp; KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE || (vp->v_vflag & VV_LOCKSWORK) == 0); @@ -604,10 +605,10 @@ spec_node_revoke(vnode_t *vp) mutex_enter(&device_lock); KASSERT(sn->sn_opencnt <= sd->sd_opencnt); + sn->sn_gone = true; if (sn->sn_opencnt != 0) { sd->sd_opencnt -= (sn->sn_opencnt - 1); sn->sn_opencnt = 1; - sn->sn_gone = true; mutex_exit(&device_lock); VOP_CLOSE(vp, FNONBLOCK, NOCRED); @@ -625,6 +626,22 @@ spec_node_revoke(vnode_t *vp) */ while (sd->sd_closing) cv_wait(&specfs_iocv, &device_lock); + + /* + * Remove from the hash so lookups stop returning this + * specnode. We will dissociate it from the specdev -- and + * possibly free the specdev -- in spec_node_destroy. + */ + KASSERT(sn->sn_gone); + KASSERT(sn->sn_opencnt == 0); + for (vpp = &specfs_hash[SPECHASH(vp->v_rdev)];; + vpp = &(*vpp)->v_specnext) { + if (*vpp == vp) { + *vpp = vp->v_specnext; + vp->v_specnext = NULL; + break; + } + } mutex_exit(&device_lock); } @@ -637,7 +654,6 @@ spec_node_destroy(vnode_t *vp) { specnode_t *sn; specdev_t *sd; - vnode_t **vpp, *vp2; int refcnt; sn = vp->v_specnode; @@ -648,22 +664,6 @@ spec_node_destroy(vnode_t *vp) KASSERT(sn->sn_opencnt == 0); mutex_enter(&device_lock); - /* Remove from the hash and destroy the node. 
*/ - vpp = &specfs_hash[SPECHASH(vp->v_rdev)]; - for (vp2 = *vpp;; vp2 = vp2->v_specnext) { - if (vp2 == NULL) { - panic("spec_node_destroy: corrupt hash"); - } - if (vp2 == vp) { - KASSERT(vp == *vpp); - *vpp = vp->v_specnext; - break; - } - if (vp2->v_specnext == vp) { - vp2->v_specnext = vp->v_specnext; - break; - } - } sn = vp->v_specnode; vp->v_specnode = NULL; refcnt = sd->sd_refcnt--; From d985e24779674ccce502a81963a66a82b30a47bf Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Wed, 2 Feb 2022 01:20:55 +0000 Subject: [PATCH 34/37] specfs: Reorder struct specnode members to save padding. Shrinks from 40 bytes to 32 bytes on LP64 systems this way. --- sys/miscfs/specfs/specdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sys/miscfs/specfs/specdev.h b/sys/miscfs/specfs/specdev.h index c095cf24d2d5..8bb9b4de2b2a 100644 --- a/sys/miscfs/specfs/specdev.h +++ b/sys/miscfs/specfs/specdev.h @@ -66,8 +66,8 @@ typedef struct specnode { vnode_t *sn_next; struct specdev *sn_dev; - u_int sn_opencnt; /* # of opens, share of sd_opencnt */ dev_t sn_rdev; + u_int sn_opencnt; /* # of opens, share of sd_opencnt */ bool sn_gone; } specnode_t; From 8ccb6e8d12e84d84e4b86be38253c8c91469c653 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Thu, 20 Jan 2022 23:24:42 +0000 Subject: [PATCH 35/37] autoconf(9): New function config_detach_commit. When a driver's .ca_detach function has committed to detaching -- it definitely won't back out with EBUSY, for instance -- it can call this to wake all pending calls to device_lookup_acquire and make them fail immediately. This is necessary to break a deadlock if the device_lookup_acquire calls happen inside I/O operations which the driver's .ca_detach function waits for the completion of -- without config_detach_commit, I/O operations would be stuck in device_lookup_acquire waiting for .ca_detach and .ca_detach would be stuck waiting for I/O operations to return. 
Most drivers won't need to call this: for autoconf drivers used the traditional way by devsw for userland device nodes, the .ca_detach routine uses vdevgone, and we will arrange to make vdevgone call config_detach_commit automagically in such drivers anyway. XXX kernel ABI change to struct device requires bump -- later change will make struct device opaque to ABI, but we're not there yet --- sys/kern/subr_autoconf.c | 45 ++++++++++++++++++++++++++++++++++------ sys/sys/device.h | 2 ++ 2 files changed, 41 insertions(+), 6 deletions(-) diff --git a/sys/kern/subr_autoconf.c b/sys/kern/subr_autoconf.c index 74f71dca0bcf..03afc7f54b55 100644 --- a/sys/kern/subr_autoconf.c +++ b/sys/kern/subr_autoconf.c @@ -2045,10 +2045,17 @@ config_detach(device_t dev, int flags) * If it was possible to detach the device, ensure that the * device is deactivated. */ - if (rv == 0) - dev->dv_flags &= ~DVF_ACTIVE; - else if ((flags & DETACH_FORCE) == 0) { - /* Detach failed -- likely EBUSY. */ + if (rv == 0) { + config_detach_commit(dev); + dev->dv_flags &= ~DVF_ACTIVE; /* XXXSMP */ + } else if ((flags & DETACH_FORCE) == 0) { + /* + * Detach failed -- likely EBUSY. Driver must not have + * called config_detach_commit. + */ + KASSERTMSG(!dev->dv_detached, + "%s committed to detaching and then backed out", + device_xname(dev)); goto out; } else { panic("config_detach: forced detach of %s failed (%d)", @@ -2065,7 +2072,8 @@ config_detach(device_t dev, int flags) * responsibility of .ca_detach to ensure anything with open * references will be interrupted and release them promptly, * not block indefinitely. All new attempts to acquire - * references will block until dv_detaching clears. + * references will fail, as config_detach_commit has arranged + * by now. 
*/ mutex_enter(&config_misc_lock); localcount_drain(dev->dv_localcount, @@ -2139,6 +2147,30 @@ out: return rv; } +/* + * config_detach_commit(dev) + * + * Issued by a driver's .ca_detach routine to notify anyone + * waiting in device_lookup_acquire that the driver is committed + * to detaching the device, which allows device_lookup_acquire to + * wake up and fail immediately. + * + * Safe to call multiple times -- idempotent. Must be called + * during config_detach_enter/exit. Safe to use with + * device_lookup because the device is not actually removed from + * the table until after config_detach_exit. + */ +void +config_detach_commit(device_t dev) +{ + + mutex_enter(&config_misc_lock); + KASSERT(dev->dv_detaching == curlwp); + dev->dv_detached = true; + cv_broadcast(&config_misc_cv); + mutex_exit(&config_misc_lock); +} + int config_detach_children(device_t parent, int flags) { @@ -2646,7 +2678,8 @@ device_lookup_acquire(cfdriver_t cd, int unit) mutex_enter(&alldevs_lock); retry: if (unit < 0 || unit >= cd->cd_ndevs || (dv = cd->cd_devs[unit]) == NULL || - dv->dv_del_gen != 0) { + dv->dv_del_gen != 0 || + dv->dv_detached) { dv = NULL; } else { /* diff --git a/sys/sys/device.h b/sys/sys/device.h index 9f95c98ffa42..cfd324a1ac2d 100644 --- a/sys/sys/device.h +++ b/sys/sys/device.h @@ -281,6 +281,7 @@ struct device { struct lwp *dv_attaching; /* thread not yet finished in attach */ struct lwp *dv_detaching; /* detach lock (config_misc_lock/cv) */ + bool dv_detached; /* config_misc_lock */ size_t dv_activity_count; void (**dv_activity_handlers)(device_t, devactive_t); @@ -631,6 +632,7 @@ device_t config_attach_pseudo(cfdata_t); int config_detach(device_t, int); int config_detach_children(device_t, int flags); +void config_detach_commit(device_t); bool config_detach_all(int); int config_deactivate(device_t); void config_defer(device_t, void (*)(device_t)); From d8990b16e2c5174ace38ceeb2eca5322b0ba9913 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Mon, 31 Jan 
2022 22:56:03 +0000 Subject: [PATCH 36/37] autoconf(9): Disentangle slightly circuitous config_detach logic. No functional change intended. --- sys/kern/subr_autoconf.c | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/sys/kern/subr_autoconf.c b/sys/kern/subr_autoconf.c index 03afc7f54b55..5c1c865785f8 100644 --- a/sys/kern/subr_autoconf.c +++ b/sys/kern/subr_autoconf.c @@ -2029,6 +2029,11 @@ config_detach(device_t dev, int flags) alldevs_nwrite++; mutex_exit(&alldevs_lock); + /* + * Call the driver's .ca_detach function, unless it has none or + * we are skipping it because it's unforced shutdown time and + * the driver didn't ask to detach on shutdown. + */ if (!detachall && (flags & (DETACH_SHUTDOWN|DETACH_FORCE)) == DETACH_SHUTDOWN && (dev->dv_flags & DVF_DETACH_SHUTDOWN) == 0) { @@ -2041,31 +2046,39 @@ config_detach(device_t dev, int flags) /* * If it was not possible to detach the device, then we either * panic() (for the forced but failed case), or return an error. - * - * If it was possible to detach the device, ensure that the - * device is deactivated. */ - if (rv == 0) { - config_detach_commit(dev); - dev->dv_flags &= ~DVF_ACTIVE; /* XXXSMP */ - } else if ((flags & DETACH_FORCE) == 0) { + if (rv) { /* - * Detach failed -- likely EBUSY. Driver must not have - * called config_detach_commit. + * Detach failed -- likely EOPNOTSUPP or EBUSY. Driver + * must not have called config_detach_commit. */ KASSERTMSG(!dev->dv_detached, "%s committed to detaching and then backed out", device_xname(dev)); + if (flags & DETACH_FORCE) { + panic("config_detach: forced detach of %s failed (%d)", + device_xname(dev), rv); + } goto out; - } else { - panic("config_detach: forced detach of %s failed (%d)", - device_xname(dev), rv); } /* * The device has now been successfully detached. */ + /* + * If .ca_detach didn't commit to detach, then do that for it. 
+ * This wakes any pending device_lookup_acquire calls so they + * will fail. + */ + config_detach_commit(dev); + + /* + * If it was possible to detach the device, ensure that the + * device is deactivated. + */ + dev->dv_flags &= ~DVF_ACTIVE; /* XXXSMP */ + /* * Wait for all device_lookup_acquire references -- mostly, for * all attempts to open the device -- to drain. It is the From ea1643a73dc31e7901a777dddeef221148c61390 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Mon, 31 Jan 2022 14:26:43 +0000 Subject: [PATCH 37/37] driver(9): Make vdevgone call config_detach_commit if appropriate. Make sure to do this before spec_node_lookup_by_dev -- that might wait for a concurrent revoke to complete, which in turn might wait for a concurrent open to complete, which in turn might be waiting for the device to commit to detaching. --- sys/kern/subr_devsw.c | 36 ++++++++++++++++++++++++++++++++++++ sys/kern/vfs_subr.c | 14 ++++++++++++++ sys/sys/conf.h | 4 ++++ 3 files changed, 54 insertions(+) diff --git a/sys/kern/subr_devsw.c b/sys/kern/subr_devsw.c index d24480114f94..3087606aebf2 100644 --- a/sys/kern/subr_devsw.c +++ b/sys/kern/subr_devsw.c @@ -1068,6 +1068,24 @@ bdev_discard(dev_t dev, off_t pos, off_t len) return rv; } +void +bdev_detached(dev_t dev) +{ + const struct bdevsw *d; + device_t dv; + int unit; + + if ((d = bdevsw_lookup(dev)) == NULL) + return; + if (d->d_devtounit == NULL) + return; + if ((unit = (*d->d_devtounit)(dev)) == -1) + return; + if ((dv = device_lookup(d->d_cfdriver, unit)) == NULL) + return; + config_detach_commit(dv); +} + int cdev_open(dev_t dev, int flag, int devtype, lwp_t *l) { @@ -1288,6 +1306,24 @@ cdev_type(dev_t dev) return d->d_flag & D_TYPEMASK; } +void +cdev_detached(dev_t dev) +{ + const struct cdevsw *d; + device_t dv; + int unit; + + if ((d = cdevsw_lookup(dev)) == NULL) + return; + if (d->d_devtounit == NULL) + return; + if ((unit = (*d->d_devtounit)(dev)) == -1) + return; + if ((dv = device_lookup(d->d_cfdriver, 
unit)) == NULL) + return; + config_detach_commit(dv); +} + /* * nommap(dev, off, prot) * diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index a64cffb97054..f6504733960d 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -511,6 +511,20 @@ vdevgone(int maj, int minl, int minh, enum vtype type) for (mn = minl; mn <= minh; mn++) { dev = makedev(maj, mn); + /* + * Notify anyone trying to get at this device that it + * has been detached, and then revoke it. + */ + switch (type) { + case VBLK: + bdev_detached(dev); + break; + case VCHR: + cdev_detached(dev); + break; + default: + panic("invalid specnode type: %d", type); + } /* * Passing 0 as flags, instead of VDEAD_NOWAIT, means * spec_node_lookup_by_dev will wait for vnodes it diff --git a/sys/sys/conf.h b/sys/sys/conf.h index 16dd87e5480c..210a237b8c43 100644 --- a/sys/sys/conf.h +++ b/sys/sys/conf.h @@ -172,6 +172,8 @@ dev_type_dump(bdev_dump); dev_type_size(bdev_size); dev_type_discard(bdev_discard); +void bdev_detached(dev_t); + dev_type_open(cdev_open); dev_type_close(cdev_close); dev_type_read(cdev_read); @@ -184,6 +186,8 @@ dev_type_mmap(cdev_mmap); dev_type_kqfilter(cdev_kqfilter); dev_type_discard(cdev_discard); +void cdev_detached(dev_t); + int cdev_type(dev_t); int cdev_flags(dev_t); int bdev_type(dev_t);