diff --git a/external/cddl/osnet/dev/dtrace/dtrace_modevent.c b/external/cddl/osnet/dev/dtrace/dtrace_modevent.c index cc0f8103fb98..f1338840125a 100644 --- a/external/cddl/osnet/dev/dtrace/dtrace_modevent.c +++ b/external/cddl/osnet/dev/dtrace/dtrace_modevent.c @@ -42,9 +42,7 @@ dtrace_modcmd(modcmd_t cmd, void *data) return error; case MODULE_CMD_FINI: - error = devsw_detach(NULL, &dtrace_cdevsw); - if (error != 0) - return error; + devsw_detach(NULL, &dtrace_cdevsw); error = dtrace_unload(); if (error != 0) { diff --git a/external/cddl/osnet/dev/fbt/fbt.c b/external/cddl/osnet/dev/fbt/fbt.c index b367c2155292..46dd7c1f7f06 100644 --- a/external/cddl/osnet/dev/fbt/fbt.c +++ b/external/cddl/osnet/dev/fbt/fbt.c @@ -1329,7 +1329,8 @@ dtrace_fbt_modcmd(modcmd_t cmd, void *data) error = fbt_unload(); if (error != 0) return error; - return devsw_detach(NULL, &fbt_cdevsw); + devsw_detach(NULL, &fbt_cdevsw); + return 0; case MODULE_CMD_AUTOUNLOAD: return EBUSY; default: diff --git a/external/cddl/osnet/dev/sdt/sdt.c b/external/cddl/osnet/dev/sdt/sdt.c index c3ad129f8284..5a41270a2917 100644 --- a/external/cddl/osnet/dev/sdt/sdt.c +++ b/external/cddl/osnet/dev/sdt/sdt.c @@ -562,7 +562,8 @@ dtrace_sdt_modcmd(modcmd_t cmd, void *data) error = sdt_unload(); if (error != 0) return error; - return devsw_detach(NULL, &sdt_cdevsw); + devsw_detach(NULL, &sdt_cdevsw); + return 0; case MODULE_CMD_AUTOUNLOAD: return EBUSY; default: diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_ioctl.c b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_ioctl.c index 9e19cd1dc0c3..d74d8c71e54d 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_ioctl.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_ioctl.c @@ -7231,7 +7231,7 @@ zfs_modcmd(modcmd_t cmd, void *arg) if (error) return error; - (void) devsw_detach(&zfs_bdevsw, &zfs_cdevsw); + devsw_detach(&zfs_bdevsw, &zfs_cdevsw); attacherr: zfs_sysctl_fini(); diff --git a/share/man/man9/devsw_attach.9 b/share/man/man9/devsw_attach.9 index cf862be5846a..6ffc51957a3f 100644 --- a/share/man/man9/devsw_attach.9 +++ b/share/man/man9/devsw_attach.9 @@ -27,7 +27,7 @@ .\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE .\" POSSIBILITY OF SUCH DAMAGE. .\" -.Dd April 30, 2017 +.Dd January 11, 2022 .Dt DEVSW 9 .Os .Sh NAME @@ -49,7 +49,7 @@ .Fa "const struct cdevsw *cdev" .Fa "devmajor_t *cmajor" .Fc -.Ft int +.Ft void .Fo devsw_detach .Fa "const struct bdevsw *bdev" .Fa "const struct cdevsw *cdev" @@ -130,6 +130,11 @@ and structures. .Fn devsw_detach should be called before a loaded device driver is unloaded. +The caller must ensure that there are no open instances of the device, +and that the device's +.Fn d_open +function will fail, before calling. +Fn devsw_detach . .Pp The .Fn bdevsw_lookup @@ -155,10 +160,8 @@ or .Sh RETURN VALUES Upon successful completion, .Fn devsw_attach -and -.Fn devsw_detach -return 0. -Otherwise they return an error value. +returns 0. +Otherwise it returns an error value. 
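Before the next man-page hunk, here is a minimal sketch (not part of the patch) of the MODULE_CMD_INIT/MODULE_CMD_FINI pattern that the driver changes above converge on once devsw_detach returns void: tear the driver down first, then call devsw_detach, which can no longer fail, and return 0. The names mydev, mydev_cdevsw and mydev_unload are hypothetical placeholders, not identifiers from the tree, and built-in (non-_MODULE) handling is omitted for brevity.

/* Sketch only: hypothetical character-device module using the new API. */
#include <sys/param.h>
#include <sys/conf.h>
#include <sys/module.h>

MODULE(MODULE_CLASS_DRIVER, mydev, NULL);

extern const struct cdevsw mydev_cdevsw;	/* assumed to be defined elsewhere */
extern int mydev_unload(void);			/* assumed: fails (e.g. EBUSY) while open */

static devmajor_t mydev_bmajor = -1, mydev_cmajor = -1;

static int
mydev_modcmd(modcmd_t cmd, void *arg)
{
	int error;

	switch (cmd) {
	case MODULE_CMD_INIT:
		/* devsw_attach can still fail, so propagate its error. */
		return devsw_attach("mydev", NULL, &mydev_bmajor,
		    &mydev_cdevsw, &mydev_cmajor);
	case MODULE_CMD_FINI:
		/*
		 * Per the updated devsw(9) text, the caller must ensure
		 * there are no open instances and that d_open will fail
		 * before detaching the devsw.  Here the hypothetical
		 * mydev_unload() is assumed to enforce that.
		 */
		error = mydev_unload();
		if (error)
			return error;
		devsw_detach(NULL, &mydev_cdevsw);	/* void now; cannot fail */
		return 0;
	default:
		return ENOTTY;
	}
}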
.Pp In case of failure, .Fn bdevsw_lookup diff --git a/sys/coda/coda_psdev.c b/sys/coda/coda_psdev.c index cede16da3f53..7f531f03fe56 100644 --- a/sys/coda/coda_psdev.c +++ b/sys/coda/coda_psdev.c @@ -758,7 +758,7 @@ vcoda_modcmd(modcmd_t cmd, void *arg) if (VC_OPEN(vcp)) return EBUSY; } - return devsw_detach(NULL, &vcoda_cdevsw); + devsw_detach(NULL, &vcoda_cdevsw); } #endif break; diff --git a/sys/coda/coda_vfsops.c b/sys/coda/coda_vfsops.c index 9e25704b6b2f..c0a1b98a9331 100644 --- a/sys/coda/coda_vfsops.c +++ b/sys/coda/coda_vfsops.c @@ -636,7 +636,7 @@ struct mount *devtomp(dev_t dev) struct mount *mp; struct vnode *vp; - if (spec_node_lookup_by_dev(VBLK, dev, &vp) == 0) { + if (spec_node_lookup_by_dev(VBLK, dev, VDEAD_NOWAIT, &vp) == 0) { mp = spec_node_getmountedfs(vp); vrele(vp); } else { diff --git a/sys/dev/ccd.c b/sys/dev/ccd.c index ab3f94bd422b..3af4f23bcf45 100644 --- a/sys/dev/ccd.c +++ b/sys/dev/ccd.c @@ -1710,7 +1710,7 @@ ccd_modcmd(modcmd_t cmd, void *arg) error = EBUSY; } else { mutex_exit(&ccd_lock); - error = devsw_detach(&ccd_bdevsw, &ccd_cdevsw); + devsw_detach(&ccd_bdevsw, &ccd_cdevsw); ccddetach(); } #endif diff --git a/sys/dev/clockctl.c b/sys/dev/clockctl.c index 0da5e7765fe8..9685c0f129f6 100644 --- a/sys/dev/clockctl.c +++ b/sys/dev/clockctl.c @@ -182,14 +182,12 @@ clockctl_modcmd(modcmd_t cmd, void *data) return EBUSY; } #ifdef _MODULE - error = devsw_detach(NULL, &clockctl_cdevsw); + devsw_detach(NULL, &clockctl_cdevsw); #endif mutex_exit(&clockctl_mtx); - if (error == 0) { - kauth_unlisten_scope(clockctl_listener); - mutex_destroy(&clockctl_mtx); - } + kauth_unlisten_scope(clockctl_listener); + mutex_destroy(&clockctl_mtx); break; default: diff --git a/sys/dev/hdaudio/hdaudio.c b/sys/dev/hdaudio/hdaudio.c index d39ff2db6cde..5c7874778e22 100644 --- a/sys/dev/hdaudio/hdaudio.c +++ b/sys/dev/hdaudio/hdaudio.c @@ -1636,11 +1636,7 @@ hdaudio_modcmd(modcmd_t cmd, void *opaque) error = config_cfdriver_detach(&hdaudio_cd); if (error) break; - error = devsw_detach(NULL, &hdaudio_cdevsw); - if (error) { - config_cfdriver_attach(&hdaudio_cd); - break; - } + devsw_detach(NULL, &hdaudio_cdevsw); #endif break; default: diff --git a/sys/dev/i2c/i2c.c b/sys/dev/i2c/i2c.c index 6f2c0c6a9698..36a3e87d5316 100644 --- a/sys/dev/i2c/i2c.c +++ b/sys/dev/i2c/i2c.c @@ -942,7 +942,7 @@ iic_modcmd(modcmd_t cmd, void *opaque) if (error) { aprint_error("%s: unable to init component\n", iic_cd.cd_name); - (void)devsw_detach(NULL, &iic_cdevsw); + devsw_detach(NULL, &iic_cdevsw); } mutex_exit(&iic_mtx); #endif @@ -960,10 +960,7 @@ iic_modcmd(modcmd_t cmd, void *opaque) mutex_exit(&iic_mtx); break; } - error = devsw_detach(NULL, &iic_cdevsw); - if (error != 0) - config_init_component(cfdriver_ioconf_iic, - cfattach_ioconf_iic, cfdata_ioconf_iic); + devsw_detach(NULL, &iic_cdevsw); #endif mutex_exit(&iic_mtx); break; diff --git a/sys/dev/pad/pad.c b/sys/dev/pad/pad.c index 464f033dd209..3f311bc7f928 100644 --- a/sys/dev/pad/pad.c +++ b/sys/dev/pad/pad.c @@ -777,9 +777,7 @@ pad_modcmd(modcmd_t cmd, void *arg) case MODULE_CMD_FINI: #ifdef _MODULE - error = devsw_detach(NULL, &pad_cdevsw); - if (error) - break; + devsw_detach(NULL, &pad_cdevsw); error = config_fini_component(cfdriver_ioconf_pad, pad_cfattach, cfdata_ioconf_pad); diff --git a/sys/dev/raidframe/rf_netbsdkintf.c b/sys/dev/raidframe/rf_netbsdkintf.c index caeadcb4e739..8aa89841adef 100644 --- a/sys/dev/raidframe/rf_netbsdkintf.c +++ b/sys/dev/raidframe/rf_netbsdkintf.c @@ -4082,16 +4082,7 @@ raid_modcmd_fini(void) return error; 
} #endif - error = devsw_detach(&raid_bdevsw, &raid_cdevsw); - if (error != 0) { - aprint_error("%s: cannot detach devsw\n",__func__); -#ifdef _MODULE - config_cfdriver_attach(&raid_cd); -#endif - config_cfattach_attach(raid_cd.cd_name, &raid_ca); - mutex_exit(&raid_lock); - return error; - } + devsw_detach(&raid_bdevsw, &raid_cdevsw); rf_BootRaidframe(false); #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0) rf_destroy_mutex2(rf_sparet_wait_mutex); diff --git a/sys/dev/sysmon/sysmon.c b/sys/dev/sysmon/sysmon.c index 46aedaad7337..fd2993f0c180 100644 --- a/sys/dev/sysmon/sysmon.c +++ b/sys/dev/sysmon/sysmon.c @@ -356,7 +356,7 @@ sysmon_fini(void) if (error == 0) { mutex_enter(&sysmon_minor_mtx); sm_is_attached = false; - error = devsw_detach(NULL, &sysmon_cdevsw); + devsw_detach(NULL, &sysmon_cdevsw); mutex_exit(&sysmon_minor_mtx); } #endif diff --git a/sys/dev/tprof/tprof.c b/sys/dev/tprof/tprof.c index b069a5b7df5d..136fd190ad14 100644 --- a/sys/dev/tprof/tprof.c +++ b/sys/dev/tprof/tprof.c @@ -768,13 +768,7 @@ tprof_modcmd(modcmd_t cmd, void *arg) case MODULE_CMD_FINI: #if defined(_MODULE) - { - int error; - error = devsw_detach(NULL, &tprof_cdevsw); - if (error) { - return error; - } - } + devsw_detach(NULL, &tprof_cdevsw); #endif /* defined(_MODULE) */ tprof_driver_fini(); return 0; diff --git a/sys/dist/pf/net/pf_ioctl.c b/sys/dist/pf/net/pf_ioctl.c index 94bfb70a411d..e4c13be698f8 100644 --- a/sys/dist/pf/net/pf_ioctl.c +++ b/sys/dist/pf/net/pf_ioctl.c @@ -3459,7 +3459,8 @@ pf_modcmd(modcmd_t cmd, void *opaque) } else { pfdetach(); pflogdetach(); - return devsw_detach(NULL, &pf_cdevsw); + devsw_detach(NULL, &pf_cdevsw); + return 0; } default: return ENOTTY; diff --git a/sys/external/bsd/ipf/netinet/ip_fil_netbsd.c b/sys/external/bsd/ipf/netinet/ip_fil_netbsd.c index d0c4ca95097c..bb4e0706cc9b 100644 --- a/sys/external/bsd/ipf/netinet/ip_fil_netbsd.c +++ b/sys/external/bsd/ipf/netinet/ip_fil_netbsd.c @@ -2256,7 +2256,7 @@ ipl_fini(void *opaque) { #ifdef _MODULE - (void)devsw_detach(NULL, &ipl_cdevsw); + devsw_detach(NULL, &ipl_cdevsw); #endif /* diff --git a/sys/fs/autofs/autofs_vfsops.c b/sys/fs/autofs/autofs_vfsops.c index fbd6eafe6532..1204d1f9b6d3 100644 --- a/sys/fs/autofs/autofs_vfsops.c +++ b/sys/fs/autofs/autofs_vfsops.c @@ -496,9 +496,7 @@ autofs_modcmd(modcmd_t cmd, void *arg) } mutex_exit(&autofs_softc->sc_lock); - error = devsw_detach(NULL, &autofs_cdevsw); - if (error) - break; + devsw_detach(NULL, &autofs_cdevsw); #endif error = vfs_detach(&autofs_vfsops); if (error) diff --git a/sys/kern/kern_drvctl.c b/sys/kern/kern_drvctl.c index 912493dc8608..8a3c24f69b6e 100644 --- a/sys/kern/kern_drvctl.c +++ b/sys/kern/kern_drvctl.c @@ -666,15 +666,10 @@ drvctl_modcmd(modcmd_t cmd, void *arg) devmon_insert_vec = saved_insert_vec; saved_insert_vec = NULL; #ifdef _MODULE - error = devsw_detach(NULL, &drvctl_cdevsw); - if (error != 0) { - saved_insert_vec = devmon_insert_vec; - devmon_insert_vec = devmon_insert; - } + devsw_detach(NULL, &drvctl_cdevsw); #endif mutex_exit(&drvctl_lock); - if (error == 0) - drvctl_fini(); + drvctl_fini(); break; default: diff --git a/sys/kern/subr_autoconf.c b/sys/kern/subr_autoconf.c index ccb1875f8ba9..5c1c865785f8 100644 --- a/sys/kern/subr_autoconf.c +++ b/sys/kern/subr_autoconf.c @@ -108,6 +108,7 @@ __KERNEL_RCSID(0, "$NetBSD: subr_autoconf.c,v 1.296 2022/03/12 19:26:33 riastrad #include #include #include +#include #include @@ -1453,6 +1454,9 @@ config_devdelete(device_t dev) if (dg->dg_devs != NULL) kmem_free(dg->dg_devs, sizeof(device_t) * 
dg->dg_ndevs); + localcount_fini(dev->dv_localcount); + kmem_free(dev->dv_localcount, sizeof(*dev->dv_localcount)); + cv_destroy(&dvl->dvl_cv); mutex_destroy(&dvl->dvl_mtx); @@ -1557,6 +1561,7 @@ config_devalloc(const device_t parent, const cfdata_t cf, dev->dv_activity_handlers = NULL; dev->dv_private = dev_private; dev->dv_flags = ca->ca_flags; /* inherit flags from class */ + dev->dv_attaching = curlwp; myunit = config_unit_alloc(dev, cd, cf); if (myunit == -1) { @@ -1605,6 +1610,10 @@ config_devalloc(const device_t parent, const cfdata_t cf, "device-parent", device_xname(parent)); } + dev->dv_localcount = kmem_zalloc(sizeof(*dev->dv_localcount), + KM_SLEEP); + localcount_init(dev->dv_localcount); + if (dev->dv_cfdriver->cd_attrs != NULL) config_add_attrib_dict(dev); @@ -1756,8 +1765,29 @@ config_attach_internal(device_t parent, cfdata_t cf, void *aux, cfprint_t print, /* Let userland know */ devmon_report_device(dev, true); + /* + * Prevent detach until the driver's attach function, and all + * deferred actions, have finished. + */ config_pending_incr(dev); + + /* Call the driver's attach function. */ (*dev->dv_cfattach->ca_attach)(parent, dev, aux); + + /* + * Allow other threads to acquire references to the device now + * that the driver's attach function is done. + */ + mutex_enter(&config_misc_lock); + KASSERT(dev->dv_attaching == curlwp); + dev->dv_attaching = NULL; + cv_broadcast(&config_misc_cv); + mutex_exit(&config_misc_lock); + + /* + * Synchronous parts of attach are done. Allow detach, unless + * the driver's attach function scheduled deferred actions. + */ config_pending_decr(dev); mutex_enter(&config_misc_lock); @@ -1823,8 +1853,29 @@ config_attach_pseudo(cfdata_t cf) /* Let userland know */ devmon_report_device(dev, true); + /* + * Prevent detach until the driver's attach function, and all + * deferred actions, have finished. + */ config_pending_incr(dev); + + /* Call the driver's attach function. */ (*dev->dv_cfattach->ca_attach)(ROOT, dev, NULL); + + /* + * Allow other threads to acquire references to the device now + * that the driver's attach function is done. + */ + mutex_enter(&config_misc_lock); + KASSERT(dev->dv_attaching == curlwp); + dev->dv_attaching = NULL; + cv_broadcast(&config_misc_cv); + mutex_exit(&config_misc_lock); + + /* + * Synchronous parts of attach are done. Allow detach, unless + * the driver's attach function scheduled deferred actions. + */ config_pending_decr(dev); config_process_deferred(&deferred_config_queue, dev); @@ -1873,24 +1924,39 @@ config_dump_garbage(struct devicelist *garbage) static int config_detach_enter(device_t dev) { - int error; + int error = 0; mutex_enter(&config_misc_lock); - for (;;) { - if (dev->dv_pending == 0 && dev->dv_detaching == NULL) { - dev->dv_detaching = curlwp; - error = 0; - break; - } + + /* + * Wait until attach has fully completed, and until any + * concurrent detach (e.g., drvctl racing with USB event + * thread) has completed. + * + * Caller must hold alldevs_nread or alldevs_nwrite (e.g., via + * deviter) to ensure the winner of the race doesn't free the + * device leading the loser of the race into use-after-free. + * + * XXX Not all callers do this! 
+ */ + while (dev->dv_pending || dev->dv_detaching) { KASSERTMSG(dev->dv_detaching != curlwp, "recursively detaching %s", device_xname(dev)); error = cv_wait_sig(&config_misc_cv, &config_misc_lock); if (error) - break; + goto out; } - KASSERT(error || dev->dv_detaching == curlwp); - mutex_exit(&config_misc_lock); + /* + * Attach has completed, and no other concurrent detach is + * running. Claim the device for detaching. This will cause + * all new attempts to acquire references to block. + */ + KASSERT(dev->dv_attaching == NULL); + KASSERT(dev->dv_detaching == NULL); + dev->dv_detaching = curlwp; + +out: mutex_exit(&config_misc_lock); return error; } @@ -1963,6 +2029,11 @@ config_detach(device_t dev, int flags) alldevs_nwrite++; mutex_exit(&alldevs_lock); + /* + * Call the driver's .ca_detach function, unless it has none or + * we are skipping it because it's unforced shutdown time and + * the driver didn't ask to detach on shutdown. + */ if (!detachall && (flags & (DETACH_SHUTDOWN|DETACH_FORCE)) == DETACH_SHUTDOWN && (dev->dv_flags & DVF_DETACH_SHUTDOWN) == 0) { @@ -1975,23 +2046,53 @@ config_detach(device_t dev, int flags) /* * If it was not possible to detach the device, then we either * panic() (for the forced but failed case), or return an error. - * - * If it was possible to detach the device, ensure that the - * device is deactivated. */ - if (rv == 0) - dev->dv_flags &= ~DVF_ACTIVE; - else if ((flags & DETACH_FORCE) == 0) + if (rv) { + /* + * Detach failed -- likely EOPNOTSUPP or EBUSY. Driver + * must not have called config_detach_commit. + */ + KASSERTMSG(!dev->dv_detached, + "%s committed to detaching and then backed out", + device_xname(dev)); + if (flags & DETACH_FORCE) { + panic("config_detach: forced detach of %s failed (%d)", + device_xname(dev), rv); + } goto out; - else { - panic("config_detach: forced detach of %s failed (%d)", - device_xname(dev), rv); } /* * The device has now been successfully detached. */ + /* + * If .ca_detach didn't commit to detach, then do that for it. + * This wakes any pending device_lookup_acquire calls so they + * will fail. + */ + config_detach_commit(dev); + + /* + * If it was possible to detach the device, ensure that the + * device is deactivated. + */ + dev->dv_flags &= ~DVF_ACTIVE; /* XXXSMP */ + + /* + * Wait for all device_lookup_acquire references -- mostly, for + * all attempts to open the device -- to drain. It is the + * responsibility of .ca_detach to ensure anything with open + * references will be interrupted and release them promptly, + * not block indefinitely. All new attempts to acquire + * references will fail, as config_detach_commit has arranged + * by now. + */ + mutex_enter(&config_misc_lock); + localcount_drain(dev->dv_localcount, + &config_misc_cv, &config_misc_lock); + mutex_exit(&config_misc_lock); + /* Let userland know */ devmon_report_device(dev, false); @@ -2059,6 +2160,30 @@ out: return rv; } +/* + * config_detach_commit(dev) + * + * Issued by a driver's .ca_detach routine to notify anyone + * waiting in device_lookup_acquire that the driver is committed + * to detaching the device, which allows device_lookup_acquire to + * wake up and fail immediately. + * + * Safe to call multiple times -- idempotent. Must be called + * during config_detach_enter/exit. Safe to use with + * device_lookup because the device is not actually removed from + * the table until after config_detach_exit. 
+ */ +void +config_detach_commit(device_t dev) +{ + + mutex_enter(&config_misc_lock); + KASSERT(dev->dv_detaching == curlwp); + dev->dv_detached = true; + cv_broadcast(&config_misc_cv); + mutex_exit(&config_misc_lock); +} + int config_detach_children(device_t parent, int flags) { @@ -2499,6 +2624,17 @@ config_alldevs_exit(struct alldevs_foray *af) * device_lookup: * * Look up a device instance for a given driver. + * + * Caller is responsible for ensuring the device's state is + * stable, either by holding a reference already obtained with + * device_lookup_acquire or by otherwise ensuring the device is + * attached and can't be detached (e.g., holding an open device + * node and ensuring *_detach calls vdevgone). + * + * XXX Find a way to assert this. + * + * Safe for use up to and including interrupt context at IPL_VM. + * Never sleeps. */ device_t device_lookup(cfdriver_t cd, int unit) @@ -2527,6 +2663,74 @@ device_lookup_private(cfdriver_t cd, int unit) return device_private(device_lookup(cd, unit)); } +/* + * device_lookup_acquire: + * + * Look up a device instance for a given driver, and return a + * reference to it that must be released by device_release. + * + * => If the device is still attaching, blocks until *_attach has + * returned. + * + * => If the device is detaching, blocks until *_detach has + * returned. May succeed or fail in that case, depending on + * whether *_detach has backed out (EBUSY) or committed to + * detaching. + * + * May sleep. + */ +device_t +device_lookup_acquire(cfdriver_t cd, int unit) +{ + device_t dv; + + ASSERT_SLEEPABLE(); + + /* XXX This should have a pserialized fast path -- TBD. */ + mutex_enter(&config_misc_lock); + mutex_enter(&alldevs_lock); +retry: if (unit < 0 || unit >= cd->cd_ndevs || + (dv = cd->cd_devs[unit]) == NULL || + dv->dv_del_gen != 0 || + dv->dv_detached) { + dv = NULL; + } else { + /* + * Wait for the device to stabilize, if attaching or + * detaching. Either way we must wait for *_attach or + * *_detach to complete, and either way we must retry: + * even if detaching, *_detach might fail (EBUSY) so + * the device may still be there. + */ + if ((dv->dv_attaching != NULL && dv->dv_attaching != curlwp) || + dv->dv_detaching != NULL) { + mutex_exit(&alldevs_lock); + cv_wait(&config_misc_cv, &config_misc_lock); + mutex_enter(&alldevs_lock); + goto retry; + } + localcount_acquire(dv->dv_localcount); + } + mutex_exit(&alldevs_lock); + mutex_exit(&config_misc_lock); + + return dv; +} + +/* + * device_release: + * + * Release a reference to a device acquired with + * device_lookup_acquire. 
+ */ +void +device_release(device_t dv) +{ + + localcount_release(dv->dv_localcount, + &config_misc_cv, &config_misc_lock); +} + /* * device_find_by_xname: * diff --git a/sys/kern/subr_devsw.c b/sys/kern/subr_devsw.c index 1a0f721fdd65..3087606aebf2 100644 --- a/sys/kern/subr_devsw.c +++ b/sys/kern/subr_devsw.c @@ -85,6 +85,11 @@ __KERNEL_RCSID(0, "$NetBSD: subr_devsw.c,v 1.38 2017/11/07 18:35:57 christos Exp #include #include #include +#include +#include +#include +#include +#include #ifdef DEVSW_DEBUG #define DPRINTF(x) printf x @@ -97,12 +102,21 @@ __KERNEL_RCSID(0, "$NetBSD: subr_devsw.c,v 1.38 2017/11/07 18:35:57 christos Exp #define CDEVSW_SIZE (sizeof(struct cdevsw *)) #define DEVSWCONV_SIZE (sizeof(struct devsw_conv)) +struct devswref { + struct localcount *dr_lc; +}; + +/* XXX bdevsw, cdevsw, max_bdevsws, and max_cdevsws should be volatile */ extern const struct bdevsw **bdevsw, *bdevsw0[]; extern const struct cdevsw **cdevsw, *cdevsw0[]; extern struct devsw_conv *devsw_conv, devsw_conv0[]; extern const int sys_bdevsws, sys_cdevsws; extern int max_bdevsws, max_cdevsws, max_devsw_convs; +static struct devswref *cdevswref; +static struct devswref *bdevswref; +static kcondvar_t devsw_cv; + static int bdevsw_attach(const struct bdevsw *, devmajor_t *); static int cdevsw_attach(const struct cdevsw *, devmajor_t *); static void devsw_detach_locked(const struct bdevsw *, const struct cdevsw *); @@ -118,6 +132,8 @@ devsw_init(void) KASSERT(sys_bdevsws < MAXDEVSW - 1); KASSERT(sys_cdevsws < MAXDEVSW - 1); mutex_init(&device_lock, MUTEX_DEFAULT, IPL_NONE); + + cv_init(&devsw_cv, "devsw"); } int @@ -158,15 +174,13 @@ devsw_attach(const char *devname, error = EEXIST; goto fail; } - - if (bdev != NULL) - bdevsw[*bmajor] = bdev; - cdevsw[*cmajor] = cdev; - - mutex_exit(&device_lock); - return (0); + break; } + /* + * XXX This should allocate what it needs up front so we never + * need to flail around trying to unwind. + */ error = bdevsw_attach(bdev, bmajor); if (error != 0) goto fail; @@ -176,6 +190,13 @@ devsw_attach(const char *devname, goto fail; } + /* + * If we already found a conv, we're done. Otherwise, find an + * empty slot or extend the table. 
+ */ + if (i == max_devsw_convs) + goto fail; + for (i = 0 ; i < max_devsw_convs ; i++) { if (devsw_conv[i].d_name == NULL) break; @@ -224,7 +245,9 @@ devsw_attach(const char *devname, static int bdevsw_attach(const struct bdevsw *devsw, devmajor_t *devmajor) { - const struct bdevsw **newptr; + const struct bdevsw **newbdevsw = NULL; + struct devswref *newbdevswref = NULL; + struct localcount *lc; devmajor_t bmajor; int i; @@ -253,20 +276,34 @@ bdevsw_attach(const struct bdevsw *devsw, devmajor_t *devmajor) return (ENOMEM); } + if (bdevswref == NULL) { + newbdevswref = kmem_zalloc(MAXDEVSW * sizeof(newbdevswref[0]), + KM_NOSLEEP); + if (newbdevswref == NULL) + return ENOMEM; + atomic_store_release(&bdevswref, newbdevswref); + } + if (*devmajor >= max_bdevsws) { KASSERT(bdevsw == bdevsw0); - newptr = kmem_zalloc(MAXDEVSW * BDEVSW_SIZE, KM_NOSLEEP); - if (newptr == NULL) - return (ENOMEM); - memcpy(newptr, bdevsw, max_bdevsws * BDEVSW_SIZE); - bdevsw = newptr; - max_bdevsws = MAXDEVSW; + newbdevsw = kmem_zalloc(MAXDEVSW * sizeof(newbdevsw[0]), + KM_NOSLEEP); + if (newbdevsw == NULL) + return ENOMEM; + memcpy(newbdevsw, bdevsw, max_bdevsws * sizeof(bdevsw[0])); + atomic_store_release(&bdevsw, newbdevsw); + atomic_store_release(&max_bdevsws, MAXDEVSW); } if (bdevsw[*devmajor] != NULL) return (EEXIST); - bdevsw[*devmajor] = devsw; + KASSERT(bdevswref[*devmajor].dr_lc == NULL); + lc = kmem_zalloc(sizeof(*lc), KM_SLEEP); + localcount_init(lc); + bdevswref[*devmajor].dr_lc = lc; + + atomic_store_release(&bdevsw[*devmajor], devsw); return (0); } @@ -274,7 +311,9 @@ bdevsw_attach(const struct bdevsw *devsw, devmajor_t *devmajor) static int cdevsw_attach(const struct cdevsw *devsw, devmajor_t *devmajor) { - const struct cdevsw **newptr; + const struct cdevsw **newcdevsw = NULL; + struct devswref *newcdevswref = NULL; + struct localcount *lc; devmajor_t cmajor; int i; @@ -300,20 +339,34 @@ cdevsw_attach(const struct cdevsw *devsw, devmajor_t *devmajor) return (ENOMEM); } + if (cdevswref == NULL) { + newcdevswref = kmem_zalloc(MAXDEVSW * sizeof(newcdevswref[0]), + KM_NOSLEEP); + if (newcdevswref == NULL) + return ENOMEM; + atomic_store_release(&cdevswref, newcdevswref); + } + if (*devmajor >= max_cdevsws) { KASSERT(cdevsw == cdevsw0); - newptr = kmem_zalloc(MAXDEVSW * CDEVSW_SIZE, KM_NOSLEEP); - if (newptr == NULL) - return (ENOMEM); - memcpy(newptr, cdevsw, max_cdevsws * CDEVSW_SIZE); - cdevsw = newptr; - max_cdevsws = MAXDEVSW; + newcdevsw = kmem_zalloc(MAXDEVSW * sizeof(newcdevsw[0]), + KM_NOSLEEP); + if (newcdevsw == NULL) + return ENOMEM; + memcpy(newcdevsw, cdevsw, max_cdevsws * sizeof(cdevsw[0])); + atomic_store_release(&cdevsw, newcdevsw); + atomic_store_release(&max_cdevsws, MAXDEVSW); } if (cdevsw[*devmajor] != NULL) return (EEXIST); - cdevsw[*devmajor] = devsw; + KASSERT(cdevswref[*devmajor].dr_lc == NULL); + lc = kmem_zalloc(sizeof(*lc), KM_SLEEP); + localcount_init(lc); + cdevswref[*devmajor].dr_lc = lc; + + atomic_store_release(&cdevsw[*devmajor], devsw); return (0); } @@ -321,36 +374,77 @@ cdevsw_attach(const struct cdevsw *devsw, devmajor_t *devmajor) static void devsw_detach_locked(const struct bdevsw *bdev, const struct cdevsw *cdev) { - int i; + int bi, ci = -1/*XXXGCC*/; KASSERT(mutex_owned(&device_lock)); + /* Prevent new references. 
*/ if (bdev != NULL) { - for (i = 0 ; i < max_bdevsws ; i++) { - if (bdevsw[i] != bdev) + for (bi = 0; bi < max_bdevsws; bi++) { + if (bdevsw[bi] != bdev) continue; - bdevsw[i] = NULL; + atomic_store_relaxed(&bdevsw[bi], NULL); break; } + KASSERT(bi < max_bdevsws); } if (cdev != NULL) { - for (i = 0 ; i < max_cdevsws ; i++) { - if (cdevsw[i] != cdev) + for (ci = 0; ci < max_cdevsws; ci++) { + if (cdevsw[ci] != cdev) continue; - cdevsw[i] = NULL; + atomic_store_relaxed(&cdevsw[ci], NULL); break; } + KASSERT(ci < max_cdevsws); + } + + if (bdev == NULL && cdev == NULL) /* XXX possible? */ + return; + + /* + * Wait for all bdevsw_lookup_acquire, cdevsw_lookup_acquire + * calls to notice that the devsw is gone. + * + * XXX Despite the use of the pserialize_read_enter/exit API + * elsewhere in this file, we use xc_barrier here instead of + * pserialize_perform -- because devsw_init is too early for + * pserialize_create. Either pserialize_create should be made + * to work earlier, or it should be nixed altogether. Until + * that is fixed, xc_barrier will serve the same purpose. + */ + xc_barrier(0); + + /* + * Wait for all references to drain. It is the caller's + * responsibility to ensure that at this point, there are no + * extant open instances and all new d_open calls will fail. + * + * Note that localcount_drain may release and reacquire + * device_lock. + */ + if (bdev != NULL) { + localcount_drain(bdevswref[bi].dr_lc, + &devsw_cv, &device_lock); + localcount_fini(bdevswref[bi].dr_lc); + kmem_free(bdevswref[bi].dr_lc, sizeof(*bdevswref[bi].dr_lc)); + bdevswref[bi].dr_lc = NULL; + } + if (cdev != NULL) { + localcount_drain(cdevswref[ci].dr_lc, + &devsw_cv, &device_lock); + localcount_fini(cdevswref[ci].dr_lc); + kmem_free(cdevswref[ci].dr_lc, sizeof(*cdevswref[ci].dr_lc)); + cdevswref[ci].dr_lc = NULL; } } -int +void devsw_detach(const struct bdevsw *bdev, const struct cdevsw *cdev) { mutex_enter(&device_lock); devsw_detach_locked(bdev, cdev); mutex_exit(&device_lock); - return 0; } /* @@ -366,10 +460,59 @@ bdevsw_lookup(dev_t dev) if (dev == NODEV) return (NULL); bmajor = major(dev); - if (bmajor < 0 || bmajor >= max_bdevsws) + if (bmajor < 0 || bmajor >= atomic_load_relaxed(&max_bdevsws)) return (NULL); - return (bdevsw[bmajor]); + return atomic_load_consume(&bdevsw)[bmajor]; +} + +static const struct bdevsw * +bdevsw_lookup_acquire(dev_t dev, struct localcount **lcp) +{ + devmajor_t bmajor; + const struct bdevsw *bdev = NULL, *const *curbdevsw; + struct devswref *curbdevswref; + int s; + + if (dev == NODEV) + return NULL; + bmajor = major(dev); + if (bmajor < 0) + return NULL; + + s = pserialize_read_enter(); + + /* + * max_bdevsws never goes down, so it is safe to rely on this + * condition without any locking for the array access below. + * Test sys_bdevsws first so we can avoid the memory barrier in + * that case. 
+ */ + if (bmajor >= sys_bdevsws && + bmajor >= atomic_load_acquire(&max_bdevsws)) + goto out; + curbdevsw = atomic_load_consume(&bdevsw); + if ((bdev = atomic_load_consume(&curbdevsw[bmajor])) == NULL) + goto out; + + curbdevswref = atomic_load_consume(&bdevswref); + if (curbdevswref == NULL) { + *lcp = NULL; + } else if ((*lcp = curbdevswref[bmajor].dr_lc) != NULL) { + localcount_acquire(*lcp); + } +out: + pserialize_read_exit(s); + return bdev; +} + +static void +bdevsw_release(const struct bdevsw *bdev, struct localcount *lc) +{ + + if (lc == NULL) + return; + localcount_release(lc, &devsw_cv, &device_lock); } /* @@ -385,10 +528,59 @@ cdevsw_lookup(dev_t dev) if (dev == NODEV) return (NULL); cmajor = major(dev); - if (cmajor < 0 || cmajor >= max_cdevsws) + if (cmajor < 0 || cmajor >= atomic_load_relaxed(&max_cdevsws)) return (NULL); - return (cdevsw[cmajor]); + return atomic_load_consume(&cdevsw)[cmajor]; +} + +static const struct cdevsw * +cdevsw_lookup_acquire(dev_t dev, struct localcount **lcp) +{ + devmajor_t cmajor; + const struct cdevsw *cdev = NULL, *const *curcdevsw; + struct devswref *curcdevswref; + int s; + + if (dev == NODEV) + return NULL; + cmajor = major(dev); + if (cmajor < 0) + return NULL; + + s = pserialize_read_enter(); + + /* + * max_cdevsws never goes down, so it is safe to rely on this + * condition without any locking for the array access below. + * Test sys_cdevsws first so we can avoid the memory barrier in + * that case. + */ + if (cmajor >= sys_cdevsws && + cmajor >= atomic_load_acquire(&max_cdevsws)) + goto out; + curcdevsw = atomic_load_consume(&cdevsw); + if ((cdev = atomic_load_consume(&curcdevsw[cmajor])) == NULL) + goto out; + + curcdevswref = atomic_load_consume(&cdevswref); + if (curcdevswref == NULL) { + *lcp = NULL; + } else if ((*lcp = curcdevswref[cmajor].dr_lc) != NULL) { + localcount_acquire(*lcp); + } +out: + pserialize_read_exit(s); + return cdev; +} + +static void +cdevsw_release(const struct cdevsw *cdev, struct localcount *lc) +{ + + if (lc == NULL) + return; + localcount_release(lc, &devsw_cv, &device_lock); } /* @@ -400,10 +592,13 @@ cdevsw_lookup(dev_t dev) devmajor_t bdevsw_lookup_major(const struct bdevsw *bdev) { - devmajor_t bmajor; + const struct bdevsw *const *curbdevsw; + devmajor_t bmajor, bmax; - for (bmajor = 0 ; bmajor < max_bdevsws ; bmajor++) { - if (bdevsw[bmajor] == bdev) + bmax = atomic_load_acquire(&max_bdevsws); + curbdevsw = atomic_load_consume(&bdevsw); + for (bmajor = 0; bmajor < bmax; bmajor++) { + if (atomic_load_relaxed(&curbdevsw[bmajor]) == bdev) return (bmajor); } @@ -419,10 +614,13 @@ bdevsw_lookup_major(const struct bdevsw *bdev) devmajor_t cdevsw_lookup_major(const struct cdevsw *cdev) { - devmajor_t cmajor; + const struct cdevsw *const *curcdevsw; + devmajor_t cmajor, cmax; - for (cmajor = 0 ; cmajor < max_cdevsws ; cmajor++) { - if (cdevsw[cmajor] == cdev) + cmax = atomic_load_acquire(&max_cdevsws); + curcdevsw = atomic_load_consume(&cdevsw); + for (cmajor = 0; cmajor < cmax; cmajor++) { + if (atomic_load_relaxed(&curcdevsw[cmajor]) == cdev) return (cmajor); } @@ -697,22 +895,41 @@ int bdev_open(dev_t dev, int flag, int devtype, lwp_t *l) { const struct bdevsw *d; - int rv, mpflag; + struct localcount *lc; + device_t dv = NULL/*XXXGCC*/; + int unit, rv, mpflag; - /* - * For open we need to lock, in order to synchronize - * with attach/detach. 
- */ - mutex_enter(&device_lock); - d = bdevsw_lookup(dev); - mutex_exit(&device_lock); + d = bdevsw_lookup_acquire(dev, &lc); if (d == NULL) return ENXIO; + if (d->d_devtounit) { + /* + * If the device node corresponds to an autoconf device + * instance, acquire a reference to it so that during + * d_open, device_lookup is stable. + * + * XXX This should also arrange to instantiate cloning + * pseudo-devices if appropriate, but that requires + * reviewing them all to find and verify a common + * pattern. + */ + if ((unit = (*d->d_devtounit)(dev)) == -1) + return ENXIO; + if ((dv = device_lookup_acquire(d->d_cfdriver, unit)) == NULL) + return ENXIO; + } + DEV_LOCK(d); rv = (*d->d_open)(dev, flag, devtype, l); DEV_UNLOCK(d); + if (d->d_devtounit) { + device_release(dv); + } + + bdevsw_release(d, lc); + return rv; } @@ -851,26 +1068,63 @@ bdev_discard(dev_t dev, off_t pos, off_t len) return rv; } +void +bdev_detached(dev_t dev) +{ + const struct bdevsw *d; + device_t dv; + int unit; + + if ((d = bdevsw_lookup(dev)) == NULL) + return; + if (d->d_devtounit == NULL) + return; + if ((unit = (*d->d_devtounit)(dev)) == -1) + return; + if ((dv = device_lookup(d->d_cfdriver, unit)) == NULL) + return; + config_detach_commit(dv); +} + int cdev_open(dev_t dev, int flag, int devtype, lwp_t *l) { const struct cdevsw *d; - int rv, mpflag; + struct localcount *lc; + device_t dv = NULL/*XXXGCC*/; + int unit, rv, mpflag; - /* - * For open we need to lock, in order to synchronize - * with attach/detach. - */ - mutex_enter(&device_lock); - d = cdevsw_lookup(dev); - mutex_exit(&device_lock); + d = cdevsw_lookup_acquire(dev, &lc); if (d == NULL) return ENXIO; + if (d->d_devtounit) { + /* + * If the device node corresponds to an autoconf device + * instance, acquire a reference to it so that during + * d_open, device_lookup is stable. + * + * XXX This should also arrange to instantiate cloning + * pseudo-devices if appropriate, but that requires + * reviewing them all to find and verify a common + * pattern. + */ + if ((unit = (*d->d_devtounit)(dev)) == -1) + return ENXIO; + if ((dv = device_lookup_acquire(d->d_cfdriver, unit)) == NULL) + return ENXIO; + } + DEV_LOCK(d); rv = (*d->d_open)(dev, flag, devtype, l); DEV_UNLOCK(d); + if (d->d_devtounit) { + device_release(dv); + } + + cdevsw_release(d, lc); + return rv; } @@ -1052,6 +1306,24 @@ cdev_type(dev_t dev) return d->d_flag & D_TYPEMASK; } +void +cdev_detached(dev_t dev) +{ + const struct cdevsw *d; + device_t dv; + int unit; + + if ((d = cdevsw_lookup(dev)) == NULL) + return; + if (d->d_devtounit == NULL) + return; + if ((unit = (*d->d_devtounit)(dev)) == -1) + return; + if ((dv = device_lookup(d->d_cfdriver, unit)) == NULL) + return; + config_detach_commit(dv); +} + /* * nommap(dev, off, prot) * @@ -1063,3 +1335,18 @@ nommap(dev_t dev, off_t off, int prot) return (paddr_t)-1; } + +/* + * dev_minor_unit(dev) + * + * Returns minor(dev) as an int. Intended for use with struct + * bdevsw, cdevsw::d_devtounit for drivers whose /dev nodes are + * implemented by reference to an autoconf instance with the minor + * number. 
+ */ +int +dev_minor_unit(dev_t dev) +{ + + return minor(dev); +} diff --git a/sys/kern/subr_disk.c b/sys/kern/subr_disk.c index da664f920382..41218421db57 100644 --- a/sys/kern/subr_disk.c +++ b/sys/kern/subr_disk.c @@ -728,3 +728,10 @@ disk_set_info(device_t dev, struct disk *dk, const char *type) if (odisk_info) prop_object_release(odisk_info); } + +int +disklabel_dev_unit(dev_t dev) +{ + + return DISKUNIT(dev); +} diff --git a/sys/kern/vfs_mount.c b/sys/kern/vfs_mount.c index 2d4d6c28c292..58d3dff41d57 100644 --- a/sys/kern/vfs_mount.c +++ b/sys/kern/vfs_mount.c @@ -1372,7 +1372,8 @@ vfs_mountedon(vnode_t *vp) return ENOTBLK; if (spec_node_getmountedfs(vp) != NULL) return EBUSY; - if (spec_node_lookup_by_dev(vp->v_type, vp->v_rdev, &vq) == 0) { + if (spec_node_lookup_by_dev(vp->v_type, vp->v_rdev, VDEAD_NOWAIT, &vq) + == 0) { if (spec_node_getmountedfs(vq) != NULL) error = EBUSY; vrele(vq); diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index 10cb4e57e129..f6504733960d 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -495,7 +495,7 @@ int vfinddev(dev_t dev, enum vtype type, vnode_t **vpp) { - return (spec_node_lookup_by_dev(type, dev, vpp) == 0); + return (spec_node_lookup_by_dev(type, dev, VDEAD_NOWAIT, vpp) == 0); } /* @@ -511,7 +511,26 @@ vdevgone(int maj, int minl, int minh, enum vtype type) for (mn = minl; mn <= minh; mn++) { dev = makedev(maj, mn); - while (spec_node_lookup_by_dev(type, dev, &vp) == 0) { + /* + * Notify anyone trying to get at this device that it + * has been detached, and then revoke it. + */ + switch (type) { + case VBLK: + bdev_detached(dev); + break; + case VCHR: + cdev_detached(dev); + break; + default: + panic("invalid specnode type: %d", type); + } + /* + * Passing 0 as flags, instead of VDEAD_NOWAIT, means + * spec_node_lookup_by_dev will wait for vnodes it + * finds concurrently being revoked before returning. + */ + while (spec_node_lookup_by_dev(type, dev, 0, &vp) == 0) { VOP_REVOKE(vp, REVOKEALL); vrele(vp); } diff --git a/sys/kern/vfs_vnode.c b/sys/kern/vfs_vnode.c index 48065852a6ac..ee365aaaba08 100644 --- a/sys/kern/vfs_vnode.c +++ b/sys/kern/vfs_vnode.c @@ -1234,7 +1234,8 @@ vrevoke(vnode_t *vp) type = vp->v_type; mutex_exit(vp->v_interlock); - while (spec_node_lookup_by_dev(type, dev, &vq) == 0) { + while (spec_node_lookup_by_dev(type, dev, VDEAD_NOWAIT, &vq) + == 0) { mp = vrevoke_suspend_next(mp, vq->v_mount); vgone(vq); } @@ -1813,7 +1814,7 @@ vcache_reclaim(vnode_t *vp) uint32_t hash; uint8_t temp_buf[64], *temp_key; size_t temp_key_len; - bool recycle, active; + bool recycle; int error; KASSERT((vp->v_vflag & VV_LOCKSWORK) == 0 || @@ -1821,7 +1822,6 @@ vcache_reclaim(vnode_t *vp) KASSERT(mutex_owned(vp->v_interlock)); KASSERT(vrefcnt(vp) != 0); - active = (vrefcnt(vp) > 1); temp_key_len = vip->vi_key.vk_key_len; /* * Prevent the vnode from being recycled or brought into use @@ -1864,8 +1864,6 @@ vcache_reclaim(vnode_t *vp) /* * Clean out any cached data associated with the vnode. - * If purging an active vnode, it must be closed and - * deactivated before being reclaimed. 
*/ error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0); if (error != 0) { @@ -1875,7 +1873,7 @@ vcache_reclaim(vnode_t *vp) } KASSERTMSG((error == 0), "vinvalbuf failed: %d", error); KASSERT((vp->v_iflag & VI_ONWORKLST) == 0); - if (active && (vp->v_type == VBLK || vp->v_type == VCHR)) { + if (vp->v_type == VBLK || vp->v_type == VCHR) { spec_node_revoke(vp); } diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c index b4bc4c34ab03..324297488d5d 100644 --- a/sys/miscfs/specfs/spec_vnops.c +++ b/sys/miscfs/specfs/spec_vnops.c @@ -81,10 +81,19 @@ __KERNEL_RCSID(0, "$NetBSD: spec_vnops.c,v 1.183 2021/07/18 23:57:14 dholland Ex #include #include #include +#include #include #include +/* + * Lock order: + * + * vnode lock + * -> device_lock + * -> struct vnode::v_interlock + */ + /* symbolic sleep message strings for devices */ const char devopn[] = "devopn"; const char devio[] = "devio"; @@ -165,6 +174,7 @@ const struct vnodeopv_desc spec_vnodeop_opv_desc = { &spec_vnodeop_p, spec_vnodeop_entries }; static kauth_listener_t rawio_listener; +static struct kcondvar specfs_iocv; /* Returns true if vnode is /dev/mem or /dev/kmem. */ bool @@ -210,6 +220,141 @@ spec_init(void) rawio_listener = kauth_listen_scope(KAUTH_SCOPE_DEVICE, rawio_listener_cb, NULL); + cv_init(&specfs_iocv, "specio"); +} + +/* + * spec_io_enter(vp, &sn, &dev) + * + * Enter an operation that may not hold vp's vnode lock or an + * fstrans on vp's mount. Until spec_io_exit, the vnode will not + * be revoked. + * + * On success, set sn to the specnode pointer and dev to the dev_t + * number and return zero. Caller must later call spec_io_exit + * when done. + * + * On failure, return ENXIO -- the device has been revoked and no + * longer exists. + */ +static int +spec_io_enter(struct vnode *vp, struct specnode **snp, dev_t *devp) +{ + dev_t dev; + struct specnode *sn; + unsigned iocnt; + int error = 0; + + mutex_enter(vp->v_interlock); + + /* + * Extract all the info we need from the vnode, unless the + * vnode has already been reclaimed. This can happen if the + * underlying device has been removed and all the device nodes + * for it have been revoked. The caller may not hold a vnode + * lock or fstrans to prevent this from happening before it has + * had an opportunity to notice the vnode is dead. + */ + if (vdead_check(vp, VDEAD_NOWAIT) != 0 || + (sn = vp->v_specnode) == NULL || + (dev = vp->v_rdev) == NODEV) { + error = ENXIO; + goto out; + } + + /* + * Notify spec_close that we are doing an I/O operation which + * may not be not bracketed by fstrans(9) and thus is not + * blocked by vfs suspension. + * + * We could hold this reference with psref(9) instead, but we + * already have to take the interlock for vdead_check, so + * there's not much more cost here to another atomic operation. + */ + do { + iocnt = atomic_load_relaxed(&sn->sn_dev->sd_iocnt); + if (__predict_false(iocnt == UINT_MAX)) { + /* + * The I/O count is limited by the number of + * LWPs (which will never overflow this) -- + * unless one driver uses another driver via + * specfs, which is rather unusual, but which + * could happen via pud(4) userspace drivers. + * We could use a 64-bit count, but can't use + * atomics for that on all platforms. + * (Probably better to switch to psref or + * localcount instead.) + */ + error = EBUSY; + goto out; + } + } while (atomic_cas_uint(&sn->sn_dev->sd_iocnt, iocnt, iocnt + 1) + != iocnt); + + /* Success! 
*/ + *snp = sn; + *devp = dev; + error = 0; + +out: mutex_exit(vp->v_interlock); + return error; +} + +/* + * spec_io_exit(vp, sn) + * + * Exit an operation entered with a successful spec_io_enter -- + * allow concurrent spec_node_revoke to proceed. The argument sn + * must match the struct specnode pointer returned by spec_io_exit + * for vp. + */ +static void +spec_io_exit(struct vnode *vp, struct specnode *sn) +{ + struct specdev *sd = sn->sn_dev; + unsigned iocnt; + + KASSERT(vp->v_specnode == sn); + + /* + * We are done. Notify spec_close if appropriate. The + * transition of 1 -> 0 must happen under device_lock so + * spec_close doesn't miss a wakeup. + */ + do { + iocnt = atomic_load_relaxed(&sd->sd_iocnt); + KASSERT(iocnt > 0); + if (iocnt == 1) { + mutex_enter(&device_lock); + if (atomic_dec_uint_nv(&sd->sd_iocnt) == 0) + cv_broadcast(&specfs_iocv); + mutex_exit(&device_lock); + break; + } + } while (atomic_cas_uint(&sd->sd_iocnt, iocnt, iocnt - 1) != iocnt); +} + +/* + * spec_io_drain(sd) + * + * Wait for all existing spec_io_enter/exit sections to complete. + * Caller must ensure spec_io_enter will fail at this point. + */ +static void +spec_io_drain(struct specdev *sd) +{ + + /* + * I/O at the same time as closing is unlikely -- it often + * indicates an application bug. + */ + if (__predict_true(atomic_load_relaxed(&sd->sd_iocnt) == 0)) + return; + + mutex_enter(&device_lock); + while (atomic_load_relaxed(&sd->sd_iocnt) > 0) + cv_wait(&specfs_iocv, &device_lock); + mutex_exit(&device_lock); } /* @@ -250,6 +395,9 @@ spec_node_init(vnode_t *vp, dev_t rdev) sd->sd_refcnt = 1; sd->sd_opencnt = 0; sd->sd_bdevvp = NULL; + sd->sd_iocnt = 0; + sd->sd_opened = false; + sd->sd_closing = false; sn->sn_dev = sd; sd = NULL; } else { @@ -276,18 +424,35 @@ spec_node_init(vnode_t *vp, dev_t rdev) * Lookup a vnode by device number and return it referenced. */ int -spec_node_lookup_by_dev(enum vtype type, dev_t dev, vnode_t **vpp) +spec_node_lookup_by_dev(enum vtype type, dev_t dev, int flags, vnode_t **vpp) { int error; vnode_t *vp; - mutex_enter(&device_lock); +top: mutex_enter(&device_lock); for (vp = specfs_hash[SPECHASH(dev)]; vp; vp = vp->v_specnext) { if (type == vp->v_type && dev == vp->v_rdev) { mutex_enter(vp->v_interlock); /* If clean or being cleaned, then ignore it. */ if (vdead_check(vp, VDEAD_NOWAIT) == 0) break; + if ((flags & VDEAD_NOWAIT) == 0) { + mutex_exit(&device_lock); + /* + * It may be being revoked as we speak, + * and the caller wants to wait until + * all revocation has completed. Let + * vcache_vget wait for it to finish + * dying; as a side effect, vcache_vget + * releases vp->v_interlock. Note that + * vcache_vget cannot succeed at this + * point because vdead_check already + * failed. + */ + error = vcache_vget(vp); + KASSERT(error); + goto top; + } mutex_exit(vp->v_interlock); } } @@ -351,6 +516,11 @@ spec_node_lookup_by_mount(struct mount *mp, vnode_t **vpp) /* * Get the file system mounted on this block device. + * + * XXX Caller should hold the vnode lock -- shared or exclusive -- so + * that this can't changed, and the vnode can't be revoked while we + * examine it. But not all callers do, and they're scattered through a + * lot of file systems, so we can't assert this yet. */ struct mount * spec_node_getmountedfs(vnode_t *devvp) @@ -365,23 +535,51 @@ spec_node_getmountedfs(vnode_t *devvp) /* * Set the file system mounted on this block device. 
+ * + * XXX Caller should hold the vnode lock exclusively so this can't be + * changed or assumed by spec_node_getmountedfs while we change it, and + * the vnode can't be revoked while we handle it. But not all callers + * do, and they're scattered through a lot of file systems, so we can't + * assert this yet. Instead, for now, we'll take an I/O reference so + * at least the ioctl doesn't race with revoke/detach. + * + * If you do change this to assert an exclusive vnode lock, you must + * also do vdead_check before trying bdev_ioctl, because the vnode may + * have been revoked by the time the caller locked it, and this is + * _not_ a vop -- calls to spec_node_setmountedfs don't go through + * v_op, so revoking the vnode doesn't prevent further calls. + * + * XXX Caller should additionally have the vnode open, at least if mp + * is nonnull, but I'm not sure all callers do that -- need to audit. + * Currently udf closes the vnode before clearing the mount. */ void spec_node_setmountedfs(vnode_t *devvp, struct mount *mp) { struct dkwedge_info dkw; + struct specnode *sn; + dev_t dev; + int error; KASSERT(devvp->v_type == VBLK); - KASSERT(devvp->v_specnode->sn_dev->sd_mountpoint == NULL || mp == NULL); - devvp->v_specnode->sn_dev->sd_mountpoint = mp; - if (mp == NULL) - return; - if (bdev_ioctl(devvp->v_rdev, DIOCGWEDGEINFO, &dkw, FREAD, curlwp) != 0) + error = spec_io_enter(devvp, &sn, &dev); + if (error) return; + KASSERT(sn->sn_dev->sd_mountpoint == NULL || mp == NULL); + sn->sn_dev->sd_mountpoint = mp; + if (mp == NULL) + goto out; + + error = bdev_ioctl(dev, DIOCGWEDGEINFO, &dkw, FREAD, curlwp); + if (error) + goto out; + strlcpy(mp->mnt_stat.f_mntfromlabel, dkw.dkw_wname, sizeof(mp->mnt_stat.f_mntfromlabel)); + +out: spec_io_exit(devvp, sn); } /* @@ -393,6 +591,7 @@ spec_node_revoke(vnode_t *vp) { specnode_t *sn; specdev_t *sd; + struct vnode **vpp; sn = vp->v_specnode; sd = sn->sn_dev; @@ -403,10 +602,10 @@ spec_node_revoke(vnode_t *vp) mutex_enter(&device_lock); KASSERT(sn->sn_opencnt <= sd->sd_opencnt); + sn->sn_gone = true; if (sn->sn_opencnt != 0) { sd->sd_opencnt -= (sn->sn_opencnt - 1); sn->sn_opencnt = 1; - sn->sn_gone = true; mutex_exit(&device_lock); VOP_CLOSE(vp, FNONBLOCK, NOCRED); @@ -414,6 +613,29 @@ spec_node_revoke(vnode_t *vp) mutex_enter(&device_lock); KASSERT(sn->sn_opencnt == 0); } + + /* + * In case the revoke happened concurrently with the last + * close, wait for the last close to complete. + */ + while (sd->sd_closing) + cv_wait(&specfs_iocv, &device_lock); + + /* + * Remove from the hash so lookups stop returning this + * specnode. We will dissociate it from the specdev -- and + * possibly free the specdev -- in spec_node_destroy. + */ + KASSERT(sn->sn_gone); + KASSERT(sn->sn_opencnt == 0); + for (vpp = &specfs_hash[SPECHASH(vp->v_rdev)];; + vpp = &(*vpp)->v_specnext) { + if (*vpp == vp) { + *vpp = vp->v_specnext; + vp->v_specnext = NULL; + break; + } + } mutex_exit(&device_lock); } @@ -426,7 +648,6 @@ spec_node_destroy(vnode_t *vp) { specnode_t *sn; specdev_t *sd; - vnode_t **vpp, *vp2; int refcnt; sn = vp->v_specnode; @@ -437,22 +658,6 @@ spec_node_destroy(vnode_t *vp) KASSERT(sn->sn_opencnt == 0); mutex_enter(&device_lock); - /* Remove from the hash and destroy the node. 
*/ - vpp = &specfs_hash[SPECHASH(vp->v_rdev)]; - for (vp2 = *vpp;; vp2 = vp2->v_specnext) { - if (vp2 == NULL) { - panic("spec_node_destroy: corrupt hash"); - } - if (vp2 == vp) { - KASSERT(vp == *vpp); - *vpp = vp->v_specnext; - break; - } - if (vp2->v_specnext == vp) { - vp2->v_specnext = vp->v_specnext; - break; - } - } sn = vp->v_specnode; vp->v_specnode = NULL; refcnt = sd->sd_refcnt--; @@ -461,6 +666,7 @@ spec_node_destroy(vnode_t *vp) /* If the device is no longer in use, destroy our record. */ if (refcnt == 1) { + KASSERT(sd->sd_iocnt == 0); KASSERT(sd->sd_opencnt == 0); KASSERT(sd->sd_bdevvp == NULL); kmem_free(sd, sizeof(*sd)); @@ -498,26 +704,28 @@ spec_open(void *v) int a_mode; kauth_cred_t a_cred; } */ *ap = v; - struct lwp *l; - struct vnode *vp; - dev_t dev; + struct lwp *l = curlwp; + struct vnode *vp = ap->a_vp; + dev_t dev, dev1; int error; enum kauth_device_req req; - specnode_t *sn; + specnode_t *sn, *sn1; specdev_t *sd; spec_ioctl_t ioctl; - u_int gen; - const char *name; + u_int gen = 0; + const char *name = NULL; + bool needclose = false; struct partinfo pi; - - l = curlwp; - vp = ap->a_vp; + + KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE || + (vp->v_vflag & VV_LOCKSWORK) == 0); + KASSERTMSG(vp->v_type == VBLK || vp->v_type == VCHR, "type=%d", + vp->v_type); + dev = vp->v_rdev; sn = vp->v_specnode; sd = sn->sn_dev; - name = NULL; - gen = 0; - + /* * Don't allow open if fs is mounted -nodev. */ @@ -535,28 +743,111 @@ spec_open(void *v) req = KAUTH_REQ_DEVICE_RAWIO_SPEC_READ; break; } + error = kauth_authorize_device_spec(ap->a_cred, req, vp); + if (error != 0) + return (error); + /* + * Acquire an open reference -- as long as we hold onto it, and + * the vnode isn't revoked, it can't be closed, and the vnode + * can't be revoked until we release the vnode lock. + */ + mutex_enter(&device_lock); + KASSERT(!sn->sn_gone); switch (vp->v_type) { case VCHR: - error = kauth_authorize_device_spec(ap->a_cred, req, vp); - if (error != 0) - return (error); - /* * Character devices can accept opens from multiple - * vnodes. + * vnodes. But first, wait for any close to finish. + * Wait under the vnode lock so we don't have to worry + * about the vnode being revoked while we wait. */ - mutex_enter(&device_lock); - if (sn->sn_gone) { - mutex_exit(&device_lock); - return (EBADF); + while (sd->sd_closing) { + error = cv_wait_sig(&specfs_iocv, &device_lock); + if (error) + break; } + if (error) + break; sd->sd_opencnt++; sn->sn_opencnt++; - mutex_exit(&device_lock); + break; + case VBLK: + /* + * For block devices, permit only one open. The buffer + * cache cannot remain self-consistent with multiple + * vnodes holding a block device open. + * + * Treat zero opencnt with non-NULL mountpoint as open. + * This may happen after forced detach of a mounted device. + */ + if (sd->sd_opencnt != 0 || sd->sd_mountpoint != NULL) { + error = EBUSY; + break; + } + KASSERTMSG(sn->sn_opencnt == 0, "%u", sn->sn_opencnt); + sn->sn_opencnt = 1; + sd->sd_opencnt = 1; + sd->sd_bdevvp = vp; + break; + default: + panic("invalid specfs vnode type: %d", vp->v_type); + } + mutex_exit(&device_lock); + if (error) + return error; + + /* + * Set VV_ISTTY if this is a tty cdev. + * + * XXX This does the wrong thing if the module has to be + * autoloaded. 
We should maybe set this after autoloading + * modules and calling .d_open successfully, except (a) we need + * the vnode lock to touch it, and (b) once we acquire the + * vnode lock again, the vnode may have been revoked, and + * deadfs's dead_read needs VV_ISTTY to be already set in order + * to return the right answer. So this needs some additional + * synchronization to be made to work correctly with tty driver + * module autoload. For now, let's just hope it doesn't cause + * too much trouble for a tty from an autoloaded driver module + * to fail with EIO instead of returning EOF. + */ + if (vp->v_type == VCHR) { if (cdev_type(dev) == D_TTY) vp->v_vflag |= VV_ISTTY; - VOP_UNLOCK(vp); + } + + /* + * Because opening the device may block indefinitely, e.g. when + * opening a tty, and loading a module may cross into many + * other subsystems, we must not hold the vnode lock while + * calling .d_open, so release it now and reacquire it when + * done. + * + * Take an I/O reference so that any concurrent spec_close via + * spec_node_revoke will wait for us to finish calling .d_open. + * The vnode can't be dead at this point because we have it + * locked. Note that if revoked, the driver must interrupt + * .d_open before spec_close starts waiting for I/O to drain so + * this doesn't deadlock. + */ + VOP_UNLOCK(vp); + error = spec_io_enter(vp, &sn1, &dev1); + if (error) { + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + return error; + } + KASSERT(sn1 == sn); + KASSERT(dev1 == dev); + + /* + * Open the device. If .d_open returns ENXIO (device not + * configured), the driver may not be loaded, so try + * autoloading a module and then try .d_open again if anything + * got loaded. + */ + switch (vp->v_type) { + case VCHR: do { const struct cdevsw *cdev; @@ -579,36 +870,9 @@ spec_open(void *v) /* Try to autoload device module */ (void) module_autoload(name, MODULE_CLASS_DRIVER); } while (gen != module_gen); - - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); break; case VBLK: - error = kauth_authorize_device_spec(ap->a_cred, req, vp); - if (error != 0) - return (error); - - /* - * For block devices, permit only one open. The buffer - * cache cannot remain self-consistent with multiple - * vnodes holding a block device open. - * - * Treat zero opencnt with non-NULL mountpoint as open. - * This may happen after forced detach of a mounted device. - */ - mutex_enter(&device_lock); - if (sn->sn_gone) { - mutex_exit(&device_lock); - return (EBADF); - } - if (sd->sd_opencnt != 0 || sd->sd_mountpoint != NULL) { - mutex_exit(&device_lock); - return EBUSY; - } - sn->sn_opencnt = 1; - sd->sd_opencnt = 1; - sd->sd_bdevvp = vp; - mutex_exit(&device_lock); do { const struct bdevsw *bdev; @@ -628,49 +892,121 @@ spec_open(void *v) if ((name = bdevsw_getname(major(dev))) == NULL) break; - VOP_UNLOCK(vp); - /* Try to autoload device module */ (void) module_autoload(name, MODULE_CLASS_DRIVER); - - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } while (gen != module_gen); - break; - case VNON: - case VLNK: - case VDIR: - case VREG: - case VBAD: - case VFIFO: - case VSOCK: default: - return 0; + __unreachable(); } + /* + * Release the I/O reference now that we have called .d_open, + * and reacquire the vnode lock. At this point, the device may + * have been revoked, so we must tread carefully. However, sn + * and sd remain valid pointers until we drop our reference. 
+ */ + spec_io_exit(vp, sn); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + KASSERT(vp->v_specnode == sn); + + /* + * If it has been revoked since we released the vnode lock and + * reacquired it, then spec_node_revoke has closed it, and we + * must fail with EBADF. + * + * Otherwise, if opening it failed, back out and release the + * open reference. If it was ever successfully opened and we + * got the last reference this way, it's now our job to close + * it. This might happen in the following scenario: + * + * Thread 1 Thread 2 + * VOP_OPEN + * ... + * .d_open -> 0 (success) + * acquire vnode lock + * do stuff VOP_OPEN + * release vnode lock ... + * .d_open -> EBUSY + * VOP_CLOSE + * acquire vnode lock + * --sd_opencnt != 0 + * => no .d_close + * release vnode lock + * acquire vnode lock + * --sd_opencnt == 0 + * + * We can't resolve this by making spec_close wait for .d_open + * to complete before examining sd_opencnt, because .d_open can + * hang indefinitely, e.g. for a tty. + */ mutex_enter(&device_lock); if (sn->sn_gone) { if (error == 0) error = EBADF; - } else if (error != 0) { + } else if (error == 0) { + sd->sd_opened = true; + } else if (sd->sd_opencnt == 1 && sd->sd_opened) { + /* + * We're the last reference to a _previous_ open even + * though this one failed, so we have to close it. + * Don't decrement the reference count here -- + * spec_close will do that. + */ + KASSERT(sn->sn_opencnt == 1); + needclose = true; + } else { + KASSERT(sd->sd_opencnt); + KASSERT(sn->sn_opencnt); sd->sd_opencnt--; sn->sn_opencnt--; if (vp->v_type == VBLK) sd->sd_bdevvp = NULL; - } mutex_exit(&device_lock); - if (cdev_type(dev) != D_DISK || error != 0) + /* + * If this open failed, but the device was previously opened, + * and another thread concurrently closed the vnode while we + * were in the middle of reopening it, the other thread will + * see sd_opencnt > 0 and thus decide not to call .d_close -- + * it is now our responsibility to do so. + * + * XXX The flags passed to VOP_CLOSE here are wrong, but + * drivers can't rely on FREAD|FWRITE anyway -- e.g., consider + * a device opened by thread 0 with O_READ, then opened by + * thread 1 with O_WRITE, then closed by thread 0, and finally + * closed by thread 1; the last .d_close call will have FWRITE + * but not FREAD. We should just eliminate the FREAD/FWRITE + * parameter to .d_close altogether. + */ + if (needclose) { + KASSERT(error); + VOP_CLOSE(vp, FNONBLOCK, NOCRED); + } + + /* If anything went wrong, we're done. */ + if (error) return error; - - ioctl = vp->v_type == VCHR ? cdev_ioctl : bdev_ioctl; - error = (*ioctl)(vp->v_rdev, DIOCGPARTINFO, &pi, FREAD, curlwp); - if (error == 0) - uvm_vnp_setsize(vp, (voff_t)pi.pi_secsize * pi.pi_size); + /* + * For disk devices, automagically set the vnode size to the + * partition size, if we can. This applies to block devices + * and character devices alike -- every block device must have + * a corresponding character device. And if the module is + * loaded it will remain loaded until we're done here (it is + * forbidden to devsw_detach until closed). So it is safe to + * query cdev_type unconditionally here. + */ + if (cdev_type(dev) == D_DISK) { + ioctl = vp->v_type == VCHR ? cdev_ioctl : bdev_ioctl; + if ((*ioctl)(dev, DIOCGPARTINFO, &pi, FREAD, curlwp) == 0) + uvm_vnp_setsize(vp, + (voff_t)pi.pi_secsize * pi.pi_size); + } + /* Success! 
*/ return 0; } @@ -690,6 +1026,8 @@ spec_read(void *v) struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct lwp *l = curlwp; + struct specnode *sn; + dev_t dev; struct buf *bp; daddr_t bn; int bsize, bscale; @@ -712,9 +1050,27 @@ spec_read(void *v) switch (vp->v_type) { case VCHR: + /* + * Release the lock while we sleep -- possibly + * indefinitely, if this is, e.g., a tty -- in + * cdev_read, so we don't hold up everything else that + * might want access to the vnode. + * + * But before we issue the read, take an I/O reference + * to the specnode so close will know when we're done + * reading. Note that the moment we release the lock, + * the vnode's identity may change; hence spec_io_enter + * may fail, and the caller may have a dead vnode on + * their hands, if the file system on which vp lived + * has been unmounted. + */ VOP_UNLOCK(vp); - error = cdev_read(vp->v_rdev, uio, ap->a_ioflag); - vn_lock(vp, LK_SHARED | LK_RETRY); + error = spec_io_enter(vp, &sn, &dev); + if (error) + goto out; + error = cdev_read(dev, uio, ap->a_ioflag); + spec_io_exit(vp, sn); +out: vn_lock(vp, LK_SHARED | LK_RETRY); return (error); case VBLK: @@ -791,6 +1147,8 @@ spec_write(void *v) struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct lwp *l = curlwp; + struct specnode *sn; + dev_t dev; struct buf *bp; daddr_t bn; int bsize, bscale; @@ -806,9 +1164,27 @@ spec_write(void *v) switch (vp->v_type) { case VCHR: + /* + * Release the lock while we sleep -- possibly + * indefinitely, if this is, e.g., a tty -- in + * cdev_write, so we don't hold up everything else that + * might want access to the vnode. + * + * But before we issue the write, take an I/O reference + * to the specnode so close will know when we're done + * writing. Note that the moment we release the lock, + * the vnode's identity may change; hence spec_io_enter + * may fail, and the caller may have a dead vnode on + * their hands, if the file system on which vp lived + * has been unmounted. + */ VOP_UNLOCK(vp); - error = cdev_write(vp->v_rdev, uio, ap->a_ioflag); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + error = spec_io_enter(vp, &sn, &dev); + if (error) + goto out; + error = cdev_write(dev, uio, ap->a_ioflag); + spec_io_exit(vp, sn); +out: vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); return (error); case VBLK: @@ -866,21 +1242,13 @@ spec_fdiscard(void *v) off_t a_pos; off_t a_len; } */ *ap = v; - struct vnode *vp; + struct vnode *vp = ap->a_vp; dev_t dev; - vp = ap->a_vp; - dev = NODEV; - - mutex_enter(vp->v_interlock); - if (vdead_check(vp, VDEAD_NOWAIT) == 0 && vp->v_specnode != NULL) { - dev = vp->v_rdev; - } - mutex_exit(vp->v_interlock); + KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE || + (vp->v_vflag & VV_LOCKSWORK) == 0); - if (dev == NODEV) { - return ENXIO; - } + dev = vp->v_rdev; switch (vp->v_type) { case VCHR: @@ -909,40 +1277,32 @@ spec_ioctl(void *v) int a_fflag; kauth_cred_t a_cred; } */ *ap = v; - struct vnode *vp; + struct vnode *vp = ap->a_vp; + struct specnode *sn; dev_t dev; + int error; - /* - * Extract all the info we need from the vnode, taking care to - * avoid a race with VOP_REVOKE(). 
- */ - - vp = ap->a_vp; - dev = NODEV; - mutex_enter(vp->v_interlock); - if (vdead_check(vp, VDEAD_NOWAIT) == 0 && vp->v_specnode) { - dev = vp->v_rdev; - } - mutex_exit(vp->v_interlock); - if (dev == NODEV) { - return ENXIO; - } + error = spec_io_enter(vp, &sn, &dev); + if (error) + return error; switch (vp->v_type) { - case VCHR: - return cdev_ioctl(dev, ap->a_command, ap->a_data, + error = cdev_ioctl(dev, ap->a_command, ap->a_data, ap->a_fflag, curlwp); - + break; case VBLK: KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp); - return bdev_ioctl(dev, ap->a_command, ap->a_data, + error = bdev_ioctl(dev, ap->a_command, ap->a_data, ap->a_fflag, curlwp); - + break; default: panic("spec_ioctl"); /* NOTREACHED */ } + + spec_io_exit(vp, sn); + return error; } /* ARGSUSED */
@@ -953,33 +1313,25 @@ spec_poll(void *v) struct vnode *a_vp; int a_events; } */ *ap = v; - struct vnode *vp; + struct vnode *vp = ap->a_vp; + struct specnode *sn; dev_t dev; + int revents; - /* - * Extract all the info we need from the vnode, taking care to - * avoid a race with VOP_REVOKE(). - */ - - vp = ap->a_vp; - dev = NODEV; - mutex_enter(vp->v_interlock); - if (vdead_check(vp, VDEAD_NOWAIT) == 0 && vp->v_specnode) { - dev = vp->v_rdev; - } - mutex_exit(vp->v_interlock); - if (dev == NODEV) { + if (spec_io_enter(vp, &sn, &dev) != 0) return POLLERR; - } switch (vp->v_type) { - case VCHR: - return cdev_poll(dev, ap->a_events, curlwp); - + revents = cdev_poll(dev, ap->a_events, curlwp); + break; default: - return (genfs_poll(v)); + revents = genfs_poll(v); + break; } + + spec_io_exit(vp, sn); + return revents; } /* ARGSUSED */
@@ -990,20 +1342,30 @@ spec_kqfilter(void *v) struct vnode *a_vp; struct proc *a_kn; } */ *ap = v; + struct vnode *vp = ap->a_vp; + struct specnode *sn; dev_t dev; + int error; - switch (ap->a_vp->v_type) { + error = spec_io_enter(vp, &sn, &dev); + if (error) + return error; + switch (vp->v_type) { case VCHR: - dev = ap->a_vp->v_rdev; - return cdev_kqfilter(dev, ap->a_kn); + error = cdev_kqfilter(dev, ap->a_kn); + break; default: /* * Block devices don't support kqfilter, and refuse it * for any other files (like those vflush()ed) too. */ - return (EOPNOTSUPP); + error = EOPNOTSUPP; + break; } + + spec_io_exit(vp, sn); + return error; } /*
@@ -1018,11 +1380,19 @@ spec_mmap(void *v) kauth_cred_t a_cred; } */ *ap = v; struct vnode *vp = ap->a_vp; + struct specnode *sn; + dev_t dev; + int error; KASSERT(vp->v_type == VBLK); - if (bdev_type(vp->v_rdev) != D_DISK) - return EINVAL; + error = spec_io_enter(vp, &sn, &dev); + if (error) + return error; + + error = bdev_type(dev) == D_DISK ? 0 : EINVAL; + + spec_io_exit(vp, sn); - return 0; + return error; }
@@ -1067,27 +1437,14 @@ spec_strategy(void *v) } */ *ap = v; struct vnode *vp = ap->a_vp; struct buf *bp = ap->a_bp; + struct specnode *sn = NULL; dev_t dev; int error; - dev = NODEV; - - /* - * Extract all the info we need from the vnode, taking care to - * avoid a race with VOP_REVOKE().
- */ - - mutex_enter(vp->v_interlock); - if (vdead_check(vp, VDEAD_NOWAIT) == 0 && vp->v_specnode != NULL) { - KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp); - dev = vp->v_rdev; - } - mutex_exit(vp->v_interlock); - - if (dev == NODEV) { - error = ENXIO; + error = spec_io_enter(vp, &sn, &dev); + if (error) goto out; - } + bp->b_dev = dev; if (!(bp->b_flags & B_READ)) { @@ -1107,13 +1464,15 @@ spec_strategy(void *v) } bdev_strategy(bp); - return 0; - -out: - bp->b_error = error; - bp->b_resid = bp->b_bcount; - biodone(bp); + error = 0; +out: if (sn) + spec_io_exit(vp, sn); + if (error) { + bp->b_error = error; + bp->b_resid = bp->b_bcount; + biodone(bp); + } return error; } @@ -1139,6 +1498,8 @@ spec_reclaim(void *v) } */ *ap = v; struct vnode *vp = ap->a_vp; + KASSERT(vp->v_specnode->sn_opencnt == 0); + VOP_UNLOCK(vp); KASSERT(vp->v_mount == dead_rootmount); @@ -1182,14 +1543,18 @@ spec_close(void *v) } */ *ap = v; struct vnode *vp = ap->a_vp; struct session *sess; - dev_t dev = vp->v_rdev; + dev_t dev; int flags = ap->a_fflag; int mode, error, count; specnode_t *sn; specdev_t *sd; + KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE || + (vp->v_vflag & VV_LOCKSWORK) == 0); + mutex_enter(vp->v_interlock); sn = vp->v_specnode; + dev = vp->v_rdev; sd = sn->sn_dev; /* * If we're going away soon, make this non-blocking. @@ -1269,14 +1634,48 @@ spec_close(void *v) panic("spec_close: not special"); } + /* + * Decrement the open reference count of this node and the + * device. For block devices, the open reference count must be + * 1 at this point. If the device's open reference count goes + * to zero, we're the last one out so get the lights. + * + * We may find --sd->sd_opencnt gives zero, and yet + * sd->sd_opened is false. This happens if the vnode is + * revoked at the same time as it is being opened, which can + * happen when opening a tty blocks indefinitely. In that + * case, we still must call close -- it is the job of close to + * interrupt the open. Either way, the device will be no + * longer opened, so we have to clear sd->sd_opened; subsequent + * opens will have responsibility for issuing close. + * + * This has the side effect that the sequence of opens might + * happen out of order -- we might end up doing open, open, + * close, close, instead of open, close, open, close. This is + * unavoidable with the current devsw API, where open is + * allowed to block and close must be able to run concurrently + * to interrupt it. It is the driver's responsibility to + * ensure that close is idempotent so that this works. Drivers + * requiring per-open state and exact 1:1 correspondence + * between open and close can use fd_clone. + */ mutex_enter(&device_lock); + KASSERT(sn->sn_opencnt); + KASSERT(sd->sd_opencnt); sn->sn_opencnt--; count = --sd->sd_opencnt; - if (vp->v_type == VBLK) + if (vp->v_type == VBLK) { + KASSERTMSG(count == 0, "block device with %u opens", + count + 1); sd->sd_bdevvp = NULL; + } + if (count == 0) { + sd->sd_opened = false; + sd->sd_closing = true; + } mutex_exit(&device_lock); - if (count != 0 && (vp->v_type != VCHR || !(cdev_flags(dev) & D_MCLOSE))) + if (count != 0) return 0; /* @@ -1292,6 +1691,24 @@ spec_close(void *v) else error = cdev_close(dev, flags, mode, curlwp); + /* + * Wait for all other devsw operations to drain. After this + * point, no bdev/cdev_* can be active for this specdev. 
+ */ + spec_io_drain(sd); + + /* + * Wake any spec_open calls waiting for close to finish -- do + * this before reacquiring the vnode lock, because spec_open + * holds the vnode lock while waiting, so doing this after + * reacquiring the lock would deadlock. + */ + mutex_enter(&device_lock); + KASSERT(sd->sd_closing); + sd->sd_closing = false; + cv_broadcast(&specfs_iocv); + mutex_exit(&device_lock); + if (!(flags & FNONBLOCK)) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); diff --git a/sys/miscfs/specfs/specdev.h b/sys/miscfs/specfs/specdev.h index 621c103e80b1..8bb9b4de2b2a 100644 --- a/sys/miscfs/specfs/specdev.h +++ b/sys/miscfs/specfs/specdev.h @@ -66,8 +66,8 @@ typedef struct specnode { vnode_t *sn_next; struct specdev *sn_dev; - u_int sn_opencnt; dev_t sn_rdev; + u_int sn_opencnt; /* # of opens, share of sd_opencnt */ bool sn_gone; } specnode_t; @@ -75,9 +75,12 @@ typedef struct specdev { struct mount *sd_mountpoint; struct lockf *sd_lockf; vnode_t *sd_bdevvp; - u_int sd_opencnt; - u_int sd_refcnt; + u_int sd_opencnt; /* # of opens; close when ->0 */ + u_int sd_refcnt; /* # of specnodes referencing this */ dev_t sd_rdev; + volatile u_int sd_iocnt; /* # bdev/cdev_* operations active */ + bool sd_opened; /* true if successfully opened */ + bool sd_closing; /* true when bdev/cdev_close ongoing */ } specdev_t; /* @@ -92,7 +95,7 @@ typedef struct specdev { */ void spec_node_init(vnode_t *, dev_t); void spec_node_destroy(vnode_t *); -int spec_node_lookup_by_dev(enum vtype, dev_t, vnode_t **); +int spec_node_lookup_by_dev(enum vtype, dev_t, int, vnode_t **); int spec_node_lookup_by_mount(struct mount *, vnode_t **); struct mount *spec_node_getmountedfs(vnode_t *); void spec_node_setmountedfs(vnode_t *, struct mount *); diff --git a/sys/modules/examples/pollpal/pollpal.c b/sys/modules/examples/pollpal/pollpal.c index b76e0733699b..d8ddc73450e0 100644 --- a/sys/modules/examples/pollpal/pollpal.c +++ b/sys/modules/examples/pollpal/pollpal.c @@ -311,7 +311,8 @@ pollpal_modcmd(modcmd_t cmd, void *arg __unused) case MODULE_CMD_FINI: if (pollpal_nopen != 0) return EBUSY; - return devsw_detach(NULL, &pollpal_cdevsw); + devsw_detach(NULL, &pollpal_cdevsw); + return 0; default: return ENOTTY; } diff --git a/sys/net/if_tap.c b/sys/net/if_tap.c index 0b57ad4a711c..314f4647707c 100644 --- a/sys/net/if_tap.c +++ b/sys/net/if_tap.c @@ -256,9 +256,7 @@ tapdetach(void) if_clone_detach(&tap_cloners); #ifdef _MODULE - error = devsw_detach(NULL, &tap_cdevsw); - if (error != 0) - goto out2; + devsw_detach(NULL, &tap_cdevsw); #endif if (tap_count != 0) { @@ -277,7 +275,6 @@ tapdetach(void) out1: #ifdef _MODULE devsw_attach("tap", NULL, &tap_bmajor, &tap_cdevsw, &tap_cmajor); - out2: #endif if_clone_attach(&tap_cloners); diff --git a/sys/net/if_tun.c b/sys/net/if_tun.c index 59071ddc7ad2..73622837719e 100644 --- a/sys/net/if_tun.c +++ b/sys/net/if_tun.c @@ -142,17 +142,10 @@ tuninit(void) static int tundetach(void) { -#ifdef _MODULE - int error; -#endif if_clone_detach(&tun_cloner); #ifdef _MODULE - error = devsw_detach(NULL, &tun_cdevsw); - if (error != 0) { - if_clone_attach(&tun_cloner); - return error; - } + devsw_detach(NULL, &tun_cdevsw); #endif if (!LIST_EMPTY(&tun_softc_list) || !LIST_EMPTY(&tunz_softc_list)) { diff --git a/sys/rump/dev/lib/libbpf/bpf_component.c b/sys/rump/dev/lib/libbpf/bpf_component.c index 05807d371d40..d41d1987afe8 100644 --- a/sys/rump/dev/lib/libbpf/bpf_component.c +++ b/sys/rump/dev/lib/libbpf/bpf_component.c @@ -50,6 +50,5 @@ RUMP_COMPONENT(RUMP_COMPONENT_NET) panic("bpf devsw attach 
failed: %d", error); if ((error = rump_vfs_makeonedevnode(S_IFCHR, "/dev/bpf", cmaj, 0)) !=0) panic("cannot create bpf device nodes: %d", error); - if ((error = devsw_detach(NULL, &bpf_cdevsw)) != 0) - panic("cannot detach bpf devsw: %d", error); + devsw_detach(NULL, &bpf_cdevsw); } diff --git a/sys/rump/dev/lib/libdrvctl/drvctl_component.c b/sys/rump/dev/lib/libdrvctl/drvctl_component.c index e2e79f45f9de..ac4e103fdb9c 100644 --- a/sys/rump/dev/lib/libdrvctl/drvctl_component.c +++ b/sys/rump/dev/lib/libdrvctl/drvctl_component.c @@ -51,7 +51,5 @@ RUMP_COMPONENT(RUMP_COMPONENT_DEV) if ( error !=0) panic("cannot create drvctl device node: %d", error); - error = devsw_detach(NULL, &drvctl_cdevsw); - if (error != 0) - panic("cannot detach drvctl devsw: %d", error); + devsw_detach(NULL, &drvctl_cdevsw); } diff --git a/sys/sys/conf.h b/sys/sys/conf.h index 081631d2111f..210a237b8c43 100644 --- a/sys/sys/conf.h +++ b/sys/sys/conf.h @@ -63,7 +63,7 @@ struct vnode; #define D_TYPEMASK 0x00ff #define D_MPSAFE 0x0100 #define D_NEGOFFSAFE 0x0200 -#define D_MCLOSE 0x0400 +#define D_UNUSED0 0x0400 /* was D_MCLOSE */ /* * Block device switch table @@ -76,6 +76,8 @@ struct bdevsw { int (*d_dump)(dev_t, daddr_t, void *, size_t); int (*d_psize)(dev_t); int (*d_discard)(dev_t, off_t, off_t); + int (*d_devtounit)(dev_t); + struct cfdriver *d_cfdriver; int d_flag; }; @@ -94,6 +96,8 @@ struct cdevsw { paddr_t (*d_mmap)(dev_t, off_t, int); int (*d_kqfilter)(dev_t, struct knote *); int (*d_discard)(dev_t, off_t, off_t); + int (*d_devtounit)(dev_t); + struct cfdriver *d_cfdriver; int d_flag; }; @@ -104,7 +108,7 @@ extern kmutex_t device_lock; int devsw_attach(const char *, const struct bdevsw *, devmajor_t *, const struct cdevsw *, devmajor_t *); -int devsw_detach(const struct bdevsw *, const struct cdevsw *); +void devsw_detach(const struct bdevsw *, const struct cdevsw *); const struct bdevsw *bdevsw_lookup(dev_t); const struct cdevsw *cdevsw_lookup(dev_t); devmajor_t bdevsw_lookup_major(const struct bdevsw *); @@ -168,6 +172,8 @@ dev_type_dump(bdev_dump); dev_type_size(bdev_size); dev_type_discard(bdev_discard); +void bdev_detached(dev_t); + dev_type_open(cdev_open); dev_type_close(cdev_close); dev_type_read(cdev_read); @@ -180,6 +186,8 @@ dev_type_mmap(cdev_mmap); dev_type_kqfilter(cdev_kqfilter); dev_type_discard(cdev_discard); +void cdev_detached(dev_t); + int cdev_type(dev_t); int cdev_flags(dev_t); int bdev_type(dev_t); @@ -276,6 +284,7 @@ devmajor_t devsw_name2blk(const char *, char *, size_t); devmajor_t devsw_name2chr(const char *, char *, size_t); dev_t devsw_chr2blk(dev_t); dev_t devsw_blk2chr(dev_t); +int dev_minor_unit(dev_t); void mm_init(void); #endif /* _KERNEL */ diff --git a/sys/sys/device.h b/sys/sys/device.h index a443878d9810..cfd324a1ac2d 100644 --- a/sys/sys/device.h +++ b/sys/sys/device.h @@ -274,11 +274,14 @@ struct device { void *dv_private; /* this device's private storage */ int *dv_locators; /* our actual locators (optional) */ prop_dictionary_t dv_properties;/* properties dictionary */ + struct localcount *dv_localcount;/* reference count */ int dv_pending; /* config_pending count */ TAILQ_ENTRY(device) dv_pending_list; + struct lwp *dv_attaching; /* thread not yet finished in attach */ struct lwp *dv_detaching; /* detach lock (config_misc_lock/cv) */ + bool dv_detached; /* config_misc_lock */ size_t dv_activity_count; void (**dv_activity_handlers)(device_t, devactive_t); @@ -629,6 +632,7 @@ device_t config_attach_pseudo(cfdata_t); int config_detach(device_t, int); int 
config_detach_children(device_t, int flags); +void config_detach_commit(device_t); bool config_detach_all(int); int config_deactivate(device_t); void config_defer(device_t, void (*)(device_t)); @@ -651,6 +655,10 @@ void null_childdetached(device_t, device_t); device_t device_lookup(cfdriver_t, int); void *device_lookup_private(cfdriver_t, int); + +device_t device_lookup_acquire(cfdriver_t, int); +void device_release(device_t); + void device_register(device_t, void *); void device_register_post_config(device_t, void *); diff --git a/sys/sys/disklabel.h b/sys/sys/disklabel.h index 4e94b8671332..853cdbe668a3 100644 --- a/sys/sys/disklabel.h +++ b/sys/sys/disklabel.h @@ -509,6 +509,7 @@ const char *convertdisklabel(struct disklabel *, void (*)(struct buf *), int bounds_check_with_label(struct disk *, struct buf *, int); int bounds_check_with_mediasize(struct buf *, int, uint64_t); const char *getfstypename(int); +int disklabel_dev_unit(dev_t); #endif #endif /* _LOCORE */
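
The d_devtounit and d_cfdriver members added to struct bdevsw and struct cdevsw above, together with dev_minor_unit(), disklabel_dev_unit(), dv_localcount, and device_lookup_acquire()/device_release(), are only declared in this diff; no caller is visible here. Presumably they let the device-open path translate a dev_t into its autoconf unit number and hold a reference on the matching device_t so that detach can wait for opens in flight. The following is a minimal illustrative sketch of how a driver and the open path might use these interfaces; the "example" driver, its cfdriver example_cd, and example_open_pinned() are hypothetical names, not part of this change.

/*
 * Hypothetical sketch -- not part of this diff.  Only the interfaces
 * declared above (d_devtounit, d_cfdriver, dev_minor_unit,
 * disklabel_dev_unit, device_lookup_acquire, device_release) are
 * assumed to exist.
 */
#include <sys/param.h>
#include <sys/conf.h>
#include <sys/device.h>

extern struct cfdriver example_cd;	/* hypothetical autoconf driver */

dev_type_open(example_open);
dev_type_close(example_close);
dev_type_read(example_read);
dev_type_write(example_write);

const struct cdevsw example_cdevsw = {
	.d_open = example_open,
	.d_close = example_close,
	.d_read = example_read,
	.d_write = example_write,
	/* ... remaining entry points ... */
	.d_devtounit = dev_minor_unit,	/* minor number is the unit number */
	.d_cfdriver = &example_cd,
	.d_flag = D_OTHER | D_MPSAFE,
};

/*
 * A disk driver whose dev_t encodes a partition would presumably use
 * disklabel_dev_unit instead of dev_minor_unit.
 *
 * With the members filled in, an open path could pin the autoconf
 * instance across .d_open roughly like this (sketch only; the real
 * caller is not shown in this diff):
 */
static int
example_open_pinned(dev_t dev, int flag, int mode, struct lwp *l)
{
	const struct cdevsw *cd;
	device_t dv;
	int unit, error;

	if ((cd = cdevsw_lookup(dev)) == NULL)
		return ENXIO;
	if (cd->d_devtounit == NULL || cd->d_cfdriver == NULL)
		return (*cd->d_open)(dev, flag, mode, l);

	if ((unit = (*cd->d_devtounit)(dev)) < 0)
		return ENXIO;
	/*
	 * Assumed: device_lookup_acquire holds dv_localcount so that a
	 * concurrent config_detach waits for us to call device_release.
	 */
	if ((dv = device_lookup_acquire(cd->d_cfdriver, unit)) == NULL)
		return ENXIO;
	error = (*cd->d_open)(dev, flag, mode, l);
	device_release(dv);
	return error;
}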