From 86889565a6fb344090ade2abc0c652796b40e39b Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Wed, 19 Jan 2022 12:52:23 +0000 Subject: [PATCH 27/36] specfs: Drain all I/O operations after last .d_close call. New kind of I/O reference on specdevs, sd_iocnt. This could be done with psref instead; I chose a reference count instead for now because we already have to take a per-object lock anyway, v_interlock, for vdead_check, so another atomic is not likely to hurt much more. We can always change the mechanism inside spec_io_enter/exit/drain later on. Make sure every access to vp->v_rdev or vp->v_specnode and every call to a devsw operation is protected either: - by the vnode lock (with vdead_check if we unlocked/relocked), - by positive sd_opencnt, - by spec_io_enter/exit, or - by sd_opencnt management in open/close. --- sys/miscfs/specfs/spec_vnops.c | 339 ++++++++++++++++++++++++--------- sys/miscfs/specfs/specdev.h | 1 + 2 files changed, 252 insertions(+), 88 deletions(-) diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c index dccba1d0ebcb..037975f6f96d 100644 --- a/sys/miscfs/specfs/spec_vnops.c +++ b/sys/miscfs/specfs/spec_vnops.c @@ -81,6 +81,7 @@ __KERNEL_RCSID(0, "$NetBSD: spec_vnops.c,v 1.183 2021/07/18 23:57:14 dholland Ex #include #include #include +#include #include #include @@ -165,6 +166,7 @@ const struct vnodeopv_desc spec_vnodeop_opv_desc = { &spec_vnodeop_p, spec_vnodeop_entries }; static kauth_listener_t rawio_listener; +static struct kcondvar specfs_iocv; /* Returns true if vnode is /dev/mem or /dev/kmem. */ bool @@ -210,6 +212,124 @@ spec_init(void) rawio_listener = kauth_listen_scope(KAUTH_SCOPE_DEVICE, rawio_listener_cb, NULL); + cv_init(&specfs_iocv, "specio"); +} + +/* + * spec_io_enter(vp, &sn, &dev) + * + * Enter an operation that may not hold vp's vnode lock or an + * fstrans on vp's mount. Until spec_io_exit, the vnode will not + * be revoked. + * + * On success, set sn to the specnode pointer and dev to the dev_t + * number and return zero. Caller must later call spec_io_exit + * when done. + * + * On failure, return ENXIO -- the device has been revoked and no + * longer exists. + */ +static int +spec_io_enter(struct vnode *vp, struct specnode **snp, dev_t *devp) +{ + dev_t dev; + struct specnode *sn; + unsigned iocnt; + int error = 0; + + mutex_enter(vp->v_interlock); + + /* + * Extract all the info we need from the vnode, unless the + * vnode has already been reclaimed. This can happen if the + * underlying device has been removed and all the device nodes + * for it have been revoked. The caller may not hold a vnode + * lock or fstrans to prevent this from happening before it has + * had an opportunity to notice the vnode is dead. + */ + if (vdead_check(vp, VDEAD_NOWAIT) != 0 || + (sn = vp->v_specnode) == NULL || + (dev = vp->v_rdev) == NODEV) { + error = ENXIO; + goto out; + } + + /* + * Notify spec_node_revoke that we are doing an I/O operation + * which may not be not bracketed by fstrans(9) and thus is not + * blocked by vfs suspension. + * + * We could hold this reference with psref(9) instead, but we + * already have to take the interlock for vdead_check, so + * there's not much more cost here to another atomic operation. + */ + iocnt = atomic_inc_uint_nv(&sn->sn_dev->sd_iocnt); + CTASSERT(MAXLWP < UINT_MAX); + KASSERT(iocnt < UINT_MAX); + + /* Success! */ + *snp = sn; + *devp = dev; + error = 0; + +out: mutex_exit(vp->v_interlock); + return error; +} + +/* + * spec_io_exit(vp, sn) + * + * Exit an operation entered with a successful spec_io_enter -- + * allow concurrent spec_node_revoke to proceed. The argument sn + * must match the struct specnode pointer returned by spec_io_exit + * for vp. + */ +static void +spec_io_exit(struct vnode *vp, struct specnode *sn) +{ + struct specdev *sd = sn->sn_dev; + unsigned iocnt; + + KASSERT(vp->v_specnode == sn); + + /* + * We are done. Notify spec_node_revoke if appropriate. The + * transition of 1 -> 0 must happen under device_lock so + * spec_node_revoke doesn't miss a wakeup. + */ + do { + iocnt = atomic_load_relaxed(&sd->sd_iocnt); + if (iocnt == 1) { + mutex_enter(&device_lock); + if (atomic_dec_uint_nv(&sd->sd_iocnt) == 0) + cv_broadcast(&specfs_iocv); + mutex_exit(&device_lock); + break; + } + } while (atomic_cas_uint(&sd->sd_iocnt, iocnt, iocnt - 1) != iocnt); +} + +/* + * spec_io_drain(sd) + * + * Wait for all existing spec_io_enter/exit sections to complete. + * Caller must ensure spec_io_enter will fail at this point. + */ +static void +spec_io_drain(struct specdev *sd) +{ + + /* + * I/O at the same time as closing is unlikely -- it often + * indicates an application bug. + */ + if (__predict_true(atomic_load_relaxed(&sd->sd_iocnt) == 0)) + return; + + mutex_enter(&device_lock); + while (atomic_load_relaxed(&sd->sd_iocnt) > 0) + cv_wait(&specfs_iocv, &device_lock); + mutex_exit(&device_lock); } /* @@ -250,6 +370,7 @@ spec_node_init(vnode_t *vp, dev_t rdev) sd->sd_refcnt = 1; sd->sd_opencnt = 0; sd->sd_bdevvp = NULL; + sd->sd_iocnt = 0; sd->sd_opened = false; sn->sn_dev = sd; sd = NULL; @@ -464,6 +585,7 @@ spec_node_destroy(vnode_t *vp) /* If the device is no longer in use, destroy our record. */ if (refcnt == 1) { + KASSERT(sd->sd_iocnt == 0); KASSERT(sd->sd_opencnt == 0); KASSERT(sd->sd_bdevvp == NULL); kmem_free(sd, sizeof(*sd)); @@ -501,30 +623,28 @@ spec_open(void *v) int a_mode; kauth_cred_t a_cred; } */ *ap = v; - struct lwp *l; - struct vnode *vp; + struct lwp *l = curlwp; + struct vnode *vp = ap->a_vp; dev_t dev; int error; enum kauth_device_req req; specnode_t *sn; specdev_t *sd; spec_ioctl_t ioctl; - u_int gen; - const char *name; + u_int gen = 0; + const char *name = NULL; bool needclose = false; struct partinfo pi; - - l = curlwp; - vp = ap->a_vp; - dev = vp->v_rdev; - sn = vp->v_specnode; - sd = sn->sn_dev; - name = NULL; - gen = 0; + KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE || + (vp->v_vflag & VV_LOCKSWORK) == 0); KASSERTMSG(vp->v_type == VBLK || vp->v_type == VCHR, "type=%d", vp->v_type); + dev = vp->v_rdev; + sn = vp->v_specnode; + sd = sn->sn_dev; + /* * Don't allow open if fs is mounted -nodev. */ @@ -794,6 +914,8 @@ spec_read(void *v) struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct lwp *l = curlwp; + struct specnode *sn; + dev_t dev; struct buf *bp; daddr_t bn; int bsize, bscale; @@ -816,9 +938,27 @@ spec_read(void *v) switch (vp->v_type) { case VCHR: + /* + * Release the lock while we sleep -- possibly + * indefinitely, if this is, e.g., a tty -- in + * cdev_read, so we don't hold up everything else that + * might want access to the vnode. + * + * But before we issue the read, take an I/O reference + * to the specnode so close will know when we're done + * writing. Note that the moment we release the lock, + * the vnode's identity may change; hence spec_io_enter + * may fail, and the caller may have a dead vnode on + * their hands, if the file system on which vp lived + * has been unmounted. + */ VOP_UNLOCK(vp); - error = cdev_read(vp->v_rdev, uio, ap->a_ioflag); - vn_lock(vp, LK_SHARED | LK_RETRY); + error = spec_io_enter(vp, &sn, &dev); + if (error) + goto out; + error = cdev_read(dev, uio, ap->a_ioflag); + spec_io_exit(vp, sn); +out: vn_lock(vp, LK_SHARED | LK_RETRY); return (error); case VBLK: @@ -895,6 +1035,8 @@ spec_write(void *v) struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct lwp *l = curlwp; + struct specnode *sn; + dev_t dev; struct buf *bp; daddr_t bn; int bsize, bscale; @@ -910,9 +1052,27 @@ spec_write(void *v) switch (vp->v_type) { case VCHR: + /* + * Release the lock while we sleep -- possibly + * indefinitely, if this is, e.g., a tty -- in + * cdev_write, so we don't hold up everything else that + * might want access to the vnode. + * + * But before we issue the write, take an I/O reference + * to the specnode so close will know when we're done + * writing. Note that the moment we release the lock, + * the vnode's identity may change; hence spec_io_enter + * may fail, and the caller may have a dead vnode on + * their hands, if the file system on which vp lived + * has been unmounted. + */ VOP_UNLOCK(vp); - error = cdev_write(vp->v_rdev, uio, ap->a_ioflag); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + error = spec_io_enter(vp, &sn, &dev); + if (error) + goto out; + error = cdev_write(dev, uio, ap->a_ioflag); + spec_io_exit(vp, sn); +out: vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); return (error); case VBLK: @@ -970,10 +1130,12 @@ spec_fdiscard(void *v) off_t a_pos; off_t a_len; } */ *ap = v; - struct vnode *vp; + struct vnode *vp = ap->a_vp; dev_t dev; - vp = ap->a_vp; + KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE || + (vp->v_vflag & VV_LOCKSWORK) == 0); + dev = vp->v_rdev; switch (vp->v_type) { @@ -1003,40 +1165,32 @@ spec_ioctl(void *v) int a_fflag; kauth_cred_t a_cred; } */ *ap = v; - struct vnode *vp; + struct vnode *vp = ap->a_vp; + struct specnode *sn; dev_t dev; + int error; - /* - * Extract all the info we need from the vnode, taking care to - * avoid a race with VOP_REVOKE(). - */ - - vp = ap->a_vp; - dev = NODEV; - mutex_enter(vp->v_interlock); - if (vdead_check(vp, VDEAD_NOWAIT) == 0 && vp->v_specnode) { - dev = vp->v_rdev; - } - mutex_exit(vp->v_interlock); - if (dev == NODEV) { - return ENXIO; - } + error = spec_io_enter(vp, &sn, &dev); + if (error) + return error; switch (vp->v_type) { - case VCHR: - return cdev_ioctl(dev, ap->a_command, ap->a_data, + error = cdev_ioctl(dev, ap->a_command, ap->a_data, ap->a_fflag, curlwp); - + break; case VBLK: KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp); - return bdev_ioctl(dev, ap->a_command, ap->a_data, + error = bdev_ioctl(dev, ap->a_command, ap->a_data, ap->a_fflag, curlwp); - + break; default: panic("spec_ioctl"); /* NOTREACHED */ } + + spec_io_exit(vp, sn); + return error; } /* ARGSUSED */ @@ -1047,33 +1201,25 @@ spec_poll(void *v) struct vnode *a_vp; int a_events; } */ *ap = v; - struct vnode *vp; + struct vnode *vp = ap->a_vp; + struct specnode *sn; dev_t dev; + int revents; - /* - * Extract all the info we need from the vnode, taking care to - * avoid a race with VOP_REVOKE(). - */ - - vp = ap->a_vp; - dev = NODEV; - mutex_enter(vp->v_interlock); - if (vdead_check(vp, VDEAD_NOWAIT) == 0 && vp->v_specnode) { - dev = vp->v_rdev; - } - mutex_exit(vp->v_interlock); - if (dev == NODEV) { + if (spec_io_enter(vp, &sn, &dev) != 0) return POLLERR; - } switch (vp->v_type) { - case VCHR: - return cdev_poll(dev, ap->a_events, curlwp); - + revents = cdev_poll(dev, ap->a_events, curlwp); + break; default: - return (genfs_poll(v)); + revents = genfs_poll(v); + break; } + + spec_io_exit(vp, sn); + return revents; } /* ARGSUSED */ @@ -1084,20 +1230,30 @@ spec_kqfilter(void *v) struct vnode *a_vp; struct proc *a_kn; } */ *ap = v; + struct vnode *vp = ap->a_vp; + struct specnode *sn; dev_t dev; + int error; - switch (ap->a_vp->v_type) { + error = spec_io_enter(vp, &sn, &dev); + if (error) + return error; + switch (vp->v_type) { case VCHR: - dev = ap->a_vp->v_rdev; - return cdev_kqfilter(dev, ap->a_kn); + error = cdev_kqfilter(dev, ap->a_kn); + break; default: /* * Block devices don't support kqfilter, and refuse it * for any other files (like those vflush()ed) too. */ - return (EOPNOTSUPP); + error = EOPNOTSUPP; + break; } + + spec_io_exit(vp, sn); + return error; } /* @@ -1112,11 +1268,19 @@ spec_mmap(void *v) kauth_cred_t a_cred; } */ *ap = v; struct vnode *vp = ap->a_vp; + struct specnode *sn; + dev_t dev; + int error; KASSERT(vp->v_type == VBLK); - if (bdev_type(vp->v_rdev) != D_DISK) - return EINVAL; + error = spec_io_enter(vp, &sn, &dev); + if (error) + return error; + + error = bdev_type(dev) == D_DISK ? 0 : EINVAL; + + spec_io_exit(vp, sn); return 0; } @@ -1161,27 +1325,14 @@ spec_strategy(void *v) } */ *ap = v; struct vnode *vp = ap->a_vp; struct buf *bp = ap->a_bp; + struct specnode *sn = NULL; dev_t dev; int error; - dev = NODEV; - - /* - * Extract all the info we need from the vnode, taking care to - * avoid a race with VOP_REVOKE(). - */ - - mutex_enter(vp->v_interlock); - if (vdead_check(vp, VDEAD_NOWAIT) == 0 && vp->v_specnode != NULL) { - KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp); - dev = vp->v_rdev; - } - mutex_exit(vp->v_interlock); - - if (dev == NODEV) { - error = ENXIO; + error = spec_io_enter(vp, &sn, &dev); + if (error) goto out; - } + bp->b_dev = dev; if (!(bp->b_flags & B_READ)) { @@ -1201,13 +1352,15 @@ spec_strategy(void *v) } bdev_strategy(bp); - return 0; - -out: - bp->b_error = error; - bp->b_resid = bp->b_bcount; - biodone(bp); + error = 0; +out: if (sn) + spec_io_exit(vp, sn); + if (error) { + bp->b_error = error; + bp->b_resid = bp->b_bcount; + biodone(bp); + } return error; } @@ -1278,14 +1431,18 @@ spec_close(void *v) } */ *ap = v; struct vnode *vp = ap->a_vp; struct session *sess; - dev_t dev = vp->v_rdev; + dev_t dev; int flags = ap->a_fflag; int mode, error, count; specnode_t *sn; specdev_t *sd; + KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE || + (vp->v_vflag & VV_LOCKSWORK) == 0); + mutex_enter(vp->v_interlock); sn = vp->v_specnode; + dev = vp->v_rdev; sd = sn->sn_dev; /* * If we're going away soon, make this non-blocking. @@ -1418,6 +1575,12 @@ spec_close(void *v) else error = cdev_close(dev, flags, mode, curlwp); + /* + * Wait for all other devsw operations to drain. After this + * point, no bdev/cdev_* can be active for this specdev. + */ + spec_io_drain(sd); + if (!(flags & FNONBLOCK)) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); diff --git a/sys/miscfs/specfs/specdev.h b/sys/miscfs/specfs/specdev.h index 018679ffaef2..91e7af9bf09b 100644 --- a/sys/miscfs/specfs/specdev.h +++ b/sys/miscfs/specfs/specdev.h @@ -78,6 +78,7 @@ typedef struct specdev { u_int sd_opencnt; /* # of opens; close when ->0 */ u_int sd_refcnt; /* # of specnodes referencing this */ dev_t sd_rdev; + volatile u_int sd_iocnt; /* # bdev/cdev_* operations active */ bool sd_opened; /* true if successfully opened */ } specdev_t;