From 86889565a6fb344090ade2abc0c652796b40e39b Mon Sep 17 00:00:00 2001
From: Taylor R Campbell <riastradh@NetBSD.org>
Date: Wed, 19 Jan 2022 12:52:23 +0000
Subject: [PATCH 27/36] specfs: Drain all I/O operations after last .d_close
 call.

New kind of I/O reference on specdevs, sd_iocnt.  This could be done
with psref instead; I chose a reference count instead for now because
we already have to take a per-object lock anyway, v_interlock, for
vdead_check, so another atomic is not likely to hurt much more.  We
can always change the mechanism inside spec_io_enter/exit/drain later
on.

Make sure every access to vp->v_rdev or vp->v_specnode and every call
to a devsw operation is protected either:

- by the vnode lock (with vdead_check if we unlocked/relocked),
- by positive sd_opencnt,
- by spec_io_enter/exit, or
- by sd_opencnt management in open/close.
---
 sys/miscfs/specfs/spec_vnops.c | 339 ++++++++++++++++++++++++---------
 sys/miscfs/specfs/specdev.h    |   1 +
 2 files changed, 252 insertions(+), 88 deletions(-)

diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c
index dccba1d0ebcb..037975f6f96d 100644
--- a/sys/miscfs/specfs/spec_vnops.c
+++ b/sys/miscfs/specfs/spec_vnops.c
@@ -81,6 +81,7 @@ __KERNEL_RCSID(0, "$NetBSD: spec_vnops.c,v 1.183 2021/07/18 23:57:14 dholland Ex
 #include <sys/kauth.h>
 #include <sys/fstrans.h>
 #include <sys/module.h>
+#include <sys/atomic.h>
 
 #include <miscfs/genfs/genfs.h>
 #include <miscfs/specfs/specdev.h>
@@ -165,6 +166,7 @@ const struct vnodeopv_desc spec_vnodeop_opv_desc =
 	{ &spec_vnodeop_p, spec_vnodeop_entries };
 
 static kauth_listener_t rawio_listener;
+static struct kcondvar specfs_iocv;
 
 /* Returns true if vnode is /dev/mem or /dev/kmem. */
 bool
@@ -210,6 +212,124 @@ spec_init(void)
 
 	rawio_listener = kauth_listen_scope(KAUTH_SCOPE_DEVICE,
 	    rawio_listener_cb, NULL);
+	cv_init(&specfs_iocv, "specio");
+}
+
+/*
+ * spec_io_enter(vp, &sn, &dev)
+ *
+ *	Enter an operation that may not hold vp's vnode lock or an
+ *	fstrans on vp's mount.  Until spec_io_exit, the vnode will not
+ *	be revoked.
+ *
+ *	On success, set sn to the specnode pointer and dev to the dev_t
+ *	number and return zero.  Caller must later call spec_io_exit
+ *	when done.
+ *
+ *	On failure, return ENXIO -- the device has been revoked and no
+ *	longer exists.
+ */
+static int
+spec_io_enter(struct vnode *vp, struct specnode **snp, dev_t *devp)
+{
+	dev_t dev;
+	struct specnode *sn;
+	unsigned iocnt;
+	int error = 0;
+
+	mutex_enter(vp->v_interlock);
+
+	/*
+	 * Extract all the info we need from the vnode, unless the
+	 * vnode has already been reclaimed.  This can happen if the
+	 * underlying device has been removed and all the device nodes
+	 * for it have been revoked.  The caller may not hold a vnode
+	 * lock or fstrans to prevent this from happening before it has
+	 * had an opportunity to notice the vnode is dead.
+	 */
+	if (vdead_check(vp, VDEAD_NOWAIT) != 0 ||
+	    (sn = vp->v_specnode) == NULL ||
+	    (dev = vp->v_rdev) == NODEV) {
+		error = ENXIO;
+		goto out;
+	}
+
+	/*
+	 * Notify spec_node_revoke that we are doing an I/O operation
+	 * which may not be not bracketed by fstrans(9) and thus is not
+	 * blocked by vfs suspension.
+	 *
+	 * We could hold this reference with psref(9) instead, but we
+	 * already have to take the interlock for vdead_check, so
+	 * there's not much more cost here to another atomic operation.
+	 */
+	iocnt = atomic_inc_uint_nv(&sn->sn_dev->sd_iocnt);
+	CTASSERT(MAXLWP < UINT_MAX);
+	KASSERT(iocnt < UINT_MAX);
+
+	/* Success!  */
+	*snp = sn;
+	*devp = dev;
+	error = 0;
+
+out:	mutex_exit(vp->v_interlock);
+	return error;
+}
+
+/*
+ * spec_io_exit(vp, sn)
+ *
+ *	Exit an operation entered with a successful spec_io_enter --
+ *	allow concurrent spec_node_revoke to proceed.  The argument sn
+ *	must match the struct specnode pointer returned by spec_io_exit
+ *	for vp.
+ */
+static void
+spec_io_exit(struct vnode *vp, struct specnode *sn)
+{
+	struct specdev *sd = sn->sn_dev;
+	unsigned iocnt;
+
+	KASSERT(vp->v_specnode == sn);
+
+	/*
+	 * We are done.  Notify spec_node_revoke if appropriate.  The
+	 * transition of 1 -> 0 must happen under device_lock so
+	 * spec_node_revoke doesn't miss a wakeup.
+	 */
+	do {
+		iocnt = atomic_load_relaxed(&sd->sd_iocnt);
+		if (iocnt == 1) {
+			mutex_enter(&device_lock);
+			if (atomic_dec_uint_nv(&sd->sd_iocnt) == 0)
+				cv_broadcast(&specfs_iocv);
+			mutex_exit(&device_lock);
+			break;
+		}
+	} while (atomic_cas_uint(&sd->sd_iocnt, iocnt, iocnt - 1) != iocnt);
+}
+
+/*
+ * spec_io_drain(sd)
+ *
+ *	Wait for all existing spec_io_enter/exit sections to complete.
+ *	Caller must ensure spec_io_enter will fail at this point.
+ */
+static void
+spec_io_drain(struct specdev *sd)
+{
+
+	/*
+	 * I/O at the same time as closing is unlikely -- it often
+	 * indicates an application bug.
+	 */
+	if (__predict_true(atomic_load_relaxed(&sd->sd_iocnt) == 0))
+		return;
+
+	mutex_enter(&device_lock);
+	while (atomic_load_relaxed(&sd->sd_iocnt) > 0)
+		cv_wait(&specfs_iocv, &device_lock);
+	mutex_exit(&device_lock);
 }
 
 /*
@@ -250,6 +370,7 @@ spec_node_init(vnode_t *vp, dev_t rdev)
 		sd->sd_refcnt = 1;
 		sd->sd_opencnt = 0;
 		sd->sd_bdevvp = NULL;
+		sd->sd_iocnt = 0;
 		sd->sd_opened = false;
 		sn->sn_dev = sd;
 		sd = NULL;
@@ -464,6 +585,7 @@ spec_node_destroy(vnode_t *vp)
 
 	/* If the device is no longer in use, destroy our record. */
 	if (refcnt == 1) {
+		KASSERT(sd->sd_iocnt == 0);
 		KASSERT(sd->sd_opencnt == 0);
 		KASSERT(sd->sd_bdevvp == NULL);
 		kmem_free(sd, sizeof(*sd));
@@ -501,30 +623,28 @@ spec_open(void *v)
 		int  a_mode;
 		kauth_cred_t a_cred;
 	} */ *ap = v;
-	struct lwp *l;
-	struct vnode *vp;
+	struct lwp *l = curlwp;
+	struct vnode *vp = ap->a_vp;
 	dev_t dev;
 	int error;
 	enum kauth_device_req req;
 	specnode_t *sn;
 	specdev_t *sd;
 	spec_ioctl_t ioctl;
-	u_int gen;
-	const char *name;
+	u_int gen = 0;
+	const char *name = NULL;
 	bool needclose = false;
 	struct partinfo pi;
-	
-	l = curlwp;
-	vp = ap->a_vp;
-	dev = vp->v_rdev;
-	sn = vp->v_specnode;
-	sd = sn->sn_dev;
-	name = NULL;
-	gen = 0;
 
+	KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE ||
+	    (vp->v_vflag & VV_LOCKSWORK) == 0);
 	KASSERTMSG(vp->v_type == VBLK || vp->v_type == VCHR, "type=%d",
 	    vp->v_type);
 
+	dev = vp->v_rdev;
+	sn = vp->v_specnode;
+	sd = sn->sn_dev;
+
 	/*
 	 * Don't allow open if fs is mounted -nodev.
 	 */
@@ -794,6 +914,8 @@ spec_read(void *v)
 	struct vnode *vp = ap->a_vp;
 	struct uio *uio = ap->a_uio;
  	struct lwp *l = curlwp;
+	struct specnode *sn;
+	dev_t dev;
 	struct buf *bp;
 	daddr_t bn;
 	int bsize, bscale;
@@ -816,9 +938,27 @@ spec_read(void *v)
 	switch (vp->v_type) {
 
 	case VCHR:
+		/*
+		 * Release the lock while we sleep -- possibly
+		 * indefinitely, if this is, e.g., a tty -- in
+		 * cdev_read, so we don't hold up everything else that
+		 * might want access to the vnode.
+		 *
+		 * But before we issue the read, take an I/O reference
+		 * to the specnode so close will know when we're done
+		 * writing.  Note that the moment we release the lock,
+		 * the vnode's identity may change; hence spec_io_enter
+		 * may fail, and the caller may have a dead vnode on
+		 * their hands, if the file system on which vp lived
+		 * has been unmounted.
+		 */
 		VOP_UNLOCK(vp);
-		error = cdev_read(vp->v_rdev, uio, ap->a_ioflag);
-		vn_lock(vp, LK_SHARED | LK_RETRY);
+		error = spec_io_enter(vp, &sn, &dev);
+		if (error)
+			goto out;
+		error = cdev_read(dev, uio, ap->a_ioflag);
+		spec_io_exit(vp, sn);
+out:		vn_lock(vp, LK_SHARED | LK_RETRY);
 		return (error);
 
 	case VBLK:
@@ -895,6 +1035,8 @@ spec_write(void *v)
 	struct vnode *vp = ap->a_vp;
 	struct uio *uio = ap->a_uio;
 	struct lwp *l = curlwp;
+	struct specnode *sn;
+	dev_t dev;
 	struct buf *bp;
 	daddr_t bn;
 	int bsize, bscale;
@@ -910,9 +1052,27 @@ spec_write(void *v)
 	switch (vp->v_type) {
 
 	case VCHR:
+		/*
+		 * Release the lock while we sleep -- possibly
+		 * indefinitely, if this is, e.g., a tty -- in
+		 * cdev_write, so we don't hold up everything else that
+		 * might want access to the vnode.
+		 *
+		 * But before we issue the write, take an I/O reference
+		 * to the specnode so close will know when we're done
+		 * writing.  Note that the moment we release the lock,
+		 * the vnode's identity may change; hence spec_io_enter
+		 * may fail, and the caller may have a dead vnode on
+		 * their hands, if the file system on which vp lived
+		 * has been unmounted.
+		 */
 		VOP_UNLOCK(vp);
-		error = cdev_write(vp->v_rdev, uio, ap->a_ioflag);
-		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+		error = spec_io_enter(vp, &sn, &dev);
+		if (error)
+			goto out;
+		error = cdev_write(dev, uio, ap->a_ioflag);
+		spec_io_exit(vp, sn);
+out:		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		return (error);
 
 	case VBLK:
@@ -970,10 +1130,12 @@ spec_fdiscard(void *v)
 		off_t a_pos;
 		off_t a_len;
 	} */ *ap = v;
-	struct vnode *vp;
+	struct vnode *vp = ap->a_vp;
 	dev_t dev;
 
-	vp = ap->a_vp;
+	KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE ||
+	    (vp->v_vflag & VV_LOCKSWORK) == 0);
+
 	dev = vp->v_rdev;
 
 	switch (vp->v_type) {
@@ -1003,40 +1165,32 @@ spec_ioctl(void *v)
 		int  a_fflag;
 		kauth_cred_t a_cred;
 	} */ *ap = v;
-	struct vnode *vp;
+	struct vnode *vp = ap->a_vp;
+	struct specnode *sn;
 	dev_t dev;
+	int error;
 
-	/*
-	 * Extract all the info we need from the vnode, taking care to
-	 * avoid a race with VOP_REVOKE().
-	 */
-
-	vp = ap->a_vp;
-	dev = NODEV;
-	mutex_enter(vp->v_interlock);
-	if (vdead_check(vp, VDEAD_NOWAIT) == 0 && vp->v_specnode) {
-		dev = vp->v_rdev;
-	}
-	mutex_exit(vp->v_interlock);
-	if (dev == NODEV) {
-		return ENXIO;
-	}
+	error = spec_io_enter(vp, &sn, &dev);
+	if (error)
+		return error;
 
 	switch (vp->v_type) {
-
 	case VCHR:
-		return cdev_ioctl(dev, ap->a_command, ap->a_data,
+		error = cdev_ioctl(dev, ap->a_command, ap->a_data,
 		    ap->a_fflag, curlwp);
-
+		break;
 	case VBLK:
 		KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp);
-		return bdev_ioctl(dev, ap->a_command, ap->a_data,
+		error = bdev_ioctl(dev, ap->a_command, ap->a_data,
 		   ap->a_fflag, curlwp);
-
+		break;
 	default:
 		panic("spec_ioctl");
 		/* NOTREACHED */
 	}
+
+	spec_io_exit(vp, sn);
+	return error;
 }
 
 /* ARGSUSED */
@@ -1047,33 +1201,25 @@ spec_poll(void *v)
 		struct vnode *a_vp;
 		int a_events;
 	} */ *ap = v;
-	struct vnode *vp;
+	struct vnode *vp = ap->a_vp;
+	struct specnode *sn;
 	dev_t dev;
+	int revents;
 
-	/*
-	 * Extract all the info we need from the vnode, taking care to
-	 * avoid a race with VOP_REVOKE().
-	 */
-
-	vp = ap->a_vp;
-	dev = NODEV;
-	mutex_enter(vp->v_interlock);
-	if (vdead_check(vp, VDEAD_NOWAIT) == 0 && vp->v_specnode) {
-		dev = vp->v_rdev;
-	}
-	mutex_exit(vp->v_interlock);
-	if (dev == NODEV) {
+	if (spec_io_enter(vp, &sn, &dev) != 0)
 		return POLLERR;
-	}
 
 	switch (vp->v_type) {
-
 	case VCHR:
-		return cdev_poll(dev, ap->a_events, curlwp);
-
+		revents = cdev_poll(dev, ap->a_events, curlwp);
+		break;
 	default:
-		return (genfs_poll(v));
+		revents = genfs_poll(v);
+		break;
 	}
+
+	spec_io_exit(vp, sn);
+	return revents;
 }
 
 /* ARGSUSED */
@@ -1084,20 +1230,30 @@ spec_kqfilter(void *v)
 		struct vnode	*a_vp;
 		struct proc	*a_kn;
 	} */ *ap = v;
+	struct vnode *vp = ap->a_vp;
+	struct specnode *sn;
 	dev_t dev;
+	int error;
 
-	switch (ap->a_vp->v_type) {
+	error = spec_io_enter(vp, &sn, &dev);
+	if (error)
+		return error;
 
+	switch (vp->v_type) {
 	case VCHR:
-		dev = ap->a_vp->v_rdev;
-		return cdev_kqfilter(dev, ap->a_kn);
+		error = cdev_kqfilter(dev, ap->a_kn);
+		break;
 	default:
 		/*
 		 * Block devices don't support kqfilter, and refuse it
 		 * for any other files (like those vflush()ed) too.
 		 */
-		return (EOPNOTSUPP);
+		error = EOPNOTSUPP;
+		break;
 	}
+
+	spec_io_exit(vp, sn);
+	return error;
 }
 
 /*
@@ -1112,11 +1268,19 @@ spec_mmap(void *v)
 		kauth_cred_t a_cred;
 	} */ *ap = v;
 	struct vnode *vp = ap->a_vp;
+	struct specnode *sn;
+	dev_t dev;
+	int error;
 
 	KASSERT(vp->v_type == VBLK);
-	if (bdev_type(vp->v_rdev) != D_DISK)
-		return EINVAL;
 
+	error = spec_io_enter(vp, &sn, &dev);
+	if (error)
+		return error;
+
+	error = bdev_type(dev) == D_DISK ? 0 : EINVAL;
+
+	spec_io_exit(vp, sn);
 	return 0;
 }
 
@@ -1161,27 +1325,14 @@ spec_strategy(void *v)
 	} */ *ap = v;
 	struct vnode *vp = ap->a_vp;
 	struct buf *bp = ap->a_bp;
+	struct specnode *sn = NULL;
 	dev_t dev;
 	int error;
 
-	dev = NODEV;
-
-	/*
-	 * Extract all the info we need from the vnode, taking care to
-	 * avoid a race with VOP_REVOKE().
-	 */
-
-	mutex_enter(vp->v_interlock);
-	if (vdead_check(vp, VDEAD_NOWAIT) == 0 && vp->v_specnode != NULL) {
-		KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp);
-		dev = vp->v_rdev;
-	}
-	mutex_exit(vp->v_interlock);
-
-	if (dev == NODEV) {
-		error = ENXIO;
+	error = spec_io_enter(vp, &sn, &dev);
+	if (error)
 		goto out;
-	}
+
 	bp->b_dev = dev;
 
 	if (!(bp->b_flags & B_READ)) {
@@ -1201,13 +1352,15 @@ spec_strategy(void *v)
 	}
 	bdev_strategy(bp);
 
-	return 0;
-
-out:
-	bp->b_error = error;
-	bp->b_resid = bp->b_bcount;
-	biodone(bp);
+	error = 0;
 
+out:	if (sn)
+		spec_io_exit(vp, sn);
+	if (error) {
+		bp->b_error = error;
+		bp->b_resid = bp->b_bcount;
+		biodone(bp);
+	}
 	return error;
 }
 
@@ -1278,14 +1431,18 @@ spec_close(void *v)
 	} */ *ap = v;
 	struct vnode *vp = ap->a_vp;
 	struct session *sess;
-	dev_t dev = vp->v_rdev;
+	dev_t dev;
 	int flags = ap->a_fflag;
 	int mode, error, count;
 	specnode_t *sn;
 	specdev_t *sd;
 
+	KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE ||
+	    (vp->v_vflag & VV_LOCKSWORK) == 0);
+
 	mutex_enter(vp->v_interlock);
 	sn = vp->v_specnode;
+	dev = vp->v_rdev;
 	sd = sn->sn_dev;
 	/*
 	 * If we're going away soon, make this non-blocking.
@@ -1418,6 +1575,12 @@ spec_close(void *v)
 	else
 		error = cdev_close(dev, flags, mode, curlwp);
 
+	/*
+	 * Wait for all other devsw operations to drain.  After this
+	 * point, no bdev/cdev_* can be active for this specdev.
+	 */
+	spec_io_drain(sd);
+
 	if (!(flags & FNONBLOCK))
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 
diff --git a/sys/miscfs/specfs/specdev.h b/sys/miscfs/specfs/specdev.h
index 018679ffaef2..91e7af9bf09b 100644
--- a/sys/miscfs/specfs/specdev.h
+++ b/sys/miscfs/specfs/specdev.h
@@ -78,6 +78,7 @@ typedef struct specdev {
 	u_int		sd_opencnt;	/* # of opens; close when ->0 */
 	u_int		sd_refcnt;	/* # of specnodes referencing this */
 	dev_t		sd_rdev;
+	volatile u_int	sd_iocnt;	/* # bdev/cdev_* operations active */
 	bool		sd_opened;	/* true if successfully opened */
 } specdev_t;