From 3690eba9c1fa6fba4b0164aea9b57620dd451d40 Mon Sep 17 00:00:00 2001
From: Taylor R Campbell <riastradh@NetBSD.org>
Date: Tue, 18 Jan 2022 01:11:50 +0000
Subject: [PATCH] driver(9): New devsw d_cancel op to interrupt I/O before
 close.

If specified, when revoking a device node or closing its last open
node, specfs will:

1. Call d_cancel, which should return promptly without blocking.
2. Wait for all concurrent d_read/write/ioctl/&c. to drain.
3. Call d_close.

Otherwise, specfs will:

1. Call d_close.
2. Wait for all concurrent d_read/write/ioctl/&c. to drain.

This fallback is problematic because often parts of d_close rely on
concurrent devsw operations to have completed already, so it is up to
each driver to have its own mechanism for waiting, and the extra step
in (2) is almost redundant.  But it is still important to ensure that
devsw operations are not active by the time a module tries to invoke
devsw_detach, because only d_open is protected against that.

The signature of d_cancel matches d_close, mostly so we don't raise
questions about `why is this different?'; the lwp argument is not
useful but we should remove it from open/cancel/close all at the same
time.

The only way d_cancel should fail, if it does at all, is with ENODEV,
meaning the driver doesn't support cancelling outstanding I/O, and
will take responsibility for that in d_close.  I would make it return
void and only have bdev_cancel and cdev_cancel possibly return ENODEV
so specfs can detect whether a driver supports it, but this would
break the pattern around devsw operation types.

Drivers are allowed to omit it from struct bdevsw, struct cdevsw --
if so, it is as if they used a function that just returns ENODEV.

XXX kernel ABI change to struct bdevsw/cdevsw requires bump
---
 sys/kern/subr_devsw.c          | 36 ++++++++++++++++++++++++++++++++++
 sys/miscfs/specfs/spec_vnops.c | 16 +++++++++++++++
 sys/sys/conf.h                 |  5 +++++
 3 files changed, 57 insertions(+)

diff --git a/sys/kern/subr_devsw.c b/sys/kern/subr_devsw.c
index 3087606aebf2..1113f770c0e5 100644
--- a/sys/kern/subr_devsw.c
+++ b/sys/kern/subr_devsw.c
@@ -933,6 +933,24 @@ bdev_open(dev_t dev, int flag, int devtype, lwp_t *l)
 	return rv;
 }
 
+int
+bdev_cancel(dev_t dev, int flag, int devtype, struct lwp *l)
+{
+	const struct bdevsw *d;
+	int rv, mpflag;
+
+	if ((d = bdevsw_lookup(dev)) == NULL)
+		return ENXIO;
+	if (d->d_cancel == NULL)
+		return ENODEV;
+
+	DEV_LOCK(d);
+	rv = (*d->d_cancel)(dev, flag, devtype, l);
+	DEV_UNLOCK(d);
+
+	return rv;
+}
+
 int
 bdev_close(dev_t dev, int flag, int devtype, lwp_t *l)
 {
@@ -1128,6 +1146,24 @@ cdev_open(dev_t dev, int flag, int devtype, lwp_t *l)
 	return rv;
 }
 
+int
+cdev_cancel(dev_t dev, int flag, int devtype, struct lwp *l)
+{
+	const struct cdevsw *d;
+	int rv, mpflag;
+
+	if ((d = cdevsw_lookup(dev)) == NULL)
+		return ENXIO;
+	if (d->d_cancel == NULL)
+		return ENODEV;
+
+	DEV_LOCK(d);
+	rv = (*d->d_cancel)(dev, flag, devtype, l);
+	DEV_UNLOCK(d);
+
+	return rv;
+}
+
 int
 cdev_close(dev_t dev, int flag, int devtype, lwp_t *l)
 {
diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c
index 324297488d5d..b1d1cc3b73be 100644
--- a/sys/miscfs/specfs/spec_vnops.c
+++ b/sys/miscfs/specfs/spec_vnops.c
@@ -1686,6 +1686,22 @@ spec_close(void *v)
 	if (!(flags & FNONBLOCK))
 		VOP_UNLOCK(vp);
 
+	/*
+	 * If we can cancel all outstanding I/O, then wait for it to
+	 * drain before we call .d_close.  Drivers that split up
+	 * .d_cancel and .d_close this way need not have any internal
+	 * mechanism for waiting in .d_close for I/O to drain.
+	 */
+	if (vp->v_type == VBLK)
+		error = bdev_cancel(dev, flags, mode, curlwp);
+	else
+		error = cdev_cancel(dev, flags, mode, curlwp);
+	if (error == 0)
+		spec_io_drain(sd);
+	else
+		KASSERTMSG(error == ENODEV, "cancel dev=0x%lx failed with %d",
+		    (unsigned long)dev, error);
+
 	if (vp->v_type == VBLK)
 		error = bdev_close(dev, flags, mode, curlwp);
 	else
diff --git a/sys/sys/conf.h b/sys/sys/conf.h
index 210a237b8c43..23c65819ddcf 100644
--- a/sys/sys/conf.h
+++ b/sys/sys/conf.h
@@ -70,6 +70,7 @@ struct vnode;
  */
 struct bdevsw {
 	int		(*d_open)(dev_t, int, int, struct lwp *);
+	int		(*d_cancel)(dev_t, int, int, struct lwp *);
 	int		(*d_close)(dev_t, int, int, struct lwp *);
 	void		(*d_strategy)(struct buf *);
 	int		(*d_ioctl)(dev_t, u_long, void *, int, struct lwp *);
@@ -86,6 +87,7 @@ struct bdevsw {
  */
 struct cdevsw {
 	int		(*d_open)(dev_t, int, int, struct lwp *);
+	int		(*d_cancel)(dev_t, int, int, struct lwp *);
 	int		(*d_close)(dev_t, int, int, struct lwp *);
 	int		(*d_read)(dev_t, struct uio *, int);
 	int		(*d_write)(dev_t, struct uio *, int);
@@ -115,6 +117,7 @@ devmajor_t bdevsw_lookup_major(const struct bdevsw *);
 devmajor_t cdevsw_lookup_major(const struct cdevsw *);
 
 #define	dev_type_open(n)	int n (dev_t, int, int, struct lwp *)
+#define	dev_type_cancel(n)	int n (dev_t, int, int, struct lwp *)
 #define	dev_type_close(n)	int n (dev_t, int, int, struct lwp *)
 #define	dev_type_read(n)	int n (dev_t, struct uio *, int)
 #define	dev_type_write(n)	int n (dev_t, struct uio *, int)
@@ -165,6 +168,7 @@ paddr_t	nommap(dev_t, off_t, int);
 /* device access wrappers. */
 
 dev_type_open(bdev_open);
+dev_type_cancel(bdev_cancel);
 dev_type_close(bdev_close);
 dev_type_strategy(bdev_strategy);
 dev_type_ioctl(bdev_ioctl);
@@ -175,6 +179,7 @@ dev_type_discard(bdev_discard);
 void	bdev_detached(dev_t);
 
 dev_type_open(cdev_open);
+dev_type_cancel(cdev_cancel);
 dev_type_close(cdev_close);
 dev_type_read(cdev_read);
 dev_type_write(cdev_write);