From b036533e2c64eaf4e5d9de1432592a4728af261a Mon Sep 17 00:00:00 2001
From: Taylor R Campbell
Date: Fri, 21 Feb 2020 23:38:18 +0000
Subject: [PATCH] WIP: Define vfs_vnode_iterator_next_live and use it in lfs.

Like vfs_vnode_iterator_next, but skip any vnodes that are being
reclaimed -- we do not care about them.

This is kind of gross, but it breaks the lfs deadlock:

* sqlite3
  vstate_wait_stable
  vcache_get
  vfs_vnode_iterator_next1
  lfs_writevnodes
  lfs_segwrite
  lfs_flush_fs
  lfs_update
  lfs_fsync
  VOP_FSYNC
  sys_fsync
  syscall

* tar
  mtsleep
  lfs_seglock
  lfs_truncate
  ulfs_inactive
  VOP_INACTIVE
  vrelel
  genfs_rename_exit
  genfs_sane_rename
  lfs_sane_rename
  genfs_insane_rename
  VOP_RENAME
  do_sys_renameat
  syscall

Specifically:

- sqlite3 holds the seglock and is waiting for VOP_INACTIVE to finish
- tar is in VOP_INACTIVE and waiting for the seglock

Although rename is a dirop, the last reference may not be released
until long after rename is done, so we can't truncate the inode
_during_ the dirop when the segment writer is blocked.

I believe that in this state the vnode cannot become referenced again,
so it is not harmful to leave it out of the segment -- its continued
existence can matter only to running processes that still have the
file open.
---
 sys/kern/vfs_mount.c      | 89 +++++++++++++++++++++++++++++++++++++++
 sys/sys/mount.h           |  2 ++
 sys/ufs/lfs/lfs_segment.c |  4 +--
 3 files changed, 93 insertions(+), 2 deletions(-)

diff --git a/sys/kern/vfs_mount.c b/sys/kern/vfs_mount.c
index 5fcd9e156ea6..b4596cb529eb 100644
--- a/sys/kern/vfs_mount.c
+++ b/sys/kern/vfs_mount.c
@@ -402,6 +402,95 @@ vfs_vnode_iterator_destroy(struct vnode_iterator *vni)
 	vnfree_marker(mvp);
 }
 
+/*
+ * vlive_check:
+ *
+ *	Caller holds vp's interlock.  Wait until vp is in VS_LOADED or
+ *	VS_BLOCKED state and return 0; return ENOENT as soon as vp is
+ *	seen in VS_RECLAIMING or VS_RECLAIMED state.  The result is an
+ *	errno, not a truth value, hence the int return type.
+ *
+ *	NOTE(review): cv_wait() is passed only vp's interlock, while the
+ *	caller below still holds mntvnode_lock -- confirm that sleeping
+ *	with it held is acceptable.
+ */
+static int
+vlive_check(struct vnode *vp)
+{
+	vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
+
+	KASSERT(mutex_owned(vp->v_interlock));
+
+	while (vip->vi_state != VS_LOADED && vip->vi_state != VS_BLOCKED) {
+		if (vip->vi_state == VS_RECLAIMING ||
+		    vip->vi_state == VS_RECLAIMED)
+			return ENOENT;
+		cv_wait(&vp->v_cv, vp->v_interlock);
+	}
+
+	return 0;
+}
+
+/*
+ * vfs_vnode_iterator_next_live:
+ *
+ *	Step the iterator vni to the next vnode on its mount's vnode
+ *	list that is not a marker, passes vlive_check(), and is accepted
+ *	by the selector f (if f is non-NULL).  f is called as
+ *	(*f)(cl, vp) with vp's interlock held.  Return the vnode once
+ *	vcache_vget() has succeeded on it, or NULL at the end of the
+ *	list.
+ */
+struct vnode *
+vfs_vnode_iterator_next_live(struct vnode_iterator *vni,
+    bool (*f)(void *, struct vnode *), void *cl)
+{
+	vnode_impl_t *mvip = &vni->vi_vnode;
+	struct mount *mp = VIMPL_TO_VNODE(mvip)->v_mount;
+	vnode_t *vp;
+	vnode_impl_t *vip;
+	int error;
+
+	KASSERT(vnis_marker(VIMPL_TO_VNODE(mvip)));
+
+	do {
+		/* Take the marker off the list while scanning. */
+		mutex_enter(&mntvnode_lock);
+		vip = TAILQ_NEXT(mvip, vi_mntvnodes);
+		TAILQ_REMOVE(&mp->mnt_vnodelist, mvip, vi_mntvnodes);
+		VIMPL_TO_VNODE(mvip)->v_usecount = 0;
+again:
+		/*
+		 * Test the list entry for end-of-list before converting
+		 * it, rather than relying on VIMPL_TO_VNODE(NULL) being
+		 * NULL.
+		 */
+		if (vip == NULL) {
+			mutex_exit(&mntvnode_lock);
+			return NULL;
+		}
+		vp = VIMPL_TO_VNODE(vip);
+		mutex_enter(vp->v_interlock);
+		if (vnis_marker(vp) ||
+		    vlive_check(vp) != 0 ||
+		    (f && !(*f)(cl, vp))) {
+			mutex_exit(vp->v_interlock);
+			vip = TAILQ_NEXT(vip, vi_mntvnodes);
+			goto again;
+		}
+
+		/* Park the marker right after the chosen vnode. */
+		TAILQ_INSERT_AFTER(&mp->mnt_vnodelist, vip, mvip, vi_mntvnodes);
+		VIMPL_TO_VNODE(mvip)->v_usecount = 1;
+		mutex_exit(&mntvnode_lock);
+		/* On ENOENT, rescan starting from the marker. */
+		error = vcache_vget(vp);
+		KASSERT(error == 0 || error == ENOENT);
+	} while (error != 0);
+
+	return vp;
+}
+
 static struct vnode *
 vfs_vnode_iterator_next1(struct vnode_iterator *vni,
     bool (*f)(void *, struct vnode *), void *cl, bool do_wait)
diff --git a/sys/sys/mount.h b/sys/sys/mount.h
index dab6058da6e2..6a08f1520a02 100644 --- a/sys/sys/mount.h +++ b/sys/sys/mount.h @@ -418,6 +418,8 @@ void vfs_vnode_iterator_init(struct mount *, struct vnode_iterator **); void vfs_vnode_iterator_destroy(struct vnode_iterator *); struct vnode *vfs_vnode_iterator_next(struct vnode_iterator *, bool (*)(void *, struct vnode *), void *); +struct vnode *vfs_vnode_iterator_next_live(struct vnode_iterator *, + bool (*)(void *, struct vnode *), void *); /* Syncer */ extern int syncer_maxdelay; diff --git a/sys/ufs/lfs/lfs_segment.c b/sys/ufs/lfs/lfs_segment.c index b54855c74234..ccb33df0714b 100644 --- a/sys/ufs/lfs/lfs_segment.c +++ b/sys/ufs/lfs/lfs_segment.c @@ -490,7 +490,7 @@ lfs_writevnodes_selector(void *cl, struct vnode *vp) KASSERT(mutex_owned(vp->v_interlock)); ip = VTOI(vp); - if (ip == NULL || vp->v_type == VNON) + if (ip == NULL || vp->v_type == VNON || ip->i_nlink <= 0) return false; if ((op == VN_DIROP && !(vp->v_uflag & VU_DIROP)) || (op != VN_DIROP && op != VN_CLEAN && (vp->v_uflag & VU_DIROP))) { @@ -536,7 +536,7 @@ lfs_writevnodes(struct lfs *fs, struct mount *mp, struct segment *sp, int op) vfs_vnode_iterator_init(mp, &marker); ctx.op = op; ctx.fs = fs; - while ((vp = vfs_vnode_iterator_next(marker, + while ((vp = vfs_vnode_iterator_next_live(marker, lfs_writevnodes_selector, &ctx)) != NULL) { ip = VTOI(vp);