From 2f08ed11d2ef1db875551adbeaa2a13b293560ff Mon Sep 17 00:00:00 2001 From: pgoyette Date: Wed, 7 Aug 2019 00:38:01 +0000 Subject: [PATCH 01/30] Many years ago someone created a new __link_set_sysctl_funcs to hold the list of routines that need to be called for setting up sysctl variables. This worked great for all code included in the kernel itself, but didn't deal with modules that want to create their own sysctl data. So, we ended up with a lot of #ifdef _MODULE blocks so modules could explicitly call their setup functions when loaded as non-built-in modules. So today, we complete the task that was started so many years ago. When modules are loaded, after we've called xxx_modcmd(INIT...) we check if the module contains its own __link_set_sysctl_funcs, and if so we call the functions listed. We add a struct sysctllog member to the struct module so we can call sysctl_teardown() when the module gets unloaded. (The sequence of events ensures that the sysctl stuff doesn't get created until the rest of the module's init code does any required memory allocation.) So, no more need to explicitly call the sysctl setup routines when built as a loadable module. --- sys/dev/ccd.c | 10 ++------ sys/dev/iscsi/iscsi_main.c | 7 +----- sys/dev/usb/usbnet.c | 4 +-- sys/kern/kern_module.c | 44 +++++++++++++++++++++++++++++++-- sys/kern/sysv_ipc.c | 29 ++++++---------------- sys/kern/sysv_msg.c | 10 +++----- sys/kern/sysv_sem.c | 10 +++----- sys/kern/sysv_shm.c | 10 +++----- sys/miscfs/genfs/layer_vfsops.c | 13 ++-------- sys/sys/module.h | 6 ++++- sys/sys/msg.h | 4 +-- sys/sys/sem.h | 4 +-- sys/sys/shm.h | 4 +-- 13 files changed, 77 insertions(+), 78 deletions(-) diff --git a/sys/dev/ccd.c b/sys/dev/ccd.c index c0c73bae696f..a72334bb8d7e 100644 --- a/sys/dev/ccd.c +++ b/sys/dev/ccd.c @@ -1,4 +1,4 @@ -/* $NetBSD: ccd.c,v 1.179 2019/03/27 19:13:34 martin Exp $ */ +/* $NetBSD: ccd.c,v 1.180 2019/08/07 00:38:01 pgoyette Exp $ */ /*- * Copyright (c) 1996, 1997, 1998, 1999, 2007, 2009 The NetBSD Foundation, Inc. @@ -88,7 +88,7 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: ccd.c,v 1.179 2019/03/27 19:13:34 martin Exp $"); +__KERNEL_RCSID(0, "$NetBSD: ccd.c,v 1.180 2019/08/07 00:38:01 pgoyette Exp $"); #include #include @@ -216,10 +216,6 @@ static void printiinfo(struct ccdiinfo *); static LIST_HEAD(, ccd_softc) ccds = LIST_HEAD_INITIALIZER(ccds); static kmutex_t ccd_lock; -#ifdef _MODULE -static struct sysctllog *ccd_clog; -#endif - SYSCTL_SETUP_PROTO(sysctl_kern_ccd_setup); static struct ccd_softc * @@ -1681,7 +1677,6 @@ ccd_modcmd(modcmd_t cmd, void *arg) error = devsw_attach("ccd", &ccd_bdevsw, &bmajor, &ccd_cdevsw, &cmajor); - sysctl_kern_ccd_setup(&ccd_clog); #endif break; @@ -1696,7 +1691,6 @@ ccd_modcmd(modcmd_t cmd, void *arg) error = devsw_detach(&ccd_bdevsw, &ccd_cdevsw); ccddetach(); } - sysctl_teardown(&ccd_clog); #endif break; diff --git a/sys/dev/iscsi/iscsi_main.c b/sys/dev/iscsi/iscsi_main.c index 45189b96fa7d..c10a7e49d03c 100644 --- a/sys/dev/iscsi/iscsi_main.c +++ b/sys/dev/iscsi/iscsi_main.c @@ -1,4 +1,4 @@ -/* $NetBSD: iscsi_main.c,v 1.30 2019/07/13 17:06:00 mlelstv Exp $ */ +/* $NetBSD: iscsi_main.c,v 1.31 2019/08/07 00:38:02 pgoyette Exp $ */ /*- * Copyright (c) 2004,2005,2006,2011 The NetBSD Foundation, Inc. @@ -677,7 +677,6 @@ iscsi_modcmd(modcmd_t cmd, void *arg) #ifdef _MODULE devmajor_t cmajor = NODEVMAJOR, bmajor = NODEVMAJOR; int error; - static struct sysctllog *clog; #endif switch (cmd) { @@ -723,8 +722,6 @@ iscsi_modcmd(modcmd_t cmd, void *arg) config_cfdriver_detach(&iscsi_cd); return ENXIO; } - - sysctl_iscsi_setup(&clog); #endif return 0; break; @@ -735,8 +732,6 @@ iscsi_modcmd(modcmd_t cmd, void *arg) if (error) return error; - sysctl_teardown(&clog); - config_cfattach_detach(iscsi_cd.cd_name, &iscsi_ca); config_cfdriver_detach(&iscsi_cd); devsw_detach(NULL, &iscsi_cdevsw); diff --git a/sys/dev/usb/usbnet.c b/sys/dev/usb/usbnet.c index 28afa57c717f..e47184c99e6d 100644 --- a/sys/dev/usb/usbnet.c +++ b/sys/dev/usb/usbnet.c @@ -1,4 +1,4 @@ -/* $NetBSD: usbnet.c,v 1.25.2.4 2019/12/17 12:55:10 martin Exp $ */ +/* $NetBSD: usbnet.c,v 1.7 2019/08/07 00:38:02 pgoyette Exp $ */ /* * Copyright (c) 2019 Matthew R. Green @@ -33,7 +33,7 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: usbnet.c,v 1.25.2.4 2019/12/17 12:55:10 martin Exp $"); +__KERNEL_RCSID(0, "$NetBSD: usbnet.c,v 1.7 2019/08/07 00:38:02 pgoyette Exp $"); #include #include diff --git a/sys/kern/kern_module.c b/sys/kern/kern_module.c index ad23ede9f987..ea3a98987707 100644 --- a/sys/kern/kern_module.c +++ b/sys/kern/kern_module.c @@ -1,4 +1,4 @@ -/* $NetBSD: kern_module.c,v 1.136 2019/06/19 15:01:01 pgoyette Exp $ */ +/* $NetBSD: kern_module.c,v 1.137 2019/08/07 00:38:02 pgoyette Exp $ */ /*- * Copyright (c) 2008 The NetBSD Foundation, Inc. @@ -34,7 +34,7 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: kern_module.c,v 1.136 2019/06/19 15:01:01 pgoyette Exp $"); +__KERNEL_RCSID(0, "$NetBSD: kern_module.c,v 1.137 2019/08/07 00:38:02 pgoyette Exp $"); #define _MODULE_INTERNAL @@ -946,6 +946,35 @@ module_do_builtin(const module_t *pmod, const char *name, module_t **modp, return error; } +/* + * module_load_sysctl + * + * Check to see if the module has any SYSCTL_SETUP() routine(s) + * registered. If so, call it (them). + */ + +static void +module_load_sysctl(module_t *mod) +{ + void (**ls_funcp)(struct sysctllog **); + void *ls_start; + size_t ls_size, count; + int error; + + error = kobj_find_section(mod->mod_kobj, "link_set_sysctl_funcs", + &ls_start, &ls_size); + if (error == 0) { + count = ls_size / sizeof(ls_start); + ls_funcp = ls_start; + while (count--) { + (**ls_funcp)(&mod->mod_sysctllog); + ls_funcp++; + } + } + else + error = 0; /* no setup funcs registered */ +} + /* * module_do_load: * @@ -1265,6 +1294,8 @@ module_do_load(const char *name, bool isdep, int flags, goto fail1; } + module_load_sysctl(mod); /* Set-up module's sysctl if any */ + /* * Good, the module loaded successfully. Put it onto the * list and add references to its requisite modules. @@ -1344,9 +1375,18 @@ module_do_unload(const char *name, bool load_requires_force) prev_active = module_active; module_active = mod; module_callback_unload(mod); + + /* + * If there were any registered SYSCTL_SETUP funcs, make sure + * we release the sysctl entries + */ + if (mod->mod_sysctllog) { + sysctl_teardown(&mod->mod_sysctllog); + } error = (*mod->mod_info->mi_modcmd)(MODULE_CMD_FINI, NULL); module_active = prev_active; if (error != 0) { + module_load_sysctl(mod); /* re-enable sysctl stuff */ module_print("cannot unload module `%s' error=%d", name, error); return error; diff --git a/sys/kern/sysv_ipc.c b/sys/kern/sysv_ipc.c index 9198464cc3d0..37b4b6358642 100644 --- a/sys/kern/sysv_ipc.c +++ b/sys/kern/sysv_ipc.c @@ -1,4 +1,4 @@ -/* $NetBSD: sysv_ipc.c,v 1.39 2019/04/10 10:03:50 pgoyette Exp $ */ +/* $NetBSD: sysv_ipc.c,v 1.40 2019/08/07 00:38:02 pgoyette Exp $ */ /*- * Copyright (c) 1998, 2007 The NetBSD Foundation, Inc. @@ -30,7 +30,7 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: sysv_ipc.c,v 1.39 2019/04/10 10:03:50 pgoyette Exp $"); +__KERNEL_RCSID(0, "$NetBSD: sysv_ipc.c,v 1.40 2019/08/07 00:38:02 pgoyette Exp $"); #ifdef _KERNEL_OPT #include "opt_sysv.h" @@ -132,8 +132,6 @@ MODULE(MODULE_CLASS_EXEC, sysv_ipc, NULL); SYSCTL_SETUP_PROTO(sysctl_ipc_setup); -static struct sysctllog *sysctl_sysvipc_clog = NULL; - static const struct syscall_package sysvipc_syscalls[] = { #if defined(SYSVSHM) { SYS___shmctl50, 0, (sy_call_t *)sys___shmctl50 }, @@ -180,12 +178,12 @@ sysv_ipc_modcmd(modcmd_t cmd, void *arg) * sysctl data */ #ifdef SYSVSHM - error = shminit(&sysctl_sysvipc_clog); + error = shminit(); if (error != 0) return error; #endif #ifdef SYSVSEM - error = seminit(&sysctl_sysvipc_clog); + error = seminit(); if (error != 0) { #ifdef SYSVSHM shmfini(); @@ -194,7 +192,7 @@ sysv_ipc_modcmd(modcmd_t cmd, void *arg) } #endif #ifdef SYSVMSG - error = msginit(&sysctl_sysvipc_clog); + error = msginit(); if (error != 0) { #ifdef SYSVSEM semfini(); @@ -205,11 +203,6 @@ sysv_ipc_modcmd(modcmd_t cmd, void *arg) return error; } #endif - -#ifdef _MODULE - /* Set up the common sysctl tree */ - sysctl_ipc_setup(&sysctl_sysvipc_clog); -#endif break; case MODULE_CMD_FINI: /* @@ -228,7 +221,7 @@ sysv_ipc_modcmd(modcmd_t cmd, void *arg) #ifdef SYSVSEM if (semfini()) { #ifdef SYSVSHM - shminit(NULL); + shminit(); #endif return EBUSY; } @@ -236,20 +229,14 @@ sysv_ipc_modcmd(modcmd_t cmd, void *arg) #ifdef SYSVMSG if (msgfini()) { #ifdef SYSVSEM - seminit(NULL); + seminit(); #endif #ifdef SYSVSHM - shminit(NULL); + shminit(); #endif return EBUSY; } #endif - -#ifdef _MODULE - /* Remove the sysctl sub-trees */ - sysctl_teardown(&sysctl_sysvipc_clog); -#endif - /* Unlink the system calls. */ error = syscall_disestablish(NULL, sysvipc_syscalls); if (error) diff --git a/sys/kern/sysv_msg.c b/sys/kern/sysv_msg.c index 691b1dfc95fd..89e231540131 100644 --- a/sys/kern/sysv_msg.c +++ b/sys/kern/sysv_msg.c @@ -1,4 +1,4 @@ -/* $NetBSD: sysv_msg.c,v 1.74.4.1 2019/10/15 19:05:38 martin Exp $ */ +/* $NetBSD: sysv_msg.c,v 1.75 2019/08/07 00:38:02 pgoyette Exp $ */ /*- * Copyright (c) 1999, 2006, 2007 The NetBSD Foundation, Inc. @@ -50,7 +50,7 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: sysv_msg.c,v 1.74.4.1 2019/10/15 19:05:38 martin Exp $"); +__KERNEL_RCSID(0, "$NetBSD: sysv_msg.c,v 1.75 2019/08/07 00:38:02 pgoyette Exp $"); #ifdef _KERNEL_OPT #include "opt_sysv.h" @@ -94,7 +94,7 @@ extern int kern_has_sysvmsg; SYSCTL_SETUP_PROTO(sysctl_ipc_msg_setup); int -msginit(struct sysctllog **clog) +msginit(void) { int i, sz; vaddr_t v; @@ -167,10 +167,6 @@ msginit(struct sysctllog **clog) kern_has_sysvmsg = 1; -#ifdef _MODULE - if (clog) - sysctl_ipc_msg_setup(clog); -#endif return 0; } diff --git a/sys/kern/sysv_sem.c b/sys/kern/sysv_sem.c index 269da37012c0..24b44301158d 100644 --- a/sys/kern/sysv_sem.c +++ b/sys/kern/sysv_sem.c @@ -1,4 +1,4 @@ -/* $NetBSD: sysv_sem.c,v 1.97 2019/04/10 10:03:50 pgoyette Exp $ */ +/* $NetBSD: sysv_sem.c,v 1.98 2019/08/07 00:38:02 pgoyette Exp $ */ /*- * Copyright (c) 1999, 2007 The NetBSD Foundation, Inc. @@ -39,7 +39,7 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: sysv_sem.c,v 1.97 2019/04/10 10:03:50 pgoyette Exp $"); +__KERNEL_RCSID(0, "$NetBSD: sysv_sem.c,v 1.98 2019/08/07 00:38:02 pgoyette Exp $"); #ifdef _KERNEL_OPT #include "opt_sysv.h" @@ -102,7 +102,7 @@ static ONCE_DECL(exithook_control); static int seminit_exithook(void); int -seminit(struct sysctllog **clog) +seminit(void) { int i, sz; vaddr_t v; @@ -145,10 +145,6 @@ seminit(struct sysctllog **clog) kern_has_sysvsem = 1; -#ifdef _MODULE - if (clog) - sysctl_ipc_sem_setup(clog); -#endif return 0; } diff --git a/sys/kern/sysv_shm.c b/sys/kern/sysv_shm.c index 73bf0df57254..b62db267a3c1 100644 --- a/sys/kern/sysv_shm.c +++ b/sys/kern/sysv_shm.c @@ -1,4 +1,4 @@ -/* $NetBSD: sysv_shm.c,v 1.135.2.4 2019/10/10 17:23:45 martin Exp $ */ +/* $NetBSD: sysv_shm.c,v 1.137 2019/08/07 00:38:02 pgoyette Exp $ */ /*- * Copyright (c) 1999, 2007 The NetBSD Foundation, Inc. @@ -61,7 +61,7 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: sysv_shm.c,v 1.135.2.4 2019/10/10 17:23:45 martin Exp $"); +__KERNEL_RCSID(0, "$NetBSD: sysv_shm.c,v 1.137 2019/08/07 00:38:02 pgoyette Exp $"); #ifdef _KERNEL_OPT #include "opt_sysv.h" @@ -938,7 +938,7 @@ shmrealloc(int newshmni) } int -shminit(struct sysctllog **clog) +shminit(void) { vaddr_t v; size_t sz; @@ -983,10 +983,6 @@ shminit(struct sysctllog **clog) uvm_shmexit = shmexit; uvm_shmfork = shmfork; -#ifdef _MODULE - if (clog) - sysctl_ipc_shm_setup(clog); -#endif return 0; } diff --git a/sys/miscfs/genfs/layer_vfsops.c b/sys/miscfs/genfs/layer_vfsops.c index bcf10c9e05a7..113d5390a65b 100644 --- a/sys/miscfs/genfs/layer_vfsops.c +++ b/sys/miscfs/genfs/layer_vfsops.c @@ -1,4 +1,4 @@ -/* $NetBSD: layer_vfsops.c,v 1.51 2017/06/04 08:02:26 hannken Exp $ */ +/* $NetBSD: layer_vfsops.c,v 1.52 2019/08/07 00:38:02 pgoyette Exp $ */ /* * Copyright (c) 1999 National Aeronautics & Space Administration @@ -74,7 +74,7 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: layer_vfsops.c,v 1.51 2017/06/04 08:02:26 hannken Exp $"); +__KERNEL_RCSID(0, "$NetBSD: layer_vfsops.c,v 1.52 2019/08/07 00:38:02 pgoyette Exp $"); #include #include @@ -97,20 +97,11 @@ MODULE(MODULE_CLASS_MISC, layerfs, NULL); static int layerfs_modcmd(modcmd_t cmd, void *arg) { -#ifdef _MODULE - static struct sysctllog *layerfs_clog = NULL; -#endif switch (cmd) { case MODULE_CMD_INIT: -#ifdef _MODULE - sysctl_vfs_layerfs_setup(&layerfs_clog); -#endif return 0; case MODULE_CMD_FINI: -#ifdef _MODULE - sysctl_teardown(&layerfs_clog); -#endif return 0; default: return ENOTTY; diff --git a/sys/sys/module.h b/sys/sys/module.h index b60d046137b9..19417ec30c17 100644 --- a/sys/sys/module.h +++ b/sys/sys/module.h @@ -1,4 +1,4 @@ -/* $NetBSD: module.h,v 1.46 2019/04/08 11:32:49 pgoyette Exp $ */ +/* $NetBSD: module.h,v 1.47 2019/08/07 00:38:02 pgoyette Exp $ */ /*- * Copyright (c) 2008 The NetBSD Foundation, Inc. @@ -83,6 +83,9 @@ typedef struct modinfo { } const modinfo_t; /* Per module information, maintained by kern_module.c */ + +struct sysctllog; + typedef struct module { u_int mod_refcnt; int mod_flags; @@ -97,6 +100,7 @@ typedef struct module { modsrc_t mod_source; time_t mod_autotime; specificdata_reference mod_sdref; + struct sysctllog *mod_sysctllog; } module_t; /* diff --git a/sys/sys/msg.h b/sys/sys/msg.h index f661a5c065e9..436f4a1c69d4 100644 --- a/sys/sys/msg.h +++ b/sys/sys/msg.h @@ -1,4 +1,4 @@ -/* $NetBSD: msg.h,v 1.27 2019/04/10 10:03:50 pgoyette Exp $ */ +/* $NetBSD: msg.h,v 1.28 2019/08/07 00:38:02 pgoyette Exp $ */ /*- * Copyright (c) 1999, 2007 The NetBSD Foundation, Inc. @@ -209,7 +209,7 @@ __END_DECLS struct proc; -int msginit(struct sysctllog **); +int msginit(void); int msgfini(void); int msgctl1(struct lwp *, int, int, struct msqid_ds *); int msgsnd1(struct lwp *, int, const char *, size_t, int, size_t, diff --git a/sys/sys/sem.h b/sys/sys/sem.h index a4ce2c2093ff..ece89e3b22e6 100644 --- a/sys/sys/sem.h +++ b/sys/sys/sem.h @@ -1,4 +1,4 @@ -/* $NetBSD: sem.h,v 1.33 2019/04/10 10:03:50 pgoyette Exp $ */ +/* $NetBSD: sem.h,v 1.34 2019/08/07 00:38:02 pgoyette Exp $ */ /*- * Copyright (c) 1999 The NetBSD Foundation, Inc. @@ -222,7 +222,7 @@ int semconfig(int); #endif __END_DECLS #else -int seminit(struct sysctllog **); +int seminit(void); int semfini(void); void semexit(struct proc *, void *); diff --git a/sys/sys/shm.h b/sys/sys/shm.h index c6c429390f88..ff2c60a1abf3 100644 --- a/sys/sys/shm.h +++ b/sys/sys/shm.h @@ -1,4 +1,4 @@ -/* $NetBSD: shm.h,v 1.52.4.1 2019/09/13 06:25:26 martin Exp $ */ +/* $NetBSD: shm.h,v 1.53 2019/08/07 00:38:02 pgoyette Exp $ */ /*- * Copyright (c) 1999 The NetBSD Foundation, Inc. @@ -172,7 +172,7 @@ extern int shm_nused; struct vmspace; -int shminit(struct sysctllog **); +int shminit(void); int shmfini(void); void shmfork(struct vmspace *, struct vmspace *); void shmexit(struct vmspace *); From 4640991901cfc4223d06c854ca628fe710a3f052 Mon Sep 17 00:00:00 2001 From: pgoyette Date: Thu, 8 Aug 2019 18:08:41 +0000 Subject: [PATCH 02/30] When modules are unloaded, we call sysctl_teardown() before calling the module's modcmd(CMD_FINI) code. If the modcmd() call returns an error, we attempted to re-instate the module's sysctl stuff. This doesn't work well for built-in modulesi (where "unload" actually means "disable"), since they don't have any ``struct kobj''. So check first, and don't try to find the __link_set_sysctl_funcs for built-in modules. --- sys/kern/kern_module.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/sys/kern/kern_module.c b/sys/kern/kern_module.c index ea3a98987707..3823fdb84d1c 100644 --- a/sys/kern/kern_module.c +++ b/sys/kern/kern_module.c @@ -1,4 +1,4 @@ -/* $NetBSD: kern_module.c,v 1.137 2019/08/07 00:38:02 pgoyette Exp $ */ +/* $NetBSD: kern_module.c,v 1.138 2019/08/08 18:08:41 pgoyette Exp $ */ /*- * Copyright (c) 2008 The NetBSD Foundation, Inc. @@ -34,7 +34,7 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: kern_module.c,v 1.137 2019/08/07 00:38:02 pgoyette Exp $"); +__KERNEL_RCSID(0, "$NetBSD: kern_module.c,v 1.138 2019/08/08 18:08:41 pgoyette Exp $"); #define _MODULE_INTERNAL @@ -949,7 +949,7 @@ module_do_builtin(const module_t *pmod, const char *name, module_t **modp, /* * module_load_sysctl * - * Check to see if the module has any SYSCTL_SETUP() routine(s) + * Check to see if a non-builtin module has any SYSCTL_SETUP() routine(s) * registered. If so, call it (them). */ @@ -961,6 +961,13 @@ module_load_sysctl(module_t *mod) size_t ls_size, count; int error; + /* + * Built-in modules don't have a mod_kobj so we cannot search + * for their link_set_sysctl_funcs + */ + if (mod->mod_source == MODULE_SOURCE_KERNEL) + return; + error = kobj_find_section(mod->mod_kobj, "link_set_sysctl_funcs", &ls_start, &ls_size); if (error == 0) { From 368f155fdb8a798d125e40933907857aba65cde8 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Sat, 22 Feb 2020 23:02:34 +0000 Subject: [PATCH 03/30] Teach module loader to attach evcnts, from pgoyette@. --- sys/kern/kern_module.c | 74 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 72 insertions(+), 2 deletions(-) diff --git a/sys/kern/kern_module.c b/sys/kern/kern_module.c index 3823fdb84d1c..0d7a644c5cc9 100644 --- a/sys/kern/kern_module.c +++ b/sys/kern/kern_module.c @@ -54,6 +54,7 @@ __KERNEL_RCSID(0, "$NetBSD: kern_module.c,v 1.138 2019/08/08 18:08:41 pgoyette E #include #include #include +#include #include @@ -978,8 +979,74 @@ module_load_sysctl(module_t *mod) ls_funcp++; } } - else - error = 0; /* no setup funcs registered */ +} + +/* + * module_load_evcnt + * + * Check to see if a non-builtin module has any static evcnt's defined; + * if so, attach them. + */ + +static void +module_load_evcnt(module_t *mod) +{ + struct evcnt * const *ls_evp; + void *ls_start; + size_t ls_size, count; + int error; + + /* + * Built-in modules' static evcnt stuff will be handled + * automatically as part of general kernel initialization + */ + if (mod->mod_source == MODULE_SOURCE_KERNEL) + return; + + error = kobj_find_section(mod->mod_kobj, "link_set_evcnts", + &ls_start, &ls_size); + if (error == 0) { + count = ls_size / sizeof(*ls_evp); + ls_evp = ls_start; + while (count--) { + evcnt_attach_static(*ls_evp); + ls_evp++; + } + } +} + +/* + * module_unload_evcnt + * + * Check to see if a non-builtin module has any static evcnt's defined; + * if so, detach them. + */ + +static void +module_unload_evcnt(module_t *mod) +{ + struct evcnt * const *ls_evp; + void *ls_start; + size_t ls_size, count; + int error; + + /* + * Built-in modules' static evcnt stuff will be handled + * automatically as part of general kernel initialization + */ + if (mod->mod_source == MODULE_SOURCE_KERNEL) + return; + + error = kobj_find_section(mod->mod_kobj, "link_set_evcnts", + &ls_start, &ls_size); + if (error == 0) { + count = ls_size / sizeof(*ls_evp); + ls_evp = ls_start; + while (count--) { + evcnt_detach(*ls_evp); + ls_evp++; + } + } } /* @@ -1302,6 +1369,7 @@ module_do_load(const char *name, bool isdep, int flags, } module_load_sysctl(mod); /* Set-up module's sysctl if any */ + module_load_evcnt(mod); /* Attach any static evcnt needed */ /* * Good, the module loaded successfully. Put it onto the @@ -1390,10 +1458,12 @@ module_do_unload(const char *name, bool load_requires_force) if (mod->mod_sysctllog) { sysctl_teardown(&mod->mod_sysctllog); } + module_unload_evcnt(mod); error = (*mod->mod_info->mi_modcmd)(MODULE_CMD_FINI, NULL); module_active = prev_active; if (error != 0) { module_load_sysctl(mod); /* re-enable sysctl stuff */ + module_load_evcnt(mod); /* and reenable evcnts */ module_print("cannot unload module `%s' error=%d", name, error); return error; From 01d70e2ebf6a89fccfcf29c32dee312cec3ef721 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Thu, 20 Feb 2020 15:06:47 +0000 Subject: [PATCH 04/30] Fix kassert in lfs by initializing vp first. --- lib/libp2k/p2k.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/libp2k/p2k.c b/lib/libp2k/p2k.c index 45ae7f78ab59..9942ea8b8368 100644 --- a/lib/libp2k/p2k.c +++ b/lib/libp2k/p2k.c @@ -789,7 +789,7 @@ do_makenode(struct puffs_usermount *pu, struct p2k_node *p2n_dir, struct p2k_node *p2n; struct componentname *cn; struct vattr *va_x; - struct vnode *vp; + struct vnode *vp = NULL; int rv; p2n = malloc(sizeof(*p2n)); From 3873e68b4bb5d450154b97c13e12fc769af394e1 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Fri, 21 Feb 2020 23:25:17 +0000 Subject: [PATCH 05/30] Use a marker node to iterate lfs_dchainhd / i_lfs_dchain. I believe elements can be removed while the lock is dropped, including the next node we're hanging on to. --- sys/ufs/lfs/lfs_inode.h | 1 + sys/ufs/lfs/lfs_subr.c | 36 +++++++++++++++++++++++++++++------- sys/ufs/lfs/lfs_vnops.c | 28 +++++++++++++++++++++++++--- 3 files changed, 55 insertions(+), 10 deletions(-) diff --git a/sys/ufs/lfs/lfs_inode.h b/sys/ufs/lfs/lfs_inode.h index e0d0404d226e..384f6f09990d 100644 --- a/sys/ufs/lfs/lfs_inode.h +++ b/sys/ufs/lfs/lfs_inode.h @@ -123,6 +123,7 @@ struct inode { /* unused 0x0400 */ /* was FFS-only IN_SPACECOUNTED */ #define IN_PAGING 0x1000 /* LFS: file is on paging queue */ #define IN_CDIROP 0x4000 /* LFS: dirop completed pending i/o */ +#define IN_MARKER 0x00010000 /* LFS: marker inode for iteration */ /* XXX this is missing some of the flags */ #define IN_ALLMOD (IN_MODIFIED|IN_ACCESS|IN_CHANGE|IN_UPDATE|IN_MODIFY|IN_ACCESSED|IN_CLEANING) diff --git a/sys/ufs/lfs/lfs_subr.c b/sys/ufs/lfs/lfs_subr.c index 9a91e784736a..83b5ae7b69e1 100644 --- a/sys/ufs/lfs/lfs_subr.c +++ b/sys/ufs/lfs/lfs_subr.c @@ -337,10 +337,14 @@ lfs_seglock(struct lfs *fs, unsigned long flags) static void lfs_unmark_dirop(struct lfs *); +static struct evcnt lfs_dchain_marker_pass_dirop = + EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "lfs", "dchain marker pass dirop"); +EVCNT_ATTACH_STATIC(lfs_dchain_marker_pass_dirop); + static void lfs_unmark_dirop(struct lfs *fs) { - struct inode *ip, *nip; + struct inode *ip, *marker; struct vnode *vp; int doit; @@ -349,13 +353,28 @@ lfs_unmark_dirop(struct lfs *fs) doit = !(fs->lfs_flags & LFS_UNDIROP); if (doit) fs->lfs_flags |= LFS_UNDIROP; - if (!doit) { - mutex_exit(&lfs_lock); + mutex_exit(&lfs_lock); + + if (!doit) return; - } - for (ip = TAILQ_FIRST(&fs->lfs_dchainhd); ip != NULL; ip = nip) { - nip = TAILQ_NEXT(ip, i_lfs_dchain); + marker = pool_get(&lfs_inode_pool, PR_WAITOK); + KASSERT(fs != NULL); + memset(marker, 0, sizeof(*marker)); + marker->inode_ext.lfs = pool_get(&lfs_inoext_pool, PR_WAITOK); + memset(marker->inode_ext.lfs, 0, sizeof(*marker->inode_ext.lfs)); + marker->i_state |= IN_MARKER; + + mutex_enter(&lfs_lock); + TAILQ_INSERT_HEAD(&fs->lfs_dchainhd, marker, i_lfs_dchain); + while ((ip = TAILQ_NEXT(marker, i_lfs_dchain)) != NULL) { + TAILQ_REMOVE(&fs->lfs_dchainhd, marker, i_lfs_dchain); + TAILQ_INSERT_AFTER(&fs->lfs_dchainhd, ip, marker, + i_lfs_dchain); + if (ip->i_state & IN_MARKER) { + lfs_dchain_marker_pass_dirop.ev_count++; + continue; + } vp = ITOV(ip); if ((ip->i_state & (IN_ADIROP | IN_CDIROP)) == IN_CDIROP) { --lfs_dirvcount; @@ -371,10 +390,13 @@ lfs_unmark_dirop(struct lfs *fs) ip->i_state &= ~IN_CDIROP; } } - + TAILQ_REMOVE(&fs->lfs_dchainhd, marker, i_lfs_dchain); fs->lfs_flags &= ~LFS_UNDIROP; wakeup(&fs->lfs_flags); mutex_exit(&lfs_lock); + + pool_put(&lfs_inoext_pool, marker->inode_ext.lfs); + pool_put(&lfs_inode_pool, marker); } static void diff --git a/sys/ufs/lfs/lfs_vnops.c b/sys/ufs/lfs/lfs_vnops.c index 137c65397c21..e7b4ca25c1cc 100644 --- a/sys/ufs/lfs/lfs_vnops.c +++ b/sys/ufs/lfs/lfs_vnops.c @@ -1594,6 +1594,10 @@ lfs_strategy(void *v) return VOP_STRATEGY(vp, bp); } +static struct evcnt lfs_dchain_marker_pass_flush = + EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "lfs", "dchain marker pass flush"); +EVCNT_ATTACH_STATIC(lfs_dchain_marker_pass_flush); + /* * Inline lfs_segwrite/lfs_writevnodes, but just for dirops. * Technically this is a checkpoint (the on-disk state is valid) @@ -1602,7 +1606,7 @@ lfs_strategy(void *v) int lfs_flush_dirops(struct lfs *fs) { - struct inode *ip, *nip; + struct inode *ip, *marker; struct vnode *vp; extern int lfs_dostats; /* XXX this does not belong here */ struct segment *sp; @@ -1626,6 +1630,12 @@ lfs_flush_dirops(struct lfs *fs) if (lfs_dostats) ++lfs_stats.flush_invoked; + marker = pool_get(&lfs_inode_pool, PR_WAITOK); + memset(marker, 0, sizeof(*marker)); + marker->inode_ext.lfs = pool_get(&lfs_inoext_pool, PR_WAITOK); + memset(marker->inode_ext.lfs, 0, sizeof(*marker->inode_ext.lfs)); + marker->i_state = IN_MARKER; + lfs_imtime(fs); lfs_seglock(fs, flags); sp = fs->lfs_sp; @@ -1644,8 +1654,15 @@ lfs_flush_dirops(struct lfs *fs) * */ mutex_enter(&lfs_lock); - for (ip = TAILQ_FIRST(&fs->lfs_dchainhd); ip != NULL; ip = nip) { - nip = TAILQ_NEXT(ip, i_lfs_dchain); + TAILQ_INSERT_HEAD(&fs->lfs_dchainhd, marker, i_lfs_dchain); + while ((ip = TAILQ_NEXT(marker, i_lfs_dchain)) != NULL) { + TAILQ_REMOVE(&fs->lfs_dchainhd, marker, i_lfs_dchain); + TAILQ_INSERT_AFTER(&fs->lfs_dchainhd, ip, marker, + i_lfs_dchain); + if (ip->i_state & IN_MARKER) { + lfs_dchain_marker_pass_flush.ev_count++; + continue; + } mutex_exit(&lfs_lock); vp = ITOV(ip); mutex_enter(vp->v_interlock); @@ -1704,7 +1721,9 @@ lfs_flush_dirops(struct lfs *fs) /* XXX only for non-directories? --KS */ LFS_SET_UINO(ip, IN_MODIFIED); } + TAILQ_REMOVE(&fs->lfs_dchainhd, marker, i_lfs_dchain); mutex_exit(&lfs_lock); + /* We've written all the dirops there are */ ssp = (SEGSUM *)sp->segsum; lfs_ss_setflags(fs, ssp, lfs_ss_getflags(fs, ssp) & ~(SS_CONT)); @@ -1712,6 +1731,9 @@ lfs_flush_dirops(struct lfs *fs) (void) lfs_writeseg(fs, sp); lfs_segunlock(fs); + pool_put(&lfs_inoext_pool, marker->inode_ext.lfs); + pool_put(&lfs_inode_pool, marker); + return error; } From d8687df04673464bcd7af5467413c220ba8c7488 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Fri, 21 Feb 2020 23:33:17 +0000 Subject: [PATCH 06/30] Just use VOP_BWRITE for lfs_bwrite_log. Hope this doesn't cause trouble with vfs_suspend. --- sys/ufs/lfs/lfs_debug.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/sys/ufs/lfs/lfs_debug.c b/sys/ufs/lfs/lfs_debug.c index a9484c611cd4..516a693e42f1 100644 --- a/sys/ufs/lfs/lfs_debug.c +++ b/sys/ufs/lfs/lfs_debug.c @@ -84,16 +84,12 @@ struct lfs_log_entry lfs_log[LFS_LOGLENGTH]; int lfs_bwrite_log(struct buf *bp, const char *file, int line) { - struct vop_bwrite_args a; - - a.a_desc = VDESC(vop_bwrite); - a.a_bp = bp; if (!(bp->b_flags & B_GATHERED) && !(bp->b_oflags & BO_DELWRI)) { LFS_ENTER_LOG("write", file, line, bp->b_lblkno, bp->b_flags, curproc->p_pid); } - return (VCALL(bp->b_vp, VOFFSET(vop_bwrite), &a)); + return VOP_BWRITE(bp->b_vp, bp); } void From 7b1b4c6416eda3ee669ee9a89e7ae59dff3c32c7 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Fri, 21 Feb 2020 23:34:50 +0000 Subject: [PATCH 07/30] Teach lfs to transition ro<->rw. --- sys/ufs/lfs/lfs_vfsops.c | 139 ++++++++++++++++++++++----------------- 1 file changed, 77 insertions(+), 62 deletions(-) diff --git a/sys/ufs/lfs/lfs_vfsops.c b/sys/ufs/lfs/lfs_vfsops.c index a2ac2d16d89e..44991f109fd7 100644 --- a/sys/ufs/lfs/lfs_vfsops.c +++ b/sys/ufs/lfs/lfs_vfsops.c @@ -120,6 +120,7 @@ MODULE(MODULE_CLASS_VFS, lfs, NULL); static int lfs_gop_write(struct vnode *, struct vm_page **, int, int); static int lfs_mountfs(struct vnode *, struct mount *, struct lwp *); +static int lfs_flushfiles(struct mount *, int); static struct sysctllog *lfs_sysctl_log; @@ -755,23 +756,18 @@ lfs_mount(struct mount *mp, const char *path, void *data, size_t *data_len) ump = VFSTOULFS(mp); fs = ump->um_lfs; - if (fs->lfs_ronly == 0 && (mp->mnt_flag & MNT_RDONLY)) { + if (!fs->lfs_ronly && (mp->mnt_iflag & IMNT_WANTRDONLY)) { /* * Changing from read/write to read-only. - * XXX: shouldn't we sync here? or does vfs do that? */ -#ifdef LFS_QUOTA2 - /* XXX: quotas should remain on when readonly */ - if (fs->lfs_use_quota2) { - error = lfsquota2_umount(mp, 0); - if (error) { - return error; - } - } -#endif - } - - if (fs->lfs_ronly && (mp->mnt_iflag & IMNT_WANTRDWR)) { + int flags = WRITECLOSE; + if (mp->mnt_flag & MNT_FORCE) + flags |= FORCECLOSE; + error = lfs_flushfiles(mp, flags); + if (error) + return error; + fs->lfs_ronly = 1; + } else if (fs->lfs_ronly && (mp->mnt_iflag & IMNT_WANTRDWR)) { /* * Changing from read-only to read/write. * Note in the superblocks that we're writing. @@ -805,8 +801,9 @@ lfs_mount(struct mount *mp, const char *path, void *data, size_t *data_len) lfs_writesuper(fs, lfs_sb_getsboff(fs, 1)); } } + if (args->fspec == NULL) - return EINVAL; + return 0; } error = set_statvfs_info(path, UIO_USERSPACE, args->fspec, @@ -1137,6 +1134,7 @@ lfs_mountfs(struct vnode *devvp, struct mount *mp, struct lwp *l) mp->mnt_stat.f_iosize = lfs_sb_getbsize(fs); mp->mnt_flag |= MNT_LOCAL; mp->mnt_fs_bshift = lfs_sb_getbshift(fs); + mp->mnt_iflag |= IMNT_CAN_RWTORO; if (fs->um_maxsymlinklen > 0) mp->mnt_iflag |= IMNT_DTYPE; else @@ -1328,22 +1326,72 @@ out: int lfs_unmount(struct mount *mp, int mntflags) { - struct lwp *l = curlwp; struct ulfsmount *ump; struct lfs *fs; - int error, flags, ronly; - vnode_t *vp; + int error, ronly; + + ump = VFSTOULFS(mp); + fs = ump->um_lfs; - flags = 0; - if (mntflags & MNT_FORCE) - flags |= FORCECLOSE; + error = lfs_flushfiles(mp, mntflags & MNT_FORCE ? FORCECLOSE : 0); + if (error) + return error; + + /* Finish with the Ifile, now that we're done with it */ + vgone(fs->lfs_ivnode); + + ronly = !fs->lfs_ronly; + if (fs->lfs_devvp->v_type != VBAD) + spec_node_setmountedfs(fs->lfs_devvp, NULL); + vn_lock(fs->lfs_devvp, LK_EXCLUSIVE | LK_RETRY); + error = VOP_CLOSE(fs->lfs_devvp, + ronly ? FREAD : FREAD|FWRITE, NOCRED); + vput(fs->lfs_devvp); + + /* Complain about page leakage */ + if (fs->lfs_pages > 0) + printf("lfs_unmount: still claim %d pages (%d in subsystem)\n", + fs->lfs_pages, lfs_subsys_pages); + + /* Free per-mount data structures */ + free(fs->lfs_ino_bitmap, M_SEGMENT); + free(fs->lfs_suflags[0], M_SEGMENT); + free(fs->lfs_suflags[1], M_SEGMENT); + free(fs->lfs_suflags, M_SEGMENT); + lfs_free_resblks(fs); + cv_destroy(&fs->lfs_sleeperscv); + cv_destroy(&fs->lfs_diropscv); + cv_destroy(&fs->lfs_stopcv); + cv_destroy(&fs->lfs_nextsegsleep); + + rw_destroy(&fs->lfs_fraglock); + rw_destroy(&fs->lfs_iflock); + + kmem_free(fs, sizeof(struct lfs)); + kmem_free(ump, sizeof(*ump)); + + mp->mnt_data = NULL; + mp->mnt_flag &= ~MNT_LOCAL; + return (error); +} + +static int +lfs_flushfiles(struct mount *mp, int flags) +{ + struct lwp *l = curlwp; + struct ulfsmount *ump; + struct lfs *fs; + struct vnode *vp; + int error; ump = VFSTOULFS(mp); fs = ump->um_lfs; /* Two checkpoints */ - lfs_segwrite(mp, SEGM_CKP | SEGM_SYNC); - lfs_segwrite(mp, SEGM_CKP | SEGM_SYNC); + if (!fs->lfs_ronly) { + lfs_segwrite(mp, SEGM_CKP | SEGM_SYNC); + lfs_segwrite(mp, SEGM_CKP | SEGM_SYNC); + } /* wake up the cleaner so it can die */ /* XXX: shouldn't this be *after* the error cases below? */ @@ -1383,51 +1431,18 @@ lfs_unmount(struct mount *mp, int mntflags) mutex_exit(vp->v_interlock); /* Explicitly write the superblock, to update serial and pflags */ - lfs_sb_setpflags(fs, lfs_sb_getpflags(fs) | LFS_PF_CLEAN); - lfs_writesuper(fs, lfs_sb_getsboff(fs, 0)); - lfs_writesuper(fs, lfs_sb_getsboff(fs, 1)); + if (!fs->lfs_ronly) { + lfs_sb_setpflags(fs, lfs_sb_getpflags(fs) | LFS_PF_CLEAN); + lfs_writesuper(fs, lfs_sb_getsboff(fs, 0)); + lfs_writesuper(fs, lfs_sb_getsboff(fs, 1)); + } mutex_enter(&lfs_lock); while (fs->lfs_iocount) mtsleep(&fs->lfs_iocount, PRIBIO + 1, "lfs_umount", 0, &lfs_lock); mutex_exit(&lfs_lock); - /* Finish with the Ifile, now that we're done with it */ - vgone(fs->lfs_ivnode); - - ronly = !fs->lfs_ronly; - if (fs->lfs_devvp->v_type != VBAD) - spec_node_setmountedfs(fs->lfs_devvp, NULL); - vn_lock(fs->lfs_devvp, LK_EXCLUSIVE | LK_RETRY); - error = VOP_CLOSE(fs->lfs_devvp, - ronly ? FREAD : FREAD|FWRITE, NOCRED); - vput(fs->lfs_devvp); - - /* Complain about page leakage */ - if (fs->lfs_pages > 0) - printf("lfs_unmount: still claim %d pages (%d in subsystem)\n", - fs->lfs_pages, lfs_subsys_pages); - - /* Free per-mount data structures */ - free(fs->lfs_ino_bitmap, M_SEGMENT); - free(fs->lfs_suflags[0], M_SEGMENT); - free(fs->lfs_suflags[1], M_SEGMENT); - free(fs->lfs_suflags, M_SEGMENT); - lfs_free_resblks(fs); - cv_destroy(&fs->lfs_sleeperscv); - cv_destroy(&fs->lfs_diropscv); - cv_destroy(&fs->lfs_stopcv); - cv_destroy(&fs->lfs_nextsegsleep); - - rw_destroy(&fs->lfs_fraglock); - rw_destroy(&fs->lfs_iflock); - - kmem_free(fs, sizeof(struct lfs)); - kmem_free(ump, sizeof(*ump)); - - mp->mnt_data = NULL; - mp->mnt_flag &= ~MNT_LOCAL; - return (error); + return 0; } /* From b84025edf69afd3a4cd55fd638e006af70dfc08f Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Sat, 22 Feb 2020 03:55:00 +0000 Subject: [PATCH 08/30] Prevent new dirops while we issue lfs_flush_dirops. lfs_flush_dirops assumes (by KASSERT((ip->i_state & IN_ADIROP) == 0)) that vnodes on the dchain will not become involved in active dirops even while holding no other locks (lfs_lock, v_interlock), so we must set lfs_writer here. All other callers already set lfs_writer. We set fs->lfs_writer++ without explicitly doing lfs_writer_enter because (a) we already waited for the dirops to drain, and (b) we hold lfs_lock and cannot drop it before setting lfs_writer. --- sys/ufs/lfs/lfs_bio.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sys/ufs/lfs/lfs_bio.c b/sys/ufs/lfs/lfs_bio.c index f2972c009827..f23c6736cd54 100644 --- a/sys/ufs/lfs/lfs_bio.c +++ b/sys/ufs/lfs/lfs_bio.c @@ -653,9 +653,14 @@ lfs_check(struct vnode *vp, daddr_t blkno, int flags) /* If there are too many pending dirops, we have to flush them. */ if (fs->lfs_dirvcount > LFS_MAX_FSDIROP(fs) || lfs_dirvcount > LFS_MAX_DIROP || fs->lfs_diropwait > 0) { + KASSERT(fs->lfs_dirops == 0); + fs->lfs_writer++; mutex_exit(&lfs_lock); lfs_flush_dirops(fs); mutex_enter(&lfs_lock); + if (--fs->lfs_writer == 0) + cv_broadcast(&fs->lfs_diropscv); + KASSERT(fs->lfs_dirops == 0); } else if (locked_queue_count + INOCOUNT(fs) > LFS_MAX_BUFS || locked_queue_bytes + INOBYTES(fs) > LFS_MAX_BYTES || lfs_subsys_pages > LFS_MAX_PAGES || From 0189a318a28bca90cf7d068d7a81015271af88bc Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Sat, 22 Feb 2020 03:57:44 +0000 Subject: [PATCH 09/30] Assert lfs_writer where I think we can now prove it. --- sys/ufs/lfs/lfs_vnops.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sys/ufs/lfs/lfs_vnops.c b/sys/ufs/lfs/lfs_vnops.c index e7b4ca25c1cc..5e1711662c06 100644 --- a/sys/ufs/lfs/lfs_vnops.c +++ b/sys/ufs/lfs/lfs_vnops.c @@ -1615,12 +1615,14 @@ lfs_flush_dirops(struct lfs *fs) int error = 0; ASSERT_MAYBE_SEGLOCK(fs); - KASSERT(fs->lfs_nadirop == 0); + KASSERT(fs->lfs_nadirop == 0); /* stable during lfs_writer */ + KASSERT(fs->lfs_dirops == 0); /* stable during lfs_writer */ if (fs->lfs_ronly) return EROFS; mutex_enter(&lfs_lock); + KASSERT(fs->lfs_writer); if (TAILQ_FIRST(&fs->lfs_dchainhd) == NULL) { mutex_exit(&lfs_lock); return 0; @@ -1654,6 +1656,7 @@ lfs_flush_dirops(struct lfs *fs) * */ mutex_enter(&lfs_lock); + KASSERT(fs->lfs_writer); TAILQ_INSERT_HEAD(&fs->lfs_dchainhd, marker, i_lfs_dchain); while ((ip = TAILQ_NEXT(marker, i_lfs_dchain)) != NULL) { TAILQ_REMOVE(&fs->lfs_dchainhd, marker, i_lfs_dchain); @@ -1754,6 +1757,7 @@ lfs_flush_pchain(struct lfs *fs) int error, error2; ASSERT_NO_SEGLOCK(fs); + KASSERT(fs->lfs_writer); if (fs->lfs_ronly) return EROFS; From 3af9c9dd0e842a91264df41ff6dc8b022edcc96e Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Sat, 22 Feb 2020 03:58:09 +0000 Subject: [PATCH 10/30] Serialize access to the splay tree with lfs_lock. --- sys/ufs/lfs/lfs_balloc.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/sys/ufs/lfs/lfs_balloc.c b/sys/ufs/lfs/lfs_balloc.c index 3513e17388d5..fb8c4d5c0336 100644 --- a/sys/ufs/lfs/lfs_balloc.c +++ b/sys/ufs/lfs/lfs_balloc.c @@ -660,9 +660,10 @@ lfs_register_block(struct vnode *vp, daddr_t lbn) static void lfs_do_deregister(struct lfs *fs, struct inode *ip, struct lbnentry *lbp) { + + KASSERT(mutex_owned(&lfs_lock)); ASSERT_MAYBE_SEGLOCK(fs); - mutex_enter(&lfs_lock); --ip->i_lfs_nbtree; SPLAY_REMOVE(lfs_splay, &ip->i_lfs_lbtree, lbp); if (fs->lfs_favail > lfs_btofsb(fs, (1 << lfs_sb_getbshift(fs)))) @@ -671,9 +672,12 @@ lfs_do_deregister(struct lfs *fs, struct inode *ip, struct lbnentry *lbp) if (locked_fakequeue_count > 0) --locked_fakequeue_count; lfs_subsys_pages -= lfs_sb_getbsize(fs) >> PAGE_SHIFT; - mutex_exit(&lfs_lock); + mutex_exit(&lfs_lock); pool_put(&lfs_lbnentry_pool, lbp); + mutex_enter(&lfs_lock); + + KASSERT(mutex_owned(&lfs_lock)); } void @@ -690,19 +694,18 @@ lfs_deregister_block(struct vnode *vp, daddr_t lbn) if (lbn < 0 || vp->v_type != VREG || ip->i_number == LFS_IFILE_INUM) return; + mutex_enter(&lfs_lock); fs = ip->i_lfs; tmp.lbn = lbn; - lbp = SPLAY_FIND(lfs_splay, &ip->i_lfs_lbtree, &tmp); - if (lbp == NULL) - return; - - lfs_do_deregister(fs, ip, lbp); + if ((lbp = SPLAY_FIND(lfs_splay, &ip->i_lfs_lbtree, &tmp)) != NULL) + lfs_do_deregister(fs, ip, lbp); + mutex_exit(&lfs_lock); } void lfs_deregister_all(struct vnode *vp) { - struct lbnentry *lbp, *nlbp; + struct lbnentry *lbp; struct lfs_splay *hd; struct lfs *fs; struct inode *ip; @@ -711,8 +714,8 @@ lfs_deregister_all(struct vnode *vp) fs = ip->i_lfs; hd = &ip->i_lfs_lbtree; - for (lbp = SPLAY_MIN(lfs_splay, hd); lbp != NULL; lbp = nlbp) { - nlbp = SPLAY_NEXT(lfs_splay, hd, lbp); + mutex_enter(&lfs_lock); + while ((lbp = SPLAY_MIN(lfs_splay, hd)) != NULL) lfs_do_deregister(fs, ip, lbp); - } + mutex_exit(&lfs_lock); } From cd9c704519d2ebb52fff0063f8ded69992596087 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Sat, 22 Feb 2020 16:02:37 +0000 Subject: [PATCH 11/30] Change some cheap KDASSERT into KASSERT. --- sys/ufs/lfs/lfs_segment.c | 2 +- sys/ufs/lfs/lfs_vnops.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sys/ufs/lfs/lfs_segment.c b/sys/ufs/lfs/lfs_segment.c index 54207304bf04..ca2fd9b3fb86 100644 --- a/sys/ufs/lfs/lfs_segment.c +++ b/sys/ufs/lfs/lfs_segment.c @@ -399,7 +399,7 @@ lfs_vflush(struct vnode *vp) * still not done with this vnode. * XXX we can do better than this. */ - KDASSERT(ip->i_number != LFS_IFILE_INUM); + KASSERT(ip->i_number != LFS_IFILE_INUM); lfs_writeinode(fs, sp, ip); mutex_enter(&lfs_lock); LFS_SET_UINO(ip, IN_MODIFIED); diff --git a/sys/ufs/lfs/lfs_vnops.c b/sys/ufs/lfs/lfs_vnops.c index 5e1711662c06..f038d98d26a7 100644 --- a/sys/ufs/lfs/lfs_vnops.c +++ b/sys/ufs/lfs/lfs_vnops.c @@ -1706,7 +1706,7 @@ lfs_flush_dirops(struct lfs *fs) break; } } - KDASSERT(ip->i_number != LFS_IFILE_INUM); + KASSERT(ip->i_number != LFS_IFILE_INUM); error = lfs_writeinode(fs, sp, ip); mutex_enter(&lfs_lock); if (error && (sp->seg_flags & SEGM_SINGLE)) { @@ -1828,7 +1828,7 @@ lfs_flush_pchain(struct lfs *fs) LFS_SET_UINO(ip, IN_MODIFIED); mutex_exit(&lfs_lock); } - KDASSERT(ip->i_number != LFS_IFILE_INUM); + KASSERT(ip->i_number != LFS_IFILE_INUM); error2 = lfs_writeinode(fs, sp, ip); VOP_UNLOCK(vp); From 6631b92b3d90e6bae3388fcb04010562b95cd055 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Sat, 22 Feb 2020 16:12:42 +0000 Subject: [PATCH 12/30] Take a reference and fix assertions in lfs_flush_dirops. Fixes panic: KASSERT((ip->i_state & IN_ADIROP) == 0) at lfs_vnops.c:1670 lfs_flush_dirops lfs_check lfs_setattr VOP_SETATTR change_mode sys_fchmod syscall This assertion -- and the assertion that vp->v_uflag has VU_DIROP set -- is valid only until we release lfs_lock, because we may race with lfs_unmark_dirop which will remove the nodes and change the flags. Further, vp itself is valid only as long as it is referenced, which it is as long as it's on the dchain, but lfs_unmark_dirop drops the dchain's reference. --- sys/ufs/lfs/lfs_vnops.c | 45 ++++++++++++++++++++++++++++------------- 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/sys/ufs/lfs/lfs_vnops.c b/sys/ufs/lfs/lfs_vnops.c index f038d98d26a7..eab03e502a14 100644 --- a/sys/ufs/lfs/lfs_vnops.c +++ b/sys/ufs/lfs/lfs_vnops.c @@ -1622,7 +1622,6 @@ lfs_flush_dirops(struct lfs *fs) return EROFS; mutex_enter(&lfs_lock); - KASSERT(fs->lfs_writer); if (TAILQ_FIRST(&fs->lfs_dchainhd) == NULL) { mutex_exit(&lfs_lock); return 0; @@ -1666,13 +1665,33 @@ lfs_flush_dirops(struct lfs *fs) lfs_dchain_marker_pass_flush.ev_count++; continue; } - mutex_exit(&lfs_lock); vp = ITOV(ip); - mutex_enter(vp->v_interlock); + /* + * Prevent the vnode from going away if it's just been + * put out in the segment and lfs_unmark_dirop is about + * to release it. While it is on the list it is always + * referenced, so it cannot be reclaimed until we + * release it. + */ + vref(vp); + + /* + * Since we hold lfs_writer, the node can't be in an + * active dirop. Since it's on the list and we hold a + * reference to it, it can't be reclaimed now. + */ KASSERT((ip->i_state & IN_ADIROP) == 0); KASSERT(vp->v_uflag & VU_DIROP); - KASSERT(vdead_check(vp, VDEAD_NOWAIT) == 0); + + /* + * After we release lfs_lock, if we were in the middle + * of writing a segment, lfs_unmark_dirop may end up + * clearing VU_DIROP, and we have no way to stop it. + * That should be OK -- we'll just have less to do + * here. + */ + mutex_exit(&lfs_lock); /* * All writes to directories come from dirops; all @@ -1682,15 +1701,6 @@ lfs_flush_dirops(struct lfs *fs) * directory blocks inodes and file inodes. So we don't * really need to lock. */ - if (vdead_check(vp, VDEAD_NOWAIT) != 0) { - mutex_exit(vp->v_interlock); - mutex_enter(&lfs_lock); - continue; - } - mutex_exit(vp->v_interlock); - /* XXX see below - * waslocked = VOP_ISLOCKED(vp); - */ if (vp->v_type != VREG && ((ip->i_state & IN_ALLMOD) || !VPISEMPTY(vp))) { error = lfs_writefile(fs, sp, vp); @@ -1701,6 +1711,7 @@ lfs_flush_dirops(struct lfs *fs) mutex_exit(&lfs_lock); } if (error && (sp->seg_flags & SEGM_SINGLE)) { + vrele(vp); mutex_enter(&lfs_lock); error = EAGAIN; break; @@ -1708,8 +1719,9 @@ lfs_flush_dirops(struct lfs *fs) } KASSERT(ip->i_number != LFS_IFILE_INUM); error = lfs_writeinode(fs, sp, ip); - mutex_enter(&lfs_lock); if (error && (sp->seg_flags & SEGM_SINGLE)) { + vrele(vp); + mutex_enter(&lfs_lock); error = EAGAIN; break; } @@ -1722,7 +1734,12 @@ lfs_flush_dirops(struct lfs *fs) * write them. */ /* XXX only for non-directories? --KS */ + mutex_enter(&lfs_lock); LFS_SET_UINO(ip, IN_MODIFIED); + mutex_exit(&lfs_lock); + + vrele(vp); + mutex_enter(&lfs_lock); } TAILQ_REMOVE(&fs->lfs_dchainhd, marker, i_lfs_dchain); mutex_exit(&lfs_lock); From 6de2374d026a6765d74fb9d37e62bab6b5678eee Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Sat, 22 Feb 2020 16:18:48 +0000 Subject: [PATCH 13/30] Don't lfs_writer_enter while holding v_interlock. There's no need to lfs_writer_enter at all here, as far as I can see. lfs_flush_fs will do it for us. --- sys/ufs/lfs/lfs_pages.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/sys/ufs/lfs/lfs_pages.c b/sys/ufs/lfs/lfs_pages.c index 479d950d567b..25ae245915fa 100644 --- a/sys/ufs/lfs/lfs_pages.c +++ b/sys/ufs/lfs/lfs_pages.c @@ -710,29 +710,30 @@ retry: (vp->v_uflag & VU_DIROP)) { DLOG((DLOG_PAGE, "lfs_putpages: flushing VU_DIROP\n")); - lfs_writer_enter(fs, "ppdirop"); + /* + * NB: lfs_flush_fs can recursively call lfs_putpages, + * but it won't reach this branch because it passes + * PGO_LOCKED. + */ - /* Note if we hold the vnode locked */ - if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE) - { - DLOG((DLOG_PAGE, "lfs_putpages: dirop inode already locked\n")); - } else { - DLOG((DLOG_PAGE, "lfs_putpages: dirop inode not locked\n")); - } mutex_exit(vp->v_interlock); - mutex_enter(&lfs_lock); lfs_flush_fs(fs, sync ? SEGM_SYNC : 0); mutex_exit(&lfs_lock); - mutex_enter(vp->v_interlock); - lfs_writer_leave(fs); /* * The flush will have cleaned out this vnode as well, * no need to do more to it. * XXX then why are we falling through and continuing? */ + + /* + * XXX State may have changed while we dropped the + * lock; start over just in case. The above comment + * suggests this should maybe instead be goto out. + */ + goto retry; } /* From 9c0e87c1a33a710c6049c96ee6048796b48e4d91 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Sat, 22 Feb 2020 18:05:40 +0000 Subject: [PATCH 14/30] Break deadlock in PR kern/52301. The lock order is lfs_writer -> lfs_seglock. The problem in 52301 is that lfs_segwrite violates this lock order by sometimes doing lfs_seglock -> lfs_writer, either (a) when doing a checkpoint or (b), opportunistically, when there are no dirops pending. Both cases can deadlock, because dirops sometimes take the seglock (lfs_truncate, lfs_valloc, lfs_vfree): (a) There may be dirops pending, and they may be waiting for the seglock, so we can't wait for them to complete while holding the seglock. (b) The test for fs->lfs_dirops == 0 happens unlocked, and the state may change by the time lfs_writer_enter acquires lfs_lock. To resolve this in each case: (a) Do lfs_writer_enter before lfs_seglock, since we will need it unconditionally anyway. The worst performance impact of this should be that some dirops get delayed a little bit. (b) Create a new lfs_writer_tryenter to use at this point so that the test for fs->lfs_dirops == 0 and the acquisition of lfs_writer happen atomically under lfs_lock. --- sys/ufs/lfs/lfs_extern.h | 1 + sys/ufs/lfs/lfs_segment.c | 18 +++++++++++++----- sys/ufs/lfs/lfs_subr.c | 17 ++++++++++++++++- 3 files changed, 30 insertions(+), 6 deletions(-) diff --git a/sys/ufs/lfs/lfs_extern.h b/sys/ufs/lfs/lfs_extern.h index 0e2724d78968..a99728306f43 100644 --- a/sys/ufs/lfs/lfs_extern.h +++ b/sys/ufs/lfs/lfs_extern.h @@ -211,6 +211,7 @@ int lfs_seglock(struct lfs *, unsigned long); void lfs_segunlock(struct lfs *); void lfs_segunlock_relock(struct lfs *); int lfs_writer_enter(struct lfs *, const char *); +int lfs_writer_tryenter(struct lfs *); void lfs_writer_leave(struct lfs *); void lfs_wakeup_cleaner(struct lfs *); diff --git a/sys/ufs/lfs/lfs_segment.c b/sys/ufs/lfs/lfs_segment.c index ca2fd9b3fb86..b54855c74234 100644 --- a/sys/ufs/lfs/lfs_segment.c +++ b/sys/ufs/lfs/lfs_segment.c @@ -624,6 +624,15 @@ lfs_segwrite(struct mount *mp, int flags) */ do_ckp = LFS_SHOULD_CHECKPOINT(fs, flags); + /* + * If we know we're gonna need the writer lock, take it now to + * preserve the lock order lfs_writer -> lfs_seglock. + */ + if (do_ckp) { + lfs_writer_enter(fs, "ckpwriter"); + writer_set = 1; + } + /* We can't do a partial write and checkpoint at the same time. */ if (do_ckp) flags &= ~SEGM_SINGLE; @@ -653,11 +662,10 @@ lfs_segwrite(struct mount *mp, int flags) break; } - if (do_ckp || fs->lfs_dirops == 0) { - if (!writer_set) { - lfs_writer_enter(fs, "lfs writer"); - writer_set = 1; - } + if (do_ckp || + (writer_set = lfs_writer_tryenter(fs)) != 0) { + KASSERT(writer_set); + KASSERT(fs->lfs_writer); error = lfs_writevnodes(fs, mp, sp, VN_DIROP); if (um_error == 0) um_error = error; diff --git a/sys/ufs/lfs/lfs_subr.c b/sys/ufs/lfs/lfs_subr.c index 83b5ae7b69e1..db7c3444390f 100644 --- a/sys/ufs/lfs/lfs_subr.c +++ b/sys/ufs/lfs/lfs_subr.c @@ -575,7 +575,7 @@ lfs_writer_enter(struct lfs *fs, const char *wmesg) { int error = 0; - ASSERT_MAYBE_SEGLOCK(fs); + ASSERT_NO_SEGLOCK(fs); mutex_enter(&lfs_lock); /* disallow dirops during flush */ @@ -596,6 +596,21 @@ lfs_writer_enter(struct lfs *fs, const char *wmesg) return error; } +int +lfs_writer_tryenter(struct lfs *fs) +{ + int writer_set; + + ASSERT_MAYBE_SEGLOCK(fs); + mutex_enter(&lfs_lock); + writer_set = (fs->lfs_dirops == 0); + if (writer_set) + fs->lfs_writer++; + mutex_exit(&lfs_lock); + + return writer_set; +} + void lfs_writer_leave(struct lfs *fs) { From dd6ccc131fba42bf41a6fbaadedd42c7550f6ea7 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Sat, 22 Feb 2020 23:43:52 +0000 Subject: [PATCH 15/30] Initialize/destroy lfs_allclean_wakeup in modcmd, not lfs_mountfs. Fixes reloading lfs.kmod. --- sys/ufs/lfs/lfs_vfsops.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/sys/ufs/lfs/lfs_vfsops.c b/sys/ufs/lfs/lfs_vfsops.c index 44991f109fd7..505638374677 100644 --- a/sys/ufs/lfs/lfs_vfsops.c +++ b/sys/ufs/lfs/lfs_vfsops.c @@ -356,6 +356,7 @@ lfs_modcmd(modcmd_t cmd, void *arg) break; } lfs_sysctl_setup(&lfs_sysctl_log); + cv_init(&lfs_allclean_wakeup, "segment"); break; case MODULE_CMD_FINI: error = vfs_detach(&lfs_vfsops); @@ -363,6 +364,7 @@ lfs_modcmd(modcmd_t cmd, void *arg) break; syscall_disestablish(NULL, lfs_syscalls); sysctl_teardown(&lfs_sysctl_log); + cv_destroy(&lfs_allclean_wakeup); break; default: error = ENOTTY; @@ -857,7 +859,6 @@ lfs_checkmagic(struct lfs *fs) int lfs_mountfs(struct vnode *devvp, struct mount *mp, struct lwp *l) { - static bool lfs_mounted_once = false; struct lfs *primarysb, *altsb, *thesb; struct buf *primarybuf, *altbuf; struct lfs *fs; @@ -1091,12 +1092,6 @@ lfs_mountfs(struct vnode *devvp, struct mount *mp, struct lwp *l) cv_init(&fs->lfs_stopcv, "lfsstop"); cv_init(&fs->lfs_nextsegsleep, "segment"); - /* Initialize values for all LFS mounts */ - if (!lfs_mounted_once) { - cv_init(&lfs_allclean_wakeup, "segment"); - lfs_mounted_once = true; - } - /* Set the file system readonly/modify bits. */ fs->lfs_ronly = ronly; if (ronly == 0) From a6198ff436898948e27a86a9cce1e5f1a42b2408 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Sat, 22 Feb 2020 23:59:46 +0000 Subject: [PATCH 16/30] In lfs_update, hold lfs_writer around lfs_vflush. Otherwise, we might do lfs_vflush -> lfs_seglock -> lfs_segwait(SEGM_CKP) -> lfs_writer_enter which is the reverse of the lfs_writer -> lfs_seglock ordering. --- sys/ufs/lfs/lfs_inode.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/sys/ufs/lfs/lfs_inode.c b/sys/ufs/lfs/lfs_inode.c index 0266de39f7d9..9dd98afc7d42 100644 --- a/sys/ufs/lfs/lfs_inode.c +++ b/sys/ufs/lfs/lfs_inode.c @@ -133,6 +133,7 @@ lfs_update(struct vnode *vp, const struct timespec *acc, struct inode *ip; struct lfs *fs = VFSTOULFS(vp->v_mount)->um_lfs; int flags; + int error; ASSERT_NO_SEGLOCK(fs); if (vp->v_mount->mnt_flag & MNT_RDONLY) @@ -175,7 +176,7 @@ lfs_update(struct vnode *vp, const struct timespec *acc, vp->v_iflag | vp->v_vflag | vp->v_uflag, ip->i_state)); if (fs->lfs_dirops == 0) - lfs_flush_fs(fs, SEGM_SYNC); + break; else mtsleep(&fs->lfs_writer, PRIBIO+1, "lfs_fsync", 0, &lfs_lock); @@ -183,8 +184,18 @@ lfs_update(struct vnode *vp, const struct timespec *acc, twice? */ } --fs->lfs_diropwait; + fs->lfs_writer++; + if (vp->v_uflag & VU_DIROP) { + KASSERT(fs->lfs_dirops == 0); + lfs_flush_fs(fs, SEGM_SYNC); + } + mutex_exit(&lfs_lock); + error = lfs_vflush(vp); + mutex_enter(&lfs_lock); + if (--fs->lfs_writer == 0) + cv_broadcast(&fs->lfs_diropscv); mutex_exit(&lfs_lock); - return lfs_vflush(vp); + return error; } return 0; } From 8e06ab355248f0c2ab3b3a5b32731e58bb419e30 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Sun, 23 Feb 2020 01:28:18 +0000 Subject: [PATCH 17/30] Call lfs_orphan in lfs_rename while we're still in the dirop. --- sys/ufs/lfs/lfs_rename.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sys/ufs/lfs/lfs_rename.c b/sys/ufs/lfs/lfs_rename.c index 866a97a1b634..a1e730c89a8e 100644 --- a/sys/ufs/lfs/lfs_rename.c +++ b/sys/ufs/lfs/lfs_rename.c @@ -1061,6 +1061,9 @@ lfs_gro_rename(struct mount *mp, kauth_cred_t cred, fdvp, fcnp, fde, fvp, tdvp, tcnp, tde, tvp); + if (tvp && VTOI(tvp)->i_nlink == 0) + lfs_orphan(VTOI(tvp)->i_lfs, VTOI(tvp)->i_number); + UNMARK_VNODE(fdvp); UNMARK_VNODE(fvp); UNMARK_VNODE(tdvp); From c56f32cab434c7e49a3887e8fccb863e9f5b9fe9 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Fri, 21 Feb 2020 23:35:54 +0000 Subject: [PATCH 18/30] Teach LFS_ORPHAN_NEXTFREE about lfs64. --- sys/ufs/lfs/lfs.h | 5 +++-- sys/ufs/lfs/lfs_alloc.c | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/sys/ufs/lfs/lfs.h b/sys/ufs/lfs/lfs.h index 56e04d738206..069a43fe9127 100644 --- a/sys/ufs/lfs/lfs.h +++ b/sys/ufs/lfs/lfs.h @@ -596,8 +596,9 @@ typedef union iinfo { /* magic value for daddrs */ #define LFS_UNUSED_DADDR 0 /* out-of-band daddr */ -/* magic value for if_nextfree */ -#define LFS_ORPHAN_NEXTFREE (~(uint32_t)0) /* indicate orphaned file */ +/* magic value for if_nextfree -- indicate orphaned file */ +#define LFS_ORPHAN_NEXTFREE(fs) \ + ((fs)->lfs_is64 ? ~(uint64_t)0 : ~(uint32_t)0) typedef struct ifile64 IFILE64; struct ifile64 { diff --git a/sys/ufs/lfs/lfs_alloc.c b/sys/ufs/lfs/lfs_alloc.c index f8ca60104574..e227bb72e597 100644 --- a/sys/ufs/lfs/lfs_alloc.c +++ b/sys/ufs/lfs/lfs_alloc.c @@ -757,7 +757,7 @@ lfs_order_freelist(struct lfs *fs) * but presumably it doesn't work... not sure what * happens to such files currently. -- dholland 20160806 */ - if (lfs_if_getnextfree(fs, ifp) == LFS_ORPHAN_NEXTFREE && + if (lfs_if_getnextfree(fs, ifp) == LFS_ORPHAN_NEXTFREE(fs) && VFS_VGET(fs->lfs_ivnode->v_mount, ino, &vp) == 0) { unsigned segno; @@ -851,6 +851,6 @@ lfs_orphan(struct lfs *fs, ino_t ino) struct buf *bp; LFS_IENTRY(ifp, fs, ino, bp); - lfs_if_setnextfree(fs, ifp, LFS_ORPHAN_NEXTFREE); + lfs_if_setnextfree(fs, ifp, LFS_ORPHAN_NEXTFREE(fs)); LFS_BWRITE_LOG(bp); } From e8c000cf24896935e0a0dbc7b099379bac0be8ee Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Sun, 23 Feb 2020 03:09:18 +0000 Subject: [PATCH 19/30] Dust off the orphan detection code and try to make it work. --- sys/ufs/lfs/lfs_alloc.c | 156 +++++++++++++++++++++++++++++---------- sys/ufs/lfs/lfs_extern.h | 3 +- sys/ufs/lfs/lfs_vfsops.c | 9 ++- 3 files changed, 127 insertions(+), 41 deletions(-) diff --git a/sys/ufs/lfs/lfs_alloc.c b/sys/ufs/lfs/lfs_alloc.c index e227bb72e597..bd4e4e8b4468 100644 --- a/sys/ufs/lfs/lfs_alloc.c +++ b/sys/ufs/lfs/lfs_alloc.c @@ -705,16 +705,16 @@ lfs_vfree(struct vnode *vp, ino_t ino, int mode) * Takes the segmenet lock. */ void -lfs_order_freelist(struct lfs *fs) +lfs_order_freelist(struct lfs *fs, ino_t **orphanp, size_t *norphanp) { CLEANERINFO *cip; IFILE *ifp = NULL; struct buf *bp; ino_t ino, firstino, lastino, maxino; -#ifdef notyet - struct vnode *vp; -#endif - + ino_t *orphan = NULL; + size_t norphan = 0; + size_t norphan_alloc = 0; + ASSERT_NO_SEGLOCK(fs); lfs_seglock(fs, SEGM_PROT); @@ -745,7 +745,6 @@ lfs_order_freelist(struct lfs *fs) if (ino == LFS_UNUSED_INUM || ino == LFS_IFILE_INUM) continue; -#ifdef notyet /* * Address orphaned files. * @@ -757,39 +756,26 @@ lfs_order_freelist(struct lfs *fs) * but presumably it doesn't work... not sure what * happens to such files currently. -- dholland 20160806 */ - if (lfs_if_getnextfree(fs, ifp) == LFS_ORPHAN_NEXTFREE(fs) && - VFS_VGET(fs->lfs_ivnode->v_mount, ino, &vp) == 0) { - unsigned segno; - - /* get the segment the inode in on disk */ - segno = lfs_dtosn(fs, lfs_if_getdaddr(fs, ifp)); - - /* truncate the inode */ - lfs_truncate(vp, 0, 0, NOCRED); - vput(vp); - - /* load the segment summary */ - LFS_SEGENTRY(sup, fs, segno, bp); - /* update the number of bytes in the segment */ - KASSERT(sup->su_nbytes >= DINOSIZE(fs)); - sup->su_nbytes -= DINOSIZE(fs); - /* write the segment summary */ - LFS_WRITESEGENTRY(sup, fs, segno, bp); - - /* Drop the on-disk address */ - lfs_if_setdaddr(fs, ifp, LFS_UNUSED_DADDR); - /* write the ifile entry */ - LFS_BWRITE_LOG(bp); - - /* - * and reload it (XXX: why? I guess - * LFS_BWRITE_LOG drops it...) - */ - LFS_IENTRY(ifp, fs, ino, bp); - - /* Fall through to next if block */ + if (lfs_if_getnextfree(fs, ifp) == LFS_ORPHAN_NEXTFREE(fs)) { + if (orphan == NULL) { + norphan_alloc = 32; /* XXX pulled from arse */ + orphan = kmem_zalloc(sizeof(orphan[0]) * + norphan_alloc, KM_SLEEP); + } else if (norphan == norphan_alloc) { + ino_t *orphan_new; + if (norphan_alloc >= 4096) + norphan_alloc += 4096; + else + norphan_alloc *= 2; + orphan_new = kmem_zalloc(sizeof(orphan[0]) * + norphan_alloc, KM_SLEEP); + memcpy(orphan_new, orphan, sizeof(orphan[0]) * + norphan); + kmem_free(orphan, sizeof(orphan[0]) * norphan); + orphan = orphan_new; + } + orphan[norphan++] = ino; } -#endif if (lfs_if_getdaddr(fs, ifp) == LFS_UNUSED_DADDR) { @@ -836,6 +822,22 @@ lfs_order_freelist(struct lfs *fs) /* done */ lfs_segunlock(fs); + + /* + * Shrink the array of orphans so we don't have to carry around + * the allocation size. + */ + if (norphan < norphan_alloc) { + ino_t *orphan_new = kmem_alloc(sizeof(orphan[0]) * norphan, + KM_SLEEP); + memcpy(orphan_new, orphan, sizeof(orphan[0]) * norphan); + kmem_free(orphan, sizeof(orphan[0]) * norphan_alloc); + orphan = orphan_new; + norphan_alloc = norphan; + } + + *orphanp = orphan; + *norphanp = norphan; } /* @@ -854,3 +856,81 @@ lfs_orphan(struct lfs *fs, ino_t ino) lfs_if_setnextfree(fs, ifp, LFS_ORPHAN_NEXTFREE(fs)); LFS_BWRITE_LOG(bp); } + +/* + * Free orphans discovered during mount. This is a separate stage + * because it requires fs->lfs_suflags to be set up, which is not done + * by the time we run lfs_order_freelist. It's possible that we could + * run lfs_order_freelist later (i.e., set up fs->lfs_suflags sooner) + * but that requires more thought than I can put into this at the + * moment. + */ +void +lfs_free_orphans(struct lfs *fs, ino_t *orphan, size_t norphan) +{ + size_t i; + + for (i = 0; i < norphan; i++) { + ino_t ino = orphan[i]; + unsigned segno; + struct vnode *vp; + struct inode *ip; + struct buf *bp; + IFILE *ifp; + SEGUSE *sup; + int error; + + /* Get the segment the inode is in on disk. */ + LFS_IENTRY(ifp, fs, ino, bp); + segno = lfs_dtosn(fs, lfs_if_getdaddr(fs, ifp)); + brelse(bp, 0); + + /* + * Try to get the vnode. If we can't, tough -- hope + * you have backups! + */ + error = VFS_VGET(fs->lfs_ivnode->v_mount, ino, &vp); + if (error) { + printf("orphan %jd vget error %d\n", (intmax_t)ino, + error); + continue; + } + + /* + * Sanity-check the inode. + * + * XXX What to do if it is still referenced? + */ + ip = VTOI(vp); + if (ip->i_nlink != 0) + printf("orphan %jd nlink %d\n", (intmax_t)ino, + ip->i_nlink); + + /* + * Truncate the inode, to free any blocks allocated for + * it, and release it, to free the inode number. + * + * XXX Isn't it redundant to truncate? Won't vput do + * that for us? + */ + error = lfs_truncate(vp, 0, 0, NOCRED); + if (error) + printf("orphan %jd truncate error %d", (intmax_t)ino, + error); + vput(vp); + + /* Update the number of bytes in the segment summary. */ + LFS_SEGENTRY(sup, fs, segno, bp); + KASSERT(sup->su_nbytes >= DINOSIZE(fs)); + sup->su_nbytes -= DINOSIZE(fs); + LFS_WRITESEGENTRY(sup, fs, segno, bp); + + /* Drop the on-disk address. */ + LFS_IENTRY(ifp, fs, ino, bp); + lfs_if_setdaddr(fs, ifp, LFS_UNUSED_DADDR); + LFS_BWRITE_LOG(bp); + } + + if (orphan) + kmem_free(orphan, sizeof(orphan[0]) * norphan); +} diff --git a/sys/ufs/lfs/lfs_extern.h b/sys/ufs/lfs/lfs_extern.h index a99728306f43..5530d870af7d 100644 --- a/sys/ufs/lfs/lfs_extern.h +++ b/sys/ufs/lfs/lfs_extern.h @@ -127,9 +127,10 @@ extern kcondvar_t locked_queue_cv; int lfs_valloc(struct vnode *, int, kauth_cred_t, ino_t *, int *); int lfs_valloc_fixed(struct lfs *, ino_t, int); int lfs_vfree(struct vnode *, ino_t, int); -void lfs_order_freelist(struct lfs *); +void lfs_order_freelist(struct lfs *, ino_t **, size_t *); int lfs_extend_ifile(struct lfs *, kauth_cred_t); void lfs_orphan(struct lfs *, ino_t); +void lfs_free_orphans(struct lfs *, ino_t *, size_t); /* lfs_balloc.c */ int lfs_balloc(struct vnode *, off_t, int, kauth_cred_t, int, struct buf **); diff --git a/sys/ufs/lfs/lfs_vfsops.c b/sys/ufs/lfs/lfs_vfsops.c index 505638374677..5e2e4aa039d0 100644 --- a/sys/ufs/lfs/lfs_vfsops.c +++ b/sys/ufs/lfs/lfs_vfsops.c @@ -870,6 +870,8 @@ lfs_mountfs(struct vnode *devvp, struct mount *mp, struct lwp *l) CLEANERINFO *cip; SEGUSE *sup; daddr_t sb_addr; + ino_t *orphan; + size_t norphan; cred = l ? l->l_cred : NOCRED; @@ -1162,8 +1164,8 @@ lfs_mountfs(struct vnode *devvp, struct mount *mp, struct lwp *l) fs->lfs_ivnode = vp; vref(vp); - /* Set up inode bitmap and order free list */ - lfs_order_freelist(fs); + /* Set up inode bitmap, order free list, and gather orphans. */ + lfs_order_freelist(fs, &orphan, &norphan); /* Set up segment usage flags for the autocleaner. */ fs->lfs_nactive = 0; @@ -1202,6 +1204,9 @@ lfs_mountfs(struct vnode *devvp, struct mount *mp, struct lwp *l) brelse(bp, 0); } + /* Free the orphans we discovered while ordering the freelist. */ + lfs_free_orphans(fs, orphan, norphan); + /* * XXX: if the fs has quotas, quotas should be on even if * readonly. Otherwise you can't query the quota info! From f87cbd466bc165ab6c265bd304c3df9b8747529d Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Sun, 23 Feb 2020 05:20:54 +0000 Subject: [PATCH 20/30] lfs_writer_enter can't fail; keep it simple and don't pretend it can. Assert that mtsleep can't fail either -- it doesn't catch signals and there's no timeout. --- sys/ufs/lfs/lfs_extern.h | 2 +- sys/ufs/lfs/lfs_subr.c | 11 ++++------- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/sys/ufs/lfs/lfs_extern.h b/sys/ufs/lfs/lfs_extern.h index 5530d870af7d..e757076b4b70 100644 --- a/sys/ufs/lfs/lfs_extern.h +++ b/sys/ufs/lfs/lfs_extern.h @@ -211,7 +211,7 @@ void lfs_free(struct lfs *, void *, int); int lfs_seglock(struct lfs *, unsigned long); void lfs_segunlock(struct lfs *); void lfs_segunlock_relock(struct lfs *); -int lfs_writer_enter(struct lfs *, const char *); +void lfs_writer_enter(struct lfs *, const char *); int lfs_writer_tryenter(struct lfs *); void lfs_writer_leave(struct lfs *); void lfs_wakeup_cleaner(struct lfs *); diff --git a/sys/ufs/lfs/lfs_subr.c b/sys/ufs/lfs/lfs_subr.c index db7c3444390f..19ad3f98c84c 100644 --- a/sys/ufs/lfs/lfs_subr.c +++ b/sys/ufs/lfs/lfs_subr.c @@ -561,6 +561,7 @@ lfs_segunlock(struct lfs *fs) lfs_unmark_dirop(fs); } else { --fs->lfs_seglock; + KASSERT(fs->lfs_seglock != 0); mutex_exit(&lfs_lock); } } @@ -570,10 +571,10 @@ lfs_segunlock(struct lfs *fs) * * No simple_locks are held when we enter and none are held when we return. */ -int +void lfs_writer_enter(struct lfs *fs, const char *wmesg) { - int error = 0; + int error; ASSERT_NO_SEGLOCK(fs); mutex_enter(&lfs_lock); @@ -585,15 +586,11 @@ lfs_writer_enter(struct lfs *fs, const char *wmesg) ++fs->lfs_diropwait; error = mtsleep(&fs->lfs_writer, PRIBIO+1, wmesg, 0, &lfs_lock); + KASSERT(error == 0); --fs->lfs_diropwait; } - if (error) - fs->lfs_writer--; - mutex_exit(&lfs_lock); - - return error; } int From 756b2741c415fc3858c0d6860e20ec0bde1cbb02 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Sun, 23 Feb 2020 15:10:56 +0000 Subject: [PATCH 21/30] Fix userland references to LFS_ORPHAN_NEXTFREE. Forgot to grep for these or do a full distribution build, oops! --- sbin/fsck_lfs/pass1.c | 2 +- usr.sbin/dumplfs/dumplfs.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sbin/fsck_lfs/pass1.c b/sbin/fsck_lfs/pass1.c index d816f8e5094c..48919f55989e 100644 --- a/sbin/fsck_lfs/pass1.c +++ b/sbin/fsck_lfs/pass1.c @@ -307,7 +307,7 @@ checkinode(ino_t inumber, struct inodesc * idesc) */ if (lfs_dino_getnlink(fs, dp) <= 0) { LFS_IENTRY(ifp, fs, inumber, bp); - if (lfs_if_getnextfree(fs, ifp) == LFS_ORPHAN_NEXTFREE) { + if (lfs_if_getnextfree(fs, ifp) == LFS_ORPHAN_NEXTFREE(fs)) { statemap[inumber] = (mode == LFS_IFDIR ? DCLEAR : FCLEAR); /* Add this to our list of orphans */ zlnp = emalloc(sizeof *zlnp); diff --git a/usr.sbin/dumplfs/dumplfs.c b/usr.sbin/dumplfs/dumplfs.c index 912c4579359b..bc4590dd0773 100644 --- a/usr.sbin/dumplfs/dumplfs.c +++ b/usr.sbin/dumplfs/dumplfs.c @@ -133,7 +133,7 @@ print_ientry(int i, struct lfs *lfsp, IFILE *ip) else printf("%d\tINUSE\t%u\t%8jX\t%s\n", i, version, (intmax_t)daddr, - nextfree == LFS_ORPHAN_NEXTFREE ? "FFFFFFFF" : "-"); + nextfree == LFS_ORPHAN_NEXTFREE(lfsp) ? "orphan" : "-"); } /* From 11d46566a8bb0e0234453df51f49d81fec75a550 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Sun, 23 Feb 2020 15:19:46 +0000 Subject: [PATCH 22/30] Fix missing by removing the evcnts instead. Just wanted to confirm that a race might happen, and indeed it did. These serve little diagnostic value otherwise. --- sys/ufs/lfs/lfs_subr.c | 8 +------- sys/ufs/lfs/lfs_vnops.c | 8 +------- 2 files changed, 2 insertions(+), 14 deletions(-) diff --git a/sys/ufs/lfs/lfs_subr.c b/sys/ufs/lfs/lfs_subr.c index 19ad3f98c84c..6904e7d4c0a9 100644 --- a/sys/ufs/lfs/lfs_subr.c +++ b/sys/ufs/lfs/lfs_subr.c @@ -337,10 +337,6 @@ lfs_seglock(struct lfs *fs, unsigned long flags) static void lfs_unmark_dirop(struct lfs *); -static struct evcnt lfs_dchain_marker_pass_dirop = - EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "lfs", "dchain marker pass dirop"); -EVCNT_ATTACH_STATIC(lfs_dchain_marker_pass_dirop); - static void lfs_unmark_dirop(struct lfs *fs) { @@ -371,10 +367,8 @@ lfs_unmark_dirop(struct lfs *fs) TAILQ_REMOVE(&fs->lfs_dchainhd, marker, i_lfs_dchain); TAILQ_INSERT_AFTER(&fs->lfs_dchainhd, ip, marker, i_lfs_dchain); - if (ip->i_state & IN_MARKER) { - lfs_dchain_marker_pass_dirop.ev_count++; + if (ip->i_state & IN_MARKER) continue; - } vp = ITOV(ip); if ((ip->i_state & (IN_ADIROP | IN_CDIROP)) == IN_CDIROP) { --lfs_dirvcount; diff --git a/sys/ufs/lfs/lfs_vnops.c b/sys/ufs/lfs/lfs_vnops.c index eab03e502a14..719da6296093 100644 --- a/sys/ufs/lfs/lfs_vnops.c +++ b/sys/ufs/lfs/lfs_vnops.c @@ -1594,10 +1594,6 @@ lfs_strategy(void *v) return VOP_STRATEGY(vp, bp); } -static struct evcnt lfs_dchain_marker_pass_flush = - EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "lfs", "dchain marker pass flush"); -EVCNT_ATTACH_STATIC(lfs_dchain_marker_pass_flush); - /* * Inline lfs_segwrite/lfs_writevnodes, but just for dirops. * Technically this is a checkpoint (the on-disk state is valid) @@ -1661,10 +1657,8 @@ lfs_flush_dirops(struct lfs *fs) TAILQ_REMOVE(&fs->lfs_dchainhd, marker, i_lfs_dchain); TAILQ_INSERT_AFTER(&fs->lfs_dchainhd, ip, marker, i_lfs_dchain); - if (ip->i_state & IN_MARKER) { - lfs_dchain_marker_pass_flush.ev_count++; + if (ip->i_state & IN_MARKER) continue; - } vp = ITOV(ip); /* From b635afd713961e229b996d4d062127949206822e Mon Sep 17 00:00:00 2001 From: "ad@NetBSD.org" Date: Sun, 23 Feb 2020 15:22:04 +0000 Subject: [PATCH 23/30] Fix !DIAGNOSTIC compile --- sys/ufs/lfs/lfs_subr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sys/ufs/lfs/lfs_subr.c b/sys/ufs/lfs/lfs_subr.c index 6904e7d4c0a9..596fae5acb4e 100644 --- a/sys/ufs/lfs/lfs_subr.c +++ b/sys/ufs/lfs/lfs_subr.c @@ -568,7 +568,7 @@ lfs_segunlock(struct lfs *fs) void lfs_writer_enter(struct lfs *fs, const char *wmesg) { - int error; + int error __diagused; ASSERT_NO_SEGLOCK(fs); mutex_enter(&lfs_lock); From 424ce760a395e01b2274fff94976dfb7ba0ecd80 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Fri, 21 Feb 2020 23:17:33 +0000 Subject: [PATCH 24/30] WIP: lfs write flushbehind, like ffs does. This is tuned to the lfs segment size. It's kind of wrong because we really want to start flushing when the file system as a whole has accrued a segment's worth of data -- not just when a single file has. Writing a large number of little files won't trigger this. Currently WIP because it breaks tests/fs/vfs/t_full. --- sys/ufs/lfs/ulfs_readwrite.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/sys/ufs/lfs/ulfs_readwrite.c b/sys/ufs/lfs/ulfs_readwrite.c index 12984c77a04f..01805dd18e4f 100644 --- a/sys/ufs/lfs/ulfs_readwrite.c +++ b/sys/ufs/lfs/ulfs_readwrite.c @@ -35,6 +35,8 @@ #include __KERNEL_RCSID(1, "$NetBSD: ulfs_readwrite.c,v 1.25 2019/06/20 00:49:11 christos Exp $"); +#include + #define FS struct lfs #define I_FS i_lfs #define READ lfs_read @@ -278,7 +280,6 @@ WRITE(void *v) KASSERT(vp->v_type == VREG); - async = true; lfs_availwait(fs, lfs_btofsb(fs, uio->uio_resid)); lfs_check(vp, LFS_UNUSED_LBN, 0); @@ -395,7 +396,16 @@ WRITE(void *v) * XXXUBC simplistic async flushing. */ - __USE(async); + unsigned shift = ilog2(lfs_segsize(fs)) - 1; + if (!async && + (oldoff >> shift) != (uio->uio_offset >> shift)) { + mutex_enter(vp->v_interlock); + error = VOP_PUTPAGES(vp, (oldoff >> shift) << shift, + (uio->uio_offset >> shift) << shift, + PGO_CLEANIT | PGO_LAZY); + if (error) + break; + } } if (error == 0 && ioflag & IO_SYNC) { mutex_enter(vp->v_interlock); From 35c70c0e0545cfe336ba3ff6d83d33121e5f4a36 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Sat, 7 Mar 2020 04:56:40 +0000 Subject: [PATCH 25/30] CTASSERT lfs on-disk structure sizes. --- sys/ufs/lfs/lfs.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/sys/ufs/lfs/lfs.h b/sys/ufs/lfs/lfs.h index 069a43fe9127..932ded1e1040 100644 --- a/sys/ufs/lfs/lfs.h +++ b/sys/ufs/lfs/lfs.h @@ -355,6 +355,7 @@ struct lfs_dirheader32 { uint8_t dh_type; /* file type, see below */ uint8_t dh_namlen; /* length of string in d_name */ }; +__CTASSERT(sizeof(struct lfs_dirheader32) == 8); struct lfs_dirheader64 { uint32_t dh_inoA; /* inode number of entry */ @@ -363,6 +364,7 @@ struct lfs_dirheader64 { uint8_t dh_type; /* file type, see below */ uint8_t dh_namlen; /* length of string in d_name */ }; +__CTASSERT(sizeof(struct lfs_dirheader64) == 12); union lfs_dirheader { struct lfs_dirheader64 u_64; @@ -381,6 +383,7 @@ struct lfs_dirtemplate32 { struct lfs_dirheader32 dotdot_header; char dotdot_name[4]; /* ditto */ }; +__CTASSERT(sizeof(struct lfs_dirtemplate32) == 2*(8 + 4)); struct lfs_dirtemplate64 { struct lfs_dirheader64 dot_header; @@ -388,6 +391,7 @@ struct lfs_dirtemplate64 { struct lfs_dirheader64 dotdot_header; char dotdot_name[4]; /* ditto */ }; +__CTASSERT(sizeof(struct lfs_dirtemplate64) == 2*(12 + 4)); union lfs_dirtemplate { struct lfs_dirtemplate64 u_64; @@ -408,6 +412,7 @@ struct lfs_odirtemplate { uint16_t dotdot_namlen; char dotdot_name[4]; /* ditto */ }; +__CTASSERT(sizeof(struct lfs_odirtemplate) == 2*(8 + 4)); #endif /* @@ -441,6 +446,7 @@ struct lfs32_dinode { uint32_t di_gid; /* 116: File group. */ uint64_t di_modrev; /* 120: i_modrev for NFSv4 */ }; +__CTASSERT(sizeof(struct lfs32_dinode) == 128); struct lfs64_dinode { uint16_t di_mode; /* 0: IFMT, permissions; see below. */ @@ -469,6 +475,7 @@ struct lfs64_dinode { uint64_t di_inumber; /* 240: Inode number */ uint64_t di_spare[1]; /* 248: Reserved; currently unused */ }; +__CTASSERT(sizeof(struct lfs64_dinode) == 256); union lfs_dinode { struct lfs64_dinode u_64; @@ -529,6 +536,7 @@ struct segusage { uint32_t su_flags; /* 12: segment flags */ uint64_t su_lastmod; /* 16: last modified timestamp */ }; +__CTASSERT(sizeof(struct segusage) == 24); typedef struct segusage_v1 SEGUSE_V1; struct segusage_v1 { @@ -538,6 +546,7 @@ struct segusage_v1 { uint16_t su_ninos; /* 10: number of inode blocks in seg */ uint32_t su_flags; /* 12: segment flags */ }; +__CTASSERT(sizeof(struct segusage_v1) == 16); /* * On-disk file information. One per file with data blocks in the segment. @@ -555,6 +564,7 @@ struct finfo64 { uint32_t fi_lastlength; /* length of last block in array */ uint32_t fi_pad; /* unused */ }; +__CTASSERT(sizeof(struct finfo64) == 24); typedef struct finfo32 FINFO32; struct finfo32 { @@ -563,6 +573,7 @@ struct finfo32 { uint32_t fi_ino; /* inode number */ uint32_t fi_lastlength; /* length of last block in array */ }; +__CTASSERT(sizeof(struct finfo32) == 16); typedef union finfo { struct finfo64 u_64; @@ -580,10 +591,12 @@ typedef union finfo { typedef struct iinfo64 { uint64_t ii_block; /* block number */ } IINFO64; +__CTASSERT(sizeof(struct iinfo64) == 8); typedef struct iinfo32 { uint32_t ii_block; /* block number */ } IINFO32; +__CTASSERT(sizeof(struct iinfo32) == 4); typedef union iinfo { struct iinfo64 u_64; @@ -608,6 +621,7 @@ struct ifile64 { int64_t if_daddr; /* inode disk address */ uint64_t if_nextfree; /* next-unallocated inode */ }; +__CTASSERT(sizeof(struct ifile64) == 32); typedef struct ifile32 IFILE32; struct ifile32 { @@ -617,6 +631,7 @@ struct ifile32 { uint32_t if_atime_sec; /* Last access time, seconds */ uint32_t if_atime_nsec; /* and nanoseconds */ }; +__CTASSERT(sizeof(struct ifile32) == 20); typedef struct ifile_v1 IFILE_V1; struct ifile_v1 { @@ -628,6 +643,7 @@ struct ifile_v1 { struct timespec if_atime; /* Last access time */ #endif }; +__CTASSERT(sizeof(struct ifile_v1) == 12); /* * Note: struct ifile_v1 is often handled by accessing the first three @@ -657,6 +673,7 @@ typedef struct _cleanerinfo32 { uint32_t free_tail; /* 20: tail of the inode free list */ uint32_t flags; /* 24: status word from the kernel */ } CLEANERINFO32; +__CTASSERT(sizeof(struct _cleanerinfo32) == 28); typedef struct _cleanerinfo64 { uint32_t clean; /* 0: number of clean segments */ @@ -668,6 +685,7 @@ typedef struct _cleanerinfo64 { uint32_t flags; /* 40: status word from the kernel */ uint32_t pad; /* 44: must be 64-bit aligned */ } CLEANERINFO64; +__CTASSERT(sizeof(struct _cleanerinfo64) == 48); /* this must not go to disk directly of course */ typedef union _cleanerinfo { @@ -705,6 +723,7 @@ struct segsum_v1 { uint16_t ss_pad; /* 26: extra space */ /* FINFO's and inode daddr's... */ }; +__CTASSERT(sizeof(struct segsum_v1) == 28); typedef struct segsum32 SEGSUM32; struct segsum32 { @@ -722,6 +741,7 @@ struct segsum32 { uint64_t ss_create; /* 40: time stamp */ /* FINFO's and inode daddr's... */ }; +__CTASSERT(sizeof(struct segsum32) == 48); typedef struct segsum64 SEGSUM64; struct segsum64 { @@ -739,6 +759,7 @@ struct segsum64 { uint64_t ss_create; /* 48: time stamp */ /* FINFO's and inode daddr's... */ }; +__CTASSERT(sizeof(struct segsum64) == 56); typedef union segsum SEGSUM; union segsum { From 4fa02325da67a7f8953d42ad8d9c8198d5612266 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Tue, 17 Mar 2020 03:37:57 +0000 Subject: [PATCH 26/30] Avoid misaligned access to lfs64 on-disk records in memory. lfs64 directory entries are only 32-bit aligned in order to conserve space in directory blocks, and we had a hack to stuff a 64-bit inode in them. This replaces the hack by __aligned(4) __packed, and goes further: 1. It's not clear that all the other lfs64 data structures are 64-bit aligned on disk to begin with. We can go through these later and upgrade them from struct foo64 { ... } __aligned(4) __packed; union foo { struct foo64 f64; ... }; to struct foo64 { ... }; union foo { struct foo64 f64 __aligned(8); ... } __aligned(4) __packed; if we really want to take advantage of 64-bit memory accesses. However, the __aligned(4) __packed must remain on the union because: 2. We access even the lfs32 data structures via a union that has lfs64 members, and it turns out that compilers will assume access through a union with 64-bit aligned members implies the whole union has 64-bit alignment, even if we're only accessing a 32-bit aligned member. --- sys/ufs/lfs/lfs.h | 36 ++++++++++++++++++++++++++---------- sys/ufs/lfs/lfs_accessors.h | 23 ++--------------------- 2 files changed, 28 insertions(+), 31 deletions(-) diff --git a/sys/ufs/lfs/lfs.h b/sys/ufs/lfs/lfs.h index 932ded1e1040..b5819761708a 100644 --- a/sys/ufs/lfs/lfs.h +++ b/sys/ufs/lfs/lfs.h @@ -358,18 +358,19 @@ struct lfs_dirheader32 { __CTASSERT(sizeof(struct lfs_dirheader32) == 8); struct lfs_dirheader64 { - uint32_t dh_inoA; /* inode number of entry */ - uint32_t dh_inoB; /* inode number of entry */ + uint64_t dh_ino; /* inode number of entry */ uint16_t dh_reclen; /* length of this record */ uint8_t dh_type; /* file type, see below */ uint8_t dh_namlen; /* length of string in d_name */ -}; +} __aligned(4) __packed; __CTASSERT(sizeof(struct lfs_dirheader64) == 12); union lfs_dirheader { struct lfs_dirheader64 u_64; struct lfs_dirheader32 u_32; }; +__CTASSERT(__alignof(union lfs_dirheader) == __alignof(struct lfs_dirheader64)); +__CTASSERT(__alignof(union lfs_dirheader) == __alignof(struct lfs_dirheader32)); typedef union lfs_dirheader LFS_DIRHEADER; @@ -481,6 +482,8 @@ union lfs_dinode { struct lfs64_dinode u_64; struct lfs32_dinode u_32; }; +__CTASSERT(__alignof(union lfs_dinode) == __alignof(struct lfs64_dinode)); +__CTASSERT(__alignof(union lfs_dinode) == __alignof(struct lfs32_dinode)); /* * The di_db fields may be overlaid with other information for @@ -563,7 +566,7 @@ struct finfo64 { uint64_t fi_ino; /* inode number */ uint32_t fi_lastlength; /* length of last block in array */ uint32_t fi_pad; /* unused */ -}; +} __aligned(4) __packed; __CTASSERT(sizeof(struct finfo64) == 24); typedef struct finfo32 FINFO32; @@ -579,6 +582,8 @@ typedef union finfo { struct finfo64 u_64; struct finfo32 u_32; } FINFO; +__CTASSERT(__alignof(union finfo) == __alignof(struct finfo64)); +__CTASSERT(__alignof(union finfo) == __alignof(struct finfo32)); /* * inode info (part of the segment summary) @@ -590,7 +595,7 @@ typedef union finfo { typedef struct iinfo64 { uint64_t ii_block; /* block number */ -} IINFO64; +} __aligned(4) __packed IINFO64; __CTASSERT(sizeof(struct iinfo64) == 8); typedef struct iinfo32 { @@ -602,6 +607,8 @@ typedef union iinfo { struct iinfo64 u_64; struct iinfo32 u_32; } IINFO; +__CTASSERT(__alignof(union iinfo) == __alignof(struct iinfo64)); +__CTASSERT(__alignof(union iinfo) == __alignof(struct iinfo32)); /* * Index file inode entries. @@ -620,7 +627,7 @@ struct ifile64 { uint64_t if_atime_sec; /* Last access time, seconds */ int64_t if_daddr; /* inode disk address */ uint64_t if_nextfree; /* next-unallocated inode */ -}; +} __aligned(4) __packed; __CTASSERT(sizeof(struct ifile64) == 32); typedef struct ifile32 IFILE32; @@ -655,6 +662,9 @@ typedef union ifile { struct ifile32 u_32; struct ifile_v1 u_v1; } IFILE; +__CTASSERT(__alignof(union ifile) == __alignof(struct ifile64)); +__CTASSERT(__alignof(union ifile) == __alignof(struct ifile32)); +__CTASSERT(__alignof(union ifile) == __alignof(struct ifile_v1)); /* * Cleaner information structure. This resides in the ifile and is used @@ -684,7 +694,7 @@ typedef struct _cleanerinfo64 { uint64_t free_tail; /* 32: tail of the inode free list */ uint32_t flags; /* 40: status word from the kernel */ uint32_t pad; /* 44: must be 64-bit aligned */ -} CLEANERINFO64; +} __aligned(4) __packed CLEANERINFO64; __CTASSERT(sizeof(struct _cleanerinfo64) == 48); /* this must not go to disk directly of course */ @@ -692,6 +702,8 @@ typedef union _cleanerinfo { CLEANERINFO32 u_32; CLEANERINFO64 u_64; } CLEANERINFO; +__CTASSERT(__alignof(union _cleanerinfo) == __alignof(struct _cleanerinfo32)); +__CTASSERT(__alignof(union _cleanerinfo) == __alignof(struct _cleanerinfo64)); /* * On-disk segment summary information @@ -740,7 +752,7 @@ struct segsum32 { uint64_t ss_serial; /* 32: serial number */ uint64_t ss_create; /* 40: time stamp */ /* FINFO's and inode daddr's... */ -}; +} __aligned(4) __packed; __CTASSERT(sizeof(struct segsum32) == 48); typedef struct segsum64 SEGSUM64; @@ -758,7 +770,7 @@ struct segsum64 { uint64_t ss_serial; /* 40: serial number */ uint64_t ss_create; /* 48: time stamp */ /* FINFO's and inode daddr's... */ -}; +} __aligned(4) __packed; __CTASSERT(sizeof(struct segsum64) == 56); typedef union segsum SEGSUM; @@ -767,7 +779,9 @@ union segsum { struct segsum32 u_32; struct segsum_v1 u_v1; }; - +__CTASSERT(__alignof(union segsum) == __alignof(struct segsum64)); +__CTASSERT(__alignof(union segsum) == __alignof(struct segsum32)); +__CTASSERT(__alignof(union segsum) == __alignof(struct segsum_v1)); /* * On-disk super block. @@ -956,6 +970,8 @@ struct dlfs64 { uint32_t dlfs_cksum; /* 508: checksum for superblock checking */ }; +__CTASSERT(__alignof(struct dlfs) == __alignof(struct dlfs64)); + /* Type used for the inode bitmap */ typedef uint32_t lfs_bm_t; diff --git a/sys/ufs/lfs/lfs_accessors.h b/sys/ufs/lfs/lfs_accessors.h index b4accd6c358a..f058ace2359d 100644 --- a/sys/ufs/lfs/lfs_accessors.h +++ b/sys/ufs/lfs/lfs_accessors.h @@ -274,17 +274,7 @@ static __inline uint64_t lfs_dir_getino(const STRUCT_LFS *fs, const LFS_DIRHEADER *dh) { if (fs->lfs_is64) { - uint64_t ino; - - /* - * XXX we can probably write this in a way that's both - * still legal and generates better code. - */ - memcpy(&ino, &dh->u_64.dh_inoA, sizeof(dh->u_64.dh_inoA)); - memcpy((char *)&ino + sizeof(dh->u_64.dh_inoA), - &dh->u_64.dh_inoB, - sizeof(dh->u_64.dh_inoB)); - return LFS_SWAP_uint64_t(fs, ino); + return LFS_SWAP_uint64_t(fs, dh->u_64.dh_ino); } else { return LFS_SWAP_uint32_t(fs, dh->u_32.dh_ino); } @@ -331,16 +321,7 @@ static __inline void lfs_dir_setino(STRUCT_LFS *fs, LFS_DIRHEADER *dh, uint64_t ino) { if (fs->lfs_is64) { - - ino = LFS_SWAP_uint64_t(fs, ino); - /* - * XXX we can probably write this in a way that's both - * still legal and generates better code. - */ - memcpy(&dh->u_64.dh_inoA, &ino, sizeof(dh->u_64.dh_inoA)); - memcpy(&dh->u_64.dh_inoB, - (char *)&ino + sizeof(dh->u_64.dh_inoA), - sizeof(dh->u_64.dh_inoB)); + dh->u_64.dh_ino = LFS_SWAP_uint64_t(fs, ino); } else { dh->u_32.dh_ino = LFS_SWAP_uint32_t(fs, ino); } From d469b9419702a47ccb52e82ccc928eba6e81e6e2 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Sat, 21 Mar 2020 18:42:48 +0000 Subject: [PATCH 27/30] Fix clang build after packed lfs64 accessor change. --- sys/arch/i386/stand/efiboot/bootx64/Makefile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sys/arch/i386/stand/efiboot/bootx64/Makefile b/sys/arch/i386/stand/efiboot/bootx64/Makefile index 828dd693aa3a..03006b4c3be5 100644 --- a/sys/arch/i386/stand/efiboot/bootx64/Makefile +++ b/sys/arch/i386/stand/efiboot/bootx64/Makefile @@ -9,4 +9,9 @@ EXTRA_SOURCES= efibootx64.c startprog64.S multiboot64.S COPTS+= -mno-red-zone CPPFLAGS+= -DEFI_FUNCTION_WRAPPER +# Follow the suit of Makefile.kern.inc; needed for the lfs64 union +# accessors -- they don't actually dereference the resulting pointer, +# just use it for type-checking. +CWARNFLAGS.clang+= -Wno-error=address-of-packed-member + .include "${.CURDIR}/../Makefile.efiboot" From caf79acd56ea01d286a7a8fa4246613fba8fcb0c Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Sat, 21 Mar 2020 18:58:32 +0000 Subject: [PATCH 28/30] Suppress spurious address-of-packed error in rump lfs too. --- sys/rump/fs/lib/liblfs/Makefile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sys/rump/fs/lib/liblfs/Makefile b/sys/rump/fs/lib/liblfs/Makefile index 2750eec8e0ee..172d396de5ed 100644 --- a/sys/rump/fs/lib/liblfs/Makefile +++ b/sys/rump/fs/lib/liblfs/Makefile @@ -21,5 +21,10 @@ CFLAGS+= -DLFS_KERNEL_RFW COPTS.lfs_inode.c+=-O0 .endif +# Follow the suit of Makefile.kern.inc; needed for the lfs64 union +# accessors -- they don't actually dereference the resulting pointer, +# just use it for type-checking. +CWARNFLAGS.clang+= -Wno-error=address-of-packed-member + .include .include From 6235e813652380cb835d148ed66c9e944fb34a47 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Sun, 17 May 2020 20:31:13 +0000 Subject: [PATCH 29/30] WIP: Don't block vnode iteration during VOP_INACTIVE. lfs needs this to break a deadlock. --- sys/kern/vfs_vnode.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/sys/kern/vfs_vnode.c b/sys/kern/vfs_vnode.c index 35be3e0f5818..6ebf57081d83 100644 --- a/sys/kern/vfs_vnode.c +++ b/sys/kern/vfs_vnode.c @@ -764,7 +764,6 @@ vrelel(vnode_t *vp, int flags) if (VSTATE_GET(vp) == VS_RECLAIMED) { VOP_UNLOCK(vp); } else { - VSTATE_CHANGE(vp, VS_LOADED, VS_BLOCKED); mutex_exit(vp->v_interlock); /* @@ -780,7 +779,6 @@ vrelel(vnode_t *vp, int flags) if (!recycle) VOP_UNLOCK(vp); mutex_enter(vp->v_interlock); - VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED); if (!recycle) { if (vtryrele(vp)) { mutex_exit(vp->v_interlock); From 4c4a73aa79ef629cb9f2e650dfec6ebefd669f76 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Sun, 17 May 2020 20:31:47 +0000 Subject: [PATCH 30/30] Skip unlinked inodes. They no longer matter on disk so we don't need to write anything out for them. --- sys/ufs/lfs/lfs_segment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sys/ufs/lfs/lfs_segment.c b/sys/ufs/lfs/lfs_segment.c index b54855c74234..341b862c1f0f 100644 --- a/sys/ufs/lfs/lfs_segment.c +++ b/sys/ufs/lfs/lfs_segment.c @@ -490,7 +490,7 @@ lfs_writevnodes_selector(void *cl, struct vnode *vp) KASSERT(mutex_owned(vp->v_interlock)); ip = VTOI(vp); - if (ip == NULL || vp->v_type == VNON) + if (ip == NULL || vp->v_type == VNON || ip->i_nlink <= 0) return false; if ((op == VN_DIROP && !(vp->v_uflag & VU_DIROP)) || (op != VN_DIROP && op != VN_CLEAN && (vp->v_uflag & VU_DIROP))) {