From 74ce528609987b9b9a5de4b2d0be7b570379a816 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Fri, 10 Sep 2021 21:54:27 +0000 Subject: [PATCH 1/4] sys/kern: Allow custom fileops to specify fo_seek method. Previously only vnodes allowed lseek/pread[v]/pwrite[v], which meant converting a regular device to a cloning device doesn't always work. The semantics are: (*fp->f_ops->fo_seek)(fp, delta, whence, newoffp, flags) 1. Compute a new offset according to whence + delta -- that is, if whence is SEEK_CUR, add delta to fp->f_offset; if whence is SEEK_END, add delta to end of file; if whence is SEEK_SET, use delta as is. 2. If newoffp is nonnull, return the new offset in *newoffp. 3. If flags & FOF_UPDATE_OFFSET, set fp->f_offset to the new offset. Access to fp->f_offset, and *newoffp if newoffp = &fp->f_offset, must happen under the object lock (e.g., vnode lock), in order to synchronize fp->f_offset reads and writes. --- sys/compat/netbsd32/netbsd32_fs.c | 24 +++-------- sys/kern/sys_generic.c | 34 ++++++++-------- sys/kern/vfs_syscalls.c | 68 ++++++++----------------- sys/kern/vfs_vnops.c | 51 +++++++++++++++++++++++ sys/sys/file.h | 2 +- 5 files changed, 94 insertions(+), 85 deletions(-) diff --git a/sys/compat/netbsd32/netbsd32_fs.c b/sys/compat/netbsd32/netbsd32_fs.c index 667ea38871ff..4a4e638e050e 100644 --- a/sys/compat/netbsd32/netbsd32_fs.c +++ b/sys/compat/netbsd32/netbsd32_fs.c @@ -648,7 +648,6 @@ netbsd32_preadv(struct lwp *l, const struct netbsd32_preadv_args *uap, register_ syscallarg(netbsd32_off_t) offset; } */ file_t *fp; - struct vnode *vp; off_t offset; int error, fd = SCARG(uap, fd); @@ -660,19 +659,14 @@ netbsd32_preadv(struct lwp *l, const struct netbsd32_preadv_args *uap, register_ return EBADF; } - vp = fp->f_vnode; - if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) { + if (fp->f_ops->fo_seek == NULL) { error = ESPIPE; goto out; } offset = SCARG(uap, offset); - - /* - * XXX This works because no file systems actually - * XXX take any 
action on the seek operation. - */ - if ((error = VOP_SEEK(vp, fp->f_offset, offset, fp->f_cred)) != 0) + error = (*fp->f_ops->fo_seek)(fp, offset, SEEK_SET, &offset, 0); + if (error) goto out; return dofilereadv32(fd, fp, SCARG_P32(uap, iovp), @@ -694,7 +688,6 @@ netbsd32_pwritev(struct lwp *l, const struct netbsd32_pwritev_args *uap, registe syscallarg(netbsd32_off_t) offset; } */ file_t *fp; - struct vnode *vp; off_t offset; int error, fd = SCARG(uap, fd); @@ -706,19 +699,14 @@ netbsd32_pwritev(struct lwp *l, const struct netbsd32_pwritev_args *uap, registe return EBADF; } - vp = fp->f_vnode; - if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) { + if (fp->f_ops->fo_seek == NULL) { error = ESPIPE; goto out; } offset = SCARG(uap, offset); - - /* - * XXX This works because no file systems actually - * XXX take any action on the seek operation. - */ - if ((error = VOP_SEEK(vp, fp->f_offset, offset, fp->f_cred)) != 0) + error = (*fp->f_ops->fo_seek)(fp, offset, SEEK_SET, &offset, 0); + if (error) goto out; return dofilewritev32(fd, fp, SCARG_P32(uap, iovp), diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c index 82555d419e8a..7c033e10aff2 100644 --- a/sys/kern/sys_generic.c +++ b/sys/kern/sys_generic.c @@ -208,17 +208,18 @@ do_filereadv(int fd, const struct iovec *iovp, int iovcnt, if (offset == NULL) offset = &fp->f_offset; else { - struct vnode *vp = fp->f_vnode; - if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) { + /* + * Caller must not specify &fp->f_offset -- we can't + * safely dereference it for the call to fo_seek + * without holding some underlying object lock. + */ + KASSERT(offset != &fp->f_offset); + if (fp->f_ops->fo_seek == NULL) { error = ESPIPE; goto out; } - /* - * Test that the device is seekable ? - * XXX This works because no file systems actually - * XXX take any action on the seek operation. 
- */ - error = VOP_SEEK(vp, fp->f_offset, *offset, fp->f_cred); + error = (*fp->f_ops->fo_seek)(fp, *offset, SEEK_SET, NULL, + 0); if (error != 0) goto out; } @@ -408,17 +409,18 @@ do_filewritev(int fd, const struct iovec *iovp, int iovcnt, if (offset == NULL) offset = &fp->f_offset; else { - struct vnode *vp = fp->f_vnode; - if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) { + /* + * Caller must not specify &fp->f_offset -- we can't + * safely dereference it for the call to fo_seek + * without holding some underlying object lock. + */ + KASSERT(offset != &fp->f_offset); + if (fp->f_ops->fo_seek == NULL) { error = ESPIPE; goto out; } - /* - * Test that the device is seekable ? - * XXX This works because no file systems actually - * XXX take any action on the seek operation. - */ - error = VOP_SEEK(vp, fp->f_offset, *offset, fp->f_cred); + error = (*fp->f_ops->fo_seek)(fp, *offset, SEEK_SET, NULL, + 0); if (error != 0) goto out; } diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c index 8e4ba7f9c433..747c5e16b770 100644 --- a/sys/kern/vfs_syscalls.c +++ b/sys/kern/vfs_syscalls.c @@ -2856,50 +2856,30 @@ sys_lseek(struct lwp *l, const struct sys_lseek_args *uap, register_t *retval) syscallarg(off_t) offset; syscallarg(int) whence; } */ - kauth_cred_t cred = l->l_cred; file_t *fp; - struct vnode *vp; - struct vattr vattr; - off_t newoff; int error, fd; + switch (SCARG(uap, whence)) { + case SEEK_CUR: + case SEEK_END: + case SEEK_SET: + break; + default: + return EINVAL; + } + fd = SCARG(uap, fd); if ((fp = fd_getfile(fd)) == NULL) return (EBADF); - vp = fp->f_vnode; - if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) { + if (fp->f_ops->fo_seek == NULL) { error = ESPIPE; goto out; } - vn_lock(vp, LK_SHARED | LK_RETRY); - - switch (SCARG(uap, whence)) { - case SEEK_CUR: - newoff = fp->f_offset + SCARG(uap, offset); - break; - case SEEK_END: - error = VOP_GETATTR(vp, &vattr, cred); - if (error) { - VOP_UNLOCK(vp); - goto out; - } - newoff = SCARG(uap, 
offset) + vattr.va_size; - break; - case SEEK_SET: - newoff = SCARG(uap, offset); - break; - default: - error = EINVAL; - VOP_UNLOCK(vp); - goto out; - } - VOP_UNLOCK(vp); - if ((error = VOP_SEEK(vp, fp->f_offset, newoff, cred)) == 0) { - *(off_t *)retval = fp->f_offset = newoff; - } + error = (*fp->f_ops->fo_seek)(fp, SCARG(uap, offset), + SCARG(uap, whence), (off_t *)retval, FOF_UPDATE_OFFSET); out: fd_putfile(fd); return (error); @@ -2918,7 +2898,6 @@ sys_pread(struct lwp *l, const struct sys_pread_args *uap, register_t *retval) syscallarg(off_t) offset; } */ file_t *fp; - struct vnode *vp; off_t offset; int error, fd = SCARG(uap, fd); @@ -2930,19 +2909,14 @@ sys_pread(struct lwp *l, const struct sys_pread_args *uap, register_t *retval) return (EBADF); } - vp = fp->f_vnode; - if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) { + if (fp->f_ops->fo_seek == NULL) { error = ESPIPE; goto out; } offset = SCARG(uap, offset); - - /* - * XXX This works because no file systems actually - * XXX take any action on the seek operation. - */ - if ((error = VOP_SEEK(vp, fp->f_offset, offset, fp->f_cred)) != 0) + error = (*fp->f_ops->fo_seek)(fp, offset, SEEK_SET, &offset, 0); + if (error) goto out; /* dofileread() will unuse the descriptor for us */ @@ -2985,7 +2959,6 @@ sys_pwrite(struct lwp *l, const struct sys_pwrite_args *uap, register_t *retval) syscallarg(off_t) offset; } */ file_t *fp; - struct vnode *vp; off_t offset; int error, fd = SCARG(uap, fd); @@ -2997,19 +2970,14 @@ sys_pwrite(struct lwp *l, const struct sys_pwrite_args *uap, register_t *retval) return (EBADF); } - vp = fp->f_vnode; - if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) { + if (fp->f_ops->fo_seek == NULL) { error = ESPIPE; goto out; } offset = SCARG(uap, offset); - - /* - * XXX This works because no file systems actually - * XXX take any action on the seek operation. 
- */ - if ((error = VOP_SEEK(vp, fp->f_offset, offset, fp->f_cred)) != 0) + error = (*fp->f_ops->fo_seek)(fp, offset, SEEK_SET, &offset, 0); + if (error) goto out; /* dofilewrite() will unuse the descriptor for us */ diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c index 4530a085a11d..718fb9bf0610 100644 --- a/sys/kern/vfs_vnops.c +++ b/sys/kern/vfs_vnops.c @@ -121,6 +121,7 @@ static int vn_statfile(file_t *fp, struct stat *sb); static int vn_ioctl(file_t *fp, u_long com, void *data); static int vn_mmap(struct file *, off_t *, size_t, int, int *, int *, struct uvm_object **, int *); +static int vn_seek(struct file *, off_t, int, off_t *, int); const struct fileops vnops = { .fo_name = "vn", @@ -134,6 +135,7 @@ const struct fileops vnops = { .fo_kqfilter = vn_kqfilter, .fo_restart = fnullop_restart, .fo_mmap = vn_mmap, + .fo_seek = vn_seek, }; /* @@ -1110,7 +1112,56 @@ vn_mmap(struct file *fp, off_t *offp, size_t size, int prot, int *flagsp, return 0; } +static int +vn_seek(struct file *fp, off_t delta, int whence, off_t *newoffp, + int flags) +{ + kauth_cred_t cred = fp->f_cred; + off_t oldoff, newoff; + struct vnode *vp = fp->f_vnode; + struct vattr vattr; + int error; + + if (vp->v_type == VFIFO) + return ESPIPE; + + vn_lock(vp, LK_SHARED | LK_RETRY); + + /* Compute the old and new offsets. */ + oldoff = fp->f_offset; + switch (whence) { + case SEEK_CUR: + newoff = oldoff + delta; /* XXX arithmetic overflow */ + break; + case SEEK_END: + error = VOP_GETATTR(vp, &vattr, cred); + if (error) + goto out; + newoff = delta + vattr.va_size; /* XXX arithmetic overflow */ + break; + case SEEK_SET: + newoff = delta; + break; + default: + error = EINVAL; + goto out; + } + + /* Pass the proposed change to the file system to audit. */ + error = VOP_SEEK(vp, oldoff, newoff, cred); + if (error) + goto out; + + /* Success! 
*/ + if (newoffp) + *newoffp = newoff; + if (flags & FOF_UPDATE_OFFSET) + fp->f_offset = newoff; + error = 0; +out: VOP_UNLOCK(vp); + return error; +} /* * Check that the vnode is still valid, and if so diff --git a/sys/sys/file.h b/sys/sys/file.h index 00fc724944d4..93704f1f2fd3 100644 --- a/sys/sys/file.h +++ b/sys/sys/file.h @@ -94,7 +94,7 @@ struct fileops { void (*fo_restart) (struct file *); int (*fo_mmap) (struct file *, off_t *, size_t, int, int *, int *, struct uvm_object **, int *); - void (*fo_spare2) (void); + int (*fo_seek) (struct file *, off_t, int, off_t *, int); }; union file_data { From b874d882a0fff6098b1099b8ef7eb29445af5019 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Fri, 10 Sep 2021 22:24:17 +0000 Subject: [PATCH 2/4] sys/kern: Avoid fp->f_offset without the object (here, vnode) lock. --- sys/kern/sys_descrip.c | 5 ++++- sys/kern/vfs_vnops.c | 3 ++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/sys/kern/sys_descrip.c b/sys/kern/sys_descrip.c index 9e6b67618a27..245214e5cab1 100644 --- a/sys/kern/sys_descrip.c +++ b/sys/kern/sys_descrip.c @@ -243,8 +243,11 @@ do_fcntl_lock(int fd, int cmd, struct flock *fl) return error; vp = fp->f_vnode; - if (fl->l_whence == SEEK_CUR) + if (fl->l_whence == SEEK_CUR) { + vn_lock(vp, LK_SHARED | LK_RETRY); fl->l_start += fp->f_offset; + VOP_UNLOCK(vp); + } flg = F_POSIX; p = curproc; diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c index 718fb9bf0610..706d52cad5ba 100644 --- a/sys/kern/vfs_vnops.c +++ b/sys/kern/vfs_vnops.c @@ -817,10 +817,11 @@ vn_ioctl(file_t *fp, u_long com, void *data) if (com == FIONREAD) { vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_GETATTR(vp, &vattr, kauth_cred_get()); + if (error == 0) + *(int *)data = vattr.va_size - fp->f_offset; VOP_UNLOCK(vp); if (error) return (error); - *(int *)data = vattr.va_size - fp->f_offset; return (0); } if ((com == FIONWRITE) || (com == FIONSPACE)) { From 2d1887992ec869bf2fbd80452c3bb62d0c29eece Mon Sep 17 00:00:00 
2001 From: Taylor R Campbell Date: Fri, 10 Sep 2021 16:29:39 +0000 Subject: [PATCH 3/4] ksyms(4): Take a complete snapshot on each open. - Snapshots are stored in pageable anonymous uvm objects. - Snapshots are reference-counted so they can be reused across opens. - Opening /dev/ksyms blocks module unload until snapshot is taken. - Merely holding /dev/ksyms open does not block module unload. - /dev/ksyms is now mmappable. This slightly changes the behaviour of fstat(2) on /dev/ksyms -- it is a little more useful now! In particular, st_size is the size of the symbol table. Some other fields which were not very useful to begin with -- st_dev, st_ino, st_mode, st_nlink, st_*time, st_blksize, st_blocks -- are now different, and independent of the file system on which the device node resides. Discussed in https://mail-index.netbsd.org/source-changes-d/2021/08/17/msg013425.html This is option (3), adapted to make the ksyms snapshots pageable, after options (1) and (2) posed practical problems. 
--- sys/kern/kern_ksyms.c | 594 +++++++++++++++++++++++++++++++----------- 1 file changed, 435 insertions(+), 159 deletions(-) diff --git a/sys/kern/kern_ksyms.c b/sys/kern/kern_ksyms.c index 4cffb61341c7..e55252d5f7e9 100644 --- a/sys/kern/kern_ksyms.c +++ b/sys/kern/kern_ksyms.c @@ -86,6 +86,9 @@ __KERNEL_RCSID(0, "$NetBSD: kern_ksyms.c,v 1.102 2021/09/07 16:56:25 riastradh E #include #include #include +#include +#include +#include #include #include #include @@ -94,6 +97,9 @@ __KERNEL_RCSID(0, "$NetBSD: kern_ksyms.c,v 1.102 2021/09/07 16:56:25 riastradh E #include #include #include +#include + +#include #ifdef DDB #include @@ -104,6 +110,15 @@ __KERNEL_RCSID(0, "$NetBSD: kern_ksyms.c,v 1.102 2021/09/07 16:56:25 riastradh E #include "ioconf.h" #endif +struct ksyms_snapshot { + uint64_t ks_refcnt; + uint64_t ks_gen; + struct uvm_object *ks_uobj; + size_t ks_size; + dev_t ks_dev; + int ks_maxlen; +}; + #define KSYMS_MAX_ID 98304 #ifdef KDTRACE_HOOKS static uint32_t ksyms_nmap[KSYMS_MAX_ID]; /* sorted symbol table map */ @@ -112,15 +127,20 @@ static uint32_t *ksyms_nmap = NULL; #endif static int ksyms_maxlen; -static uint64_t ksyms_opencnt; -static struct ksyms_symtab *ksyms_last_snapshot; static bool ksyms_initted; static bool ksyms_loaded; static kmutex_t ksyms_lock __cacheline_aligned; static struct ksyms_symtab kernel_symtab; +static kcondvar_t ksyms_cv; +static struct lwp *ksyms_snapshotting; +static struct ksyms_snapshot *ksyms_snapshot; +static uint64_t ksyms_snapshot_gen; static void ksyms_hdr_init(const void *); static void ksyms_sizes_calc(void); +static struct ksyms_snapshot *ksyms_snapshot_alloc(int, size_t, dev_t, + uint64_t); +static void ksyms_snapshot_release(struct ksyms_snapshot *); #ifdef KSYMS_DEBUG #define FOLLOW_CALLS 1 @@ -245,6 +265,7 @@ ksyms_init(void) if (!ksyms_initted) { mutex_init(&ksyms_lock, MUTEX_DEFAULT, IPL_NONE); + cv_init(&ksyms_cv, "ksyms"); ksyms_initted = true; } } @@ -328,7 +349,6 @@ addsymtab(const char *name, void *symstart, 
size_t symsize, tab->sd_minsym = UINTPTR_MAX; tab->sd_maxsym = 0; tab->sd_usroffset = 0; - tab->sd_gone = false; tab->sd_ctfstart = ctfstart; tab->sd_ctfsize = ctfsize; tab->sd_nmap = nmap; @@ -446,9 +466,9 @@ addsymtab(const char *name, void *symstart, size_t symsize, KASSERT(cold || mutex_owned(&ksyms_lock)); /* - * Ensure ddb never witnesses an inconsistent state of the - * queue, unless memory is so corrupt that we crash in - * TAILQ_INSERT_TAIL. + * Publish the symtab. Do this at splhigh to ensure ddb never + * witnesses an inconsistent state of the queue, unless memory + * is so corrupt that we crash in TAILQ_INSERT_TAIL. */ s = splhigh(); TAILQ_INSERT_TAIL(&ksyms_symtabs, tab, sd_queue); @@ -557,6 +577,9 @@ ksyms_addsyms_elf(int symsize, void *start, void *end) kernel_symtab.sd_symstart, kernel_symtab.sd_strstart, (long)kernel_symtab.sd_symsize/sizeof(Elf_Sym)); #endif + + /* Should be no snapshot to invalidate yet. */ + KASSERT(ksyms_snapshot == NULL); } /* @@ -577,6 +600,9 @@ ksyms_addsyms_explicit(void *ehdr, void *symstart, size_t symsize, ksyms_hdr_init(ehdr); addsymtab("netbsd", symstart, symsize, strstart, strsize, &kernel_symtab, symstart, NULL, 0, ksyms_nmap); + + /* Should be no snapshot to invalidate yet. 
*/ + KASSERT(ksyms_snapshot == NULL); } /* @@ -601,8 +627,6 @@ ksyms_getval_unlocked(const char *mod, const char *sym, Elf_Sym **symp, #endif TAILQ_FOREACH(st, &ksyms_symtabs, sd_queue) { - if (__predict_false(st->sd_gone)) - continue; if (mod != NULL && strcmp(st->sd_name, mod)) continue; if ((es = findsym(sym, st, type)) != NULL) { @@ -636,8 +660,6 @@ ksyms_get_mod(const char *mod) mutex_enter(&ksyms_lock); TAILQ_FOREACH(st, &ksyms_symtabs, sd_queue) { - if (__predict_false(st->sd_gone)) - continue; if (mod != NULL && strcmp(st->sd_name, mod)) continue; break; @@ -671,8 +693,6 @@ ksyms_mod_foreach(const char *mod, ksyms_callback_t callback, void *opaque) /* find the module */ TAILQ_FOREACH(st, &ksyms_symtabs, sd_queue) { - if (__predict_false(st->sd_gone)) - continue; if (mod != NULL && strcmp(st->sd_name, mod)) continue; @@ -716,8 +736,6 @@ ksyms_getname(const char **mod, const char **sym, vaddr_t v, int f) return ENOENT; TAILQ_FOREACH(st, &ksyms_symtabs, sd_queue) { - if (st->sd_gone) - continue; if (v < st->sd_minsym || v > st->sd_maxsym) continue; sz = st->sd_symsize/sizeof(Elf_Sym); @@ -762,6 +780,7 @@ ksyms_modload(const char *name, void *symstart, vsize_t symsize, char *strstart, vsize_t strsize) { struct ksyms_symtab *st; + struct ksyms_snapshot *ks; void *nmap; st = kmem_zalloc(sizeof(*st), KM_SLEEP); @@ -770,7 +789,12 @@ ksyms_modload(const char *name, void *symstart, vsize_t symsize, mutex_enter(&ksyms_lock); addsymtab(name, symstart, symsize, strstart, strsize, st, symstart, NULL, 0, nmap); + ks = ksyms_snapshot; + ksyms_snapshot = NULL; mutex_exit(&ksyms_lock); + + if (ks) + ksyms_snapshot_release(ks); } /* @@ -780,37 +804,48 @@ void ksyms_modunload(const char *name) { struct ksyms_symtab *st; - bool do_free = false; + struct ksyms_snapshot *ks; int s; mutex_enter(&ksyms_lock); TAILQ_FOREACH(st, &ksyms_symtabs, sd_queue) { - if (st->sd_gone) - continue; if (strcmp(name, st->sd_name) != 0) continue; - st->sd_gone = true; - ksyms_sizes_calc(); - if 
(ksyms_opencnt == 0) { - /* - * Ensure ddb never witnesses an inconsistent - * state of the queue, unless memory is so - * corrupt that we crash in TAILQ_REMOVE. - */ - s = splhigh(); - TAILQ_REMOVE(&ksyms_symtabs, st, sd_queue); - splx(s); - do_free = true; - } break; } - mutex_exit(&ksyms_lock); KASSERT(st != NULL); - if (do_free) { - kmem_free(st->sd_nmap, st->sd_nmapsize * sizeof(uint32_t)); - kmem_free(st, sizeof(*st)); - } + /* Wait for any snapshot in progress to complete. */ + while (ksyms_snapshotting) + cv_wait(&ksyms_cv, &ksyms_lock); + + /* + * Remove the symtab. Do this at splhigh to ensure ddb never + * witnesses an inconsistent state of the queue, unless memory + * is so corrupt that we crash in TAILQ_REMOVE. + */ + s = splhigh(); + TAILQ_REMOVE(&ksyms_symtabs, st, sd_queue); + splx(s); + + /* Recompute the ksyms sizes now that we've removed st. */ + ksyms_sizes_calc(); + + /* Invalidate the global ksyms snapshot. */ + ks = ksyms_snapshot; + ksyms_snapshot = NULL; + mutex_exit(&ksyms_lock); + + /* + * No more references are possible. Free the name map and the + * symtab itself, which we had allocated in ksyms_modload. + */ + kmem_free(st->sd_nmap, st->sd_nmapsize * sizeof(uint32_t)); + kmem_free(st, sizeof(*st)); + + /* Release the formerly global ksyms snapshot, if any. 
*/ + if (ks) + ksyms_snapshot_release(ks); } #ifdef DDB @@ -830,8 +865,6 @@ ksyms_sift(char *mod, char *sym, int mode) return ENOENT; TAILQ_FOREACH(st, &ksyms_symtabs, sd_queue) { - if (st->sd_gone) - continue; if (mod && strcmp(mod, st->sd_name)) continue; sb = st->sd_strstart - st->sd_usroffset; @@ -893,8 +926,6 @@ ksyms_sizes_calc(void) ksyms_symsz = ksyms_strsz = 0; TAILQ_FOREACH(st, &ksyms_symtabs, sd_queue) { - if (__predict_false(st->sd_gone)) - continue; delta = ksyms_strsz - st->sd_usroffset; if (delta != 0) { for (i = 0; i < st->sd_symsize/sizeof(Elf_Sym); i++) @@ -997,164 +1028,401 @@ ksyms_hdr_init(const void *hdraddr) SHTCOPY(".SUNW_ctf"); } -static int -ksymsopen(dev_t dev, int oflags, int devtype, struct lwp *l) +static struct ksyms_snapshot * +ksyms_snapshot_alloc(int maxlen, size_t size, dev_t dev, uint64_t gen) { - if (minor(dev) != 0 || !ksyms_loaded) - return ENXIO; + struct ksyms_snapshot *ks; - /* - * Create a "snapshot" of the kernel symbol table. Bumping - * ksyms_opencnt will prevent symbol tables from being freed. 
- */ - mutex_enter(&ksyms_lock); - if (ksyms_opencnt++) - goto out; - ksyms_hdr.kh_shdr[SYMTAB].sh_size = ksyms_symsz; - ksyms_hdr.kh_shdr[SYMTAB].sh_info = ksyms_symsz / sizeof(Elf_Sym); - ksyms_hdr.kh_shdr[STRTAB].sh_offset = ksyms_symsz + - ksyms_hdr.kh_shdr[SYMTAB].sh_offset; - ksyms_hdr.kh_shdr[STRTAB].sh_size = ksyms_strsz; - ksyms_hdr.kh_shdr[SHCTF].sh_offset = ksyms_strsz + - ksyms_hdr.kh_shdr[STRTAB].sh_offset; - ksyms_hdr.kh_shdr[SHCTF].sh_size = ksyms_ctfsz; - ksyms_last_snapshot = TAILQ_LAST(&ksyms_symtabs, ksyms_symtab_queue); -out: mutex_exit(&ksyms_lock); + ks = kmem_zalloc(sizeof(*ks), KM_SLEEP); + ks->ks_refcnt = 1; + ks->ks_gen = gen; + ks->ks_uobj = uao_create(size, 0); + ks->ks_size = size; + ks->ks_dev = dev; + ks->ks_maxlen = maxlen; - return 0; + return ks; } -static int -ksymsclose(dev_t dev, int oflags, int devtype, struct lwp *l) +static void +ksyms_snapshot_release(struct ksyms_snapshot *ks) { - struct ksyms_symtab *st, *next; - TAILQ_HEAD(, ksyms_symtab) to_free = TAILQ_HEAD_INITIALIZER(to_free); - int s; + uint64_t refcnt; - /* Discard references to symbol tables. */ mutex_enter(&ksyms_lock); - if (--ksyms_opencnt) - goto out; - ksyms_last_snapshot = NULL; - TAILQ_FOREACH_SAFE(st, &ksyms_symtabs, sd_queue, next) { - if (st->sd_gone) { - /* - * Ensure ddb never witnesses an inconsistent - * state of the queue, unless memory is so - * corrupt that we crash in TAILQ_REMOVE. 
- */ - s = splhigh(); - TAILQ_REMOVE(&ksyms_symtabs, st, sd_queue); - splx(s); - TAILQ_INSERT_TAIL(&to_free, st, sd_queue); - } - } - if (!TAILQ_EMPTY(&to_free)) - ksyms_sizes_calc(); -out: mutex_exit(&ksyms_lock); + refcnt = --ks->ks_refcnt; + mutex_exit(&ksyms_lock); - TAILQ_FOREACH_SAFE(st, &to_free, sd_queue, next) { - kmem_free(st->sd_nmap, st->sd_nmapsize * sizeof(uint32_t)); - kmem_free(st, sizeof(*st)); - } + if (refcnt) + return; - return 0; + uao_detach(ks->ks_uobj); + kmem_free(ks, sizeof(*ks)); } static int -ksymsread(dev_t dev, struct uio *uio, int ioflag) +ubc_copyfrombuf(struct uvm_object *uobj, struct uio *uio, const void *buf, + size_t n) { + struct iovec iov = { .iov_base = __UNCONST(buf), .iov_len = n }; + + uio->uio_iov = &iov; + uio->uio_iovcnt = 1; + uio->uio_resid = n; + + return ubc_uiomove(uobj, uio, n, UVM_ADV_SEQUENTIAL, UBC_WRITE); +} + +static int +ksyms_take_snapshot(struct ksyms_snapshot *ks, struct ksyms_symtab *last) +{ + struct uvm_object *uobj = ks->ks_uobj; + struct uio uio; struct ksyms_symtab *st; - size_t filepos, inpos, off; int error; + /* Caller must have initiated snapshotting. */ + KASSERT(ksyms_snapshotting == curlwp); + + /* Start a uio transfer to reuse incrementally. */ + uio.uio_offset = 0; + uio.uio_rw = UIO_WRITE; /* write from buffer to uobj */ + UIO_SETUP_SYSSPACE(&uio); + /* - * First: Copy out the ELF header. XXX Lose if ksymsopen() - * occurs during read of the header. + * First: Copy out the ELF header. */ - off = uio->uio_offset; - if (off < sizeof(struct ksyms_hdr)) { - error = uiomove((char *)&ksyms_hdr + off, - sizeof(struct ksyms_hdr) - off, uio); - if (error != 0) - return error; - } + error = ubc_copyfrombuf(uobj, &uio, &ksyms_hdr, sizeof(ksyms_hdr)); + if (error) + return error; /* - * Copy out the symbol table. + * Copy out the symbol table. The list of symtabs is + * guaranteed to be nonempty because we always have an entry + * for the main kernel. 
We stop at last, not at the end of the + * tailq or NULL, because entries beyond last are not included + * in this snapshot (and may not be fully initialized memory as + * we witness it). */ - filepos = sizeof(struct ksyms_hdr); - TAILQ_FOREACH(st, &ksyms_symtabs, sd_queue) { - if (__predict_false(st->sd_gone)) - continue; - if (uio->uio_resid == 0) - return 0; - if (uio->uio_offset <= st->sd_symsize + filepos) { - inpos = uio->uio_offset - filepos; - error = uiomove((char *)st->sd_symstart + inpos, - st->sd_symsize - inpos, uio); - if (error != 0) - return error; - } - filepos += st->sd_symsize; - if (st == ksyms_last_snapshot) + KASSERT(uio.uio_offset == sizeof(struct ksyms_hdr)); + for (st = TAILQ_FIRST(&ksyms_symtabs); + ; + st = TAILQ_NEXT(st, sd_queue)) { + error = ubc_copyfrombuf(uobj, &uio, st->sd_symstart, + st->sd_symsize); + if (error) + return error; + if (st == last) break; } /* * Copy out the string table */ - KASSERT(filepos == sizeof(struct ksyms_hdr) + + KASSERT(uio.uio_offset == sizeof(struct ksyms_hdr) + ksyms_hdr.kh_shdr[SYMTAB].sh_size); for (st = TAILQ_FIRST(&ksyms_symtabs); ; st = TAILQ_NEXT(st, sd_queue)) { - if (uio->uio_resid == 0) - return 0; - if (uio->uio_offset <= st->sd_strsize + filepos) { - inpos = uio->uio_offset - filepos; - error = uiomove((char *)st->sd_strstart + inpos, - st->sd_strsize - inpos, uio); - if (error != 0) - return error; - } - filepos += st->sd_strsize; - if (st == ksyms_last_snapshot) + error = ubc_copyfrombuf(uobj, &uio, st->sd_strstart, + st->sd_strsize); + if (error) + return error; + if (st == last) break; } /* * Copy out the CTF table. 
*/ + KASSERT(uio.uio_offset == sizeof(struct ksyms_hdr) + + ksyms_hdr.kh_shdr[SYMTAB].sh_size + + ksyms_hdr.kh_shdr[STRTAB].sh_size); st = TAILQ_FIRST(&ksyms_symtabs); if (st->sd_ctfstart != NULL) { - if (uio->uio_resid == 0) - return 0; - if (uio->uio_offset <= st->sd_ctfsize + filepos) { - inpos = uio->uio_offset - filepos; - error = uiomove((char *)st->sd_ctfstart + inpos, - st->sd_ctfsize - inpos, uio); - if (error != 0) - return error; - } - filepos += st->sd_ctfsize; + error = ubc_copyfrombuf(uobj, &uio, st->sd_ctfstart, + st->sd_ctfsize); + if (error) + return error; } + KASSERT(uio.uio_offset == sizeof(struct ksyms_hdr) + + ksyms_hdr.kh_shdr[SYMTAB].sh_size + + ksyms_hdr.kh_shdr[STRTAB].sh_size + + ksyms_hdr.kh_shdr[SHCTF].sh_size); + KASSERT(uio.uio_offset == ks->ks_size); + return 0; } +static const struct fileops ksyms_fileops; + static int -ksymswrite(dev_t dev, struct uio *uio, int ioflag) +ksymsopen(dev_t dev, int flags, int devtype, struct lwp *l) { - return EROFS; + struct file *fp = NULL; + int fd = -1; + struct ksyms_snapshot *ks = NULL; + size_t size; + struct ksyms_symtab *last; + int maxlen; + uint64_t gen; + int error; + + if (minor(dev) != 0 || !ksyms_loaded) + return ENXIO; + + /* Allocate a private file. */ + error = fd_allocfile(&fp, &fd); + if (error) + return error; + + mutex_enter(&ksyms_lock); + + /* + * Wait until we have a snapshot, or until there is no snapshot + * being taken right now so we can take one. + */ + while ((ks = ksyms_snapshot) == NULL && ksyms_snapshotting) { + error = cv_wait_sig(&ksyms_cv, &ksyms_lock); + if (error) + goto out; + } + + /* + * If there's a usable snapshot, increment its reference count + * (can't overflow, 64-bit) and just reuse it. + */ + if (ks) { + ks->ks_refcnt++; + goto out; + } + + /* Find the current length of the symtab object. */ + size = sizeof(struct ksyms_hdr); + size += ksyms_strsz; + size += ksyms_symsz; + size += ksyms_ctfsz; + + /* Start a new snapshot. 
*/ + ksyms_hdr.kh_shdr[SYMTAB].sh_size = ksyms_symsz; + ksyms_hdr.kh_shdr[SYMTAB].sh_info = ksyms_symsz / sizeof(Elf_Sym); + ksyms_hdr.kh_shdr[STRTAB].sh_offset = ksyms_symsz + + ksyms_hdr.kh_shdr[SYMTAB].sh_offset; + ksyms_hdr.kh_shdr[STRTAB].sh_size = ksyms_strsz; + ksyms_hdr.kh_shdr[SHCTF].sh_offset = ksyms_strsz + + ksyms_hdr.kh_shdr[STRTAB].sh_offset; + ksyms_hdr.kh_shdr[SHCTF].sh_size = ksyms_ctfsz; + last = TAILQ_LAST(&ksyms_symtabs, ksyms_symtab_queue); + maxlen = ksyms_maxlen; + gen = ksyms_snapshot_gen++; + + /* + * Prevent ksyms entries from being removed while we take the + * snapshot. + */ + KASSERT(ksyms_snapshotting == NULL); + ksyms_snapshotting = curlwp; + mutex_exit(&ksyms_lock); + + /* Create a snapshot and write the symtab to it. */ + ks = ksyms_snapshot_alloc(maxlen, size, dev, gen); + error = ksyms_take_snapshot(ks, last); + + /* + * Snapshot creation is done. Wake up anyone waiting to remove + * entries (module unload). + */ + mutex_enter(&ksyms_lock); + KASSERTMSG(ksyms_snapshotting == curlwp, "lwp %p stole snapshot", + ksyms_snapshotting); + ksyms_snapshotting = NULL; + cv_broadcast(&ksyms_cv); + + /* If we failed, give up. */ + if (error) + goto out; + + /* Cache the snapshot for the next reader. 
*/ + KASSERT(ksyms_snapshot == NULL); + ksyms_snapshot = ks; + ks->ks_refcnt++; + KASSERT(ks->ks_refcnt == 2); + +out: mutex_exit(&ksyms_lock); + if (error) { + if (fp) + fd_abort(curproc, fp, fd); + if (ks) + ksyms_snapshot_release(ks); + } else { + KASSERT(fp); + KASSERT(ks); + error = fd_clone(fp, fd, flags, &ksyms_fileops, ks); + KASSERTMSG(error == EMOVEFD, "error=%d", error); + } + return error; +} + +static int +ksymsclose(struct file *fp) +{ + struct ksyms_snapshot *ks = fp->f_data; + + ksyms_snapshot_release(ks); + + return 0; +} + +static int +ksymsread(struct file *fp, off_t *offp, struct uio *uio, kauth_cred_t cred, + int flags) +{ + const struct ksyms_snapshot *ks = fp->f_data; + size_t count; + int error; + + /* + * Since we don't have a per-object lock, we might as well use + * the struct file lock to serialize access to fp->f_offset -- + * but if the caller isn't relying on or updating fp->f_offset, + * there's no need to do even that. We could use ksyms_lock, + * but why bother with a global lock if not needed? Either + * way, the lock we use here must agree with what ksymsseek + * takes (nothing else in ksyms uses fp->f_offset). + */ + if (offp == &fp->f_offset) + mutex_enter(&fp->f_lock); + + /* Refuse negative offsets. */ + if (*offp < 0) { + error = EINVAL; + goto out; + } + + /* Return nothing at or past end of file. */ + if (*offp >= ks->ks_size) { + error = 0; + goto out; + } + + /* + * 1. Set up the uio to transfer from offset *offp. + * 2. Transfer as many bytes as we can (at most uio->uio_resid + * or what's left in the ksyms). + * 3. If requested, update *offp to reflect the number of bytes + * transferred. 
+ */ + uio->uio_offset = *offp; + count = uio->uio_resid; + error = ubc_uiomove(ks->ks_uobj, uio, MIN(count, ks->ks_size - *offp), + UVM_ADV_SEQUENTIAL, UBC_READ|UBC_PARTIALOK); + if (flags & FOF_UPDATE_OFFSET) + *offp += count - uio->uio_resid; + +out: if (offp == &fp->f_offset) + mutex_exit(&fp->f_lock); + return error; +} + +static int +ksymsstat(struct file *fp, struct stat *st) +{ + const struct ksyms_snapshot *ks = fp->f_data; + + memset(st, 0, sizeof(*st)); + + st->st_dev = NODEV; + st->st_ino = 0; + st->st_mode = S_IFCHR; + st->st_nlink = 1; + st->st_uid = kauth_cred_geteuid(fp->f_cred); + st->st_gid = kauth_cred_getegid(fp->f_cred); + st->st_rdev = ks->ks_dev; + st->st_size = ks->ks_size; + /* zero time */ + st->st_blksize = MAXPHYS; /* XXX ubc_winsize */ + st->st_blocks = 0; + st->st_gen = ks->ks_gen; + + return 0; +} + +static int +ksymsmmap(struct file *fp, off_t *offp, size_t nbytes, int prot, int *flagsp, + int *advicep, struct uvm_object **uobjp, int *maxprotp) +{ + const struct ksyms_snapshot *ks = fp->f_data; + + /* uvm_mmap guarantees page-aligned offset and size. */ + KASSERT(*offp == round_page(*offp)); + KASSERT(nbytes == round_page(nbytes)); + + /* Refuse negative offsets. */ + if (*offp < 0) + return EINVAL; + + /* Refuse mappings that pass the end of file. */ + if (nbytes > round_page(ks->ks_size) || + *offp > round_page(ks->ks_size) - nbytes) + return EINVAL; /* XXX ??? */ + + /* Success! */ + *advicep = UVM_ADV_SEQUENTIAL; + *uobjp = ks->ks_uobj; + *maxprotp = prot & VM_PROT_READ; + return 0; +} + +static int +ksymsseek(struct file *fp, off_t delta, int whence, off_t *newoffp, int flags) +{ + struct ksyms_snapshot *ks = fp->f_data; + off_t base, newoff; + int error; + + mutex_enter(&fp->f_lock); + + switch (whence) { + case SEEK_CUR: + base = fp->f_offset; + break; + case SEEK_END: + base = ks->ks_size; + break; + case SEEK_SET: + base = 0; + break; + default: + error = EINVAL; + goto out; + } + + /* Compute the new offset and validate it. 
*/ + newoff = base + delta; /* XXX arithmetic overflow */ + if (newoff < 0) { + error = EINVAL; + goto out; + } + + /* Success! */ + if (newoffp) + *newoffp = newoff; + if (flags & FOF_UPDATE_OFFSET) + fp->f_offset = newoff; + error = 0; + +out: mutex_exit(&fp->f_lock); + return error; } __CTASSERT(offsetof(struct ksyms_ogsymbol, kg_name) == offsetof(struct ksyms_gsymbol, kg_name)); __CTASSERT(offsetof(struct ksyms_gvalue, kv_name) == offsetof(struct ksyms_gsymbol, kg_name)); static int -ksymsioctl(dev_t dev, u_long cmd, void *data, int fflag, struct lwp *l) +ksymsioctl(struct file *fp, u_long cmd, void *data) { + struct ksyms_snapshot *ks = fp->f_data; struct ksyms_ogsymbol *okg = (struct ksyms_ogsymbol *)data; struct ksyms_gsymbol *kg = (struct ksyms_gsymbol *)data; struct ksyms_gvalue *kv = (struct ksyms_gvalue *)data; @@ -1165,8 +1433,8 @@ ksymsioctl(dev_t dev, u_long cmd, void *data, int fflag, struct lwp *l) char *str = NULL; int len; - /* Read ksyms_maxlen only once while not holding the lock. */ - len = ksyms_maxlen; + /* Read cached ksyms_maxlen. */ + len = ks->ks_maxlen; if (cmd == OKIOCGVALUE || cmd == OKIOCGSYMBOL || cmd == KIOCGVALUE || cmd == KIOCGSYMBOL) { @@ -1196,8 +1464,6 @@ ksymsioctl(dev_t dev, u_long cmd, void *data, int fflag, struct lwp *l) */ mutex_enter(&ksyms_lock); TAILQ_FOREACH(st, &ksyms_symtabs, sd_queue) { - if (st->sd_gone) - continue; if ((sym = findsym(str, st, KSYMS_ANY)) == NULL) continue; #ifdef notdef @@ -1238,8 +1504,6 @@ ksymsioctl(dev_t dev, u_long cmd, void *data, int fflag, struct lwp *l) */ mutex_enter(&ksyms_lock); TAILQ_FOREACH(st, &ksyms_symtabs, sd_queue) { - if (st->sd_gone) - continue; if ((sym = findsym(str, st, KSYMS_ANY)) == NULL) continue; #ifdef notdef @@ -1264,10 +1528,7 @@ ksymsioctl(dev_t dev, u_long cmd, void *data, int fflag, struct lwp *l) /* * Get total size of symbol table. 
*/ - mutex_enter(&ksyms_lock); - *(int *)data = ksyms_strsz + ksyms_symsz + - sizeof(struct ksyms_hdr); - mutex_exit(&ksyms_lock); + *(int *)data = ks->ks_size; break; default: @@ -1280,15 +1541,30 @@ ksymsioctl(dev_t dev, u_long cmd, void *data, int fflag, struct lwp *l) const struct cdevsw ksyms_cdevsw = { .d_open = ksymsopen, - .d_close = ksymsclose, - .d_read = ksymsread, - .d_write = ksymswrite, - .d_ioctl = ksymsioctl, - .d_stop = nullstop, + .d_close = noclose, + .d_read = noread, + .d_write = nowrite, + .d_ioctl = noioctl, + .d_stop = nostop, .d_tty = notty, .d_poll = nopoll, .d_mmap = nommap, - .d_kqfilter = nullkqfilter, + .d_kqfilter = nokqfilter, .d_discard = nodiscard, .d_flag = D_OTHER | D_MPSAFE }; + +static const struct fileops ksyms_fileops = { + .fo_name = "ksyms", + .fo_read = ksymsread, + .fo_write = fbadop_write, + .fo_ioctl = ksymsioctl, + .fo_fcntl = fnullop_fcntl, + .fo_poll = fnullop_poll, + .fo_stat = ksymsstat, + .fo_close = ksymsclose, + .fo_kqfilter = fnullop_kqfilter, + .fo_restart = fnullop_restart, + .fo_mmap = ksymsmmap, + .fo_seek = ksymsseek, +}; From 3cf6541791eb60ca356c8423d5908334f479420f Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Thu, 19 Aug 2021 13:05:45 +0000 Subject: [PATCH 4/4] ksyms: Use pserialize(9) for kernel access to ksyms. This makes it available in interrupt context, e.g. for printing messages with kernel symbol names for return addresses as drm wants to do. 
--- sys/arch/sparc64/sparc64/machdep.c | 10 +++- sys/kern/kern_ksyms.c | 88 ++++++++++++++++++++++-------- sys/kern/subr_csan.c | 4 ++ sys/kern/subr_msan.c | 6 ++ sys/sys/ksyms.h | 8 +++ 5 files changed, 90 insertions(+), 26 deletions(-) diff --git a/sys/arch/sparc64/sparc64/machdep.c b/sys/arch/sparc64/sparc64/machdep.c index eead86222944..8a1e3538bc8f 100644 --- a/sys/arch/sparc64/sparc64/machdep.c +++ b/sys/arch/sparc64/sparc64/machdep.c @@ -102,6 +102,7 @@ __KERNEL_RCSID(0, "$NetBSD: machdep.c,v 1.302 2021/09/07 16:56:13 riastradh Exp #include #include #include +#include #include @@ -836,17 +837,22 @@ get_symbol_and_offset(const char **mod, const char **sym, vaddr_t *offset, vaddr { static char symbuf[256]; unsigned long symaddr; + int s, error; #if NKSYMS || defined(DDB) || defined(MODULAR) + s = pserialize_read_enter(); if (ksyms_getname(mod, sym, pc, KSYMS_CLOSEST|KSYMS_PROC|KSYMS_ANY) == 0) { - if (ksyms_getval(*mod, *sym, &symaddr, - KSYMS_CLOSEST|KSYMS_PROC|KSYMS_ANY) != 0) + error = ksyms_getval(*mod, *sym, &symaddr, + KSYMS_CLOSEST|KSYMS_PROC|KSYMS_ANY); + pserialize_read_exit(s); + if (error) goto failed; *offset = (vaddr_t)(pc - symaddr); return; } + pserialize_read_exit(s); #endif failed: snprintf(symbuf, sizeof symbuf, "%llx", (unsigned long long)pc); diff --git a/sys/kern/kern_ksyms.c b/sys/kern/kern_ksyms.c index e55252d5f7e9..c882ce88fea5 100644 --- a/sys/kern/kern_ksyms.c +++ b/sys/kern/kern_ksyms.c @@ -97,6 +97,7 @@ __KERNEL_RCSID(0, "$NetBSD: kern_ksyms.c,v 1.102 2021/09/07 16:56:25 riastradh E #include #include #include +#include #include #include @@ -135,6 +136,7 @@ static kcondvar_t ksyms_cv; static struct lwp *ksyms_snapshotting; static struct ksyms_snapshot *ksyms_snapshot; static uint64_t ksyms_snapshot_gen; +static pserialize_t ksyms_psz __read_mostly; static void ksyms_hdr_init(const void *); static void ksyms_sizes_calc(void); @@ -165,6 +167,7 @@ int ksyms_strsz; int ksyms_ctfsz; /* this is not currently used by savecore(8) */ 
TAILQ_HEAD(ksyms_symtab_queue, ksyms_symtab) ksyms_symtabs = TAILQ_HEAD_INITIALIZER(ksyms_symtabs); +static struct pslist_head ksyms_symtabs_psz = PSLIST_INITIALIZER; static int ksyms_verify(const void *symstart, const void *strstart) @@ -266,6 +269,7 @@ ksyms_init(void) if (!ksyms_initted) { mutex_init(&ksyms_lock, MUTEX_DEFAULT, IPL_NONE); cv_init(&ksyms_cv, "ksyms"); + ksyms_psz = pserialize_create(); ksyms_initted = true; } } @@ -468,9 +472,19 @@ addsymtab(const char *name, void *symstart, size_t symsize, /* * Publish the symtab. Do this at splhigh to ensure ddb never * witnesses an inconsistent state of the queue, unless memory - * is so corrupt that we crash in TAILQ_INSERT_TAIL. + * is so corrupt that we crash in PSLIST_WRITER_INSERT_AFTER or + * TAILQ_INSERT_TAIL. */ + PSLIST_ENTRY_INIT(tab, sd_pslist); s = splhigh(); + if (TAILQ_EMPTY(&ksyms_symtabs)) { + PSLIST_WRITER_INSERT_HEAD(&ksyms_symtabs_psz, tab, sd_pslist); + } else { + struct ksyms_symtab *last; + + last = TAILQ_LAST(&ksyms_symtabs, ksyms_symtab_queue); + PSLIST_WRITER_INSERT_AFTER(last, tab, sd_pslist); + } TAILQ_INSERT_TAIL(&ksyms_symtabs, tab, sd_queue); splx(s); @@ -612,7 +626,9 @@ ksyms_addsyms_explicit(void *ehdr, void *symstart, size_t symsize, * "val" is a pointer to the corresponding value, if call succeeded. * Returns 0 if success or ENOENT if no such entry. * - * Call with ksyms_lock, unless known that the symbol table can't change. + * If symp is nonnull, caller must hold ksyms_lock or module_lock, have + * ksyms_opencnt nonzero, be in a pserialize read section, or be in ddb + * with all other CPUs quiescent. 
*/ int ksyms_getval_unlocked(const char *mod, const char *sym, Elf_Sym **symp, @@ -620,51 +636,61 @@ ksyms_getval_unlocked(const char *mod, const char *sym, Elf_Sym **symp, { struct ksyms_symtab *st; Elf_Sym *es; + int s, error = ENOENT; #ifdef KSYMS_DEBUG if (ksyms_debug & FOLLOW_CALLS) printf("%s: mod %s sym %s valp %p\n", __func__, mod, sym, val); #endif - TAILQ_FOREACH(st, &ksyms_symtabs, sd_queue) { + s = pserialize_read_enter(); + PSLIST_READER_FOREACH(st, &ksyms_symtabs_psz, struct ksyms_symtab, + sd_pslist) { if (mod != NULL && strcmp(st->sd_name, mod)) continue; if ((es = findsym(sym, st, type)) != NULL) { *val = es->st_value; if (symp) *symp = es; - return 0; + error = 0; + break; } } - return ENOENT; + pserialize_read_exit(s); + return error; } int ksyms_getval(const char *mod, const char *sym, unsigned long *val, int type) { - int rc; if (!ksyms_loaded) return ENOENT; - mutex_enter(&ksyms_lock); - rc = ksyms_getval_unlocked(mod, sym, NULL, val, type); - mutex_exit(&ksyms_lock); - return rc; + /* No locking needed -- we read the table pserialized. */ + return ksyms_getval_unlocked(mod, sym, NULL, val, type); } +/* + * ksyms_get_mod(mod) + * + * Return the symtab for the given module name. Caller must ensure + * that the module cannot be unloaded until after this returns. + */ struct ksyms_symtab * ksyms_get_mod(const char *mod) { struct ksyms_symtab *st; + int s; - mutex_enter(&ksyms_lock); - TAILQ_FOREACH(st, &ksyms_symtabs, sd_queue) { + s = pserialize_read_enter(); + PSLIST_READER_FOREACH(st, &ksyms_symtabs_psz, struct ksyms_symtab, + sd_pslist) { if (mod != NULL && strcmp(st->sd_name, mod)) continue; break; } - mutex_exit(&ksyms_lock); + pserialize_read_exit(s); return st; } @@ -720,7 +746,9 @@ ksyms_mod_foreach(const char *mod, ksyms_callback_t callback, void *opaque) * Get "mod" and "symbol" associated with an address. * Returns 0 if success or ENOENT if no such entry. * - * Call with ksyms_lock, unless known that the symbol table can't change. 
+ * Caller must hold ksyms_lock or module_lock, have ksyms_opencnt + * nonzero, be in a pserialize read section, or be in ddb with all + * other CPUs quiescent. */ int ksyms_getname(const char **mod, const char **sym, vaddr_t v, int f) @@ -735,7 +763,8 @@ ksyms_getname(const char **mod, const char **sym, vaddr_t v, int f) if (!ksyms_loaded) return ENOENT; - TAILQ_FOREACH(st, &ksyms_symtabs, sd_queue) { + PSLIST_READER_FOREACH(st, &ksyms_symtabs_psz, struct ksyms_symtab, + sd_pslist) { if (v < st->sd_minsym || v > st->sd_maxsym) continue; sz = st->sd_symsize/sizeof(Elf_Sym); @@ -822,12 +851,21 @@ ksyms_modunload(const char *name) /* * Remove the symtab. Do this at splhigh to ensure ddb never * witnesses an inconsistent state of the queue, unless memory - * is so corrupt that we crash in TAILQ_REMOVE. + * is so corrupt that we crash in TAILQ_REMOVE or + * PSLIST_WRITER_REMOVE. */ s = splhigh(); TAILQ_REMOVE(&ksyms_symtabs, st, sd_queue); + PSLIST_WRITER_REMOVE(st, sd_pslist); splx(s); + /* + * And wait a grace period, in case there are any pserialized + * readers in flight. + */ + pserialize_perform(ksyms_psz); + PSLIST_ENTRY_DESTROY(st, sd_pslist); + /* Recompute the ksyms sizes now that we've removed st. */ ksyms_sizes_calc(); @@ -1431,7 +1469,7 @@ ksymsioctl(struct file *fp, u_long cmd, void *data) unsigned long val; int error = 0; char *str = NULL; - int len; + int len, s; /* Read cached ksyms_maxlen. */ len = ks->ks_maxlen; @@ -1462,8 +1500,9 @@ ksymsioctl(struct file *fp, u_long cmd, void *data) * Use the in-kernel symbol lookup code for fast * retreival of a symbol. 
*/ - mutex_enter(&ksyms_lock); - TAILQ_FOREACH(st, &ksyms_symtabs, sd_queue) { + s = pserialize_read_enter(); + PSLIST_READER_FOREACH(st, &ksyms_symtabs_psz, + struct ksyms_symtab, sd_pslist) { if ((sym = findsym(str, st, KSYMS_ANY)) == NULL) continue; #ifdef notdef @@ -1477,10 +1516,10 @@ ksymsioctl(struct file *fp, u_long cmd, void *data) } if (sym != NULL) { memcpy(©, sym, sizeof(copy)); - mutex_exit(&ksyms_lock); + pserialize_read_exit(s); error = copyout(©, okg->kg_sym, sizeof(Elf_Sym)); } else { - mutex_exit(&ksyms_lock); + pserialize_read_exit(s); error = ENOENT; } kmem_free(str, len); @@ -1502,8 +1541,9 @@ ksymsioctl(struct file *fp, u_long cmd, void *data) * Use the in-kernel symbol lookup code for fast * retreival of a symbol. */ - mutex_enter(&ksyms_lock); - TAILQ_FOREACH(st, &ksyms_symtabs, sd_queue) { + s = pserialize_read_enter(); + PSLIST_READER_FOREACH(st, &ksyms_symtabs_psz, + struct ksyms_symtab, sd_pslist) { if ((sym = findsym(str, st, KSYMS_ANY)) == NULL) continue; #ifdef notdef @@ -1520,7 +1560,7 @@ ksymsioctl(struct file *fp, u_long cmd, void *data) } else { error = ENOENT; } - mutex_exit(&ksyms_lock); + pserialize_read_exit(s); kmem_free(str, len); break; diff --git a/sys/kern/subr_csan.c b/sys/kern/subr_csan.c index aa0f2e8b6894..65c398a6e85b 100644 --- a/sys/kern/subr_csan.c +++ b/sys/kern/subr_csan.c @@ -40,6 +40,7 @@ __KERNEL_RCSID(0, "$NetBSD: subr_csan.c,v 1.12 2021/09/07 16:56:13 riastradh Exp #include #include #include +#include #ifdef KCSAN_PANIC #define REPORT panic @@ -94,7 +95,9 @@ static inline void kcsan_report(csan_cell_t *new, cpuid_t newcpu, csan_cell_t *old, cpuid_t oldcpu) { const char *newsym, *oldsym; + int s; + s = pserialize_read_enter(); if (ksyms_getname(NULL, &newsym, (vaddr_t)new->pc, KSYMS_PROC) != 0) { newsym = "Unknown"; } @@ -110,6 +113,7 @@ kcsan_report(csan_cell_t *new, cpuid_t newcpu, csan_cell_t *old, cpuid_t oldcpu) oldcpu, (old->atomic ? "Atomic " : ""), (old->write ? 
"Write" : "Read"), (void *)old->addr, old->size, (void *)old->pc, oldsym); + pserialize_read_exit(s); kcsan_md_unwind(); } diff --git a/sys/kern/subr_msan.c b/sys/kern/subr_msan.c index abd30ac3a763..4ac5adf3212a 100644 --- a/sys/kern/subr_msan.c +++ b/sys/kern/subr_msan.c @@ -152,6 +152,7 @@ kmsan_report_hook(const void *addr, size_t size, size_t off, const char *hook) uintptr_t ptr; char buf[128]; int type; + int s; if (__predict_false(panicstr != NULL || db_active || kmsan_reporting)) return; @@ -172,6 +173,7 @@ kmsan_report_hook(const void *addr, size_t size, size_t off, const char *hook) typename = kmsan_orig_name(type); if (kmsan_md_is_pc(ptr)) { + s = pserialize_read_enter(); if (ksyms_getname(&mod, &sym, (vaddr_t)ptr, KSYMS_PROC)) { REPORT("MSan: Uninitialized %s Memory In %s " "At Offset %zu/%zu, IP %p\n", typename, hook, off, @@ -181,6 +183,7 @@ kmsan_report_hook(const void *addr, size_t size, size_t off, const char *hook) "At Offset %zu/%zu, From %s()\n", typename, hook, off, size, sym); } + pserialize_read_exit(s); } else { var = (char *)ptr + 4; strlcpy(buf, var, sizeof(buf)); @@ -208,6 +211,7 @@ kmsan_report_inline(msan_orig_t orig, unsigned long pc) uintptr_t ptr; char buf[128]; int type; + int s; if (__predict_false(panicstr != NULL || db_active || kmsan_reporting)) return; @@ -225,6 +229,7 @@ kmsan_report_inline(msan_orig_t orig, unsigned long pc) typename = kmsan_orig_name(type); if (kmsan_md_is_pc(ptr)) { + s = pserialize_read_enter(); if (ksyms_getname(&mod, &sym, (vaddr_t)ptr, KSYMS_PROC)) { REPORT("MSan: Uninitialized %s Memory, " "Origin %x\n", typename, orig); @@ -232,6 +237,7 @@ kmsan_report_inline(msan_orig_t orig, unsigned long pc) REPORT("MSan: Uninitialized %s Memory " "From %s()\n", typename, sym); } + pserialize_read_exit(s); } else { var = (char *)ptr + 4; strlcpy(buf, var, sizeof(buf)); diff --git a/sys/sys/ksyms.h b/sys/sys/ksyms.h index dcdd9a40bb8b..9dfba5481f34 100644 --- a/sys/sys/ksyms.h +++ b/sys/sys/ksyms.h @@ -39,6 +39,10 
@@ #include #include +#ifdef _KERNEL +#include +#endif + struct ksyms_symtab { TAILQ_ENTRY(ksyms_symtab) sd_queue; /* All active tables */ const char *sd_name; /* Name of this table */ @@ -55,6 +59,10 @@ struct ksyms_symtab { int sd_ctfsize; /* Size in bytes of CTF contents */ uint32_t *sd_nmap; /* Name map for sorted symbols */ int sd_nmapsize; /* Total span of map */ + +#ifdef _KERNEL + struct pslist_entry sd_pslist; +#endif }; /*