From 5337cdac75081c94a3b7b360b961398f2b218248 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Thu, 6 Jul 2023 19:09:44 +0000 Subject: [PATCH 1/2] crashme(9): New crash methods with raised ipl or kpreempt disabled. --- sys/kern/kern_crashme.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/sys/kern/kern_crashme.c b/sys/kern/kern_crashme.c index 0d6b716354aa..39eebd530362 100644 --- a/sys/kern/kern_crashme.c +++ b/sys/kern/kern_crashme.c @@ -47,6 +47,7 @@ #include #include #include +#include #ifdef DDB #include @@ -67,6 +68,8 @@ static int crashme_ddb(int); static int crashme_kernel_lock_spinout(int); #endif static int crashme_mutex_recursion(int); +static int crashme_spl_spinout(int); +static int crashme_kpreempt_spinout(int); #define CMNODE(name, lname, func) \ { \ @@ -88,6 +91,10 @@ static crashme_node nodes[] = { #endif CMNODE("mutex_recursion", "enter the same mutex twice", crashme_mutex_recursion), + CMNODE("spl_spinout", "infinite loop at raised spl", + crashme_spl_spinout), + CMNODE("kpreempt_spinout", "infinite loop with kpreempt disabled", + crashme_kpreempt_spinout), }; static crashme_node *first_node; static kmutex_t crashme_lock; @@ -332,3 +339,35 @@ crashme_mutex_recursion(int flags) return -1; } } + +static int +crashme_spl_spinout(int flags) +{ + int s; + + printf("%s: raising ipl to %d\n", __func__, flags); + s = splraiseipl(makeiplcookie(flags)); + printf("%s: raised ipl to %d, s=%d\n", __func__, flags, s); + for (;;) + __insn_barrier(); + printf("%s: exited infinite loop!?\n", __func__); + splx(s); + printf("%s: lowered ipl to s=%d\n", __func__, s); + + return 0; +} + +static int +crashme_kpreempt_spinout(int flags) +{ + + kpreempt_disable(); + printf("%s: disabled kpreemption\n", __func__); + for (;;) + __insn_barrier(); + printf("%s: exited infinite loop!?\n", __func__); + kpreempt_enable(); + printf("%s: re-enabled kpreemption\n", __func__); + + return 0; +} From 
2c2fd3421210b74cc55a4d1a5554c58def0a0112 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Thu, 6 Jul 2023 19:11:58 +0000 Subject: [PATCH 2/2] heartbeat(9): New mechanism to check progress of kernel. This uses hard interrupts to check progress of low-priority soft interrupts, and one CPU to check progress of another CPU. If no progress has been made after a configurable number of seconds (kern.heartbeat.max_period, default 15), then the system panics -- preferably on the CPU that is stuck so we get a stack trace in dmesg of where it was stuck, but if the stuckness was detected by another CPU and the stuck CPU doesn't acknowledge the request to panic within one second, the detecting CPU panics instead. This doesn't supplant hardware watchdog timers. It is possible for hard interrupts to be stuck on all CPUs for some reason too; in that case heartbeat(9) has no opportunity to complete. Downside: heartbeat(9) relies on hardclock to run at a reasonably consistent rate, which might cause trouble for the glorious tickless future. However, it could be adapted to take a parameter for an approximate number of units that have elapsed since the last call on the current CPU, rather than treating that as a constant 1. --- share/man/man9/heartbeat.9 | 129 ++++++++++ sys/kern/files.kern | 1 + sys/kern/init_main.c | 7 + sys/kern/kern_clock.c | 6 + sys/kern/kern_heartbeat.c | 482 +++++++++++++++++++++++++++++++++++++ sys/sys/cpu_data.h | 9 + sys/sys/heartbeat.h | 42 ++++ 7 files changed, 676 insertions(+) create mode 100644 share/man/man9/heartbeat.9 create mode 100644 sys/kern/kern_heartbeat.c create mode 100644 sys/sys/heartbeat.h diff --git a/share/man/man9/heartbeat.9 b/share/man/man9/heartbeat.9 new file mode 100644 index 000000000000..97ce0d8458ab --- /dev/null +++ b/share/man/man9/heartbeat.9 @@ -0,0 +1,129 @@ +.\" $NetBSD$ +.\" +.\" Copyright (c) 2023 The NetBSD Foundation, Inc. +.\" All rights reserved. 
+.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS +.\" ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +.\" TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +.\" PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS +.\" BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +.\" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +.\" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +.\" INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +.\" CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +.\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +.\" POSSIBILITY OF SUCH DAMAGE. 
+.\" +.Dd July 6, 2023 +.Dt HEARTBEAT 9 +.Os +.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" +.Sh NAME +.Nm heartbeat +.Nd periodic checks to ensure CPUs are making progress +.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" +.Sh SYNOPSIS +.In sys/heartbeat.h +.\" +.Ft void +.Fn heartbeat_start void +.Ft void +.Fn heartbeat void +.Fd "#ifdef DDB" +.Ft void +.Fn heartbeat_dump void +.Fd "#endif" +.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" +.Sh DESCRIPTION +The +.Nm +subsystem verifies that soft interrupts +.Pq Xr softint 9 +and the system +.Xr timecounter 9 +are making progress over time, once +.Fn heartbeat_start +has been called. +The number of seconds before +.Nm +panics without progress is controlled by the sysctl knob +.Li kern.heartbeat.max_period , +which defaults to 15. +.Pp +The periodic hardware timer interrupt handler calls +.Fn heartbeat +every tick on each CPU. +Once per second +.Po +i.e., every +.Xr hz 9 +ticks +.Pc , +.Fn heartbeat +schedules a soft interrupt at priority +.Dv SOFTINT_CLOCK +to advance the current CPU's view of +.Xr time_uptime 9 . +.Pp +.Fn heartbeat +checks whether +.Xr time_uptime 9 +has changed, to see if either the +.Xr timecounter 9 +or soft interrupts on the current CPU are stuck. +If it hasn't advanced within +.Li kern.heartbeat.max_period +seconds worth of ticks, or if it has updated and the current CPU's view +of it hasn't been updated for more than +.Li kern.heartbeat.max_period +seconds, then +.Fn heartbeat +panics. +.Pp +.Fn heartbeat +also checks whether the next online CPU has advanced its view of +.Xr time_uptime 9 , +to see if soft interrupts +.Pq including Xr callout 9 +on that CPU are stuck. +If it hasn't updated within +.Li kern.heartbeat.max_period +seconds, +.Fn heartbeat +sends an +.Xr ipi 9 +to panic on that CPU. 
+If that CPU has not acknowledged the +.Xr ipi 9 +within one second, +.Fn heartbeat +panics on the current CPU instead. +.Pp +The +.Fn heartbeat_dump +function prints each CPU's heartbeat counter, uptime cache, and uptime +cache timestamp (in units of heartbeats) to the console. +It can be invoked from +.Xr ddb 9 +by +.Ql call heartbeat_dump . +.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" +.Sh SEE ALSO +.Xr swwdog 4 , +.Xr wdogctl 8 +.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" +.Sh HISTORY +The +.Nm +subsystem first appeared in +.Nx 11.0 . diff --git a/sys/kern/files.kern b/sys/kern/files.kern index 03eb05ec2588..46513c2608d6 100644 --- a/sys/kern/files.kern +++ b/sys/kern/files.kern @@ -48,6 +48,7 @@ file kern/kern_exec.c kern file kern/kern_exit.c kern file kern/kern_fork.c kern file kern/kern_idle.c kern +file kern/kern_heartbeat.c kern file kern/kern_hook.c kern file kern/kern_kthread.c kern file kern/kern_ktrace.c ktrace diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c index f0428ad570f8..f71526befee5 100644 --- a/sys/kern/init_main.c +++ b/sys/kern/init_main.c @@ -199,6 +199,7 @@ extern void *_binary_splash_image_end; #include #include #include +#include #include #include @@ -557,6 +558,12 @@ main(void) /* Once all CPUs are detected, initialize the per-CPU cprng_fast. */ cprng_fast_init(); + /* + * Now that softints can be established, start monitoring + * system heartbeat on all CPUs. 
+ */ + heartbeat_start(); + ssp_init(); ubc_init(); /* must be after autoconfig */ diff --git a/sys/kern/kern_clock.c b/sys/kern/kern_clock.c index 53fa4ed7554d..b453a2dcfbf0 100644 --- a/sys/kern/kern_clock.c +++ b/sys/kern/kern_clock.c @@ -92,6 +92,7 @@ __KERNEL_RCSID(0, "$NetBSD: kern_clock.c,v 1.149 2023/06/30 21:42:05 riastradh E #include #include #include +#include #ifdef GPROF #include @@ -335,6 +336,11 @@ hardclock(struct clockframe *frame) tc_ticktock(); } + /* + * Make sure the CPUs and timecounter are making progress. + */ + heartbeat(); + /* * Update real-time timeout queue. */ diff --git a/sys/kern/kern_heartbeat.c b/sys/kern/kern_heartbeat.c new file mode 100644 index 000000000000..fa468a6f1de0 --- /dev/null +++ b/sys/kern/kern_heartbeat.c @@ -0,0 +1,482 @@ +/* $NetBSD$ */ + +/*- + * Copyright (c) 2023 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__KERNEL_RCSID(0, "$NetBSD$"); + +#ifdef _KERNEL_OPT +#include "opt_ddb.h" +#endif + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef DDB +#include +#endif + +kmutex_t heartbeat_lock __cacheline_aligned; +void *heartbeat_sih __read_mostly; +unsigned heartbeat_max_period_secs __read_mostly; +unsigned heartbeat_max_period_ticks __read_mostly; + +/* + * heartbeat_max_period_sysctl(SYSCTLFN_ARGS) + * + * Sysctl handler for sysctl kern.heartbeat.max_period. Verifies + * it lies within a reasonable interval and sets it. + */ +static int +heartbeat_max_period_sysctl(SYSCTLFN_ARGS) +{ + struct sysctlnode node; + unsigned period; + int error; + + mutex_enter(&heartbeat_lock); + + period = heartbeat_max_period_secs; + node = *rnode; + node.sysctl_data = &period; + error = sysctl_lookup(SYSCTLFN_CALL(&node)); + if (error || newp == NULL) + goto out; + + /* + * Ensure there's plenty of slop between heartbeats. + */ + if (period > UINT_MAX/4/hz) { + error = EOVERFLOW; + goto out; + } + atomic_store_relaxed(&heartbeat_max_period_secs, period); + atomic_store_relaxed(&heartbeat_max_period_ticks, period*hz); + error = 0; + +out: mutex_exit(&heartbeat_lock); + return error; +} + +/* + * sysctl_heartbeat_setup() + * + * Set up the kern.heartbeat.* sysctl subtree. 
+ */ +SYSCTL_SETUP(sysctl_heartbeat_setup, "sysctl kern.heartbeat setup") +{ + const struct sysctlnode *rnode; + int error; + + mutex_init(&heartbeat_lock, MUTEX_DEFAULT, IPL_NONE); + + /* kern.heartbeat */ + error = sysctl_createv(NULL, 0, NULL, &rnode, + CTLFLAG_PERMANENT, + CTLTYPE_NODE, "heartbeat", + SYSCTL_DESCR("Kernel heartbeat parameters"), + NULL, 0, NULL, 0, + CTL_KERN, CTL_CREATE, CTL_EOL); + if (error) { + printf("%s: failed to create kern.heartbeat: %d\n", + __func__, error); + return; + } + + /* kern.heartbeat.max_period */ + error = sysctl_createv(NULL, 0, &rnode, NULL, + CTLFLAG_PERMANENT|CTLFLAG_READWRITE, + CTLTYPE_INT, "max_period", + SYSCTL_DESCR("Max seconds between heartbeats before panic"), + &heartbeat_max_period_sysctl, 0, NULL, 0, + CTL_CREATE, CTL_EOL); + if (error) { + printf("%s: failed to create kern.heartbeat.max_period: %d\n", + __func__, error); + return; + } +} + +/* + * heartbeat_intr(cookie) + * + * Soft interrupt handler to update the local CPU's view of the + * system uptime. This runs at the same priority level as + * callouts, so if callouts are stuck on this CPU, it won't run, + * and eventually another CPU will notice that this one is stuck. + * + * Don't do spl* here -- keep it to a minimum so if anything goes + * wrong we don't end up with hard interrupts blocked and unable + * to detect a missed heartbeat. + */ +static void +heartbeat_intr(void *cookie) +{ + unsigned count = atomic_load_relaxed(&curcpu()->ci_heartbeat_count); + unsigned uptime = atomic_load_relaxed(&time_uptime); + + atomic_store_relaxed(&curcpu()->ci_heartbeat_uptime_stamp, count); + atomic_store_relaxed(&curcpu()->ci_heartbeat_uptime_cache, uptime); +} + +/* + * heartbeat_start() + * + * Start system heartbeat monitoring. + */ +void +heartbeat_start(void) +{ + struct cpu_info *ci; + CPU_INFO_ITERATOR cii; + + /* + * First establish a softint so we can schedule it once ready. 
+ */ + heartbeat_sih = softint_establish(SOFTINT_CLOCK|SOFTINT_MPSAFE, + &heartbeat_intr, NULL); + + /* + * Next, make sure we have a reasonably up-to-date time_uptime + * cache on all CPUs so we don't think we had an instant heart + * attack. + */ + for (CPU_INFO_FOREACH(cii, ci)) { + ci->ci_heartbeat_count = 0; + ci->ci_heartbeat_uptime_cache = time_uptime; + ci->ci_heartbeat_uptime_stamp = 0; + } + + /* + * Finally, kick it off by setting the periods so that + * heartbeat() will check for stopped heartbeats. + */ + atomic_store_relaxed(&heartbeat_max_period_secs, 15); + atomic_store_relaxed(&heartbeat_max_period_ticks, 15*hz); +} + +/* + * defibrillator(cookie) + * + * IPI handler for defibrillation. If the CPU's heart has stopped + * beating normally, but the CPU can still execute things, + * acknowledge the IPI to the doctor and then panic so we at least + * get a stack trace from whatever the current CPU is stuck doing, + * if not a core dump. + * + * (This metaphor is a little stretched, since defibrillation is + * usually administered when the heart _hasn't_ stopped beating, + * and causes the heart to stop temporarily, and one hopes it is + * not fatal. But we're (software) engineers, so we can stretch + * metaphors like silly putty in a blender.) + */ +static void +defibrillator(void *cookie) +{ + bool *ack = cookie; + + atomic_store_relaxed(ack, true); + panic("%s[%d %s]: heart stopped beating\n", cpu_name(curcpu()), + curlwp->l_lid, + curlwp->l_name ? curlwp->l_name : curproc->p_comm); +} + +/* + * defibrillate(ci) + * + * The patient CPU ci's heart has stopped beating. Force the + * patient CPU ci to panic, or panic on this CPU if the patient + * CPU doesn't respond within 1sec. 
+ */ +static void __noinline +defibrillate(struct cpu_info *ci) +{ + bool ack = false; + ipi_msg_t msg = { + .func = &defibrillator, + .arg = &ack, + }; + unsigned countdown = 1000; /* 1sec */ + + KASSERT(kpreempt_disabled()); + + /* + * First notify the console that the patient CPU's heart seems + * to have stopped beating. + */ + printf("%s: found %s heart stopped beating\n", + cpu_name(curcpu()), cpu_name(ci)); + + /* + * Next, give the patient CPU a chance to panic, so we get a + * stack trace on that CPU even if we don't get a crash dump. + */ + ipi_unicast(&msg, ci); + + /* + * Busy-wait up to 1sec for the patient CPU to print a stack + * trace and panic. If the patient CPU acknowledges the IPI, + * or if we're panicking anyway, just give up and stop here -- + * the system is coming down soon and we should avoid getting + * in the way. + */ + while (countdown --> 0) { + if (atomic_load_relaxed(&ack) || + atomic_load_relaxed(&panicstr) != NULL) + return; + DELAY(1000); /* 1ms */ + } + + /* + * The patient CPU failed to acknowledge the panic request. + * Panic now; with any luck, we'll get a crash dump. + */ + panic("%s: found %s heart stopped beating and unresponsive\n", + cpu_name(curcpu()), cpu_name(ci)); +} + +/* + * select_patient() + * + * Select another CPU to check the heartbeat of. Returns NULL if + * there are no other online CPUs. Never returns curcpu(). + * Caller must have kpreemption disabled. + */ +static struct cpu_info * +select_patient(void) +{ + CPU_INFO_ITERATOR cii; + struct cpu_info *first = NULL, *patient = NULL, *ci; + bool passedcur = false; + + KASSERT(kpreempt_disabled()); + + /* + * In the iteration order of all CPUs, find the next online CPU + * after curcpu(), or the first online one if curcpu() is last + * in the iteration order. + */ + for (CPU_INFO_FOREACH(cii, ci)) { + if (ci->ci_schedstate.spc_flags & SPCF_OFFLINE) { + continue; + } + if (passedcur) { + /* + * (...|curcpu()|ci|...) 
+ * + * Found the patient right after curcpu(). + */ + KASSERT(patient != ci); + patient = ci; + break; + } + if (ci == curcpu()) { + /* + * (...|prev|ci=curcpu()|next|...) + * + * Note that we want next (or first, if there's + * nothing after curcpu()). + */ + passedcur = true; + continue; + } + if (first == NULL) { + /* + * (ci|...|curcpu()|...) + * + * Record ci as first in case there's nothing + * after curcpu(). + */ + first = ci; + continue; + } + } + + /* + * If we hit the end, wrap around to the beginning. + */ + if (patient == NULL) { + KASSERT(passedcur); + patient = first; + } + + return patient; +} + +/* + * heartbeat() + * + * 1. Count a heartbeat on the local CPU. + * + * 2. Panic if the system uptime doesn't seem to have advanced in + * a while. + * + * 3. Select another CPU to check the heartbeat of. + * + * 4. Panic if the other CPU doesn't seem to have noticed the + * system uptime advancing in a while. + */ +void +heartbeat(void) +{ + unsigned period_ticks, period_secs; + unsigned count, uptime, cache, stamp, d; + struct cpu_info *patient; + + KASSERT(kpreempt_disabled()); + + period_ticks = atomic_load_relaxed(&heartbeat_max_period_ticks); + period_secs = atomic_load_relaxed(&heartbeat_max_period_secs); + if (__predict_false(period_ticks == 0) || + __predict_false(period_secs == 0)) + return; + + /* + * Count a heartbeat on this CPU. + */ + count = curcpu()->ci_heartbeat_count++; + + /* + * If the uptime hasn't changed, make sure that we haven't + * counted too many of our own heartbeats since the uptime last + * changed, and stop here -- we only do the cross-CPU work once + * per second. + */ + uptime = atomic_load_relaxed(&time_uptime); + cache = atomic_load_relaxed(&curcpu()->ci_heartbeat_uptime_cache); + if (__predict_true(cache == uptime)) { + /* + * Timecounter hasn't advanced by more than a second. + * Make sure the timecounter isn't stuck according to + * our heartbeats. 
+ * + * Our own heartbeat count can't roll back, and + * time_uptime should be updated before it wraps + * around, so d should never go negative; hence no + * check for d < UINT_MAX/2. + */ + stamp = + atomic_load_relaxed(&curcpu()->ci_heartbeat_uptime_stamp); + d = count - stamp; + if (__predict_false(d > period_ticks)) { + panic("%s: time has not advanced in %u heartbeats", + cpu_name(curcpu()), d); + } + return; + } + + /* + * If the uptime has changed, make sure that it hasn't changed + * so much that softints must be stuck on this CPU. Since + * time_uptime is monotonic, this can't go negative. + * + * This uses the hard timer interrupt handler on the current + * CPU to ensure soft interrupts at all priority levels have + * made progress. + */ + d = uptime - cache; + if (__predict_false(d > period_secs)) { + panic("%s: softints stuck for %u seconds", + cpu_name(curcpu()), d); + } + + /* + * Schedule a softint to update our cache of the system uptime + * so the next call to heartbeat, on this or another CPU, can + * detect progress on this one. + */ + softint_schedule(heartbeat_sih); + + /* + * Select a patient to check the heartbeat of. If there's no + * other online CPU, nothing to do. + */ + patient = select_patient(); + if (patient == NULL) + return; + + /* + * Verify that time is advancing on the patient CPU. If the + * delta exceeds UINT_MAX/2, that means it is already ahead by + * a little on the other CPU, and the subtraction went + * negative, which is OK. If the CPU has been + * offlined since we selected it, no worries. + * + * This uses the current CPU to ensure the other CPU has made + * progress, even if the other CPU's hard timer interrupt + * handler is stuck for some reason. + * + * XXX Maybe confirm it hasn't gone negative by more than + * max_period? 
+ */ + d = uptime - atomic_load_relaxed(&patient->ci_heartbeat_uptime_cache); + if (__predict_false(d > period_secs) && + __predict_false(d < UINT_MAX/2) && + ((patient->ci_schedstate.spc_flags & SPCF_OFFLINE) == 0)) + defibrillate(patient); +} + +/* + * heartbeat_dump() + * + * Print the heartbeat data of all CPUs. Can be called from ddb. + */ +#ifdef DDB +static unsigned +db_read_unsigned(const unsigned *p) +{ + unsigned x; + + db_read_bytes((db_addr_t)p, sizeof(x), (char *)&x); + + return x; +} + +void +heartbeat_dump(void) +{ + struct cpu_info *ci; + + db_printf("Heartbeats:\n"); + for (ci = db_cpu_first(); ci != NULL; ci = db_cpu_next(ci)) { + db_printf("cpu%u: count %u uptime %u stamp %u\n", + db_read_unsigned(&ci->ci_index), + db_read_unsigned(&ci->ci_heartbeat_count), + db_read_unsigned(&ci->ci_heartbeat_uptime_cache), + db_read_unsigned(&ci->ci_heartbeat_uptime_stamp)); + } +} +#endif diff --git a/sys/sys/cpu_data.h b/sys/sys/cpu_data.h index 4c2c6fe3d760..9cf2622b5e31 100644 --- a/sys/sys/cpu_data.h +++ b/sys/sys/cpu_data.h @@ -188,6 +188,11 @@ struct cpu_data { kcpuset_t *cpu_kcpuset; /* kcpuset_t of this cpu only */ struct lwp * volatile cpu_pcu_curlwp[PCU_UNIT_COUNT]; int64_t cpu_counts[CPU_COUNT_MAX];/* per-CPU counts */ + + unsigned cpu_heartbeat_count; /* # of heartbeats */ + unsigned cpu_heartbeat_uptime_cache; /* last time_uptime */ + unsigned cpu_heartbeat_uptime_stamp; /* heartbeats since + * uptime changed */ }; #define ci_schedstate ci_data.cpu_schedstate @@ -216,6 +221,10 @@ struct cpu_data { #define ci_faultrng ci_data.cpu_faultrng #define ci_counts ci_data.cpu_counts +#define ci_heartbeat_count ci_data.cpu_heartbeat_count +#define ci_heartbeat_uptime_cache ci_data.cpu_heartbeat_uptime_cache +#define ci_heartbeat_uptime_stamp ci_data.cpu_heartbeat_uptime_stamp + #define cpu_nsyscall cpu_counts[CPU_COUNT_NSYSCALL] #define cpu_ntrap cpu_counts[CPU_COUNT_NTRAP] #define cpu_nswtch cpu_counts[CPU_COUNT_NSWTCH] diff --git a/sys/sys/heartbeat.h 
b/sys/sys/heartbeat.h new file mode 100644 index 000000000000..44ee28d97a85 --- /dev/null +++ b/sys/sys/heartbeat.h @@ -0,0 +1,42 @@ +/* $NetBSD$ */ + +/*- + * Copyright (c) 2023 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _SYS_HEARTBEAT_H +#define _SYS_HEARTBEAT_H + +#ifndef _KERNEL +#error No user-serviceable parts in this heart. +#endif + +void heartbeat_start(void); + +void heartbeat(void); + +void heartbeat_dump(void); + +#endif /* _SYS_HEARTBEAT_H */