From db9063b18328343cfa76aa2dd6df33fce89e7304 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Sun, 19 Feb 2023 21:03:19 +0000 Subject: [PATCH 01/16] aarch64: Add missing barriers in cpu_switchto. Details in comments. Note: This is a conservative change that inserts a barrier where there was a comment saying none is needed, which is probably correct. The goal of this change is to systematically insert barriers to be confident in correctness; a subsequent change will remove some, as an optimization, with an explanation of why the barrier is not needed. PR kern/57240 XXX pullup-9 XXX pullup-10 --- sys/arch/aarch64/aarch64/cpuswitch.S | 44 ++++++++++++++++++++++++++-- sys/arch/aarch64/aarch64/locore.S | 4 +++ 2 files changed, 45 insertions(+), 3 deletions(-) diff --git a/sys/arch/aarch64/aarch64/cpuswitch.S b/sys/arch/aarch64/aarch64/cpuswitch.S index 6fd1ab26bf57..38f5f297b374 100644 --- a/sys/arch/aarch64/aarch64/cpuswitch.S +++ b/sys/arch/aarch64/aarch64/cpuswitch.S @@ -125,8 +125,44 @@ ENTRY_NP(cpu_switchto) msr tpidr_el1, x1 /* switch curlwp to new lwp */ ldr x3, [x1, #L_CPU] + + /* + * Issue store-before-store barriers around the store that sets + * ci->ci_curlwp so that any prior mutex_exit is visible to any + * thread that witnesses this store, and this store is visible + * to any thread that could witness a subsequent store to + * mtx->mtx_owner in mutex_exit. These serve as + * membar_producers that match the membar_consumers in + * mutex_vector_enter. + * + * See kern_mutex.c for details -- this is necessary for + * adaptive mutexes to detect whether the lwp is on the CPU in + * order to safely block without requiring atomic r/m/w in + * mutex_exit. + * + * Note that on aarch64, after updating ci->ci_curlwp, we don't + * need to issue a second membar_producer, to match the + * membar_consumer between mutex_oncpu(owner) and + * MUTEX_HAS_WAITERS(mtx), because mutex_exit itself issues a + * stronger barrier before releasing the mutex in one of three + * paths, which are nonpreemptible: + * + * - the mutex_exit stub in lock_stubs.S, which uses + * a single STLXR store-release instruction; + * - the mutex_vector_exit path under #ifdef LOCKDEBUG + * which does MUTEX_RELEASE (i.e., membar_release(); + * store mtx->mtx_owner = 0) at splhigh, blocking + * preemption; or + * - the mutex_vector_exit path which does MUTEX_RELEASE + * under the turnstile lock, blocking preemption. + * + * But just to be on the safe side let's issue both for now, + * and remove the second one only after testing. 
+	 */
+	dmb	ishst		/* membar_producer */
 	str	x1, [x3, #CI_CURLWP]	/* switch curlwp to new lwp */
-	dmb	ishst			/* see comments in kern_mutex.c */
+	dmb	ishst		/* membar_producer */
+
 
 	ENABLE_INTERRUPT
 
 	/*
@@ -201,8 +237,9 @@ ENTRY_NP(cpu_switchto_softint)
 	/* onto new stack */
 	sub	sp, x4, #TF_SIZE	/* new sp := softlwp->l_md_utf - 1 */
 	msr	tpidr_el1, x0		/* curlwp = softlwp; */
+	dmb	ishst		/* for mutex_enter; see cpu_switchto */
 	str	x0, [x20, #CI_CURLWP]	/* curcpu()->ci_curlwp = softlwp; */
-	/* no need for memory barrier here */
+	dmb	ishst		/* for mutex_enter; see cpu_switchto */
 
 	mov	x5, #CPACR_FPEN_NONE
 	msr	cpacr_el1, x5		/* cpacr_el1 = CPACR_FPEN_NONE */
@@ -244,8 +281,9 @@ ENTRY_NP(cpu_switchto_softint)
 	DISABLE_INTERRUPT
 	msr	tpidr_el1, x19		/* curlwp = pinned_lwp */
 	ldr	x3, [x19, #L_CPU]	/* x3 = curlwp->l_cpu */
+	dmb	ishst		/* for mutex_enter; see cpu_switchto */
 	str	x19, [x3, #CI_CURLWP]	/* curlwp->l_cpu->ci_curlwp := x19 */
-	dmb	ishst			/* see comments in kern_mutex.c */
+	dmb	ishst		/* for mutex_enter; see cpu_switchto */
 
 	mov	sp, x4			/* restore pinned_lwp sp */
 	msr	cpacr_el1, x5		/* restore pinned_lwp cpacr */
diff --git a/sys/arch/aarch64/aarch64/locore.S b/sys/arch/aarch64/aarch64/locore.S
index b72be4ce7dc8..1c0e57d7c9cd 100644
--- a/sys/arch/aarch64/aarch64/locore.S
+++ b/sys/arch/aarch64/aarch64/locore.S
@@ -529,6 +529,10 @@ mp_vstart:
 	 */
 	ldr	x1, [x0, #CI_IDLELWP]	/* x0 = curcpu()->ci_idlelwp */
 	msr	tpidr_el1, x1		/* tpidr_el1 = curlwp = x1 */
+	/*
+	 * No membar needed because we're not switching from a
+	 * previous lwp; see cpu_switchto.
+	 */
 	str	x1, [x0, #CI_CURLWP]	/* curlwp is idlelwp */
 
 	/* get my stack from lwp */

From 6b4a7e27da9f5588f1ac4a828cab9517b96e8e42 Mon Sep 17 00:00:00 2001
From: Taylor R Campbell
Date: Sun, 19 Feb 2023 21:34:18 +0000
Subject: [PATCH 02/16] alpha: Add missing barriers in cpu_switchto.

Details in comments.

PR kern/57240

XXX pullup-8
XXX pullup-9
XXX pullup-10
---
 sys/arch/alpha/include/asm.h | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/sys/arch/alpha/include/asm.h b/sys/arch/alpha/include/asm.h
index 910167763f38..c5ca590f58b8 100644
--- a/sys/arch/alpha/include/asm.h
+++ b/sys/arch/alpha/include/asm.h
@@ -669,10 +669,32 @@ label: ASCIZ msg; \
 #define	GET_CURLWP \
 	call_pal PAL_OSF1_rdval
 
+/*
+ * Issue store-before-store barriers around the store that sets
+ * ci->ci_curlwp so that any prior mutex_exit is visible to any
+ * thread that witnesses this store, and this store is visible
+ * to any thread that could witness a subsequent store to
+ * mtx->mtx_owner in mutex_exit.  These serve as
+ * membar_producers that match the membar_consumers in
+ * mutex_vector_enter.
+ *
+ * See kern_mutex.c for details -- this is necessary for
+ * adaptive mutexes to detect whether the lwp is on the CPU in
+ * order to safely block without requiring atomic r/m/w in
+ * mutex_exit.
+ *
+ * Note: In alpha_softint_switchto, we could potentially elide
+ * the second barrier -- when we enter a softint lwp, it can't
+ * be holding any mutexes, so it can't release any until after
+ * it has acquired them, so we need not participate in the
+ * protocol with mutex_vector_enter barriers here.
+ */
 #define	SET_CURLWP(r) \
 	ldq	v0, L_CPU(r)	; \
 	mov	r, a0	; \
+	wmb /* XXX patch out if !MP? */	; \
 	stq	r, CPU_INFO_CURLWP(v0)	; \
+	wmb /* XXX patch out if !MP? */	; \
 	call_pal PAL_OSF1_wrval
 
#else	/* if not MULTIPROCESSOR...
*/ From 82d5a22441b3cd7b99c2dd4ba9e53c2fedf1db94 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Sun, 19 Feb 2023 21:40:32 +0000 Subject: [PATCH 03/16] arm32: Add missing barriers in cpu_switchto. Details in comments. PR kern/57240 XXX pullup-8 XXX pullup-9 XXX pullup-10 --- sys/arch/arm/arm/armv6_start.S | 4 ++++ sys/arch/arm/arm32/cpuswitch.S | 32 +++++++++++++++++++++++++++++--- 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/sys/arch/arm/arm/armv6_start.S b/sys/arch/arm/arm/armv6_start.S index 150704305a72..2133bbb9e2b7 100644 --- a/sys/arch/arm/arm/armv6_start.S +++ b/sys/arch/arm/arm/armv6_start.S @@ -943,6 +943,10 @@ armv7_mpcontinuation: #else #error either TPIDRPRW_IS_CURCPU or TPIDRPRW_IS_CURLWP must be defined #endif + /* + * No membar needed because we're not switching from a + * previous lwp; see cpu_switchto. + */ str r6, [r5, #CI_CURLWP] // and note we are running on it mov r0, r5 // pass cpu_info diff --git a/sys/arch/arm/arm32/cpuswitch.S b/sys/arch/arm/arm32/cpuswitch.S index 1ffda42dbd86..fa1f6fb90264 100644 --- a/sys/arch/arm/arm32/cpuswitch.S +++ b/sys/arch/arm/arm32/cpuswitch.S @@ -189,11 +189,28 @@ ENTRY(cpu_switchto) mcr p15, 0, r6, c13, c0, 4 /* set current lwp */ #endif + /* + * Issue store-before-store barriers around the store that sets + * ci->ci_curlwp so that any prior mutex_exit is visible to any + * thread that witnesses this store, and this store is visible + * to any thread that could witness a subsequent store to + * mtx->mtx_owner in mutex_exit. These serve as + * membar_producers that match the membar_consumers in + * mutex_vector_enter. + * + * See kern_mutex.c for details -- this is necessary for + * adaptive mutexes to detect whether the lwp is on the CPU in + * order to safely block without requiring atomic r/m/w in + * mutex_exit. + */ + /* We have a new curlwp now so make a note of it */ +#ifdef _ARM_ARCH_7 + dmb +#endif str r6, [r5, #(CI_CURLWP)] - #ifdef _ARM_ARCH_7 - dmb /* see comments in kern_mutex.c */ + dmb #endif /* Get the new pcb */ @@ -391,10 +408,13 @@ ENTRY_NP(softint_switch) */ #if defined(TPIDRPRW_IS_CURLWP) mcr p15, 0, r5, c13, c0, 4 /* save new lwp */ +#endif +#ifdef _ARM_ARCH_7 + dmb /* for mutex_enter; see cpu_switchto */ #endif str r5, [r7, #(CI_CURLWP)] /* save new lwp */ #ifdef _ARM_ARCH_7 - dmb /* see comments in kern_mutex.c */ + dmb /* for mutex_enter; see cpu_switchto */ #endif #ifdef KASAN @@ -427,8 +447,14 @@ ENTRY_NP(softint_switch) IRQdisable #if defined(TPIDRPRW_IS_CURLWP) mcr p15, 0, r4, c13, c0, 4 /* restore pinned lwp */ +#endif +#ifdef _ARM_ARCH_7 + dmb /* for mutex_enter; see cpu_switchto */ #endif str r4, [r7, #(CI_CURLWP)] /* restore pinned lwp */ +#ifdef _ARM_ARCH_7 + dmb /* for mutex_enter; see cpu_switchto */ +#endif ldr sp, [r2, #(PCB_KSP)] /* now running on the old stack. */ /* At this point we can allow IRQ's again. */ From bcaa92506ea131d5d9fbaf90d101489efc3ff73d Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Mon, 20 Feb 2023 12:16:59 +0000 Subject: [PATCH 04/16] ia64: Add missing barriers in cpu_switchto. (ia64 has never really worked, so no pullups needed, right?) 
PR kern/57240 --- sys/arch/ia64/ia64/machdep.c | 3 +++ sys/arch/ia64/ia64/vm_machdep.c | 21 +++++++++++++++++++-- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/sys/arch/ia64/ia64/machdep.c b/sys/arch/ia64/ia64/machdep.c index e94ae727da66..12d54eb03d55 100644 --- a/sys/arch/ia64/ia64/machdep.c +++ b/sys/arch/ia64/ia64/machdep.c @@ -638,6 +638,9 @@ ia64_init(void) /* * Initialise process context. XXX: This should really be in cpu_switchto + * + * No membar needed because we're not switching from a + * previous lwp; see cpu_switchto. */ ci->ci_curlwp = &lwp0; diff --git a/sys/arch/ia64/ia64/vm_machdep.c b/sys/arch/ia64/ia64/vm_machdep.c index 25592ffbb109..4638ac033b29 100644 --- a/sys/arch/ia64/ia64/vm_machdep.c +++ b/sys/arch/ia64/ia64/vm_machdep.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -77,9 +78,25 @@ cpu_switchto(lwp_t *oldlwp, lwp_t *newlwp, bool returning) register uint64_t reg9 __asm("r9"); KASSERT(newlwp != NULL); - + + /* + * Issue store-before-store barriers around the store that sets + * ci->ci_curlwp so that any prior mutex_exit is visible to any + * thread that witnesses this store, and this store is visible + * to any thread that could witness a subsequent store to + * mtx->mtx_owner in mutex_exit. These serve as + * membar_producers that match the membar_consumers in + * mutex_vector_enter. + * + * See kern_mutex.c for details -- this is necessary for + * adaptive mutexes to detect whether the lwp is on the CPU in + * order to safely block without requiring atomic r/m/w in + * mutex_exit. + */ + membar_producer(); ci->ci_curlwp = newlwp; - + membar_producer(); + /* required for lwp_startup, copy oldlwp into r9, "mov r9=in0" */ __asm __volatile("mov %0=%1" : "=r"(reg9) : "r"(oldlwp)); From 2b1492ceb2e686199b28a705b9017ef80c10e360 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Sun, 19 Feb 2023 16:17:04 +0000 Subject: [PATCH 05/16] mips: Add missing barriers in cpu_switchto. Otherwise adaptive mutexes don't work -- not reliably, anyway. Details in comments. PR kern/57240 XXX pullup-8 XXX pullup-9 XXX pullup-10 --- sys/arch/evbmips/ingenic/cpu_startup.S | 4 ++++ sys/arch/mips/include/asm.h | 12 ++++++++++++ sys/arch/mips/mips/locore.S | 21 +++++++++++++++++++++ sys/arch/mips/mips/locore_mips3.S | 4 ++++ 4 files changed, 41 insertions(+) diff --git a/sys/arch/evbmips/ingenic/cpu_startup.S b/sys/arch/evbmips/ingenic/cpu_startup.S index 31abcd1671a3..e34ad1da7924 100644 --- a/sys/arch/evbmips/ingenic/cpu_startup.S +++ b/sys/arch/evbmips/ingenic/cpu_startup.S @@ -56,6 +56,10 @@ NESTED_NOPROFILE(ingenic_trampoline, 0, ra) nop beqz MIPS_CURLWP, 1b nop + /* + * No membar needed because we're not switching from a + * previous lwp; see cpu_switchto. + */ PTR_S MIPS_CURLWP, CPU_INFO_CURLWP(a0) li v0, 0 diff --git a/sys/arch/mips/include/asm.h b/sys/arch/mips/include/asm.h index e896114a5939..30779359bbba 100644 --- a/sys/arch/mips/include/asm.h +++ b/sys/arch/mips/include/asm.h @@ -610,6 +610,18 @@ _C_LABEL(x): #define SYNC_PLUNGER /* nothing */ #endif +/* + * Store-before-store and load-before-load barriers. These could be + * made weaker than release (load/store-before-store) and acquire + * (load-before-load/store) barriers, and newer MIPS does have + * instruction encodings for finer-grained barriers like this, but I + * dunno how to appropriately conditionalize their use or get the + * assembler to be happy with them, so we'll use these definitions for + * now. 
+ */ +#define SYNC_PRODUCER SYNC_REL +#define SYNC_CONSUMER SYNC_ACQ + /* CPU dependent hook for cp0 load delays */ #if defined(MIPS1) || defined(MIPS2) || defined(MIPS3) #define MFC0_HAZARD sll $0,$0,1 /* super scalar nop */ diff --git a/sys/arch/mips/mips/locore.S b/sys/arch/mips/mips/locore.S index 87b8e6ed0f73..477f3fd38fb3 100644 --- a/sys/arch/mips/mips/locore.S +++ b/sys/arch/mips/mips/locore.S @@ -286,7 +286,24 @@ NESTED(cpu_switchto, CALLFRAME_SIZ, ra) PTR_L t2, L_CPU(MIPS_CURLWP) nop # patchable load delay slot + + /* + * Issue store-before-store barriers around the store that sets + * ci->ci_curlwp so that any prior mutex_exit is visible to any + * thread that witnesses this store, and this store is visible + * to any thread that could witness a subsequent store to + * mtx->mtx_owner in mutex_exit. These serve as + * membar_producers that match the membar_consumers in + * mutex_vector_enter. + * + * See kern_mutex.c for details -- this is necessary for + * adaptive mutexes to detect whether the lwp is on the CPU in + * order to safely block without requiring atomic r/m/w in + * mutex_exit. + */ + SYNC_PRODUCER /* XXX fixup to nop for uniprocessor boot */ PTR_S MIPS_CURLWP, CPU_INFO_CURLWP(t2) + SYNC_PRODUCER /* XXX fixup to nop for uniprocessor boot */ /* Check for restartable atomic sequences (RAS) */ PTR_L a0, L_PROC(MIPS_CURLWP) # argument to ras_lookup @@ -406,7 +423,9 @@ NESTED(softint_fast_dispatch, CALLFRAME_SIZ, ra) move MIPS_CURLWP, a0 # switch to softint lwp PTR_L s1, L_CPU(MIPS_CURLWP) # get curcpu() nop # patchable load delay slot + SYNC_PRODUCER /* XXX fixup */ /* for mutex_enter; see cpu_switchto */ PTR_S MIPS_CURLWP, CPU_INFO_CURLWP(s1) # ... + SYNC_PRODUCER /* XXX fixup */ /* for mutex_enter; see cpu_switchto */ move s2, sp # remember sp move s3, t0 # remember curpcb @@ -417,7 +436,9 @@ NESTED(softint_fast_dispatch, CALLFRAME_SIZ, ra) move sp, s2 # restore stack move MIPS_CURLWP, s0 # restore curlwp + SYNC_PRODUCER /* XXX fixup */ /* for mutex_enter; see cpu_switchto */ PTR_S MIPS_CURLWP, CPU_INFO_CURLWP(s1) # .... + SYNC_PRODUCER /* XXX fixup */ /* for mutex_enter; see cpu_switchto */ REG_L ra, CALLFRAME_RA(sp) # load early since we use it diff --git a/sys/arch/mips/mips/locore_mips3.S b/sys/arch/mips/mips/locore_mips3.S index 7b791f917bb8..ea67592f0fe0 100644 --- a/sys/arch/mips/mips/locore_mips3.S +++ b/sys/arch/mips/mips/locore_mips3.S @@ -809,6 +809,10 @@ NESTED_NOPROFILE(cpu_trampoline, 0, ra) nop beqz MIPS_CURLWP, 1b nop + /* + * No membar needed because we're not switching from a + * previous lwp; see cpu_switchto. + */ PTR_S MIPS_CURLWP, CPU_INFO_CURLWP(a0) #ifdef _LP64 From 524e6adc6da220767a6ff5b061b84002f24883e3 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Sun, 19 Feb 2023 21:36:20 +0000 Subject: [PATCH 06/16] powerpc: Add missing barriers in cpu_switchto. Details in comments. 
PR kern/57240 XXX pullup-8 XXX pullup-9 XXX pullup-10 --- sys/arch/powerpc/powerpc/locore_subr.S | 33 ++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/sys/arch/powerpc/powerpc/locore_subr.S b/sys/arch/powerpc/powerpc/locore_subr.S index c910c10a38e5..18329449d933 100644 --- a/sys/arch/powerpc/powerpc/locore_subr.S +++ b/sys/arch/powerpc/powerpc/locore_subr.S @@ -215,7 +215,28 @@ ENTRY(cpu_switchto) */ GET_CPUINFO(%r7) + + /* + * Issue store-before-store barriers around the store that sets + * ci->ci_curlwp so that any prior mutex_exit is visible to any + * thread that witnesses this store, and this store is visible + * to any thread that could witness a subsequent store to + * mtx->mtx_owner in mutex_exit. These serve as + * membar_producers that match the membar_consumers in + * mutex_vector_enter. + * + * See kern_mutex.c for details -- this is necessary for + * adaptive mutexes to detect whether the lwp is on the CPU in + * order to safely block without requiring atomic r/m/w in + * mutex_exit. + */ +#ifdef MULTIPROCESSOR + sync /* XXX use eieio if available -- cheaper */ +#endif stptr %r31,CI_CURLWP(%r7) +#ifdef MULTIPROCESSOR + sync /* XXX use eieio if available -- cheaper */ +#endif mr %r13,%r31 #ifdef PPC_BOOKE mtsprg2 %r31 /* save curlwp in sprg2 */ @@ -389,7 +410,13 @@ _ENTRY(softint_fast_dispatch) * to a kernel thread */ +#ifdef MULTIPROCESSOR + sync /* XXX eieio */ /* for mutex_enter; see cpu_switchto */ +#endif stptr %r3, CI_CURLWP(%r7) +#ifdef MULTIPROCESSOR + sync /* XXX eieio */ /* for mutex_enter; see cpu_switchto */ +#endif mr %r13, %r3 #ifdef PPC_BOOKE mtsprg2 %r3 @@ -423,7 +450,13 @@ _ENTRY(softint_fast_dispatch) #endif GET_CPUINFO(%r7) +#ifdef MULTIPROCESSOR + sync /* XXX eieio */ /* for mutex_enter; see cpu_switchto */ +#endif stptr %r30, CI_CURLWP(%r7) +#ifdef MULTIPROCESSOR + sync /* XXX eieio */ /* for mutex_enter; see cpu_switchto */ +#endif mr %r13, %r30 #ifdef PPC_BOOKE mtsprg2 %r30 From 98ce85ef0f55b783e83de6e79788b07fc5b5433e Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Sun, 19 Feb 2023 21:37:53 +0000 Subject: [PATCH 07/16] riscv: Add missing barriers in cpu_switchto. Details in comments. PR kern/57240 --- sys/arch/riscv/riscv/cpu_switch.S | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/sys/arch/riscv/riscv/cpu_switch.S b/sys/arch/riscv/riscv/cpu_switch.S index acb023b3b9d9..0d7f8236399f 100644 --- a/sys/arch/riscv/riscv/cpu_switch.S +++ b/sys/arch/riscv/riscv/cpu_switch.S @@ -62,7 +62,33 @@ ENTRY_NP(cpu_switchto) mv tp, a1 // # put the new lwp in thread pointer PTR_L t1, L_CPU(tp) // # get curcpu + + /* + * Issue store-before-store barriers around the store that sets + * ci->ci_curlwp so that any prior mutex_exit is visible to any + * thread that witnesses this store, and this store is visible + * to any thread that could witness a subsequent store to + * mtx->mtx_owner in mutex_exit. These serve as + * membar_producers that match the membar_consumers in + * mutex_vector_enter. + * + * See kern_mutex.c for details -- this is necessary for + * adaptive mutexes to detect whether the lwp is on the CPU in + * order to safely block without requiring atomic r/m/w in + * mutex_exit. + * + * With an appropriate definition of mutex_exit, if it issues a + * barrier and sets mtx->mtx_owner = 0 in a single atomic + * store-release instruction, the second barrier could be + * unnecessary. 
+ * + * But just to be on the safe side let's issue both for now, + * and remove the second one only after testing -- and after + * implementing an appropriate mutex_exit stub. + */ + fence w,w PTR_S tp, CI_CURLWP(t1) // # update curcpu with the new curlwp + fence w,w REG_L sp, L_MD_KTF(tp) // # load its kernel stack pointer REG_L t4, TF_SR(sp) // # fetch status register @@ -154,14 +180,18 @@ ENTRY_NP(cpu_fast_switchto) PTR_S sp, L_MD_KTF(tp) // save trapframe ptr in oldlwp mv tp, a0 // set thread pointer to newlwp + fence w,w // for mutex_enter; see cpu_switchto PTR_S tp, CI_CURLWP(t1) // update curlwp + fence w,w // for mutex_enter; see cpu_switchto PTR_L sp, L_MD_KTF(tp) // switch to its stack csrw sstatus, t0 // reenable interrupts call _C_LABEL(softint_dispatch) csrrci t0, sstatus, SR_SIE // disable interrupts PTR_L t1, L_CPU(tp) // get curcpu() again mv tp, s0 // return to pinned lwp + fence w,w // for mutex_enter; see cpu_switchto PTR_S tp, CI_CURLWP(t1) // restore curlwp + fence w,w // for mutex_enter; see cpu_switchto csrw sstatus, t0 // reenable interrupts mv sp, s1 // restore stack pointer From 401cca4fa1c9212296879c4ea1c346ac7009a30f Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Mon, 20 Feb 2023 00:14:39 +0000 Subject: [PATCH 08/16] sparc: Note where STBAR would go in cpu_switchto for non-TSO. No functional change. PR kern/57240 --- sys/arch/sparc/sparc/locore.s | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/sys/arch/sparc/sparc/locore.s b/sys/arch/sparc/sparc/locore.s index 5d7edbec3461..ffdec4231f1e 100644 --- a/sys/arch/sparc/sparc/locore.s +++ b/sys/arch/sparc/sparc/locore.s @@ -4889,7 +4889,17 @@ Lwb1: SAVE; SAVE; SAVE; SAVE; SAVE; SAVE; /* 6 of each: */ /* set new cpcb, and curlwp */ sethi %hi(curlwp), %l7 st %g5, [%l6 + %lo(cpcb)] ! cpcb = newpcb; + /* + * No membar needed because we run in TSO. But if we didn't, + * we would need two STBARs, one before and one after this + * store; see kern_mutex.c for details. If we had fast + * softints, we would also need one before setting curlwp to + * the softint lwp, and two on either side of setting curlwp + * back when the softint returns. + */ + /* stbar */ st %g3, [%l7 + %lo(curlwp)] ! curlwp = l; + /* stbar */ /* compute new wim */ ld [%g5 + PCB_WIM], %o0 From c077c7d27430c79db2d38d34cbca54c37039783d Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Sun, 19 Feb 2023 21:13:32 +0000 Subject: [PATCH 09/16] sparc64: Add missing barriers in cpu_switchto. Details in comments. PR kern/57240 XXX pullup-8 XXX pullup-9 XXX pullup-10 --- sys/arch/sparc64/sparc64/locore.s | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/sys/arch/sparc64/sparc64/locore.s b/sys/arch/sparc64/sparc64/locore.s index 17bf12f8b2cd..1e2abefe4643 100644 --- a/sys/arch/sparc64/sparc64/locore.s +++ b/sys/arch/sparc64/sparc64/locore.s @@ -6731,9 +6731,24 @@ ENTRY(cpu_switchto) * Load the new lwp. To load, we must change stacks and * alter cpcb and the window control registers, hence we must * keep interrupts disabled. + * + * Issue store-before-store barriers around the store that sets + * ci->ci_curlwp so that any prior mutex_exit is visible to any + * thread that witnesses this store, and this store is visible + * to any thread that could witness a subsequent store to + * mtx->mtx_owner in mutex_exit. These serve as + * membar_producers that match the membar_consumers in + * mutex_vector_enter. 
+ * + * See kern_mutex.c for details -- this is necessary for + * adaptive mutexes to detect whether the lwp is on the CPU in + * order to safely block without requiring atomic r/m/w in + * mutex_exit. */ + membar #StoreStore STPTR %i1, [%l7 + %lo(CURLWP)] ! curlwp = l; + membar #StoreStore STPTR %l1, [%l6 + %lo(CPCB)] ! cpcb = newpcb; ldx [%l1 + PCB_SP], %i6 @@ -6826,7 +6841,9 @@ ENTRY(softint_fastintr) sethi %hi(USPACE - TF_SIZE - CC64FSZ - STKB), %o3 LDPTR [%i0 + L_PCB], %l1 ! l1 = softint pcb or %o3, %lo(USPACE - TF_SIZE - CC64FSZ - STKB), %o3 + membar #StoreStore /* for mutex_enter; see cpu_switchto */ STPTR %i0, [%l7 + %lo(CURLWP)] + membar #StoreStore /* for mutex_enter; see cpu_switchto */ add %l1, %o3, %i6 STPTR %l1, [%l6 + %lo(CPCB)] stx %i6, [%l1 + PCB_SP] @@ -6839,7 +6856,9 @@ ENTRY(softint_fastintr) /* switch back to interrupted lwp */ ldx [%l5 + PCB_SP], %i6 + membar #StoreStore /* for mutex_enter; see cpu_switchto */ STPTR %l0, [%l7 + %lo(CURLWP)] + membar #StoreStore /* for mutex_enter; see cpu_switchto */ STPTR %l5, [%l6 + %lo(CPCB)] restore ! rewind register window From 3e83512f8757037d129b05b8d8e686a3e3521398 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Tue, 21 Feb 2023 12:12:21 +0000 Subject: [PATCH 10/16] aarch64: Optimization: Omit needless membar when triggering softint. When we are triggering a softint, it can't already hold any mutexes. So any path to mutex_exit(mtx) must go via mutex_enter(mtx), which is always done with atomic r/m/w, and we need not issue any explicit barrier between ci->ci_curlwp = softlwp and a potential store to mtx->mtx_owner in mutex_exit. PR kern/57240 XXX pullup-9 XXX pullup-10 --- sys/arch/aarch64/aarch64/cpuswitch.S | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/sys/arch/aarch64/aarch64/cpuswitch.S b/sys/arch/aarch64/aarch64/cpuswitch.S index 38f5f297b374..daa0521e4180 100644 --- a/sys/arch/aarch64/aarch64/cpuswitch.S +++ b/sys/arch/aarch64/aarch64/cpuswitch.S @@ -239,7 +239,13 @@ ENTRY_NP(cpu_switchto_softint) msr tpidr_el1, x0 /* curlwp = softlwp; */ dmb ishst /* for mutex_enter; see cpu_switchto */ str x0, [x20, #CI_CURLWP] /* curcpu()->ci_curlwp = softlwp; */ - dmb ishst /* for mutex_enter; see cpu_switchto */ + /* + * No need for barrier after ci->ci_curlwp = softlwp -- when we + * enter a softint lwp, it can't be holding any mutexes, so it + * can't release any until after it has acquired them, so we + * need not participate in the protocol with mutex_vector_enter + * barriers here. + */ mov x5, #CPACR_FPEN_NONE msr cpacr_el1, x5 /* cpacr_el1 = CPACR_FPEN_NONE */ From 1f48be114e6f5da039e5b97b2b3cd612df362338 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Tue, 21 Feb 2023 11:25:43 +0000 Subject: [PATCH 11/16] aarch64: Optimization: Omit needless membar after setting ci->ci_curlwp. This works because the barrier-and-store in all mutex_exit paths is nonpreemptible, so we can't be switching to somewhere in the middle of it; all paths from setting ci->ci_curlwp = owner to setting mtx->mtx_owner = 0 are already separated by a sufficient barrier for the needs of mutex_vector_enter. 
PR kern/57240 XXX pullup-9 XXX pullup-10 --- sys/arch/aarch64/aarch64/cpuswitch.S | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/sys/arch/aarch64/aarch64/cpuswitch.S b/sys/arch/aarch64/aarch64/cpuswitch.S index daa0521e4180..b32619b9eec8 100644 --- a/sys/arch/aarch64/aarch64/cpuswitch.S +++ b/sys/arch/aarch64/aarch64/cpuswitch.S @@ -155,13 +155,10 @@ ENTRY_NP(cpu_switchto) * preemption; or * - the mutex_vector_exit path which does MUTEX_RELEASE * under the turnstile lock, blocking preemption. - * - * But just to be on the safe side let's issue both for now, - * and remove the second one only after testing. */ dmb ishst /* membar_producer */ str x1, [x3, #CI_CURLWP] /* switch curlwp to new lwp */ - dmb ishst /* membar_producer */ + /* no membar_producer needed */ ENABLE_INTERRUPT @@ -289,7 +286,7 @@ ENTRY_NP(cpu_switchto_softint) ldr x3, [x19, #L_CPU] /* x3 = curlwp->l_cpu */ dmb ishst /* for mutex_enter; see cpu_switchto */ str x19, [x3, #CI_CURLWP] /* curlwp->l_cpu->ci_curlwp := x19 */ - dmb ishst /* for mutex_enter; see cpu_switchto */ + /* no barrier; see cpu_switchto */ mov sp, x4 /* restore pinned_lwp sp */ msr cpacr_el1, x5 /* restore pinned_lwp cpacr */ From 37acad8fd7ac88fff3e70753e655c69c9d182b74 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Tue, 21 Feb 2023 00:07:48 +0000 Subject: [PATCH 12/16] arm32: Optimization: Omit needless membar when triggering softint. When we are triggering a softint, it can't already hold any mutexes. So any path to mutex_exit(mtx) must go via mutex_enter(mtx), which is always done with atomic r/m/w, and we need not issue any explicit barrier between ci->ci_curlwp = softlwp and a potential store to mtx->mtx_owner in mutex_exit. PR kern/57240 XXX pullup-8 XXX pullup-9 XXX pullup-10 --- sys/arch/arm/arm32/cpuswitch.S | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/sys/arch/arm/arm32/cpuswitch.S b/sys/arch/arm/arm32/cpuswitch.S index fa1f6fb90264..cf06de441d95 100644 --- a/sys/arch/arm/arm32/cpuswitch.S +++ b/sys/arch/arm/arm32/cpuswitch.S @@ -413,9 +413,13 @@ ENTRY_NP(softint_switch) dmb /* for mutex_enter; see cpu_switchto */ #endif str r5, [r7, #(CI_CURLWP)] /* save new lwp */ -#ifdef _ARM_ARCH_7 - dmb /* for mutex_enter; see cpu_switchto */ -#endif + /* + * No need for barrier after ci->ci_curlwp = softlwp -- when we + * enter a softint lwp, it can't be holding any mutexes, so it + * can't release any until after it has acquired them, so we + * need not participate in the protocol with mutex_vector_enter + * barriers here. + */ #ifdef KASAN mov r0, r5 From f03ba532bb19c7cfed8557c07b0f42006d798fbf Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Tue, 21 Feb 2023 12:15:35 +0000 Subject: [PATCH 13/16] mips: Optimization: Omit needless membar when triggering softint. When we are triggering a softint, it can't already hold any mutexes. So any path to mutex_exit(mtx) must go via mutex_enter(mtx), which is always done with atomic r/m/w, and we need not issue any explicit barrier between ci->ci_curlwp = softlwp and a potential store to mtx->mtx_owner in mutex_exit. 
PR kern/57240 XXX pullup-8 XXX pullup-9 XXX pullup-10 --- sys/arch/mips/mips/locore.S | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/sys/arch/mips/mips/locore.S b/sys/arch/mips/mips/locore.S index 477f3fd38fb3..168c1a091302 100644 --- a/sys/arch/mips/mips/locore.S +++ b/sys/arch/mips/mips/locore.S @@ -425,7 +425,13 @@ NESTED(softint_fast_dispatch, CALLFRAME_SIZ, ra) nop # patchable load delay slot SYNC_PRODUCER /* XXX fixup */ /* for mutex_enter; see cpu_switchto */ PTR_S MIPS_CURLWP, CPU_INFO_CURLWP(s1) # ... - SYNC_PRODUCER /* XXX fixup */ /* for mutex_enter; see cpu_switchto */ + /* + * No need for barrier after ci->ci_curlwp = softlwp -- when we + * enter a softint lwp, it can't be holding any mutexes, so it + * can't release any until after it has acquired them, so we + * need not participate in the protocol with mutex_vector_enter + * barriers here. + */ move s2, sp # remember sp move s3, t0 # remember curpcb From 9b43c62eaab693cd02b5fab8527eac9e92c69663 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Tue, 21 Feb 2023 12:16:35 +0000 Subject: [PATCH 14/16] powerpc: Optimization: Omit needless membar when triggering softint. When we are triggering a softint, it can't already hold any mutexes. So any path to mutex_exit(mtx) must go via mutex_enter(mtx), which is always done with atomic r/m/w, and we need not issue any explicit barrier between ci->ci_curlwp = softlwp and a potential store to mtx->mtx_owner in mutex_exit. PR kern/57240 XXX pullup-8 XXX pullup-9 XXX pullup-10 --- sys/arch/powerpc/powerpc/locore_subr.S | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/sys/arch/powerpc/powerpc/locore_subr.S b/sys/arch/powerpc/powerpc/locore_subr.S index 18329449d933..03d939c5dd0a 100644 --- a/sys/arch/powerpc/powerpc/locore_subr.S +++ b/sys/arch/powerpc/powerpc/locore_subr.S @@ -414,9 +414,13 @@ _ENTRY(softint_fast_dispatch) sync /* XXX eieio */ /* for mutex_enter; see cpu_switchto */ #endif stptr %r3, CI_CURLWP(%r7) -#ifdef MULTIPROCESSOR - sync /* XXX eieio */ /* for mutex_enter; see cpu_switchto */ -#endif + /* + * No need for barrier after ci->ci_curlwp = softlwp -- when we + * enter a softint lwp, it can't be holding any mutexes, so it + * can't release any until after it has acquired them, so we + * need not participate in the protocol with mutex_vector_enter + * barriers here. + */ mr %r13, %r3 #ifdef PPC_BOOKE mtsprg2 %r3 From 75d33a037163c379086d12fd27c1958bd2bb5fff Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Tue, 21 Feb 2023 12:17:16 +0000 Subject: [PATCH 15/16] riscv: Optimization: Omit needless membar when triggering softint. When we are triggering a softint, it can't already hold any mutexes. So any path to mutex_exit(mtx) must go via mutex_enter(mtx), which is always done with atomic r/m/w, and we need not issue any explicit barrier between ci->ci_curlwp = softlwp and a potential store to mtx->mtx_owner in mutex_exit. 
PR kern/57240 --- sys/arch/riscv/riscv/cpu_switch.S | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/sys/arch/riscv/riscv/cpu_switch.S b/sys/arch/riscv/riscv/cpu_switch.S index 0d7f8236399f..e8e0302f9985 100644 --- a/sys/arch/riscv/riscv/cpu_switch.S +++ b/sys/arch/riscv/riscv/cpu_switch.S @@ -182,7 +182,13 @@ ENTRY_NP(cpu_fast_switchto) mv tp, a0 // set thread pointer to newlwp fence w,w // for mutex_enter; see cpu_switchto PTR_S tp, CI_CURLWP(t1) // update curlwp - fence w,w // for mutex_enter; see cpu_switchto + /* + * No need for barrier after ci->ci_curlwp = softlwp -- when we + * enter a softint lwp, it can't be holding any mutexes, so it + * can't release any until after it has acquired them, so we + * need not participate in the protocol with mutex_vector_enter + * barriers here. + */ PTR_L sp, L_MD_KTF(tp) // switch to its stack csrw sstatus, t0 // reenable interrupts call _C_LABEL(softint_dispatch) From 62003311dcd6b8e6ce639e97f46610170df74622 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Tue, 21 Feb 2023 12:18:07 +0000 Subject: [PATCH 16/16] sparc64: Optimization: Omit needless membar when triggering softint. When we are triggering a softint, it can't already hold any mutexes. So any path to mutex_exit(mtx) must go via mutex_enter(mtx), which is always done with atomic r/m/w, and we need not issue any explicit barrier between ci->ci_curlwp = softlwp and a potential store to mtx->mtx_owner in mutex_exit. PR kern/57240 XXX pullup-8 XXX pullup-9 XXX pullup-10 --- sys/arch/sparc64/sparc64/locore.s | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/sys/arch/sparc64/sparc64/locore.s b/sys/arch/sparc64/sparc64/locore.s index 1e2abefe4643..95e8d25bf6ad 100644 --- a/sys/arch/sparc64/sparc64/locore.s +++ b/sys/arch/sparc64/sparc64/locore.s @@ -6843,7 +6843,13 @@ ENTRY(softint_fastintr) or %o3, %lo(USPACE - TF_SIZE - CC64FSZ - STKB), %o3 membar #StoreStore /* for mutex_enter; see cpu_switchto */ STPTR %i0, [%l7 + %lo(CURLWP)] - membar #StoreStore /* for mutex_enter; see cpu_switchto */ + /* + * No need for barrier after ci->ci_curlwp = softlwp -- when we + * enter a softint lwp, it can't be holding any mutexes, so it + * can't release any until after it has acquired them, so we + * need not participate in the protocol with mutex_vector_enter + * barriers here. + */ add %l1, %o3, %i6 STPTR %l1, [%l6 + %lo(CPCB)] stx %i6, [%l1 + PCB_SP]
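
Note (illustration added for review, not part of the patches above): every
comment in this series appeals to the same pairing -- the membar_producers
issued around the ci->ci_curlwp store in cpu_switchto match membar_consumers
in the adaptive-mutex spin path (mutex_vector_enter / mutex_oncpu) in
kern_mutex.c.  The following is only a rough C11 sketch of that pairing, not
the kern_mutex.c implementation: the struct layouts and the sketch_* helpers
are invented for the example, and C11 fences stand in for
membar_producer/membar_consumer and the machine-dependent barriers added
above (dmb ishst, wmb, sync, fence w,w, membar #StoreStore).

/*
 * Sketch only, under simplified assumptions about struct layout.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct cpu_info;

struct lwp {
	struct cpu_info *l_cpu;		/* CPU this lwp last ran on */
};

struct cpu_info {
	struct lwp *_Atomic ci_curlwp;	/* lwp currently on this CPU */
};

struct kmutex {
	struct lwp *_Atomic mtx_owner;	/* NULL when released */
};

/* Producer side: cpu_switchto switching this CPU to newlwp. */
void
sketch_cpu_switchto(struct cpu_info *ci, struct lwp *newlwp)
{
	/* order the old lwp's prior stores (e.g. mtx_owner = NULL) ... */
	atomic_thread_fence(memory_order_release);
	atomic_store_explicit(&ci->ci_curlwp, newlwp, memory_order_relaxed);
	/* ... and order ci_curlwp before newlwp's own later mutex_exit */
	atomic_thread_fence(memory_order_release);
}

/* Consumer side: the mutex_oncpu()-style check made by a spinning waiter. */
bool
sketch_mutex_owner_running(struct kmutex *mtx)
{
	struct lwp *owner;

	owner = atomic_load_explicit(&mtx->mtx_owner, memory_order_relaxed);
	if (owner == NULL)
		return false;	/* released; the waiter can try to acquire */
	/* membar_consumer: keep the ci_curlwp read after the owner read */
	atomic_thread_fence(memory_order_acquire);
	/* owner still on its CPU?  then it is safe to keep spinning */
	return atomic_load_explicit(&owner->l_cpu->ci_curlwp,
	    memory_order_relaxed) == owner;
}

In this sketch the second producer fence is the one that orders the
ci_curlwp update before any store the incoming lwp later makes to mtx_owner
in mutex_exit; it corresponds to the barrier the later patches in the series
elide where mutex_exit already ends in a store-release, or where the
incoming lwp is a softint lwp that cannot yet own any mutex.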