From 91cf48711486105863f389ee7bf74550f5e6f52c Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Sun, 19 Feb 2023 21:03:19 +0000 Subject: [PATCH 01/11] aarch64: Add missing barrier in cpu_switchto. The two relevant orderings, which mutex_vector_enter relies on, must be: 1. /* potential mutex_exit */ ... mtx->mtx_owner = 0; // (a) ... /* cpu_switchto */ ... ci->ci_curlwp = otherlwp; // (b) 2. /* cpu_switchto */ ... ci->ci_curlwp = owner; // (a) ... /* potential mutex_exit */ ... mtx->mtx_owner = 0; // (b) In scenario 1, there must be a DMB ISHST or stronger between 1(a) and 1(b). But that's missing right now. So this change inserts it, just before ci->ci_curlwp = otherlwp. In scenario 2, mutex_exit (either via MUTEX_RELEASE or via MD lock stubs) already issues a load/store-before-store barrier before 2(b), sequenced with preemption disabled under splhigh or with an atomic LDXR/STLXR (LL/SC), so there's no need to issue anything in cpu_switchto -- the existing DMB ISHST is unnecessary. So this change adds a comment recommending it be removed -- to be done separately from the insertion of a barrier so that, in case I made a mistake, for the sake of bisection, we don't end up with the same commit both inserting and removing different necessary barriers. XXX pullup-8 XXX pullup-9 XXX pullup-10 --- sys/arch/aarch64/aarch64/cpuswitch.S | 43 +++++++++++++++++++++++++++- sys/arch/aarch64/aarch64/locore.S | 4 +++ 2 files changed, 46 insertions(+), 1 deletion(-) diff --git a/sys/arch/aarch64/aarch64/cpuswitch.S b/sys/arch/aarch64/aarch64/cpuswitch.S index 6fd1ab26bf57..f3ef02424169 100644 --- a/sys/arch/aarch64/aarch64/cpuswitch.S +++ b/sys/arch/aarch64/aarch64/cpuswitch.S @@ -125,8 +125,42 @@ ENTRY_NP(cpu_switchto) msr tpidr_el1, x1 /* switch curlwp to new lwp */ ldr x3, [x1, #L_CPU] + + /* + * Issue a store-before-store barrier so that any prior + * mutex_exit is visible to any thread that witnesses this + * store. See kern_mutex.c for details -- this is necessary + * for adaptive mutexes to detect whether the lwp is on the + * CPU in order to safely block without requiring atomic CAS + * in mutex_exit. + * + * This serves as a membar_producer that matches the + * membar_consumer between owner = mtx->mtx_owner and + * mutex_oncpu(owner) (or, owner->l_cpu->ci_curlwp == owner) + * in mutex_vector_enter. + * + * Note that we don't need to issue a second membar_producer, + * to match the membar_consumer between mutex_oncpu(owner) + * and MUTEX_HAS_WAITERS(mtx), because a stronger barrier is + * issued by mutex_exit itself before or as part of the store + * that actually releases the mutex. + * + * A store-before-store barrier is sufficient -- no need for + * load/store-before-store or store-release -- because the + * other side, in mutex_vector_enter, is read-only for this + * protocol, allowing us use membar_producer/consumer + * semantics and not the stronger membar_release/acquire. + */ + dmb ishst str x1, [x3, #CI_CURLWP] /* switch curlwp to new lwp */ + /* + * XXX This DMB ISHST is unnecessary because mutex_exit issues + * a stronger barrier before (DMB ISH), or as part of + * (STLR/STLXR), the store to mtx->mtx_owner anyway. To be + * deleted in a subsequent commit. 
+ */ dmb ishst /* see comments in kern_mutex.c */ + ENABLE_INTERRUPT /* @@ -201,8 +235,8 @@ ENTRY_NP(cpu_switchto_softint) /* onto new stack */ sub sp, x4, #TF_SIZE /* new sp := softlwp->l_md_utf - 1 */ msr tpidr_el1, x0 /* curlwp = softlwp; */ + dmb ishst /* for mutex_enter; see cpu_switchto */ str x0, [x20, #CI_CURLWP] /* curcpu()->ci_curlwp = softlwp; */ - /* no need for memory barrier here */ mov x5, #CPACR_FPEN_NONE msr cpacr_el1, x5 /* cpacr_el1 = CPACR_FPEN_NONE */ @@ -244,7 +278,14 @@ ENTRY_NP(cpu_switchto_softint) DISABLE_INTERRUPT msr tpidr_el1, x19 /* curlwp = pinned_lwp */ ldr x3, [x19, #L_CPU] /* x3 = curlwp->l_cpu */ + dmb ishst /* for mutex_enter; see cpu_switchto */ str x19, [x3, #CI_CURLWP] /* curlwp->l_cpu->ci_curlwp := x19 */ + /* + * XXX This DMB ISHST is unnecessary because mutex_exit issues + * a stronger barrier before (DMB ISH), or as part of + * (STLR/STLXR), the store to mtx->mtx_owner anyway. To be + * deleted in a subsequent commit. + */ dmb ishst /* see comments in kern_mutex.c */ mov sp, x4 /* restore pinned_lwp sp */ diff --git a/sys/arch/aarch64/aarch64/locore.S b/sys/arch/aarch64/aarch64/locore.S index b72be4ce7dc8..1c0e57d7c9cd 100644 --- a/sys/arch/aarch64/aarch64/locore.S +++ b/sys/arch/aarch64/aarch64/locore.S @@ -529,6 +529,10 @@ mp_vstart: */ ldr x1, [x0, #CI_IDLELWP] /* x0 = curcpu()->ci_idlelwp */ msr tpidr_el1, x1 /* tpidr_el1 = curlwp = x1 */ + /* + * No membar needed because we're not switching from a + * previous lwp; see cpu_switchto. + */ str x1, [x0, #CI_CURLWP] /* curlwp is idlelwp */ /* get my stack from lwp */ From 6bd225d8026bbf48e04546b6496132ec37323945 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Mon, 20 Feb 2023 11:55:22 +0000 Subject: [PATCH 02/11] aarch64: Omit needless membars after store to ci->ci_curlwp. These were introduced in an attempt to coordinate mutex_exit with mutex_enter. But they're not necessary, because the ordering that we need is: 1. ci->ci_curlwp = owner 2. mtx->mtx_owner = 0 And mutex_exit already guarantees this order with an appropriate barrier (DMB ISH) or store-release (STLR/STLXR), sequenced with the store either with preemption disabled (at splhigh) or with the turnstile lock held. --- sys/arch/aarch64/aarch64/cpuswitch.S | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/sys/arch/aarch64/aarch64/cpuswitch.S b/sys/arch/aarch64/aarch64/cpuswitch.S index f3ef02424169..196c80d4ed41 100644 --- a/sys/arch/aarch64/aarch64/cpuswitch.S +++ b/sys/arch/aarch64/aarch64/cpuswitch.S @@ -153,13 +153,6 @@ ENTRY_NP(cpu_switchto) */ dmb ishst str x1, [x3, #CI_CURLWP] /* switch curlwp to new lwp */ - /* - * XXX This DMB ISHST is unnecessary because mutex_exit issues - * a stronger barrier before (DMB ISH), or as part of - * (STLR/STLXR), the store to mtx->mtx_owner anyway. To be - * deleted in a subsequent commit. - */ - dmb ishst /* see comments in kern_mutex.c */ ENABLE_INTERRUPT @@ -280,13 +273,6 @@ ENTRY_NP(cpu_switchto_softint) ldr x3, [x19, #L_CPU] /* x3 = curlwp->l_cpu */ dmb ishst /* for mutex_enter; see cpu_switchto */ str x19, [x3, #CI_CURLWP] /* curlwp->l_cpu->ci_curlwp := x19 */ - /* - * XXX This DMB ISHST is unnecessary because mutex_exit issues - * a stronger barrier before (DMB ISH), or as part of - * (STLR/STLXR), the store to mtx->mtx_owner anyway. To be - * deleted in a subsequent commit. 
- */ - dmb ishst /* see comments in kern_mutex.c */ mov sp, x4 /* restore pinned_lwp sp */ msr cpacr_el1, x5 /* restore pinned_lwp cpacr */ From ce8433893abbf1202c357206058fa95bc95eb8f8 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Sun, 19 Feb 2023 21:34:18 +0000 Subject: [PATCH 03/11] alpha: Add missing barrier in cpu_switchto. Details in comments. XXX pullup-8 XXX pullup-9 XXX pullup-10 --- sys/arch/alpha/include/asm.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/sys/arch/alpha/include/asm.h b/sys/arch/alpha/include/asm.h index 910167763f38..948db05631cc 100644 --- a/sys/arch/alpha/include/asm.h +++ b/sys/arch/alpha/include/asm.h @@ -669,9 +669,38 @@ label: ASCIZ msg; \ #define GET_CURLWP \ call_pal PAL_OSF1_rdval +/* + * The following comment applies to the wmb instruction preceding + * stq r, CPU_INFO_CURLWP(v0) in SET_CURLWP: + * + * Issue a store-before-store barrier so that any prior + * mutex_exit is visible to any thread that witnesses this + * store. See kern_mutex.c for details -- this is necessary + * for adaptive mutexes to detect whether the lwp is on the + * CPU in order to safely block without requiring atomic CAS + * in mutex_exit. + * + * This serves as a membar_producer that matches the + * membar_consumer between owner = mtx->mtx_owner and + * mutex_oncpu(owner) (or, owner->l_cpu->ci_curlwp == owner) + * in mutex_vector_enter. + * + * Note that we don't need to issue a second membar_producer, + * to match the membar_consumer between mutex_oncpu(owner) + * and MUTEX_HAS_WAITERS(mtx), because a stronger barrier is + * issued by mutex_exit itself before or as part of the store + * that actually releases the mutex. + * + * A store-before-store barrier is sufficient -- no need for + * load/store-before-store or store-release -- because the + * other side, in mutex_vector_enter, is read-only for this + * protocol, allowing us use membar_producer/consumer + * semantics and not the stronger membar_release/acquire. + */ #define SET_CURLWP(r) \ ldq v0, L_CPU(r) ; \ mov r, a0 ; \ + wmb /* XXX patch out if !MP? */ ; \ stq r, CPU_INFO_CURLWP(v0) ; \ call_pal PAL_OSF1_wrval From 0f83af806e608b01e860421a2163010b3502c833 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Sun, 19 Feb 2023 21:40:32 +0000 Subject: [PATCH 04/11] arm32: Add missing barrier in cpu_switchto. Details in comments. Change is as for aarch64. XXX pullup-8 XXX pullup-9 XXX pullup-10 --- sys/arch/arm/arm32/cpuswitch.S | 45 ++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/sys/arch/arm/arm32/cpuswitch.S b/sys/arch/arm/arm32/cpuswitch.S index 1ffda42dbd86..145386f539d9 100644 --- a/sys/arch/arm/arm32/cpuswitch.S +++ b/sys/arch/arm/arm32/cpuswitch.S @@ -189,9 +189,43 @@ ENTRY(cpu_switchto) mcr p15, 0, r6, c13, c0, 4 /* set current lwp */ #endif + /* + * Issue a store-before-store barrier so that any prior + * mutex_exit is visible to any thread that witnesses this + * store. See kern_mutex.c for details -- this is necessary + * for adaptive mutexes to detect whether the lwp is on the + * CPU in order to safely block without requiring atomic CAS + * in mutex_exit. + * + * This serves as a membar_producer that matches the + * membar_consumer between owner = mtx->mtx_owner and + * mutex_oncpu(owner) (or, owner->l_cpu->ci_curlwp == owner) + * in mutex_vector_enter. 
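+	 *
+	 * As a rough illustration (a sketch, not the literal
+	 * kern_mutex.c code), the consumer side referred to above
+	 * does:
+	 *
+	 *	owner = mtx->mtx_owner;
+	 *	membar_consumer();
+	 *	owner->l_cpu->ci_curlwp == owner	(mutex_oncpu(owner))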
+ * + * Note that we don't need to issue a second membar_producer, + * to match the membar_consumer between mutex_oncpu(owner) + * and MUTEX_HAS_WAITERS(mtx), because a stronger barrier is + * issued by mutex_exit itself before or as part of the store + * that actually releases the mutex. + * + * A store-before-store barrier is sufficient -- no need for + * load/store-before-store or store-release -- because the + * other side, in mutex_vector_enter, is read-only for this + * protocol, allowing us use membar_producer/consumer + * semantics and not the stronger membar_release/acquire. + */ +#ifdef _ARM_ARCH_7 + dmb +#endif + /* We have a new curlwp now so make a note of it */ str r6, [r5, #(CI_CURLWP)] + /* + * XXX This DMB is unnecessary because mutex_exit issues + * the same barrier before the store to mtx->mtx_owner + * anyway. To be deleted in a subsequent commit. + */ #ifdef _ARM_ARCH_7 dmb /* see comments in kern_mutex.c */ #endif @@ -391,8 +425,16 @@ ENTRY_NP(softint_switch) */ #if defined(TPIDRPRW_IS_CURLWP) mcr p15, 0, r5, c13, c0, 4 /* save new lwp */ +#endif +#ifdef _ARM_ARCH_7 + dmb /* for mutex_enter; see cpu_switchto */ #endif str r5, [r7, #(CI_CURLWP)] /* save new lwp */ + /* + * XXX This DMB is unnecessary because mutex_exit issues + * the same barrier before the store to mtx->mtx_owner + * anyway. To be deleted in a subsequent commit. + */ #ifdef _ARM_ARCH_7 dmb /* see comments in kern_mutex.c */ #endif @@ -427,6 +469,9 @@ ENTRY_NP(softint_switch) IRQdisable #if defined(TPIDRPRW_IS_CURLWP) mcr p15, 0, r4, c13, c0, 4 /* restore pinned lwp */ +#endif +#ifdef _ARM_ARCH_7 + dmb /* for mutex_enter; see cpu_switchto */ #endif str r4, [r7, #(CI_CURLWP)] /* restore pinned lwp */ ldr sp, [r2, #(PCB_KSP)] /* now running on the old stack. */ From 6e02eca0d81602e13b71e1d02e74ce0d7d8d5d57 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Mon, 20 Feb 2023 12:04:03 +0000 Subject: [PATCH 05/11] arm32: Omit needless membars after store to ci->ci_curlwp. These were introduced in an attempt to coordinate mutex_exit with mutex_enter. But they're not necessary, because the ordering that we need is: 1. ci->ci_curlwp = owner 2. mtx->mtx_owner = 0 And mutex_exit already guarantees this order with the same barrier (DMB), sequenced with the store either with preemption disabled (at splhigh) or with the turnstile lock held. --- sys/arch/arm/arm32/cpuswitch.S | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/sys/arch/arm/arm32/cpuswitch.S b/sys/arch/arm/arm32/cpuswitch.S index 145386f539d9..17e465744219 100644 --- a/sys/arch/arm/arm32/cpuswitch.S +++ b/sys/arch/arm/arm32/cpuswitch.S @@ -221,15 +221,6 @@ ENTRY(cpu_switchto) /* We have a new curlwp now so make a note of it */ str r6, [r5, #(CI_CURLWP)] - /* - * XXX This DMB is unnecessary because mutex_exit issues - * the same barrier before the store to mtx->mtx_owner - * anyway. To be deleted in a subsequent commit. - */ -#ifdef _ARM_ARCH_7 - dmb /* see comments in kern_mutex.c */ -#endif - /* Get the new pcb */ ldr r7, [r6, #(L_PCB)] @@ -430,14 +421,6 @@ ENTRY_NP(softint_switch) dmb /* for mutex_enter; see cpu_switchto */ #endif str r5, [r7, #(CI_CURLWP)] /* save new lwp */ - /* - * XXX This DMB is unnecessary because mutex_exit issues - * the same barrier before the store to mtx->mtx_owner - * anyway. To be deleted in a subsequent commit. 
- */ -#ifdef _ARM_ARCH_7 - dmb /* see comments in kern_mutex.c */ -#endif #ifdef KASAN mov r0, r5 From 05d6c29c98e5eac58d7d37bc757d8bea99bbda37 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Mon, 20 Feb 2023 12:16:59 +0000 Subject: [PATCH 06/11] ia64: Add missing barrier in cpu_switchto. (ia64 has never really worked, so no pullups needed, right?) --- sys/arch/ia64/ia64/vm_machdep.c | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/sys/arch/ia64/ia64/vm_machdep.c b/sys/arch/ia64/ia64/vm_machdep.c index 25592ffbb109..37aebb62f6de 100644 --- a/sys/arch/ia64/ia64/vm_machdep.c +++ b/sys/arch/ia64/ia64/vm_machdep.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -77,7 +78,33 @@ cpu_switchto(lwp_t *oldlwp, lwp_t *newlwp, bool returning) register uint64_t reg9 __asm("r9"); KASSERT(newlwp != NULL); - + + /* + * Issue a store-before-store barrier so that any prior + * mutex_exit is visible to any thread that witnesses this + * store. See kern_mutex.c for details -- this is necessary + * for adaptive mutexes to detect whether the lwp is on the + * CPU in order to safely block without requiring atomic CAS + * in mutex_exit. + * + * This serves as a membar_producer that matches the + * membar_consumer between owner = mtx->mtx_owner and + * mutex_oncpu(owner) (or, owner->l_cpu->ci_curlwp == owner) + * in mutex_vector_enter. + * + * Note that we don't need to issue a second membar_producer, + * to match the membar_consumer between mutex_oncpu(owner) + * and MUTEX_HAS_WAITERS(mtx), because a stronger barrier is + * issued by mutex_exit itself before or as part of the store + * that actually releases the mutex. + * + * A store-before-store barrier is sufficient -- no need for + * load/store-before-store or store-release -- because the + * other side, in mutex_vector_enter, is read-only for this + * protocol, allowing us use membar_producer/consumer + * semantics and not the stronger membar_release/acquire. + */ + membar_producer(); ci->ci_curlwp = newlwp; /* required for lwp_startup, copy oldlwp into r9, "mov r9=in0" */ From 234d3a02728927bd9151d35dc5aecb4febcc153e Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Sun, 19 Feb 2023 16:17:04 +0000 Subject: [PATCH 07/11] mips: Add missing barrier in cpu_switchto. Otherwise adaptive mutexes don't work -- not reliably, anyway. Details in comments. XXX pullup-8 XXX pullup-9 XXX pullup-10 --- sys/arch/mips/include/asm.h | 12 ++++++++++++ sys/arch/mips/mips/locore.S | 29 +++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/sys/arch/mips/include/asm.h b/sys/arch/mips/include/asm.h index e896114a5939..30779359bbba 100644 --- a/sys/arch/mips/include/asm.h +++ b/sys/arch/mips/include/asm.h @@ -610,6 +610,18 @@ _C_LABEL(x): #define SYNC_PLUNGER /* nothing */ #endif +/* + * Store-before-store and load-before-load barriers. These could be + * made weaker than release (load/store-before-store) and acquire + * (load-before-load/store) barriers, and newer MIPS does have + * instruction encodings for finer-grained barriers like this, but I + * dunno how to appropriately conditionalize their use or get the + * assembler to be happy with them, so we'll use these definitions for + * now. 
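+ *
+ * For example, cpu_switchto (locore.S) issues, just before
+ * updating curlwp:
+ *
+ *	SYNC_PRODUCER
+ *	PTR_S	MIPS_CURLWP, CPU_INFO_CURLWP(t2)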
+ */ +#define SYNC_PRODUCER SYNC_REL +#define SYNC_CONSUMER SYNC_ACQ + /* CPU dependent hook for cp0 load delays */ #if defined(MIPS1) || defined(MIPS2) || defined(MIPS3) #define MFC0_HAZARD sll $0,$0,1 /* super scalar nop */ diff --git a/sys/arch/mips/mips/locore.S b/sys/arch/mips/mips/locore.S index 87b8e6ed0f73..20a74d009217 100644 --- a/sys/arch/mips/mips/locore.S +++ b/sys/arch/mips/mips/locore.S @@ -286,6 +286,33 @@ NESTED(cpu_switchto, CALLFRAME_SIZ, ra) PTR_L t2, L_CPU(MIPS_CURLWP) nop # patchable load delay slot + + /* + * Issue a store-before-store barrier so that any prior + * mutex_exit is visible to any thread that witnesses this + * store. See kern_mutex.c for details -- this is necessary + * for adaptive mutexes to detect whether the lwp is on the + * CPU in order to safely block without requiring atomic CAS + * in mutex_exit. + * + * This serves as a membar_producer that matches the + * membar_consumer between owner = mtx->mtx_owner and + * mutex_oncpu(owner) (or, owner->l_cpu->ci_curlwp == owner) + * in mutex_vector_enter. + * + * Note that we don't need to issue a second membar_producer, + * to match the membar_consumer between mutex_oncpu(owner) + * and MUTEX_HAS_WAITERS(mtx), because a stronger barrier is + * issued by mutex_exit itself before or as part of the store + * that actually releases the mutex. + * + * A store-before-store barrier is sufficient -- no need for + * load/store-before-store or store-release -- because the + * other side, in mutex_vector_enter, is read-only for this + * protocol, allowing us use membar_producer/consumer + * semantics and not the stronger membar_release/acquire. + */ + SYNC_PRODUCER /* XXX fixup to nop for uniprocessor boot */ PTR_S MIPS_CURLWP, CPU_INFO_CURLWP(t2) /* Check for restartable atomic sequences (RAS) */ @@ -406,6 +433,7 @@ NESTED(softint_fast_dispatch, CALLFRAME_SIZ, ra) move MIPS_CURLWP, a0 # switch to softint lwp PTR_L s1, L_CPU(MIPS_CURLWP) # get curcpu() nop # patchable load delay slot + SYNC_PRODUCER /* XXX fixup */ /* for mutex_enter; see cpu_switchto */ PTR_S MIPS_CURLWP, CPU_INFO_CURLWP(s1) # ... move s2, sp # remember sp move s3, t0 # remember curpcb @@ -417,6 +445,7 @@ NESTED(softint_fast_dispatch, CALLFRAME_SIZ, ra) move sp, s2 # restore stack move MIPS_CURLWP, s0 # restore curlwp + SYNC_PRODUCER /* XXX fixup */ /* for mutex_enter; see cpu_switchto */ PTR_S MIPS_CURLWP, CPU_INFO_CURLWP(s1) # .... REG_L ra, CALLFRAME_RA(sp) # load early since we use it From 67290db55e391fe1c6d1625d17da3faee65f2ccc Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Sun, 19 Feb 2023 21:36:20 +0000 Subject: [PATCH 08/11] powerpc: Add missing barrier in cpu_switchto. Details in comments. XXX pullup-8 XXX pullup-9 XXX pullup-10 --- sys/arch/powerpc/powerpc/locore_subr.S | 36 ++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/sys/arch/powerpc/powerpc/locore_subr.S b/sys/arch/powerpc/powerpc/locore_subr.S index c910c10a38e5..09475a9296ad 100644 --- a/sys/arch/powerpc/powerpc/locore_subr.S +++ b/sys/arch/powerpc/powerpc/locore_subr.S @@ -215,6 +215,36 @@ ENTRY(cpu_switchto) */ GET_CPUINFO(%r7) + + /* + * Issue a store-before-store barrier so that any prior + * mutex_exit is visible to any thread that witnesses this + * store. See kern_mutex.c for details -- this is necessary + * for adaptive mutexes to detect whether the lwp is on the + * CPU in order to safely block without requiring atomic CAS + * in mutex_exit. 
+ * + * This serves as a membar_producer that matches the + * membar_consumer between owner = mtx->mtx_owner and + * mutex_oncpu(owner) (or, owner->l_cpu->ci_curlwp == owner) + * in mutex_vector_enter. + * + * Note that we don't need to issue a second membar_producer, + * to match the membar_consumer between mutex_oncpu(owner) + * and MUTEX_HAS_WAITERS(mtx), because a stronger barrier is + * issued by mutex_exit itself before or as part of the store + * that actually releases the mutex. + * + * A store-before-store barrier is sufficient -- no need for + * load/store-before-store or store-release -- because the + * other side, in mutex_vector_enter, is read-only for this + * protocol, allowing us use membar_producer/consumer + * semantics and not the stronger membar_release/acquire. + */ +#ifdef MULTIPROCESSOR + sync /* XXX use eieio if available -- cheaper */ +#endif + stptr %r31,CI_CURLWP(%r7) mr %r13,%r31 #ifdef PPC_BOOKE @@ -389,6 +419,9 @@ _ENTRY(softint_fast_dispatch) * to a kernel thread */ +#ifdef MULTIPROCESSOR + sync /* XXX eieio */ /* for mutex_enter; see cpu_switchto */ +#endif stptr %r3, CI_CURLWP(%r7) mr %r13, %r3 #ifdef PPC_BOOKE @@ -423,6 +456,9 @@ _ENTRY(softint_fast_dispatch) #endif GET_CPUINFO(%r7) +#ifdef MULTIPROCESSOR + sync /* XXX eieio */ /* for mutex_enter; see cpu_switchto */ +#endif stptr %r30, CI_CURLWP(%r7) mr %r13, %r30 #ifdef PPC_BOOKE From 6f41da07474df42b78eff40b34266cc0ab3a5885 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Sun, 19 Feb 2023 21:37:53 +0000 Subject: [PATCH 09/11] riscv: Add missing barrier in cpu_switchto. Details in comments. --- sys/arch/riscv/riscv/cpu_switch.S | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/sys/arch/riscv/riscv/cpu_switch.S b/sys/arch/riscv/riscv/cpu_switch.S index acb023b3b9d9..97f5f87b060d 100644 --- a/sys/arch/riscv/riscv/cpu_switch.S +++ b/sys/arch/riscv/riscv/cpu_switch.S @@ -62,6 +62,34 @@ ENTRY_NP(cpu_switchto) mv tp, a1 // # put the new lwp in thread pointer PTR_L t1, L_CPU(tp) // # get curcpu + + /* + * Issue a store-before-store barrier so that any prior + * mutex_exit is visible to any thread that witnesses this + * store. See kern_mutex.c for details -- this is necessary + * for adaptive mutexes to detect whether the lwp is on the + * CPU in order to safely block without requiring atomic CAS + * in mutex_exit. + * + * This serves as a membar_producer that matches the + * membar_consumer between owner = mtx->mtx_owner and + * mutex_oncpu(owner) (or, owner->l_cpu->ci_curlwp == owner) + * in mutex_vector_enter. + * + * Note that we don't need to issue a second membar_producer, + * to match the membar_consumer between mutex_oncpu(owner) + * and MUTEX_HAS_WAITERS(mtx), because a stronger barrier is + * issued by mutex_exit itself before or as part of the store + * that actually releases the mutex. + * + * A store-before-store barrier is sufficient -- no need for + * load/store-before-store or store-release -- because the + * other side, in mutex_vector_enter, is read-only for this + * protocol, allowing us use membar_producer/consumer + * semantics and not the stronger membar_release/acquire. 
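+	 *
+	 * Concretely, the order enforced on this side is (a rough
+	 * sketch, not the literal code):
+	 *
+	 *	mtx->mtx_owner = 0;		(prior mutex_exit)
+	 *	fence w,w			(this barrier)
+	 *	ci->ci_curlwp = newlwp;		(the store below)
+	 *
+	 * which pairs with the loads in mutex_vector_enter:
+	 *
+	 *	owner = mtx->mtx_owner;
+	 *	membar_consumer();
+	 *	owner->l_cpu->ci_curlwp == owner	(mutex_oncpu(owner))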
+ */ + fence w,w + PTR_S tp, CI_CURLWP(t1) // # update curcpu with the new curlwp REG_L sp, L_MD_KTF(tp) // # load its kernel stack pointer @@ -154,6 +182,7 @@ ENTRY_NP(cpu_fast_switchto) PTR_S sp, L_MD_KTF(tp) // save trapframe ptr in oldlwp mv tp, a0 // set thread pointer to newlwp + fence w,w // for mutex_enter; see cpu_switchto PTR_S tp, CI_CURLWP(t1) // update curlwp PTR_L sp, L_MD_KTF(tp) // switch to its stack csrw sstatus, t0 // reenable interrupts @@ -161,6 +190,7 @@ ENTRY_NP(cpu_fast_switchto) csrrci t0, sstatus, SR_SIE // disable interrupts PTR_L t1, L_CPU(tp) // get curcpu() again mv tp, s0 // return to pinned lwp + fence w,w // for mutex_enter; see cpu_switchto PTR_S tp, CI_CURLWP(t1) // restore curlwp csrw sstatus, t0 // reenable interrupts mv sp, s1 // restore stack pointer From b7a3ec6526698299a7e158f4a832b784136ad030 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Mon, 20 Feb 2023 00:14:39 +0000 Subject: [PATCH 10/11] sparc: Note where STBAR would go in cpu_switchto for non-TSO. No functional change. --- sys/arch/sparc/sparc/locore.s | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sys/arch/sparc/sparc/locore.s b/sys/arch/sparc/sparc/locore.s index 5d7edbec3461..825ea3488c25 100644 --- a/sys/arch/sparc/sparc/locore.s +++ b/sys/arch/sparc/sparc/locore.s @@ -4889,6 +4889,10 @@ Lwb1: SAVE; SAVE; SAVE; SAVE; SAVE; SAVE; /* 6 of each: */ /* set new cpcb, and curlwp */ sethi %hi(curlwp), %l7 st %g5, [%l6 + %lo(cpcb)] ! cpcb = newpcb; + /* + * No membar needed because we run in TSO. But if we didn't, + * we would need STBAR here; see kern_mutex.c for details. + */ st %g3, [%l7 + %lo(curlwp)] ! curlwp = l; /* compute new wim */ From c5163cab242e90ef2735adbccfe363a2e89811d9 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Sun, 19 Feb 2023 21:13:32 +0000 Subject: [PATCH 11/11] sparc64: Add missing barrier in cpu_switchto. Details in comments. XXX pullup-8 XXX pullup-9 XXX pullup-10 --- sys/arch/sparc64/sparc64/locore.s | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/sys/arch/sparc64/sparc64/locore.s b/sys/arch/sparc64/sparc64/locore.s index 17bf12f8b2cd..1f0c11268b19 100644 --- a/sys/arch/sparc64/sparc64/locore.s +++ b/sys/arch/sparc64/sparc64/locore.s @@ -6731,8 +6731,33 @@ ENTRY(cpu_switchto) * Load the new lwp. To load, we must change stacks and * alter cpcb and the window control registers, hence we must * keep interrupts disabled. + * + * Issue a store-before-store barrier so that any prior + * mutex_exit is visible to any thread that witnesses this + * store. See kern_mutex.c for details -- this is necessary + * for adaptive mutexes to detect whether the lwp is on the + * CPU in order to safely block without requiring atomic CAS + * in mutex_exit. + * + * This serves as a membar_producer that matches the + * membar_consumer between owner = mtx->mtx_owner and + * mutex_oncpu(owner) (or, owner->l_cpu->ci_curlwp == owner) + * in mutex_vector_enter. + * + * Note that we don't need to issue a second membar_producer, + * to match the membar_consumer between mutex_oncpu(owner) + * and MUTEX_HAS_WAITERS(mtx), because a stronger barrier is + * issued by mutex_exit itself before or as part of the store + * that actually releases the mutex. + * + * A store-before-store barrier is sufficient -- no need for + * load/store-before-store or store-release -- because the + * other side, in mutex_vector_enter, is read-only for this + * protocol, allowing us use membar_producer/consumer + * semantics and not the stronger membar_release/acquire. 
*/ + membar #StoreStore STPTR %i1, [%l7 + %lo(CURLWP)] ! curlwp = l; STPTR %l1, [%l6 + %lo(CPCB)] ! cpcb = newpcb; @@ -6826,6 +6851,7 @@ ENTRY(softint_fastintr) sethi %hi(USPACE - TF_SIZE - CC64FSZ - STKB), %o3 LDPTR [%i0 + L_PCB], %l1 ! l1 = softint pcb or %o3, %lo(USPACE - TF_SIZE - CC64FSZ - STKB), %o3 + membar #StoreStore /* for mutex_enter; see cpu_switchto */ STPTR %i0, [%l7 + %lo(CURLWP)] add %l1, %o3, %i6 STPTR %l1, [%l6 + %lo(CPCB)] @@ -6839,6 +6865,7 @@ ENTRY(softint_fastintr) /* switch back to interrupted lwp */ ldx [%l5 + PCB_SP], %i6 + membar #StoreStore /* for mutex_enter; see cpu_switchto */ STPTR %l0, [%l7 + %lo(CURLWP)] STPTR %l5, [%l6 + %lo(CPCB)]