From ec02820492c97652dc82df9cca260464b32bdb09 Mon Sep 17 00:00:00 2001
From: Taylor R Campbell
Date: Mon, 14 Feb 2022 11:12:07 +0000
Subject: [PATCH 1/6] mips: Make sure that mutex_spin_exit works even if
 !DIAGNOSTIC.

The critical store has been under #ifdef DIAGNOSTIC since 2011.
---
 sys/arch/mips/mips/lock_stubs_llsc.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sys/arch/mips/mips/lock_stubs_llsc.S b/sys/arch/mips/mips/lock_stubs_llsc.S
index b2edcfd3a689..c75572a045ee 100644
--- a/sys/arch/mips/mips/lock_stubs_llsc.S
+++ b/sys/arch/mips/mips/lock_stubs_llsc.S
@@ -279,8 +279,8 @@ LEAF(llsc_mutex_spin_exit)
 	INT_L	t0, MTX_LOCK(a0)
 	beqz	t0, 2f
 	nop
-	INT_S	zero, MTX_LOCK(a0)
 #endif
+	INT_S	zero, MTX_LOCK(a0)
 
 	/*
	 * We need to grab this before the mutex count is incremented

From f32ac5684ec44c23ad1727736e6db8c05a0389c8 Mon Sep 17 00:00:00 2001
From: Taylor R Campbell
Date: Mon, 14 Feb 2022 11:11:04 +0000
Subject: [PATCH 2/6] mips: Membar audit.

This change should be safe because it doesn't remove or weaken any
memory barriers, but does add, clarify, or strengthen barriers.

Goals:

- Make sure mutex_enter/exit and mutex_spin_enter/exit have
  acquire/release semantics.

- New macros make maintenance easier and purpose clearer:

  . SYNC_ACQ is for load-before-load/store barrier, and BDSYNC_ACQ
    for a branch delay slot -- currently defined as plain sync for MP
    and nothing, or nop, for UP; thus it is no weaker than SYNC and
    BDSYNC as currently defined, which are syncw on Octeon, plain
    sync on non-Octeon MP, and nothing/nop on UP.

    It is not clear to me whether load-then-syncw or ll/sc-then-syncw
    or even bare load provides load-acquire semantics on Octeon -- if
    not, this will fix bugs; if so (as on SPARC PSO), we can relax
    SYNC_ACQ to be syncw or nothing later.

  . SYNC_REL is for load/store-before-store barrier -- currently
    defined as plain sync for MP and nothing for UP.

    It is not clear to me whether syncw-then-store is enough for
    store-release on Octeon -- if not, we can leave this as is; if
    so, we can relax SYNC_REL to be syncw on Octeon.

  . SYNC_PLUNGER is there to flush clogged Cavium store buffers, and
    BDSYNC_PLUNGER for a branch delay slot -- syncw on Octeon,
    nothing or nop on non-Octeon.

    => This is not necessary (or, as far as I'm aware, sufficient)
       for acquire semantics -- it serves only to flush store buffers
       where stores might otherwise linger for hundreds of thousands
       of cycles, which would, e.g., cause spin locks to be held for
       unreasonably long durations.

  Newer revisions of the MIPS ISA also have finer-grained sync
  variants that could be plopped in here.

Mechanism: Insert these barriers in the right places, replacing only
those where the definition is currently equivalent, so this change is
safe.

- Replace #ifdef _MIPS_ARCH_OCTEONP / syncw / #endif at the end of
  atomic_cas_* by SYNC_PLUNGER, which is `sync 4' (a.k.a. syncw) if
  __OCTEON__ and empty otherwise.

  => From what I can tell, __OCTEON__ is defined in at least as many
     contexts as _MIPS_ARCH_OCTEONP -- i.e., there are some Octeons
     with no _MIPS_ARCH_OCTEONP, but I don't know if any of them are
     relevant to us or ever saw the light of day outside Cavium; we
     seem to build with `-march=octeonp', so this is unlikely to make
     a difference.  If it turns out that we do care, well, now there's
     a central place to make the distinction for sync instructions.
- Replace post-ll/sc SYNC by SYNC_ACQ in _atomic_cas_*, which are
  internal kernel versions used in sys/arch/mips/include/lock.h on
  the assumption that they have load-acquire semantics.  Should move
  this to lock.h later, since we _don't_ define
  __HAVE_ATOMIC_AS_MEMBAR on MIPS and so the extra barrier might be
  costly.

- Insert SYNC_REL before ll/sc, and replace post-ll/sc SYNC by
  SYNC_ACQ, in _ucas_*, which is used without any barriers in futex
  code and doesn't mention barriers in the man page, so I have to
  assume it is required to be a release/acquire barrier.

- Change BDSYNC to BDSYNC_ACQ in mutex_enter and mutex_spin_enter.
  This is necessary to provide load-acquire semantics -- unclear
  whether that was provided already by syncw on Octeon, but it seems
  more likely that either (a) no sync or syncw is needed at all, or
  (b) syncw is not enough and sync is needed, since syncw is only a
  store-before-store ordering barrier.

- Insert SYNC_REL before ll/sc in mutex_exit and mutex_spin_exit.
  This is currently redundant with the SYNC already there, but
  SYNC_REL more clearly identifies the necessary semantics in case we
  want to define it differently on different systems, and having a
  sync in the middle of an ll/sc is a bit weird and possibly not a
  good idea, so I intend to (carefully) remove the redundant SYNC in
  a later change.

- Change BDSYNC to BDSYNC_PLUNGER at the end of mutex_exit.  This has
  no semantic change right now -- it's syncw on Octeon, sync on
  non-Octeon MP, nop on UP -- but we can relax it later to nop on
  non-Cavium MP.

- Leave LLSCSYNC in for now -- it is apparently there for a Cavium
  erratum, but I'm not sure what the erratum is, exactly, and I have
  no reference for it.  I suspect these can be safely removed, but we
  might have to double up some other syncw instructions -- Linux uses
  it only in store-release sequences, not at the head of every ll/sc.
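To illustrate the intended placement of these macros, here is a
schematic test-and-set spin lock in terms of ll/sc -- a hand-written
sketch for exposition only, not code from this change; the flat
lock-word protocol, labels, and register assignments are invented for
the example:

	/* Acquire: success must imply load-acquire. */
	LLSCSYNC			/* cnMIPS ll/sc erratum workaround */
1:	ll	t0, 0(a0)		/* load the lock word */
	bnez	t0, 1b			/* spin while it is held */
	nop
	li	t1, 1
	sc	t1, 0(a0)		/* try to take it */
	beqz	t1, 1b			/* retry if sc failed */
	nop
	SYNC_ACQ		/* no later load/store may move above the ll */

	/* ... critical section ... */

	/* Release: store-release, then unclog cnMIPS store buffers. */
	SYNC_REL		/* no earlier load/store may move below the sw */
	sw	zero, 0(a0)		/* mark the lock free */
	SYNC_PLUNGER		/* push the store out promptly on cnMIPS */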
---
 common/lib/libc/arch/mips/atomic/atomic_cas.S |  8 ++---
 .../lib/libc/arch/mips/atomic/atomic_op_asm.h |  6 ----
 .../lib/libc/arch/mips/atomic/atomic_swap.S   |  4 +--
 sys/arch/mips/include/asm.h                   | 15 ++++++++++
 sys/arch/mips/mips/lock_stubs_llsc.S          | 29 ++++++++++++++-----
 5 files changed, 41 insertions(+), 21 deletions(-)

diff --git a/common/lib/libc/arch/mips/atomic/atomic_cas.S b/common/lib/libc/arch/mips/atomic/atomic_cas.S
index 24f86db89394..8470020925ea 100644
--- a/common/lib/libc/arch/mips/atomic/atomic_cas.S
+++ b/common/lib/libc/arch/mips/atomic/atomic_cas.S
@@ -47,9 +47,7 @@ LEAF(_atomic_cas_32)
 	beq	t0, zero, 1b
 	nop
 	move	v0, a1
-#ifdef _MIPS_ARCH_OCTEONP
-	syncw
-#endif
+	SYNC_PLUNGER
 2:
 	j	ra
 	nop
@@ -69,9 +67,7 @@ LEAF(_atomic_cas_64)
 	beq	t0, zero, 1b
 	nop
 	move	v0, a1
-#ifdef _MIPS_ARCH_OCTEONP
-	syncw
-#endif
+	SYNC_PLUNGER
 2:
 	j	ra
 	nop

diff --git a/common/lib/libc/arch/mips/atomic/atomic_op_asm.h b/common/lib/libc/arch/mips/atomic/atomic_op_asm.h
index 7264217f0457..ea1e883518e5 100644
--- a/common/lib/libc/arch/mips/atomic/atomic_op_asm.h
+++ b/common/lib/libc/arch/mips/atomic/atomic_op_asm.h
@@ -44,10 +44,4 @@
 
 #endif /* _KERNEL */
 
-#ifdef __OCTEON__
-#define	SYNCW	syncw
-#else
-#define	SYNCW	nop
-#endif
-
 #endif /* _ATOMIC_OP_ASM_H_ */

diff --git a/common/lib/libc/arch/mips/atomic/atomic_swap.S b/common/lib/libc/arch/mips/atomic/atomic_swap.S
index 06eb54ffbe20..66a23f63176b 100644
--- a/common/lib/libc/arch/mips/atomic/atomic_swap.S
+++ b/common/lib/libc/arch/mips/atomic/atomic_swap.S
@@ -54,7 +54,7 @@ LEAF(_atomic_swap_32)
 	nop
 2:
 	j	ra
-	SYNCW
+	BDSYNC_PLUNGER
 END(_atomic_swap_32)
 ATOMIC_OP_ALIAS(atomic_swap_32, _atomic_swap_32)
 
@@ -69,7 +69,7 @@ LEAF(_atomic_swap_64)
 	nop
 2:
 	j	ra
-	SYNCW
+	BDSYNC_PLUNGER
 END(_atomic_swap_64)
 ATOMIC_OP_ALIAS(atomic_swap_64, _atomic_swap_64)
 #endif

diff --git a/sys/arch/mips/include/asm.h b/sys/arch/mips/include/asm.h
index 249dcc6d86cb..03c606951fd8 100644
--- a/sys/arch/mips/include/asm.h
+++ b/sys/arch/mips/include/asm.h
@@ -576,14 +576,29 @@ _C_LABEL(x):
 #define	LLSCSYNC	sync 4; sync 4
 #define	SYNC		sync 4	/* sync 4 == syncw - sync all writes */
 #define	BDSYNC		sync 4	/* sync 4 == syncw - sync all writes */
+#define	BDSYNC_ACQ	sync
+#define	SYNC_ACQ	sync
+#define	SYNC_REL	sync
+#define	BDSYNC_PLUNGER	sync 4
+#define	SYNC_PLUNGER	sync 4
 #elif __mips >= 3 || !defined(__mips_o32)
 #define	LLSCSYNC	sync
 #define	SYNC		sync
 #define	BDSYNC		sync
+#define	BDSYNC_ACQ	sync
+#define	SYNC_ACQ	sync
+#define	SYNC_REL	sync
+#define	BDSYNC_PLUNGER	nop
+#define	SYNC_PLUNGER	/* nothing */
 #else
 #define	LLSCSYNC	/* nothing */
 #define	SYNC		/* nothing */
 #define	BDSYNC		nop
+#define	BDSYNC_ACQ	nop
+#define	SYNC_ACQ	/* nothing */
+#define	SYNC_REL	/* nothing */
+#define	BDSYNC_PLUNGER	nop
+#define	SYNC_PLUNGER	/* nothing */
 #endif
 
 /* CPU dependent hook for cp0 load delays */

diff --git a/sys/arch/mips/mips/lock_stubs_llsc.S b/sys/arch/mips/mips/lock_stubs_llsc.S
index c75572a045ee..e4a169b0145a 100644
--- a/sys/arch/mips/mips/lock_stubs_llsc.S
+++ b/sys/arch/mips/mips/lock_stubs_llsc.S
@@ -63,6 +63,10 @@ RCSID("$NetBSD: lock_stubs_llsc.S,v 1.13 2020/09/26 08:21:27 simonb Exp $")
 /*
  * unsigned long atomic_cas_ulong_llsc(volatile unsigned long *val,
  *	unsigned long old, unsigned long new);
+ *
+ * For hysterical raisins in sys/arch/mips/include/lock.h, success
+ * implies load-acquire.  The SYNC_ACQ here could be moved there
+ * instead.
  */
 STATIC_LEAF(llsc_atomic_cas_ulong)
 	LLSCSYNC
@@ -73,7 +77,7 @@ STATIC_LEAF(llsc_atomic_cas_ulong)
 	LONG_SC	t1, (a0)
 	beqz	t1, 1b
 	nop
-	SYNC
+	SYNC_ACQ
 	j	ra
 	move	v0, a1
 2:
@@ -84,6 +88,10 @@ END(llsc_atomic_cas_ulong)
 /*
  * unsigned int _atomic_cas_uint_llsc(volatile unsigned int *val,
  *	unsigned int old, unsigned int new);
+ *
+ * For hysterical raisins in sys/arch/mips/include/lock.h, success
+ * implies load-acquire.  The SYNC_ACQ here could be moved there
+ * instead.
  */
 STATIC_LEAF(llsc_atomic_cas_uint)
 	LLSCSYNC
@@ -94,7 +102,7 @@ STATIC_LEAF(llsc_atomic_cas_uint)
 	INT_SC	t1, (a0)
 	beqz	t1, 1b
 	nop
-	SYNC
+	SYNC_ACQ
 	j	ra
 	move	v0, a1
 2:
@@ -105,6 +113,9 @@ END(llsc_atomic_cas_uint)
 /*
  * int llsc_ucas_32(volatile uint32_t *ptr, uint32_t old,
  *	uint32_t new, uint32_t *ret)
+ *
+ * Implies release/acquire barriers until someone tells me
+ * otherwise about _ucas_32/64.
  */
 STATIC_LEAF(llsc_ucas_32)
 	.set	at
@@ -115,6 +126,7 @@ STATIC_LEAF(llsc_ucas_32)
 	bltz	a0, _C_LABEL(llsc_ucaserr)
 	nop
 	move	v0, zero
+	SYNC_REL
 	LLSCSYNC
 1:
 	ll	t0, 0(a0)
@@ -123,7 +135,7 @@ STATIC_LEAF(llsc_ucas_32)
 	sc	t1, 0(a0)
 	beqz	t1, 1b
 	nop
-	SYNC
+	SYNC_ACQ
 2:
 	PTR_S	zero, PCB_ONFAULT(v1)
 	j	ra
@@ -144,6 +156,7 @@ STATIC_LEAF(llsc_ucas_64)
 	bltz	a0, _C_LABEL(llsc_ucaserr)
 	nop
 	move	v0, zero
+	SYNC_REL
 	LLSCSYNC
 1:
 	lld	t0, 0(a0)
@@ -152,7 +165,7 @@ STATIC_LEAF(llsc_ucas_64)
 	scd	t1, 0(a0)
 	beqz	t1, 1b
 	nop
-	SYNC
+	SYNC_ACQ
 2:
 	PTR_S	zero, PCB_ONFAULT(v1)
 	j	ra
@@ -181,7 +194,7 @@ STATIC_LEAF(llsc_mutex_enter)
 	beqz	t2, 1b
 	PTR_LL	t0, MTX_OWNER(a0)
 	j	ra
-	BDSYNC
+	BDSYNC_ACQ
 2:
 	j	_C_LABEL(mutex_vector_enter)
 	nop
@@ -191,6 +204,7 @@ END(llsc_mutex_enter)
 /*
  * void mutex_exit(kmutex_t *mtx);
  */
 STATIC_LEAF(llsc_mutex_exit)
+	SYNC_REL
 	LLSCSYNC
 	PTR_LL	t0, MTX_OWNER(a0)
 	SYNC
@@ -201,7 +215,7 @@ STATIC_LEAF(llsc_mutex_exit)
 	beqz	t2, 1b
 	PTR_LL	t0, MTX_OWNER(a0)
 	j	ra
-	BDSYNC
+	BDSYNC_PLUNGER
 2:
 	j	_C_LABEL(mutex_vector_exit)
 	nop
@@ -264,7 +278,7 @@ STATIC_NESTED(llsc_mutex_spin_enter, CALLFRAME_SIZ, ra)
 	beqz	t1, 3b
 	INT_LL	t3, MTX_LOCK(t0)
 	j	ra
-	BDSYNC
+	BDSYNC_ACQ
 4:
 	j	_C_LABEL(mutex_spin_retry)
 	move	a0, t0
@@ -274,6 +288,7 @@ END(llsc_mutex_spin_enter)
 /*
  * void mutex_spin_exit(kmutex_t *mtx);
  */
 LEAF(llsc_mutex_spin_exit)
+	SYNC_REL
 	PTR_L	t2, L_CPU(MIPS_CURLWP)
 #if defined(DIAGNOSTIC)
 	INT_L	t0, MTX_LOCK(a0)

From fd734d142f2ff482258a2c7518b5d8d962bc6012 Mon Sep 17 00:00:00 2001
From: Taylor R Campbell
Date: Mon, 14 Feb 2022 17:22:09 +0000
Subject: [PATCH 3/6] mips: Omit needless SYNC in mutex_exit.

This change deletes a memory barrier.  However, it should be safe:
the semantic requirement is already provided by the SYNC_REL above,
before the ll.  And as currently defined, SYNC_REL is at least as
strong as SYNC, so this change can't hurt correctness on its own
(barring CPU errata, which would apply to other users of SYNC_REL
and can be addressed in the definition of SYNC_REL).

Later, perhaps we can relax SYNC_REL to syncw on Octeon if we prove
that it is correct (e.g., if Octeon follows the SPARCv9 partial
store order semantics).

Nix the now-unused SYNC macro in asm.h.
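After this change, the release path of llsc_mutex_exit reads,
schematically -- a sketch for exposition, with the unchanged
instructions between the delay slot and the retry branch elided, since
they are not shown in these hunks:

	STATIC_LEAF(llsc_mutex_exit)
		SYNC_REL	/* all prior loads/stores before the handoff */
		LLSCSYNC
		PTR_LL	t0, MTX_OWNER(a0)	/* no more SYNC after this ll */
	1:	bne	t0, MIPS_CURLWP, 2f
		move	t2, zero
		...		/* store-conditional zero to MTX_OWNER(a0) */
		beqz	t2, 1b	/* retry if the sc failed */
		PTR_LL	t0, MTX_OWNER(a0)
		j	ra
		BDSYNC_PLUNGER
	2:	j	_C_LABEL(mutex_vector_exit)
		nop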
---
 sys/arch/mips/include/asm.h          | 3 ---
 sys/arch/mips/mips/lock_stubs_llsc.S | 1 -
 2 files changed, 4 deletions(-)

diff --git a/sys/arch/mips/include/asm.h b/sys/arch/mips/include/asm.h
index 03c606951fd8..a27b19a4dc4a 100644
--- a/sys/arch/mips/include/asm.h
+++ b/sys/arch/mips/include/asm.h
@@ -574,7 +574,6 @@ _C_LABEL(x):
 #if defined(__OCTEON__)
 /* early cnMIPS have erratum which means 2 */
 #define	LLSCSYNC	sync 4; sync 4
-#define	SYNC		sync 4	/* sync 4 == syncw - sync all writes */
 #define	BDSYNC		sync 4	/* sync 4 == syncw - sync all writes */
 #define	BDSYNC_ACQ	sync
 #define	SYNC_ACQ	sync
@@ -583,7 +582,6 @@ _C_LABEL(x):
 #define	SYNC_PLUNGER	sync 4
 #elif __mips >= 3 || !defined(__mips_o32)
 #define	LLSCSYNC	sync
-#define	SYNC		sync
 #define	BDSYNC		sync
 #define	BDSYNC_ACQ	sync
 #define	SYNC_ACQ	sync
@@ -592,7 +590,6 @@ _C_LABEL(x):
 #define	SYNC_PLUNGER	/* nothing */
 #else
 #define	LLSCSYNC	/* nothing */
-#define	SYNC		/* nothing */
 #define	BDSYNC		nop
 #define	BDSYNC_ACQ	nop
 #define	SYNC_ACQ	/* nothing */

diff --git a/sys/arch/mips/mips/lock_stubs_llsc.S b/sys/arch/mips/mips/lock_stubs_llsc.S
index e4a169b0145a..046e5eae7f51 100644
--- a/sys/arch/mips/mips/lock_stubs_llsc.S
+++ b/sys/arch/mips/mips/lock_stubs_llsc.S
@@ -207,7 +207,6 @@ STATIC_LEAF(llsc_mutex_exit)
 	SYNC_REL
 	LLSCSYNC
 	PTR_LL	t0, MTX_OWNER(a0)
-	SYNC
 1:
 	bne	t0, MIPS_CURLWP, 2f
 	move	t2, zero

From b1f4ca11f9caef3ba3fc9e792b18070c2f6e0f21 Mon Sep 17 00:00:00 2001
From: Taylor R Campbell
Date: Mon, 14 Feb 2022 17:24:49 +0000
Subject: [PATCH 4/6] mips: Redefine BDSYNC as sync on Octeon, not syncw.

BDSYNC is used for membar_sync, which is supposed to be a full
sequential consistency barrier, which syncw does not provide, so
this is necessary for correctness.

BDSYNC is not used for anything else, so this can't hurt performance,
except where it was necessary for correctness anyway or where the
semantic choice of membar_sync was too strong anyway.
---
 sys/arch/mips/include/asm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sys/arch/mips/include/asm.h b/sys/arch/mips/include/asm.h
index a27b19a4dc4a..d2982da3a2ee 100644
--- a/sys/arch/mips/include/asm.h
+++ b/sys/arch/mips/include/asm.h
@@ -574,7 +574,7 @@ _C_LABEL(x):
 #if defined(__OCTEON__)
 /* early cnMIPS have erratum which means 2 */
 #define	LLSCSYNC	sync 4; sync 4
-#define	BDSYNC		sync 4	/* sync 4 == syncw - sync all writes */
+#define	BDSYNC		sync
 #define	BDSYNC_ACQ	sync
 #define	SYNC_ACQ	sync
 #define	SYNC_REL	sync

From 6c8c15278c220c8482e5aae126ec0658724950de Mon Sep 17 00:00:00 2001
From: Taylor R Campbell
Date: Mon, 14 Feb 2022 17:27:57 +0000
Subject: [PATCH 5/6] mips: Redefine LLSCSYNC as empty on non-Octeon MP.

This change deletes memory barriers on non-Octeon MP.  However, all
the appropriate acquire and release barriers are already used in the
mutex stubs, and no barriers are needed in atomic_* unless we set
__HAVE_ATOMIC_AS_MEMBAR, which we don't on MIPS.  So this should be
safe.

It is unclear whether we need LLSCSYNC even on Octeon -- we don't
have a clear reference for why it's there.
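For reference, here is the net barrier table in asm.h once patches 2
through 5 are applied -- reassembled from the diffs for exposition,
with the marginal annotations being mine, not the source's; the
patches themselves are authoritative:

	#if defined(__OCTEON__)
	/* early cnMIPS have erratum which means 2 */
	#define	LLSCSYNC	sync 4; sync 4
	#define	BDSYNC		sync		/* membar_sync: full barrier */
	#define	BDSYNC_ACQ	sync
	#define	SYNC_ACQ	sync
	#define	SYNC_REL	sync
	#define	BDSYNC_PLUNGER	sync 4		/* syncw: flush store buffer */
	#define	SYNC_PLUNGER	sync 4
	#elif __mips >= 3 || !defined(__mips_o32)	/* non-Octeon MP */
	#define	LLSCSYNC	/* nothing */
	#define	BDSYNC		sync
	#define	BDSYNC_ACQ	sync
	#define	SYNC_ACQ	sync
	#define	SYNC_REL	sync
	#define	BDSYNC_PLUNGER	nop
	#define	SYNC_PLUNGER	/* nothing */
	#else						/* UP */
	#define	LLSCSYNC	/* nothing */
	#define	BDSYNC		nop
	#define	BDSYNC_ACQ	nop
	#define	SYNC_ACQ	/* nothing */
	#define	SYNC_REL	/* nothing */
	#define	BDSYNC_PLUNGER	nop
	#define	SYNC_PLUNGER	/* nothing */
	#endif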
---
 sys/arch/mips/include/asm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sys/arch/mips/include/asm.h b/sys/arch/mips/include/asm.h
index d2982da3a2ee..62610c4411c5 100644
--- a/sys/arch/mips/include/asm.h
+++ b/sys/arch/mips/include/asm.h
@@ -581,7 +581,7 @@ _C_LABEL(x):
 #define	BDSYNC_PLUNGER	sync 4
 #define	SYNC_PLUNGER	sync 4
 #elif __mips >= 3 || !defined(__mips_o32)
-#define	LLSCSYNC	sync
+#define	LLSCSYNC	/* nothing */
 #define	BDSYNC		sync
 #define	BDSYNC_ACQ	sync
 #define	SYNC_ACQ	sync

From 98a16f72a64cf433e90b40b85f861a35e4329355 Mon Sep 17 00:00:00 2001
From: Taylor R Campbell
Date: Tue, 15 Feb 2022 12:01:02 +0000
Subject: [PATCH 6/6] mips: Issue a sync plunger at the end of
 mutex_spin_exit.

Same as mutex_exit.  Relevant only on cnMIPS, where the store buffers
can get clogged.  Recommended by the Cavium documentation.

No semantic change, only performance -- this only adds a barrier in
some cases where there was none before, so it can't hurt correctness.
---
 sys/arch/mips/mips/lock_stubs_llsc.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sys/arch/mips/mips/lock_stubs_llsc.S b/sys/arch/mips/mips/lock_stubs_llsc.S
index 046e5eae7f51..74ad4b20a860 100644
--- a/sys/arch/mips/mips/lock_stubs_llsc.S
+++ b/sys/arch/mips/mips/lock_stubs_llsc.S
@@ -342,10 +342,10 @@ LEAF(llsc_mutex_spin_exit)
 	nop
 #endif
 	j	_C_LABEL(splx)
-	nop
+	BDSYNC_PLUNGER
 1:
 	j	ra
-	nop
+	BDSYNC_PLUNGER
 #if defined(DIAGNOSTIC)
 2:
 	j	_C_LABEL(mutex_vector_exit)
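To make the cnMIPS failure mode concrete: the lock holder's releasing
store can sit in a write buffer while another CPU spins on the lock
word.  A schematic sketch, for exposition only -- the interleaving and
the exact shape of the waiter's spin loop are illustrative, not lifted
verbatim from the stubs:

	/* CPU 0: mutex_spin_exit -- release, then return through splx */
	INT_S	zero, MTX_LOCK(a0)	/* may linger in the write buffer */
	j	_C_LABEL(splx)
	BDSYNC_PLUNGER			/* syncw: push it out promptly */

	/* CPU 1: spins in mutex_spin_enter until the store is visible */
3:	INT_LL	t3, MTX_LOCK(t0)
	bnez	t3, 3b			/* lock still looks held */
	nop

Without the plunger, the releasing store could stay buffered for
hundreds of thousands of cycles, so the spin lock is effectively held
that much longer.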