From cde2a6e83b09bc4b8930a5956300a7fdfecc6e54 Mon Sep 17 00:00:00 2001
From: Taylor R Campbell
Date: Sat, 12 Feb 2022 03:19:54 +0000
Subject: [PATCH] mips: Brush up __cpu_simple_lock.

- Eradicate last vestiges of mb_* barriers.

- In __cpu_simple_lock_init, omit needless barrier.  It is the
  caller's responsibility to ensure __cpu_simple_lock_init happens
  before other operations on it anyway, so there was never any need
  for a barrier here.

- In __cpu_simple_lock_try, leave comments about memory ordering
  guarantees of the kernel's _atomic_cas_uint, which are inexplicably
  different from the non-underscored atomic_cas_uint.

- In __cpu_simple_lock, use atomic_load_relaxed(lp) instead of *lp
  while spinning to avoid the appearance of a data race.  No
  functional change here, except potentially to race sanitizers.

- In __cpu_simple_unlock, use atomic_store_release(lp,
  __SIMPLELOCK_UNLOCKED) instead of _conditionally_ issuing mb_memory
  before setting *lp.  This ensures that in
  __cpu_simple_lock/.../__cpu_simple_unlock, all memory operations in
  the ellipsis happen before the store that releases the lock.

  The barrier was omitted on Octeon, which is a bug -- it needs to be
  there or else there is no happens-before relation and whoever takes
  the lock next might see stale values stored or even stomp over the
  unlocking CPU's delayed loads.  And it needs to be sync, not syncw,
  because we need load/store-before-store and syncw only orders
  store-before-store.

  On non-Octeon, the mb_memory was sync.  Using atomic_store_release
  here preserves that, via membar_exit, which issues sync.

- Leave an essay with citations about why we have an apparently
  pointless syncw _after_ releasing a lock, to work around a design
  bug^W^Wquirk in cnmips which sometimes buffers stores for hundreds
  of thousands of cycles for fun unless you issue syncw.
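For illustration only, here is a portable C11 sketch of the protocol
these changes implement.  It is not part of the patch, the demo_*
names are hypothetical, and the kernel itself uses _atomic_cas_uint
(acquire on success), atomic_load_relaxed, and atomic_store_release
rather than <stdatomic.h>:

#include <stdatomic.h>
#include <stdbool.h>

typedef atomic_uint demo_simple_lock_t;	/* hypothetical demo type */

#define	DEMO_UNLOCKED	0
#define	DEMO_LOCKED	1

static bool
demo_lock_try(demo_simple_lock_t *lp)
{
	unsigned expected = DEMO_UNLOCKED;

	/*
	 * Acquire on success, like the kernel's _atomic_cas_uint, so
	 * the critical section happens-after the previous release.
	 */
	return atomic_compare_exchange_strong_explicit(lp, &expected,
	    DEMO_LOCKED, memory_order_acquire, memory_order_relaxed);
}

static void
demo_lock(demo_simple_lock_t *lp)
{

	while (!demo_lock_try(lp)) {
		/* Relaxed load while spinning; no ordering needed. */
		while (atomic_load_explicit(lp, memory_order_relaxed) ==
		    DEMO_LOCKED)
			/* spin */;
	}
}

static void
demo_unlock(demo_simple_lock_t *lp)
{

	/*
	 * Release store: every memory operation in the critical
	 * section happens-before the store that frees the lock.
	 */
	atomic_store_explicit(lp, DEMO_UNLOCKED, memory_order_release);
}

The acquire CAS pairs with the release store; omitting the release
barrier, as the old Octeon path did, breaks exactly this pairing.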
---
 common/lib/libc/arch/mips/atomic/membar_ops.S |  10 --
 sys/arch/mips/include/lock.h                  | 106 +++++++++---------
 2 files changed, 52 insertions(+), 64 deletions(-)

diff --git a/common/lib/libc/arch/mips/atomic/membar_ops.S b/common/lib/libc/arch/mips/atomic/membar_ops.S
index 03fcb5a32e0b..ecdc8ecbdf6b 100644
--- a/common/lib/libc/arch/mips/atomic/membar_ops.S
+++ b/common/lib/libc/arch/mips/atomic/membar_ops.S
@@ -46,16 +46,6 @@ LEAF(_membar_producer)
 END(_membar_producer)
 #endif
 
-#ifdef _KERNEL
-STRONG_ALIAS(mb_read, _membar_sync)
-#ifdef __OCTEON__
-STRONG_ALIAS(mb_write, _membar_producer)
-#else
-STRONG_ALIAS(mb_write, _membar_sync)
-#endif
-STRONG_ALIAS(mb_memory, _membar_sync)
-#endif
-
 ATOMIC_OP_ALIAS(membar_sync,_membar_sync)
 ATOMIC_OP_ALIAS(membar_enter,_membar_sync)
 STRONG_ALIAS(_membar_enter,_membar_sync)
diff --git a/sys/arch/mips/include/lock.h b/sys/arch/mips/include/lock.h
index 4317bd1af8e8..64e6e897f23b 100644
--- a/sys/arch/mips/include/lock.h
+++ b/sys/arch/mips/include/lock.h
@@ -41,6 +41,8 @@
 
 #include <sys/param.h>
 
+#include <sys/atomic.h>
+
 static __inline int
 __SIMPLELOCK_LOCKED_P(const __cpu_simple_lock_t *__ptr)
 {
@@ -98,63 +100,27 @@ __cpu_simple_lock_try(__cpu_simple_lock_t *lp)
 	return (v0 != 0);
 }
 
-#ifdef MIPS1
-static __inline void
-mb_read(void)
-{
-	__insn_barrier();
-}
-
-static __inline void
-mb_write(void)
-{
-	__insn_barrier();
-}
-
-static __inline void
-mb_memory(void)
-{
-	__insn_barrier();
-}
-#else	/* MIPS1*/
-static __inline void
-mb_read(void)
-{
-	__asm volatile(
-	    "	.set push		\n"
-	    "	.set mips2		\n"
-	    "	sync			\n"
-	    "	.set pop"
-	    ::: "memory"
-	);
-}
-
-static __inline void
-mb_write(void)
-{
-	mb_read();
-}
-
-static __inline void
-mb_memory(void)
-{
-	mb_read();
-}
-#endif	/* MIPS1 */
-
 #else	/* !_HARDKERNEL */
 
 u_int	_atomic_cas_uint(volatile u_int *, u_int, u_int);
 u_long	_atomic_cas_ulong(volatile u_long *, u_long, u_long);
 void *	_atomic_cas_ptr(volatile void *, void *, void *);
-void	mb_read(void);
-void	mb_write(void);
-void	mb_memory(void);
 
 static __inline int
 __cpu_simple_lock_try(__cpu_simple_lock_t *lp)
 {
+	/*
+	 * Successful _atomic_cas_uint functions as a load-acquire --
+	 * on MP systems, it issues sync after the LL/SC CAS succeeds;
+	 * on non-MP systems every load is a load-acquire so it's moot.
+	 *
+	 * NOTE: This applies only to _atomic_cas_uint (with the
+	 * underscore), in sys/arch/mips/mips/lock_stubs_*.S.  Not true
+	 * for atomic_cas_uint (without the underscore), from
+	 * common/lib/libc/arch/mips/atomic/atomic_cas.S, which does
+	 * not imply a load-acquire.  It is unclear why these disagree.
+	 */
 	return _atomic_cas_uint(lp,
 	    __SIMPLELOCK_UNLOCKED, __SIMPLELOCK_LOCKED) ==
 	    __SIMPLELOCK_UNLOCKED;
 }
@@ -167,7 +133,6 @@ __cpu_simple_lock_init(__cpu_simple_lock_t *lp)
 {
 
 	*lp = __SIMPLELOCK_UNLOCKED;
-	mb_memory();
 }
 
 static __inline void
@@ -175,7 +140,7 @@ __cpu_simple_lock(__cpu_simple_lock_t *lp)
 {
 
 	while (!__cpu_simple_lock_try(lp)) {
-		while (*lp == __SIMPLELOCK_LOCKED)
+		while (atomic_load_relaxed(lp) == __SIMPLELOCK_LOCKED)
 			/* spin */;
 	}
 }
@@ -184,12 +149,45 @@ static __inline void
 __cpu_simple_unlock(__cpu_simple_lock_t *lp)
 {
 
-#ifndef _MIPS_ARCH_OCTEONP
-	mb_memory();
-#endif
-	*lp = __SIMPLELOCK_UNLOCKED;
+	atomic_store_release(lp, __SIMPLELOCK_UNLOCKED);
+
 #ifdef _MIPS_ARCH_OCTEONP
-	mb_write();
+	/*
+	 * On Cavium's recommendation, we issue an extra SYNCW that is
+	 * not necessary for correct ordering because apparently stores
+	 * can get stuck in Octeon store buffers for hundreds of
+	 * thousands of cycles, according to the following note:
+	 *
+	 *	Programming Notes:
+	 *	[...]
+	 *	Core A (writer)
+	 *	SW R1, DATA
+	 *	LI R2, 1
+	 *	SYNCW
+	 *	SW R2, FLAG
+	 *	SYNCW
+	 *	[...]
+	 *
+	 *	The second SYNCW instruction executed by core A is not
+	 *	necessary for correctness, but has very important
+	 *	performance effects on OCTEON.  Without it, the store
+	 *	to FLAG may linger in core A's write buffer before it
+	 *	becomes visible to other cores.  (If core A is not
+	 *	performing many stores, this may add hundreds of
+	 *	thousands of cycles to the flag release time since the
+	 *	OCTEON core normally retains stores to attempt to merge
+	 *	them before sending the store on the CMB.)
+	 *	Applications should include this second SYNCW
+	 *	instruction after flag or lock releases.
+	 *
+	 * Cavium Networks OCTEON Plus CN50XX Hardware Reference
+	 * Manual, July 2008, Appendix A, p. 943.
+	 * https://storage.googleapis.com/google-code-archive-downloads/v2/code.google.com/hactive/CN50XX-HRM-V0.99E.pdf
+	 *
+	 * XXX It might be prudent to put this into
+	 * atomic_store_release itself.
+	 */
+	__asm volatile("syncw" ::: "memory");
 #endif
 }
 
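For illustration only, a minimal C rendition of the Cavium
flag-release pattern quoted in the comment above, assuming a cnmips
(Octeon) target where syncw is available; data and flag are
hypothetical variables, not kernel code:

/* Hypothetical shared variables for the sketch. */
static volatile unsigned data, flag;

static void
demo_publish(unsigned value)
{

	data = value;				/* SW R1, DATA */
	/* syncw: order the data store before the flag store. */
	__asm volatile("syncw" ::: "memory");
	flag = 1;				/* SW R2, FLAG */
	/*
	 * Second syncw: not needed for ordering, but flushes the
	 * Octeon write buffer so the flag store becomes visible
	 * promptly instead of lingering for thousands of cycles.
	 */
	__asm volatile("syncw" ::: "memory");
}

Note that store-before-store ordering (syncw) suffices for the writer
side of this pattern, whereas releasing a lock also requires
load-before-store, which is why the unlock path itself needs sync.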