From f8a4c069a97ee2daa1bf3b4f92327ab33de9b459 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Sat, 12 Feb 2022 03:19:54 +0000 Subject: [PATCH] mips: Brush up __cpu_simple_lock. - Eradicate last vestiges of mb_* barriers. - In __cpu_simple_lock_init, omit needless barrier. It is the caller's responsibility to ensure __cpu_simple_lock_init happens before other operations on it anyway, so there was never any need for a barrier here. - In __cpu_simple_lock_try, leave comments about memory ordering guarantees of the kernel's _atomic_cas_uint, which are inexplicably different from the non-underscored atomic_cas_uint. - In __cpu_simple_unlock, use membar_exit instead of mb_memory, and do it unconditionally. This ensures that in __cpu_simple_lock/.../__cpu_simple_unlock, all memory operations in the ellipsis happen before the store that releases the lock. - On Octeon, the barrier was omitted altogether, which is a bug -- it needs to be there or else there is no happens-before relation and whoever takes the lock next might see stale values stored or even stomp over the unlocking CPU's delayed loads. - On non-Octeon, the mb_memory was sync. Using membar_exit preserves this. XXX On Octeon, membar_exit only issues syncw -- this seems wrong, only store-before-store and not load/store-before-store, unless the CNMIPS architecture guarantees it is sufficient here like SPARCv8/v9 PSO (`Partial Store Order'). - Leave an essay with citations about why we have an apparently pointless syncw _after_ releasing a lock, to work around a design bug^W^Wquirk in cnmips which sometimes buffers stores for hundreds of thousands of cycles for fun unless you issue syncw. --- common/lib/libc/arch/mips/atomic/membar_ops.S | 10 -- sys/arch/mips/include/lock.h | 114 ++++++++++-------- 2 files changed, 62 insertions(+), 62 deletions(-) diff --git a/common/lib/libc/arch/mips/atomic/membar_ops.S b/common/lib/libc/arch/mips/atomic/membar_ops.S index 03fcb5a32e0b..ecdc8ecbdf6b 100644 --- a/common/lib/libc/arch/mips/atomic/membar_ops.S +++ b/common/lib/libc/arch/mips/atomic/membar_ops.S @@ -46,16 +46,6 @@ LEAF(_membar_producer) END(_membar_producer) #endif -#ifdef _KERNEL -STRONG_ALIAS(mb_read, _membar_sync) -#ifdef __OCTEON__ -STRONG_ALIAS(mb_write, _membar_producer) -#else -STRONG_ALIAS(mb_write, _membar_sync) -#endif -STRONG_ALIAS(mb_memory, _membar_sync) -#endif - ATOMIC_OP_ALIAS(membar_sync,_membar_sync) ATOMIC_OP_ALIAS(membar_enter,_membar_sync) STRONG_ALIAS(_membar_enter,_membar_sync) diff --git a/sys/arch/mips/include/lock.h b/sys/arch/mips/include/lock.h index 4317bd1af8e8..d4e6d8cefdb0 100644 --- a/sys/arch/mips/include/lock.h +++ b/sys/arch/mips/include/lock.h @@ -41,6 +41,8 @@ #include +#include + static __inline int __SIMPLELOCK_LOCKED_P(const __cpu_simple_lock_t *__ptr) { @@ -98,63 +100,30 @@ __cpu_simple_lock_try(__cpu_simple_lock_t *lp) return (v0 != 0); } -#ifdef MIPS1 -static __inline void -mb_read(void) -{ - __insn_barrier(); -} - -static __inline void -mb_write(void) -{ - __insn_barrier(); -} - -static __inline void -mb_memory(void) -{ - __insn_barrier(); -} -#else /* MIPS1*/ -static __inline void -mb_read(void) -{ - __asm volatile( - " .set push \n" - " .set mips2 \n" - " sync \n" - " .set pop" - ::: "memory" - ); -} - -static __inline void -mb_write(void) -{ - mb_read(); -} - -static __inline void -mb_memory(void) -{ - mb_read(); -} -#endif /* MIPS1 */ - #else /* !_HARDKERNEL */ u_int _atomic_cas_uint(volatile u_int *, u_int, u_int); u_long _atomic_cas_ulong(volatile u_long *, u_long, u_long); void * _atomic_cas_ptr(volatile void *, void *, void *); -void mb_read(void); -void mb_write(void); -void mb_memory(void); static __inline int __cpu_simple_lock_try(__cpu_simple_lock_t *lp) { + /* + * Successful _atomic_cas_uint functions as a load-acquire -- + * on MP systems, it issues sync after the LL/SC CAS succeeds; + * on non-MP systems every load is a load-acquire so it's moot. + * This pairs with the membar_exit and store sequence in + * __cpu_simple_unlock that functions as a store-release + * operation. + * + * NOTE: This applies only to _atomic_cas_uint (with the + * underscore), in sys/arch/mips/mips/lock_stubs_*.S. Not true + * for atomic_cas_uint (without the underscore), from + * common/lib/libc/arch/mips/atomic/atomic_cas.S which does not + * imply a load-acquire. It is unclear why these disagree. + */ return _atomic_cas_uint(lp, __SIMPLELOCK_UNLOCKED, __SIMPLELOCK_LOCKED) == __SIMPLELOCK_UNLOCKED; @@ -167,7 +136,6 @@ __cpu_simple_lock_init(__cpu_simple_lock_t *lp) { *lp = __SIMPLELOCK_UNLOCKED; - mb_memory(); } static __inline void @@ -184,12 +152,54 @@ static __inline void __cpu_simple_unlock(__cpu_simple_lock_t *lp) { -#ifndef _MIPS_ARCH_OCTEONP - mb_memory(); -#endif + /* + * The membar_exit and then store functions as a store-release + * operation that pairs with the load-acquire operation in + * successful __cpu_simple_lock_try. + * + * Can't use atomic_store_release here because that's not + * available in userland at the moment. + */ + membar_exit(); *lp = __SIMPLELOCK_UNLOCKED; + #ifdef _MIPS_ARCH_OCTEONP - mb_write(); + /* + * On Cavium's recommendation, we issue an extra SYNCW that is + * not necessary for correct ordering because apparently stores + * can get stuck in Octeon store buffers for hundreds of + * thousands of cycles, according to the following note: + * + * Programming Notes: + * [...] + * Core A (writer) + * SW R1, DATA + * LI R2, 1 + * SYNCW + * SW R2, FLAG + * SYNCW + * [...] + * + * The second SYNCW instruction executed by core A is not + * necessary for correctness, but has very important + * performance effects on OCTEON. Without it, the store + * to FLAG may linger in core A's write buffer before it + * becomes visible to other cores. (If core A is not + * performing many stores, this may add hundreds of + * thousands of cycles to the flag release time since the + * OCTEON core normally retains stores to attempt to merge + * them before sending the store on the CMB.) + * Applications should include this second SYNCW + * instruction after flag or lock releases. + * + * Cavium Networks OCTEON Plus CN50XX Hardware Reference + * Manual, July 2008, Appendix A, p. 943. + * https://storage.googleapis.com/google-code-archive-downloads/v2/code.google.com/hactive/CN50XX-HRM-V0.99E.pdf + * + * XXX It might be prudent to put this into + * atomic_store_release itself. + */ + __asm volatile("syncw" ::: "memory"); #endif }