From 4ddc4a42dbba14ba966a3586f409bd390eb4527b Mon Sep 17 00:00:00 2001
From: Taylor R Campbell <riastradh@NetBSD.org>
Date: Sat, 12 Feb 2022 03:19:54 +0000
Subject: [PATCH] mips: Eradicate last vestiges of bonkers mb_* barriers.

Leave an essay with citations about why we have an apparently
pointless syncw _after_ releasing a lock, just for cnmips.
---
 common/lib/libc/arch/mips/atomic/membar_ops.S | 10 ---
 sys/arch/mips/include/lock.h                  | 88 ++++++++-----------
 2 files changed, 36 insertions(+), 62 deletions(-)

diff --git a/common/lib/libc/arch/mips/atomic/membar_ops.S b/common/lib/libc/arch/mips/atomic/membar_ops.S
index 03fcb5a32e0b..ecdc8ecbdf6b 100644
--- a/common/lib/libc/arch/mips/atomic/membar_ops.S
+++ b/common/lib/libc/arch/mips/atomic/membar_ops.S
@@ -46,16 +46,6 @@ LEAF(_membar_producer)
 END(_membar_producer)
 #endif
 
-#ifdef _KERNEL
-STRONG_ALIAS(mb_read, _membar_sync)
-#ifdef __OCTEON__
-STRONG_ALIAS(mb_write, _membar_producer)
-#else
-STRONG_ALIAS(mb_write, _membar_sync)
-#endif
-STRONG_ALIAS(mb_memory, _membar_sync)
-#endif
-
 ATOMIC_OP_ALIAS(membar_sync,_membar_sync)
 ATOMIC_OP_ALIAS(membar_enter,_membar_sync)
 STRONG_ALIAS(_membar_enter,_membar_sync)
diff --git a/sys/arch/mips/include/lock.h b/sys/arch/mips/include/lock.h
index 4317bd1af8e8..837ffc27329a 100644
--- a/sys/arch/mips/include/lock.h
+++ b/sys/arch/mips/include/lock.h
@@ -41,6 +41,8 @@
 
 #include <sys/param.h>
 
+#include <sys/atomic.h>
+
 static __inline int
 __SIMPLELOCK_LOCKED_P(const __cpu_simple_lock_t *__ptr)
 {
@@ -98,58 +100,11 @@ __cpu_simple_lock_try(__cpu_simple_lock_t *lp)
 	return (v0 != 0);
 }
 
-#ifdef MIPS1
-static __inline void
-mb_read(void)
-{
-	__insn_barrier();
-}
-
-static __inline void
-mb_write(void)
-{
-	__insn_barrier();
-}
-
-static __inline void
-mb_memory(void)
-{
-	__insn_barrier();
-}
-#else	/* MIPS1*/
-static __inline void
-mb_read(void)
-{
-	__asm volatile(
-		"	.set push		\n"
-		"	.set mips2		\n"
-		"	sync			\n"
-		"	.set pop"
-		::: "memory"
-	);
-}
-
-static __inline void
-mb_write(void)
-{
-	mb_read();
-}
-
-static __inline void
-mb_memory(void)
-{
-	mb_read();
-}
-#endif	/* MIPS1 */
-
 #else	/* !_HARDKERNEL */
 
 u_int	_atomic_cas_uint(volatile u_int *, u_int, u_int);
 u_long	_atomic_cas_ulong(volatile u_long *, u_long, u_long);
 void *	_atomic_cas_ptr(volatile void *, void *, void *);
-void	mb_read(void);
-void	mb_write(void);
-void	mb_memory(void);
 
 static __inline int
 __cpu_simple_lock_try(__cpu_simple_lock_t *lp)
@@ -167,7 +122,6 @@ __cpu_simple_lock_init(__cpu_simple_lock_t *lp)
 {
 
 	*lp = __SIMPLELOCK_UNLOCKED;
-	mb_memory();
 }
 
 static __inline void
@@ -184,12 +138,42 @@ static __inline void
 __cpu_simple_unlock(__cpu_simple_lock_t *lp)
 {
 
-#ifndef _MIPS_ARCH_OCTEONP
-	mb_memory();
-#endif
+	membar_exit();
 	*lp = __SIMPLELOCK_UNLOCKED;
 #ifdef _MIPS_ARCH_OCTEONP
-	mb_write();
+	/*
+	 * On Cavium's recommendation, we issue an extra SYNCW that is
+	 * not necessary for correct ordering because apparently stores
+	 * can get stuck indefinitely in Octeon store buffers,
+	 * according to the following note:
+	 *
+	 *	Programming Notes:
+	 *	[...]
+	 *	Core A (writer)
+	 *	SW R1, DATA
+	 *	LI R2, 1
+	 *	SYNCW
+	 *	SW R2, FLAG
+	 *	SYNCW
+	 *	[...]
+	 *
+	 *	The second SYNCW instruction executed by core A is not
+	 *	necessary for correctness, but has very important
+	 *	performance effects on OCTEON.  Without it, the store
+	 *	to FLAG may linger in core A's write buffer before it
+	 *	becomes visible to other cores.  (If core A is not
+	 *	performing many stores, this may add hundreds of
+	 *	thousands of cycles to the flag release time since the
+	 *	OCTEON core normally retains stores to attempt to merge
+	 *	them before sending the store on the CMB.)
+	 *	Applications should include this second SYNCW
+	 *	instruction after flag or lock releases.
+	 *
+	 * Cavium Networks OCTEON Plus CN50XX Hardware Reference
+	 * Manual, July 2008, Appendix A, p. 943.
+	 * https://storage.googleapis.com/google-code-archive-downloads/v2/code.google.com/hactive/CN50XX-HRM-V0.99E.pdf
+	 */
+	__asm volatile("syncw" ::: "memory");
 #endif
 }