From 5788052254df618cfb3913105da3e5f8f4520a79 Mon Sep 17 00:00:00 2001
From: Taylor R Campbell <riastradh@NetBSD.org>
Date: Wed, 6 Apr 2022 21:10:56 +0000
Subject: [PATCH 46/49] powerpc: Rework store-store and release/acquire
 membars.

- For userland code, use the cheapest instruction encoding that has
  the same meaning on classic powerpc and booke powerpc.  This means
  avoiding lwsync -- using only classic sync (L=0), which on booke
  encodes msync with the same semantics, and classic eieio, which on
  booke encodes mbar 0 with stronger semantics.

  lwsync will execute in userland but only by trapping to the kernel
  for emulation, which is no doubt more expensive than just issuing a
  full sync.

- For kernel code, use classic powerpc or booke powerpc barrier
  instructions according to what the kernel is being built for.

  . On classic powerpc, take advantage of lwsync (acq/rel, i.e. r/rw
    and rw/w) for membar_acquire/release and eieio (w/w) for
    membar_producer.

  . On booke, take advantage of mbar 1 (w/w) for membar_producer.

  . For !MULTIPROCESSOR, skip the instructions altogether.

  Keep this organized by macros in machine/asm.h.

For now, this imposes a performance penalty on classic powerpc by
using sync where lwsync would work.  We could fix this with an
ld.so.conf for classic powerpc membars, to use lwsync where possible.

Put notes and references in membar_ops.S for future editors and
auditors.
---
 .../lib/libc/arch/powerpc/atomic/membar_ops.S | 199 +++++++++++-------
 sys/arch/powerpc/include/asm.h                |  35 +++
 sys/arch/powerpc/powerpc/lock_stubs.S         |   9 -
 3 files changed, 154 insertions(+), 89 deletions(-)

diff --git a/common/lib/libc/arch/powerpc/atomic/membar_ops.S b/common/lib/libc/arch/powerpc/atomic/membar_ops.S
index d254e7dc2d37..4af5ad38da66 100644
--- a/common/lib/libc/arch/powerpc/atomic/membar_ops.S
+++ b/common/lib/libc/arch/powerpc/atomic/membar_ops.S
@@ -33,106 +33,145 @@ __RCSID("$NetBSD: membar_ops.S,v 1.4 2011/01/15 07:31:11 matt Exp $")
 
+/*
+ * Classic PowerPC and Book E have slightly different synchronization
+ * instructions, with different meanings for overlapping encodings:
+ *
+ *	21:30	L/MO	Classic PowerPC		Book E
+ *	598	L=0	sync or hwsync, rw/rw	msync, rw/rw
+ *	598	L=1	lwsync, r/rw and rw/w	(undefined)
+ *	854	MO=0	eieio, w/w		mbar 0, rw/rw
+ *	854	MO=1	(undefined)		mbar 1, w/w (e500/e200 only?)
+ *
+ * References:
+ *
+ *	(Classic PowerPC) PowerPC Virtual Environment Architecture,
+ *	Book II, Version 2.01, December 2003, IBM.
+ *	https://archive.org/details/bitsavers_ibmpowerpcvironmentArchitectureBookIIVer2.01200312_275529
+ *
+ *	Book E: Enhanced PowerPC Architecture, Version 1.0, May 7,
+ *	2002, Freescale Semiconductor.
+ *	https://www.nxp.com/docs/en/user-guide/BOOK_EUM.pdf
+ *
+ *	EREF: A Programmer's Reference Manual for Freescale Power
+ *	Architecture Processors, EREF_RM, Rev. 1 (EIS 2.1), 06/2014,
+ *	Freescale Semiconductor.
+ *	https://www.nxp.com/files-static/32bit/doc/ref_manual/EREF_RM.pdf
+ *
+ *	PowerPC e500 Core Family Reference Manual, E500CORERM, Rev. 1,
+ *	4/2005, Freescale Semiconductor.
+ *	https://www.nxp.com/docs/en/reference-manual/E500CORERM.pdf
+ *
+ * There is an erratum in AN3441 about mbar 1 in e500 CPUs:
+ *
+ *	Coherency and Synchronization Requirements for PowerQUICC III,
+ *	Application Note AN3441, Rev. 1, 12/2007, Freescale
+ *	Semiconductor.
+ *	https://www.nxp.com/docs/en/application-note/AN3441.pdf
+ *
+ * However, it is only applicable to store-load ordering on
+ * cache-inhibited guarded memory for memory-mapped I/O devices, not
+ * the regular memory to which membar_* applies:
+ *
+ *	`mbar MO=1 fails to properly order caching-inhibited guarded
+ *	loads with respect to caching-inhibited guarded stores.  For
+ *	guaranteed store-load ordering to cache-inhibited guarded
+ *	memory, mbar with MO=0 or msync is required.  The failure is
+ *	limited to cache-inhibited loads bypassing the mbar MO=1
+ *	barrier.  The mbar MO=1 instruction correctly enforces
+ *	ordering of cache-inhibited stores to cache-inhibited stores,
+ *	and the ordering of cacheable stores to cacheable stores.'
+ *
+ * For kernel code, we can pick classic or Book E because we use
+ * different kernels for the two, but the same userland has to work on
+ * both, so for userland we limit ourselves to sync and eieio.
+ *
+ * It is tempting to use isync to order load-before-load/store.
+ * However, isync orders prior loads only if their value flows into a
+ * control-flow dependency prior to the isync:
+ *
+ *	`[I]f an isync follows a conditional Branch instruction that
+ *	depends on the value returned by a preceding Load instruction,
+ *	the load on which the Branch depends is performed before any
+ *	loads caused by instructions following the isync.  This applies
+ *	even if the effects of the ``dependency'' are independent of
+ *	the value loaded (e.g., the value is compared to itself and
+ *	the Branch tests the EQ bit in the selected CR field), and
+ *	even if the branch target is the sequentially next
+ *	instruction.'
+ *
+ *	--PowerPC Virtual Environment Architecture, Book II, Version
+ *	2.01, December 2003, 1.7.1 `Storage Access Ordering', p. 7.
+ *
+ * membar_acquire, however, must order _all_ prior loads, even if they
+ * do not flow into any control-flow dependency preceding the isync.
+ * For example:
+ *
+ *	x = *p;
+ *	membar_acquire();
+ *	if (x) goto foo;
+ *
+ * This can't be implemented by:
+ *
+ *	lwz	x, p
+ *	isync
+ *	cmpwi	cc, x, 0
+ *	bne	cc, foo
+ *
+ * isync doesn't work here because there's no control-flow dependency
+ * on x between the lwz x, p and the isync.  The ordering would be
+ * guaranteed by
+ *
+ *	lwz	x, p
+ *	cmpwi	cc, x, 0
+ *	bne	cc, foo
+ *	isync
+ *	...
+ * foo:	isync
+ *
+ * But that would require some compiler support to ensure there is a
+ * branch between the load and the isync.  Making this connection with
+ * membar_acquire requires a clever compiler to notice the control
+ * flow.  atomic_load_acquire is easier because it has the load as
+ * part of the operation, so compilers will usually implement
+ * x = atomic_load_acquire(p) as
+ *
+ *	lwz	x, p
+ *	cmpw	cc, x, x
+ *	bne-	cc, 1f
+ * 1:	isync
+ */
+
 	.text
 ENTRY(_membar_acquire)
-	/*
-	 * It is tempting to use isync to order load-before-load/store.
-	 * However, isync orders prior loads only if their value flows
-	 * into a control-flow dependency prior to the isync:
-	 *
-	 *	`[I]f an isync follows a conditional Branch instruction
-	 *	that depends on the value returned by a preceding Load
-	 *	instruction, the load on which the Branch depends is
-	 *	performed before any loads caused by instructions
-	 *	following the isync.  This applies even if the effects
-	 *	of the ``dependency'' are independent of the value
-	 *	loaded (e.g., the value is compared to itself and the
-	 *	Branch tests the EQ bit in the selected CR field), and
-	 *	even if the branch target is the sequentially next
-	 *	instruction.'
-	 *
-	 *	--PowerPC Virtual Environment Architecture, Book II,
-	 *	Version 2.01, December 2003, 1.7.1 `Storage Access
-	 *	Ordering', p. 7.
-	 *
-	 * We are required here, however, to order _all_ prior loads,
-	 * even if they do not flow into any control flow dependency.
-	 * For example:
-	 *
-	 *	x = *p;
-	 *	membar_acquire();
-	 *	if (x) goto foo;
-	 *
-	 * This can't be implemented by:
-	 *
-	 *	lwz	x, p
-	 *	isync
-	 *	cmpwi	x, 0
-	 *	bne	foo
-	 *
-	 * isync doesn't work here because there's no conditional
-	 * dependency on x between the lwz x, p and the isync.
-	 *
-	 * isync would only work if it followed the branch:
-	 *
-	 *	lwz	x, p
-	 *	isync
-	 *	cmpwi	x, 0
-	 *	bne	foo
-	 *	...
-	 * foo:	isync
-	 *	...
-	 *
-	 * lwsync orders everything except store-before-load, so it
-	 * serves here -- see below in membar_release in lwsync.
-	 * Except we can't use it on booke, so use sync for now.
-	 */
-	sync
+	LWSYNC	/* acq/rel -- r/rw _and_ rw/w (but not w/r) */
 	blr
 END(_membar_acquire)
 ATOMIC_OP_ALIAS(membar_acquire,_membar_acquire)
 
 ENTRY(_membar_release)
-	/*
-	 * `The memory barrier provides an ordering function for
-	 *  the storage accesses caused by Load, Store, and dcbz
-	 *  instructions that are executed by the processor
-	 *  executing the [lwsync] instruction and for which the
-	 *  specified storage location is in storage that is
-	 *  Memory Coherence Required and is neither Write Through
-	 *  Required nor Caching Inhibited.  The applicable pairs
-	 *  are all pairs a_i, b_j of such accesses except those
-	 *  in which a_i is an access caused by a Store or dcbz
-	 *  instruction and b_j is an access caused by a Load
-	 *  instruction.'
-	 *
-	 *	--PowerPC Virtual Environment Architecture, Book II,
-	 *	Version 2.01, December 2003, 3.3.3 `Memory Barrier
-	 *	Instructions', p. 25.
-	 *
-	 * In brief, lwsync is an acquire-release barrier -- it orders
-	 * load-before-load/store and load/store-before-store, but not
-	 * store-before-load.  Except we can't use it on booke, so use
-	 * sync for now.
-	 */
-	sync
+	LWSYNC	/* acq/rel -- r/rw _and_ rw/w (but not w/r) */
 	blr
 END(_membar_release)
 ATOMIC_OP_ALIAS(membar_release,_membar_release)
 
+ENTRY(_membar_producer)
+	SYNC_STORE	/* w/w (and, with eieio, unrelated I/O ordering) */
+	blr
+END(_membar_producer)
+ATOMIC_OP_ALIAS(membar_producer,_membar_producer)
+
 ENTRY(_membar_sync)
 	/*
 	 * sync, or `heavyweight sync', is a full sequential
-	 * consistency barrier.
+	 * consistency barrier (rw/rw).
 	 */
-	sync
+	SYNC
 	blr
 END(_membar_sync)
 ATOMIC_OP_ALIAS(membar_sync,_membar_sync)
 
-ATOMIC_OP_ALIAS(membar_producer,_membar_release)
-STRONG_ALIAS(_membar_producer,_membar_release)
 ATOMIC_OP_ALIAS(membar_consumer,_membar_acquire)
 STRONG_ALIAS(_membar_consumer,_membar_acquire)
 ATOMIC_OP_ALIAS(membar_enter,_membar_sync)
diff --git a/sys/arch/powerpc/include/asm.h b/sys/arch/powerpc/include/asm.h
index 9b61a80e270f..165dc12d771f 100644
--- a/sys/arch/powerpc/include/asm.h
+++ b/sys/arch/powerpc/include/asm.h
@@ -34,6 +34,11 @@
 #ifndef _PPC_ASM_H_
 #define _PPC_ASM_H_
 
+#ifdef _KERNEL_OPT
+#include "opt_multiprocessor.h"
+#include "opt_ppcarch.h"
+#endif
+
 #ifdef _LP64
 
 /* ppc64 is always PIC, r2 is always the TOC */
@@ -450,4 +455,34 @@ y:	.quad	.##y,.TOC.@tocbase,0;	\
 #define	IBM405_ERRATA77_SYNC	/* nothing */
 #endif
 
+#if defined(_KERNEL) && !defined(MULTIPROCESSOR)
+
+#define	ISYNC		/* nothing */
+#define	SYNC		/* nothing */
+#define	LWSYNC		/* nothing */
+#define	SYNC_STORE	/* nothing */
+
+#else	/* !_KERNEL, or _KERNEL && MULTIPROCESSOR */
+
+/*
+ * See common/lib/libc/arch/powerpc/atomic/membar_ops.S for notes and
+ * references.
+ */
+
+#define	ISYNC		isync
+#define	SYNC		sync
+
+#if !defined(_KERNEL)
+#define	LWSYNC		sync	/* rw/rw */
+#define	SYNC_STORE	eieio	/* cheaper w/w on classic, rw/rw on booke */
+#elif defined(PPC_BOOKE)
+#define	LWSYNC		msync	/* rw/rw; no cheaper r/rw or rw/w barrier */
+#define	SYNC_STORE	mbar 1	/* w/w, same as classic eieio */
+#else	/* _KERNEL && !PPC_BOOKE */
+#define	LWSYNC		lwsync	/* acq/rel (r/rw and rw/w) */
+#define	SYNC_STORE	eieio	/* w/w (plus I/O ordering) */
+#endif
+
+#endif
+
 #endif /* !_PPC_ASM_H_ */
diff --git a/sys/arch/powerpc/powerpc/lock_stubs.S b/sys/arch/powerpc/powerpc/lock_stubs.S
index 1a63a5b90bdf..62cc593916f1 100644
--- a/sys/arch/powerpc/powerpc/lock_stubs.S
+++ b/sys/arch/powerpc/powerpc/lock_stubs.S
@@ -34,16 +34,7 @@
 #include "assym.h"
 
 #ifdef _KERNEL_OPT
-#include "opt_multiprocessor.h"
 #include "opt_lockdebug.h"
-#endif
-
-#if defined(MULTIPROCESSOR)
-#define	ISYNC	isync
-#define	SYNC	sync
-#else
-#define	ISYNC	/* nothing */
-#define	SYNC	/* nothing */
 #endif
 
 	.text
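
Note for reviewers: the w/w (membar_producer) and r/r (membar_consumer)
barriers above are the two halves of the usual producer/consumer
handoff.  A minimal sketch of that pattern in C against the
membar_ops(3) interface -- the shared variables `data' and `ready' are
illustrative only, not part of this patch:

	#include <sys/atomic.h>

	volatile unsigned data;		/* payload */
	volatile unsigned ready;	/* handoff flag */

	void
	produce(unsigned x)
	{
		data = x;		/* store payload */
		membar_producer();	/* w/w: payload before flag */
		ready = 1;		/* store flag */
	}

	unsigned
	consume(void)
	{
		while (ready == 0)	/* load flag */
			continue;
		membar_consumer();	/* r/r: flag before payload */
		return data;		/* load payload */
	}

With this patch, membar_producer issues eieio (classic) or mbar 1
(booke) instead of a full sync, and membar_consumer, now an alias for
membar_acquire, issues lwsync on classic MULTIPROCESSOR kernels.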