From 8358c92174d4abd77b78174235a0406a2d019104 Mon Sep 17 00:00:00 2001
From: Taylor R Campbell <riastradh@NetBSD.org>
Date: Wed, 30 Mar 2022 20:33:03 +0000
Subject: [PATCH 03/50] x86: Every load is a load-acquire, so membar_consumer
 is a noop.

lfence is only needed for machine-dependent (MD) logic, such as
operations on I/O memory rather than normal cacheable memory, or
special instructions like RDTSC -- never for machine-independent (MI)
synchronization between threads/CPUs.  So there is no need to
hot-patch membar_consumer to use lfence.

(The x86_lfence function might reasonably be patched on i386 to do
lfence for MD logic, but it isn't now and this doesn't change that.)
---
 common/lib/libc/arch/i386/atomic/atomic.S   | 17 ++++----------
 common/lib/libc/arch/x86_64/atomic/atomic.S | 19 +++++----------
 sys/arch/amd64/include/frameasm.h           |  3 +--
 sys/arch/i386/include/frameasm.h            |  9 ++++---
 sys/arch/x86/x86/patch.c                    | 26 +++++++--------------
 5 files changed, 24 insertions(+), 50 deletions(-)

diff --git a/common/lib/libc/arch/i386/atomic/atomic.S b/common/lib/libc/arch/i386/atomic/atomic.S
index 53ef8e18f228..3dea25c1ce90 100644
--- a/common/lib/libc/arch/i386/atomic/atomic.S
+++ b/common/lib/libc/arch/i386/atomic/atomic.S
@@ -46,12 +46,10 @@
 #include "opt_xen.h"
 #include <machine/frameasm.h>
 #define LOCK			HOTPATCH(HP_NAME_NOLOCK, 1); lock
-#define HOTPATCH_SSE2_LFENCE	HOTPATCH(HP_NAME_SSE2_LFENCE, 7);
 #define HOTPATCH_SSE2_MFENCE	HOTPATCH(HP_NAME_SSE2_MFENCE, 7);
 #define HOTPATCH_CAS_64		HOTPATCH(HP_NAME_CAS_64, 49);
 #else
 #define LOCK			lock
-#define HOTPATCH_SSE2_LFENCE	/* nothing */
 #define HOTPATCH_SSE2_MFENCE	/* nothing */
 #define HOTPATCH_CAS_64		/* nothing */
 #endif
@@ -181,10 +179,11 @@ ENTRY(_atomic_cas_32_ni)
 END(_atomic_cas_32_ni)
 
 ENTRY(_membar_consumer)
-	HOTPATCH_SSE2_LFENCE
-	/* 7 bytes of instructions */
-	LOCK
-	addl	$0, -4(%esp)
+	/*
+	 * Every load from normal memory is a load-acquire on x86, so
+	 * there is never any need for explicit barriers to order
+	 * load-before-anything.
+	 */
 	ret
 END(_membar_consumer)
 
@@ -396,12 +395,6 @@ STRONG_ALIAS(_membar_exit,_membar_producer)
 #ifdef _HARDKERNEL
 	.section .rodata
 
-LABEL(sse2_lfence)
-	lfence
-	ret
-	nop; nop; nop;
-LABEL(sse2_lfence_end)
-
 LABEL(sse2_mfence)
 	mfence
 	ret
diff --git a/common/lib/libc/arch/x86_64/atomic/atomic.S b/common/lib/libc/arch/x86_64/atomic/atomic.S
index 2b2d843cf3d8..a483aa98de1d 100644
--- a/common/lib/libc/arch/x86_64/atomic/atomic.S
+++ b/common/lib/libc/arch/x86_64/atomic/atomic.S
@@ -15,7 +15,7 @@
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
- *      
+ *
  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
@@ -41,11 +41,9 @@
 #ifdef _HARDKERNEL
 #include <machine/frameasm.h>
 #define LOCK			HOTPATCH(HP_NAME_NOLOCK, 1); lock
-#define HOTPATCH_SSE2_LFENCE	HOTPATCH(HP_NAME_SSE2_LFENCE, 8);
 #define HOTPATCH_SSE2_MFENCE	HOTPATCH(HP_NAME_SSE2_MFENCE, 8);
 #else
 #define LOCK			lock
-#define HOTPATCH_SSE2_LFENCE	/* nothing */
 #define HOTPATCH_SSE2_MFENCE	/* nothing */
 #endif
 
@@ -256,10 +254,11 @@ END(_atomic_cas_64_ni)
 /* memory barriers */
 
 ENTRY(_membar_consumer)
-	HOTPATCH_SSE2_LFENCE
-	/* 8 bytes of instructions */
-	LOCK
-	addq	$0, -8(%rsp)
+	/*
+	 * Every load from normal memory is a load-acquire on x86, so
+	 * there is never any need for explicit barriers to order
+	 * load-before-anything.
+	 */
 	ret
 END(_membar_consumer)
 
@@ -419,12 +418,6 @@ STRONG_ALIAS(_membar_exit,_membar_producer)
 #ifdef _HARDKERNEL
 	.section .rodata
 
-LABEL(sse2_lfence)
-	lfence
-	ret
-	nop; nop; nop; nop;
-LABEL(sse2_lfence_end)
-
 LABEL(sse2_mfence)
 	mfence
 	ret
diff --git a/sys/arch/amd64/include/frameasm.h b/sys/arch/amd64/include/frameasm.h
index bbd30dd78e57..e82077dd8e03 100644
--- a/sys/arch/amd64/include/frameasm.h
+++ b/sys/arch/amd64/include/frameasm.h
@@ -63,8 +63,7 @@
 #define HP_NAME_SVS_ENTER_NMI	11
 #define HP_NAME_SVS_LEAVE_NMI	12
 #define HP_NAME_MDS_LEAVE	13
-#define HP_NAME_SSE2_LFENCE	14
-#define HP_NAME_SSE2_MFENCE	15
+#define HP_NAME_SSE2_MFENCE	14
 
 #define HOTPATCH(name, size) \
 123:						; \
diff --git a/sys/arch/i386/include/frameasm.h b/sys/arch/i386/include/frameasm.h
index f24d05b164d8..3467fa521046 100644
--- a/sys/arch/i386/include/frameasm.h
+++ b/sys/arch/i386/include/frameasm.h
@@ -48,11 +48,10 @@
 #define HP_NAME_STAC		2
 #define HP_NAME_NOLOCK		3
 #define HP_NAME_RETFENCE	4
-#define HP_NAME_SSE2_LFENCE	5
-#define HP_NAME_SSE2_MFENCE	6
-#define HP_NAME_CAS_64		7
-#define HP_NAME_SPLLOWER	8
-#define HP_NAME_MUTEX_EXIT	9
+#define HP_NAME_SSE2_MFENCE	5
+#define HP_NAME_CAS_64		6
+#define HP_NAME_SPLLOWER	7
+#define HP_NAME_MUTEX_EXIT	8
 
 #define HOTPATCH(name, size) \
 123:						; \
diff --git a/sys/arch/x86/x86/patch.c b/sys/arch/x86/x86/patch.c
index 4b91b67dc668..69efb230b05c 100644
--- a/sys/arch/x86/x86/patch.c
+++ b/sys/arch/x86/x86/patch.c
@@ -117,19 +117,6 @@ static const struct x86_hotpatch_descriptor hp_nolock_desc = {
 };
 __link_set_add_rodata(x86_hotpatch_descriptors, hp_nolock_desc);
 
-/* Use LFENCE if available, part of SSE2. */
-extern uint8_t sse2_lfence, sse2_lfence_end;
-static const struct x86_hotpatch_source hp_sse2_lfence_source = {
-	.saddr = &sse2_lfence,
-	.eaddr = &sse2_lfence_end
-};
-static const struct x86_hotpatch_descriptor hp_sse2_lfence_desc = {
-	.name = HP_NAME_SSE2_LFENCE,
-	.nsrc = 1,
-	.srcs = { &hp_sse2_lfence_source }
-};
-__link_set_add_rodata(x86_hotpatch_descriptors, hp_sse2_lfence_desc);
-
 /* Use MFENCE if available, part of SSE2. */
 extern uint8_t sse2_mfence, sse2_mfence_end;
 static const struct x86_hotpatch_source hp_sse2_mfence_source = {
@@ -342,12 +329,15 @@ x86_patch(bool early)
 
 	if (!early && (cpu_feature[0] & CPUID_SSE2) != 0) {
 		/*
-		 * Faster memory barriers.  We do not need to patch
-		 * membar_producer to use SFENCE because on x86
-		 * ordinary non-temporal stores are always issued in
-		 * program order to main memory and to other CPUs.
+		 * Faster memory barriers.  The only barrier x86 ever
+		 * requires for MI synchronization between CPUs is
+		 * MFENCE for store-before-load ordering; all other
+		 * ordering is guaranteed already -- every load is a
+		 * load-acquire and every store is a store-release.
+		 *
+		 * LFENCE and SFENCE are relevant only for MD logic
+		 * involving I/O devices or non-temporal stores.
 		 */
-		x86_hotpatch(HP_NAME_SSE2_LFENCE, 0);
 		x86_hotpatch(HP_NAME_SSE2_MFENCE, 0);
 	}