From 9d202e1515a05c7cce295f8e5a6f3afbeed1dfcf Mon Sep 17 00:00:00 2001
From: Taylor R Campbell <riastradh@NetBSD.org>
Date: Sat, 13 Nov 2021 13:02:04 +0000
Subject: [PATCH] arm: Fix CPU startup synchronization.

- Use load-acquire instead of (wrong) membar_consumer then load in
  cpu_boot_secondary_processors and cpu_hatched_p.

  => (Could use load then membar_consumer instead but load-acquire is
     shorter.)

- Issue membar_exit before setting or clearing the bit in
  cpu_set_hatched and cpu_clr_mbox.  (A sketch of the resulting
  release/acquire pairing follows this list.)
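
As a minimal sketch of that pairing, with a hypothetical handshake
word standing in for arm_cpu_hatched[]/arm_cpu_mbox[] (the primitives
are the <sys/atomic.h> ones already used in cpu_subr.c):

	#include <sys/atomic.h>		/* membar_*(), atomic_*() */

	static volatile unsigned long flag;	/* hypothetical handshake word */

	/*
	 * Writer (secondary CPU): release.  membar_exit() orders all
	 * earlier stores before the store that sets the bit.
	 */
	void
	publish(unsigned long bit)
	{
		membar_exit();
		atomic_or_ulong(&flag, bit);
	}

	/*
	 * Reader (boot CPU): acquire.  The load is ordered before every
	 * later load and store.  Note that membar_consumer() *before* a
	 * plain load orders only earlier loads against it, which is not
	 * the guarantee needed here.
	 */
	void
	wait_for(unsigned long bit)
	{
		while ((atomic_load_acquire(&flag) & bit) == 0)
			continue;
	}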

This way, everything that happens before cpu_set_hatched or
cpu_clr_mbox is guaranteed to happen before everything that follows
cpu_boot_secondary_processors; previously this was not guaranteed.
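
Concretely (the payload variable, the value 42, and the KASSERT are
hypothetical, standing in for the cpu_info/kcpuset state the secondary
initializes before the handshake; off/bit are as in cpu_subr.c):

	unsigned long payload;			/* hypothetical pre-handshake data */

	/* Secondary CPU: end of cpu_hatch(), then cpu_clr_mbox(). */
	payload = 42;				/* any store before the handshake */
	membar_exit();				/* release */
	atomic_and_ulong(&arm_cpu_mbox[off], ~bit);
	dsb(ishst);
	sev();

	/* Boot CPU: cpu_boot_secondary_processors(). */
	while (atomic_load_acquire(&arm_cpu_mbox[off]) & bit)
		__asm __volatile ("wfe");
	KASSERT(payload == 42);			/* the earlier store is visible */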
---
 sys/arch/aarch64/aarch64/locore.S |  3 +--
 sys/arch/arm/arm/armv6_start.S    |  4 ++--
 sys/arch/arm/arm/cpu_subr.c       | 10 +++++++---
 3 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/sys/arch/aarch64/aarch64/locore.S b/sys/arch/aarch64/aarch64/locore.S
index 04ccf19b0805..65e15a1650dd 100644
--- a/sys/arch/aarch64/aarch64/locore.S
+++ b/sys/arch/aarch64/aarch64/locore.S
@@ -520,8 +520,7 @@ mp_vstart:
 
 	/* wait for the mailbox start bit to become true */
 1:
-	dmb	sy
-	ldr	x20, [x28]
+	ldar	x20, [x28]	/* matches cpu_boot_secondary_processors */
 	tst	x20, x29
 	bne	9f
 	wfe
diff --git a/sys/arch/arm/arm/armv6_start.S b/sys/arch/arm/arm/armv6_start.S
index 5ac3121a3a4b..9508f89244f8 100644
--- a/sys/arch/arm/arm/armv6_start.S
+++ b/sys/arch/arm/arm/armv6_start.S
@@ -924,8 +924,8 @@ armv7_mpcontinuation:
 	lsl	r5, R_INDEX			// ... for our cpu
 
 	/* wait for the mailbox start bit to become true */
-1:	dmb					// data memory barrier
-	ldr	r2, [r6]			// load mbox
+1:	ldr	r2, [r6]			// load mbox
+	dmb					//    make it a load-acquire
 	tst	r2, r5				// is our bit set?
 	wfeeq					//    no, back to waiting
 	beq	1b				//    no, and try again
diff --git a/sys/arch/arm/arm/cpu_subr.c b/sys/arch/arm/arm/cpu_subr.c
index fcbf500fcbd6..47658cd6f3b4 100644
--- a/sys/arch/arm/arm/cpu_subr.c
+++ b/sys/arch/arm/arm/cpu_subr.c
@@ -81,6 +81,7 @@ cpu_boot_secondary_processors(void)
 	VPRINTF("%s: starting secondary processors\n", __func__);
 
 	/* send mbox to have secondary processors do cpu_hatch() */
+	membar_exit();  /* store-release matches locore.S/armv6_start.S */
 	for (size_t n = 0; n < __arraycount(arm_cpu_mbox); n++)
 		atomic_or_ulong(&arm_cpu_mbox[n], arm_cpu_hatched[n]);
 
@@ -95,7 +96,8 @@ cpu_boot_secondary_processors(void)
 		const size_t off = cpuno / CPUINDEX_DIVISOR;
 		const u_long bit = __BIT(cpuno % CPUINDEX_DIVISOR);
 
-		while (membar_consumer(), arm_cpu_mbox[off] & bit) {
+		/* load-acquire matches cpu_clr_mbox */
+		while (atomic_load_acquire(&arm_cpu_mbox[off]) & bit) {
 			__asm __volatile ("wfe");
 		}
 		/* Add processor to kcpuset */
@@ -111,8 +113,8 @@ cpu_hatched_p(u_int cpuindex)
 	const u_int off = cpuindex / CPUINDEX_DIVISOR;
 	const u_int bit = cpuindex % CPUINDEX_DIVISOR;
 
-	membar_consumer();
-	return (arm_cpu_hatched[off] & __BIT(bit)) != 0;
+	/* load-acquire matches cpu_set_hatched */
+	return (atomic_load_acquire(&arm_cpu_hatched[off]) & __BIT(bit)) != 0;
 }
 
 void
@@ -122,6 +124,7 @@ cpu_set_hatched(int cpuindex)
 	const size_t off = cpuindex / CPUINDEX_DIVISOR;
 	const u_long bit = __BIT(cpuindex % CPUINDEX_DIVISOR);
 
+	membar_exit();		/* store-release matches cpu_hatched_p */
 	atomic_or_ulong(&arm_cpu_hatched[off], bit);
 	dsb(ishst);
 	sev();
@@ -135,6 +138,7 @@ cpu_clr_mbox(int cpuindex)
 	const u_long bit = __BIT(cpuindex % CPUINDEX_DIVISOR);
 
 	/* Notify cpu_boot_secondary_processors that we're done */
+	membar_exit();		/* store-release */
 	atomic_and_ulong(&arm_cpu_mbox[off], ~bit);
 	dsb(ishst);
 	sev();