From 0002fed8bcd99c178133d434bcc4cd65bf858bee Mon Sep 17 00:00:00 2001
From: Taylor R Campbell
Date: Sat, 13 Nov 2021 13:02:04 +0000
Subject: [PATCH] arm: Fix CPU startup synchronization.

- Use load-acquire instead of (wrong) membar_consumer then load in
  cpu_boot_secondary_processors and cpu_hatched_p.
  => (Could use load then membar_consumer instead but load-acquire is
     shorter.)
- Issue dmb ish before setting or clearing the bit in cpu_set_hatched
  and cpu_clr_mbox to effect a store-release.
  => (Could use membar_exit, which is semantically weaker than dmb ish
     but on arm is just implemented as dmb ish.)
  => (Could use stlr except we don't have atomic_ops(9) to do that.)

This way, everything before cpu_set_hatched or cpu_clr_mbox is
guaranteed to happen before everything after
cpu_boot_secondary_processors, which was previously not guaranteed.
---
 sys/arch/aarch64/aarch64/locore.S |  3 +--
 sys/arch/arm/arm/armv6_start.S    |  4 ++--
 sys/arch/arm/arm/cpu_subr.c       | 10 +++++++---
 3 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/sys/arch/aarch64/aarch64/locore.S b/sys/arch/aarch64/aarch64/locore.S
index 04ccf19b0805..65e15a1650dd 100644
--- a/sys/arch/aarch64/aarch64/locore.S
+++ b/sys/arch/aarch64/aarch64/locore.S
@@ -520,8 +520,7 @@ mp_vstart:
 
 	/* wait for the mailbox start bit to become true */
 1:
-	dmb	sy
-	ldr	x20, [x28]
+	ldar	x20, [x28]	/* matches cpu_boot_secondary_processors */
 	tst	x20, x29
 	bne	9f
 	wfe
diff --git a/sys/arch/arm/arm/armv6_start.S b/sys/arch/arm/arm/armv6_start.S
index 5ac3121a3a4b..9508f89244f8 100644
--- a/sys/arch/arm/arm/armv6_start.S
+++ b/sys/arch/arm/arm/armv6_start.S
@@ -924,8 +924,8 @@ armv7_mpcontinuation:
 	lsl	r5, R_INDEX		// ... for our cpu
 
 	/* wait for the mailbox start bit to become true */
-1:	dmb				// data memory barrier
-	ldr	r2, [r6]		// load mbox
+1:	ldr	r2, [r6]		// load mbox
+	dmb				// make it a load-acquire
 	tst	r2, r5			// is our bit set?
 	wfeeq				// no, back to waiting
 	beq	1b			// no, and try again
diff --git a/sys/arch/arm/arm/cpu_subr.c b/sys/arch/arm/arm/cpu_subr.c
index fcbf500fcbd6..8b622843a7f6 100644
--- a/sys/arch/arm/arm/cpu_subr.c
+++ b/sys/arch/arm/arm/cpu_subr.c
@@ -81,6 +81,7 @@ cpu_boot_secondary_processors(void)
 	VPRINTF("%s: starting secondary processors\n", __func__);
 
 	/* send mbox to have secondary processors do cpu_hatch() */
+	dmb(ish);	/* store-release matches locore.S/armv6_start.S */
 	for (size_t n = 0; n < __arraycount(arm_cpu_mbox); n++)
 		atomic_or_ulong(&arm_cpu_mbox[n], arm_cpu_hatched[n]);
 
@@ -95,7 +96,8 @@ cpu_boot_secondary_processors(void)
 		const size_t off = cpuno / CPUINDEX_DIVISOR;
 		const u_long bit = __BIT(cpuno % CPUINDEX_DIVISOR);
 
-		while (membar_consumer(), arm_cpu_mbox[off] & bit) {
+		/* load-acquire matches cpu_clr_mbox */
+		while (atomic_load_acquire(&arm_cpu_mbox[off]) & bit) {
 			__asm __volatile ("wfe");
 		}
 		/* Add processor to kcpuset */
@@ -111,8 +113,8 @@ cpu_hatched_p(u_int cpuindex)
 	const u_int off = cpuindex / CPUINDEX_DIVISOR;
 	const u_int bit = cpuindex % CPUINDEX_DIVISOR;
 
-	membar_consumer();
-	return (arm_cpu_hatched[off] & __BIT(bit)) != 0;
+	/* load-acquire matches cpu_set_hatched */
+	return (atomic_load_acquire(&arm_cpu_hatched[off]) & __BIT(bit)) != 0;
 }
 
 void
@@ -122,6 +124,7 @@ cpu_set_hatched(int cpuindex)
 	const size_t off = cpuindex / CPUINDEX_DIVISOR;
 	const u_long bit = __BIT(cpuindex % CPUINDEX_DIVISOR);
 
+	dmb(ish);	/* store-release matches cpu_hatched_p */
 	atomic_or_ulong(&arm_cpu_hatched[off], bit);
 	dsb(ishst);
 	sev();
@@ -135,6 +138,7 @@ cpu_clr_mbox(int cpuindex)
 	const u_long bit = __BIT(cpuindex % CPUINDEX_DIVISOR);
 
 	/* Notify cpu_boot_secondary_processors that we're done */
+	dmb(ish);	/* store-release */
 	atomic_and_ulong(&arm_cpu_mbox[off], ~bit);
 	dsb(ishst);
 	sev();
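
For illustration only, not part of the patch: a minimal C11 sketch of the
release/acquire pairing the change establishes, using <stdatomic.h> in place
of the kernel's atomic_ops(9)/atomic_loadstore(9) primitives.  The names
mbox, hatch_data, boot_cpu_release, and secondary_cpu_acquire are
hypothetical.

#include <stdatomic.h>

static int hatch_data;		/* written before the handshake */
static atomic_ulong mbox;	/* one bit per secondary CPU */

/* Boot CPU: publish hatch_data, then set the mailbox bit (release). */
static void
boot_cpu_release(unsigned long bit)
{
	hatch_data = 42;
	/* store-release: everything above is visible before the bit is */
	atomic_fetch_or_explicit(&mbox, bit, memory_order_release);
}

/* Secondary CPU: spin with load-acquire, then read hatch_data safely. */
static int
secondary_cpu_acquire(unsigned long bit)
{
	while (!(atomic_load_explicit(&mbox, memory_order_acquire) & bit))
		continue;	/* the real code idles with wfe here */
	return hatch_data;	/* guaranteed to observe the write above */
}

In the patch, the dmb(ish) issued before the atomic store plays the role of
memory_order_release, and ldar/atomic_load_acquire plays the role of
memory_order_acquire.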