From a498c50a25a312b45b9ee585464d0e127e9e77ce Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Thu, 31 Mar 2022 12:06:30 +0000 Subject: [PATCH 48/49] mips/cavium: Take advantage of Octeon's guaranteed r/rw ordering. --- common/lib/libc/arch/mips/atomic/membar_ops.S | 94 +++++++++++++------ sys/arch/mips/include/asm.h | 40 +++++++- 2 files changed, 100 insertions(+), 34 deletions(-) diff --git a/common/lib/libc/arch/mips/atomic/membar_ops.S b/common/lib/libc/arch/mips/atomic/membar_ops.S index dae62c17d1d6..a1e2440791d8 100644 --- a/common/lib/libc/arch/mips/atomic/membar_ops.S +++ b/common/lib/libc/arch/mips/atomic/membar_ops.S @@ -38,44 +38,80 @@ LEAF(_membar_sync) j ra BDSYNC END(_membar_sync) +ATOMIC_OP_ALIAS(membar_sync,_membar_sync) + +STRONG_ALIAS(_membar_enter,_membar_sync) +ATOMIC_OP_ALIAS(membar_enter,_membar_sync) #ifdef __OCTEON__ + +/* + * cnMIPS guarantees load-before-load/store ordering without any + * barriers. So the only barriers we need are store-before-load (sync) + * and store-before-store (syncw, i.e., sync 4). See Table 2-32 + * `Execution Ordering Rules' on p. 104 of Cavium OCTEON III CN78XX + * Hardware Reference Manual, CN78XX-HM-0.99E, September 2014: + * + * First Operation DLD [load instruction to a physical + * address that is L2/DRAM] + * Second Operation Any + * Execution Ordering Comments + * + * The second operation cannot appear to execute before + * the first (DLD) operation, regardless of the presence + * or absence of SYNC* instructions. + * + * Note: I'm not sure if this applies to earlier cnMIPS -- I can't find + * it in the Cavium Networks OCTEON Plus CN50XX Hardware Reference + * Manual CN50XX-HM-0.99E, July 2008. Experimentally, on an erlite3 + * (Cavium Octeon CN5020-500), I can easily detect reordering of + * store-before-store and store-before-load, but I haven't been able to + * detect any reordering of load-before-load or load-before-store. 
+ * + * Note: On early cnMIPS (CN3xxx), there is an erratum which sometimes + * requires issuing two syncw's in a row. I don't know the details -- + * I don't have documentation -- and in Linux it is only used for I/O + * purposes. + * + * Currently we don't build kernels that work on both Octeon and + * non-Octeon MIPS CPUs, so none of this is done with binary patching. + * For userlands we could use a separate shared library on Octeon with + * ld.so.conf to override the symbols with cheaper definitions, but we + * don't do that now. + */ + +LEAF(_membar_acquire) + j ra + nop /* no barrier needed: loads are already ordered, see above */ +END(_membar_acquire) +ATOMIC_OP_ALIAS(membar_acquire,_membar_acquire) + +STRONG_ALIAS(_membar_consumer,_membar_acquire) +ATOMIC_OP_ALIAS(membar_consumer,_membar_acquire) + LEAF(_membar_release) - /* - * syncw is documented as ordering store-before-store in - * - * Cavium OCTEON III CN78XX Hardware Reference Manual, - * CN78XX-HM-0.99E, September 2014. - * - * It's unclear from the documentation the architecture - * guarantees load-before-store ordering without barriers, but - * this code assumes it does. If that assumption is wrong, we - * can only use syncw for membar_producer -- membar_release has - * to use the full sync. 
- */ j ra syncw END(_membar_release) -#endif +ATOMIC_OP_ALIAS(membar_release,_membar_release) -ATOMIC_OP_ALIAS(membar_sync,_membar_sync) -ATOMIC_OP_ALIAS(membar_acquire,_membar_sync) -STRONG_ALIAS(_membar_acquire,_membar_sync) -ATOMIC_OP_ALIAS(membar_enter,_membar_sync) -STRONG_ALIAS(_membar_enter,_membar_sync) -#ifdef __OCTEON__ -ATOMIC_OP_ALIAS(membar_exit,_membar_release) STRONG_ALIAS(_membar_exit,_membar_release) -ATOMIC_OP_ALIAS(membar_release,_membar_release) -ATOMIC_OP_ALIAS(membar_producer,_membar_release) +ATOMIC_OP_ALIAS(membar_exit,_membar_release) + STRONG_ALIAS(_membar_producer,_membar_release) -#else -ATOMIC_OP_ALIAS(membar_exit,_membar_sync) -STRONG_ALIAS(_membar_exit,_membar_sync) -ATOMIC_OP_ALIAS(membar_release,_membar_sync) +ATOMIC_OP_ALIAS(membar_producer,_membar_release) + +#else /* !__OCTEON__ */ + +STRONG_ALIAS(_membar_acquire,_membar_sync) +ATOMIC_OP_ALIAS(membar_acquire,_membar_sync) STRONG_ALIAS(_membar_release,_membar_sync) -ATOMIC_OP_ALIAS(membar_producer,_membar_sync) +ATOMIC_OP_ALIAS(membar_release,_membar_sync) +STRONG_ALIAS(_membar_exit,_membar_sync) +ATOMIC_OP_ALIAS(membar_exit,_membar_sync) +STRONG_ALIAS(_membar_consumer,_membar_sync) +ATOMIC_OP_ALIAS(membar_consumer,_membar_sync) STRONG_ALIAS(_membar_producer,_membar_sync) +ATOMIC_OP_ALIAS(membar_producer,_membar_sync) + #endif -ATOMIC_OP_ALIAS(membar_consumer,_membar_sync) -STRONG_ALIAS(_membar_consumer,_membar_sync) diff --git a/sys/arch/mips/include/asm.h b/sys/arch/mips/include/asm.h index 3103118b1d71..3d0eda76b6a1 100644 --- a/sys/arch/mips/include/asm.h +++ b/sys/arch/mips/include/asm.h @@ -572,12 +572,42 @@ _C_LABEL(x): /* compiler define */ #if defined(__OCTEON__) - /* early cnMIPS have erratum which means 2 */ -#define LLSCSYNC sync 4; sync 4 +/* + * cnMIPS guarantees load-before-load/store ordering without any + * barriers. So the only barriers we need are store-before-load (sync) + * and store-before-store (syncw, i.e., sync 4). 
See Table 2-32 + * `Execution Ordering Rules' on p. 104 of Cavium OCTEON III CN78XX + * Hardware Reference Manual, CN78XX-HM-0.99E, September 2014: + * + * First Operation DLD [load instruction to a physical + * address that is L2/DRAM] + * Second Operation Any + * Execution Ordering Comments + * + * The second operation cannot appear to execute before + * the first (DLD) operation, regardless of the presence + * or absence of SYNC* instructions. + * + * Note: I'm not sure if this applies to earlier cnMIPS -- I can't find + * it in the Cavium Networks OCTEON Plus CN50XX Hardware Reference + * Manual CN50XX-HM-0.99E, July 2008. + * + * However, cnMIPS also has a quirk where the store buffer can get + * clogged and we need to apply a plunger to it _after_ releasing a + * lock or else other CPUs may spin for hundreds of thousands of cycles + * before they see the lock is released. So we also have the quirky + * SYNC_PLUNGER barrier as syncw. + * + * Note: On early cnMIPS (CN3xxx), there is an erratum which sometimes + * requires issuing two syncw's in a row. I don't know the details -- + * I don't have documentation -- and in Linux it is only used for I/O + * purposes. + */ +#define LLSCSYNC /* nothing */ #define BDSYNC sync -#define BDSYNC_ACQ sync -#define SYNC_ACQ sync -#define SYNC_REL sync +#define BDSYNC_ACQ nop +#define SYNC_ACQ /* nothing */ +#define SYNC_REL sync 4 #define BDSYNC_PLUNGER sync 4 #define SYNC_PLUNGER sync 4 #elif __mips >= 3 || !defined(__mips_o32)