From 3459ea79fcc0232d8c18ea33583e8b6f455cbd27 Mon Sep 17 00:00:00 2001
From: Taylor R Campbell
Date: Fri, 3 Dec 2021 12:56:00 +0000
Subject: [PATCH 1/2] runtime: Use REP STOS for memclr, no vectors or nontemporal stores.

Clear n/8 quadwords with REP STOSQ and the n%8 trailing bytes with
REP STOSB.  The size dispatch, the SSE/AVX2 loops and the nontemporal
path are removed, and with them the go_asm.h include that supplied
const_offsetX86HasAVX2.  The build constraint is left untouched.
---
 src/runtime/memclr_amd64.s | 174 ++++------------------------------------------------------------
 1 file changed, 12 insertions(+), 162 deletions(-)

diff --git a/src/runtime/memclr_amd64.s b/src/runtime/memclr_amd64.s
index 700bbd7b9b..7a61881811 100644
--- a/src/runtime/memclr_amd64.s
+++ b/src/runtime/memclr_amd64.s
@@ -4,7 +4,6 @@
 
 //go:build !plan9
 
-#include "go_asm.h"
 #include "textflag.h"
 
 // See memclrNoHeapPointers Go doc for important implementation constraints.
@@ -15,165 +14,16 @@ TEXT runtime·memclrNoHeapPointers(SB), NOSPLIT, $0-16
 	// AX = ptr
 	// BX = n
 	MOVQ	AX, DI	// DI = ptr
-	XORQ	AX, AX
-
-	// MOVOU seems always faster than REP STOSQ.
-tail:
-	// BSR+branch table make almost all memmove/memclr benchmarks worse. Not worth doing.
-	TESTQ	BX, BX
-	JEQ	_0
-	CMPQ	BX, $2
-	JBE	_1or2
-	CMPQ	BX, $4
-	JBE	_3or4
-	CMPQ	BX, $8
-	JB	_5through7
-	JE	_8
-	CMPQ	BX, $16
-	JBE	_9through16
-	CMPQ	BX, $32
-	JBE	_17through32
-	CMPQ	BX, $64
-	JBE	_33through64
-	CMPQ	BX, $128
-	JBE	_65through128
-	CMPQ	BX, $256
-	JBE	_129through256
-	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
-	JE loop_preheader_avx2
-	// TODO: for really big clears, use MOVNTDQ, even without AVX2.
-
-loop:
-	MOVOU	X15, 0(DI)
-	MOVOU	X15, 16(DI)
-	MOVOU	X15, 32(DI)
-	MOVOU	X15, 48(DI)
-	MOVOU	X15, 64(DI)
-	MOVOU	X15, 80(DI)
-	MOVOU	X15, 96(DI)
-	MOVOU	X15, 112(DI)
-	MOVOU	X15, 128(DI)
-	MOVOU	X15, 144(DI)
-	MOVOU	X15, 160(DI)
-	MOVOU	X15, 176(DI)
-	MOVOU	X15, 192(DI)
-	MOVOU	X15, 208(DI)
-	MOVOU	X15, 224(DI)
-	MOVOU	X15, 240(DI)
-	SUBQ	$256, BX
-	ADDQ	$256, DI
-	CMPQ	BX, $256
-	JAE	loop
-	JMP	tail
-
-loop_preheader_avx2:
-	VPXOR Y0, Y0, Y0
-	// For smaller sizes MOVNTDQ may be faster or slower depending on hardware.
-	// For larger sizes it is always faster, even on dual Xeons with 30M cache.
-	// TODO take into account actual LLC size. E. g. glibc uses LLC size/2.
-	CMPQ    BX, $0x2000000
-	JAE     loop_preheader_avx2_huge
-loop_avx2:
-	VMOVDQU	Y0, 0(DI)
-	VMOVDQU	Y0, 32(DI)
-	VMOVDQU	Y0, 64(DI)
-	VMOVDQU	Y0, 96(DI)
-	SUBQ	$128, BX
-	ADDQ	$128, DI
-	CMPQ	BX, $128
-	JAE	loop_avx2
-	VMOVDQU  Y0, -32(DI)(BX*1)
-	VMOVDQU  Y0, -64(DI)(BX*1)
-	VMOVDQU  Y0, -96(DI)(BX*1)
-	VMOVDQU  Y0, -128(DI)(BX*1)
-	VZEROUPPER
-	RET
-loop_preheader_avx2_huge:
-	// Align to 32 byte boundary
-	VMOVDQU  Y0, 0(DI)
-	MOVQ	DI, SI
-	ADDQ	$32, DI
-	ANDQ	$~31, DI
-	SUBQ	DI, SI
-	ADDQ	SI, BX
-loop_avx2_huge:
-	VMOVNTDQ	Y0, 0(DI)
-	VMOVNTDQ	Y0, 32(DI)
-	VMOVNTDQ	Y0, 64(DI)
-	VMOVNTDQ	Y0, 96(DI)
-	SUBQ	$128, BX
-	ADDQ	$128, DI
-	CMPQ	BX, $128
-	JAE	loop_avx2_huge
-	// In the description of MOVNTDQ in [1]
-	// "... fencing operation implemented with the SFENCE or MFENCE instruction
-	// should be used in conjunction with MOVNTDQ instructions..."
-	// [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
-	SFENCE
-	VMOVDQU  Y0, -32(DI)(BX*1)
-	VMOVDQU  Y0, -64(DI)(BX*1)
-	VMOVDQU  Y0, -96(DI)(BX*1)
-	VMOVDQU  Y0, -128(DI)(BX*1)
-	VZEROUPPER
-	RET
-
-_1or2:
-	MOVB	AX, (DI)
-	MOVB	AX, -1(DI)(BX*1)
-	RET
-_0:
-	RET
-_3or4:
-	MOVW	AX, (DI)
-	MOVW	AX, -2(DI)(BX*1)
-	RET
-_5through7:
-	MOVL	AX, (DI)
-	MOVL	AX, -4(DI)(BX*1)
-	RET
-_8:
-	// We need a separate case for 8 to make sure we clear pointers atomically.
-	MOVQ	AX, (DI)
-	RET
-_9through16:
-	MOVQ	AX, (DI)
-	MOVQ	AX, -8(DI)(BX*1)
-	RET
-_17through32:
-	MOVOU	X15, (DI)
-	MOVOU	X15, -16(DI)(BX*1)
-	RET
-_33through64:
-	MOVOU	X15, (DI)
-	MOVOU	X15, 16(DI)
-	MOVOU	X15, -32(DI)(BX*1)
-	MOVOU	X15, -16(DI)(BX*1)
-	RET
-_65through128:
-	MOVOU	X15, (DI)
-	MOVOU	X15, 16(DI)
-	MOVOU	X15, 32(DI)
-	MOVOU	X15, 48(DI)
-	MOVOU	X15, -64(DI)(BX*1)
-	MOVOU	X15, -48(DI)(BX*1)
-	MOVOU	X15, -32(DI)(BX*1)
-	MOVOU	X15, -16(DI)(BX*1)
-	RET
-_129through256:
-	MOVOU	X15, (DI)
-	MOVOU	X15, 16(DI)
-	MOVOU	X15, 32(DI)
-	MOVOU	X15, 48(DI)
-	MOVOU	X15, 64(DI)
-	MOVOU	X15, 80(DI)
-	MOVOU	X15, 96(DI)
-	MOVOU	X15, 112(DI)
-	MOVOU	X15, -128(DI)(BX*1)
-	MOVOU	X15, -112(DI)(BX*1)
-	MOVOU	X15, -96(DI)(BX*1)
-	MOVOU	X15, -80(DI)(BX*1)
-	MOVOU	X15, -64(DI)(BX*1)
-	MOVOU	X15, -48(DI)(BX*1)
-	MOVOU	X15, -32(DI)(BX*1)
-	MOVOU	X15, -16(DI)(BX*1)
+	// Clear n/8 quadwords and then the n%8 trailing bytes with
+	// REP STOS: no vector registers, no nontemporal stores.
+	MOVQ	BX, CX
+	SHRQ	$3, CX		// CX = n / 8, the REP STOSQ count
+	ANDQ	$7, BX		// BX = n % 8, kept for the byte tail
+	XORL	AX, AX		// AX = 0, the value STOS stores
+	CLD			// set DF=0 so STOS stores forwards
+	REP
+	STOSQ
+	MOVQ	BX, CX		// CX = n % 8, the REP STOSB count
+	REP
+	STOSB
 	RET
From 8949ee351c0e7e9b383604ca4fe85784388716f9 Mon Sep 17 00:00:00 2001
From: Taylor R Campbell
Date: Fri, 3 Dec 2021 13:12:06 +0000
Subject: [PATCH 2/2] runtime: Use REP MOVS for memmove, no vectors or nontemporal stores.

---
 src/runtime/memmove_amd64.s | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)

diff --git a/src/runtime/memmove_amd64.s b/src/runtime/memmove_amd64.s
index eeb5033fd9..0646b85565 100644
--- a/src/runtime/memmove_amd64.s
+++ b/src/runtime/memmove_amd64.s
@@ -40,6 +40,72 @@ TEXT runtime·memmove(SB), NOSPLIT, $0-24
 	MOVQ	BX, SI
 	MOVQ	CX, BX
 
+	// Copy using only REP MOVS: no vector registers and no
+	// nontemporal stores.  BX holds n throughout; CX is the REP
+	// counter and is reloaded from BX before each REP.
+	//
+	// A forward copy is correct unless dst lies inside the source,
+	// i.e. unless src < dst < src+n.  Only that case goes backward.
+	CMPQ	SI, DI
+	JAE	forward			// src >= dst
+	LEAQ	(SI)(BX*1), CX		// CX = src + n
+	CMPQ	CX, DI
+	JHI	backward		// src + n > dst: regions overlap
+
+forward:
+	CLD				// set DF=0 so MOVS copies forwards
+
+	// copy 8-byte units forwards
+	// for (CX = n/8; CX != 0; CX -= 1, SI += 8, DI += 8)
+	//	*(uint64_t *)DI = *(const uint64_t *)SI;
+	MOVQ	BX, CX
+	SHRQ	$3, CX
+	REP
+	MOVSQ
+
+	// copy any remaining 1-byte units forwards
+	// for (CX = n%8; CX != 0; CX -= 1, SI += 1, DI += 1)
+	//	*(uint8_t *)DI = *(const uint8_t *)SI;
+	MOVQ	BX, CX
+	ANDQ	$7, CX
+	REP
+	MOVSB
+
+	// all done
+	RET
+
+backward:
+	STD				// set DF=1 so MOVS copies backwards
+
+	// copy the top n%8 1-byte units backwards
+	// for (CX = n%8, SI += n - 1, DI += n - 1;
+	//      CX != 0;
+	//      CX -= 1, SI -= 1, DI -= 1)
+	//	*(uint8_t *)DI = *(const uint8_t *)SI;
+	LEAQ	-1(SI)(BX*1), SI	// SI = last source byte
+	LEAQ	-1(DI)(BX*1), DI	// DI = last destination byte
+	MOVQ	BX, CX
+	ANDQ	$7, CX
+	REP
+	MOVSB
+
+	// copy the remaining 8-byte units backwards
+	// for (CX = n/8, SI -= 7, DI -= 7;
+	//      CX != 0;
+	//      CX -= 1, SI -= 8, DI -= 8)
+	//	*(uint64_t *)DI = *(const uint64_t *)SI;
+	SUBQ	$7, SI			// step back to the start of the
+	SUBQ	$7, DI			// highest remaining 8-byte unit
+	MOVQ	BX, CX
+	SHRQ	$3, CX
+	REP
+	MOVSQ
+
+	// all done; restore DF=0 before returning
+	CLD
+	RET
+
+#ifdef notyet
 	// REP instructions have a high startup cost, so we handle small sizes
 	// with some straightline code. The REP MOVSQ instruction is really fast
 	// for large sizes. The cutover is approximately 2K.
@@ -530,3 +596,4 @@ gobble_big_mem_bwd_loop:
 	MOVOU	X11, 0x60(AX)
 	MOVOU	X12, 0x70(AX)
 	RET
+#endif