diff --git a/src/runtime/memclr_amd64.s b/src/runtime/memclr_amd64.s
index 700bbd7b9b..e3fd4dd0fd 100644
--- a/src/runtime/memclr_amd64.s
+++ b/src/runtime/memclr_amd64.s
@@ -17,6 +17,12 @@ TEXT runtime·memclrNoHeapPointers(SB), NOSPLIT, $0-16
 	MOVQ	AX, DI	// DI = ptr
 	XORQ	AX, AX
 
+#ifndef hack
+	CLD			// Clear DF so STOSB stores forward.
+	MOVQ	BX, CX		// CX = n, the REP count.
+	REP
+	STOSB			// Store AL (zero) to [DI], CX times.
+#else
 	// MOVOU seems always faster than REP STOSQ.
 tail:
 	// BSR+branch table make almost all memmove/memclr benchmarks worse. Not worth doing.
@@ -176,4 +182,5 @@ _129through256:
 	MOVOU	X15, -48(DI)(BX*1)
 	MOVOU	X15, -32(DI)(BX*1)
 	MOVOU	X15, -16(DI)(BX*1)
+#endif
 	RET
diff --git a/src/runtime/memmove_amd64.s b/src/runtime/memmove_amd64.s
index eeb5033fd9..05dd64412a 100644
--- a/src/runtime/memmove_amd64.s
+++ b/src/runtime/memmove_amd64.s
@@ -38,6 +38,23 @@ TEXT runtime·memmove(SB), NOSPLIT, $0-24
 	// CX = n
 	MOVQ	AX, DI
 	MOVQ	BX, SI
+#ifdef hack
+	CMPQ	SI, DI	// Copy right-to-left if src < dst, else left-to-right.
+	JL	0f
+	CLD		// Copy left-to-right.
+	JMP	1f
+0:
+	STD		// Copy right-to-left.
+	ADDQ	CX, DI	// Point to end of dst.
+	ADDQ	$-1, DI
+	ADDQ	CX, SI	// Point to end of src.
+	ADDQ	$-1, SI
+1:
+	REP
+	MOVSB
+
+	CLD		// Either way, leave it cleared.
+#else
 	MOVQ	CX, BX
 
 	// REP instructions have a high startup cost, so we handle small sizes
@@ -529,4 +546,5 @@ gobble_big_mem_bwd_loop:
 	MOVOU	X10, 0x50(AX)
 	MOVOU	X11, 0x60(AX)
 	MOVOU	X12, 0x70(AX)
+#endif
 	RET
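
The patch swaps the Go runtime's SIMD memclr/memmove bodies for the x86 string instructions REP STOSB and REP MOVSB, gated by a hack macro. Note that the memclr hunk guards its REP path with #ifndef hack while the memmove hunk uses #ifdef hack, so any single build takes the REP path in exactly one of the two functions. Below is a minimal benchmark sketch, not part of the patch, for comparing two builds of the patched tree: the built-in copy lowers to a runtime.memmove call, and the zeroing loop is a pattern the compiler recognizes and compiles to runtime.memclrNoHeapPointers. The package name, the size list, and the suggestion to define the macro (e.g. a #define hack at the top of each file, or the assembler's -D flag) are illustrative assumptions, not part of the diff.

// bench_test.go — an illustrative harness; run it once against a toolchain
// built with the hack macro defined and once without, then compare.
package hack_test

import (
	"fmt"
	"testing"
)

// Sizes span the straight-line small-size cases and the large REP path.
var sizes = []int{16, 256, 4 << 10, 64 << 10, 1 << 20}

func BenchmarkMemmove(b *testing.B) {
	for _, n := range sizes {
		src := make([]byte, n)
		dst := make([]byte, n)
		b.Run(fmt.Sprintf("%d", n), func(b *testing.B) {
			b.SetBytes(int64(n))
			for i := 0; i < b.N; i++ {
				copy(dst, src) // lowers to a runtime.memmove call
			}
		})
	}
}

func BenchmarkMemclr(b *testing.B) {
	for _, n := range sizes {
		buf := make([]byte, n)
		b.Run(fmt.Sprintf("%d", n), func(b *testing.B) {
			b.SetBytes(int64(n))
			for i := 0; i < b.N; i++ {
				// The compiler turns this loop into a single
				// runtime.memclrNoHeapPointers call.
				for j := range buf {
					buf[j] = 0
				}
			}
		})
	}
}

Running go test -bench=. -count=10 under both toolchains and diffing the results (benchstat is one option) would show where the REP paths win or lose. Given that REP instructions have a high startup cost, as the context in memmove_amd64.s notes, the small sizes are where the MOVOU paths are most likely to keep their edge.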