diff -r b9bd12fb9564 sys/crypto/aes/arch/arm/aes_neon_32.S --- a/sys/crypto/aes/arch/arm/aes_neon_32.S Sun Aug 02 18:20:51 2020 +0000 +++ b/sys/crypto/aes/arch/arm/aes_neon_32.S Wed Aug 05 14:02:05 2020 +0000 @@ -270,7 +270,7 @@ ENTRY(aes_neon_enc1) ldr r12, .Lconstants_addr adr r11, .Lconstants_addr - vld1.64 {d28-d29}, [r0 :128]! /* q14 = *rk++ */ + vld1.8 {d28-d29}, [r0 :128]! /* q14 = *rk++ */ movw r3, #0 vmov.i8 q1, #0x0f @@ -280,8 +280,8 @@ ENTRY(aes_neon_enc1) /* (q4, q5) := (iptlo, ipthi) */ add r6, r12, #(iptlo - .Lconstants) add r7, r12, #(ipthi - .Lconstants) - vld1.64 {d8-d9}, [r6 :128] - vld1.64 {d10-d11}, [r7 :128] + vld1.8 {d8-d9}, [r6 :128] + vld1.8 {d10-d11}, [r7 :128] /* load the rest of the constants */ add r4, r12, #(sb1_0 - .Lconstants) @@ -290,12 +290,12 @@ ENTRY(aes_neon_enc1) add r7, r12, #(sb2_1 - .Lconstants) add r8, r12, #(inv - .Lconstants) add r10, r12, #(inva - .Lconstants) - vld1.64 {d12-d13}, [r4 :128] /* q6 = sb1[0] */ - vld1.64 {d14-d15}, [r5 :128] /* q7 = sb1[1] */ - vld1.64 {d16-d17}, [r6 :128] /* q8 = sb2[0] */ - vld1.64 {d18-d19}, [r7 :128] /* q9 = sb2[1] */ - vld1.64 {d20-d21}, [r8 :128] /* q10 = inv */ - vld1.64 {d22-d23}, [r10 :128] /* q11 = inva */ + vld1.8 {d12-d13}, [r4 :128] /* q6 = sb1[0] */ + vld1.8 {d14-d15}, [r5 :128] /* q7 = sb1[1] */ + vld1.8 {d16-d17}, [r6 :128] /* q8 = sb2[0] */ + vld1.8 {d18-d19}, [r7 :128] /* q9 = sb2[1] */ + vld1.8 {d20-d21}, [r8 :128] /* q10 = inv */ + vld1.8 {d22-d23}, [r10 :128] /* q11 = inva */ /* (r4, r5) := (&mc_forward[0], &mc_backward[0]) */ add r4, r12, #(mc_forward - .Lconstants) @@ -319,7 +319,7 @@ ENTRY(aes_neon_enc1) b 2f _ALIGN_TEXT -1: vld1.64 {d28-d29}, [r0 :128]! /* q14 = *rk++ */ +1: vld1.8 {d28-d29}, [r0 :128]! /* q14 = *rk++ */ /* q0 := A = rk[i] + sb1_0(io) + sb1_1(jo) */ vtbl.8 d24, {d12-d13}, d4 @@ -339,8 +339,8 @@ 1: vld1.64 {d28-d29}, [r0 :128]! /* q14 /* (q12, q13) := (mc_forward[rmod4], mc_backward[rmod4]) */ add r6, r4, r3, lsl #4 add r7, r5, r3, lsl #4 - vld1.64 {d24-d25}, [r6] - vld1.64 {d26-d27}, [r7] + vld1.8 {d24-d25}, [r6] + vld1.8 {d26-d27}, [r7] /* q15 := A2_B = A2 + A(mcf) */ vtbl.8 d30, {d0-d1}, d24 @@ -412,11 +412,11 @@ 2: /* add r6, r12, #(sbo_0 - .Lconstants) add r7, r12, #(sbo_1 - .Lconstants) add r8, r8, r3, lsl #4 - vld1.64 {d12-d13}, [r6 :128] - vld1.64 {d14-d15}, [r7 :128] - vld1.64 {d30-d31}, [r8 :128] + vld1.8 {d12-d13}, [r6 :128] + vld1.8 {d14-d15}, [r7 :128] + vld1.8 {d30-d31}, [r8 :128] - vld1.64 {d28-d29}, [r0 :128]! /* q14 = *rk++ */ + vld1.8 {d28-d29}, [r0 :128]! /* q14 = *rk++ */ /* (q2, q3) := (sbo_0(io), sbo_1(jo)) */ vtbl.8 d4, {d12-d13}, d4 @@ -489,7 +489,7 @@ ENTRY(aes_neon_dec1) ldr r12, .Lconstants_addr adr r11, .Lconstants_addr - vld1.64 {d28-d29}, [r0 :128]! /* q14 = *rk++ */ + vld1.8 {d28-d29}, [r0 :128]! /* q14 = *rk++ */ rsb r3, r1, #0 /* r3 := ~(x - 1) = -x */ vmov.i8 q1, #0x0f and r3, r3, #3 /* r3 := 3 & ~(x - 1) */ @@ -500,8 +500,8 @@ ENTRY(aes_neon_dec1) /* (q4, q5) := (diptlo, dipthi) */ add r6, r12, #(diptlo - .Lconstants) add r7, r12, #(dipthi - .Lconstants) - vld1.64 {d8-d9}, [r6 :128] - vld1.64 {d10-d11}, [r7 :128] + vld1.8 {d8-d9}, [r6 :128] + vld1.8 {d10-d11}, [r7 :128] /* load the rest of the constants */ add r4, r12, #(dsbb_0 - .Lconstants) @@ -509,11 +509,11 @@ ENTRY(aes_neon_dec1) add r6, r12, #(inv - .Lconstants) add r7, r12, #(inva - .Lconstants) add r8, r12, #(.Lmc_forward_3 - .Lconstants) - vld1.64 {d12-d13}, [r4 :128] /* q6 := dsbb[0] */ - vld1.64 {d14-d15}, [r5 :128] /* q7 := dsbb[1] */ - vld1.64 {d20-d21}, [r6 :128] /* q10 := inv */ - vld1.64 {d22-d23}, [r7 :128] /* q11 := inva */ - vld1.64 {d30-d31}, [r8 :128] /* q15 := mc_forward[3] */ + vld1.8 {d12-d13}, [r4 :128] /* q6 := dsbb[0] */ + vld1.8 {d14-d15}, [r5 :128] /* q7 := dsbb[1] */ + vld1.8 {d20-d21}, [r6 :128] /* q10 := inv */ + vld1.8 {d22-d23}, [r7 :128] /* q11 := inva */ + vld1.8 {d30-d31}, [r8 :128] /* q15 := mc_forward[3] */ /* (q2, q3) := (lo, hi) */ vshr.u8 q3, q0, #4 @@ -529,8 +529,8 @@ ENTRY(aes_neon_dec1) /* load dsb9 */ add r4, r12, #(dsb9_0 - .Lconstants) add r5, r12, #(dsb9_1 - .Lconstants) - vld1.64 {d8-d9}, [r4 :128] /* q4 := dsb9[0] */ - vld1.64 {d10-d11}, [r5 :128] /* q5 := dsb9[1] */ + vld1.8 {d8-d9}, [r4 :128] /* q4 := dsb9[0] */ + vld1.8 {d10-d11}, [r5 :128] /* q5 := dsb9[1] */ /* q0 := rk[0] + diptlo(lo) + dipthi(hi) */ veor q0, q14, q2 @@ -541,10 +541,10 @@ ENTRY(aes_neon_dec1) _ALIGN_TEXT 1: /* load dsbd */ add r4, r12, #(dsbd_0 - .Lconstants) - vld1.64 {d16-d17}, [r4 :128]! /* q8 := dsbd[0] */ - vld1.64 {d18-d19}, [r4 :128] /* q9 := dsbd[1] */ + vld1.8 {d16-d17}, [r4 :128]! /* q8 := dsbd[0] */ + vld1.8 {d18-d19}, [r4 :128] /* q9 := dsbd[1] */ - vld1.64 {d28-d29}, [r0 :128]! /* q14 = *rk++ */ + vld1.8 {d28-d29}, [r0 :128]! /* q14 = *rk++ */ /* q0 := rk[i] + dsb9_0(io) + dsb9_1(jo) */ vtbl.8 d24, {d8-d9}, d4 @@ -568,8 +568,8 @@ 1: /* load dsbd */ /* load dsbe */ add r4, r12, #(dsbe_0 - .Lconstants) - vld1.64 {d16-d17}, [r4 :128]! /* q8 := dsbe[0] */ - vld1.64 {d18-d19}, [r4 :128] /* q9 := dsbe[1] */ + vld1.8 {d16-d17}, [r4 :128]! /* q8 := dsbe[0] */ + vld1.8 {d18-d19}, [r4 :128] /* q9 := dsbe[1] */ /* q0 := x(mc) + dsbb_0(io) + dsbb_1(jo) */ vtbl.8 d28, {d0-d1}, d30 @@ -647,11 +647,11 @@ 2: /* add r6, r12, #(dsbo_0 - .Lconstants) add r7, r12, #(dsbo_1 - .Lconstants) add r8, r8, r3, lsl #4 - vld1.64 {d12-d13}, [r6 :128] - vld1.64 {d14-d15}, [r7 :128] - vld1.64 {d30-d31}, [r8 :128] + vld1.8 {d12-d13}, [r6 :128] + vld1.8 {d14-d15}, [r7 :128] + vld1.8 {d30-d31}, [r8 :128] - vld1.64 {d28-d29}, [r0 :128]! /* q14 = *rk++ */ + vld1.8 {d28-d29}, [r0 :128]! /* q14 = *rk++ */ /* (q2, q3) := (dsbo_0(io), dsbo_1(jo)) */ vtbl.8 d4, {d12-d13}, d4 diff -r b9bd12fb9564 sys/crypto/aes/arch/arm/aes_neon_subr.c --- a/sys/crypto/aes/arch/arm/aes_neon_subr.c Sun Aug 02 18:20:51 2020 +0000 +++ b/sys/crypto/aes/arch/arm/aes_neon_subr.c Wed Aug 05 14:02:05 2020 +0000 @@ -57,6 +57,20 @@ storeblock(void *out, uint8x16_t block) vst1q_u8(out, block); } +#if _BYTE_ORDER == _LITTLE_ENDIAN +#define vbetoh32q_u8 vrev32q_u8 +#define vhtobe32q_u8 vrev32q_u8 +#define vletoh32q_u8(x) (x) +#define vhtole32q_u8(x) (x) +#elif _BYTE_ORDER == _BIG_ENDIAN +#define vbetoh32q_u8(x) (x) +#define vhtobe32q_u8(x) (x) +#define vletoh32q_u8 vrev32q_u8 +#define vhtole32q_u8 vrev32q_u8 +#else +#error what kind of endian are you anyway +#endif + void aes_neon_enc(const struct aesenc *enc, const uint8_t in[static 16], uint8_t out[static 16], uint32_t nrounds) @@ -148,45 +162,48 @@ aes_neon_xts_update(uint8x16_t t8) int32x4_t t, t_; uint32x4_t mask; - t = vreinterpretq_s32_u8(t8); + t = vreinterpretq_s32_u8(vletoh32q_u8(t8)); mask = vcltq_s32(t, zero); /* -1 if high bit set else 0 */ mask = vextq_u32(mask, mask, 3); /* rotate quarters */ t_ = vsliq_n_s32(zero, t, 1); /* shift */ t_ ^= carry & mask; - return vreinterpretq_u8_s32(t_); + return vhtole32q_u8(vreinterpretq_u8_s32(t_)); } static int aes_neon_xts_update_selftest(void) { static const struct { - uint32_t in[4], out[4]; + uint8_t in[16], out[16]; } cases[] = { [0] = { {1}, {2} }, - [1] = { {0x80000000U,0,0,0}, {0,1,0,0} }, - [2] = { {0,0x80000000U,0,0}, {0,0,1,0} }, - [3] = { {0,0,0x80000000U,0}, {0,0,0,1} }, - [4] = { {0,0,0,0x80000000U}, {0x87,0,0,0} }, - [5] = { {0,0x80000000U,0,0x80000000U}, {0x87,0,1,0} }, + [1] = { {0,0,0,0x80, 0,0,0,0, 0,0,0,0, 0,0,0,0}, + {0,0,0,0, 1,0,0,0, 0,0,0,0, 0,0,0,0} }, + [2] = { {0,0,0,0, 0,0,0,0x80, 0,0,0,0, 0,0,0,0}, + {0,0,0,0, 0,0,0,0, 1,0,0,0, 0,0,0,0} }, + [3] = { {0,0,0,0, 0,0,0,0, 0,0,0,0x80, 0,0,0,0}, + {0,0,0,0, 0,0,0,0, 0,0,0,0, 1,0,0,0} }, + [4] = { {0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80}, + {0x87,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0} }, + [5] = { {0,0,0,0, 0,0,0,0x80, 0,0,0,0, 0,0,0,0x80}, + {0x87,0,0,0, 0,0,0,0, 1,0,0,0, 0,0,0,0} }, }; unsigned i; - uint32_t t[4]; + uint8_t t[16]; int result = 0; for (i = 0; i < sizeof(cases)/sizeof(cases[0]); i++) { - t[0] = cases[i].in[0]; - t[1] = cases[i].in[1]; - t[2] = cases[i].in[2]; - t[3] = cases[i].in[3]; - storeblock(t, aes_neon_xts_update(loadblock(t))); - if (t[0] != cases[i].out[0] || - t[1] != cases[i].out[1] || - t[2] != cases[i].out[2] || - t[3] != cases[i].out[3]) { - printf("%s %u:" - " %"PRIx32" %"PRIx32" %"PRIx32" %"PRIx32"\n", - __func__, i, t[0], t[1], t[2], t[3]); + storeblock(t, aes_neon_xts_update(loadblock(cases[i].in))); + if (memcmp(t, cases[i].out, 16)) { + char buf[33]; + unsigned j; + + for (j = 0; j < 16; j++) { + snprintf(buf + 2*j, sizeof(buf) - 2*j, + "%02hhx", t[j]); + } + printf("%s %u: %s\n", __func__, i, buf); result = -1; } } @@ -289,16 +306,6 @@ aes_neon_cbcmac_update1(const struct aes * function, which should substantially improve CCM throughput. */ -#if _BYTE_ORDER == _LITTLE_ENDIAN -#define vbetoh32q_u8 vrev32q_u8 -#define vhtobe32q_u8 vrev32q_u8 -#elif _BYTE_ORDER == _BIG_ENDIAN -#define vbetoh32q_u8(x) (x) -#define vhtobe32q_u8(x) (x) -#else -#error what kind of endian are you anyway -#endif - void aes_neon_ccm_enc1(const struct aesenc *enc, const uint8_t in[static 16], uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32], diff -r b9bd12fb9564 sys/crypto/chacha/arch/arm/chacha_neon.c --- a/sys/crypto/chacha/arch/arm/chacha_neon.c Sun Aug 02 18:20:51 2020 +0000 +++ b/sys/crypto/chacha/arch/arm/chacha_neon.c Wed Aug 05 14:02:05 2020 +0000 @@ -209,8 +209,8 @@ hchacha_neon(uint8_t out[restrict static chacha_permute(&r0, &r1, &r2, &r3, nr); - vst1q_u32((uint32_t *)out + 0, r0); - vst1q_u32((uint32_t *)out + 4, r3); + vst1q_u32((uint32_t *)out + 0, vhtole_u32(r0)); + vst1q_u32((uint32_t *)out + 4, vhtole_u32(r3)); } void diff -r b9bd12fb9564 sys/crypto/chacha/arch/arm/chacha_neon_32.S --- a/sys/crypto/chacha/arch/arm/chacha_neon_32.S Sun Aug 02 18:20:51 2020 +0000 +++ b/sys/crypto/chacha/arch/arm/chacha_neon_32.S Wed Aug 05 14:02:05 2020 +0000 @@ -96,7 +96,7 @@ RCSID("$NetBSD: chacha_neon_32.S,v 1.2 2 vsri.u32 \b2, \c0, #(32 - 12) vsri.u32 \b3, \c1, #(32 - 12) - vld1.8 {\c0l}, [r7, :64] /* load rot8 table */ + vld1.32 {\c0l}, [r7, :64] /* load rot8 table */ /* a += b; d ^= a; d <<<= 8 */ vadd.u32 \a0, \a0, \b0 @@ -205,22 +205,22 @@ ENTRY(chacha_stream256_neon) vdup.32 q14, r8 vdup.32 q15, r10 - HTOLE32(q0) - HTOLE32(q1) - HTOLE32(q2) - HTOLE32(q3) - HTOLE32(q4) - HTOLE32(q5) - HTOLE32(q6) - HTOLE32(q7) - HTOLE32(q8) - HTOLE32(q9) - HTOLE32(q10) - HTOLE32(q11) - HTOLE32(q12) - HTOLE32(q13) - HTOLE32(q14) - HTOLE32(q15) + LE32TOH(q0) + LE32TOH(q1) + LE32TOH(q2) + LE32TOH(q3) + LE32TOH(q4) + LE32TOH(q5) + LE32TOH(q6) + LE32TOH(q7) + LE32TOH(q8) + LE32TOH(q9) + LE32TOH(q10) + LE32TOH(q11) + /* LE32TOH(q12) -- blkno, already host order */ + LE32TOH(q13) + LE32TOH(q14) + LE32TOH(q15) b 2f @@ -320,6 +320,9 @@ 2: subs r5, r5, #2 vswp q1, q4 vswp q3, q6 + LE32TOH(q9) + LE32TOH(q8) + vadd.u32 q0, q0, q9 vadd.u32 q4, q4, q9 vadd.u32 q2, q2, q9 @@ -332,14 +335,14 @@ 2: subs r5, r5, #2 vld1.32 {q8-q9}, [fp, :256] /* restore q8-q9 */ - LE32TOH(q0) - LE32TOH(q1) - LE32TOH(q2) - LE32TOH(q3) - LE32TOH(q4) - LE32TOH(q5) - LE32TOH(q6) - LE32TOH(q7) + HTOLE32(q0) + HTOLE32(q1) + HTOLE32(q2) + HTOLE32(q3) + HTOLE32(q4) + HTOLE32(q5) + HTOLE32(q6) + HTOLE32(q7) vst1.32 {q0-q1}, [r0]! vld1.32 {q0}, [r3] /* q0 := key[16:32) */ @@ -360,6 +363,9 @@ 2: subs r5, r5, #2 vswp q9, q12 vswp q11, q14 + LE32TOH(q0) + LE32TOH(q1) + vadd.u32 q8, q8, q0 vadd.u32 q12, q12, q0 vadd.u32 q10, q10, q0 @@ -370,14 +376,14 @@ 2: subs r5, r5, #2 vadd.u32 q11, q11, q1 vadd.u32 q15, q15, q1 - LE32TOH(q8) - LE32TOH(q9) - LE32TOH(q10) - LE32TOH(q11) - LE32TOH(q12) - LE32TOH(q13) - LE32TOH(q14) - LE32TOH(q15) + HTOLE32(q8) + HTOLE32(q9) + HTOLE32(q10) + HTOLE32(q11) + HTOLE32(q12) + HTOLE32(q13) + HTOLE32(q14) + HTOLE32(q15) /* vst1.32 {q0-q1}, [r0]! */ vst1.32 {q8-q9}, [r0]! @@ -448,22 +454,22 @@ ENTRY(chacha_stream_xor256_neon) vdup.32 q14, r8 vdup.32 q15, r10 - HTOLE32(q0) - HTOLE32(q1) - HTOLE32(q2) - HTOLE32(q3) - HTOLE32(q4) - HTOLE32(q5) - HTOLE32(q6) - HTOLE32(q7) - HTOLE32(q8) - HTOLE32(q9) - HTOLE32(q10) - HTOLE32(q11) - HTOLE32(q12) - HTOLE32(q13) - HTOLE32(q14) - HTOLE32(q15) + LE32TOH(q0) + LE32TOH(q1) + LE32TOH(q2) + LE32TOH(q3) + LE32TOH(q4) + LE32TOH(q5) + LE32TOH(q6) + LE32TOH(q7) + LE32TOH(q8) + LE32TOH(q9) + LE32TOH(q10) + LE32TOH(q11) + /* LE32TOH(q12) -- already host order, block number */ + LE32TOH(q13) + LE32TOH(q14) + LE32TOH(q15) b 2f @@ -508,6 +514,9 @@ 2: subs ip, ip, #2 vswp q1, q4 vswp q3, q6 + LE32TOH(q9) + LE32TOH(q8) + vadd.u32 q0, q0, q9 vadd.u32 q4, q4, q9 vadd.u32 q2, q2, q9 @@ -520,14 +529,17 @@ 2: subs ip, ip, #2 vld1.32 {q8-q9}, [r1]! /* load plaintext bytes [0:32) */ - LE32TOH(q0) - LE32TOH(q1) - LE32TOH(q2) - LE32TOH(q6) - LE32TOH(q4) - LE32TOH(q5) - LE32TOH(q3) - LE32TOH(q7) + HTOLE32(q0) + HTOLE32(q1) + HTOLE32(q2) + HTOLE32(q6) + HTOLE32(q4) + HTOLE32(q5) + HTOLE32(q3) + HTOLE32(q7) + + LE32TOH(q8) + LE32TOH(q9) veor q0, q0, q8 /* compute ciphertext bytes [0:32) */ veor q1, q1, q9 @@ -553,6 +565,9 @@ 2: subs ip, ip, #2 vswp q9, q12 /* free up q9 earlier for consecutive q8-q9 */ vswp q11, q14 + LE32TOH(q0) + LE32TOH(q1) + vadd.u32 q8, q8, q0 vadd.u32 q12, q12, q0 vadd.u32 q10, q10, q0 @@ -565,14 +580,14 @@ 2: subs ip, ip, #2 vld1.32 {q0-q1}, [r1]! /* load plaintext bytes [32:64) */ - LE32TOH(q8) - LE32TOH(q9) - LE32TOH(q10) - LE32TOH(q11) - LE32TOH(q12) - LE32TOH(q13) - LE32TOH(q14) - LE32TOH(q15) + HTOLE32(q8) + HTOLE32(q9) + HTOLE32(q10) + HTOLE32(q11) + HTOLE32(q12) + HTOLE32(q13) + HTOLE32(q14) + HTOLE32(q15) veor q0, q0, q8 /* compute ciphertext bytes [32:64) */ veor q1, q1, q9 diff -r b9bd12fb9564 sys/crypto/chacha/arch/arm/chacha_neon_64.S --- a/sys/crypto/chacha/arch/arm/chacha_neon_64.S Sun Aug 02 18:20:51 2020 +0000 +++ b/sys/crypto/chacha/arch/arm/chacha_neon_64.S Wed Aug 05 14:02:05 2020 +0000 @@ -169,22 +169,22 @@ ENTRY(chacha_stream256_neon) ld3r {v13.4s-v15.4s}, [x13] /* (v13,v14,v15) := nonce */ add v12.4s, v12.4s, v26.4s /* v12 := blkno + (0,1,2,3) */ - HTOLE32(v0.16b) - HTOLE32(v1.16b) - HTOLE32(v2.16b) - HTOLE32(v3.16b) - HTOLE32(v4.16b) - HTOLE32(v5.16b) - HTOLE32(v6.16b) - HTOLE32(v7.16b) - HTOLE32(v8.16b) - HTOLE32(v9.16b) - HTOLE32(v10.16b) - HTOLE32(v11.16b) - HTOLE32(v12.16b) - HTOLE32(v13.16b) - HTOLE32(v14.16b) - HTOLE32(v15.16b) + LE32TOH(v0.16b) + LE32TOH(v1.16b) + LE32TOH(v2.16b) + LE32TOH(v3.16b) + LE32TOH(v4.16b) + LE32TOH(v5.16b) + LE32TOH(v6.16b) + LE32TOH(v7.16b) + LE32TOH(v8.16b) + LE32TOH(v9.16b) + LE32TOH(v10.16b) + LE32TOH(v11.16b) + /* LE32TOH(v12.16b) -- blkno, already host order */ + LE32TOH(v13.16b) + LE32TOH(v14.16b) + LE32TOH(v15.16b) mov v16.16b, v0.16b mov v17.16b, v1.16b @@ -234,22 +234,22 @@ 1: subs w5, w5, #2 add v14.4s, v14.4s, v30.4s add v15.4s, v15.4s, v31.4s - LE32TOH(v0.16b) - LE32TOH(v1.16b) - LE32TOH(v2.16b) - LE32TOH(v3.16b) - LE32TOH(v4.16b) - LE32TOH(v5.16b) - LE32TOH(v6.16b) - LE32TOH(v7.16b) - LE32TOH(v8.16b) - LE32TOH(v9.16b) - LE32TOH(v10.16b) - LE32TOH(v11.16b) - LE32TOH(v12.16b) - LE32TOH(v13.16b) - LE32TOH(v14.16b) - LE32TOH(v15.16b) + HTOLE32(v0.16b) + HTOLE32(v1.16b) + HTOLE32(v2.16b) + HTOLE32(v3.16b) + HTOLE32(v4.16b) + HTOLE32(v5.16b) + HTOLE32(v6.16b) + HTOLE32(v7.16b) + HTOLE32(v8.16b) + HTOLE32(v9.16b) + HTOLE32(v10.16b) + HTOLE32(v11.16b) + HTOLE32(v12.16b) + HTOLE32(v13.16b) + HTOLE32(v14.16b) + HTOLE32(v15.16b) st4 { v0.s, v1.s, v2.s, v3.s}[0], [x0], #16 st4 { v4.s, v5.s, v6.s, v7.s}[0], [x0], #16 @@ -308,22 +308,22 @@ ENTRY(chacha_stream_xor256_neon) ld3r {v13.4s-v15.4s}, [x13] /* (v13,v14,v15) := nonce */ add v12.4s, v12.4s, v26.4s /* v12 := blkno + (0,1,2,3) */ - HTOLE32(v0.16b) - HTOLE32(v1.16b) - HTOLE32(v2.16b) - HTOLE32(v3.16b) - HTOLE32(v4.16b) - HTOLE32(v5.16b) - HTOLE32(v6.16b) - HTOLE32(v7.16b) - HTOLE32(v8.16b) - HTOLE32(v9.16b) - HTOLE32(v10.16b) - HTOLE32(v11.16b) - HTOLE32(v12.16b) - HTOLE32(v13.16b) - HTOLE32(v14.16b) - HTOLE32(v15.16b) + LE32TOH(v0.16b) + LE32TOH(v1.16b) + LE32TOH(v2.16b) + LE32TOH(v3.16b) + LE32TOH(v4.16b) + LE32TOH(v5.16b) + LE32TOH(v6.16b) + LE32TOH(v7.16b) + LE32TOH(v8.16b) + LE32TOH(v9.16b) + LE32TOH(v10.16b) + LE32TOH(v11.16b) + /* LE32TOH(v12.16b) -- blkno, already host order */ + LE32TOH(v13.16b) + LE32TOH(v14.16b) + LE32TOH(v15.16b) mov v16.16b, v0.16b mov v17.16b, v1.16b @@ -401,22 +401,22 @@ 1: subs w6, w6, #2 ld4 {v24.s,v25.s,v26.s,v27.s}[3], [x1], #16 ld4 {v28.s,v29.s,v30.s,v31.s}[3], [x1], #16 - LE32TOH(v0.16b) - LE32TOH(v1.16b) - LE32TOH(v2.16b) - LE32TOH(v3.16b) - LE32TOH(v4.16b) - LE32TOH(v5.16b) - LE32TOH(v6.16b) - LE32TOH(v7.16b) - LE32TOH(v8.16b) - LE32TOH(v9.16b) - LE32TOH(v10.16b) - LE32TOH(v11.16b) - LE32TOH(v12.16b) - LE32TOH(v13.16b) - LE32TOH(v14.16b) - LE32TOH(v15.16b) + HTOLE32(v0.16b) + HTOLE32(v1.16b) + HTOLE32(v2.16b) + HTOLE32(v3.16b) + HTOLE32(v4.16b) + HTOLE32(v5.16b) + HTOLE32(v6.16b) + HTOLE32(v7.16b) + HTOLE32(v8.16b) + HTOLE32(v9.16b) + HTOLE32(v10.16b) + HTOLE32(v11.16b) + HTOLE32(v12.16b) + HTOLE32(v13.16b) + HTOLE32(v14.16b) + HTOLE32(v15.16b) eor v16.16b, v16.16b, v0.16b eor v17.16b, v17.16b, v1.16b