diff -r b9bd12fb9564 sys/crypto/chacha/arch/arm/chacha_neon_32.S --- a/sys/crypto/chacha/arch/arm/chacha_neon_32.S Sun Aug 02 18:20:51 2020 +0000 +++ b/sys/crypto/chacha/arch/arm/chacha_neon_32.S Tue Aug 04 14:35:16 2020 +0000 @@ -96,7 +96,7 @@ RCSID("$NetBSD: chacha_neon_32.S,v 1.2 2 vsri.u32 \b2, \c0, #(32 - 12) vsri.u32 \b3, \c1, #(32 - 12) - vld1.8 {\c0l}, [r7, :64] /* load rot8 table */ + vld1.32 {\c0l}, [r7, :64] /* load rot8 table */ /* a += b; d ^= a; d <<<= 8 */ vadd.u32 \a0, \a0, \b0 @@ -205,22 +205,22 @@ ENTRY(chacha_stream256_neon) vdup.32 q14, r8 vdup.32 q15, r10 - HTOLE32(q0) - HTOLE32(q1) - HTOLE32(q2) - HTOLE32(q3) - HTOLE32(q4) - HTOLE32(q5) - HTOLE32(q6) - HTOLE32(q7) - HTOLE32(q8) - HTOLE32(q9) - HTOLE32(q10) - HTOLE32(q11) - HTOLE32(q12) - HTOLE32(q13) - HTOLE32(q14) - HTOLE32(q15) + LE32TOH(q0) + LE32TOH(q1) + LE32TOH(q2) + LE32TOH(q3) + LE32TOH(q4) + LE32TOH(q5) + LE32TOH(q6) + LE32TOH(q7) + LE32TOH(q8) + LE32TOH(q9) + LE32TOH(q10) + LE32TOH(q11) + /* LE32TOH(q12) -- blkno, already host order */ + LE32TOH(q13) + LE32TOH(q14) + LE32TOH(q15) b 2f @@ -332,14 +332,14 @@ 2: subs r5, r5, #2 vld1.32 {q8-q9}, [fp, :256] /* restore q8-q9 */ - LE32TOH(q0) - LE32TOH(q1) - LE32TOH(q2) - LE32TOH(q3) - LE32TOH(q4) - LE32TOH(q5) - LE32TOH(q6) - LE32TOH(q7) + HTOLE32(q0) + HTOLE32(q1) + HTOLE32(q2) + HTOLE32(q3) + HTOLE32(q4) + HTOLE32(q5) + HTOLE32(q6) + HTOLE32(q7) vst1.32 {q0-q1}, [r0]! vld1.32 {q0}, [r3] /* q0 := key[16:32) */ @@ -370,14 +370,14 @@ 2: subs r5, r5, #2 vadd.u32 q11, q11, q1 vadd.u32 q15, q15, q1 - LE32TOH(q8) - LE32TOH(q9) - LE32TOH(q10) - LE32TOH(q11) - LE32TOH(q12) - LE32TOH(q13) - LE32TOH(q14) - LE32TOH(q15) + HTOLE32(q8) + HTOLE32(q9) + HTOLE32(q10) + HTOLE32(q11) + HTOLE32(q12) + HTOLE32(q13) + HTOLE32(q14) + HTOLE32(q15) /* vst1.32 {q0-q1}, [r0]! */ vst1.32 {q8-q9}, [r0]! @@ -448,22 +448,22 @@ ENTRY(chacha_stream_xor256_neon) vdup.32 q14, r8 vdup.32 q15, r10 - HTOLE32(q0) - HTOLE32(q1) - HTOLE32(q2) - HTOLE32(q3) - HTOLE32(q4) - HTOLE32(q5) - HTOLE32(q6) - HTOLE32(q7) - HTOLE32(q8) - HTOLE32(q9) - HTOLE32(q10) - HTOLE32(q11) - HTOLE32(q12) - HTOLE32(q13) - HTOLE32(q14) - HTOLE32(q15) + LE32TOH(q0) + LE32TOH(q1) + LE32TOH(q2) + LE32TOH(q3) + LE32TOH(q4) + LE32TOH(q5) + LE32TOH(q6) + LE32TOH(q7) + LE32TOH(q8) + LE32TOH(q9) + LE32TOH(q10) + LE32TOH(q11) + /* LE32TOH(q12) -- already host order, block number */ + LE32TOH(q13) + LE32TOH(q14) + LE32TOH(q15) b 2f @@ -520,14 +520,14 @@ 2: subs ip, ip, #2 vld1.32 {q8-q9}, [r1]! /* load plaintext bytes [0:32) */ - LE32TOH(q0) - LE32TOH(q1) - LE32TOH(q2) - LE32TOH(q6) - LE32TOH(q4) - LE32TOH(q5) - LE32TOH(q3) - LE32TOH(q7) + HTOLE32(q0) + HTOLE32(q1) + HTOLE32(q2) + HTOLE32(q6) + HTOLE32(q4) + HTOLE32(q5) + HTOLE32(q3) + HTOLE32(q7) veor q0, q0, q8 /* compute ciphertext bytes [0:32) */ veor q1, q1, q9 @@ -565,14 +565,14 @@ 2: subs ip, ip, #2 vld1.32 {q0-q1}, [r1]! /* load plaintext bytes [32:64) */ - LE32TOH(q8) - LE32TOH(q9) - LE32TOH(q10) - LE32TOH(q11) - LE32TOH(q12) - LE32TOH(q13) - LE32TOH(q14) - LE32TOH(q15) + HTOLE32(q8) + HTOLE32(q9) + HTOLE32(q10) + HTOLE32(q11) + HTOLE32(q12) + HTOLE32(q13) + HTOLE32(q14) + HTOLE32(q15) veor q0, q0, q8 /* compute ciphertext bytes [32:64) */ veor q1, q1, q9 diff -r b9bd12fb9564 sys/crypto/chacha/arch/arm/chacha_neon_64.S --- a/sys/crypto/chacha/arch/arm/chacha_neon_64.S Sun Aug 02 18:20:51 2020 +0000 +++ b/sys/crypto/chacha/arch/arm/chacha_neon_64.S Tue Aug 04 14:35:16 2020 +0000 @@ -169,22 +169,22 @@ ENTRY(chacha_stream256_neon) ld3r {v13.4s-v15.4s}, [x13] /* (v13,v14,v15) := nonce */ add v12.4s, v12.4s, v26.4s /* v12 := blkno + (0,1,2,3) */ - HTOLE32(v0.16b) - HTOLE32(v1.16b) - HTOLE32(v2.16b) - HTOLE32(v3.16b) - HTOLE32(v4.16b) - HTOLE32(v5.16b) - HTOLE32(v6.16b) - HTOLE32(v7.16b) - HTOLE32(v8.16b) - HTOLE32(v9.16b) - HTOLE32(v10.16b) - HTOLE32(v11.16b) - HTOLE32(v12.16b) - HTOLE32(v13.16b) - HTOLE32(v14.16b) - HTOLE32(v15.16b) + LE32TOH(v0.16b) + LE32TOH(v1.16b) + LE32TOH(v2.16b) + LE32TOH(v3.16b) + LE32TOH(v4.16b) + LE32TOH(v5.16b) + LE32TOH(v6.16b) + LE32TOH(v7.16b) + LE32TOH(v8.16b) + LE32TOH(v9.16b) + LE32TOH(v10.16b) + LE32TOH(v11.16b) + /* LE32TOH(v12.16b) -- blkno, already host order */ + LE32TOH(v13.16b) + LE32TOH(v14.16b) + LE32TOH(v15.16b) mov v16.16b, v0.16b mov v17.16b, v1.16b @@ -234,22 +234,22 @@ 1: subs w5, w5, #2 add v14.4s, v14.4s, v30.4s add v15.4s, v15.4s, v31.4s - LE32TOH(v0.16b) - LE32TOH(v1.16b) - LE32TOH(v2.16b) - LE32TOH(v3.16b) - LE32TOH(v4.16b) - LE32TOH(v5.16b) - LE32TOH(v6.16b) - LE32TOH(v7.16b) - LE32TOH(v8.16b) - LE32TOH(v9.16b) - LE32TOH(v10.16b) - LE32TOH(v11.16b) - LE32TOH(v12.16b) - LE32TOH(v13.16b) - LE32TOH(v14.16b) - LE32TOH(v15.16b) + HTOLE32(v0.16b) + HTOLE32(v1.16b) + HTOLE32(v2.16b) + HTOLE32(v3.16b) + HTOLE32(v4.16b) + HTOLE32(v5.16b) + HTOLE32(v6.16b) + HTOLE32(v7.16b) + HTOLE32(v8.16b) + HTOLE32(v9.16b) + HTOLE32(v10.16b) + HTOLE32(v11.16b) + HTOLE32(v12.16b) + HTOLE32(v13.16b) + HTOLE32(v14.16b) + HTOLE32(v15.16b) st4 { v0.s, v1.s, v2.s, v3.s}[0], [x0], #16 st4 { v4.s, v5.s, v6.s, v7.s}[0], [x0], #16 @@ -308,22 +308,22 @@ ENTRY(chacha_stream_xor256_neon) ld3r {v13.4s-v15.4s}, [x13] /* (v13,v14,v15) := nonce */ add v12.4s, v12.4s, v26.4s /* v12 := blkno + (0,1,2,3) */ - HTOLE32(v0.16b) - HTOLE32(v1.16b) - HTOLE32(v2.16b) - HTOLE32(v3.16b) - HTOLE32(v4.16b) - HTOLE32(v5.16b) - HTOLE32(v6.16b) - HTOLE32(v7.16b) - HTOLE32(v8.16b) - HTOLE32(v9.16b) - HTOLE32(v10.16b) - HTOLE32(v11.16b) - HTOLE32(v12.16b) - HTOLE32(v13.16b) - HTOLE32(v14.16b) - HTOLE32(v15.16b) + LE32TOH(v0.16b) + LE32TOH(v1.16b) + LE32TOH(v2.16b) + LE32TOH(v3.16b) + LE32TOH(v4.16b) + LE32TOH(v5.16b) + LE32TOH(v6.16b) + LE32TOH(v7.16b) + LE32TOH(v8.16b) + LE32TOH(v9.16b) + LE32TOH(v10.16b) + LE32TOH(v11.16b) + /* LE32TOH(v12.16b) -- blkno, already host order */ + LE32TOH(v13.16b) + LE32TOH(v14.16b) + LE32TOH(v15.16b) mov v16.16b, v0.16b mov v17.16b, v1.16b @@ -401,22 +401,22 @@ 1: subs w6, w6, #2 ld4 {v24.s,v25.s,v26.s,v27.s}[3], [x1], #16 ld4 {v28.s,v29.s,v30.s,v31.s}[3], [x1], #16 - LE32TOH(v0.16b) - LE32TOH(v1.16b) - LE32TOH(v2.16b) - LE32TOH(v3.16b) - LE32TOH(v4.16b) - LE32TOH(v5.16b) - LE32TOH(v6.16b) - LE32TOH(v7.16b) - LE32TOH(v8.16b) - LE32TOH(v9.16b) - LE32TOH(v10.16b) - LE32TOH(v11.16b) - LE32TOH(v12.16b) - LE32TOH(v13.16b) - LE32TOH(v14.16b) - LE32TOH(v15.16b) + HTOLE32(v0.16b) + HTOLE32(v1.16b) + HTOLE32(v2.16b) + HTOLE32(v3.16b) + HTOLE32(v4.16b) + HTOLE32(v5.16b) + HTOLE32(v6.16b) + HTOLE32(v7.16b) + HTOLE32(v8.16b) + HTOLE32(v9.16b) + HTOLE32(v10.16b) + HTOLE32(v11.16b) + HTOLE32(v12.16b) + HTOLE32(v13.16b) + HTOLE32(v14.16b) + HTOLE32(v15.16b) eor v16.16b, v16.16b, v0.16b eor v17.16b, v17.16b, v1.16b