# HG changeset patch # User Taylor R Campbell # Date 1593309656 0 # Sun Jun 28 02:00:56 2020 +0000 # Branch trunk # Node ID 63ee790c67c5e2606ea021c1909ee263476418be # Parent 6a9b86125802c90cb2ad321556da68e4a7533381 # EXP-Topic riastradh-kernelcrypto New permutation-based AES implementation using ARM NEON. Work in progress -- clang support requires some work to adapt the hokey intrinsic stubs, or just get the real arm_neon.h to be available during the kernel build. In principle this should work on armv7, but in practice there are some barriers: - need to implement fpu_kern_enter/leave - need to figure out how to pacify the linker: armv7--netbsdelf-eabihf-ld: error: aes_neon.o uses VFP register arguments, netbsd does not armv7--netbsdelf-eabihf-ld: failed to merge target specific data of file aes_neon.o - need to find the right place to call aes_md_init(&aes_neon_impl) diff -r 6a9b86125802 -r 63ee790c67c5 sys/arch/aarch64/aarch64/cpu.c --- a/sys/arch/aarch64/aarch64/cpu.c Fri Jun 26 23:17:30 2020 +0000 +++ b/sys/arch/aarch64/aarch64/cpu.c Sun Jun 28 02:00:56 2020 +0000 @@ -45,6 +45,7 @@ #include #include +#include #include #include @@ -601,16 +602,24 @@ cpu_setup_aes(device_t dv, struct cpu_in { struct aarch64_sysctl_cpu_id *id = &ci->ci_id; - /* Verify that it is supported. */ + /* Check for ARMv8.0-AES support. */ switch (__SHIFTOUT(id->ac_aa64isar0, ID_AA64ISAR0_EL1_AES)) { case ID_AA64ISAR0_EL1_AES_AES: case ID_AA64ISAR0_EL1_AES_PMUL: - break; + aes_md_init(&aes_armv8_impl); + return; default: - return; + break; } - aes_md_init(&aes_armv8_impl); + /* Failing that, check for SIMD support. */ + switch (__SHIFTOUT(id->ac_aa64pfr0, ID_AA64PFR0_EL1_ADVSIMD)) { + case ID_AA64PFR0_EL1_ADV_SIMD_IMPL: + aes_md_init(&aes_neon_impl); + return; + default: + break; + } } #ifdef MULTIPROCESSOR diff -r 6a9b86125802 -r 63ee790c67c5 sys/arch/aarch64/conf/files.aarch64 --- a/sys/arch/aarch64/conf/files.aarch64 Fri Jun 26 23:17:30 2020 +0000 +++ b/sys/arch/aarch64/conf/files.aarch64 Sun Jun 28 02:00:56 2020 +0000 @@ -141,3 +141,6 @@ file dev/tprof/tprof_armv8.c tprof nee # ARMv8.0-AES include "crypto/aes/arch/arm/files.aesarmv8" + +# vpaes with ARM NEON +include "crypto/aes/arch/arm/files.aesneon" diff -r 6a9b86125802 -r 63ee790c67c5 sys/arch/arm/conf/Makefile.arm --- a/sys/arch/arm/conf/Makefile.arm Fri Jun 26 23:17:30 2020 +0000 +++ b/sys/arch/arm/conf/Makefile.arm Sun Jun 28 02:00:56 2020 +0000 @@ -47,7 +47,7 @@ AFLAGS.fiq_subr.S+= ${CLANG_OBSOLETE_MUL AFLAGS.fusu.S+= -marm AFLAGS.irq_dispatch.S+= -marm ${CLANG_OBSOLETE_MULTI_ST} AFLAGS.locore.S+= -marm ${CLANG_OBSOLETE_MULTI_ST} -CFLAGS+= -mfloat-abi=soft +CFLAGS+= ${FLOATABI.${.IMPSRC:T}:U-mfloat-abi=soft} # This files use instructions deprecated for ARMv7+, but still # included in kernel that build with higher -mcpu=... settings. diff -r 6a9b86125802 -r 63ee790c67c5 sys/arch/arm/conf/files.arm --- a/sys/arch/arm/conf/files.arm Fri Jun 26 23:17:30 2020 +0000 +++ b/sys/arch/arm/conf/files.arm Sun Jun 28 02:00:56 2020 +0000 @@ -262,3 +262,7 @@ file arch/arm/arm/linux_trap.c compat_l # profiling support file dev/tprof/tprof_armv7.c tprof + +# vpaes with ARM NEON -- disabled for now pending arm32 kernel fpu +# support and ctf +#include "crypto/aes/arch/arm/files.aesneon" diff -r 6a9b86125802 -r 63ee790c67c5 sys/crypto/aes/arch/arm/aes_neon.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/crypto/aes/arch/arm/aes_neon.c Sun Jun 28 02:00:56 2020 +0000 @@ -0,0 +1,618 @@ +/* $NetBSD$ */ + +/*- + * Copyright (c) 2020 The NetBSD Foundation, Inc.
+ * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Permutation-based AES using NEON, derived from Mike Hamburg's VPAES + * software, at , described in + * + * Mike Hamburg, `Accelerating AES with Vector Permute + * Instructions', in Christophe Clavier and Kris Gaj (eds.), + * Cryptographic Hardware and Embedded Systems -- CHES 2009, + * Springer LNCS 5747, pp. 18-32. + * + * https://link.springer.com/chapter/10.1007/978-3-642-04138-9_2 + */ + +#include +__KERNEL_RCSID(1, "$NetBSD$"); + +#include + +#include + +#include "aes_neon_internal.h" + +static const uint8x16_t +mc_forward[4] = { + {0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04, + 0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C}, + {0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08, + 0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00}, + {0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C, + 0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04}, + {0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00, + 0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08}, +}, +mc_backward[4] = { + {0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06, + 0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E}, + {0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02, + 0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A}, + {0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E, + 0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06}, + {0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A, + 0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02}, +}, +ipt[2] = { + {0x00,0x70,0x2A,0x5A,0x98,0xE8,0xB2,0xC2, + 0x08,0x78,0x22,0x52,0x90,0xE0,0xBA,0xCA}, + {0x00,0x4D,0x7C,0x31,0x7D,0x30,0x01,0x4C, + 0x81,0xCC,0xFD,0xB0,0xFC,0xB1,0x80,0xCD}, +}, +opt[2] = { + {0x00,0x60,0xB6,0xD6,0x29,0x49,0x9F,0xFF, + 0x08,0x68,0xBE,0xDE,0x21,0x41,0x97,0xF7}, + {0x00,0xEC,0xBC,0x50,0x51,0xBD,0xED,0x01, + 0xE0,0x0C,0x5C,0xB0,0xB1,0x5D,0x0D,0xE1}, +}, +dipt[2] = { + {0x00,0x5F,0x54,0x0B,0x04,0x5B,0x50,0x0F, + 0x1A,0x45,0x4E,0x11,0x1E,0x41,0x4A,0x15}, + {0x00,0x65,0x05,0x60,0xE6,0x83,0xE3,0x86, + 0x94,0xF1,0x91,0xF4,0x72,0x17,0x77,0x12}, +}, +sb1[2] = { + {0x00,0x3E,0x50,0xCB,0x8F,0xE1,0x9B,0xB1, + 0x44,0xF5,0x2A,0x14,0x6E,0x7A,0xDF,0xA5}, + {0x00,0x23,0xE2,0xFA,0x15,0xD4,0x18,0x36, + 0xEF,0xD9,0x2E,0x0D,0xC1,0xCC,0xF7,0x3B}, +}, +sb2[2] = { + {0x00,0x24,0x71,0x0B,0xC6,0x93,0x7A,0xE2, + 0xCD,0x2F,0x98,0xBC,0x55,0xE9,0xB7,0x5E}, + {0x00,0x29,0xE1,0x0A,0x40,0x88,0xEB,0x69, + 
0x4A,0x23,0x82,0xAB,0xC8,0x63,0xA1,0xC2}, +}, +sbo[2] = { + {0x00,0xC7,0xBD,0x6F,0x17,0x6D,0xD2,0xD0, + 0x78,0xA8,0x02,0xC5,0x7A,0xBF,0xAA,0x15}, + {0x00,0x6A,0xBB,0x5F,0xA5,0x74,0xE4,0xCF, + 0xFA,0x35,0x2B,0x41,0xD1,0x90,0x1E,0x8E}, +}, +dsb9[2] = { + {0x00,0xD6,0x86,0x9A,0x53,0x03,0x1C,0x85, + 0xC9,0x4C,0x99,0x4F,0x50,0x1F,0xD5,0xCA}, + {0x00,0x49,0xD7,0xEC,0x89,0x17,0x3B,0xC0, + 0x65,0xA5,0xFB,0xB2,0x9E,0x2C,0x5E,0x72}, +}, +dsbd[2] = { + {0x00,0xA2,0xB1,0xE6,0xDF,0xCC,0x57,0x7D, + 0x39,0x44,0x2A,0x88,0x13,0x9B,0x6E,0xF5}, + {0x00,0xCB,0xC6,0x24,0xF7,0xFA,0xE2,0x3C, + 0xD3,0xEF,0xDE,0x15,0x0D,0x18,0x31,0x29}, +}, +dsbb[2] = { + {0x00,0x42,0xB4,0x96,0x92,0x64,0x22,0xD0, + 0x04,0xD4,0xF2,0xB0,0xF6,0x46,0x26,0x60}, + {0x00,0x67,0x59,0xCD,0xA6,0x98,0x94,0xC1, + 0x6B,0xAA,0x55,0x32,0x3E,0x0C,0xFF,0xF3}, +}, +dsbe[2] = { + {0x00,0xD0,0xD4,0x26,0x96,0x92,0xF2,0x46, + 0xB0,0xF6,0xB4,0x64,0x04,0x60,0x42,0x22}, + {0x00,0xC1,0xAA,0xFF,0xCD,0xA6,0x55,0x0C, + 0x32,0x3E,0x59,0x98,0x6B,0xF3,0x67,0x94}, +}, +dsbo[2] = { + {0x00,0x40,0xF9,0x7E,0x53,0xEA,0x87,0x13, + 0x2D,0x3E,0x94,0xD4,0xB9,0x6D,0xAA,0xC7}, + {0x00,0x1D,0x44,0x93,0x0F,0x56,0xD7,0x12, + 0x9C,0x8E,0xC5,0xD8,0x59,0x81,0x4B,0xCA}, +}, +dks1[2] = { + {0x00,0xA7,0xD9,0x7E,0xC8,0x6F,0x11,0xB6, + 0xFC,0x5B,0x25,0x82,0x34,0x93,0xED,0x4A}, + {0x00,0x33,0x14,0x27,0x62,0x51,0x76,0x45, + 0xCE,0xFD,0xDA,0xE9,0xAC,0x9F,0xB8,0x8B}, +}, +dks2[2] = { + {0x00,0x64,0xA8,0xCC,0xEB,0x8F,0x43,0x27, + 0x61,0x05,0xC9,0xAD,0x8A,0xEE,0x22,0x46}, + {0x00,0xDD,0x92,0x4F,0xCE,0x13,0x5C,0x81, + 0xF2,0x2F,0x60,0xBD,0x3C,0xE1,0xAE,0x73}, +}, +dks3[2] = { + {0x00,0xC7,0xC6,0x01,0x02,0xC5,0xC4,0x03, + 0xFB,0x3C,0x3D,0xFA,0xF9,0x3E,0x3F,0xF8}, + {0x00,0xF7,0xCF,0x38,0xD6,0x21,0x19,0xEE, + 0x4B,0xBC,0x84,0x73,0x9D,0x6A,0x52,0xA5}, +}, +dks4[2] = { + {0x00,0x20,0x73,0x53,0xB0,0x90,0xC3,0xE3, + 0x43,0x63,0x30,0x10,0xF3,0xD3,0x80,0xA0}, + {0xE8,0x82,0x69,0x03,0x4B,0x21,0xCA,0xA0, + 0x67,0x0D,0xE6,0x8C,0xC4,0xAE,0x45,0x2F}, +}, +deskew[2] = { + {0x00,0xE3,0xA4,0x47,0x40,0xA3,0xE4,0x07, + 0x1A,0xF9,0xBE,0x5D,0x5A,0xB9,0xFE,0x1D}, + {0x00,0x69,0xEA,0x83,0xDC,0xB5,0x36,0x5F, + 0x77,0x1E,0x9D,0xF4,0xAB,0xC2,0x41,0x28}, +}, +sr[4] = { + {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, + 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}, + {0x00,0x05,0x0A,0x0F,0x04,0x09,0x0E,0x03, + 0x08,0x0D,0x02,0x07,0x0C,0x01,0x06,0x0B}, + {0x00,0x09,0x02,0x0B,0x04,0x0D,0x06,0x0F, + 0x08,0x01,0x0A,0x03,0x0C,0x05,0x0E,0x07}, + {0x00,0x0D,0x0A,0x07,0x04,0x01,0x0E,0x0B, + 0x08,0x05,0x02,0x0F,0x0C,0x09,0x06,0x03}, +}, +rcon = {0xB6,0xEE,0x9D,0xAF,0xB9,0x91,0x83,0x1F, + 0x81,0x7D,0x7C,0x4D,0x08,0x98,0x2A,0x70}, +s63 = {0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B, + 0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B}, +of = {0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F, + 0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F}, +inv = {0x80,0x01,0x08,0x0D,0x0F,0x06,0x05,0x0E, + 0x02,0x0C,0x0B,0x0A,0x09,0x03,0x07,0x04}, +inva = {0x80,0x07,0x0B,0x0F,0x06,0x0A,0x04,0x01, + 0x09,0x08,0x05,0x02,0x0C,0x0E,0x0D,0x03}; + +static inline uint8x16_t +loadroundkey(const void *rkp) +{ + return vld1q_u8(rkp); +} + +static inline void +storeroundkey(void *rkp, uint8x16_t rk) +{ + vst1q_u8(rkp, rk); +} + +/* Given abcdefgh, set *lo = 0b0d0f0h and *hi = 0a0c0e0g. */ +static inline void +bytes2nybbles(uint8x16_t *restrict lo, uint8x16_t *restrict hi, uint8x16_t x) +{ + + *lo = x & of; + *hi = vshrq_n_u8(x & ~of, 4); +} + +/* Given 0p0q0r0s, return 0x0y0z0w where x = a/p, y = a/q, &c. 
*/ +static inline uint8x16_t +gf16_inva(uint8x16_t x) +{ + return vqtbl1q_u8(inva, x); +} + +/* Given 0p0q0r0s, return 0x0y0z0w where x = 1/p, y = 1/q, &c. */ +static inline uint8x16_t +gf16_inv(uint8x16_t x) +{ + return vqtbl1q_u8(inv, x); +} + +/* + * t is a pair of maps respectively from low and high nybbles to bytes. + * Apply t the nybbles, and add the results in GF(2). + */ +static uint8x16_t +aes_schedule_transform(uint8x16_t x, const uint8x16_t t[static 2]) +{ + uint8x16_t lo, hi; + + bytes2nybbles(&lo, &hi, x); + return vqtbl1q_u8(t[0], lo) ^ vqtbl1q_u8(t[1], hi); +} + +static inline void +subbytes(uint8x16_t *io, uint8x16_t *jo, uint8x16_t x) +{ + uint8x16_t k, i, ak, j; + + bytes2nybbles(&k, &i, x); + ak = gf16_inva(k); + j = i ^ k; + *io = j ^ gf16_inv(ak ^ gf16_inv(i)); + *jo = i ^ gf16_inv(ak ^ gf16_inv(j)); +} + +static uint8x16_t +aes_schedule_low_round(uint8x16_t rk, uint8x16_t prk) +{ + uint8x16_t io, jo; + + /* smear prk */ + prk ^= vextq_u8(vdupq_n_u8(0), prk, 12); + prk ^= vextq_u8(vdupq_n_u8(0), prk, 8); + prk ^= s63; + + /* subbytes */ + subbytes(&io, &jo, rk); + rk = vqtbl1q_u8(sb1[0], io) ^ vqtbl1q_u8(sb1[1], jo); + + /* add in smeared stuff */ + return rk ^ prk; +} + +static uint8x16_t +aes_schedule_round(uint8x16_t rk, uint8x16_t prk, uint8x16_t *rcon_rot) +{ + uint32x4_t rk32; + + /* extract rcon from rcon_rot */ + prk ^= vextq_u8(*rcon_rot, vdupq_n_u8(0), 15); + *rcon_rot = vextq_u8(*rcon_rot, *rcon_rot, 15); + + /* rotate */ + rk32 = vreinterpretq_u32_u8(rk); + rk32 = vdupq_n_u32(vgetq_lane_u32(rk32, 3)); + rk = vreinterpretq_u8_u32(rk32); + rk = vextq_u8(rk, rk, 1); + + return aes_schedule_low_round(rk, prk); +} + +static uint8x16_t +aes_schedule_mangle_enc(uint8x16_t x, uint8x16_t sr_i) +{ + uint8x16_t y = vdupq_n_u8(0); + + x ^= s63; + + x = vqtbl1q_u8(x, mc_forward[0]); + y ^= x; + x = vqtbl1q_u8(x, mc_forward[0]); + y ^= x; + x = vqtbl1q_u8(x, mc_forward[0]); + y ^= x; + + return vqtbl1q_u8(y, sr_i); +} + +static uint8x16_t +aes_schedule_mangle_last_enc(uint8x16_t x, uint8x16_t sr_i) +{ + + return aes_schedule_transform(vqtbl1q_u8(x, sr_i) ^ s63, opt); +} + +static uint8x16_t +aes_schedule_mangle_dec(uint8x16_t x, uint8x16_t sr_i) +{ + uint8x16_t y = vdupq_n_u8(0); + + x = aes_schedule_transform(x, dks1); + y = vqtbl1q_u8(y ^ x, mc_forward[0]); + x = aes_schedule_transform(x, dks2); + y = vqtbl1q_u8(y ^ x, mc_forward[0]); + x = aes_schedule_transform(x, dks3); + y = vqtbl1q_u8(y ^ x, mc_forward[0]); + x = aes_schedule_transform(x, dks4); + y = vqtbl1q_u8(y ^ x, mc_forward[0]); + + return vqtbl1q_u8(y, sr_i); +} + +static uint8x16_t +aes_schedule_mangle_last_dec(uint8x16_t x) +{ + + return aes_schedule_transform(x ^ s63, deskew); +} + +static uint8x16_t +aes_schedule_192_smear(uint8x16_t prkhi, uint8x16_t prk) +{ + uint32x4_t prkhi32 = vreinterpretq_u32_u8(prkhi); + uint32x4_t prk32 = vreinterpretq_u32_u8(prk); + uint32x4_t rk32; + + rk32 = prkhi32; + rk32 ^= vsetq_lane_u32(vgetq_lane_u32(prkhi32, 2), + vdupq_n_u32(vgetq_lane_u32(prkhi32, 0)), + 3); + rk32 ^= vsetq_lane_u32(vgetq_lane_u32(prk32, 2), + vdupq_n_u32(vgetq_lane_u32(prk32, 3)), + 0); + + return vreinterpretq_u8_u32(rk32); +} + +static uint8x16_t +aes_schedule_192_smearhi(uint8x16_t rk) +{ + uint64x2_t rk64 = vreinterpretq_u64_u8(rk); + + rk64 = vsetq_lane_u64(0, rk64, 0); + + return vreinterpretq_u8_u64(rk64); +} + +void +aes_neon_setenckey(struct aesenc *enc, const uint8_t *key, unsigned nrounds) +{ + uint32_t *rk32 = enc->aese_aes.aes_rk; + uint8x16_t mrk; /* mangled round key */ + uint8x16_t 
rk; /* round key */ + uint8x16_t prk; /* previous round key */ + uint8x16_t rcon_rot = rcon; + uint64_t i = 3; + + /* input transform */ + rk = aes_schedule_transform(vld1q_u8(key), ipt); + storeroundkey(rk32, rk); + rk32 += 4; + + switch (nrounds) { + case 10: + for (;;) { + rk = aes_schedule_round(rk, rk, &rcon_rot); + if (--nrounds == 0) + break; + mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]); + storeroundkey(rk32, mrk); + rk32 += 4; + } + break; + case 12: { + uint8x16_t prkhi; /* high half of previous round key */ + + prk = rk; + rk = aes_schedule_transform(vld1q_u8(key + 8), ipt); + prkhi = aes_schedule_192_smearhi(rk); + for (;;) { + prk = aes_schedule_round(rk, prk, &rcon_rot); + rk = vextq_u8(prkhi, prk, 8); + + mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]); + storeroundkey(rk32, mrk); + rk32 += 4; + rk = aes_schedule_192_smear(prkhi, prk); + prkhi = aes_schedule_192_smearhi(rk); + + mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]); + storeroundkey(rk32, mrk); + rk32 += 4; + rk = prk = aes_schedule_round(rk, prk, &rcon_rot); + if ((nrounds -= 3) == 0) + break; + + mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]); + storeroundkey(rk32, mrk); + rk32 += 4; + rk = aes_schedule_192_smear(prkhi, prk); + prkhi = aes_schedule_192_smearhi(rk); + } + break; + } + case 14: { + uint8x16_t pprk; /* previous previous round key */ + + prk = rk; + rk = aes_schedule_transform(vld1q_u8(key + 16), ipt); + for (;;) { + mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]); + storeroundkey(rk32, mrk); + rk32 += 4; + pprk = rk; + + /* high round */ + rk = prk = aes_schedule_round(rk, prk, &rcon_rot); + if ((nrounds -= 2) == 0) + break; + mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]); + storeroundkey(rk32, mrk); + rk32 += 4; + + /* low round */ + rk = vreinterpretq_u8_u32( + vdupq_n_u32( + vgetq_lane_u32(vreinterpretq_u32_u8(rk), + 3))); + rk = aes_schedule_low_round(rk, pprk); + } + break; + } + default: + panic("invalid number of AES rounds: %u", nrounds); + } + storeroundkey(rk32, aes_schedule_mangle_last_enc(rk, sr[i-- % 4])); +} + +void +aes_neon_setdeckey(struct aesdec *dec, const uint8_t *key, unsigned nrounds) +{ + uint32_t *rk32 = dec->aesd_aes.aes_rk; + uint8x16_t mrk; /* mangled round key */ + uint8x16_t ork; /* original round key */ + uint8x16_t rk; /* round key */ + uint8x16_t prk; /* previous round key */ + uint8x16_t rcon_rot = rcon; + unsigned i = nrounds == 12 ? 
0 : 2; + + ork = vld1q_u8(key); + + /* input transform */ + rk = aes_schedule_transform(ork, ipt); + + /* go from end */ + rk32 += 4*nrounds; + storeroundkey(rk32, vqtbl1q_u8(ork, sr[i])); + rk32 -= 4; + i ^= 3; + + switch (nrounds) { + case 10: + for (;;) { + rk = aes_schedule_round(rk, rk, &rcon_rot); + if (--nrounds == 0) + break; + mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]); + storeroundkey(rk32, mrk); + rk32 -= 4; + } + break; + case 12: { + uint8x16_t prkhi; /* high half of previous round key */ + + prk = rk; + rk = aes_schedule_transform(vld1q_u8(key + 8), ipt); + prkhi = aes_schedule_192_smearhi(rk); + for (;;) { + prk = aes_schedule_round(rk, prk, &rcon_rot); + rk = vextq_u8(prkhi, prk, 8); + + mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]); + storeroundkey(rk32, mrk); + rk32 -= 4; + rk = aes_schedule_192_smear(prkhi, prk); + prkhi = aes_schedule_192_smearhi(rk); + + mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]); + storeroundkey(rk32, mrk); + rk32 -= 4; + rk = prk = aes_schedule_round(rk, prk, &rcon_rot); + if ((nrounds -= 3) == 0) + break; + + mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]); + storeroundkey(rk32, mrk); + rk32 -= 4; + rk = aes_schedule_192_smear(prkhi, prk); + prkhi = aes_schedule_192_smearhi(rk); + } + break; + } + case 14: { + uint8x16_t pprk; /* previous previous round key */ + + prk = rk; + rk = aes_schedule_transform(vld1q_u8(key + 16), ipt); + for (;;) { + mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]); + storeroundkey(rk32, mrk); + rk32 -= 4; + pprk = rk; + + /* high round */ + rk = prk = aes_schedule_round(rk, prk, &rcon_rot); + if ((nrounds -= 2) == 0) + break; + mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]); + storeroundkey(rk32, mrk); + rk32 -= 4; + + /* low round */ + rk = vreinterpretq_u8_u32( + vdupq_n_u32( + vgetq_lane_u32(vreinterpretq_u32_u8(rk), + 3))); + rk = aes_schedule_low_round(rk, pprk); + } + break; + } + default: + panic("invalid number of AES rounds: %u", nrounds); + } + storeroundkey(rk32, aes_schedule_mangle_last_dec(rk)); +} + +uint8x16_t +aes_neon_enc1(const struct aesenc *enc, uint8x16_t x, unsigned nrounds) +{ + const uint32_t *rk32 = enc->aese_aes.aes_rk; + uint8x16_t io, jo; + unsigned rmod4 = 0; + + x = aes_schedule_transform(x, ipt); + x ^= loadroundkey(rk32); + for (;;) { + uint8x16_t A, A2, A2_B, A2_B_D; + + subbytes(&io, &jo, x); + + rk32 += 4; + rmod4 = (rmod4 + 1) % 4; + if (--nrounds == 0) + break; + + A = vqtbl1q_u8(sb1[0], io) ^ vqtbl1q_u8(sb1[1], jo); + A ^= loadroundkey(rk32); + A2 = vqtbl1q_u8(sb2[0], io) ^ vqtbl1q_u8(sb2[1], jo); + A2_B = A2 ^ vqtbl1q_u8(A, mc_forward[rmod4]); + A2_B_D = A2_B ^ vqtbl1q_u8(A, mc_backward[rmod4]); + x = A2_B_D ^ vqtbl1q_u8(A2_B, mc_forward[rmod4]); + } + x = vqtbl1q_u8(sbo[0], io) ^ vqtbl1q_u8(sbo[1], jo); + x ^= loadroundkey(rk32); + x = vqtbl1q_u8(x, sr[rmod4]); + + return x; +} + +uint8x16_t +aes_neon_dec1(const struct aesdec *dec, uint8x16_t x, unsigned nrounds) +{ + const uint32_t *rk32 = dec->aesd_aes.aes_rk; + unsigned i = 3 & ~(nrounds - 1); + uint8x16_t io, jo, mc; + + x = aes_schedule_transform(x, dipt); + x ^= loadroundkey(rk32); + rk32 += 4; + + mc = mc_forward[3]; + for (;;) { + subbytes(&io, &jo, x); + if (--nrounds == 0) + break; + + x = vqtbl1q_u8(dsb9[0], io) ^ vqtbl1q_u8(dsb9[1], jo); + x ^= loadroundkey(rk32); + rk32 += 4; /* next round key */ + + x = vqtbl1q_u8(x, mc); + x ^= vqtbl1q_u8(dsbd[0], io) ^ vqtbl1q_u8(dsbd[1], jo); + + x = vqtbl1q_u8(x, mc); + x ^= vqtbl1q_u8(dsbb[0], io) ^ vqtbl1q_u8(dsbb[1], jo); + + x = vqtbl1q_u8(x, mc); + x ^= 
vqtbl1q_u8(dsbe[0], io) ^ vqtbl1q_u8(dsbe[1], jo); + + mc = vextq_u8(mc, mc, 12); + } + + x = vqtbl1q_u8(dsbo[0], io) ^ vqtbl1q_u8(dsbo[1], jo); + x ^= loadroundkey(rk32); + return vqtbl1q_u8(x, sr[i]); +} diff -r 6a9b86125802 -r 63ee790c67c5 sys/crypto/aes/arch/arm/aes_neon.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/crypto/aes/arch/arm/aes_neon.h Sun Jun 28 02:00:56 2020 +0000 @@ -0,0 +1,36 @@ +/* $NetBSD$ */ + +/*- + * Copyright (c) 2020 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _CRYPTO_AES_ARCH_ARM_AES_NEON_H +#define _CRYPTO_AES_ARCH_ARM_AES_NEON_H + +#include + +extern struct aes_impl aes_neon_impl; + +#endif /* _CRYPTO_AES_ARCH_ARM_AES_NEON_H */ diff -r 6a9b86125802 -r 63ee790c67c5 sys/crypto/aes/arch/arm/aes_neon_impl.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/crypto/aes/arch/arm/aes_neon_impl.c Sun Jun 28 02:00:56 2020 +0000 @@ -0,0 +1,173 @@ +/* $NetBSD$ */ + +/*- + * Copyright (c) 2020 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__KERNEL_RCSID(1, "$NetBSD$"); + +#include +#include + +#include +#include + +#ifdef __aarch64__ +#include +#include +#else +#include +#define fpu_kern_enter() ((void)0) +#define fpu_kern_leave() ((void)0) +#endif + +#include "aes_neon_subr.h" + +static void +aes_neon_setenckey_impl(struct aesenc *enc, const uint8_t *key, + uint32_t nrounds) +{ + + fpu_kern_enter(); + aes_neon_setenckey(enc, key, nrounds); + fpu_kern_leave(); +} + +static void +aes_neon_setdeckey_impl(struct aesdec *dec, const uint8_t *key, + uint32_t nrounds) +{ + + fpu_kern_enter(); + aes_neon_setdeckey(dec, key, nrounds); + fpu_kern_leave(); +} + +static void +aes_neon_enc_impl(const struct aesenc *enc, const uint8_t in[static 16], + uint8_t out[static 16], uint32_t nrounds) +{ + + fpu_kern_enter(); + aes_neon_enc(enc, in, out, nrounds); + fpu_kern_leave(); +} + +static void +aes_neon_dec_impl(const struct aesdec *dec, const uint8_t in[static 16], + uint8_t out[static 16], uint32_t nrounds) +{ + + fpu_kern_enter(); + aes_neon_dec(dec, in, out, nrounds); + fpu_kern_leave(); +} + +static void +aes_neon_cbc_enc_impl(const struct aesenc *enc, const uint8_t in[static 16], + uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16], + uint32_t nrounds) +{ + + fpu_kern_enter(); + aes_neon_cbc_enc(enc, in, out, nbytes, iv, nrounds); + fpu_kern_leave(); +} + +static void +aes_neon_cbc_dec_impl(const struct aesdec *dec, const uint8_t in[static 16], + uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16], + uint32_t nrounds) +{ + + fpu_kern_enter(); + aes_neon_cbc_dec(dec, in, out, nbytes, iv, nrounds); + fpu_kern_leave(); +} + +static void +aes_neon_xts_enc_impl(const struct aesenc *enc, const uint8_t in[static 16], + uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16], + uint32_t nrounds) +{ + + fpu_kern_enter(); + aes_neon_xts_enc(enc, in, out, nbytes, iv, nrounds); + fpu_kern_leave(); +} + +static void +aes_neon_xts_dec_impl(const struct aesdec *dec, const uint8_t in[static 16], + uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16], + uint32_t nrounds) +{ + + fpu_kern_enter(); + aes_neon_xts_dec(dec, in, out, nbytes, iv, nrounds); + fpu_kern_leave(); +} + +static int +aes_neon_probe(void) +{ +#ifdef __aarch64__ + struct aarch64_sysctl_cpu_id *id; +#endif + int result = 0; + + /* Verify that the CPU supports NEON. 
*/ +#ifdef __aarch64__ + id = &curcpu()->ci_id; + switch (__SHIFTOUT(id->ac_aa64pfr0, ID_AA64PFR0_EL1_ADVSIMD)) { + case ID_AA64PFR0_EL1_ADV_SIMD_IMPL: + break; + default: + return -1; + } +#else + if (!cpu_neon_present) + return -1; +#endif + + fpu_kern_enter(); + result = aes_neon_selftest(); + fpu_kern_leave(); + + return result; +} + +struct aes_impl aes_neon_impl = { + .ai_name = "ARM NEON vpaes", + .ai_probe = aes_neon_probe, + .ai_setenckey = aes_neon_setenckey_impl, + .ai_setdeckey = aes_neon_setdeckey_impl, + .ai_enc = aes_neon_enc_impl, + .ai_dec = aes_neon_dec_impl, + .ai_cbc_enc = aes_neon_cbc_enc_impl, + .ai_cbc_dec = aes_neon_cbc_dec_impl, + .ai_xts_enc = aes_neon_xts_enc_impl, + .ai_xts_dec = aes_neon_xts_dec_impl, +}; diff -r 6a9b86125802 -r 63ee790c67c5 sys/crypto/aes/arch/arm/aes_neon_internal.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/crypto/aes/arch/arm/aes_neon_internal.h Sun Jun 28 02:00:56 2020 +0000 @@ -0,0 +1,43 @@ +/* $NetBSD$ */ + +/*- + * Copyright (c) 2020 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _CRYPTO_AES_ARCH_ARM_AES_NEON_INTERNAL_H +#define _CRYPTO_AES_ARCH_ARM_AES_NEON_INTERNAL_H + +#include + +#include "arm_neon.h" + +#include + +#include "aes_neon_subr.h" + +uint8x16_t aes_neon_enc1(const struct aesenc *, uint8x16_t, unsigned); +uint8x16_t aes_neon_dec1(const struct aesdec *, uint8x16_t, unsigned); + +#endif /* _CRYPTO_AES_ARCH_ARM_AES_NEON_INTERNAL_H */ diff -r 6a9b86125802 -r 63ee790c67c5 sys/crypto/aes/arch/arm/aes_neon_subr.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/crypto/aes/arch/arm/aes_neon_subr.c Sun Jun 28 02:00:56 2020 +0000 @@ -0,0 +1,217 @@ +/* $NetBSD$ */ + +/*- + * Copyright (c) 2020 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__KERNEL_RCSID(1, "$NetBSD$"); + +#include + +#include + +#include "aes_neon_internal.h" +#include "aes_neon_subr.h" + +static inline uint8x16_t +loadblock(const void *in) +{ + return vld1q_u8(in); +} + +static inline void +storeblock(void *out, uint8x16_t block) +{ + vst1q_u8(out, block); +} + +void +aes_neon_enc(const struct aesenc *enc, const uint8_t in[static 16], + uint8_t out[static 16], uint32_t nrounds) +{ + uint8x16_t block; + + block = loadblock(in); + block = aes_neon_enc1(enc, block, nrounds); + storeblock(out, block); +} + +void +aes_neon_dec(const struct aesdec *dec, const uint8_t in[static 16], + uint8_t out[static 16], uint32_t nrounds) +{ + uint8x16_t block; + + block = loadblock(in); + block = aes_neon_dec1(dec, block, nrounds); + storeblock(out, block); +} + +void +aes_neon_cbc_enc(const struct aesenc *enc, const uint8_t in[static 16], + uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16], + uint32_t nrounds) +{ + uint8x16_t cv; + + KASSERT(nbytes); + + cv = loadblock(iv); + for (; nbytes; nbytes -= 16, in += 16, out += 16) { + cv ^= loadblock(in); + cv = aes_neon_enc1(enc, cv, nrounds); + storeblock(out, cv); + } + storeblock(iv, cv); +} + +void +aes_neon_cbc_dec(const struct aesdec *dec, const uint8_t in[static 16], + uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16], + uint32_t nrounds) +{ + uint8x16_t iv0, cv, b; + + KASSERT(nbytes); + KASSERT(nbytes % 16 == 0); + + iv0 = loadblock(iv); + cv = loadblock(in + nbytes - 16); + storeblock(iv, cv); + + for (;;) { + b = aes_neon_dec1(dec, cv, nrounds); + if ((nbytes -= 16) == 0) + break; + cv = loadblock(in + nbytes - 16); + storeblock(out + nbytes, b ^ cv); + } + storeblock(out, b ^ iv0); +} + +static inline uint8x16_t +aes_neon_xts_update(uint8x16_t t8) +{ + const int32x4_t zero = vdupq_n_s32(0); + const int32x4_t carry = {0x87, 1, 1, 1}; + int32x4_t t, t_; + uint32x4_t mask; + + t = vreinterpretq_s32_u8(t8); + mask = vcltq_s32(t, zero); /* -1 if high bit set else 0 */ + mask = vextq_u32(mask, mask, 3); /* rotate quarters */ + t_ = vsliq_n_s32(zero, t, 1); /* shift */ + t_ ^= carry & mask; + + return vreinterpretq_u8_s32(t_); +} + +static int +aes_neon_xts_update_selftest(void) +{ + static const struct { + uint32_t in[4], out[4]; + } cases[] = { + [0] = { {1}, {2} }, + [1] = { {0x80000000U,0,0,0}, {0,1,0,0} }, + [2] = { {0,0x80000000U,0,0}, {0,0,1,0} }, + [3] = { {0,0,0x80000000U,0}, {0,0,0,1} }, + [4] = { {0,0,0,0x80000000U}, {0x87,0,0,0} }, + [5] = { {0,0x80000000U,0,0x80000000U}, 
{0x87,0,1,0} }, + }; + unsigned i; + uint32_t t[4]; + int result = 0; + + for (i = 0; i < sizeof(cases)/sizeof(cases[0]); i++) { + t[0] = cases[i].in[0]; + t[1] = cases[i].in[1]; + t[2] = cases[i].in[2]; + t[3] = cases[i].in[3]; + storeblock(t, aes_neon_xts_update(loadblock(t))); + if (t[0] != cases[i].out[0] || + t[1] != cases[i].out[1] || + t[2] != cases[i].out[2] || + t[3] != cases[i].out[3]) { + printf("%s %u:" + " %"PRIx32" %"PRIx32" %"PRIx32" %"PRIx32"\n", + __func__, i, t[0], t[1], t[2], t[3]); + result = -1; + } + } + + return result; +} + +void +aes_neon_xts_enc(const struct aesenc *enc, const uint8_t in[static 16], + uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16], + uint32_t nrounds) +{ + uint8x16_t t, b; + + KASSERT(nbytes); + KASSERT(nbytes % 16 == 0); + + t = loadblock(tweak); + for (; nbytes; nbytes -= 16, in += 16, out += 16) { + b = t ^ loadblock(in); + b = aes_neon_enc1(enc, b, nrounds); + storeblock(out, t ^ b); + t = aes_neon_xts_update(t); + } + storeblock(tweak, t); +} + +void +aes_neon_xts_dec(const struct aesdec *dec, const uint8_t in[static 16], + uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16], + uint32_t nrounds) +{ + uint8x16_t t, b; + + KASSERT(nbytes); + KASSERT(nbytes % 16 == 0); + + t = loadblock(tweak); + for (; nbytes; nbytes -= 16, in += 16, out += 16) { + b = t ^ loadblock(in); + b = aes_neon_dec1(dec, b, nrounds); + storeblock(out, t ^ b); + t = aes_neon_xts_update(t); + } + storeblock(tweak, t); +} + +int +aes_neon_selftest(void) +{ + + if (aes_neon_xts_update_selftest()) + return -1; + + return 0; +} diff -r 6a9b86125802 -r 63ee790c67c5 sys/crypto/aes/arch/arm/aes_neon_subr.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/crypto/aes/arch/arm/aes_neon_subr.h Sun Jun 28 02:00:56 2020 +0000 @@ -0,0 +1,60 @@ +/* $NetBSD$ */ + +/*- + * Copyright (c) 2020 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _CRYPTO_AES_ARCH_ARM_AES_NEON_SUBR_H +#define _CRYPTO_AES_ARCH_ARM_AES_NEON_SUBR_H + +#include + +/* + * These functions MUST NOT use any vector registers for parameters or + * results -- the caller is compiled with -mno-sse &c. 
in the kernel, + * and dynamically turns on the vector unit just before calling them. + * Internal subroutines that use the vector unit for parameters are + * declared in aes_neon_internal.h instead. + */ + +void aes_neon_setenckey(struct aesenc *, const uint8_t *, unsigned); +void aes_neon_setdeckey(struct aesdec *, const uint8_t *, unsigned); + +void aes_neon_enc(const struct aesenc *, const uint8_t[static 16], + uint8_t[static 16], uint32_t); +void aes_neon_dec(const struct aesdec *, const uint8_t[static 16], + uint8_t[static 16], uint32_t); +void aes_neon_cbc_enc(const struct aesenc *, const uint8_t[static 16], + uint8_t[static 16], size_t, uint8_t[static 16], uint32_t); +void aes_neon_cbc_dec(const struct aesdec *, const uint8_t[static 16], + uint8_t[static 16], size_t, uint8_t[static 16], uint32_t); +void aes_neon_xts_enc(const struct aesenc *, const uint8_t[static 16], + uint8_t[static 16], size_t, uint8_t[static 16], uint32_t); +void aes_neon_xts_dec(const struct aesdec *, const uint8_t[static 16], + uint8_t[static 16], size_t, uint8_t[static 16], uint32_t); + +int aes_neon_selftest(void); + +#endif /* _CRYPTO_AES_ARCH_ARM_AES_NEON_SUBR_H */ diff -r 6a9b86125802 -r 63ee790c67c5 sys/crypto/aes/arch/arm/arm_neon.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/crypto/aes/arch/arm/arm_neon.h Sun Jun 28 02:00:56 2020 +0000 @@ -0,0 +1,311 @@ +/* $NetBSD$ */ + +/*- + * Copyright (c) 2020 The NetBSD Foundation, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _SYS_CRYPTO_AES_ARCH_ARM_ARM_NEON_H +#define _SYS_CRYPTO_AES_ARCH_ARM_ARM_NEON_H + +#if defined(__GNUC__) && !defined(__clang__) + +#define _INTRINSATTR \ + __extension__ \ + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + +#ifdef __aarch64__ +typedef __Int32x4_t int32x4_t; +typedef __Int64x2_t int64x2_t; +typedef __Int8x16_t int8x16_t; +typedef __Uint32x4_t uint32x4_t; +typedef __Uint64x2_t uint64x2_t; +typedef __Uint8x16_t uint8x16_t; +#else +typedef __simd128_int32_t int32x4_t; +typedef __simd128_int64_t int64x2_t; +typedef __simd128_int8_t int8x16_t; +typedef __simd128_uint32_t uint32x4_t; +typedef __simd128_uint64_t uint64x2_t; +typedef __simd128_uint8_t uint8x16_t; + +typedef __simd64_int8_t int8x8_t; +typedef __simd64_uint8_t uint8x8_t; +typedef __builtin_neon_udi uint64x1_t; +typedef struct { uint8x8_t val[2]; } uint8x8x2_t; +#endif + +#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN) +#define __neon_lane_index(__v, __i) (__arraycount(__v) - 1 - __i) +#else +#define __neon_lane_index(__v, __i) __i +#endif + +#elif defined(__clang__) + +#define _INTRINSATTR \ + __attribute__((__always_inline__, __nodebug)) + +typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t; +typedef __attribute__((neon_vector_type(2))) int64_t int64x2_t; +typedef __attribute__((neon_vector_type(4))) int32_t int32x4_t; +typedef __attribute__((neon_vector_type(16))) uint8_t uint8x16_t; +typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t; +typedef __attribute__((neon_vector_type(4))) uint32_t uint32x4_t; +typedef struct { uint8x8_t val[2]; } uint8x8x2_t; + +#error Teach me how to neon in clang! + +#else + +#error Teach me how to neon in your compile! + +#endif + +_INTRINSATTR +static __inline uint32x4_t +vcltq_s32(int32x4_t __v0, int32x4_t __v1) +{ + return (uint32x4_t)(__v0 < __v1); +} + +_INTRINSATTR +static __inline int32x4_t +vdupq_n_s32(int32_t __x) +{ + return (int32x4_t) { __x, __x, __x, __x }; +} + +_INTRINSATTR +static __inline uint32x4_t +vdupq_n_u32(uint32_t __x) +{ + return (uint32x4_t) { __x, __x, __x, __x }; +} + +_INTRINSATTR +static __inline uint8x16_t +vdupq_n_u8(uint8_t __x) +{ + return (uint8x16_t) { + __x, __x, __x, __x, __x, __x, __x, __x, + __x, __x, __x, __x, __x, __x, __x, __x, + }; +} + +_INTRINSATTR +static __inline uint32x4_t +vextq_u32(uint32x4_t __lo, uint32x4_t __hi, uint8_t __i) +{ +#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN) + return __builtin_shuffle(__hi, __lo, + (uint32x4_t) { 4 - __i, 5 - __i, 6 - __i, 7 - __i }); +#else + return __builtin_shuffle(__lo, __hi, + (uint32x4_t) { __i + 0, __i + 1, __i + 2, __i + 3 }); +#endif +} + +_INTRINSATTR +static __inline uint8x16_t +vextq_u8(uint8x16_t __lo, uint8x16_t __hi, uint8_t __i) +{ +#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN) + return __builtin_shuffle(__hi, __lo, + (uint8x16_t) { + 16 - __i, 17 - __i, 18 - __i, 19 - __i, + 20 - __i, 21 - __i, 22 - __i, 23 - __i, + 24 - __i, 25 - __i, 26 - __i, 27 - __i, + 28 - __i, 29 - __i, 30 - __i, 31 - __i, + }); +#else + return __builtin_shuffle(__lo, __hi, + (uint8x16_t) { + __i + 0, __i + 1, __i + 2, __i + 3, + __i + 4, __i + 5, __i + 6, __i + 7, + __i + 8, __i + 9, __i + 10, __i + 11, + __i + 12, __i + 13, __i + 14, __i + 15, + }); +#endif +} + +_INTRINSATTR +static __inline uint32_t +vgetq_lane_u32(uint32x4_t __v, uint8_t __i) +{ +#ifdef __aarch64__ + return __v[__i]; +#else + return (uint32_t)__builtin_neon_vget_laneuv4si((int32x4_t)__v, __i); +#endif +} + +_INTRINSATTR +static __inline uint8x16_t 
+vld1q_u8(const uint8_t *__p8) +{ +#ifdef __aarch64__ + const __builtin_aarch64_simd_qi *__p = + (const __builtin_aarch64_simd_qi *)__p8; + + return (uint8x16_t)__builtin_aarch64_ld1v16qi(__p); +#else + const __builtin_neon_qi *__p = (const __builtin_neon_qi *)__p8; + + return (uint8x16_t)__builtin_neon_vld1v16qi(__p); +#endif +} + +_INTRINSATTR +static __inline uint8x16_t +vqtbl1q_u8(uint8x16_t __tab, uint8x16_t __idx) +{ +#ifdef __aarch64__ + uint8x16_t __res; + __asm__("tbl %0.16b, {%1.16b}, %2.16b" + : "=w"(__res) : "w"(__tab), "w"(__idx)); + return __res; +#else + /* + * No native ARMv7 NEON instruction for this, so do it via two + * half-width TBLs instead (vtbl2_u8 equivalent). + */ + uint64x2_t __tab64 = (uint64x2_t)__tab; + uint8x8_t __tablo = (uint8x8_t)__tab64[0]; + uint8x8_t __tabhi = (uint8x8_t)__tab64[1]; + uint8x8x2_t __tab8x8x2 = { { __tablo, __tabhi } }; + union { + uint8x8x2_t __u8x8x2; + __builtin_neon_ti __ti; + } __u = { __tab8x8x2 }; + uint64x2_t __idx64, __out64; + int8x8_t __idxlo, __idxhi, __outlo, __outhi; + + __idx64 = (uint64x2_t)__idx; + __idxlo = (int8x8_t)__idx64[0]; + __idxhi = (int8x8_t)__idx64[1]; + __outlo = (int8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, __idxlo); + __outhi = (int8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, __idxhi); + __out64 = (uint64x2_t) { (uint64x1_t)__outlo, (uint64x1_t)__outhi }; + + return (uint8x16_t)__out64; +#endif +} + +_INTRINSATTR +static __inline int32x4_t +vreinterpretq_s32_u8(uint8x16_t __v) +{ + return (int32x4_t)__v; +} + +_INTRINSATTR +static __inline uint32x4_t +vreinterpretq_u32_u8(uint8x16_t __v) +{ + return (uint32x4_t)__v; +} + +_INTRINSATTR +static __inline uint64x2_t +vreinterpretq_u64_u8(uint8x16_t __v) +{ + return (uint64x2_t)__v; +} + +_INTRINSATTR +static __inline uint8x16_t +vreinterpretq_u8_s32(int32x4_t __v) +{ + return (uint8x16_t)__v; +} + +_INTRINSATTR +static __inline uint8x16_t +vreinterpretq_u8_u32(uint32x4_t __v) +{ + return (uint8x16_t)__v; +} + +_INTRINSATTR +static __inline uint8x16_t +vreinterpretq_u8_u64(uint64x2_t __v) +{ + return (uint8x16_t)__v; +} + +_INTRINSATTR +static __inline uint32x4_t +vsetq_lane_u32(uint32_t __x, uint32x4_t __v, uint8_t __i) +{ + __v[__neon_lane_index(__v, __i)] = __x; + return __v; +} + +_INTRINSATTR +static __inline uint64x2_t +vsetq_lane_u64(uint64_t __x, uint64x2_t __v, uint8_t __i) +{ + __v[__neon_lane_index(__v, __i)] = __x; + return __v; +} + +_INTRINSATTR +static __inline uint8x16_t +vshrq_n_u8(uint8x16_t __v, uint8_t __bits) +{ +#ifdef __aarch64__ + return (uint8x16_t)__builtin_aarch64_lshrv16qi((int8x16_t)__v, __bits); +#else + return (uint8x16_t)__builtin_neon_vshru_nv16qi((int8x16_t)__v, __bits); +#endif +} + +_INTRINSATTR +static __inline int32x4_t +vsliq_n_s32(int32x4_t __vins, int32x4_t __vsh, uint8_t __bits) +{ +#ifdef __aarch64__ + return (int32x4_t)__builtin_aarch64_ssli_nv4si(__vins, __vsh, __bits); +#else + return (int32x4_t)__builtin_neon_vsli_nv4si(__vins, __vsh, __bits); +#endif +} + +_INTRINSATTR +static __inline void +vst1q_u8(uint8_t *__p8, uint8x16_t __v) +{ +#ifdef __aarch64__ + __builtin_aarch64_simd_qi *__p = (__builtin_aarch64_simd_qi *)__p8; + + __builtin_aarch64_st1v16qi(__p, (int8x16_t)__v); +#else + __builtin_neon_qi *__p = (__builtin_neon_qi *)__p8; + + __builtin_neon_vst1v16qi(__p, (int8x16_t)__v); +#endif +} + +#endif /* _SYS_CRYPTO_AES_ARCH_ARM_ARM_NEON_H */ diff -r 6a9b86125802 -r 63ee790c67c5 sys/crypto/aes/arch/arm/files.aesneon --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/crypto/aes/arch/arm/files.aesneon Sun Jun 28 
02:00:56 2020 +0000 @@ -0,0 +1,13 @@ +# $NetBSD$ + +ifdef aarch64 +makeoptions aes "COPTS.aes_neon.c"+="-march=armv8-a" +makeoptions aes "COPTS.aes_neon_subr.c"+="-march=armv8-a" +else +makeoptions aes "FLOATABI.aes_neon.c"+="-mfpu=neon -mfloat-abi=hard" +makeoptions aes "FLOATABI.aes_neon_subr.c"+="-mfpu=neon -mfloat-abi=hard" +endif + +file crypto/aes/arch/arm/aes_neon.c aes +file crypto/aes/arch/arm/aes_neon_impl.c aes +file crypto/aes/arch/arm/aes_neon_subr.c aes
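A note on the build glue: the Makefile.arm hunk replaces the blanket -mfloat-abi=soft with ${FLOATABI.${.IMPSRC:T}:U-mfloat-abi=soft}, and the arm32 branch of files.aesneon sets FLOATABI.aes_neon.c and FLOATABI.aes_neon_subr.c to "-mfpu=neon -mfloat-abi=hard". Reading this with the usual bmake modifiers (:T takes the basename of the implied source, :U supplies a default when the variable is undefined), every other object is still built soft-float, while just the two NEON source files are compiled for NEON with the hard-float ABI; mixing those ABIs in one link is presumably what triggers the "uses VFP register arguments" error quoted in the commit message.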
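The core vpaes trick in aes_neon.c is that every lookup table above has only sixteen entries: bytes2nybbles splits each byte into its low and high nybbles, vqtbl1q_u8 looks each nybble up in a 16-entry table across all sixteen lanes at once, and the two results are added in GF(2) (XORed). A byte-at-a-time portable C sketch of what aes_schedule_transform computes (the function name here is illustrative, not part of the patch):

	#include <stdint.h>

	/*
	 * Scalar equivalent of aes_schedule_transform: apply a pair of
	 * 16-entry tables to the low and high nybbles of each byte and
	 * add (XOR) the results in GF(2).
	 */
	static void
	nybble_transform(uint8_t out[16], const uint8_t in[16],
	    const uint8_t tlo[16], const uint8_t thi[16])
	{
		unsigned i;

		for (i = 0; i < 16; i++)
			out[i] = tlo[in[i] & 0xf] ^ thi[in[i] >> 4];
	}

Passing the ipt pair as tlo/thi gives the input transform applied to each round key in the key schedule and to the plaintext block at the top of aes_neon_enc1.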
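aes_neon_xts_update advances the XTS tweak by multiplying it by x in GF(2^128): vsliq_n_s32 shifts each 32-bit lane left by one, and the vcltq_s32/vextq_u32 pair builds a rotated carry mask so each lane's lost top bit lands as the low bit of the next lane, with the bit that falls off the last lane folded back in as the reduction constant 0x87. The selftest vectors exercise exactly this operation viewed as a 128-bit little-endian integer (as on a little-endian host); a portable sketch of the same update (hypothetical name, not part of the patch):

	#include <stdint.h>

	/*
	 * Multiply the 128-bit XTS tweak by x in GF(2^128): treat the 16
	 * bytes as a little-endian integer, shift left by one bit, and if
	 * a bit carries out of the top, reduce by x^128 + x^7 + x^2 + x + 1,
	 * i.e. XOR 0x87 into the low byte.
	 */
	static void
	xts_update(uint8_t t[16])
	{
		unsigned carry = 0, msb, i;

		for (i = 0; i < 16; i++) {
			msb = t[i] >> 7;
			t[i] = (uint8_t)((t[i] << 1) | carry);
			carry = msb;
		}
		if (carry)
			t[0] ^= 0x87;
	}

For example, selftest case [4], with only the top bit of the last 32-bit word set, shifts that bit out of the tweak and folds it back in as 0x87 in word 0, matching the expected output {0x87,0,0,0}.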