diff -r 1cb0546d18b6 sys/arch/amd64/amd64/genassym.cf --- a/sys/arch/amd64/amd64/genassym.cf Thu Apr 10 18:53:29 2025 +0000 +++ b/sys/arch/amd64/amd64/genassym.cf Fri Apr 11 22:00:30 2025 +0000 @@ -186,7 +186,6 @@ define PCB_FLAGS offsetof(struct pcb, p define PCB_COMPAT32 PCB_COMPAT32 define PCB_FS offsetof(struct pcb, pcb_fs) define PCB_GS offsetof(struct pcb, pcb_gs) -define PCB_SAVEFPU offsetof(struct pcb, pcb_savefpu) define TF_RDI offsetof(struct trapframe, tf_rdi) define TF_RSI offsetof(struct trapframe, tf_rsi) diff -r 1cb0546d18b6 sys/arch/amd64/include/param.h --- a/sys/arch/amd64/include/param.h Thu Apr 10 18:53:29 2025 +0000 +++ b/sys/arch/amd64/include/param.h Fri Apr 11 22:00:30 2025 +0000 @@ -69,11 +69,27 @@ #define SINCR 1 /* increment of stack/NBPG */ #if defined(KASAN) || defined(KMSAN) -#define UPAGES 8 +#define UPAGES_KxSAN 2 +#else +#define UPAGES_KxSAN 0 +#endif +#if defined(SVS) +#define UPAGES_SVS 1 +#else +#define UPAGES_SVS 0 +#endif +#define UPAGES_PCB 1 /* one page for the PCB */ +#define UPAGES_RED 1 /* one page for red zone between pcb/stack */ +#define UPAGES_STACK 3 /* three pages (12 KiB) of stack space */ +#define UPAGES \ + (UPAGES_PCB + UPAGES_RED + UPAGES_STACK + UPAGES_SVS + UPAGES_KxSAN) + +#if defined(KASAN) || defined(KMSAN) +__CTASSERT(UPAGES == 8); #elif defined(SVS) -#define UPAGES 6 /* 1 page used internally by SVS */ +__CTASSERT(UPAGES == 6); #else -#define UPAGES 5 /* pages of u-area (1 for redzone) */ +__CTASSERT(UPAGES == 5); #endif #define USPACE (UPAGES * NBPG) /* total size of u-area */ diff -r 1cb0546d18b6 sys/arch/amd64/include/pcb.h --- a/sys/arch/amd64/include/pcb.h Thu Apr 10 18:53:29 2025 +0000 +++ b/sys/arch/amd64/include/pcb.h Fri Apr 11 22:00:30 2025 +0000 @@ -97,13 +97,16 @@ struct pcb { struct dbreg *pcb_dbregs; uint16_t pcb_fpu_dflt_cw; int pcb_iopl; + union savefpu *pcb_savefpu; - uint32_t pcb_unused[8]; /* unused */ + uint32_t pcb_unused[6]; /* unused */ - union savefpu pcb_savefpu __aligned(64); /* 
floating point state */ + /* fpu state, if it fits; otherwise allocated separately */ + union savefpu pcb_savefpusmall __aligned(64); /* **** DO NOT ADD ANYTHING HERE **** */ }; #ifndef __lint__ +__CTASSERT(offsetof(struct pcb, pcb_savefpusmall) == 128); __CTASSERT(sizeof(struct pcb) - sizeof (union savefpu) == 128); #endif diff -r 1cb0546d18b6 sys/arch/i386/i386/genassym.cf --- a/sys/arch/i386/i386/genassym.cf Thu Apr 10 18:53:29 2025 +0000 +++ b/sys/arch/i386/i386/genassym.cf Fri Apr 11 22:00:30 2025 +0000 @@ -192,7 +192,6 @@ define PCB_ESP0 offsetof(struct pcb, pc define PCB_FSD offsetof(struct pcb, pcb_fsd) define PCB_GSD offsetof(struct pcb, pcb_gsd) define PCB_IOMAP offsetof(struct pcb, pcb_iomap) -define PCB_SAVEFPU offsetof(struct pcb, pcb_savefpu) define TF_CS offsetof(struct trapframe, tf_cs) define TF_EIP offsetof(struct trapframe, tf_eip) diff -r 1cb0546d18b6 sys/arch/i386/include/pcb.h --- a/sys/arch/i386/include/pcb.h Thu Apr 10 18:53:29 2025 +0000 +++ b/sys/arch/i386/include/pcb.h Fri Apr 11 22:00:30 2025 +0000 @@ -99,12 +99,14 @@ struct pcb { int not_used[15]; /* floating point state */ - union savefpu pcb_savefpu __aligned(64); + union savefpu pcb_savefpu[1] __aligned(64); +#define pcb_savefpusmall pcb_savefpu /* **** DO NOT ADD ANYTHING HERE **** */ }; #ifndef __lint__ /* This doesn't really matter, but there is a lot of implied padding */ +__CTASSERT(offsetof(struct pcb, pcb_savefpu) == 128); __CTASSERT(sizeof(struct pcb) - sizeof (union savefpu) == 128); #endif diff -r 1cb0546d18b6 sys/arch/x86/include/cpu.h --- a/sys/arch/x86/include/cpu.h Thu Apr 10 18:53:29 2025 +0000 +++ b/sys/arch/x86/include/cpu.h Fri Apr 11 22:00:30 2025 +0000 @@ -481,6 +481,7 @@ extern uint64_t x86_xsave_features; extern size_t x86_xsave_offsets[]; extern size_t x86_xsave_sizes[]; extern uint32_t x86_fpu_mxcsr_mask; +bool x86_fpu_save_separate_p(void); extern void (*x86_cpu_idle)(void); #define cpu_idle() (*x86_cpu_idle)() diff -r 1cb0546d18b6 
sys/arch/x86/include/cpu_extended_state.h --- a/sys/arch/x86/include/cpu_extended_state.h Thu Apr 10 18:53:29 2025 +0000 +++ b/sys/arch/x86/include/cpu_extended_state.h Fri Apr 11 22:00:30 2025 +0000 @@ -226,6 +226,8 @@ struct xstate { * It is defined this way to separate the definitions and to * minimise the number of union/struct selectors. * NB: Some userspace stuff (eg firefox) uses it to parse ucontext. + * NB: This is not actually the largest possible save space; + * x86_fpu_save_size may be larger. */ union savefpu { struct save87 sv_87; diff -r 1cb0546d18b6 sys/arch/x86/x86/fpu.c --- a/sys/arch/x86/x86/fpu.c Thu Apr 10 18:53:29 2025 +0000 +++ b/sys/arch/x86/x86/fpu.c Fri Apr 11 22:00:30 2025 +0000 @@ -136,11 +136,35 @@ void fpu_switch(struct lwp *, struct lwp uint32_t x86_fpu_mxcsr_mask __read_mostly = 0; +static const union savefpu safe_fpu_storage __aligned(64) = { + .sv_xmm = { + .fx_mxcsr = __SAFE_MXCSR__, + }, +}; +static const union savefpu zero_fpu_storage __aligned(64); + +static const void *safe_fpu __read_mostly = &safe_fpu_storage; +static const void *zero_fpu __read_mostly = &zero_fpu_storage; + +/* + * x86_fpu_save_separate_p() + * + * True if we allocate the FPU save space separately, outside the + * struct pcb itself, because it doesn't fit in a single page. 
+ */ +bool +x86_fpu_save_separate_p(void) +{ + + return x86_fpu_save_size > + PAGE_SIZE - offsetof(struct pcb, pcb_savefpusmall); +} + static inline union savefpu * fpu_lwp_area(struct lwp *l) { struct pcb *pcb = lwp_getpcb(l); - union savefpu *area = &pcb->pcb_savefpu; + union savefpu *area = pcb->pcb_savefpu; KASSERT((l->l_flag & LW_SYSTEM) == 0); if (l == curlwp) { @@ -155,7 +179,7 @@ static inline void fpu_save_lwp(struct lwp *l) { struct pcb *pcb = lwp_getpcb(l); - union savefpu *area = &pcb->pcb_savefpu; + union savefpu *area = pcb->pcb_savefpu; int s; s = splvm(); @@ -189,14 +213,91 @@ fpuinit(struct cpu_info *ci) stts(); } +/* + * fpuinit_mxcsr_mask() + * + * Called once by cpu_init on the primary CPU. Initializes + * x86_fpu_mxcsr_mask based on the initial FPU state, and + * initializes safe_fpu and zero_fpu if necessary when the + * hardware's FPU save size is larger than union savefpu. + * + * XXX Rename this function! + */ void fpuinit_mxcsr_mask(void) { + /* + * If the CPU's x86 fpu save size is larger than union savefpu, + * we have to allocate larger buffers for the safe and zero FPU + * states used here and by fpu_kern_enter/leave. + * + * Note: This is NOT the same as x86_fpu_save_separate_p(), + * which may have a little more space than union savefpu. + */ + const bool allocfpusave = x86_fpu_save_size > sizeof(union savefpu); + vaddr_t va; + +#ifdef __i386__ + if (x86_fpu_save_separate_p()) { + /* + * XXX Need to teach cpu_uarea_alloc/free to allocate a + * separate fpu save space, and make pcb_savefpu a + * pointer indirection -- currently only done on amd64, + * not on i386. 
+ * + * But the primary motivation on amd64 is the 8192-byte + * TILEDATA state for Intel AMX (Advanced Matrix + * Extensions), which doesn't work in 32-bit mode + * anyway, so on such machines we ought to just disable + * it in the first place and keep x86_fpu_save_size + * down: + * + * While Intel AMX instructions can be executed + * only in 64-bit mode, instructions of the XSAVE + * feature set can operate on TILECFG and TILEDATA + * in any mode. It is recommended that only + * 64-bit operating systems enable Intel AMX by + * setting XCR0[18:17]. + * + * --Intel 64 and IA-32 Architectures Software + * Developer's Manual, Volume 1: Basic + * Architecture, Order Number: 253665-087US, March + * 2025, Sec. 13.3 `Enabling the XSAVE feature set + * and XSAVE-enabled features', p. 13-6. + * https://cdrdv2.intel.com/v1/dl/getContent/671436 + * https://web.archive.org/web/20250404141850/https://cdrdv2-public.intel.com/851056/253665-087-sdm-vol-1.pdf + * https://web.archive.org/web/20250404141850if_/https://cdrdv2-public.intel.com/851056/253665-087-sdm-vol-1.pdf#page=324 + */ + panic("NetBSD/i386 does not support fpu save size %u", + x86_fpu_save_size); + } +#endif + #ifndef XENPV - union savefpu fpusave __aligned(64); + union savefpu fpusave_stack __aligned(64); + union savefpu *fpusave; u_long psl; - memset(&fpusave, 0, sizeof(fpusave)); + /* + * Allocate a temporary save space from the stack if it fits, + * or from the heap otherwise, so we can query its mxcsr mask. + */ + if (allocfpusave) { + /* + * Need 64-byte alignment for XSAVE instructions. + * kmem_* doesn't guarantee that and we don't have a + * handy posix_memalign in the kernel unless we hack it + * ourselves with vmem(9), so just ask for page + * alignment with uvm_km(9). 
+ */ + __CTASSERT(PAGE_SIZE >= 64); + va = uvm_km_alloc(kernel_map, x86_fpu_save_size, PAGE_SIZE, + UVM_KMF_WIRED|UVM_KMF_ZERO|UVM_KMF_WAITVA); + fpusave = (void *)va; + } else { + fpusave = &fpusave_stack; + memset(fpusave, 0, sizeof(*fpusave)); + } /* Disable interrupts, and enable FPU */ psl = x86_read_psl(); @@ -204,16 +305,25 @@ fpuinit_mxcsr_mask(void) clts(); /* Fill in the FPU area */ - fxsave(&fpusave); + fxsave(fpusave); /* Restore previous state */ stts(); x86_write_psl(psl); - if (fpusave.sv_xmm.fx_mxcsr_mask == 0) { + if (fpusave->sv_xmm.fx_mxcsr_mask == 0) { x86_fpu_mxcsr_mask = __INITIAL_MXCSR_MASK__; } else { - x86_fpu_mxcsr_mask = fpusave.sv_xmm.fx_mxcsr_mask; + x86_fpu_mxcsr_mask = fpusave->sv_xmm.fx_mxcsr_mask; + } + + /* + * Free the temporary save space. + */ + if (allocfpusave) { + uvm_km_free(kernel_map, va, x86_fpu_save_size, UVM_KMF_WIRED); + fpusave = NULL; + va = 0; } #else /* @@ -223,6 +333,32 @@ fpuinit_mxcsr_mask(void) */ x86_fpu_mxcsr_mask = __INITIAL_MXCSR_MASK__; #endif + + /* + * If necessary, allocate FPU save spaces for safe or zero FPU + * state, for fpu_kern_enter/leave. + */ + if (allocfpusave) { + __CTASSERT(PAGE_SIZE >= 64); + + va = uvm_km_alloc(kernel_map, x86_fpu_save_size, PAGE_SIZE, + UVM_KMF_WIRED|UVM_KMF_ZERO|UVM_KMF_WAITVA); + memcpy((void *)va, &safe_fpu_storage, + sizeof(safe_fpu_storage)); + uvm_km_protect(kernel_map, va, x86_fpu_save_size, + VM_PROT_READ); + safe_fpu = (void *)va; + + va = uvm_km_alloc(kernel_map, x86_fpu_save_size, PAGE_SIZE, + UVM_KMF_WIRED|UVM_KMF_ZERO|UVM_KMF_WAITVA); + /* + * No initialization -- just want zeroes! In fact we + * could share this with other all-zero pages. 
+ */ + uvm_km_protect(kernel_map, va, x86_fpu_save_size, + VM_PROT_READ); + zero_fpu = (void *)va; + } } static inline void @@ -305,7 +441,7 @@ void fpu_handle_deferred(void) { struct pcb *pcb = lwp_getpcb(curlwp); - fpu_area_restore(&pcb->pcb_savefpu, x86_xsave_features, + fpu_area_restore(pcb->pcb_savefpu, x86_xsave_features, !(curlwp->l_proc->p_flag & PK_32)); } @@ -321,7 +457,7 @@ fpu_switch(struct lwp *oldlwp, struct lw if (oldlwp->l_md.md_flags & MDL_FPU_IN_CPU) { KASSERT(!(oldlwp->l_flag & LW_SYSTEM)); pcb = lwp_getpcb(oldlwp); - fpu_area_save(&pcb->pcb_savefpu, x86_xsave_features, + fpu_area_save(pcb->pcb_savefpu, x86_xsave_features, !(oldlwp->l_proc->p_flag & PK_32)); oldlwp->l_md.md_flags &= ~MDL_FPU_IN_CPU; } @@ -338,14 +474,15 @@ fpu_lwp_fork(struct lwp *l1, struct lwp if (__predict_false(l2->l_flag & LW_SYSTEM)) { return; } + /* For init(8). */ if (__predict_false(l1->l_flag & LW_SYSTEM)) { - memset(&pcb2->pcb_savefpu, 0, x86_fpu_save_size); + memset(pcb2->pcb_savefpu, 0, x86_fpu_save_size); return; } fpu_save = fpu_lwp_area(l1); - memcpy(&pcb2->pcb_savefpu, fpu_save, x86_fpu_save_size); + memcpy(pcb2->pcb_savefpu, fpu_save, x86_fpu_save_size); l2->l_md.md_flags &= ~MDL_FPU_IN_CPU; } @@ -378,11 +515,6 @@ fpu_lwp_abandon(struct lwp *l) void fpu_kern_enter(void) { - static const union savefpu safe_fpu __aligned(64) = { - .sv_xmm = { - .fx_mxcsr = __SAFE_MXCSR__, - }, - }; struct lwp *l = curlwp; struct cpu_info *ci; int s; @@ -421,7 +553,7 @@ fpu_kern_enter(void) /* * Zero the FPU registers and install safe control words. 
*/ - fpu_area_restore(&safe_fpu, x86_xsave_features, /*is_64bit*/false); + fpu_area_restore(safe_fpu, x86_xsave_features, /*is_64bit*/false); } /* @@ -432,7 +564,6 @@ fpu_kern_enter(void) void fpu_kern_leave(void) { - static const union savefpu zero_fpu __aligned(64); struct cpu_info *ci = curcpu(); int s; @@ -451,7 +582,7 @@ fpu_kern_leave(void) * through Spectre-class attacks to userland, even if there are * no bugs in fpu state management. */ - fpu_area_restore(&zero_fpu, x86_xsave_features, /*is_64bit*/false); + fpu_area_restore(zero_fpu, x86_xsave_features, /*is_64bit*/false); /* * Set CR0_TS again so that the kernel can't accidentally use diff -r 1cb0546d18b6 sys/arch/x86/x86/vm_machdep.c --- a/sys/arch/x86/x86/vm_machdep.c Thu Apr 10 18:53:29 2025 +0000 +++ b/sys/arch/x86/x86/vm_machdep.c Fri Apr 11 22:00:30 2025 +0000 @@ -366,10 +366,42 @@ cpu_uarea_alloc(bool system) { vaddr_t base, va; paddr_t pa; + struct pcb *pcb; base = uvm_km_alloc(kernel_map, USPACE + PAGE_SIZE, 0, UVM_KMF_WIRED|UVM_KMF_WAITVA); + /* + * Prepare the FPU save area: + * + * 1. If this is a system thread, no save area. + * XXX Allocate/free one in kthread_fpu_enter/exit_md. + * + * 2. If this is a user thread, and the fpu save size is large + * enough, allocate an extra block of memory for it. + * + * 3. Otherwise, this is a user thread and the fpu save size + * fits inside the pcb page, so use that. + * + * XXX Note that this is currently amd64-only -- if you extend + * this FPU save space allocation to i386, you'll need to + * remove the panic in fpuinit_mxcsr_mask on + * x86_fpu_save_separate_p and make pcb_savefpu a pointer + * indirection in struct pcb. + */ + pcb = (void *)base; + if (system) { /* (1) */ + pcb->pcb_savefpu = NULL; + } else if (x86_fpu_save_separate_p()) { /* (2) */ + __CTASSERT(PAGE_SIZE >= 64); + /* No need to zero -- caller will initialize. 
*/ + va = uvm_km_alloc(kernel_map, x86_fpu_save_size, PAGE_SIZE, + UVM_KMF_WIRED|UVM_KMF_WAITVA); + pcb->pcb_savefpu = (void *)va; + } else { /* (3) */ + pcb->pcb_savefpu = &pcb->pcb_savefpusmall; + } + /* Page[1] = RedZone */ va = base + PAGE_SIZE; if (!pmap_extract(pmap_kernel(), va, &pa)) { @@ -394,8 +426,20 @@ cpu_uarea_alloc(bool system) bool cpu_uarea_free(void *addr) { + const struct pcb *const pcb = addr; vaddr_t base = (vaddr_t)addr; + /* + * If we allocated a separate FPU save area, free it. + */ + if (pcb->pcb_savefpu != NULL && + pcb->pcb_savefpu != &pcb->pcb_savefpusmall) { + KASSERTMSG(x86_fpu_save_separate_p(), "pcb=%p pcb_savefpu=%p", + pcb, pcb->pcb_savefpu); + uvm_km_free(kernel_map, (vaddr_t)pcb->pcb_savefpu, + x86_fpu_save_size, UVM_KMF_WIRED); + } + KASSERT(!pmap_extract(pmap_kernel(), base + PAGE_SIZE, NULL)); KASSERT(!pmap_extract(pmap_kernel(), base + USPACE, NULL)); uvm_km_free(kernel_map, base, USPACE + PAGE_SIZE, UVM_KMF_WIRED);