/*	$NetBSD$	*/

/*-
 * Copyright (c) 2018 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Taylor R. Campbell.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD$");

#include <sys/param.h>
#include <sys/types.h>
#include <sys/condvar.h>
#include <sys/cprng.h>
#include <sys/kmem.h>
#include <sys/localcount.h>
#include <sys/lwp.h>
#include <sys/mutex.h>
#include <sys/once.h>
#include <sys/queue.h>
#include <sys/systm.h>

#include <uvm/uvm_extern.h>

#include <compat/linux/common/linux_types.h>
#include <compat/linux/common/linux_emuldata.h>
#include <compat/linux/common/linux_futex.h>
#include <compat/linux/linux_syscallargs.h>

static inline uint64_t
rol64(uint64_t v, unsigned n)
{

	return (v << n) | (v >> (64 - n));
}

#define	SIPROUND(V0, V1, V2, V3) do {					\
	(V0) += (V1); (V1) = rol64((V1), 13);				\
	(V1) ^= (V0); (V0) = rol64((V0), 32);				\
	(V2) += (V3); (V3) = rol64((V3), 16);				\
	(V3) ^= (V2);							\
	(V0) += (V3); (V3) = rol64((V3), 21);				\
	(V3) ^= (V0);							\
	(V2) += (V1); (V1) = rol64((V1), 17);				\
	(V1) ^= (V2); (V2) = rol64((V2), 32);				\
} while (0)

#define	SIPCOMPRESS(I, CR, M, V0, V1, V2, V3) do {			\
	(V3) ^= (M);							\
	for ((I) = (CR); (I) --> 0;)					\
		SIPROUND(V0, V1, V2, V3);				\
	(V0) ^= (M);							\
} while (0)

static inline uint64_t
siphash(unsigned cr, unsigned fr, const uint8_t key[16], const void *buf,
    size_t len)
{
	const uint8_t *p = buf;
	uint64_t v0, v1, v2, v3;
	size_t n8;
	unsigned i;
	uint64_t m;

	/* `somepseudorandomlygeneratedbytes' */
	v0 = le64dec(key + 0) ^ UINT64_C(0x736f6d6570736575);
	v1 = le64dec(key + 8) ^ UINT64_C(0x646f72616e646f6d);
	v2 = le64dec(key + 0) ^ UINT64_C(0x6c7967656e657261);
	v3 = le64dec(key + 8) ^ UINT64_C(0x7465646279746573);

	/* Compress 8-byte words.  */
	for (n8 = len/8; n8 --> 0;) {
		m = le64dec(p);
		SIPCOMPRESS(i, cr, m, v0, v1, v2, v3);
		p += 8;
	}

	/* Pad the last word and compress.  */
	m = 0;
	for (i = 0; i < (len % 8); i++)
		m |= (uint64_t)p[i] << (i*8);
	m |= (uint64_t)(len & 0xff) << 56;
	SIPCOMPRESS(i, cr, m, v0, v1, v2, v3);

	/* Finalize.  */
	v2 ^= 0xff;
	for (i = fr; i --> 0;)
		SIPROUND(v0, v1, v2, v3);

	return v0 ^ v1 ^ v2 ^ v3;
}

static uint64_t
siphash_2_4(const uint8_t key[16], const void *buf, size_t len)
{

	return siphash(2, 4, key, buf, len);
}
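/*
 * Illustrative sketch (not compiled): one way to sanity-check the
 * SipHash-2-4 implementation above is to compare it against the
 * published test vector from the SipHash paper -- key bytes 00..0f,
 * 15-byte message 00..0e, expected output 0xa129ca6149be45e5.  The
 * constant and the idea of a boot-time self-test are assumptions for
 * illustration, not part of this file.
 */
#if 0
static void
siphash_selftest(void)
{
	uint8_t key[16], msg[15];
	unsigned i;

	for (i = 0; i < sizeof key; i++)
		key[i] = i;
	for (i = 0; i < sizeof msg; i++)
		msg[i] = i;

	/* Expected value taken from the SipHash paper's test vector.  */
	KASSERT(siphash_2_4(key, msg, sizeof msg) ==
	    UINT64_C(0xa129ca6149be45e5));
}
#endif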
/*
 * struct futex_queue
 *
 *	A queue of futex waiters.
 */
struct futex_queue {
	kmutex_t		fq_lock;
	TAILQ_HEAD(, futex_wait) fq_queue;
};

/*
 * struct futex_wait
 *
 *	State for a thread awaiting a futex wakeup.  A wakeup (or
 *	destruction of the futex) is indicated by setting fw_bitset to
 *	zero.
 */
struct futex_wait {
	kmutex_t		fw_lock;
	kcondvar_t		fw_cv;
	struct futex_queue	*fw_queue;
	TAILQ_ENTRY(futex_wait)	fw_entry;
	int			fw_bitset;
};

/*
 * futextab
 *
 *	Global hash table of futex queues.  Multiple futex addresses
 *	may share a common queue -- using multiple queues is merely an
 *	optimization to reduce the frequency of spurious wakeups.
 */
static struct {
	struct {
		struct futex_queue	q;
		char	pad[roundup(sizeof(struct futex_queue),
			    COHERENCY_UNIT) - sizeof(struct futex_queue)];
	}		*queue;
	size_t		hashmask;
	uint8_t		hashkey[16];
} futextab __read_mostly;

/* Each queue should be in its own cache line.  */
CTASSERT((sizeof(futextab.queue[0]) % COHERENCY_UNIT) == 0);

/*
 * futex_abort
 *
 *	Global state to delay linux_futex_fini until all aborting futex
 *	waits have completed.
 */
static struct {
	kmutex_t		lock;
	kcondvar_t		cv;
	struct localcount	count;
} futex_abort __cacheline_aligned;

/* XXX Arbitrary choice here.  */
#ifndef FUTEX_NQUEUES
#define	FUTEX_NQUEUES	MAXLWP
#endif

/*
 * futex_nqueues()
 *
 *	Compute the number of futex queues: FUTEX_NQUEUES rounded up
 *	to a power of two.
 */
static size_t
futex_nqueues(void)
{
	size_t nqueues = FUTEX_NQUEUES - 1;
	unsigned i;

	/* Smear the leading one bit down, then add one to round up.  */
	for (i = 1; i < CHAR_BIT*sizeof(nqueues); i *= 2)
		nqueues |= nqueues >> i;

	return nqueues + 1;
}

/*
 * linux_futex_init()
 *
 *	Initialize the futex subsystem.
 */
void
linux_futex_init(void)
{
	const size_t nqueues = futex_nqueues();
	struct futex_queue *fq;
	size_t i;

	CTASSERT(FUTEX_NQUEUES <= SIZE_MAX/(2*sizeof(futextab.queue[0])));
	KASSERT(nqueues/2 <= FUTEX_NQUEUES);
	KASSERT(powerof2(nqueues));

	/* Allocate an array of queues.  */
	futextab.queue = kmem_alloc(nqueues*sizeof(futextab.queue[0]),
	    KM_SLEEP);
	futextab.hashmask = nqueues - 1;

	/* Initialize each of the queues.  */
	for (i = 0; i < nqueues; i++) {
		fq = &futextab.queue[i].q;
		mutex_init(&fq->fq_lock, MUTEX_DEFAULT, IPL_NONE);
		TAILQ_INIT(&fq->fq_queue);
	}

	/* Initialize the abort counter.  */
	mutex_init(&futex_abort.lock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&futex_abort.cv, "futabort");
	localcount_init(&futex_abort.count);
}

/*
 * linux_futex_fini()
 *
 *	Finalize the futex subsystem.  Caller must guarantee no new use
 *	of futexes.
 */
void
linux_futex_fini(void)
{
	const size_t nqueues = futex_nqueues();
	struct futex_queue *fq;
	struct futex_wait *fw, *fw_next;
	size_t i;

	KASSERT(powerof2(nqueues));
	KASSERT(nqueues - 1 == futextab.hashmask);

	/* Wake up all of the waiters.  */
	for (i = 0; i < nqueues; i++) {
		fq = &futextab.queue[i].q;
		mutex_enter(&fq->fq_lock);
		TAILQ_FOREACH_SAFE(fw, &fq->fq_queue, fw_entry, fw_next) {
			mutex_enter(&fw->fw_lock);
			TAILQ_REMOVE(&fq->fq_queue, fw, fw_entry);
			fw->fw_bitset = 0;
			cv_broadcast(&fw->fw_cv);
			mutex_exit(&fw->fw_lock);
		}
		mutex_exit(&fq->fq_lock);
	}

	/* Wait for all aborted waits to finish.  */
	mutex_enter(&futex_abort.lock);
	localcount_drain(&futex_abort.count, &futex_abort.cv,
	    &futex_abort.lock);
	mutex_exit(&futex_abort.lock);

	/* Destroy each of the queues.  */
	for (i = 0; i < nqueues; i++) {
		fq = &futextab.queue[i].q;
		KASSERT(TAILQ_EMPTY(&fq->fq_queue));
		mutex_destroy(&fq->fq_lock);
	}

	/* Free the array of queues.  */
	kmem_free(futextab.queue, nqueues*sizeof(futextab.queue[0]));
	futextab.queue = NULL;
}
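/*
 * Illustrative sketch (not compiled): the queue count must be a power
 * of two so that a queue index can be derived from a hash with a
 * simple mask, index = hash & (nqueues - 1).  For example, with the
 * rounding above a hypothetical FUTEX_NQUEUES of 1000 would yield
 * 1024 queues and a hashmask of 1023; a value that is already a power
 * of two is left unchanged.
 */
#if 0
static size_t
futex_queue_index(uint64_t hash)
{

	/* Valid only because futex_nqueues() is a power of two.  */
	return hash & futextab.hashmask;
}
#endif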
/*
 * futex_load(uaddr, kaddr)
 *
 *	Perform a single atomic load to read *uaddr, and return the
 *	result in *kaddr.  Return 0 on success, EFAULT if uaddr is not
 *	mapped.
 *
 *	XXX This may confuse unmapped uaddr with *uaddr == -1 on LP32
 *	machines because fuword is stupid.
 */
static int
futex_load(int *uaddr, int *kaddr)
{

#ifdef _LP64
	long val;

	val = fuword(uaddr);
	if (val == -1)
		return EFAULT;
	*kaddr = val;
	return 0;
#else
	/* XXX Gakkkk...  */
	*kaddr = fuword(uaddr);
	return 0;
#endif
}

/*
 * futex_test(uaddr, expected)
 *
 *	True if *uaddr == expected.  False if *uaddr != expected, or if
 *	uaddr is not mapped.
 *
 *	XXX This may confuse unmapped uaddr with *uaddr == -1 on LP32
 *	machines because fuword is stupid.
 */
static bool
futex_test(int *uaddr, int expected)
{
	int val;
	int error;

	error = futex_load(uaddr, &val);
	if (error)
		return false;
	return val == expected;
}

/*
 * futex_hash_init
 *
 *	Choose a 128-bit siphash key.  Done on demand so that it can be
 *	deferred as late as possible and maximize the probability of
 *	getting reasonable entropy.
 */
static ONCE_DECL(futex_hash_init_once);

static int
futex_hash_init(void)
{

	cprng_strong(kern_cprng, futextab.hashkey, sizeof futextab.hashkey,
	    0);
	return 0;
}

/*
 * futex_hash(vm, va)
 *
 *	Hash a VM space and virtual address into the futex queue index
 *	for a private futex.
 */
static size_t
futex_hash(struct vmspace *vm, vaddr_t va)
{
	uint8_t buf[sizeof vm + sizeof va];
	uint64_t hash;
	int error;

	error = RUN_ONCE(&futex_hash_init_once, futex_hash_init);
	if (error)
		panic("futex_hash_init failed: %d", error);

	memcpy(buf, &vm, sizeof vm);
	memcpy(buf + sizeof vm, &va, sizeof va);
	hash = siphash_2_4(futextab.hashkey, buf, sizeof buf);

	return (hash & futextab.hashmask);
}

/*
 * futex_shared_hash(pa)
 *
 *	Hash a physical address into a futex queue index for a
 *	process-shared futex.
 */
static size_t
futex_shared_hash(paddr_t pa)
{
	uint64_t hash;
	int error;

	error = RUN_ONCE(&futex_hash_init_once, futex_hash_init);
	if (error)
		panic("futex_hash_init failed: %d", error);

	hash = siphash_2_4(futextab.hashkey, &pa, sizeof pa);

	return (hash & futextab.hashmask);
}

/*
 * futex_get(uaddr, shared, &fq)
 *
 *	Acquire the queue for the futex at the specified virtual
 *	address in the current process's VM space.  Return 0 and set fq
 *	to the queue on success; return an error code on failure.
 */
static int
futex_get(int *uaddr, bool shared, struct futex_queue **fqp)
{
	struct vmspace *vm = curproc->p_vmspace;
	vaddr_t va = (vaddr_t)uaddr;
	int error;

	/* Reject misaligned virtual addresses.  */
	if (va & 3)
		return EINVAL;

	/* See whether we need to go to extra trouble for shared.  */
	if (shared) {
		/* Compute the virtual page address and the offset in it.  */
		vaddr_t vpg = trunc_page(va);
		unsigned pgoff = va - vpg;
		paddr_t ppg;

		/* Lock the virtual page.  */
		error = uvm_vslock(vm, (void *)vpg, PAGE_SIZE,
		    VM_PROT_READ|VM_PROT_WRITE);
		if (error)
			return error;

		/* Get the physical page address.  */
		if (!pmap_extract(vm_map_pmap(&vm->vm_map), vpg, &ppg)) {
			uvm_vsunlock(vm, (void *)vpg, PAGE_SIZE);
			return EFAULT;
		}

		/* Use the queue for the hash of the pa.  */
		*fqp = &futextab.queue[futex_shared_hash(ppg + pgoff)].q;
	} else {
		/* Use the queue for the hash of the VM space and the va.  */
		*fqp = &futextab.queue[futex_hash(vm, va)].q;
	}

	return 0;
}

/*
 * futex_put(uaddr, shared, fq)
 *
 *	Release the futex queue fq, which must have been obtained for a
 *	futex at the specified virtual address in the current process's
 *	VM space.
 */
static void
futex_put(int *uaddr, bool shared, struct futex_queue *fq)
{
	struct vmspace *vm = curproc->p_vmspace;
	vaddr_t va = (vaddr_t)uaddr;

	/* Should have rejected misaligned virtual addresses already.  */
	KASSERT((va & 3) == 0);

	if (shared) {
		/* We don't have the pa, so we can't assert about the
		 * queue.  */
		uvm_vsunlock(vm, (void *)trunc_page(va), PAGE_SIZE);
	} else {
		/* Just assert we're using the same queue.  */
		KASSERT(fq == &futextab.queue[futex_hash(vm, va)].q);
	}
}
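/*
 * Illustrative sketch (not compiled): the difference between the
 * shared and private paths above matters when two processes map the
 * same memory at different virtual addresses.  A private futex is
 * keyed by (vmspace, va), so it only ever matches waiters in the same
 * process; a shared futex is keyed by physical address, so waiters in
 * different processes meet on the same queue.  The userland fragment
 * below is an assumed Linux-side usage pattern, shown only to
 * motivate the two hash functions.
 */
#if 0
	/* Process A and process B both map the same file.  */
	int *f = mmap(NULL, sizeof(*f), PROT_READ|PROT_WRITE, MAP_SHARED,
	    fd, 0);

	/* A waits on the shared futex word (no FUTEX_PRIVATE_FLAG)...  */
	syscall(SYS_futex, f, FUTEX_WAIT, 0, NULL, NULL, 0);

	/* ...and B wakes it, even though B's mapping address differs.  */
	syscall(SYS_futex, f, FUTEX_WAKE, 1, NULL, NULL, 0);
#endif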
/*
 * futex_wait_init(&fw, bitset)
 *
 *	Initialize a record for a thread to wait on a futex matching
 *	the specified bit set.  Should be passed to futex_wait_enqueue
 *	before futex_wait, and should be passed to futex_wait_fini when
 *	done.
 */
static void
futex_wait_init(struct futex_wait *fw, int bitset)
{

	mutex_init(&fw->fw_lock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&fw->fw_cv, "futex");
	fw->fw_queue = NULL;
	fw->fw_bitset = bitset;
}

/*
 * futex_wait_fini(fw)
 *
 *	Finalize a record for a futex waiter.  Must not be on any futex
 *	queue.
 */
static void
futex_wait_fini(struct futex_wait *fw)
{

	cv_destroy(&fw->fw_cv);
	mutex_destroy(&fw->fw_lock);
}

/*
 * futex_wait_enqueue(fw, fq)
 *
 *	Put the waiter fw on the futex queue fq.  Must be done before
 *	futex_wait.  Caller must hold fw's lock and fq's lock, and fw
 *	must not be on any existing futex queue.
 */
static void
futex_wait_enqueue(struct futex_wait *fw, struct futex_queue *fq)
{

	KASSERT(mutex_owned(&fq->fq_lock));
	KASSERT(mutex_owned(&fw->fw_lock));
	KASSERT(fw->fw_queue == NULL);

	fw->fw_queue = fq;
	TAILQ_INSERT_TAIL(&fq->fq_queue, fw, fw_entry);
}

/*
 * futex_wait_dequeue(fw, fq)
 *
 *	Remove the waiter fw from the futex queue fq.  Precludes
 *	subsequent futex_wait until a futex_wait_enqueue.  Caller must
 *	hold fw's lock and fq's lock, and fw must be on fq.
 */
static void
futex_wait_dequeue(struct futex_wait *fw, struct futex_queue *fq)
{

	KASSERT(mutex_owned(&fq->fq_lock));
	KASSERT(mutex_owned(&fw->fw_lock));
	KASSERT(fw->fw_queue == fq);

	TAILQ_REMOVE(&fq->fq_queue, fw, fw_entry);
	fw->fw_queue = NULL;
}

/*
 * futex_wait_abort(fw)
 *
 *	Caller is no longer waiting for fw.  Remove it from any queue
 *	if it was on one.
 */
static void
futex_wait_abort(struct futex_wait *fw)
{
	struct futex_queue *fq;

	localcount_acquire(&futex_abort.count);
	for (;;) {
		/* Get the queue under the wait lock.  */
		mutex_enter(&fw->fw_lock);
		fq = fw->fw_queue;

		/*
		 * If there's a queue, and we can get the locks out of
		 * order, we're done.
		 */
		if (fq != NULL && mutex_tryenter(&fq->fq_lock)) {
remove_locked:
			futex_wait_dequeue(fw, fq);
			mutex_exit(&fw->fw_lock);
			mutex_exit(&fq->fq_lock);
			break;
		}
		mutex_exit(&fw->fw_lock);

		/* If we're not on a queue any more, we're done.  */
		if (fq == NULL)
			break;

		/*
		 * We have a candidate queue and we can acquire the
		 * locks in order now: queue lock, then wait lock.
		 * However, we may have been moved to another queue in
		 * the interim.  If we're still on the same queue,
		 * we're done; if not, try again.
		 */
		mutex_enter(&fq->fq_lock);
		mutex_enter(&fw->fw_lock);
		if (fw->fw_queue == fq)
			goto remove_locked;
		mutex_exit(&fw->fw_lock);
		mutex_exit(&fq->fq_lock);
	}
	localcount_release(&futex_abort.count, &futex_abort.cv,
	    &futex_abort.lock);
}
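/*
 * Illustrative sketch (not compiled): the pieces above combine into
 * the waiter life cycle that linux_do_futex_wait implements later in
 * this file -- initialize, enqueue under the locks while the futex
 * word still has the expected value, sleep, and abort on error so the
 * waiter is off every queue before it is finalized.  The fragment
 * below is a condensed illustration, not the authoritative sequence.
 */
#if 0
	struct futex_wait wait, *fw = &wait;

	futex_wait_init(fw, bitset);

	mutex_enter(&fq->fq_lock);
	/* ...re-test the futex word under the queue lock here...  */
	mutex_enter(&fw->fw_lock);
	futex_wait_enqueue(fw, fq);
	mutex_exit(&fw->fw_lock);
	mutex_exit(&fq->fq_lock);

	error = futex_wait(fw, deadline, clkid);
	if (error)
		futex_wait_abort(fw);	/* make sure we are off every queue */

	futex_wait_fini(fw);
#endif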
/*
 * futex_wait(fw, deadline, clkid)
 *
 *	fw must be a waiter on a futex queue.  Wait until deadline on
 *	the clock clkid, or forever if deadline is NULL, for a futex
 *	wakeup.  Return 0 on explicit wakeup or destruction of futex,
 *	ETIMEDOUT on timeout, EINTR/ERESTART on signal.
 */
static int
futex_wait(struct futex_wait *fw, const struct timespec *deadline,
    clockid_t clkid)
{
	int error = 0;

	/* Test and wait under the wait lock.  */
	mutex_enter(&fw->fw_lock);
	while (fw->fw_bitset) {
		/* Not done yet.  Wait.  */
		if (deadline) {
			struct timespec ts;

			/* Check our watch.  */
			error = clock_gettime1(clkid, &ts);
			if (error)
				break;

			/* If we're past the deadline, ETIMEDOUT.  */
			if (timespeccmp(deadline, &ts, <=)) {
				error = ETIMEDOUT;
				break;
			}

			/* Count how much time is left.  */
			timespecsub(deadline, &ts, &ts);

			/* Wait for that much time, allowing signals.  */
			error = cv_timedwait_sig(&fw->fw_cv, &fw->fw_lock,
			    tstohz(&ts));
		} else {
			/* Wait indefinitely, allowing signals.  */
			error = cv_wait_sig(&fw->fw_cv, &fw->fw_lock);
		}

		/* If the wait failed, stop here.  */
		if (error) {
			/* Convert EWOULDBLOCK to ETIMEDOUT.  */
			if (error == EWOULDBLOCK)
				error = ETIMEDOUT;
			break;
		}
	}
	mutex_exit(&fw->fw_lock);

	return error;
}

/*
 * futex_wake(fq, nwake, fq2, nrequeue, bitset)
 *
 *	Wake up to nwake waiters on fq matching bitset; then, if fq2 is
 *	provided, move up to nrequeue remaining waiters on fq matching
 *	bitset to fq2.  Return the number of waiters actually woken.
 *	Caller must hold the lock of fq, and of fq2 if provided.
 */
static unsigned
futex_wake(struct futex_queue *fq, unsigned nwake, struct futex_queue *fq2,
    unsigned nrequeue, int bitset)
{
	struct futex_wait *fw, *fw_next;
	unsigned nwoken = 0;

	/* Caller must hold the lock of fq, and of fq2 if provided.  */
	KASSERT(mutex_owned(&fq->fq_lock));
	KASSERT(fq2 == NULL || mutex_owned(&fq2->fq_lock));

	/* Wake up to nwake waiters, and count the number woken.  */
	TAILQ_FOREACH_SAFE(fw, &fq->fq_queue, fw_entry, fw_next) {
		if ((fw->fw_bitset & bitset) == 0)
			continue;
		if (nwake --> 0) {
			mutex_enter(&fw->fw_lock);
			futex_wait_dequeue(fw, fq);
			fw->fw_bitset = 0;
			cv_broadcast(&fw->fw_cv);
			mutex_exit(&fw->fw_lock);
			nwoken++;
		} else {
			break;
		}
	}

	if (fq2 != NULL && fq2 != fq) {
		/* Move up to nrequeue remaining waiters from fq to fq2.  */
		TAILQ_FOREACH_SAFE(fw, &fq->fq_queue, fw_entry, fw_next) {
			if ((fw->fw_bitset & bitset) == 0)
				continue;
			if (nrequeue --> 0) {
				mutex_enter(&fw->fw_lock);
				futex_wait_dequeue(fw, fq);
				futex_wait_enqueue(fw, fq2);
				mutex_exit(&fw->fw_lock);
			} else {
				break;
			}
		}
	} else {
		KASSERT(fq2 == fq || nrequeue == 0);
	}

	/* Return the number of waiters woken.  */
	return nwoken;
}

/*
 * futex_lock2(fq, fq2)
 *
 *	Acquire the locks of fq and fq2, which may be the same.  If
 *	they are distinct, an arbitrary total order is chosen on the
 *	locks.
 *
 *	Callers should only ever acquire multiple queue locks
 *	simultaneously using futex_lock2.
 */
static void
futex_lock2(struct futex_queue *fq, struct futex_queue *fq2)
{

	if (fq == fq2) {
		mutex_enter(&fq->fq_lock);
		return;
	}

	/*
	 * Lock queues with lower virtual addresses in kernel virtual
	 * address space first.
	 *
	 * The only struct futex_queue objects we use are allocated in
	 * a single array in linux_futex_init, and as such are
	 * guaranteed to be comparable without technically undefined
	 * behaviour.
	 */
	if (fq < fq2) {
		mutex_enter(&fq->fq_lock);
		mutex_enter(&fq2->fq_lock);
	} else {
		mutex_enter(&fq2->fq_lock);
		mutex_enter(&fq->fq_lock);
	}
}

/*
 * futex_unlock2(fq, fq2)
 *
 *	Release the locks of both fq and fq2, which may be the same.
 */
static void
futex_unlock2(struct futex_queue *fq, struct futex_queue *fq2)
{

	if (fq == fq2) {
		mutex_exit(&fq->fq_lock);
		return;
	}

	if (fq < fq2) {
		mutex_exit(&fq2->fq_lock);
		mutex_exit(&fq->fq_lock);
	} else {
		mutex_exit(&fq->fq_lock);
		mutex_exit(&fq2->fq_lock);
	}
}
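/*
 * Illustrative sketch (not compiled): the reason FUTEX_WAIT re-tests
 * the futex word under the queue lock is the userland protocol it
 * supports.  A minimal (simplified, two-state) Linux-side mutex looks
 * roughly like the fragment below; the atomic builtins and the exact
 * protocol are assumptions for illustration, not derived from this
 * file.
 */
#if 0
	int expected;

	/* lock */
	for (;;) {
		expected = 0;
		if (__atomic_compare_exchange_n(&f, &expected, 1, false,
		    __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
			break;
		/*
		 * Sleep only if the word is still 1 when the kernel
		 * re-checks it under the queue lock; otherwise retry.
		 */
		syscall(SYS_futex, &f, FUTEX_WAIT, 1, NULL, NULL, 0);
	}

	/* unlock */
	__atomic_store_n(&f, 0, __ATOMIC_RELEASE);
	syscall(SYS_futex, &f, FUTEX_WAKE, 1, NULL, NULL, 0);
#endif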
/*
 * linux_do_futex_wait(uaddr, val, val3, timeout, clkid, clkflags, retval)
 *
 *	Implement Linux futex(FUTEX_WAIT).
 */
static int
linux_do_futex_wait(bool shared, int *uaddr, int val, int val3,
    const struct timespec *timeout, clockid_t clkid, int clkflags,
    register_t *retval)
{
	struct futex_queue *fq;
	struct futex_wait wait, *fw = &wait;
	struct timespec ts;
	const struct timespec *deadline;
	int error;

	/* Optimistically test before anything else.  */
	if (!futex_test(uaddr, val))
		return EAGAIN;

	/* Determine a deadline on the specified clock.  */
	if (timeout == NULL || (clkflags & TIMER_ABSTIME) == TIMER_ABSTIME) {
		deadline = timeout;
	} else {
		error = clock_gettime1(clkid, &ts);
		if (error)
			return error;
		timespecadd(&ts, timeout, &ts);
		deadline = &ts;
	}

	/* Get the futex.  */
	error = futex_get(uaddr, shared, &fq);
	if (error)
		return error;

	/* Get ready to wait.  */
	futex_wait_init(fw, val3);

	/*
	 * Under the queue lock, check the value again: if it has
	 * already changed, EAGAIN; otherwise enqueue the waiter.
	 * Since FUTEX_WAKE will use the same lock and be done after
	 * modifying the value, the order in which we check and enqueue
	 * is immaterial.
	 */
	mutex_enter(&fq->fq_lock);
	if (!futex_test(uaddr, val)) {
		mutex_exit(&fq->fq_lock);
		error = EAGAIN;
		goto out;
	}
	mutex_enter(&fw->fw_lock);
	futex_wait_enqueue(fw, fq);
	mutex_exit(&fw->fw_lock);
	mutex_exit(&fq->fq_lock);

	/* We are now done with the queue.  We only need the waiter.  */
	futex_put(uaddr, shared, fq);
	fq = NULL;

	/* Wait.  */
	error = futex_wait(fw, deadline, clkid);
	if (error) {
		futex_wait_abort(fw);
		goto out;
	}

	/* Return 0 on success, error on failure.  */
	*retval = 0;

out:	futex_wait_fini(fw);
	if (fq != NULL)
		futex_put(uaddr, shared, fq);
	return error;
}

/*
 * linux_do_futex_wake(uaddr, val, val3, retval)
 *
 *	Implement Linux futex(FUTEX_WAKE) and futex(FUTEX_WAKE_BITSET).
 */
static int
linux_do_futex_wake(bool shared, int *uaddr, int val, int val3,
    register_t *retval)
{
	struct futex_queue *fq;
	unsigned nwoken;
	int error;

	/* Reject negative number of wakeups.  */
	if (val < 0)
		return EINVAL;

	/* Get the futex queue.  */
	error = futex_get(uaddr, shared, &fq);
	if (error)
		return error;

	/* Under fq's lock, wake the waiters and remember how many woke.  */
	mutex_enter(&fq->fq_lock);
	nwoken = futex_wake(fq, val, NULL, 0, val3);
	mutex_exit(&fq->fq_lock);

	/* Release the futex queue.  */
	futex_put(uaddr, shared, fq);

	/* Return the number of waiters woken.  */
	*retval = nwoken;

	/* Success!  */
	return 0;
}

/*
 * linux_do_futex_requeue(uaddr, val, uaddr2, val2, retval)
 *
 *	Implement Linux futex(FUTEX_REQUEUE).
 */
static int
linux_do_futex_requeue(bool shared, int *uaddr, int val, int *uaddr2,
    int val2, register_t *retval)
{
	struct futex_queue *fq, *fq2;
	unsigned nwoken;
	int error;

	/* Reject negative number of wakeups.  */
	if (val < 0)
		return EINVAL;

	/* Get the first futex.  */
	error = futex_get(uaddr, shared, &fq);
	if (error)
		goto out0;

	/* Get the second futex.  */
	error = futex_get(uaddr2, shared, &fq2);
	if (error)
		goto out1;

	/*
	 * Under the futexes' queue locks, wake the waiters and return
	 * the number woken.
	 */
	futex_lock2(fq, fq2);
	nwoken = futex_wake(fq, val, fq2, val2, FUTEX_BITSET_MATCH_ANY);
	futex_unlock2(fq, fq2);

	/* Return the number of waiters woken.  */
	*retval = nwoken;

	/* Release the futexes and we're done.  */
	futex_put(uaddr2, shared, fq2);
out1:	futex_put(uaddr, shared, fq);
out0:	return error;
}
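/*
 * Illustrative sketch (not compiled): FUTEX_REQUEUE (and the
 * FUTEX_CMP_REQUEUE variant below) exists to support broadcast
 * without a thundering herd: wake one waiter on a condition-variable
 * futex and move the rest onto the mutex futex, so they wake up one
 * at a time as the mutex is released.  The call below is an assumed
 * Linux-side usage pattern, not code from this file; note that val2
 * rides in the timeout argument slot.
 */
#if 0
	/* pthread_cond_broadcast-style: wake 1, requeue everyone else.  */
	syscall(SYS_futex, &cond_futex, FUTEX_REQUEUE, 1,
	    (void *)(intptr_t)INT_MAX, &mutex_futex, 0);
#endif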
/*
 * linux_do_futex_cmp_requeue(uaddr, val, uaddr2, val2, val3, retval)
 *
 *	Implement Linux futex(FUTEX_CMP_REQUEUE).
 */
static int
linux_do_futex_cmp_requeue(bool shared, int *uaddr, int val, int *uaddr2,
    int val2, int val3, register_t *retval)
{
	struct futex_queue *fq, *fq2;
	unsigned nwoken;
	int error;

	/* Reject negative number of wakeups.  */
	if (val < 0)
		return EINVAL;

	/* Get the first futex.  */
	error = futex_get(uaddr, shared, &fq);
	if (error)
		goto out0;

	/* Get the second futex.  */
	error = futex_get(uaddr2, shared, &fq2);
	if (error)
		goto out1;

	/*
	 * Under the futexes' queue locks, check the value; if
	 * unchanged from val3, wake the waiters.
	 */
	futex_lock2(fq, fq2);
	if (!futex_test(uaddr, val3)) {
		error = EAGAIN;
		nwoken = 0;
	} else {
		error = 0;
		nwoken = futex_wake(fq, val, fq2, val2,
		    FUTEX_BITSET_MATCH_ANY);
	}
	futex_unlock2(fq, fq2);

	/* Return the number of waiters woken.  */
	*retval = nwoken;

	/* Release the futexes and we're done.  */
	futex_put(uaddr2, shared, fq2);
out1:	futex_put(uaddr, shared, fq);
out0:	return error;
}

#define	FUTEX_OP_OP	__BITS(31,28)
#define	FUTEX_OP_CMP	__BITS(27,24)
#define	FUTEX_OP_OPARG	__BITS(23,12)
#define	FUTEX_OP_CMPARG	__BITS(11,0)

/*
 * futex_validate_op_cmp(val3)
 *
 *	Validate an op/cmp argument for FUTEX_WAKE_OP.
 */
static int
futex_validate_op_cmp(int val3)
{
	int op = __SHIFTOUT(val3, FUTEX_OP_OP);
	int oparg = __SHIFTOUT(val3, FUTEX_OP_OPARG);
	int cmp = __SHIFTOUT(val3, FUTEX_OP_CMP);
	int cmparg __unused = __SHIFTOUT(val3, FUTEX_OP_CMPARG);

	/* If a shift is specified, it must not shift beyond 32 bits.  */
	if (op & FUTEX_OP_OPARG_SHIFT) {
		if (oparg < 0)
			return EINVAL;
		if (oparg >= 32)
			return EINVAL;
		op &= ~FUTEX_OP_OPARG_SHIFT;
	}

	/* The op must be SET, ADD, OR, ANDN, or XOR.  */
	switch (op) {
	case FUTEX_OP_SET:
	case FUTEX_OP_ADD:
	case FUTEX_OP_OR:
	case FUTEX_OP_ANDN:
	case FUTEX_OP_XOR:
		break;
	default:
		return EINVAL;
	}

	/* The cmp must be EQ, NE, LT, LE, GT, or GE.  */
	switch (cmp) {
	case FUTEX_OP_CMP_EQ:
	case FUTEX_OP_CMP_NE:
	case FUTEX_OP_CMP_LT:
	case FUTEX_OP_CMP_LE:
	case FUTEX_OP_CMP_GT:
	case FUTEX_OP_CMP_GE:
		break;
	default:
		return EINVAL;
	}

	/* Success!  */
	return 0;
}

/*
 * futex_compute_op(oldval, val3)
 *
 *	Apply a FUTEX_WAKE_OP operation to oldval.
 */
static int
futex_compute_op(int oldval, int val3)
{
	int op = __SHIFTOUT(val3, FUTEX_OP_OP);
	int oparg = __SHIFTOUT(val3, FUTEX_OP_OPARG);

	/* Apply the shift, if present, to oparg.  */
	if (op & FUTEX_OP_OPARG_SHIFT) {
		KASSERT(oparg >= 0);
		KASSERT(oparg < 32);
		oparg = 1u << oparg;
		op &= ~FUTEX_OP_OPARG_SHIFT;
	}

	/* Apply the op.  */
	switch (op) {
	case FUTEX_OP_SET:
		return oparg;
	case FUTEX_OP_ADD:
		/*
		 * Avoid signed arithmetic overflow by doing
		 * arithmetic unsigned and converting back to signed
		 * at the end.
		 */
		return (int)((unsigned)oldval + (unsigned)oparg);
	case FUTEX_OP_OR:
		return oldval | oparg;
	case FUTEX_OP_ANDN:
		return oldval & ~oparg;
	case FUTEX_OP_XOR:
		return oldval ^ oparg;
	default:
		panic("invalid futex op");
	}
}

/*
 * futex_compute_cmp(oldval, val3)
 *
 *	Apply a FUTEX_WAKE_OP comparison to oldval.
 */
static bool
futex_compute_cmp(int oldval, int val3)
{
	int cmp = __SHIFTOUT(val3, FUTEX_OP_CMP);
	int cmparg = __SHIFTOUT(val3, FUTEX_OP_CMPARG);

	switch (cmp) {
	case FUTEX_OP_CMP_EQ:
		return (oldval == cmparg);
	case FUTEX_OP_CMP_NE:
		return (oldval != cmparg);
	case FUTEX_OP_CMP_LT:
		return (oldval < cmparg);
	case FUTEX_OP_CMP_LE:
		return (oldval <= cmparg);
	case FUTEX_OP_CMP_GT:
		return (oldval > cmparg);
	case FUTEX_OP_CMP_GE:
		return (oldval >= cmparg);
	default:
		panic("invalid futex cmp operation");
	}
}
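/*
 * Illustrative sketch (not compiled): a worked example of the val3
 * encoding decoded above, assuming the Linux numbering in which
 * FUTEX_OP_ADD is 1 and FUTEX_OP_CMP_GT is 4.  Userland builds val3
 * with glibc's FUTEX_OP() macro; the equivalent expression is spelled
 * out here.
 */
#if 0
	/*
	 * "*uaddr2 += 1; wake uaddr2 waiters if the old value was > 0":
	 *
	 *	op     = FUTEX_OP_ADD (1)	-> bits 31..28
	 *	cmp    = FUTEX_OP_CMP_GT (4)	-> bits 27..24
	 *	oparg  = 1			-> bits 23..12
	 *	cmparg = 0			-> bits 11..0
	 */
	int val3 = (1 << 28) | (4 << 24) | (1 << 12) | 0;  /* 0x14001000 */

	KASSERT(futex_compute_op(5, val3) == 6);	/* 5 + 1 */
	KASSERT(futex_compute_cmp(5, val3) == true);	/* 5 > 0 */
#endif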
/*
 * linux_do_futex_wake_op(uaddr, val, uaddr2, val2, val3, retval)
 *
 *	Implement Linux futex(FUTEX_WAKE_OP).
 */
static int
linux_do_futex_wake_op(bool shared, int *uaddr, int val, int *uaddr2,
    int val2, int val3, register_t *retval)
{
	struct futex_queue *fq, *fq2;
	int oldval, newval, actual;
	unsigned nwoken;
	int error;

	/* Reject negative number of wakeups.  */
	if (val < 0)
		return EINVAL;

	/* Reject invalid operations before we start doing things.  */
	error = futex_validate_op_cmp(val3);
	if (error)
		return error;

	/* Get the first futex.  */
	error = futex_get(uaddr, shared, &fq);
	if (error)
		goto out0;

	/* Get the second futex.  */
	error = futex_get(uaddr2, shared, &fq2);
	if (error)
		goto out1;

	/*
	 * Under the queue locks:
	 *
	 * 1. Read/modify/write: *uaddr2 op= oparg.
	 * 2. Unconditionally wake uaddr.
	 * 3. If the old value of *uaddr2 satisfies the comparison in
	 *    val3, wake up to val2 waiters on uaddr2.
	 */
	futex_lock2(fq, fq2);
	do {
		error = futex_load(uaddr2, &oldval);
		if (error) {
			futex_unlock2(fq, fq2);
			goto out2;
		}
		newval = futex_compute_op(oldval, val3);
		error = ucas_int(uaddr2, oldval, newval, &actual);
		if (error) {
			futex_unlock2(fq, fq2);
			goto out2;
		}
	} while (actual != oldval);
	nwoken = futex_wake(fq, val, NULL, 0, FUTEX_BITSET_MATCH_ANY);
	if (futex_compute_cmp(oldval, val3))
		nwoken += futex_wake(fq2, val2, NULL, 0,
		    FUTEX_BITSET_MATCH_ANY);
	futex_unlock2(fq, fq2);

	/* Return the number of waiters woken.  */
	*retval = nwoken;
	error = 0;

	/* Release the futexes and we're done.  */
out2:	futex_put(uaddr2, shared, fq2);
out1:	futex_put(uaddr, shared, fq);
out0:	return error;
}

/*
 * linux_do_futex(uaddr, op, val, timeout, uaddr2, val2, val3)
 *
 *	Implement the Linux futex system call with all the parameters
 *	parsed out.
 */
int
linux_do_futex(int *uaddr, int op, int val, const struct timespec *timeout,
    int *uaddr2, int val2, int val3, register_t *retval)
{
	bool shared;
	clockid_t clkid;

	if (op & LINUX_FUTEX_PRIVATE_FLAG)
		shared = false;
	else
		shared = true;

	if (op & LINUX_FUTEX_CLOCK_REALTIME)
		clkid = CLOCK_REALTIME;
	else
		clkid = CLOCK_MONOTONIC;

	switch (op & LINUX_FUTEX_CMD_MASK) {
	case LINUX_FUTEX_WAIT:
		return linux_do_futex_wait(shared, uaddr, val,
		    FUTEX_BITSET_MATCH_ANY, timeout, clkid, TIMER_RELTIME,
		    retval);
	case LINUX_FUTEX_WAKE:
		return linux_do_futex_wake(shared, uaddr, val,
		    FUTEX_BITSET_MATCH_ANY, retval);
	case LINUX_FUTEX_FD:
		return ENOSYS;
	case LINUX_FUTEX_REQUEUE:
		return linux_do_futex_requeue(shared, uaddr, val, uaddr2,
		    val2, retval);
	case LINUX_FUTEX_CMP_REQUEUE:
		return linux_do_futex_cmp_requeue(shared, uaddr, val, uaddr2,
		    val2, val3, retval);
	case LINUX_FUTEX_WAIT_BITSET:
		return linux_do_futex_wait(shared, uaddr, val, val3, timeout,
		    clkid, TIMER_ABSTIME, retval);
	case LINUX_FUTEX_WAKE_BITSET:
		return linux_do_futex_wake(shared, uaddr, val, val3, retval);
	case LINUX_FUTEX_WAKE_OP:
		return linux_do_futex_wake_op(shared, uaddr, val, uaddr2,
		    val2, val3, retval);
	default:
		return ENOSYS;
	}
}
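/*
 * Illustrative sketch (not compiled): note the timeout conventions
 * dispatched above -- FUTEX_WAIT takes a relative timeout, while
 * FUTEX_WAIT_BITSET takes an absolute deadline and may select
 * CLOCK_REALTIME with LINUX_FUTEX_CLOCK_REALTIME.  An assumed
 * Linux-side absolute-deadline wait looks like this:
 */
#if 0
	struct timespec abstime;

	clock_gettime(CLOCK_MONOTONIC, &abstime);
	abstime.tv_sec += 5;	/* wait until now + 5 seconds */

	syscall(SYS_futex, uaddr, FUTEX_WAIT_BITSET, expected, &abstime,
	    NULL, FUTEX_BITSET_MATCH_ANY);
#endif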
/*
 * linux_sys_futex(l, uap, retval)
 *
 *	Linux futex system call: generic futex operations.
 */
int
linux_sys_futex(struct lwp *l, const struct linux_sys_futex_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int *) uaddr;
		syscallarg(int) op;
		syscallarg(int) val;
		syscallarg(const struct linux_timespec *) timeout;
		syscallarg(int *) uaddr2;
		syscallarg(int) val3;
	} */
	struct linux_timespec lts;
	struct timespec ts, *tsp;
	int op = SCARG(uap, op);
	int val2;
	int error;

	/*
	 * Discriminate between whether
	 *
	 * (a) the `timeout' argument is actually a pointer to a Linux
	 *     struct timespec for a futex wait operation, or whether
	 *
	 * (b) it is actually an int shoehorned into a pointer for any
	 *     other futex operation.
	 */
	switch (op & LINUX_FUTEX_CMD_MASK) {
	case LINUX_FUTEX_WAIT:
	case LINUX_FUTEX_WAIT_BITSET:
		if (SCARG(uap, timeout) == NULL) {
			/* A null timeout means wait forever.  */
			tsp = NULL;
		} else {
			error = copyin(SCARG(uap, timeout), &lts, sizeof lts);
			if (error)
				return error;
			linux_to_native_timespec(&ts, &lts);
			tsp = &ts;
		}
		val2 = -1;
		break;
	default:
		tsp = NULL;
		val2 = (int)(intptr_t)SCARG(uap, timeout);
		break;
	}

	return linux_do_futex(SCARG(uap, uaddr), op, SCARG(uap, val), tsp,
	    SCARG(uap, uaddr2), val2, SCARG(uap, val3), retval);
}

/*
 * linux_sys_set_robust_list(l, uap, retval)
 *
 *	Linux set_robust_list system call for robust futexes.
 */
int
linux_sys_set_robust_list(struct lwp *l,
    const struct linux_sys_set_robust_list_args *uap, register_t *retval)
{
	/* {
		syscallarg(struct linux_robust_list_head *) head;
		syscallarg(size_t) len;
	} */
	struct linux_emuldata *led = l->l_emuldata;
	struct linux_robust_list_head *head = SCARG(uap, head);

	if (SCARG(uap, len) != sizeof(struct linux_robust_list_head))
		return EINVAL;
	if ((uintptr_t)head % sizeof(head))
		return EINVAL;

	led->led_robust_head = head;
	*retval = 0;
	return 0;
}

/*
 * linux_sys_get_robust_list(l, uap, retval)
 *
 *	Linux get_robust_list system call for robust futexes.
 */
int
linux_sys_get_robust_list(struct lwp *l,
    const struct linux_sys_get_robust_list_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) pid;
		syscallarg(struct linux_robust_list_head **) head;
		syscallarg(size_t *) len;
	} */
	struct linux_emuldata *led = l->l_emuldata;
	struct proc *p = curproc;
	struct linux_robust_list_head *head;
	const size_t len = sizeof(*head);
	int error;

	KASSERT(p == l->l_proc);

	/* Find the other lwp, if requested; otherwise use our robust head.  */
	if (SCARG(uap, pid)) {
		mutex_enter(p->p_lock);
		l = lwp_find(p, SCARG(uap, pid));
		if (l == NULL) {
			mutex_exit(p->p_lock);
			return ESRCH;
		}
		led = l->l_emuldata;
		head = led->led_robust_head;
		mutex_exit(p->p_lock);
	} else {
		head = led->led_robust_head;
	}

	/* Copy out the head pointer and the head structure length.  */
	error = copyout(&head, SCARG(uap, head), sizeof(head));
	if (error)
		return error;
	error = copyout(&len, SCARG(uap, len), sizeof(len));
	if (error)
		return error;

	/* Success!  */
	return 0;
}

/*
 * linux_lwp_tid(l)
 *
 *	Return the Linux tid for the lwp l.
 *
 *	XXX THIS IS CURRENTLY BROKEN AND SHOULD LIVE ELSEWHERE
 */
static int
linux_lwp_tid(struct lwp *l)
{

	/*
	 * XXX This is wrong...  There's a global namespace of Linux
	 * tids, not a per-process namespace.
	 */
	return l->l_lid;
}
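/*
 * Illustrative sketch (not compiled): the layout that release_futexes
 * below walks.  Userland (e.g. glibc) registers a per-thread
 * linux_robust_list_head with set_robust_list and links every robust
 * mutex it owns into that list; futex_offset gives the distance from
 * a list entry to the futex word inside the containing mutex, which
 * is why release_futex is handed (entry address + futex_offset).  The
 * struct and field names in the userland fragment are assumptions for
 * illustration.
 */
#if 0
	/* Userland view of one robust mutex.  */
	struct my_robust_mutex {
		int			 futex;	/* owner tid, WAITERS, ... */
		struct linux_robust_list list;	/* linked while owned */
	};

	struct linux_robust_list_head head = {
		.list		= { .next = &head.list },	/* empty */
		.futex_offset	= offsetof(struct my_robust_mutex, futex) -
		    offsetof(struct my_robust_mutex, list),
		.pending_list	= NULL,
	};

	syscall(SYS_set_robust_list, &head, sizeof head);
#endif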
/*
 * release_futex(uptr)
 *
 *	Try to release the shared futex at uptr in the current process
 *	on lwp exit.  If anything goes wrong, silently fail.  It is the
 *	userland program's obligation to arrange the robust list
 *	correctly.
 */
static void
release_futex(uintptr_t uptr)
{
	int *uaddr;
	struct futex_queue *fq;
	int oldval, newval, actual;
	int error;

	/* If it's misaligned, tough.  */
	if (uptr & 3)
		return;
	uaddr = (int *)uptr;

	/* Optimistically test whether we need to do anything at all.  */
	error = futex_load(uaddr, &oldval);
	if (error == 0 && (oldval & FUTEX_TID_MASK) != linux_lwp_tid(curlwp))
		return;

	/* Get the shared futex queue.  If we can't, tough.  */
	error = futex_get(uaddr, /*shared*/true, &fq);
	if (error)
		return;

	/* Work under the futex queue lock.  */
	mutex_enter(&fq->fq_lock);

	/*
	 * Fetch the word: if the tid doesn't match ours, skip;
	 * otherwise, set the owner-died bit, atomically.
	 */
	do {
		error = futex_load(uaddr, &oldval);
		if (error)
			goto out;
		if ((oldval & FUTEX_TID_MASK) != linux_lwp_tid(curlwp))
			goto out;
		newval = oldval | FUTEX_OWNER_DIED;
		error = ucas_int(uaddr, oldval, newval, &actual);
		if (error)
			goto out;
	} while (actual != oldval);

	/*
	 * If there may be waiters, try to wake one.  If anything goes
	 * wrong, tough.
	 */
	if (oldval & FUTEX_WAITERS)
		(void)futex_wake(fq, 1, NULL, 0, FUTEX_BITSET_MATCH_ANY);

	/* Unlock the queue and release the futex.  */
out:	mutex_exit(&fq->fq_lock);
	futex_put(uaddr, /*shared*/true, fq);
}

/*
 * release_futexes(l)
 *
 *	Release all of l's robust futexes.  If anything looks funny in
 *	the process, give up -- it's userland's responsibility to dot
 *	the i's and cross the t's.
 */
void
release_futexes(struct lwp *l)
{
	struct linux_emuldata *led = l->l_emuldata;
	struct linux_robust_list_head head;
	struct linux_robust_list entry, *next;
	unsigned limit = 1000000;
	int error;

	/* If there's no robust list, nothing to do.  */
	if (led->led_robust_head == NULL)
		return;

	/* Read the final snapshot of the robust list head.  */
	error = copyin(led->led_robust_head, &head, sizeof head);
	if (error) {
		printf("WARNING: pid %jd (%s) lwp %jd:"
		    " unmapped robust futex list head\n",
		    (uintmax_t)l->l_proc->p_pid, l->l_proc->p_comm,
		    (uintmax_t)l->l_lid);
		return;
	}

	/*
	 * Walk down the list of locked futexes and release them, up
	 * to one million of them before we give up.
	 */
	for (next = head.list.next;
	     limit --> 0 && next != &led->led_robust_head->list;
	     next = entry.next) {
		release_futex((uintptr_t)next + head.futex_offset);
		error = copyin(next, &entry, sizeof entry);
		if (error)
			break;
	}

	/* If there's a pending futex, it may need to be released too.  */
	if (head.pending_list != NULL) {
		release_futex((uintptr_t)head.pending_list +
		    head.futex_offset);
	}
}