Merge tag 'locking-core-2025-05-25' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull locking updates from Ingo Molnar:
 "Futexes:

   - Add support for task local hash maps (Sebastian Andrzej Siewior,
     Peter Zijlstra)

   - Implement the FUTEX2_NUMA ABI, which extends the futex interface to
     be NUMA-aware. On NUMA-aware futexes a second u32 word containing
     the NUMA node is added after the u32 futex value word (Peter
     Zijlstra)

   - Implement the FUTEX2_MPOL ABI, which extends the futex interface to
     be mempolicy-aware as well, to further refine futex node mappings
     and lookups (Peter Zijlstra)

  Locking primitives:

   - Misc cleanups (Andy Shevchenko, Borislav Petkov, Colin Ian King,
     Ingo Molnar, Nam Cao, Peter Zijlstra)

  Lockdep:

   - Prevent abuse of lockdep subclasses (Waiman Long)

   - Add number of dynamic keys to /proc/lockdep_stats (Waiman Long)

  Plus misc cleanups and fixes"

* tag 'locking-core-2025-05-25' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (35 commits)
  selftests/futex: Fix spelling mistake "unitiliazed" -> "uninitialized"
  futex: Correct the kernedoc return value for futex_wait_setup().
  tools headers: Synchronize prctl.h ABI header
  futex: Use RCU_INIT_POINTER() in futex_mm_init().
  selftests/futex: Use TAP output in futex_numa_mpol
  selftests/futex: Use TAP output in futex_priv_hash
  futex: Fix kernel-doc comments
  futex: Relax the rcu_assign_pointer() assignment of mm->futex_phash in futex_mm_init()
  futex: Fix outdated comment in struct restart_block
  locking/lockdep: Add number of dynamic keys to /proc/lockdep_stats
  locking/lockdep: Prevent abuse of lockdep subclass
  locking/lockdep: Move hlock_equal() to the respective #ifdeffery
  futex,selftests: Add another FUTEX2_NUMA selftest
  selftests/futex: Add futex_numa_mpol
  selftests/futex: Add futex_priv_hash
  selftests/futex: Build without headers nonsense
  tools/perf: Allow to select the number of hash buckets
  tools headers: Synchronize prctl.h ABI header
  futex: Implement FUTEX2_MPOL
  futex: Implement FUTEX2_NUMA
  ...
Linus Torvalds committed 2025-05-26 14:42:07 -07:00
38 changed files with 2536 additions and 612 deletions

View File

@@ -4,11 +4,11 @@
#include <linux/sched.h> #include <linux/sched.h>
#include <linux/ktime.h> #include <linux/ktime.h>
#include <linux/mm_types.h>
#include <uapi/linux/futex.h> #include <uapi/linux/futex.h>
struct inode; struct inode;
struct mm_struct;
struct task_struct; struct task_struct;
/* /*
@@ -34,6 +34,7 @@ union futex_key {
u64 i_seq; u64 i_seq;
unsigned long pgoff; unsigned long pgoff;
unsigned int offset; unsigned int offset;
/* unsigned int node; */
} shared; } shared;
struct { struct {
union { union {
@@ -42,11 +43,13 @@ union futex_key {
}; };
unsigned long address; unsigned long address;
unsigned int offset; unsigned int offset;
/* unsigned int node; */
} private; } private;
struct { struct {
u64 ptr; u64 ptr;
unsigned long word; unsigned long word;
unsigned int offset; unsigned int offset;
unsigned int node; /* NOT hashed! */
} both; } both;
}; };
@@ -77,7 +80,25 @@ void futex_exec_release(struct task_struct *tsk);
long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
u32 __user *uaddr2, u32 val2, u32 val3); u32 __user *uaddr2, u32 val2, u32 val3);
#else int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4);
#ifdef CONFIG_FUTEX_PRIVATE_HASH
int futex_hash_allocate_default(void);
void futex_hash_free(struct mm_struct *mm);
static inline void futex_mm_init(struct mm_struct *mm)
{
RCU_INIT_POINTER(mm->futex_phash, NULL);
mutex_init(&mm->futex_hash_lock);
}
#else /* !CONFIG_FUTEX_PRIVATE_HASH */
static inline int futex_hash_allocate_default(void) { return 0; }
static inline void futex_hash_free(struct mm_struct *mm) { }
static inline void futex_mm_init(struct mm_struct *mm) { }
#endif /* CONFIG_FUTEX_PRIVATE_HASH */
#else /* !CONFIG_FUTEX */
static inline void futex_init_task(struct task_struct *tsk) { } static inline void futex_init_task(struct task_struct *tsk) { }
static inline void futex_exit_recursive(struct task_struct *tsk) { } static inline void futex_exit_recursive(struct task_struct *tsk) { }
static inline void futex_exit_release(struct task_struct *tsk) { } static inline void futex_exit_release(struct task_struct *tsk) { }
@@ -88,6 +109,17 @@ static inline long do_futex(u32 __user *uaddr, int op, u32 val,
{ {
return -EINVAL; return -EINVAL;
} }
static inline int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4)
{
return -EINVAL;
}
static inline int futex_hash_allocate_default(void)
{
return 0;
}
static inline void futex_hash_free(struct mm_struct *mm) { }
static inline void futex_mm_init(struct mm_struct *mm) { }
#endif #endif
#endif #endif

View File

@@ -31,6 +31,7 @@
#define INIT_PASID 0 #define INIT_PASID 0
struct address_space; struct address_space;
struct futex_private_hash;
struct mem_cgroup; struct mem_cgroup;
/* /*
@@ -1031,7 +1032,11 @@ struct mm_struct {
*/ */
seqcount_t mm_lock_seq; seqcount_t mm_lock_seq;
#endif #endif
#ifdef CONFIG_FUTEX_PRIVATE_HASH
struct mutex futex_hash_lock;
struct futex_private_hash __rcu *futex_phash;
struct futex_private_hash *futex_phash_new;
#endif
unsigned long hiwater_rss; /* High-watermark of RSS usage */ unsigned long hiwater_rss; /* High-watermark of RSS usage */
unsigned long hiwater_vm; /* High-water virtual memory usage */ unsigned long hiwater_vm; /* High-water virtual memory usage */

View File

@@ -7,6 +7,7 @@
#include <linux/rwsem.h> #include <linux/rwsem.h>
#include <linux/tracepoint-defs.h> #include <linux/tracepoint-defs.h>
#include <linux/types.h> #include <linux/types.h>
#include <linux/cleanup.h>
#define MMAP_LOCK_INITIALIZER(name) \ #define MMAP_LOCK_INITIALIZER(name) \
.mmap_lock = __RWSEM_INITIALIZER((name).mmap_lock), .mmap_lock = __RWSEM_INITIALIZER((name).mmap_lock),
@@ -211,6 +212,9 @@ static inline void mmap_read_unlock(struct mm_struct *mm)
up_read(&mm->mmap_lock); up_read(&mm->mmap_lock);
} }
DEFINE_GUARD(mmap_read_lock, struct mm_struct *,
mmap_read_lock(_T), mmap_read_unlock(_T))
static inline void mmap_read_unlock_non_owner(struct mm_struct *mm) static inline void mmap_read_unlock_non_owner(struct mm_struct *mm)
{ {
__mmap_lock_trace_released(mm, false); __mmap_lock_trace_released(mm, false);
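For readers new to the cleanup.h guards, a minimal, hypothetical sketch of what the new DEFINE_GUARD() enables (the function below is made up; the same pattern appears as guard(mmap_read_lock)(mm) in the futex mempolicy code further down):

/* Hypothetical user of the new guard: the mmap read lock is acquired here
 * and dropped automatically on every return path when the scope ends. */
static bool vma_exists(struct mm_struct *mm, unsigned long addr)
{
	guard(mmap_read_lock)(mm);		/* mmap_read_lock(mm) */

	return vma_lookup(mm, addr) != NULL;	/* unlock runs on return */
}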

View File

@@ -30,7 +30,11 @@ static inline void rcuref_init(rcuref_t *ref, unsigned int cnt)
* rcuref_read - Read the number of held reference counts of a rcuref * rcuref_read - Read the number of held reference counts of a rcuref
* @ref: Pointer to the reference count * @ref: Pointer to the reference count
* *
* Return: The number of held references (0 ... N) * Return: The number of held references (0 ... N). The value 0 does not
* indicate that it is safe to schedule the object, protected by this reference
* counter, for deconstruction.
* If you want to know if the reference counter has been marked DEAD (as
* signaled by rcuref_put()) please use rcuread_is_dead().
*/ */
static inline unsigned int rcuref_read(rcuref_t *ref) static inline unsigned int rcuref_read(rcuref_t *ref)
{ {
@@ -40,6 +44,22 @@ static inline unsigned int rcuref_read(rcuref_t *ref)
return c >= RCUREF_RELEASED ? 0 : c + 1; return c >= RCUREF_RELEASED ? 0 : c + 1;
} }
/**
* rcuref_is_dead - Check if the rcuref has been already marked dead
* @ref: Pointer to the reference count
*
* Return: True if the object has been marked DEAD. This signals that a previous
* invocation of rcuref_put() returned true on this reference counter meaning
* the protected object can safely be scheduled for deconstruction.
* Otherwise, returns false.
*/
static inline bool rcuref_is_dead(rcuref_t *ref)
{
unsigned int c = atomic_read(&ref->refcnt);
return (c >= RCUREF_RELEASED) && (c < RCUREF_NOREF);
}
extern __must_check bool rcuref_get_slowpath(rcuref_t *ref); extern __must_check bool rcuref_get_slowpath(rcuref_t *ref);
/** /**
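A brief, hypothetical sketch of the lifetime pattern the new helper documents (struct foo and foo_put() are made up): a rcuref_read() of 0 does not license destruction; only the final rcuref_put(), whose outcome rcuref_is_dead() later reports, does.

struct foo {
	rcuref_t	ref;
	struct rcu_head	rcu;
};

static void foo_put(struct foo *f)
{
	if (rcuref_put(&f->ref)) {
		/* Last reference dropped: the counter is marked DEAD,
		 * rcuref_is_dead(&f->ref) now returns true and the object
		 * may safely be scheduled for destruction. */
		kfree_rcu(f, rcu);
	}
}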

View File

@@ -26,7 +26,7 @@ struct restart_block {
unsigned long arch_data; unsigned long arch_data;
long (*fn)(struct restart_block *); long (*fn)(struct restart_block *);
union { union {
/* For futex_wait and futex_wait_requeue_pi */ /* For futex_wait() */
struct { struct {
u32 __user *uaddr; u32 __user *uaddr;
u32 val; u32 val;

View File

@@ -169,8 +169,13 @@ void *__vmalloc_node_noprof(unsigned long size, unsigned long align, gfp_t gfp_m
int node, const void *caller) __alloc_size(1); int node, const void *caller) __alloc_size(1);
#define __vmalloc_node(...) alloc_hooks(__vmalloc_node_noprof(__VA_ARGS__)) #define __vmalloc_node(...) alloc_hooks(__vmalloc_node_noprof(__VA_ARGS__))
void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask) __alloc_size(1); void *vmalloc_huge_node_noprof(unsigned long size, gfp_t gfp_mask, int node) __alloc_size(1);
#define vmalloc_huge(...) alloc_hooks(vmalloc_huge_noprof(__VA_ARGS__)) #define vmalloc_huge_node(...) alloc_hooks(vmalloc_huge_node_noprof(__VA_ARGS__))
static inline void *vmalloc_huge(unsigned long size, gfp_t gfp_mask)
{
return vmalloc_huge_node(size, gfp_mask, NUMA_NO_NODE);
}
extern void *__vmalloc_array_noprof(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2); extern void *__vmalloc_array_noprof(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2);
#define __vmalloc_array(...) alloc_hooks(__vmalloc_array_noprof(__VA_ARGS__)) #define __vmalloc_array(...) alloc_hooks(__vmalloc_array_noprof(__VA_ARGS__))

View File

@@ -63,7 +63,7 @@
#define FUTEX2_SIZE_U32 0x02 #define FUTEX2_SIZE_U32 0x02
#define FUTEX2_SIZE_U64 0x03 #define FUTEX2_SIZE_U64 0x03
#define FUTEX2_NUMA 0x04 #define FUTEX2_NUMA 0x04
/* 0x08 */ #define FUTEX2_MPOL 0x08
/* 0x10 */ /* 0x10 */
/* 0x20 */ /* 0x20 */
/* 0x40 */ /* 0x40 */
@@ -74,6 +74,13 @@
/* do not use */ /* do not use */
#define FUTEX_32 FUTEX2_SIZE_U32 /* historical accident :-( */ #define FUTEX_32 FUTEX2_SIZE_U32 /* historical accident :-( */
/*
* When FUTEX2_NUMA doubles the futex word, the second word is a node value.
* The special value -1 indicates no-node. This is the same value as
* NUMA_NO_NODE, except that value is not ABI, this is.
*/
#define FUTEX_NO_NODE (-1)
/* /*
* Max numbers of elements in a futex_waitv array * Max numbers of elements in a futex_waitv array
*/ */
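As a userspace illustration of the doubled futex word (the struct name and alignment attribute are illustrative; the layout and FUTEX_NO_NODE semantics follow the comment above and the get_futex_key() changes further down, which require the pair to be naturally aligned to its combined size):

#include <stdint.h>

#ifndef FUTEX_NO_NODE
# define FUTEX_NO_NODE	(-1)	/* mirrors the new uapi definition */
#endif

/* A 32-bit futex used with FUTEX2_NUMA: the value is immediately followed
 * by a same-sized node word, and the pair must be aligned to its combined
 * size (8 bytes here). */
struct numa_futex32 {
	uint32_t val;
	uint32_t node;	/* NUMA node, or FUTEX_NO_NODE to let the kernel
			 * pick the current node and write it back */
} __attribute__((aligned(8)));

_Static_assert(sizeof(struct numa_futex32) == 8, "doubled futex word");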

View File

@@ -364,4 +364,11 @@ struct prctl_mm_map {
# define PR_TIMER_CREATE_RESTORE_IDS_ON 1 # define PR_TIMER_CREATE_RESTORE_IDS_ON 1
# define PR_TIMER_CREATE_RESTORE_IDS_GET 2 # define PR_TIMER_CREATE_RESTORE_IDS_GET 2
/* FUTEX hash management */
#define PR_FUTEX_HASH 78
# define PR_FUTEX_HASH_SET_SLOTS 1
# define FH_FLAG_IMMUTABLE (1ULL << 0)
# define PR_FUTEX_HASH_GET_SLOTS 2
# define PR_FUTEX_HASH_GET_IMMUTABLE 3
#endif /* _LINUX_PRCTL_H */ #endif /* _LINUX_PRCTL_H */
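A hedged userspace sketch of the new prctl() interface (the constants mirror the additions above and are redefined defensively for older headers; error handling is minimal):

#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_FUTEX_HASH
# define PR_FUTEX_HASH			78
# define PR_FUTEX_HASH_SET_SLOTS	1
# define FH_FLAG_IMMUTABLE		(1ULL << 0)
# define PR_FUTEX_HASH_GET_SLOTS	2
# define PR_FUTEX_HASH_GET_IMMUTABLE	3
#endif

int main(void)
{
	/* Ask for a 16-bucket private futex hash; the slot count must be a
	 * power of two (see futex_hash_allocate() further down). */
	if (prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_SET_SLOTS, 16, 0, 0) != 0)
		perror("PR_FUTEX_HASH_SET_SLOTS");

	printf("slots: %d, immutable: %d\n",
	       (int)prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_GET_SLOTS, 0, 0, 0),
	       (int)prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_GET_IMMUTABLE, 0, 0, 0));
	return 0;
}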

View File

@@ -1687,6 +1687,16 @@ config FUTEX_PI
depends on FUTEX && RT_MUTEXES depends on FUTEX && RT_MUTEXES
default y default y
config FUTEX_PRIVATE_HASH
bool
depends on FUTEX && !BASE_SMALL && MMU
default y
config FUTEX_MPOL
bool
depends on FUTEX && NUMA
default y
config EPOLL config EPOLL
bool "Enable eventpoll support" if EXPERT bool "Enable eventpoll support" if EXPERT
default y default y

View File

@@ -273,7 +273,6 @@ int io_futex_wait(struct io_kiocb *req, unsigned int issue_flags)
struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex);
struct io_ring_ctx *ctx = req->ctx; struct io_ring_ctx *ctx = req->ctx;
struct io_futex_data *ifd = NULL; struct io_futex_data *ifd = NULL;
struct futex_hash_bucket *hb;
int ret; int ret;
if (!iof->futex_mask) { if (!iof->futex_mask) {
@@ -295,12 +294,11 @@ int io_futex_wait(struct io_kiocb *req, unsigned int issue_flags)
ifd->req = req; ifd->req = req;
ret = futex_wait_setup(iof->uaddr, iof->futex_val, iof->futex_flags, ret = futex_wait_setup(iof->uaddr, iof->futex_val, iof->futex_flags,
&ifd->q, &hb); &ifd->q, NULL, NULL);
if (!ret) { if (!ret) {
hlist_add_head(&req->hash_node, &ctx->futex_list); hlist_add_head(&req->hash_node, &ctx->futex_list);
io_ring_submit_unlock(ctx, issue_flags); io_ring_submit_unlock(ctx, issue_flags);
futex_queue(&ifd->q, hb, NULL);
return IOU_ISSUE_SKIP_COMPLETE; return IOU_ISSUE_SKIP_COMPLETE;
} }

View File

@@ -1306,6 +1306,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
RCU_INIT_POINTER(mm->exe_file, NULL); RCU_INIT_POINTER(mm->exe_file, NULL);
mmu_notifier_subscriptions_init(mm); mmu_notifier_subscriptions_init(mm);
init_tlb_flush_pending(mm); init_tlb_flush_pending(mm);
futex_mm_init(mm);
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS) #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS)
mm->pmd_huge_pte = NULL; mm->pmd_huge_pte = NULL;
#endif #endif
@@ -1388,6 +1389,7 @@ static inline void __mmput(struct mm_struct *mm)
if (mm->binfmt) if (mm->binfmt)
module_put(mm->binfmt->module); module_put(mm->binfmt->module);
lru_gen_del_mm(mm); lru_gen_del_mm(mm);
futex_hash_free(mm);
mmdrop(mm); mmdrop(mm);
} }
@@ -2153,6 +2155,13 @@ static void rv_task_fork(struct task_struct *p)
#define rv_task_fork(p) do {} while (0) #define rv_task_fork(p) do {} while (0)
#endif #endif
static bool need_futex_hash_allocate_default(u64 clone_flags)
{
if ((clone_flags & (CLONE_THREAD | CLONE_VM)) != (CLONE_THREAD | CLONE_VM))
return false;
return true;
}
/* /*
* This creates a new process as a copy of the old one, * This creates a new process as a copy of the old one,
* but does not actually start it yet. * but does not actually start it yet.
@@ -2533,6 +2542,21 @@ __latent_entropy struct task_struct *copy_process(
if (retval) if (retval)
goto bad_fork_cancel_cgroup; goto bad_fork_cancel_cgroup;
/*
* Allocate a default futex hash for the user process once the first
* thread spawns.
*/
if (need_futex_hash_allocate_default(clone_flags)) {
retval = futex_hash_allocate_default();
if (retval)
goto bad_fork_core_free;
/*
* If we fail beyond this point we don't free the allocated
* futex hash map. We assume that another thread will be created
* and makes use of it. The hash map will be freed once the main
* thread terminates.
*/
}
/* /*
* From this point on we must avoid any synchronous user-space * From this point on we must avoid any synchronous user-space
* communication until we take the tasklist-lock. In particular, we do * communication until we take the tasklist-lock. In particular, we do
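A hedged userspace illustration of the effect (assuming the behaviour described here: the default private hash is allocated when the first CLONE_THREAD | CLONE_VM child is created, which is what pthread_create() does):

#include <pthread.h>
#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_FUTEX_HASH
# define PR_FUTEX_HASH			78
# define PR_FUTEX_HASH_GET_SLOTS	2
#endif

static void *idle(void *arg) { return arg; }

int main(void)
{
	pthread_t t;

	/* No private hash yet: expected to print 0. */
	printf("before first thread: %d slots\n",
	       (int)prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_GET_SLOTS, 0, 0, 0));

	/* pthread_create() clones with CLONE_THREAD | CLONE_VM, the condition
	 * need_futex_hash_allocate_default() checks for. */
	pthread_create(&t, NULL, idle, NULL);
	pthread_join(t, NULL);

	/* Default hash allocated: expected to print at least 16. */
	printf("after first thread:  %d slots\n",
	       (int)prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_GET_SLOTS, 0, 0, 0));
	return 0;
}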

View File

@@ -36,9 +36,15 @@
#include <linux/pagemap.h> #include <linux/pagemap.h>
#include <linux/debugfs.h> #include <linux/debugfs.h>
#include <linux/plist.h> #include <linux/plist.h>
#include <linux/gfp.h>
#include <linux/vmalloc.h>
#include <linux/memblock.h> #include <linux/memblock.h>
#include <linux/fault-inject.h> #include <linux/fault-inject.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/prctl.h>
#include <linux/rcuref.h>
#include <linux/mempolicy.h>
#include <linux/mmap_lock.h>
#include "futex.h" #include "futex.h"
#include "../locking/rtmutex_common.h" #include "../locking/rtmutex_common.h"
@@ -49,12 +55,24 @@
* reside in the same cacheline. * reside in the same cacheline.
*/ */
static struct { static struct {
struct futex_hash_bucket *queues;
unsigned long hashmask; unsigned long hashmask;
unsigned int hashshift;
struct futex_hash_bucket *queues[MAX_NUMNODES];
} __futex_data __read_mostly __aligned(2*sizeof(long)); } __futex_data __read_mostly __aligned(2*sizeof(long));
#define futex_queues (__futex_data.queues)
#define futex_hashmask (__futex_data.hashmask)
#define futex_hashmask (__futex_data.hashmask)
#define futex_hashshift (__futex_data.hashshift)
#define futex_queues (__futex_data.queues)
struct futex_private_hash {
rcuref_t users;
unsigned int hash_mask;
struct rcu_head rcu;
void *mm;
bool custom;
bool immutable;
struct futex_hash_bucket queues[];
};
/* /*
* Fault injections for futexes. * Fault injections for futexes.
@@ -107,21 +125,328 @@ late_initcall(fail_futex_debugfs);
#endif /* CONFIG_FAIL_FUTEX */ #endif /* CONFIG_FAIL_FUTEX */
/** static struct futex_hash_bucket *
* futex_hash - Return the hash bucket in the global hash __futex_hash(union futex_key *key, struct futex_private_hash *fph);
* @key: Pointer to the futex key for which the hash is calculated
*
* We hash on the keys returned from get_futex_key (see below) and return the
* corresponding hash bucket in the global hash.
*/
struct futex_hash_bucket *futex_hash(union futex_key *key)
{
u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4,
key->both.offset);
return &futex_queues[hash & futex_hashmask]; #ifdef CONFIG_FUTEX_PRIVATE_HASH
static inline bool futex_key_is_private(union futex_key *key)
{
/*
* Relies on get_futex_key() to set either bit for shared
* futexes -- see comment with union futex_key.
*/
return !(key->both.offset & (FUT_OFF_INODE | FUT_OFF_MMSHARED));
} }
bool futex_private_hash_get(struct futex_private_hash *fph)
{
if (fph->immutable)
return true;
return rcuref_get(&fph->users);
}
void futex_private_hash_put(struct futex_private_hash *fph)
{
/* Ignore return value, last put is verified via rcuref_is_dead() */
if (fph->immutable)
return;
if (rcuref_put(&fph->users))
wake_up_var(fph->mm);
}
/**
* futex_hash_get - Get an additional reference for the local hash.
* @hb: ptr to the private local hash.
*
* Obtain an additional reference for the already obtained hash bucket. The
* caller must already own an reference.
*/
void futex_hash_get(struct futex_hash_bucket *hb)
{
struct futex_private_hash *fph = hb->priv;
if (!fph)
return;
WARN_ON_ONCE(!futex_private_hash_get(fph));
}
void futex_hash_put(struct futex_hash_bucket *hb)
{
struct futex_private_hash *fph = hb->priv;
if (!fph)
return;
futex_private_hash_put(fph);
}
static struct futex_hash_bucket *
__futex_hash_private(union futex_key *key, struct futex_private_hash *fph)
{
u32 hash;
if (!futex_key_is_private(key))
return NULL;
if (!fph)
fph = rcu_dereference(key->private.mm->futex_phash);
if (!fph || !fph->hash_mask)
return NULL;
hash = jhash2((void *)&key->private.address,
sizeof(key->private.address) / 4,
key->both.offset);
return &fph->queues[hash & fph->hash_mask];
}
static void futex_rehash_private(struct futex_private_hash *old,
struct futex_private_hash *new)
{
struct futex_hash_bucket *hb_old, *hb_new;
unsigned int slots = old->hash_mask + 1;
unsigned int i;
for (i = 0; i < slots; i++) {
struct futex_q *this, *tmp;
hb_old = &old->queues[i];
spin_lock(&hb_old->lock);
plist_for_each_entry_safe(this, tmp, &hb_old->chain, list) {
plist_del(&this->list, &hb_old->chain);
futex_hb_waiters_dec(hb_old);
WARN_ON_ONCE(this->lock_ptr != &hb_old->lock);
hb_new = __futex_hash(&this->key, new);
futex_hb_waiters_inc(hb_new);
/*
* The new pointer isn't published yet but an already
* moved user can be unqueued due to timeout or signal.
*/
spin_lock_nested(&hb_new->lock, SINGLE_DEPTH_NESTING);
plist_add(&this->list, &hb_new->chain);
this->lock_ptr = &hb_new->lock;
spin_unlock(&hb_new->lock);
}
spin_unlock(&hb_old->lock);
}
}
static bool __futex_pivot_hash(struct mm_struct *mm,
struct futex_private_hash *new)
{
struct futex_private_hash *fph;
WARN_ON_ONCE(mm->futex_phash_new);
fph = rcu_dereference_protected(mm->futex_phash,
lockdep_is_held(&mm->futex_hash_lock));
if (fph) {
if (!rcuref_is_dead(&fph->users)) {
mm->futex_phash_new = new;
return false;
}
futex_rehash_private(fph, new);
}
rcu_assign_pointer(mm->futex_phash, new);
kvfree_rcu(fph, rcu);
return true;
}
static void futex_pivot_hash(struct mm_struct *mm)
{
scoped_guard(mutex, &mm->futex_hash_lock) {
struct futex_private_hash *fph;
fph = mm->futex_phash_new;
if (fph) {
mm->futex_phash_new = NULL;
__futex_pivot_hash(mm, fph);
}
}
}
struct futex_private_hash *futex_private_hash(void)
{
struct mm_struct *mm = current->mm;
/*
* Ideally we don't loop. If there is a replacement in progress
* then a new private hash is already prepared and a reference can't be
* obtained once the last user dropped it's.
* In that case we block on mm_struct::futex_hash_lock and either have
* to perform the replacement or wait while someone else is doing the
* job. Eitherway, on the second iteration we acquire a reference on the
* new private hash or loop again because a new replacement has been
* requested.
*/
again:
scoped_guard(rcu) {
struct futex_private_hash *fph;
fph = rcu_dereference(mm->futex_phash);
if (!fph)
return NULL;
if (fph->immutable)
return fph;
if (rcuref_get(&fph->users))
return fph;
}
futex_pivot_hash(mm);
goto again;
}
struct futex_hash_bucket *futex_hash(union futex_key *key)
{
struct futex_private_hash *fph;
struct futex_hash_bucket *hb;
again:
scoped_guard(rcu) {
hb = __futex_hash(key, NULL);
fph = hb->priv;
if (!fph || futex_private_hash_get(fph))
return hb;
}
futex_pivot_hash(key->private.mm);
goto again;
}
#else /* !CONFIG_FUTEX_PRIVATE_HASH */
static struct futex_hash_bucket *
__futex_hash_private(union futex_key *key, struct futex_private_hash *fph)
{
return NULL;
}
struct futex_hash_bucket *futex_hash(union futex_key *key)
{
return __futex_hash(key, NULL);
}
#endif /* CONFIG_FUTEX_PRIVATE_HASH */
#ifdef CONFIG_FUTEX_MPOL
static int __futex_key_to_node(struct mm_struct *mm, unsigned long addr)
{
struct vm_area_struct *vma = vma_lookup(mm, addr);
struct mempolicy *mpol;
int node = FUTEX_NO_NODE;
if (!vma)
return FUTEX_NO_NODE;
mpol = vma_policy(vma);
if (!mpol)
return FUTEX_NO_NODE;
switch (mpol->mode) {
case MPOL_PREFERRED:
node = first_node(mpol->nodes);
break;
case MPOL_PREFERRED_MANY:
case MPOL_BIND:
if (mpol->home_node != NUMA_NO_NODE)
node = mpol->home_node;
break;
default:
break;
}
return node;
}
static int futex_key_to_node_opt(struct mm_struct *mm, unsigned long addr)
{
int seq, node;
guard(rcu)();
if (!mmap_lock_speculate_try_begin(mm, &seq))
return -EBUSY;
node = __futex_key_to_node(mm, addr);
if (mmap_lock_speculate_retry(mm, seq))
return -EAGAIN;
return node;
}
static int futex_mpol(struct mm_struct *mm, unsigned long addr)
{
int node;
node = futex_key_to_node_opt(mm, addr);
if (node >= FUTEX_NO_NODE)
return node;
guard(mmap_read_lock)(mm);
return __futex_key_to_node(mm, addr);
}
#else /* !CONFIG_FUTEX_MPOL */
static int futex_mpol(struct mm_struct *mm, unsigned long addr)
{
return FUTEX_NO_NODE;
}
#endif /* CONFIG_FUTEX_MPOL */
/**
* __futex_hash - Return the hash bucket
* @key: Pointer to the futex key for which the hash is calculated
* @fph: Pointer to private hash if known
*
* We hash on the keys returned from get_futex_key (see below) and return the
* corresponding hash bucket.
* If the FUTEX is PROCESS_PRIVATE then a per-process hash bucket (from the
* private hash) is returned if existing. Otherwise a hash bucket from the
* global hash is returned.
*/
static struct futex_hash_bucket *
__futex_hash(union futex_key *key, struct futex_private_hash *fph)
{
int node = key->both.node;
u32 hash;
if (node == FUTEX_NO_NODE) {
struct futex_hash_bucket *hb;
hb = __futex_hash_private(key, fph);
if (hb)
return hb;
}
hash = jhash2((u32 *)key,
offsetof(typeof(*key), both.offset) / sizeof(u32),
key->both.offset);
if (node == FUTEX_NO_NODE) {
/*
* In case of !FLAGS_NUMA, use some unused hash bits to pick a
* node -- this ensures regular futexes are interleaved across
* the nodes and avoids having to allocate multiple
* hash-tables.
*
* NOTE: this isn't perfectly uniform, but it is fast and
* handles sparse node masks.
*/
node = (hash >> futex_hashshift) % nr_node_ids;
if (!node_possible(node)) {
node = find_next_bit_wrap(node_possible_map.bits,
nr_node_ids, node);
}
}
return &futex_queues[node][hash & futex_hashmask];
}
/** /**
* futex_setup_timer - set up the sleeping hrtimer. * futex_setup_timer - set up the sleeping hrtimer.
@@ -227,25 +552,60 @@ int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key,
struct page *page; struct page *page;
struct folio *folio; struct folio *folio;
struct address_space *mapping; struct address_space *mapping;
int err, ro = 0; int node, err, size, ro = 0;
bool node_updated = false;
bool fshared; bool fshared;
fshared = flags & FLAGS_SHARED; fshared = flags & FLAGS_SHARED;
size = futex_size(flags);
if (flags & FLAGS_NUMA)
size *= 2;
/* /*
* The futex address must be "naturally" aligned. * The futex address must be "naturally" aligned.
*/ */
key->both.offset = address % PAGE_SIZE; key->both.offset = address % PAGE_SIZE;
if (unlikely((address % sizeof(u32)) != 0)) if (unlikely((address % size) != 0))
return -EINVAL; return -EINVAL;
address -= key->both.offset; address -= key->both.offset;
if (unlikely(!access_ok(uaddr, sizeof(u32)))) if (unlikely(!access_ok(uaddr, size)))
return -EFAULT; return -EFAULT;
if (unlikely(should_fail_futex(fshared))) if (unlikely(should_fail_futex(fshared)))
return -EFAULT; return -EFAULT;
node = FUTEX_NO_NODE;
if (flags & FLAGS_NUMA) {
u32 __user *naddr = (void *)uaddr + size / 2;
if (futex_get_value(&node, naddr))
return -EFAULT;
if (node != FUTEX_NO_NODE &&
(node >= MAX_NUMNODES || !node_possible(node)))
return -EINVAL;
}
if (node == FUTEX_NO_NODE && (flags & FLAGS_MPOL)) {
node = futex_mpol(mm, address);
node_updated = true;
}
if (flags & FLAGS_NUMA) {
u32 __user *naddr = (void *)uaddr + size / 2;
if (node == FUTEX_NO_NODE) {
node = numa_node_id();
node_updated = true;
}
if (node_updated && futex_put_value(node, naddr))
return -EFAULT;
}
key->both.node = node;
/* /*
* PROCESS_PRIVATE futexes are fast. * PROCESS_PRIVATE futexes are fast.
* As the mm cannot disappear under us and the 'key' only needs * As the mm cannot disappear under us and the 'key' only needs
@@ -502,13 +862,9 @@ void __futex_unqueue(struct futex_q *q)
} }
/* The key must be already stored in q->key. */ /* The key must be already stored in q->key. */
struct futex_hash_bucket *futex_q_lock(struct futex_q *q) void futex_q_lock(struct futex_q *q, struct futex_hash_bucket *hb)
__acquires(&hb->lock) __acquires(&hb->lock)
{ {
struct futex_hash_bucket *hb;
hb = futex_hash(&q->key);
/* /*
* Increment the counter before taking the lock so that * Increment the counter before taking the lock so that
* a potential waker won't miss a to-be-slept task that is * a potential waker won't miss a to-be-slept task that is
@@ -522,14 +878,13 @@ struct futex_hash_bucket *futex_q_lock(struct futex_q *q)
q->lock_ptr = &hb->lock; q->lock_ptr = &hb->lock;
spin_lock(&hb->lock); spin_lock(&hb->lock);
return hb;
} }
void futex_q_unlock(struct futex_hash_bucket *hb) void futex_q_unlock(struct futex_hash_bucket *hb)
__releases(&hb->lock) __releases(&hb->lock)
{ {
spin_unlock(&hb->lock);
futex_hb_waiters_dec(hb); futex_hb_waiters_dec(hb);
spin_unlock(&hb->lock);
} }
void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb, void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb,
@@ -568,6 +923,8 @@ int futex_unqueue(struct futex_q *q)
spinlock_t *lock_ptr; spinlock_t *lock_ptr;
int ret = 0; int ret = 0;
/* RCU so lock_ptr is not going away during locking. */
guard(rcu)();
/* In the common case we don't take the spinlock, which is nice. */ /* In the common case we don't take the spinlock, which is nice. */
retry: retry:
/* /*
@@ -606,6 +963,24 @@ retry:
return ret; return ret;
} }
void futex_q_lockptr_lock(struct futex_q *q)
{
spinlock_t *lock_ptr;
/*
* See futex_unqueue() why lock_ptr can change.
*/
guard(rcu)();
retry:
lock_ptr = READ_ONCE(q->lock_ptr);
spin_lock(lock_ptr);
if (unlikely(lock_ptr != q->lock_ptr)) {
spin_unlock(lock_ptr);
goto retry;
}
}
/* /*
* PI futexes can not be requeued and must remove themselves from the hash * PI futexes can not be requeued and must remove themselves from the hash
* bucket. The hash bucket lock (i.e. lock_ptr) is held. * bucket. The hash bucket lock (i.e. lock_ptr) is held.
@@ -949,9 +1324,19 @@ static void exit_pi_state_list(struct task_struct *curr)
{ {
struct list_head *next, *head = &curr->pi_state_list; struct list_head *next, *head = &curr->pi_state_list;
struct futex_pi_state *pi_state; struct futex_pi_state *pi_state;
struct futex_hash_bucket *hb;
union futex_key key = FUTEX_KEY_INIT; union futex_key key = FUTEX_KEY_INIT;
/*
* The mutex mm_struct::futex_hash_lock might be acquired.
*/
might_sleep();
/*
* Ensure the hash remains stable (no resize) during the while loop
* below. The hb pointer is acquired under the pi_lock so we can't block
* on the mutex.
*/
WARN_ON(curr != current);
guard(private_hash)();
/* /*
* We are a ZOMBIE and nobody can enqueue itself on * We are a ZOMBIE and nobody can enqueue itself on
* pi_state_list anymore, but we have to be careful * pi_state_list anymore, but we have to be careful
@@ -962,50 +1347,52 @@ static void exit_pi_state_list(struct task_struct *curr)
next = head->next; next = head->next;
pi_state = list_entry(next, struct futex_pi_state, list); pi_state = list_entry(next, struct futex_pi_state, list);
key = pi_state->key; key = pi_state->key;
hb = futex_hash(&key); if (1) {
CLASS(hb, hb)(&key);
/* /*
* We can race against put_pi_state() removing itself from the * We can race against put_pi_state() removing itself from the
* list (a waiter going away). put_pi_state() will first * list (a waiter going away). put_pi_state() will first
* decrement the reference count and then modify the list, so * decrement the reference count and then modify the list, so
* its possible to see the list entry but fail this reference * its possible to see the list entry but fail this reference
* acquire. * acquire.
* *
* In that case; drop the locks to let put_pi_state() make * In that case; drop the locks to let put_pi_state() make
* progress and retry the loop. * progress and retry the loop.
*/ */
if (!refcount_inc_not_zero(&pi_state->refcount)) { if (!refcount_inc_not_zero(&pi_state->refcount)) {
raw_spin_unlock_irq(&curr->pi_lock);
cpu_relax();
raw_spin_lock_irq(&curr->pi_lock);
continue;
}
raw_spin_unlock_irq(&curr->pi_lock); raw_spin_unlock_irq(&curr->pi_lock);
cpu_relax();
raw_spin_lock_irq(&curr->pi_lock);
continue;
}
raw_spin_unlock_irq(&curr->pi_lock);
spin_lock(&hb->lock); spin_lock(&hb->lock);
raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
raw_spin_lock(&curr->pi_lock); raw_spin_lock(&curr->pi_lock);
/* /*
* We dropped the pi-lock, so re-check whether this * We dropped the pi-lock, so re-check whether this
* task still owns the PI-state: * task still owns the PI-state:
*/ */
if (head->next != next) { if (head->next != next) {
/* retain curr->pi_lock for the loop invariant */ /* retain curr->pi_lock for the loop invariant */
raw_spin_unlock(&pi_state->pi_mutex.wait_lock); raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
spin_unlock(&hb->lock);
put_pi_state(pi_state);
continue;
}
WARN_ON(pi_state->owner != curr);
WARN_ON(list_empty(&pi_state->list));
list_del_init(&pi_state->list);
pi_state->owner = NULL;
raw_spin_unlock(&curr->pi_lock);
raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
spin_unlock(&hb->lock); spin_unlock(&hb->lock);
put_pi_state(pi_state);
continue;
} }
WARN_ON(pi_state->owner != curr);
WARN_ON(list_empty(&pi_state->list));
list_del_init(&pi_state->list);
pi_state->owner = NULL;
raw_spin_unlock(&curr->pi_lock);
raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
spin_unlock(&hb->lock);
rt_mutex_futex_unlock(&pi_state->pi_mutex); rt_mutex_futex_unlock(&pi_state->pi_mutex);
put_pi_state(pi_state); put_pi_state(pi_state);
@@ -1125,30 +1512,304 @@ void futex_exit_release(struct task_struct *tsk)
futex_cleanup_end(tsk, FUTEX_STATE_DEAD); futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
} }
static void futex_hash_bucket_init(struct futex_hash_bucket *fhb,
struct futex_private_hash *fph)
{
#ifdef CONFIG_FUTEX_PRIVATE_HASH
fhb->priv = fph;
#endif
atomic_set(&fhb->waiters, 0);
plist_head_init(&fhb->chain);
spin_lock_init(&fhb->lock);
}
#define FH_CUSTOM 0x01
#define FH_IMMUTABLE 0x02
#ifdef CONFIG_FUTEX_PRIVATE_HASH
void futex_hash_free(struct mm_struct *mm)
{
struct futex_private_hash *fph;
kvfree(mm->futex_phash_new);
fph = rcu_dereference_raw(mm->futex_phash);
if (fph) {
WARN_ON_ONCE(rcuref_read(&fph->users) > 1);
kvfree(fph);
}
}
static bool futex_pivot_pending(struct mm_struct *mm)
{
struct futex_private_hash *fph;
guard(rcu)();
if (!mm->futex_phash_new)
return true;
fph = rcu_dereference(mm->futex_phash);
return rcuref_is_dead(&fph->users);
}
static bool futex_hash_less(struct futex_private_hash *a,
struct futex_private_hash *b)
{
/* user provided always wins */
if (!a->custom && b->custom)
return true;
if (a->custom && !b->custom)
return false;
/* zero-sized hash wins */
if (!b->hash_mask)
return true;
if (!a->hash_mask)
return false;
/* keep the biggest */
if (a->hash_mask < b->hash_mask)
return true;
if (a->hash_mask > b->hash_mask)
return false;
return false; /* equal */
}
static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags)
{
struct mm_struct *mm = current->mm;
struct futex_private_hash *fph;
bool custom = flags & FH_CUSTOM;
int i;
if (hash_slots && (hash_slots == 1 || !is_power_of_2(hash_slots)))
return -EINVAL;
/*
* Once we've disabled the global hash there is no way back.
*/
scoped_guard(rcu) {
fph = rcu_dereference(mm->futex_phash);
if (fph && (!fph->hash_mask || fph->immutable)) {
if (custom)
return -EBUSY;
return 0;
}
}
fph = kvzalloc(struct_size(fph, queues, hash_slots), GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
if (!fph)
return -ENOMEM;
rcuref_init(&fph->users, 1);
fph->hash_mask = hash_slots ? hash_slots - 1 : 0;
fph->custom = custom;
fph->immutable = !!(flags & FH_IMMUTABLE);
fph->mm = mm;
for (i = 0; i < hash_slots; i++)
futex_hash_bucket_init(&fph->queues[i], fph);
if (custom) {
/*
* Only let prctl() wait / retry; don't unduly delay clone().
*/
again:
wait_var_event(mm, futex_pivot_pending(mm));
}
scoped_guard(mutex, &mm->futex_hash_lock) {
struct futex_private_hash *free __free(kvfree) = NULL;
struct futex_private_hash *cur, *new;
cur = rcu_dereference_protected(mm->futex_phash,
lockdep_is_held(&mm->futex_hash_lock));
new = mm->futex_phash_new;
mm->futex_phash_new = NULL;
if (fph) {
if (cur && !new) {
/*
* If we have an existing hash, but do not yet have
* allocated a replacement hash, drop the initial
* reference on the existing hash.
*/
futex_private_hash_put(cur);
}
if (new) {
/*
* Two updates raced; throw out the lesser one.
*/
if (futex_hash_less(new, fph)) {
free = new;
new = fph;
} else {
free = fph;
}
} else {
new = fph;
}
fph = NULL;
}
if (new) {
/*
* Will set mm->futex_phash_new on failure;
* futex_private_hash_get() will try again.
*/
if (!__futex_pivot_hash(mm, new) && custom)
goto again;
}
}
return 0;
}
int futex_hash_allocate_default(void)
{
unsigned int threads, buckets, current_buckets = 0;
struct futex_private_hash *fph;
if (!current->mm)
return 0;
scoped_guard(rcu) {
threads = min_t(unsigned int,
get_nr_threads(current),
num_online_cpus());
fph = rcu_dereference(current->mm->futex_phash);
if (fph) {
if (fph->custom)
return 0;
current_buckets = fph->hash_mask + 1;
}
}
/*
* The default allocation will remain within
* 16 <= threads * 4 <= global hash size
*/
buckets = roundup_pow_of_two(4 * threads);
buckets = clamp(buckets, 16, futex_hashmask + 1);
if (current_buckets >= buckets)
return 0;
return futex_hash_allocate(buckets, 0);
}
static int futex_hash_get_slots(void)
{
struct futex_private_hash *fph;
guard(rcu)();
fph = rcu_dereference(current->mm->futex_phash);
if (fph && fph->hash_mask)
return fph->hash_mask + 1;
return 0;
}
static int futex_hash_get_immutable(void)
{
struct futex_private_hash *fph;
guard(rcu)();
fph = rcu_dereference(current->mm->futex_phash);
if (fph && fph->immutable)
return 1;
if (fph && !fph->hash_mask)
return 1;
return 0;
}
#else
static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags)
{
return -EINVAL;
}
static int futex_hash_get_slots(void)
{
return 0;
}
static int futex_hash_get_immutable(void)
{
return 0;
}
#endif
int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4)
{
unsigned int flags = FH_CUSTOM;
int ret;
switch (arg2) {
case PR_FUTEX_HASH_SET_SLOTS:
if (arg4 & ~FH_FLAG_IMMUTABLE)
return -EINVAL;
if (arg4 & FH_FLAG_IMMUTABLE)
flags |= FH_IMMUTABLE;
ret = futex_hash_allocate(arg3, flags);
break;
case PR_FUTEX_HASH_GET_SLOTS:
ret = futex_hash_get_slots();
break;
case PR_FUTEX_HASH_GET_IMMUTABLE:
ret = futex_hash_get_immutable();
break;
default:
ret = -EINVAL;
break;
}
return ret;
}
static int __init futex_init(void) static int __init futex_init(void)
{ {
unsigned long hashsize, i; unsigned long hashsize, i;
unsigned int futex_shift; unsigned int order, n;
unsigned long size;
#ifdef CONFIG_BASE_SMALL #ifdef CONFIG_BASE_SMALL
hashsize = 16; hashsize = 16;
#else #else
hashsize = roundup_pow_of_two(256 * num_possible_cpus()); hashsize = 256 * num_possible_cpus();
hashsize /= num_possible_nodes();
hashsize = max(4, hashsize);
hashsize = roundup_pow_of_two(hashsize);
#endif #endif
futex_hashshift = ilog2(hashsize);
size = sizeof(struct futex_hash_bucket) * hashsize;
order = get_order(size);
futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues), for_each_node(n) {
hashsize, 0, 0, struct futex_hash_bucket *table;
&futex_shift, NULL,
hashsize, hashsize);
hashsize = 1UL << futex_shift;
for (i = 0; i < hashsize; i++) { if (order > MAX_PAGE_ORDER)
atomic_set(&futex_queues[i].waiters, 0); table = vmalloc_huge_node(size, GFP_KERNEL, n);
plist_head_init(&futex_queues[i].chain); else
spin_lock_init(&futex_queues[i].lock); table = alloc_pages_exact_nid(n, size, GFP_KERNEL);
BUG_ON(!table);
for (i = 0; i < hashsize; i++)
futex_hash_bucket_init(&table[i], NULL);
futex_queues[n] = table;
} }
futex_hashmask = hashsize - 1; futex_hashmask = hashsize - 1;
pr_info("futex hash table entries: %lu (%lu bytes on %d NUMA nodes, total %lu KiB, %s).\n",
hashsize, size, num_possible_nodes(), size * num_possible_nodes() / 1024,
order > MAX_PAGE_ORDER ? "vmalloc" : "linear");
return 0; return 0;
} }
core_initcall(futex_init); core_initcall(futex_init);
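To make the default sizing rule in futex_hash_allocate_default() above concrete, a small userspace sketch that reproduces the arithmetic (the global hash size of 8192 is only an assumed example; the real upper bound is futex_hashmask + 1 as computed in futex_init()):

#include <stdio.h>

/* Toy stand-in for the kernel's roundup_pow_of_two(), not the real macro. */
static unsigned int roundup_pow2(unsigned int v)
{
	unsigned int r = 1;

	while (r < v)
		r <<= 1;
	return r;
}

int main(void)
{
	unsigned int global = 8192;	/* assumed global hash size */

	/* "threads" is already min(get_nr_threads(), num_online_cpus()). */
	for (unsigned int threads = 1; threads <= 4096; threads *= 4) {
		unsigned int buckets = roundup_pow2(4 * threads);

		if (buckets < 16)
			buckets = 16;
		if (buckets > global)
			buckets = global;
		printf("%4u threads -> %4u buckets\n", threads, buckets);
	}
	return 0;
}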

View File

@@ -7,6 +7,7 @@
#include <linux/sched/wake_q.h> #include <linux/sched/wake_q.h>
#include <linux/compat.h> #include <linux/compat.h>
#include <linux/uaccess.h> #include <linux/uaccess.h>
#include <linux/cleanup.h>
#ifdef CONFIG_PREEMPT_RT #ifdef CONFIG_PREEMPT_RT
#include <linux/rcuwait.h> #include <linux/rcuwait.h>
@@ -38,6 +39,7 @@
#define FLAGS_HAS_TIMEOUT 0x0040 #define FLAGS_HAS_TIMEOUT 0x0040
#define FLAGS_NUMA 0x0080 #define FLAGS_NUMA 0x0080
#define FLAGS_STRICT 0x0100 #define FLAGS_STRICT 0x0100
#define FLAGS_MPOL 0x0200
/* FUTEX_ to FLAGS_ */ /* FUTEX_ to FLAGS_ */
static inline unsigned int futex_to_flags(unsigned int op) static inline unsigned int futex_to_flags(unsigned int op)
@@ -53,7 +55,7 @@ static inline unsigned int futex_to_flags(unsigned int op)
return flags; return flags;
} }
#define FUTEX2_VALID_MASK (FUTEX2_SIZE_MASK | FUTEX2_PRIVATE) #define FUTEX2_VALID_MASK (FUTEX2_SIZE_MASK | FUTEX2_NUMA | FUTEX2_MPOL | FUTEX2_PRIVATE)
/* FUTEX2_ to FLAGS_ */ /* FUTEX2_ to FLAGS_ */
static inline unsigned int futex2_to_flags(unsigned int flags2) static inline unsigned int futex2_to_flags(unsigned int flags2)
@@ -66,6 +68,9 @@ static inline unsigned int futex2_to_flags(unsigned int flags2)
if (flags2 & FUTEX2_NUMA) if (flags2 & FUTEX2_NUMA)
flags |= FLAGS_NUMA; flags |= FLAGS_NUMA;
if (flags2 & FUTEX2_MPOL)
flags |= FLAGS_MPOL;
return flags; return flags;
} }
@@ -86,6 +91,19 @@ static inline bool futex_flags_valid(unsigned int flags)
if ((flags & FLAGS_SIZE_MASK) != FLAGS_SIZE_32) if ((flags & FLAGS_SIZE_MASK) != FLAGS_SIZE_32)
return false; return false;
/*
* Must be able to represent both FUTEX_NO_NODE and every valid nodeid
* in a futex word.
*/
if (flags & FLAGS_NUMA) {
int bits = 8 * futex_size(flags);
u64 max = ~0ULL;
max >>= 64 - bits;
if (nr_node_ids >= max)
return false;
}
return true; return true;
} }
@@ -117,6 +135,7 @@ struct futex_hash_bucket {
atomic_t waiters; atomic_t waiters;
spinlock_t lock; spinlock_t lock;
struct plist_head chain; struct plist_head chain;
struct futex_private_hash *priv;
} ____cacheline_aligned_in_smp; } ____cacheline_aligned_in_smp;
/* /*
@@ -156,6 +175,7 @@ typedef void (futex_wake_fn)(struct wake_q_head *wake_q, struct futex_q *q);
* @requeue_pi_key: the requeue_pi target futex key * @requeue_pi_key: the requeue_pi target futex key
* @bitset: bitset for the optional bitmasked wakeup * @bitset: bitset for the optional bitmasked wakeup
* @requeue_state: State field for futex_requeue_pi() * @requeue_state: State field for futex_requeue_pi()
* @drop_hb_ref: Waiter should drop the extra hash bucket reference if true
* @requeue_wait: RCU wait for futex_requeue_pi() (RT only) * @requeue_wait: RCU wait for futex_requeue_pi() (RT only)
* *
* We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so
@@ -182,6 +202,7 @@ struct futex_q {
union futex_key *requeue_pi_key; union futex_key *requeue_pi_key;
u32 bitset; u32 bitset;
atomic_t requeue_state; atomic_t requeue_state;
bool drop_hb_ref;
#ifdef CONFIG_PREEMPT_RT #ifdef CONFIG_PREEMPT_RT
struct rcuwait requeue_wait; struct rcuwait requeue_wait;
#endif #endif
@@ -196,12 +217,35 @@ enum futex_access {
extern int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key, extern int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key,
enum futex_access rw); enum futex_access rw);
extern void futex_q_lockptr_lock(struct futex_q *q);
extern struct hrtimer_sleeper * extern struct hrtimer_sleeper *
futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
int flags, u64 range_ns); int flags, u64 range_ns);
extern struct futex_hash_bucket *futex_hash(union futex_key *key); extern struct futex_hash_bucket *futex_hash(union futex_key *key);
#ifdef CONFIG_FUTEX_PRIVATE_HASH
extern void futex_hash_get(struct futex_hash_bucket *hb);
extern void futex_hash_put(struct futex_hash_bucket *hb);
extern struct futex_private_hash *futex_private_hash(void);
extern bool futex_private_hash_get(struct futex_private_hash *fph);
extern void futex_private_hash_put(struct futex_private_hash *fph);
#else /* !CONFIG_FUTEX_PRIVATE_HASH */
static inline void futex_hash_get(struct futex_hash_bucket *hb) { }
static inline void futex_hash_put(struct futex_hash_bucket *hb) { }
static inline struct futex_private_hash *futex_private_hash(void) { return NULL; }
static inline bool futex_private_hash_get(void) { return false; }
static inline void futex_private_hash_put(struct futex_private_hash *fph) { }
#endif
DEFINE_CLASS(hb, struct futex_hash_bucket *,
if (_T) futex_hash_put(_T),
futex_hash(key), union futex_key *key);
DEFINE_CLASS(private_hash, struct futex_private_hash *,
if (_T) futex_private_hash_put(_T),
futex_private_hash(), void);
/** /**
* futex_match - Check whether two futex keys are equal * futex_match - Check whether two futex keys are equal
@@ -219,9 +263,9 @@ static inline int futex_match(union futex_key *key1, union futex_key *key2)
} }
extern int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, extern int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
struct futex_q *q, struct futex_hash_bucket **hb); struct futex_q *q, union futex_key *key2,
extern void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q, struct task_struct *task);
struct hrtimer_sleeper *timeout); extern void futex_do_wait(struct futex_q *q, struct hrtimer_sleeper *timeout);
extern bool __futex_wake_mark(struct futex_q *q); extern bool __futex_wake_mark(struct futex_q *q);
extern void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q); extern void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q);
@@ -256,7 +300,7 @@ static inline int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32
* This looks a bit overkill, but generally just results in a couple * This looks a bit overkill, but generally just results in a couple
* of instructions. * of instructions.
*/ */
static __always_inline int futex_read_inatomic(u32 *dest, u32 __user *from) static __always_inline int futex_get_value(u32 *dest, u32 __user *from)
{ {
u32 val; u32 val;
@@ -273,12 +317,26 @@ Efault:
return -EFAULT; return -EFAULT;
} }
static __always_inline int futex_put_value(u32 val, u32 __user *to)
{
if (can_do_masked_user_access())
to = masked_user_access_begin(to);
else if (!user_read_access_begin(to, sizeof(*to)))
return -EFAULT;
unsafe_put_user(val, to, Efault);
user_read_access_end();
return 0;
Efault:
user_read_access_end();
return -EFAULT;
}
static inline int futex_get_value_locked(u32 *dest, u32 __user *from) static inline int futex_get_value_locked(u32 *dest, u32 __user *from)
{ {
int ret; int ret;
pagefault_disable(); pagefault_disable();
ret = futex_read_inatomic(dest, from); ret = futex_get_value(dest, from);
pagefault_enable(); pagefault_enable();
return ret; return ret;
@@ -354,7 +412,7 @@ static inline int futex_hb_waiters_pending(struct futex_hash_bucket *hb)
#endif #endif
} }
extern struct futex_hash_bucket *futex_q_lock(struct futex_q *q); extern void futex_q_lock(struct futex_q *q, struct futex_hash_bucket *hb);
extern void futex_q_unlock(struct futex_hash_bucket *hb); extern void futex_q_unlock(struct futex_hash_bucket *hb);
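The two DEFINE_CLASS() helpers above provide scope-based bucket references; a minimal, hypothetical kernel-internal sketch of how the hb class reads (the function is made up, the pattern matches the CLASS(hb, hb)(&key) uses in the PI and requeue code below):

static void frob_bucket(union futex_key *key)
{
	CLASS(hb, hb)(key);	/* hb = futex_hash(key), reference held */

	spin_lock(&hb->lock);
	/* ... operate on the bucket ... */
	spin_unlock(&hb->lock);
}				/* futex_hash_put(hb) when the scope ends */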

View File

@@ -806,7 +806,7 @@ handle_err:
break; break;
} }
spin_lock(q->lock_ptr); futex_q_lockptr_lock(q);
raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
/* /*
@@ -920,7 +920,6 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl
struct hrtimer_sleeper timeout, *to; struct hrtimer_sleeper timeout, *to;
struct task_struct *exiting = NULL; struct task_struct *exiting = NULL;
struct rt_mutex_waiter rt_waiter; struct rt_mutex_waiter rt_waiter;
struct futex_hash_bucket *hb;
struct futex_q q = futex_q_init; struct futex_q q = futex_q_init;
DEFINE_WAKE_Q(wake_q); DEFINE_WAKE_Q(wake_q);
int res, ret; int res, ret;
@@ -939,151 +938,183 @@ retry:
goto out; goto out;
retry_private: retry_private:
hb = futex_q_lock(&q); if (1) {
CLASS(hb, hb)(&q.key);
ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, futex_q_lock(&q, hb);
&exiting, 0);
if (unlikely(ret)) { ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
/* &exiting, 0);
* Atomic work succeeded and we got the lock, if (unlikely(ret)) {
* or failed. Either way, we do _not_ block.
*/
switch (ret) {
case 1:
/* We got the lock. */
ret = 0;
goto out_unlock_put_key;
case -EFAULT:
goto uaddr_faulted;
case -EBUSY:
case -EAGAIN:
/* /*
* Two reasons for this: * Atomic work succeeded and we got the lock,
* - EBUSY: Task is exiting and we just wait for the * or failed. Either way, we do _not_ block.
* exit to complete.
* - EAGAIN: The user space value changed.
*/ */
futex_q_unlock(hb); switch (ret) {
/* case 1:
* Handle the case where the owner is in the middle of /* We got the lock. */
* exiting. Wait for the exit to complete otherwise ret = 0;
* this task might loop forever, aka. live lock. goto out_unlock_put_key;
*/ case -EFAULT:
wait_for_owner_exiting(ret, exiting); goto uaddr_faulted;
cond_resched(); case -EBUSY:
goto retry; case -EAGAIN:
default: /*
goto out_unlock_put_key; * Two reasons for this:
* - EBUSY: Task is exiting and we just wait for the
* exit to complete.
* - EAGAIN: The user space value changed.
*/
futex_q_unlock(hb);
/*
* Handle the case where the owner is in the middle of
* exiting. Wait for the exit to complete otherwise
* this task might loop forever, aka. live lock.
*/
wait_for_owner_exiting(ret, exiting);
cond_resched();
goto retry;
default:
goto out_unlock_put_key;
}
} }
}
WARN_ON(!q.pi_state); WARN_ON(!q.pi_state);
/* /*
* Only actually queue now that the atomic ops are done: * Only actually queue now that the atomic ops are done:
*/ */
__futex_queue(&q, hb, current); __futex_queue(&q, hb, current);
if (trylock) { if (trylock) {
ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex); ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
/* Fixup the trylock return value: */ /* Fixup the trylock return value: */
ret = ret ? 0 : -EWOULDBLOCK; ret = ret ? 0 : -EWOULDBLOCK;
goto no_block; goto no_block;
} }
/* /*
* Must be done before we enqueue the waiter, here is unfortunately * Caution; releasing @hb in-scope. The hb->lock is still locked
* under the hb lock, but that *should* work because it does nothing. * while the reference is dropped. The reference can not be dropped
*/ * after the unlock because if a user initiated resize is in progress
rt_mutex_pre_schedule(); * then we might need to wake him. This can not be done after the
* rt_mutex_pre_schedule() invocation. The hb will remain valid because
* the thread, performing resize, will block on hb->lock during
* the requeue.
*/
futex_hash_put(no_free_ptr(hb));
/*
* Must be done before we enqueue the waiter, here is unfortunately
* under the hb lock, but that *should* work because it does nothing.
*/
rt_mutex_pre_schedule();
rt_mutex_init_waiter(&rt_waiter); rt_mutex_init_waiter(&rt_waiter);
/* /*
* On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not
* hold it while doing rt_mutex_start_proxy(), because then it will * hold it while doing rt_mutex_start_proxy(), because then it will
* include hb->lock in the blocking chain, even through we'll not in * include hb->lock in the blocking chain, even through we'll not in
* fact hold it while blocking. This will lead it to report -EDEADLK * fact hold it while blocking. This will lead it to report -EDEADLK
* and BUG when futex_unlock_pi() interleaves with this. * and BUG when futex_unlock_pi() interleaves with this.
* *
* Therefore acquire wait_lock while holding hb->lock, but drop the * Therefore acquire wait_lock while holding hb->lock, but drop the
* latter before calling __rt_mutex_start_proxy_lock(). This * latter before calling __rt_mutex_start_proxy_lock(). This
* interleaves with futex_unlock_pi() -- which does a similar lock * interleaves with futex_unlock_pi() -- which does a similar lock
* handoff -- such that the latter can observe the futex_q::pi_state * handoff -- such that the latter can observe the futex_q::pi_state
* before __rt_mutex_start_proxy_lock() is done. * before __rt_mutex_start_proxy_lock() is done.
*/ */
raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock); raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
spin_unlock(q.lock_ptr); spin_unlock(q.lock_ptr);
/* /*
* __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
* such that futex_unlock_pi() is guaranteed to observe the waiter when * such that futex_unlock_pi() is guaranteed to observe the waiter when
* it sees the futex_q::pi_state. * it sees the futex_q::pi_state.
*/ */
ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current, &wake_q); ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current, &wake_q);
raw_spin_unlock_irq_wake(&q.pi_state->pi_mutex.wait_lock, &wake_q); raw_spin_unlock_irq_wake(&q.pi_state->pi_mutex.wait_lock, &wake_q);
if (ret) { if (ret) {
if (ret == 1) if (ret == 1)
ret = 0; ret = 0;
goto cleanup; goto cleanup;
} }
if (unlikely(to)) if (unlikely(to))
hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS); hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);
ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter); ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
cleanup: cleanup:
/* /*
* If we failed to acquire the lock (deadlock/signal/timeout), we must * If we failed to acquire the lock (deadlock/signal/timeout), we must
* must unwind the above, however we canont lock hb->lock because * unwind the above, however we canont lock hb->lock because
* rt_mutex already has a waiter enqueued and hb->lock can itself try * rt_mutex already has a waiter enqueued and hb->lock can itself try
* and enqueue an rt_waiter through rtlock. * and enqueue an rt_waiter through rtlock.
* *
* Doing the cleanup without holding hb->lock can cause inconsistent * Doing the cleanup without holding hb->lock can cause inconsistent
* state between hb and pi_state, but only in the direction of not * state between hb and pi_state, but only in the direction of not
* seeing a waiter that is leaving. * seeing a waiter that is leaving.
* *
* See futex_unlock_pi(), it deals with this inconsistency. * See futex_unlock_pi(), it deals with this inconsistency.
* *
* There be dragons here, since we must deal with the inconsistency on * There be dragons here, since we must deal with the inconsistency on
* the way out (here), it is impossible to detect/warn about the race * the way out (here), it is impossible to detect/warn about the race
* the other way around (missing an incoming waiter). * the other way around (missing an incoming waiter).
* *
* What could possibly go wrong... * What could possibly go wrong...
*/ */
if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter)) if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
ret = 0; ret = 0;
/* /*
* Now that the rt_waiter has been dequeued, it is safe to use * Now that the rt_waiter has been dequeued, it is safe to use
* spinlock/rtlock (which might enqueue its own rt_waiter) and fix up * spinlock/rtlock (which might enqueue its own rt_waiter) and fix up
* the * the
*/ */
spin_lock(q.lock_ptr); futex_q_lockptr_lock(&q);
/* /*
* Waiter is unqueued. * Waiter is unqueued.
*/ */
rt_mutex_post_schedule(); rt_mutex_post_schedule();
no_block: no_block:
/* /*
* Fixup the pi_state owner and possibly acquire the lock if we * Fixup the pi_state owner and possibly acquire the lock if we
* haven't already. * haven't already.
*/ */
res = fixup_pi_owner(uaddr, &q, !ret); res = fixup_pi_owner(uaddr, &q, !ret);
/* /*
* If fixup_pi_owner() returned an error, propagate that. If it acquired * If fixup_pi_owner() returned an error, propagate that. If it acquired
* the lock, clear our -ETIMEDOUT or -EINTR. * the lock, clear our -ETIMEDOUT or -EINTR.
*/ */
if (res) if (res)
ret = (res < 0) ? res : 0; ret = (res < 0) ? res : 0;
futex_unqueue_pi(&q); futex_unqueue_pi(&q);
spin_unlock(q.lock_ptr); spin_unlock(q.lock_ptr);
goto out; if (q.drop_hb_ref) {
CLASS(hb, hb)(&q.key);
/* Additional reference from futex_unlock_pi() */
futex_hash_put(hb);
}
goto out;
out_unlock_put_key: out_unlock_put_key:
futex_q_unlock(hb); futex_q_unlock(hb);
goto out;
uaddr_faulted:
futex_q_unlock(hb);
ret = fault_in_user_writeable(uaddr);
if (ret)
goto out;
if (!(flags & FLAGS_SHARED))
goto retry_private;
goto retry;
}
out: out:
if (to) { if (to) {
@@ -1091,18 +1122,6 @@ out:
destroy_hrtimer_on_stack(&to->timer); destroy_hrtimer_on_stack(&to->timer);
} }
return ret != -EINTR ? ret : -ERESTARTNOINTR; return ret != -EINTR ? ret : -ERESTARTNOINTR;
uaddr_faulted:
futex_q_unlock(hb);
ret = fault_in_user_writeable(uaddr);
if (ret)
goto out;
if (!(flags & FLAGS_SHARED))
goto retry_private;
goto retry;
} }
/* /*
@@ -1114,7 +1133,6 @@ int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
{ {
u32 curval, uval, vpid = task_pid_vnr(current); u32 curval, uval, vpid = task_pid_vnr(current);
union futex_key key = FUTEX_KEY_INIT; union futex_key key = FUTEX_KEY_INIT;
struct futex_hash_bucket *hb;
struct futex_q *top_waiter; struct futex_q *top_waiter;
int ret; int ret;
@@ -1134,7 +1152,7 @@ retry:
if (ret) if (ret)
return ret; return ret;
hb = futex_hash(&key); CLASS(hb, hb)(&key);
spin_lock(&hb->lock); spin_lock(&hb->lock);
retry_hb: retry_hb:
@@ -1187,6 +1205,12 @@ retry_hb:
*/ */
rt_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex); rt_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
if (!rt_waiter) { if (!rt_waiter) {
/*
* Acquire a reference for the leaving waiter to ensure
* valid futex_q::lock_ptr.
*/
futex_hash_get(hb);
top_waiter->drop_hb_ref = true;
__futex_unqueue(top_waiter); __futex_unqueue(top_waiter);
raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
goto retry_hb; goto retry_hb;

View File

@@ -87,6 +87,11 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
futex_hb_waiters_inc(hb2); futex_hb_waiters_inc(hb2);
plist_add(&q->list, &hb2->chain); plist_add(&q->list, &hb2->chain);
q->lock_ptr = &hb2->lock; q->lock_ptr = &hb2->lock;
/*
* hb1 and hb2 belong to the same futex_hash_bucket_private
* because if we managed get a reference on hb1 then it can't be
* replaced. Therefore we avoid put(hb1)+get(hb2) here.
*/
} }
q->key = *key2; q->key = *key2;
} }
@@ -231,7 +236,12 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
WARN_ON(!q->rt_waiter); WARN_ON(!q->rt_waiter);
q->rt_waiter = NULL; q->rt_waiter = NULL;
/*
* Acquire a reference for the waiter to ensure valid
* futex_q::lock_ptr.
*/
futex_hash_get(hb);
q->drop_hb_ref = true;
q->lock_ptr = &hb->lock; q->lock_ptr = &hb->lock;
/* Signal locked state to the waiter */ /* Signal locked state to the waiter */
@@ -371,7 +381,6 @@ int futex_requeue(u32 __user *uaddr1, unsigned int flags1,
union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
int task_count = 0, ret; int task_count = 0, ret;
struct futex_pi_state *pi_state = NULL; struct futex_pi_state *pi_state = NULL;
struct futex_hash_bucket *hb1, *hb2;
struct futex_q *this, *next; struct futex_q *this, *next;
DEFINE_WAKE_Q(wake_q); DEFINE_WAKE_Q(wake_q);
@@ -443,240 +452,242 @@ retry:
if (requeue_pi && futex_match(&key1, &key2)) if (requeue_pi && futex_match(&key1, &key2))
return -EINVAL; return -EINVAL;
hb1 = futex_hash(&key1);
hb2 = futex_hash(&key2);
retry_private: retry_private:
futex_hb_waiters_inc(hb2); if (1) {
double_lock_hb(hb1, hb2); CLASS(hb, hb1)(&key1);
CLASS(hb, hb2)(&key2);
if (likely(cmpval != NULL)) { futex_hb_waiters_inc(hb2);
u32 curval; double_lock_hb(hb1, hb2);
ret = futex_get_value_locked(&curval, uaddr1); if (likely(cmpval != NULL)) {
u32 curval;
if (unlikely(ret)) { ret = futex_get_value_locked(&curval, uaddr1);
double_unlock_hb(hb1, hb2);
futex_hb_waiters_dec(hb2);
ret = get_user(curval, uaddr1); if (unlikely(ret)) {
if (ret) futex_hb_waiters_dec(hb2);
return ret; double_unlock_hb(hb1, hb2);
if (!(flags1 & FLAGS_SHARED)) ret = get_user(curval, uaddr1);
goto retry_private; if (ret)
return ret;
goto retry; if (!(flags1 & FLAGS_SHARED))
} goto retry_private;
if (curval != *cmpval) {
ret = -EAGAIN;
goto out_unlock;
}
}
if (requeue_pi) {
struct task_struct *exiting = NULL;
/*
* Attempt to acquire uaddr2 and wake the top waiter. If we
* intend to requeue waiters, force setting the FUTEX_WAITERS
* bit. We force this here where we are able to easily handle
* faults rather in the requeue loop below.
*
* Updates topwaiter::requeue_state if a top waiter exists.
*/
ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
&key2, &pi_state,
&exiting, nr_requeue);
/*
* At this point the top_waiter has either taken uaddr2 or
* is waiting on it. In both cases pi_state has been
* established and an initial refcount on it. In case of an
* error there's nothing.
*
* The top waiter's requeue_state is up to date:
*
* - If the lock was acquired atomically (ret == 1), then
* the state is Q_REQUEUE_PI_LOCKED.
*
* The top waiter has been dequeued and woken up and can
* return to user space immediately. The kernel/user
* space state is consistent. In case that there must be
* more waiters requeued the WAITERS bit in the user
* space futex is set so the top waiter task has to go
* into the syscall slowpath to unlock the futex. This
* will block until this requeue operation has been
* completed and the hash bucket locks have been
* dropped.
*
* - If the trylock failed with an error (ret < 0) then
* the state is either Q_REQUEUE_PI_NONE, i.e. "nothing
* happened", or Q_REQUEUE_PI_IGNORE when there was an
* interleaved early wakeup.
*
* - If the trylock did not succeed (ret == 0) then the
* state is either Q_REQUEUE_PI_IN_PROGRESS or
* Q_REQUEUE_PI_WAIT if an early wakeup interleaved.
* This will be cleaned up in the loop below, which
* cannot fail because futex_proxy_trylock_atomic() did
* the same sanity checks for requeue_pi as the loop
* below does.
*/
switch (ret) {
case 0:
/* We hold a reference on the pi state. */
break;
case 1:
/*
* futex_proxy_trylock_atomic() acquired the user space
* futex. Adjust task_count.
*/
task_count++;
ret = 0;
break;
/*
* If the above failed, then pi_state is NULL and
* waiter::requeue_state is correct.
*/
case -EFAULT:
double_unlock_hb(hb1, hb2);
futex_hb_waiters_dec(hb2);
ret = fault_in_user_writeable(uaddr2);
if (!ret)
goto retry; goto retry;
return ret; }
case -EBUSY: if (curval != *cmpval) {
case -EAGAIN: ret = -EAGAIN;
/* goto out_unlock;
* Two reasons for this: }
* - EBUSY: Owner is exiting and we just wait for the
* exit to complete.
* - EAGAIN: The user space value changed.
*/
double_unlock_hb(hb1, hb2);
futex_hb_waiters_dec(hb2);
/*
* Handle the case where the owner is in the middle of
* exiting. Wait for the exit to complete otherwise
* this task might loop forever, aka. live lock.
*/
wait_for_owner_exiting(ret, exiting);
cond_resched();
goto retry;
default:
goto out_unlock;
}
}
plist_for_each_entry_safe(this, next, &hb1->chain, list) {
if (task_count - nr_wake >= nr_requeue)
break;
if (!futex_match(&this->key, &key1))
continue;
/*
* FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always
* be paired with each other and no other futex ops.
*
* We should never be requeueing a futex_q with a pi_state,
* which is awaiting a futex_unlock_pi().
*/
if ((requeue_pi && !this->rt_waiter) ||
(!requeue_pi && this->rt_waiter) ||
this->pi_state) {
ret = -EINVAL;
break;
} }
/* Plain futexes just wake or requeue and are done */ if (requeue_pi) {
if (!requeue_pi) { struct task_struct *exiting = NULL;
if (++task_count <= nr_wake)
this->wake(&wake_q, this); /*
else * Attempt to acquire uaddr2 and wake the top waiter. If we
* intend to requeue waiters, force setting the FUTEX_WAITERS
* bit. We force this here where we are able to easily handle
* faults rather in the requeue loop below.
*
* Updates topwaiter::requeue_state if a top waiter exists.
*/
ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
&key2, &pi_state,
&exiting, nr_requeue);
/*
* At this point the top_waiter has either taken uaddr2 or
* is waiting on it. In both cases pi_state has been
* established and an initial refcount on it. In case of an
* error there's nothing.
*
* The top waiter's requeue_state is up to date:
*
* - If the lock was acquired atomically (ret == 1), then
* the state is Q_REQUEUE_PI_LOCKED.
*
* The top waiter has been dequeued and woken up and can
* return to user space immediately. The kernel/user
* space state is consistent. In case that there must be
* more waiters requeued the WAITERS bit in the user
* space futex is set so the top waiter task has to go
* into the syscall slowpath to unlock the futex. This
* will block until this requeue operation has been
* completed and the hash bucket locks have been
* dropped.
*
* - If the trylock failed with an error (ret < 0) then
* the state is either Q_REQUEUE_PI_NONE, i.e. "nothing
* happened", or Q_REQUEUE_PI_IGNORE when there was an
* interleaved early wakeup.
*
* - If the trylock did not succeed (ret == 0) then the
* state is either Q_REQUEUE_PI_IN_PROGRESS or
* Q_REQUEUE_PI_WAIT if an early wakeup interleaved.
* This will be cleaned up in the loop below, which
* cannot fail because futex_proxy_trylock_atomic() did
* the same sanity checks for requeue_pi as the loop
* below does.
*/
switch (ret) {
case 0:
/* We hold a reference on the pi state. */
break;
case 1:
/*
* futex_proxy_trylock_atomic() acquired the user space
* futex. Adjust task_count.
*/
task_count++;
ret = 0;
break;
/*
* If the above failed, then pi_state is NULL and
* waiter::requeue_state is correct.
*/
case -EFAULT:
futex_hb_waiters_dec(hb2);
double_unlock_hb(hb1, hb2);
ret = fault_in_user_writeable(uaddr2);
if (!ret)
goto retry;
return ret;
case -EBUSY:
case -EAGAIN:
/*
* Two reasons for this:
* - EBUSY: Owner is exiting and we just wait for the
* exit to complete.
* - EAGAIN: The user space value changed.
*/
futex_hb_waiters_dec(hb2);
double_unlock_hb(hb1, hb2);
/*
* Handle the case where the owner is in the middle of
* exiting. Wait for the exit to complete otherwise
* this task might loop forever, aka. live lock.
*/
wait_for_owner_exiting(ret, exiting);
cond_resched();
goto retry;
default:
goto out_unlock;
}
}
plist_for_each_entry_safe(this, next, &hb1->chain, list) {
if (task_count - nr_wake >= nr_requeue)
break;
if (!futex_match(&this->key, &key1))
continue;
/*
* FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always
* be paired with each other and no other futex ops.
*
* We should never be requeueing a futex_q with a pi_state,
* which is awaiting a futex_unlock_pi().
*/
if ((requeue_pi && !this->rt_waiter) ||
(!requeue_pi && this->rt_waiter) ||
this->pi_state) {
ret = -EINVAL;
break;
}
/* Plain futexes just wake or requeue and are done */
if (!requeue_pi) {
if (++task_count <= nr_wake)
this->wake(&wake_q, this);
else
requeue_futex(this, hb1, hb2, &key2);
continue;
}
/* Ensure we requeue to the expected futex for requeue_pi. */
if (!futex_match(this->requeue_pi_key, &key2)) {
ret = -EINVAL;
break;
}
/*
* Requeue nr_requeue waiters and possibly one more in the case
* of requeue_pi if we couldn't acquire the lock atomically.
*
* Prepare the waiter to take the rt_mutex. Take a refcount
* on the pi_state and store the pointer in the futex_q
* object of the waiter.
*/
get_pi_state(pi_state);
/* Don't requeue when the waiter is already on the way out. */
if (!futex_requeue_pi_prepare(this, pi_state)) {
/*
* Early woken waiter signaled that it is on the
* way out. Drop the pi_state reference and try the
* next waiter. @this->pi_state is still NULL.
*/
put_pi_state(pi_state);
continue;
}
ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
this->rt_waiter,
this->task);
if (ret == 1) {
/*
* We got the lock. We do neither drop the refcount
* on pi_state nor clear this->pi_state because the
* waiter needs the pi_state for cleaning up the
* user space value. It will drop the refcount
* after doing so. this::requeue_state is updated
* in the wakeup as well.
*/
requeue_pi_wake_futex(this, &key2, hb2);
task_count++;
} else if (!ret) {
/* Waiter is queued, move it to hb2 */
requeue_futex(this, hb1, hb2, &key2); requeue_futex(this, hb1, hb2, &key2);
continue; futex_requeue_pi_complete(this, 0);
} task_count++;
} else {
/* Ensure we requeue to the expected futex for requeue_pi. */ /*
if (!futex_match(this->requeue_pi_key, &key2)) { * rt_mutex_start_proxy_lock() detected a potential
ret = -EINVAL; * deadlock when we tried to queue that waiter.
break; * Drop the pi_state reference which we took above
* and remove the pointer to the state from the
* waiters futex_q object.
*/
this->pi_state = NULL;
put_pi_state(pi_state);
futex_requeue_pi_complete(this, ret);
/*
* We stop queueing more waiters and let user space
* deal with the mess.
*/
break;
}
} }
/* /*
* Requeue nr_requeue waiters and possibly one more in the case * We took an extra initial reference to the pi_state in
* of requeue_pi if we couldn't acquire the lock atomically. * futex_proxy_trylock_atomic(). We need to drop it here again.
*
* Prepare the waiter to take the rt_mutex. Take a refcount
* on the pi_state and store the pointer in the futex_q
* object of the waiter.
*/ */
get_pi_state(pi_state); put_pi_state(pi_state);
/* Don't requeue when the waiter is already on the way out. */
if (!futex_requeue_pi_prepare(this, pi_state)) {
/*
* Early woken waiter signaled that it is on the
* way out. Drop the pi_state reference and try the
* next waiter. @this->pi_state is still NULL.
*/
put_pi_state(pi_state);
continue;
}
ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
this->rt_waiter,
this->task);
if (ret == 1) {
/*
* We got the lock. We do neither drop the refcount
* on pi_state nor clear this->pi_state because the
* waiter needs the pi_state for cleaning up the
* user space value. It will drop the refcount
* after doing so. this::requeue_state is updated
* in the wakeup as well.
*/
requeue_pi_wake_futex(this, &key2, hb2);
task_count++;
} else if (!ret) {
/* Waiter is queued, move it to hb2 */
requeue_futex(this, hb1, hb2, &key2);
futex_requeue_pi_complete(this, 0);
task_count++;
} else {
/*
* rt_mutex_start_proxy_lock() detected a potential
* deadlock when we tried to queue that waiter.
* Drop the pi_state reference which we took above
* and remove the pointer to the state from the
* waiters futex_q object.
*/
this->pi_state = NULL;
put_pi_state(pi_state);
futex_requeue_pi_complete(this, ret);
/*
* We stop queueing more waiters and let user space
* deal with the mess.
*/
break;
}
}
/*
* We took an extra initial reference to the pi_state in
* futex_proxy_trylock_atomic(). We need to drop it here again.
*/
put_pi_state(pi_state);
out_unlock: out_unlock:
double_unlock_hb(hb1, hb2); futex_hb_waiters_dec(hb2);
double_unlock_hb(hb1, hb2);
}
wake_up_q(&wake_q); wake_up_q(&wake_q);
futex_hb_waiters_dec(hb2);
return ret ? ret : task_count; return ret ? ret : task_count;
} }
@@ -769,7 +780,6 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
{ {
struct hrtimer_sleeper timeout, *to; struct hrtimer_sleeper timeout, *to;
struct rt_mutex_waiter rt_waiter; struct rt_mutex_waiter rt_waiter;
struct futex_hash_bucket *hb;
union futex_key key2 = FUTEX_KEY_INIT; union futex_key key2 = FUTEX_KEY_INIT;
struct futex_q q = futex_q_init; struct futex_q q = futex_q_init;
struct rt_mutex_base *pi_mutex; struct rt_mutex_base *pi_mutex;
@@ -805,35 +815,28 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
* Prepare to wait on uaddr. On success, it holds hb->lock and q * Prepare to wait on uaddr. On success, it holds hb->lock and q
* is initialized. * is initialized.
*/ */
ret = futex_wait_setup(uaddr, val, flags, &q, &hb); ret = futex_wait_setup(uaddr, val, flags, &q, &key2, current);
if (ret) if (ret)
goto out; goto out;
/*
* The check above which compares uaddrs is not sufficient for
* shared futexes. We need to compare the keys:
*/
if (futex_match(&q.key, &key2)) {
futex_q_unlock(hb);
ret = -EINVAL;
goto out;
}
/* Queue the futex_q, drop the hb lock, wait for wakeup. */ /* Queue the futex_q, drop the hb lock, wait for wakeup. */
futex_wait_queue(hb, &q, to); futex_do_wait(&q, to);
switch (futex_requeue_pi_wakeup_sync(&q)) { switch (futex_requeue_pi_wakeup_sync(&q)) {
case Q_REQUEUE_PI_IGNORE: case Q_REQUEUE_PI_IGNORE:
/* The waiter is still on uaddr1 */ {
spin_lock(&hb->lock); CLASS(hb, hb)(&q.key);
ret = handle_early_requeue_pi_wakeup(hb, &q, to); /* The waiter is still on uaddr1 */
spin_unlock(&hb->lock); spin_lock(&hb->lock);
ret = handle_early_requeue_pi_wakeup(hb, &q, to);
spin_unlock(&hb->lock);
}
break; break;
case Q_REQUEUE_PI_LOCKED: case Q_REQUEUE_PI_LOCKED:
/* The requeue acquired the lock */ /* The requeue acquired the lock */
if (q.pi_state && (q.pi_state->owner != current)) { if (q.pi_state && (q.pi_state->owner != current)) {
spin_lock(q.lock_ptr); futex_q_lockptr_lock(&q);
ret = fixup_pi_owner(uaddr2, &q, true); ret = fixup_pi_owner(uaddr2, &q, true);
/* /*
* Drop the reference to the pi state which the * Drop the reference to the pi state which the
@@ -860,7 +863,7 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter)) if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
ret = 0; ret = 0;
spin_lock(q.lock_ptr); futex_q_lockptr_lock(&q);
debug_rt_mutex_free_waiter(&rt_waiter); debug_rt_mutex_free_waiter(&rt_waiter);
/* /*
* Fixup the pi_state owner and possibly acquire the lock if we * Fixup the pi_state owner and possibly acquire the lock if we
@@ -892,6 +895,11 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
default: default:
BUG(); BUG();
} }
if (q.drop_hb_ref) {
CLASS(hb, hb)(&q.key);
/* Additional reference from requeue_pi_wake_futex() */
futex_hash_put(hb);
}
out: out:
if (to) { if (to) {

View File

@@ -154,7 +154,6 @@ void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q)
*/ */
int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
{ {
struct futex_hash_bucket *hb;
struct futex_q *this, *next; struct futex_q *this, *next;
union futex_key key = FUTEX_KEY_INIT; union futex_key key = FUTEX_KEY_INIT;
DEFINE_WAKE_Q(wake_q); DEFINE_WAKE_Q(wake_q);
@@ -170,7 +169,7 @@ int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
if ((flags & FLAGS_STRICT) && !nr_wake) if ((flags & FLAGS_STRICT) && !nr_wake)
return 0; return 0;
hb = futex_hash(&key); CLASS(hb, hb)(&key);
/* Make sure we really have tasks to wakeup */ /* Make sure we really have tasks to wakeup */
if (!futex_hb_waiters_pending(hb)) if (!futex_hb_waiters_pending(hb))
@@ -253,7 +252,6 @@ int futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
int nr_wake, int nr_wake2, int op) int nr_wake, int nr_wake2, int op)
{ {
union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
struct futex_hash_bucket *hb1, *hb2;
struct futex_q *this, *next; struct futex_q *this, *next;
int ret, op_ret; int ret, op_ret;
DEFINE_WAKE_Q(wake_q); DEFINE_WAKE_Q(wake_q);
@@ -266,67 +264,69 @@ retry:
if (unlikely(ret != 0)) if (unlikely(ret != 0))
return ret; return ret;
hb1 = futex_hash(&key1);
hb2 = futex_hash(&key2);
retry_private: retry_private:
double_lock_hb(hb1, hb2); if (1) {
op_ret = futex_atomic_op_inuser(op, uaddr2); CLASS(hb, hb1)(&key1);
if (unlikely(op_ret < 0)) { CLASS(hb, hb2)(&key2);
double_unlock_hb(hb1, hb2);
if (!IS_ENABLED(CONFIG_MMU) || double_lock_hb(hb1, hb2);
unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) { op_ret = futex_atomic_op_inuser(op, uaddr2);
/* if (unlikely(op_ret < 0)) {
* we don't get EFAULT from MMU faults if we don't have double_unlock_hb(hb1, hb2);
* an MMU, but we might get them from range checking
*/
ret = op_ret;
return ret;
}
if (op_ret == -EFAULT) { if (!IS_ENABLED(CONFIG_MMU) ||
ret = fault_in_user_writeable(uaddr2); unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) {
if (ret) /*
* we don't get EFAULT from MMU faults if we don't have
* an MMU, but we might get them from range checking
*/
ret = op_ret;
return ret; return ret;
}
cond_resched();
if (!(flags & FLAGS_SHARED))
goto retry_private;
goto retry;
}
plist_for_each_entry_safe(this, next, &hb1->chain, list) {
if (futex_match (&this->key, &key1)) {
if (this->pi_state || this->rt_waiter) {
ret = -EINVAL;
goto out_unlock;
} }
this->wake(&wake_q, this);
if (++ret >= nr_wake)
break;
}
}
if (op_ret > 0) { if (op_ret == -EFAULT) {
op_ret = 0; ret = fault_in_user_writeable(uaddr2);
plist_for_each_entry_safe(this, next, &hb2->chain, list) { if (ret)
if (futex_match (&this->key, &key2)) { return ret;
}
cond_resched();
if (!(flags & FLAGS_SHARED))
goto retry_private;
goto retry;
}
plist_for_each_entry_safe(this, next, &hb1->chain, list) {
if (futex_match(&this->key, &key1)) {
if (this->pi_state || this->rt_waiter) { if (this->pi_state || this->rt_waiter) {
ret = -EINVAL; ret = -EINVAL;
goto out_unlock; goto out_unlock;
} }
this->wake(&wake_q, this); this->wake(&wake_q, this);
if (++op_ret >= nr_wake2) if (++ret >= nr_wake)
break; break;
} }
} }
ret += op_ret;
} if (op_ret > 0) {
op_ret = 0;
plist_for_each_entry_safe(this, next, &hb2->chain, list) {
if (futex_match(&this->key, &key2)) {
if (this->pi_state || this->rt_waiter) {
ret = -EINVAL;
goto out_unlock;
}
this->wake(&wake_q, this);
if (++op_ret >= nr_wake2)
break;
}
}
ret += op_ret;
}
out_unlock: out_unlock:
double_unlock_hb(hb1, hb2); double_unlock_hb(hb1, hb2);
}
wake_up_q(&wake_q); wake_up_q(&wake_q);
return ret; return ret;
} }
@@ -334,23 +334,12 @@ out_unlock:
static long futex_wait_restart(struct restart_block *restart); static long futex_wait_restart(struct restart_block *restart);
/** /**
* futex_wait_queue() - futex_queue() and wait for wakeup, timeout, or signal * futex_do_wait() - wait for wakeup, timeout, or signal
* @hb: the futex hash bucket, must be locked by the caller
* @q: the futex_q to queue up on * @q: the futex_q to queue up on
* @timeout: the prepared hrtimer_sleeper, or null for no timeout * @timeout: the prepared hrtimer_sleeper, or null for no timeout
*/ */
void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q, void futex_do_wait(struct futex_q *q, struct hrtimer_sleeper *timeout)
struct hrtimer_sleeper *timeout)
{ {
/*
* The task state is guaranteed to be set before another task can
* wake it. set_current_state() is implemented using smp_store_mb() and
* futex_queue() calls spin_unlock() upon completion, both serializing
* access to the hash list and forcing another memory barrier.
*/
set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
futex_queue(q, hb, current);
/* Arm the timer */ /* Arm the timer */
if (timeout) if (timeout)
hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS); hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS);
@@ -412,11 +401,16 @@ int futex_unqueue_multiple(struct futex_vector *v, int count)
*/ */
int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken) int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken)
{ {
struct futex_hash_bucket *hb;
bool retry = false; bool retry = false;
int ret, i; int ret, i;
u32 uval; u32 uval;
/*
* Make sure to have a reference on the private_hash such that we
* don't block on rehash after changing the task state below.
*/
guard(private_hash)();
/* /*
* Enqueuing multiple futexes is tricky, because we need to enqueue * Enqueuing multiple futexes is tricky, because we need to enqueue
* each futex on the list before dealing with the next one to avoid * each futex on the list before dealing with the next one to avoid
@@ -451,20 +445,24 @@ retry:
struct futex_q *q = &vs[i].q; struct futex_q *q = &vs[i].q;
u32 val = vs[i].w.val; u32 val = vs[i].w.val;
hb = futex_q_lock(q); if (1) {
ret = futex_get_value_locked(&uval, uaddr); CLASS(hb, hb)(&q->key);
if (!ret && uval == val) { futex_q_lock(q, hb);
/* ret = futex_get_value_locked(&uval, uaddr);
* The bucket lock can't be held while dealing with the
* next futex. Queue each futex at this moment so hb can if (!ret && uval == val) {
* be unlocked. /*
*/ * The bucket lock can't be held while dealing with the
futex_queue(q, hb, current); * next futex. Queue each futex at this moment so hb can
continue; * be unlocked.
*/
futex_queue(q, hb, current);
continue;
}
futex_q_unlock(hb);
} }
futex_q_unlock(hb);
__set_current_state(TASK_RUNNING); __set_current_state(TASK_RUNNING);
/* /*
@@ -578,7 +576,8 @@ int futex_wait_multiple(struct futex_vector *vs, unsigned int count,
* @val: the expected value * @val: the expected value
* @flags: futex flags (FLAGS_SHARED, etc.) * @flags: futex flags (FLAGS_SHARED, etc.)
* @q: the associated futex_q * @q: the associated futex_q
* @hb: storage for hash_bucket pointer to be returned to caller * @key2: the second futex_key if used for requeue PI
* @task: Task queueing this futex
* *
* Setup the futex_q and locate the hash_bucket. Get the futex value and * Setup the futex_q and locate the hash_bucket. Get the futex value and
* compare it with the expected value. Handle atomic faults internally. * compare it with the expected value. Handle atomic faults internally.
@@ -586,10 +585,12 @@ int futex_wait_multiple(struct futex_vector *vs, unsigned int count,
* *
* Return: * Return:
* - 0 - uaddr contains val and hb has been locked; * - 0 - uaddr contains val and hb has been locked;
* - <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked * - <0 - On error and the hb is unlocked. A possible reason: the uaddr can not
* be read, does not contain the expected value or is not properly aligned.
*/ */
int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
struct futex_q *q, struct futex_hash_bucket **hb) struct futex_q *q, union futex_key *key2,
struct task_struct *task)
{ {
u32 uval; u32 uval;
int ret; int ret;
@@ -618,26 +619,45 @@ retry:
return ret; return ret;
retry_private: retry_private:
*hb = futex_q_lock(q); if (1) {
CLASS(hb, hb)(&q->key);
ret = futex_get_value_locked(&uval, uaddr); futex_q_lock(q, hb);
if (ret) { ret = futex_get_value_locked(&uval, uaddr);
futex_q_unlock(*hb);
ret = get_user(uval, uaddr); if (ret) {
if (ret) futex_q_unlock(hb);
return ret;
if (!(flags & FLAGS_SHARED)) ret = get_user(uval, uaddr);
goto retry_private; if (ret)
return ret;
goto retry; if (!(flags & FLAGS_SHARED))
} goto retry_private;
if (uval != val) { goto retry;
futex_q_unlock(*hb); }
ret = -EWOULDBLOCK;
if (uval != val) {
futex_q_unlock(hb);
return -EWOULDBLOCK;
}
if (key2 && futex_match(&q->key, key2)) {
futex_q_unlock(hb);
return -EINVAL;
}
/*
* The task state is guaranteed to be set before another task can
* wake it. set_current_state() is implemented using smp_store_mb() and
* futex_queue() calls spin_unlock() upon completion, both serializing
* access to the hash list and forcing another memory barrier.
*/
if (task == current)
set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
futex_queue(q, hb, task);
} }
return ret; return ret;
@@ -647,7 +667,6 @@ int __futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
struct hrtimer_sleeper *to, u32 bitset) struct hrtimer_sleeper *to, u32 bitset)
{ {
struct futex_q q = futex_q_init; struct futex_q q = futex_q_init;
struct futex_hash_bucket *hb;
int ret; int ret;
if (!bitset) if (!bitset)
@@ -660,12 +679,12 @@ retry:
* Prepare to wait on uaddr. On success, it holds hb->lock and q * Prepare to wait on uaddr. On success, it holds hb->lock and q
* is initialized. * is initialized.
*/ */
ret = futex_wait_setup(uaddr, val, flags, &q, &hb); ret = futex_wait_setup(uaddr, val, flags, &q, NULL, current);
if (ret) if (ret)
return ret; return ret;
/* futex_queue and wait for wakeup, timeout, or a signal. */ /* futex_queue and wait for wakeup, timeout, or a signal. */
futex_wait_queue(hb, &q, to); futex_do_wait(&q, to);
/* If we were woken (and unqueued), we succeeded, whatever. */ /* If we were woken (and unqueued), we succeeded, whatever. */
if (!futex_unqueue(&q)) if (!futex_unqueue(&q))

View File

@@ -219,6 +219,7 @@ static DECLARE_BITMAP(list_entries_in_use, MAX_LOCKDEP_ENTRIES);
static struct hlist_head lock_keys_hash[KEYHASH_SIZE]; static struct hlist_head lock_keys_hash[KEYHASH_SIZE];
unsigned long nr_lock_classes; unsigned long nr_lock_classes;
unsigned long nr_zapped_classes; unsigned long nr_zapped_classes;
unsigned long nr_dynamic_keys;
unsigned long max_lock_class_idx; unsigned long max_lock_class_idx;
struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
DECLARE_BITMAP(lock_classes_in_use, MAX_LOCKDEP_KEYS); DECLARE_BITMAP(lock_classes_in_use, MAX_LOCKDEP_KEYS);
@@ -1238,6 +1239,7 @@ void lockdep_register_key(struct lock_class_key *key)
goto out_unlock; goto out_unlock;
} }
hlist_add_head_rcu(&key->hash_entry, hash_head); hlist_add_head_rcu(&key->hash_entry, hash_head);
nr_dynamic_keys++;
out_unlock: out_unlock:
graph_unlock(); graph_unlock();
restore_irqs: restore_irqs:
@@ -1976,41 +1978,6 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
print_circular_bug_entry(entry, depth); print_circular_bug_entry(entry, depth);
} }
/*
* We are about to add A -> B into the dependency graph, and in __bfs() a
* strong dependency path A -> .. -> B is found: hlock_class equals
* entry->class.
*
* If A -> .. -> B can replace A -> B in any __bfs() search (means the former
* is _stronger_ than or equal to the latter), we consider A -> B as redundant.
* For example if A -> .. -> B is -(EN)-> (i.e. A -(E*)-> .. -(*N)-> B), and A
* -> B is -(ER)-> or -(EN)->, then we don't need to add A -> B into the
* dependency graph, as any strong path ..-> A -> B ->.. we can get with
* having dependency A -> B, we could already get a equivalent path ..-> A ->
* .. -> B -> .. with A -> .. -> B. Therefore A -> B is redundant.
*
* We need to make sure both the start and the end of A -> .. -> B is not
* weaker than A -> B. For the start part, please see the comment in
* check_redundant(). For the end part, we need:
*
* Either
*
* a) A -> B is -(*R)-> (everything is not weaker than that)
*
* or
*
* b) A -> .. -> B is -(*N)-> (nothing is stronger than this)
*
*/
static inline bool hlock_equal(struct lock_list *entry, void *data)
{
struct held_lock *hlock = (struct held_lock *)data;
return hlock_class(hlock) == entry->class && /* Found A -> .. -> B */
(hlock->read == 2 || /* A -> B is -(*R)-> */
!entry->only_xr); /* A -> .. -> B is -(*N)-> */
}
/* /*
* We are about to add B -> A into the dependency graph, and in __bfs() a * We are about to add B -> A into the dependency graph, and in __bfs() a
* strong dependency path A -> .. -> B is found: hlock_class equals * strong dependency path A -> .. -> B is found: hlock_class equals
@@ -2915,6 +2882,41 @@ static inline bool usage_skip(struct lock_list *entry, void *mask)
#endif /* CONFIG_TRACE_IRQFLAGS */ #endif /* CONFIG_TRACE_IRQFLAGS */
#ifdef CONFIG_LOCKDEP_SMALL #ifdef CONFIG_LOCKDEP_SMALL
/*
* We are about to add A -> B into the dependency graph, and in __bfs() a
* strong dependency path A -> .. -> B is found: hlock_class equals
* entry->class.
*
* If A -> .. -> B can replace A -> B in any __bfs() search (means the former
* is _stronger_ than or equal to the latter), we consider A -> B as redundant.
* For example if A -> .. -> B is -(EN)-> (i.e. A -(E*)-> .. -(*N)-> B), and A
* -> B is -(ER)-> or -(EN)->, then we don't need to add A -> B into the
* dependency graph, as any strong path ..-> A -> B ->.. we can get with
* having dependency A -> B, we could already get a equivalent path ..-> A ->
* .. -> B -> .. with A -> .. -> B. Therefore A -> B is redundant.
*
* We need to make sure both the start and the end of A -> .. -> B is not
* weaker than A -> B. For the start part, please see the comment in
* check_redundant(). For the end part, we need:
*
* Either
*
* a) A -> B is -(*R)-> (everything is not weaker than that)
*
* or
*
* b) A -> .. -> B is -(*N)-> (nothing is stronger than this)
*
*/
static inline bool hlock_equal(struct lock_list *entry, void *data)
{
struct held_lock *hlock = (struct held_lock *)data;
return hlock_class(hlock) == entry->class && /* Found A -> .. -> B */
(hlock->read == 2 || /* A -> B is -(*R)-> */
!entry->only_xr); /* A -> .. -> B is -(*N)-> */
}
/* /*
* Check that the dependency graph starting at <src> can lead to * Check that the dependency graph starting at <src> can lead to
* <target> or not. If it can, <src> -> <target> dependency is already * <target> or not. If it can, <src> -> <target> dependency is already
@@ -5101,6 +5103,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
lockevent_inc(lockdep_nocheck); lockevent_inc(lockdep_nocheck);
} }
if (DEBUG_LOCKS_WARN_ON(subclass >= MAX_LOCKDEP_SUBCLASSES))
return 0;
if (subclass < NR_LOCKDEP_CACHING_CLASSES) if (subclass < NR_LOCKDEP_CACHING_CLASSES)
class = lock->class_cache[subclass]; class = lock->class_cache[subclass];
/* /*
@@ -6606,6 +6611,7 @@ void lockdep_unregister_key(struct lock_class_key *key)
pf = get_pending_free(); pf = get_pending_free();
__lockdep_free_key_range(pf, key, 1); __lockdep_free_key_range(pf, key, 1);
need_callback = prepare_call_rcu_zapped(pf); need_callback = prepare_call_rcu_zapped(pf);
nr_dynamic_keys--;
} }
lockdep_unlock(); lockdep_unlock();
raw_local_irq_restore(flags); raw_local_irq_restore(flags);

View File

@@ -138,6 +138,7 @@ extern unsigned long nr_lock_classes;
extern unsigned long nr_zapped_classes; extern unsigned long nr_zapped_classes;
extern unsigned long nr_zapped_lock_chains; extern unsigned long nr_zapped_lock_chains;
extern unsigned long nr_list_entries; extern unsigned long nr_list_entries;
extern unsigned long nr_dynamic_keys;
long lockdep_next_lockchain(long i); long lockdep_next_lockchain(long i);
unsigned long lock_chain_count(void); unsigned long lock_chain_count(void);
extern unsigned long nr_stack_trace_entries; extern unsigned long nr_stack_trace_entries;

View File

@@ -286,6 +286,8 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
#endif #endif
seq_printf(m, " lock-classes: %11lu [max: %lu]\n", seq_printf(m, " lock-classes: %11lu [max: %lu]\n",
nr_lock_classes, MAX_LOCKDEP_KEYS); nr_lock_classes, MAX_LOCKDEP_KEYS);
seq_printf(m, " dynamic-keys: %11lu\n",
nr_dynamic_keys);
seq_printf(m, " direct dependencies: %11lu [max: %lu]\n", seq_printf(m, " direct dependencies: %11lu [max: %lu]\n",
nr_list_entries, MAX_LOCKDEP_ENTRIES); nr_list_entries, MAX_LOCKDEP_ENTRIES);
seq_printf(m, " indirect dependencies: %11lu\n", seq_printf(m, " indirect dependencies: %11lu\n",

View File

@@ -52,6 +52,7 @@
#include <linux/user_namespace.h> #include <linux/user_namespace.h>
#include <linux/time_namespace.h> #include <linux/time_namespace.h>
#include <linux/binfmts.h> #include <linux/binfmts.h>
#include <linux/futex.h>
#include <linux/sched.h> #include <linux/sched.h>
#include <linux/sched/autogroup.h> #include <linux/sched/autogroup.h>
@@ -2820,6 +2821,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
return -EINVAL; return -EINVAL;
error = posixtimer_create_prctl(arg2); error = posixtimer_create_prctl(arg2);
break; break;
case PR_FUTEX_HASH:
error = futex_hash_prctl(arg2, arg3, arg4);
break;
default: default:
trace_task_prctl_unknown(option, arg2, arg3, arg4, arg5); trace_task_prctl_unknown(option, arg2, arg3, arg4, arg5);
error = -EINVAL; error = -EINVAL;

View File

@@ -200,7 +200,23 @@ void *vmalloc_noprof(unsigned long size)
} }
EXPORT_SYMBOL(vmalloc_noprof); EXPORT_SYMBOL(vmalloc_noprof);
void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask) __weak __alias(__vmalloc_noprof); /*
* vmalloc_huge_node - allocate virtually contiguous memory, on a node
*
* @size: allocation size
* @gfp_mask: flags for the page level allocator
* @node: node to use for allocation or NUMA_NO_NODE
*
* Allocate enough pages to cover @size from the page level
* allocator and map them into contiguous kernel virtual space.
*
* Due to NOMMU implications the node argument and HUGE page attribute is
* ignored.
*/
void *vmalloc_huge_node_noprof(unsigned long size, gfp_t gfp_mask, int node)
{
return __vmalloc_noprof(size, gfp_mask);
}
/* /*
* vzalloc - allocate virtually contiguous memory with zero fill * vzalloc - allocate virtually contiguous memory with zero fill

View File

@@ -3944,9 +3944,10 @@ void *vmalloc_noprof(unsigned long size)
EXPORT_SYMBOL(vmalloc_noprof); EXPORT_SYMBOL(vmalloc_noprof);
/** /**
* vmalloc_huge - allocate virtually contiguous memory, allow huge pages * vmalloc_huge_node - allocate virtually contiguous memory, allow huge pages
* @size: allocation size * @size: allocation size
* @gfp_mask: flags for the page level allocator * @gfp_mask: flags for the page level allocator
* @node: node to use for allocation or NUMA_NO_NODE
* *
* Allocate enough pages to cover @size from the page level * Allocate enough pages to cover @size from the page level
* allocator and map them into contiguous kernel virtual space. * allocator and map them into contiguous kernel virtual space.
@@ -3955,13 +3956,13 @@ EXPORT_SYMBOL(vmalloc_noprof);
* *
* Return: pointer to the allocated memory or %NULL on error * Return: pointer to the allocated memory or %NULL on error
*/ */
void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask) void *vmalloc_huge_node_noprof(unsigned long size, gfp_t gfp_mask, int node)
{ {
return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END, return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END,
gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
NUMA_NO_NODE, __builtin_return_address(0)); node, __builtin_return_address(0));
} }
EXPORT_SYMBOL_GPL(vmalloc_huge_noprof); EXPORT_SYMBOL_GPL(vmalloc_huge_node_noprof);
/** /**
* vzalloc - allocate virtually contiguous memory with zero fill * vzalloc - allocate virtually contiguous memory with zero fill
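The renamed helper lets huge-page backed vmalloc memory be placed on a specific NUMA node. A minimal, hypothetical call site (buffer size and node choice are illustrative only, and it assumes the usual vmalloc_huge_node() alloc-tag wrapper macro around the _noprof function above):

	void *buf;

	/* Possibly huge-page backed allocation, kept local to the current node. */
	buf = vmalloc_huge_node(64 * PAGE_SIZE, GFP_KERNEL, numa_node_id());
	if (!buf)
		return -ENOMEM;
	/* ... use buf ... */
	vfree(buf);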

View File

@@ -230,7 +230,7 @@ struct prctl_mm_map {
# define PR_PAC_APDBKEY (1UL << 3) # define PR_PAC_APDBKEY (1UL << 3)
# define PR_PAC_APGAKEY (1UL << 4) # define PR_PAC_APGAKEY (1UL << 4)
/* Tagged user address controls for arm64 */ /* Tagged user address controls for arm64 and RISC-V */
#define PR_SET_TAGGED_ADDR_CTRL 55 #define PR_SET_TAGGED_ADDR_CTRL 55
#define PR_GET_TAGGED_ADDR_CTRL 56 #define PR_GET_TAGGED_ADDR_CTRL 56
# define PR_TAGGED_ADDR_ENABLE (1UL << 0) # define PR_TAGGED_ADDR_ENABLE (1UL << 0)
@@ -244,6 +244,9 @@ struct prctl_mm_map {
# define PR_MTE_TAG_MASK (0xffffUL << PR_MTE_TAG_SHIFT) # define PR_MTE_TAG_MASK (0xffffUL << PR_MTE_TAG_SHIFT)
/* Unused; kept only for source compatibility */ /* Unused; kept only for source compatibility */
# define PR_MTE_TCF_SHIFT 1 # define PR_MTE_TCF_SHIFT 1
/* RISC-V pointer masking tag length */
# define PR_PMLEN_SHIFT 24
# define PR_PMLEN_MASK (0x7fUL << PR_PMLEN_SHIFT)
/* Control reclaim behavior when allocating memory */ /* Control reclaim behavior when allocating memory */
#define PR_SET_IO_FLUSHER 57 #define PR_SET_IO_FLUSHER 57
@@ -328,4 +331,44 @@ struct prctl_mm_map {
# define PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC 0x10 /* Clear the aspect on exec */ # define PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC 0x10 /* Clear the aspect on exec */
# define PR_PPC_DEXCR_CTRL_MASK 0x1f # define PR_PPC_DEXCR_CTRL_MASK 0x1f
/*
* Get the current shadow stack configuration for the current thread,
* this will be the value configured via PR_SET_SHADOW_STACK_STATUS.
*/
#define PR_GET_SHADOW_STACK_STATUS 74
/*
* Set the current shadow stack configuration. Enabling the shadow
* stack will cause a shadow stack to be allocated for the thread.
*/
#define PR_SET_SHADOW_STACK_STATUS 75
# define PR_SHADOW_STACK_ENABLE (1UL << 0)
# define PR_SHADOW_STACK_WRITE (1UL << 1)
# define PR_SHADOW_STACK_PUSH (1UL << 2)
/*
* Prevent further changes to the specified shadow stack
* configuration. All bits may be locked via this call, including
* undefined bits.
*/
#define PR_LOCK_SHADOW_STACK_STATUS 76
/*
* Controls the mode of timer_create() for CRIU restore operations.
* Enabling this allows CRIU to restore timers with explicit IDs.
*
* Don't use for normal operations as the result might be undefined.
*/
#define PR_TIMER_CREATE_RESTORE_IDS 77
# define PR_TIMER_CREATE_RESTORE_IDS_OFF 0
# define PR_TIMER_CREATE_RESTORE_IDS_ON 1
# define PR_TIMER_CREATE_RESTORE_IDS_GET 2
/* FUTEX hash management */
#define PR_FUTEX_HASH 78
# define PR_FUTEX_HASH_SET_SLOTS 1
# define FH_FLAG_IMMUTABLE (1ULL << 0)
# define PR_FUTEX_HASH_GET_SLOTS 2
# define PR_FUTEX_HASH_GET_IMMUTABLE 3
#endif /* _LINUX_PRCTL_H */ #endif /* _LINUX_PRCTL_H */
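For reference, a minimal userspace sketch of the PR_FUTEX_HASH interface defined above (not part of the patch; the bucket count and error handling are illustrative, and the same prctl() calls are exercised by tools/perf/bench/futex.c further down):

#include <stdio.h>
#include <sys/prctl.h>
#include <linux/prctl.h>

int main(void)
{
	/* Ask for 16 private hash buckets for this process; flags 0 keeps the hash mutable. */
	if (prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_SET_SLOTS, 16, 0))
		perror("PR_FUTEX_HASH_SET_SLOTS");

	/* Read back how many buckets are in use and whether the hash is immutable. */
	printf("slots: %d, immutable: %d\n",
	       (int)prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_GET_SLOTS),
	       (int)prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_GET_IMMUTABLE));
	return 0;
}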

View File

@@ -3,6 +3,7 @@ perf-bench-y += sched-pipe.o
perf-bench-y += sched-seccomp-notify.o perf-bench-y += sched-seccomp-notify.o
perf-bench-y += syscall.o perf-bench-y += syscall.o
perf-bench-y += mem-functions.o perf-bench-y += mem-functions.o
perf-bench-y += futex.o
perf-bench-y += futex-hash.o perf-bench-y += futex-hash.o
perf-bench-y += futex-wake.o perf-bench-y += futex-wake.o
perf-bench-y += futex-wake-parallel.o perf-bench-y += futex-wake-parallel.o

View File

@@ -18,9 +18,11 @@
#include <stdlib.h> #include <stdlib.h>
#include <linux/compiler.h> #include <linux/compiler.h>
#include <linux/kernel.h> #include <linux/kernel.h>
#include <linux/prctl.h>
#include <linux/zalloc.h> #include <linux/zalloc.h>
#include <sys/time.h> #include <sys/time.h>
#include <sys/mman.h> #include <sys/mman.h>
#include <sys/prctl.h>
#include <perf/cpumap.h> #include <perf/cpumap.h>
#include "../util/mutex.h" #include "../util/mutex.h"
@@ -50,9 +52,12 @@ struct worker {
static struct bench_futex_parameters params = { static struct bench_futex_parameters params = {
.nfutexes = 1024, .nfutexes = 1024,
.runtime = 10, .runtime = 10,
.nbuckets = -1,
}; };
static const struct option options[] = { static const struct option options[] = {
OPT_INTEGER( 'b', "buckets", &params.nbuckets, "Specify amount of hash buckets"),
OPT_BOOLEAN( 'I', "immutable", &params.buckets_immutable, "Make the hash buckets immutable"),
OPT_UINTEGER('t', "threads", &params.nthreads, "Specify amount of threads"), OPT_UINTEGER('t', "threads", &params.nthreads, "Specify amount of threads"),
OPT_UINTEGER('r', "runtime", &params.runtime, "Specify runtime (in seconds)"), OPT_UINTEGER('r', "runtime", &params.runtime, "Specify runtime (in seconds)"),
OPT_UINTEGER('f', "futexes", &params.nfutexes, "Specify amount of futexes per threads"), OPT_UINTEGER('f', "futexes", &params.nfutexes, "Specify amount of futexes per threads"),
@@ -118,6 +123,7 @@ static void print_summary(void)
printf("%sAveraged %ld operations/sec (+- %.2f%%), total secs = %d\n", printf("%sAveraged %ld operations/sec (+- %.2f%%), total secs = %d\n",
!params.silent ? "\n" : "", avg, rel_stddev_stats(stddev, avg), !params.silent ? "\n" : "", avg, rel_stddev_stats(stddev, avg),
(int)bench__runtime.tv_sec); (int)bench__runtime.tv_sec);
futex_print_nbuckets(&params);
} }
int bench_futex_hash(int argc, const char **argv) int bench_futex_hash(int argc, const char **argv)
@@ -161,6 +167,7 @@ int bench_futex_hash(int argc, const char **argv)
if (!params.fshared) if (!params.fshared)
futex_flag = FUTEX_PRIVATE_FLAG; futex_flag = FUTEX_PRIVATE_FLAG;
futex_set_nbuckets_param(&params);
printf("Run summary [PID %d]: %d threads, each operating on %d [%s] futexes for %d secs.\n\n", printf("Run summary [PID %d]: %d threads, each operating on %d [%s] futexes for %d secs.\n\n",
getpid(), params.nthreads, params.nfutexes, params.fshared ? "shared":"private", params.runtime); getpid(), params.nthreads, params.nfutexes, params.fshared ? "shared":"private", params.runtime);

View File

@@ -41,10 +41,13 @@ static struct stats throughput_stats;
static struct cond thread_parent, thread_worker; static struct cond thread_parent, thread_worker;
static struct bench_futex_parameters params = { static struct bench_futex_parameters params = {
.nbuckets = -1,
.runtime = 10, .runtime = 10,
}; };
static const struct option options[] = { static const struct option options[] = {
OPT_INTEGER( 'b', "buckets", &params.nbuckets, "Specify amount of hash buckets"),
OPT_BOOLEAN( 'I', "immutable", &params.buckets_immutable, "Make the hash buckets immutable"),
OPT_UINTEGER('t', "threads", &params.nthreads, "Specify amount of threads"), OPT_UINTEGER('t', "threads", &params.nthreads, "Specify amount of threads"),
OPT_UINTEGER('r', "runtime", &params.runtime, "Specify runtime (in seconds)"), OPT_UINTEGER('r', "runtime", &params.runtime, "Specify runtime (in seconds)"),
OPT_BOOLEAN( 'M', "multi", &params.multi, "Use multiple futexes"), OPT_BOOLEAN( 'M', "multi", &params.multi, "Use multiple futexes"),
@@ -67,6 +70,7 @@ static void print_summary(void)
printf("%sAveraged %ld operations/sec (+- %.2f%%), total secs = %d\n", printf("%sAveraged %ld operations/sec (+- %.2f%%), total secs = %d\n",
!params.silent ? "\n" : "", avg, rel_stddev_stats(stddev, avg), !params.silent ? "\n" : "", avg, rel_stddev_stats(stddev, avg),
(int)bench__runtime.tv_sec); (int)bench__runtime.tv_sec);
futex_print_nbuckets(&params);
} }
static void toggle_done(int sig __maybe_unused, static void toggle_done(int sig __maybe_unused,
@@ -203,6 +207,7 @@ int bench_futex_lock_pi(int argc, const char **argv)
mutex_init(&thread_lock); mutex_init(&thread_lock);
cond_init(&thread_parent); cond_init(&thread_parent);
cond_init(&thread_worker); cond_init(&thread_worker);
futex_set_nbuckets_param(&params);
threads_starting = params.nthreads; threads_starting = params.nthreads;
gettimeofday(&bench__start, NULL); gettimeofday(&bench__start, NULL);

View File

@@ -42,6 +42,7 @@ static unsigned int threads_starting;
static int futex_flag = 0; static int futex_flag = 0;
static struct bench_futex_parameters params = { static struct bench_futex_parameters params = {
.nbuckets = -1,
/* /*
* How many tasks to requeue at a time. * How many tasks to requeue at a time.
* Default to 1 in order to make the kernel work more. * Default to 1 in order to make the kernel work more.
@@ -50,6 +51,8 @@ static struct bench_futex_parameters params = {
}; };
static const struct option options[] = { static const struct option options[] = {
OPT_INTEGER( 'b', "buckets", &params.nbuckets, "Specify amount of hash buckets"),
OPT_BOOLEAN( 'I', "immutable", &params.buckets_immutable, "Make the hash buckets immutable"),
OPT_UINTEGER('t', "threads", &params.nthreads, "Specify amount of threads"), OPT_UINTEGER('t', "threads", &params.nthreads, "Specify amount of threads"),
OPT_UINTEGER('q', "nrequeue", &params.nrequeue, "Specify amount of threads to requeue at once"), OPT_UINTEGER('q', "nrequeue", &params.nrequeue, "Specify amount of threads to requeue at once"),
OPT_BOOLEAN( 's', "silent", &params.silent, "Silent mode: do not display data/details"), OPT_BOOLEAN( 's', "silent", &params.silent, "Silent mode: do not display data/details"),
@@ -77,6 +80,7 @@ static void print_summary(void)
params.nthreads, params.nthreads,
requeuetime_avg / USEC_PER_MSEC, requeuetime_avg / USEC_PER_MSEC,
rel_stddev_stats(requeuetime_stddev, requeuetime_avg)); rel_stddev_stats(requeuetime_stddev, requeuetime_avg));
futex_print_nbuckets(&params);
} }
static void *workerfn(void *arg __maybe_unused) static void *workerfn(void *arg __maybe_unused)
@@ -204,6 +208,8 @@ int bench_futex_requeue(int argc, const char **argv)
if (params.broadcast) if (params.broadcast)
params.nrequeue = params.nthreads; params.nrequeue = params.nthreads;
futex_set_nbuckets_param(&params);
printf("Run summary [PID %d]: Requeuing %d threads (from [%s] %p to %s%p), " printf("Run summary [PID %d]: Requeuing %d threads (from [%s] %p to %s%p), "
"%d at a time.\n\n", getpid(), params.nthreads, "%d at a time.\n\n", getpid(), params.nthreads,
params.fshared ? "shared":"private", &futex1, params.fshared ? "shared":"private", &futex1,

View File

@@ -57,9 +57,13 @@ static struct stats waketime_stats, wakeup_stats;
static unsigned int threads_starting; static unsigned int threads_starting;
static int futex_flag = 0; static int futex_flag = 0;
static struct bench_futex_parameters params; static struct bench_futex_parameters params = {
.nbuckets = -1,
};
static const struct option options[] = { static const struct option options[] = {
OPT_INTEGER( 'b', "buckets", &params.nbuckets, "Specify amount of hash buckets"),
OPT_BOOLEAN( 'I', "immutable", &params.buckets_immutable, "Make the hash buckets immutable"),
OPT_UINTEGER('t', "threads", &params.nthreads, "Specify amount of threads"), OPT_UINTEGER('t', "threads", &params.nthreads, "Specify amount of threads"),
OPT_UINTEGER('w', "nwakers", &params.nwakes, "Specify amount of waking threads"), OPT_UINTEGER('w', "nwakers", &params.nwakes, "Specify amount of waking threads"),
OPT_BOOLEAN( 's', "silent", &params.silent, "Silent mode: do not display data/details"), OPT_BOOLEAN( 's', "silent", &params.silent, "Silent mode: do not display data/details"),
@@ -218,6 +222,7 @@ static void print_summary(void)
params.nthreads, params.nthreads,
waketime_avg / USEC_PER_MSEC, waketime_avg / USEC_PER_MSEC,
rel_stddev_stats(waketime_stddev, waketime_avg)); rel_stddev_stats(waketime_stddev, waketime_avg));
futex_print_nbuckets(&params);
} }
@@ -291,6 +296,8 @@ int bench_futex_wake_parallel(int argc, const char **argv)
if (!params.fshared) if (!params.fshared)
futex_flag = FUTEX_PRIVATE_FLAG; futex_flag = FUTEX_PRIVATE_FLAG;
futex_set_nbuckets_param(&params);
printf("Run summary [PID %d]: blocking on %d threads (at [%s] " printf("Run summary [PID %d]: blocking on %d threads (at [%s] "
"futex %p), %d threads waking up %d at a time.\n\n", "futex %p), %d threads waking up %d at a time.\n\n",
getpid(), params.nthreads, params.fshared ? "shared":"private", getpid(), params.nthreads, params.fshared ? "shared":"private",

View File

@@ -42,6 +42,7 @@ static unsigned int threads_starting;
static int futex_flag = 0; static int futex_flag = 0;
static struct bench_futex_parameters params = { static struct bench_futex_parameters params = {
.nbuckets = -1,
/* /*
* How many wakeups to do at a time. * How many wakeups to do at a time.
* Default to 1 in order to make the kernel work more. * Default to 1 in order to make the kernel work more.
@@ -50,6 +51,8 @@ static struct bench_futex_parameters params = {
}; };
static const struct option options[] = { static const struct option options[] = {
OPT_INTEGER( 'b', "buckets", &params.nbuckets, "Specify amount of hash buckets"),
OPT_BOOLEAN( 'I', "immutable", &params.buckets_immutable, "Make the hash buckets immutable"),
OPT_UINTEGER('t', "threads", &params.nthreads, "Specify amount of threads"), OPT_UINTEGER('t', "threads", &params.nthreads, "Specify amount of threads"),
OPT_UINTEGER('w', "nwakes", &params.nwakes, "Specify amount of threads to wake at once"), OPT_UINTEGER('w', "nwakes", &params.nwakes, "Specify amount of threads to wake at once"),
OPT_BOOLEAN( 's', "silent", &params.silent, "Silent mode: do not display data/details"), OPT_BOOLEAN( 's', "silent", &params.silent, "Silent mode: do not display data/details"),
@@ -93,6 +96,7 @@ static void print_summary(void)
params.nthreads, params.nthreads,
waketime_avg / USEC_PER_MSEC, waketime_avg / USEC_PER_MSEC,
rel_stddev_stats(waketime_stddev, waketime_avg)); rel_stddev_stats(waketime_stddev, waketime_avg));
futex_print_nbuckets(&params);
} }
static void block_threads(pthread_t *w, struct perf_cpu_map *cpu) static void block_threads(pthread_t *w, struct perf_cpu_map *cpu)

tools/perf/bench/futex.c (new file, 67 lines)
View File

@@ -0,0 +1,67 @@
// SPDX-License-Identifier: GPL-2.0
#include <err.h>
#include <stdio.h>
#include <stdlib.h>
#include <linux/prctl.h>
#include <sys/prctl.h>
#include "futex.h"
void futex_set_nbuckets_param(struct bench_futex_parameters *params)
{
unsigned long flags;
int ret;
if (params->nbuckets < 0)
return;
flags = params->buckets_immutable ? FH_FLAG_IMMUTABLE : 0;
ret = prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_SET_SLOTS, params->nbuckets, flags);
if (ret) {
printf("Requesting %d hash buckets failed: %d/%m\n",
params->nbuckets, ret);
err(EXIT_FAILURE, "prctl(PR_FUTEX_HASH)");
}
}
void futex_print_nbuckets(struct bench_futex_parameters *params)
{
char *futex_hash_mode;
int ret;
ret = prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_GET_SLOTS);
if (params->nbuckets >= 0) {
if (ret != params->nbuckets) {
if (ret < 0) {
printf("Can't query number of buckets: %m\n");
err(EXIT_FAILURE, "prctl(PR_FUTEX_HASH)");
}
printf("Requested number of hash buckets does not currently used.\n");
printf("Requested: %d in usage: %d\n", params->nbuckets, ret);
err(EXIT_FAILURE, "prctl(PR_FUTEX_HASH)");
}
if (params->nbuckets == 0) {
ret = asprintf(&futex_hash_mode, "Futex hashing: global hash");
} else {
ret = prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_GET_IMMUTABLE);
if (ret < 0) {
printf("Can't check if the hash is immutable: %m\n");
err(EXIT_FAILURE, "prctl(PR_FUTEX_HASH)");
}
ret = asprintf(&futex_hash_mode, "Futex hashing: %d hash buckets %s",
params->nbuckets,
ret == 1 ? "(immutable)" : "");
}
} else {
if (ret <= 0) {
ret = asprintf(&futex_hash_mode, "Futex hashing: global hash");
} else {
ret = asprintf(&futex_hash_mode, "Futex hashing: auto resized to %d buckets",
ret);
}
}
if (ret < 0)
err(EXIT_FAILURE, "ENOMEM, futex_hash_mode");
printf("%s\n", futex_hash_mode);
free(futex_hash_mode);
}

View File

@@ -25,6 +25,8 @@ struct bench_futex_parameters {
unsigned int nfutexes; unsigned int nfutexes;
unsigned int nwakes; unsigned int nwakes;
unsigned int nrequeue; unsigned int nrequeue;
int nbuckets;
bool buckets_immutable;
}; };
/** /**
@@ -143,4 +145,7 @@ futex_cmp_requeue_pi(u_int32_t *uaddr, u_int32_t val, u_int32_t *uaddr2,
val, opflags); val, opflags);
} }
void futex_set_nbuckets_param(struct bench_futex_parameters *params);
void futex_print_nbuckets(struct bench_futex_parameters *params);
#endif /* _FUTEX_H */ #endif /* _FUTEX_H */
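Note the .nbuckets = -1 default used by each benchmark above: futex_set_nbuckets_param() returns early for negative values, so the kernel's futex hash is left untouched unless -b/--buckets is passed explicitly, while an explicit 0 requests the global hash instead of a per-process one.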

View File

@@ -1,11 +1,13 @@
# SPDX-License-Identifier: GPL-2.0-only # SPDX-License-Identifier: GPL-2.0-only
futex_numa_mpol
futex_priv_hash
futex_requeue
futex_requeue_pi futex_requeue_pi
futex_requeue_pi_mismatched_ops futex_requeue_pi_mismatched_ops
futex_requeue_pi_signal_restart futex_requeue_pi_signal_restart
futex_wait
futex_wait_private_mapped_file futex_wait_private_mapped_file
futex_wait_timeout futex_wait_timeout
futex_wait_uninitialized_heap futex_wait_uninitialized_heap
futex_wait_wouldblock futex_wait_wouldblock
futex_wait
futex_requeue
futex_waitv futex_waitv

View File

@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0 # SPDX-License-Identifier: GPL-2.0
INCLUDES := -I../include -I../../ $(KHDR_INCLUDES) INCLUDES := -I../include -I../../ $(KHDR_INCLUDES)
CFLAGS := $(CFLAGS) -g -O2 -Wall -pthread $(INCLUDES) $(KHDR_INCLUDES) CFLAGS := $(CFLAGS) -g -O2 -Wall -pthread $(INCLUDES) $(KHDR_INCLUDES)
LDLIBS := -lpthread -lrt LDLIBS := -lpthread -lrt -lnuma
LOCAL_HDRS := \ LOCAL_HDRS := \
../include/futextest.h \ ../include/futextest.h \
@@ -17,7 +17,10 @@ TEST_GEN_PROGS := \
futex_wait_private_mapped_file \ futex_wait_private_mapped_file \
futex_wait \ futex_wait \
futex_requeue \ futex_requeue \
futex_waitv futex_priv_hash \
futex_numa_mpol \
futex_waitv \
futex_numa
TEST_PROGS := run.sh TEST_PROGS := run.sh

View File

@@ -0,0 +1,262 @@
// SPDX-License-Identifier: GPL-2.0
#include <pthread.h>
#include <sys/shm.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <stdbool.h>
#include <time.h>
#include <assert.h>
#include "logging.h"
#include "futextest.h"
#include "futex2test.h"
typedef u_int32_t u32;
typedef int32_t s32;
typedef u_int64_t u64;
static unsigned int fflags = (FUTEX2_SIZE_U32 | FUTEX2_PRIVATE);
static int fnode = FUTEX_NO_NODE;
/* fairly stupid test-and-set lock with a waiter flag */
#define N_LOCK 0x0000001
#define N_WAITERS 0x0001000
struct futex_numa_32 {
union {
u64 full;
struct {
u32 val;
u32 node;
};
};
};
void futex_numa_32_lock(struct futex_numa_32 *lock)
{
for (;;) {
struct futex_numa_32 new, old = {
.full = __atomic_load_n(&lock->full, __ATOMIC_RELAXED),
};
for (;;) {
new = old;
if (old.val == 0) {
/* no waiter, no lock -> first lock, set no-node */
new.node = fnode;
}
if (old.val & N_LOCK) {
/* contention, set waiter */
new.val |= N_WAITERS;
}
new.val |= N_LOCK;
/* nothing changed, ready to block */
if (old.full == new.full)
break;
/*
* Use u64 cmpxchg to set the futex value and node in a
* consistent manner.
*/
if (__atomic_compare_exchange_n(&lock->full,
&old.full, new.full,
/* .weak */ false,
__ATOMIC_ACQUIRE,
__ATOMIC_RELAXED)) {
/* if we just set N_LOCK, we own it */
if (!(old.val & N_LOCK))
return;
/* go block */
break;
}
}
futex2_wait(lock, new.val, fflags, NULL, 0);
}
}
void futex_numa_32_unlock(struct futex_numa_32 *lock)
{
u32 val = __atomic_sub_fetch(&lock->val, N_LOCK, __ATOMIC_RELEASE);
assert((s32)val >= 0);
if (val & N_WAITERS) {
int woken = futex2_wake(lock, 1, fflags);
assert(val == N_WAITERS);
if (!woken) {
__atomic_compare_exchange_n(&lock->val, &val, 0U,
false, __ATOMIC_RELAXED,
__ATOMIC_RELAXED);
}
}
}
static long nanos = 50000;
struct thread_args {
pthread_t tid;
volatile int * done;
struct futex_numa_32 *lock;
int val;
int *val1, *val2;
int node;
};
static void *threadfn(void *_arg)
{
struct thread_args *args = _arg;
struct timespec ts = {
.tv_nsec = nanos,
};
int node;
while (!*args->done) {
futex_numa_32_lock(args->lock);
args->val++;
assert(*args->val1 == *args->val2);
(*args->val1)++;
nanosleep(&ts, NULL);
(*args->val2)++;
node = args->lock->node;
futex_numa_32_unlock(args->lock);
if (node != args->node) {
args->node = node;
printf("node: %d\n", node);
}
nanosleep(&ts, NULL);
}
return NULL;
}
static void *contendfn(void *_arg)
{
struct thread_args *args = _arg;
while (!*args->done) {
/*
* futex2_wait() will take hb-lock, verify *var == val and
* queue/abort. By knowingly setting val 'wrong' this will
* abort and thereby generate hb-lock contention.
*/
futex2_wait(&args->lock->val, ~0U, fflags, NULL, 0);
args->val++;
}
return NULL;
}
static volatile int done = 0;
static struct futex_numa_32 lock = { .val = 0, };
static int val1, val2;
int main(int argc, char *argv[])
{
struct thread_args *tas[512], *cas[512];
int c, t, threads = 2, contenders = 0;
int sleeps = 10;
int total = 0;
while ((c = getopt(argc, argv, "c:t:s:n:N::")) != -1) {
switch (c) {
case 'c':
contenders = atoi(optarg);
break;
case 't':
threads = atoi(optarg);
break;
case 's':
sleeps = atoi(optarg);
break;
case 'n':
nanos = atoi(optarg);
break;
case 'N':
fflags |= FUTEX2_NUMA;
if (optarg)
fnode = atoi(optarg);
break;
default:
exit(1);
break;
}
}
for (t = 0; t < contenders; t++) {
struct thread_args *args = calloc(1, sizeof(*args));
if (!args) {
perror("thread_args");
exit(-1);
}
args->done = &done;
args->lock = &lock;
args->val1 = &val1;
args->val2 = &val2;
args->node = -1;
if (pthread_create(&args->tid, NULL, contendfn, args)) {
perror("pthread_create");
exit(-1);
}
cas[t] = args;
}
for (t = 0; t < threads; t++) {
struct thread_args *args = calloc(1, sizeof(*args));
if (!args) {
perror("thread_args");
exit(-1);
}
args->done = &done;
args->lock = &lock;
args->val1 = &val1;
args->val2 = &val2;
args->node = -1;
if (pthread_create(&args->tid, NULL, threadfn, args)) {
perror("pthread_create");
exit(-1);
}
tas[t] = args;
}
sleep(sleeps);
done = true;
for (t = 0; t < threads; t++) {
struct thread_args *args = tas[t];
pthread_join(args->tid, NULL);
total += args->val;
// printf("tval: %d\n", args->val);
}
printf("total: %d\n", total);
if (contenders) {
total = 0;
for (t = 0; t < contenders; t++) {
struct thread_args *args = cas[t];
pthread_join(args->tid, NULL);
total += args->val;
// printf("tval: %d\n", args->val);
}
printf("contenders: %d\n", total);
}
return 0;
}

@@ -0,0 +1,231 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (C) 2025 Sebastian Andrzej Siewior <bigeasy@linutronix.de>
*/
#define _GNU_SOURCE
#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <numa.h>
#include <numaif.h>
#include <linux/futex.h>
#include <sys/mman.h>
#include "logging.h"
#include "futextest.h"
#include "futex2test.h"
#define MAX_THREADS 64
static pthread_barrier_t barrier_main;
static pthread_t threads[MAX_THREADS];
struct thread_args {
void *futex_ptr;
unsigned int flags;
int result;
};
static struct thread_args thread_args[MAX_THREADS];
#ifndef FUTEX_NO_NODE
#define FUTEX_NO_NODE (-1)
#endif
#ifndef FUTEX2_MPOL
#define FUTEX2_MPOL 0x08
#endif
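/*
 * Background for the checks below, summarized from the test itself: with
 * FUTEX2_NUMA the futex word is followed by a second u32 naming the NUMA
 * node; writing FUTEX_NO_NODE there asks the kernel to pick a node and
 * write it back, which the "Regular test" verifies. FUTEX2_MPOL is expected
 * to derive the node from the mapping's memory policy (see the mbind()
 * loop at the end of main()).
 */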
static void *thread_lock_fn(void *arg)
{
struct thread_args *args = arg;
int ret;
pthread_barrier_wait(&barrier_main);
ret = futex2_wait(args->futex_ptr, 0, args->flags, NULL, 0);
args->result = ret;
return NULL;
}
static void create_max_threads(void *futex_ptr)
{
int i, ret;
for (i = 0; i < MAX_THREADS; i++) {
thread_args[i].futex_ptr = futex_ptr;
thread_args[i].flags = FUTEX2_SIZE_U32 | FUTEX_PRIVATE_FLAG | FUTEX2_NUMA;
thread_args[i].result = 0;
ret = pthread_create(&threads[i], NULL, thread_lock_fn, &thread_args[i]);
if (ret)
ksft_exit_fail_msg("pthread_create failed\n");
}
}
static void join_max_threads(void)
{
int i, ret;
for (i = 0; i < MAX_THREADS; i++) {
ret = pthread_join(threads[i], NULL);
if (ret)
ksft_exit_fail_msg("pthread_join failed for thread %d\n", i);
}
}
static void __test_futex(void *futex_ptr, int must_fail, unsigned int futex_flags)
{
int to_wake, ret, i, need_exit = 0;
pthread_barrier_init(&barrier_main, NULL, MAX_THREADS + 1);
create_max_threads(futex_ptr);
pthread_barrier_wait(&barrier_main);
to_wake = MAX_THREADS;
do {
ret = futex2_wake(futex_ptr, to_wake, futex_flags);
if (must_fail) {
if (ret < 0)
break;
ksft_exit_fail_msg("futex2_wake(%d, 0x%x) should fail, but didn't\n",
to_wake, futex_flags);
}
if (ret < 0) {
ksft_exit_fail_msg("Failed futex2_wake(%d, 0x%x): %m\n",
to_wake, futex_flags);
}
if (!ret)
usleep(50);
to_wake -= ret;
} while (to_wake);
join_max_threads();
for (i = 0; i < MAX_THREADS; i++) {
if (must_fail && thread_args[i].result != -1) {
ksft_print_msg("Thread %d should fail but succeeded (%d)\n",
i, thread_args[i].result);
need_exit = 1;
}
if (!must_fail && thread_args[i].result != 0) {
ksft_print_msg("Thread %d failed (%d)\n", i, thread_args[i].result);
need_exit = 1;
}
}
if (need_exit)
ksft_exit_fail_msg("Aborting due to earlier errors.\n");
}
static void test_futex(void *futex_ptr, int must_fail)
{
__test_futex(futex_ptr, must_fail, FUTEX2_SIZE_U32 | FUTEX_PRIVATE_FLAG | FUTEX2_NUMA);
}
static void test_futex_mpol(void *futex_ptr, int must_fail)
{
__test_futex(futex_ptr, must_fail, FUTEX2_SIZE_U32 | FUTEX_PRIVATE_FLAG | FUTEX2_NUMA | FUTEX2_MPOL);
}
static void usage(char *prog)
{
printf("Usage: %s\n", prog);
printf(" -c Use color\n");
printf(" -h Display this help message\n");
printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n",
VQUIET, VCRITICAL, VINFO);
}
int main(int argc, char *argv[])
{
struct futex32_numa *futex_numa;
int mem_size, i;
void *futex_ptr;
int c;
while ((c = getopt(argc, argv, "chv:")) != -1) {
switch (c) {
case 'c':
log_color(1);
break;
case 'h':
usage(basename(argv[0]));
exit(0);
break;
case 'v':
log_verbosity(atoi(optarg));
break;
default:
usage(basename(argv[0]));
exit(1);
}
}
ksft_print_header();
ksft_set_plan(1);
mem_size = sysconf(_SC_PAGE_SIZE);
futex_ptr = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
if (futex_ptr == MAP_FAILED)
ksft_exit_fail_msg("mmap() for %d bytes failed\n", mem_size);
futex_numa = futex_ptr;
ksft_print_msg("Regular test\n");
futex_numa->futex = 0;
futex_numa->numa = FUTEX_NO_NODE;
test_futex(futex_ptr, 0);
if (futex_numa->numa == FUTEX_NO_NODE)
ksft_exit_fail_msg("NUMA node is left uninitialized\n");
ksft_print_msg("Memory too small\n");
test_futex(futex_ptr + mem_size - 4, 1);
ksft_print_msg("Memory out of range\n");
test_futex(futex_ptr + mem_size, 1);
futex_numa->numa = FUTEX_NO_NODE;
mprotect(futex_ptr, mem_size, PROT_READ);
ksft_print_msg("Memory, RO\n");
test_futex(futex_ptr, 1);
mprotect(futex_ptr, mem_size, PROT_NONE);
ksft_print_msg("Memory, no access\n");
test_futex(futex_ptr, 1);
mprotect(futex_ptr, mem_size, PROT_READ | PROT_WRITE);
ksft_print_msg("Memory back to RW\n");
test_futex(futex_ptr, 0);
/* MPOL test. Does not work as expected */
for (i = 0; i < 4; i++) {
unsigned long nodemask;
int ret;
nodemask = 1 << i;
ret = mbind(futex_ptr, mem_size, MPOL_BIND, &nodemask,
sizeof(nodemask) * 8, 0);
if (ret == 0) {
ksft_print_msg("Node %d test\n", i);
futex_numa->futex = 0;
futex_numa->numa = FUTEX_NO_NODE;
ret = futex2_wake(futex_ptr, 0, FUTEX2_SIZE_U32 | FUTEX_PRIVATE_FLAG | FUTEX2_NUMA | FUTEX2_MPOL);
if (ret < 0)
ksft_test_result_fail("Failed to wake 0 with MPOL: %m\n");
if (0)
test_futex_mpol(futex_numa, 0);
if (futex_numa->numa != i) {
ksft_test_result_fail("Returned NUMA node is %d expected %d\n",
futex_numa->numa, i);
}
}
}
ksft_test_result_pass("NUMA MPOL tests passed\n");
ksft_finished();
return 0;
}

@@ -0,0 +1,292 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (C) 2025 Sebastian Andrzej Siewior <bigeasy@linutronix.de>
*/
#define _GNU_SOURCE
#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <linux/prctl.h>
#include <sys/prctl.h>
#include "logging.h"
#define MAX_THREADS 64
static pthread_barrier_t barrier_main;
static pthread_mutex_t global_lock;
static pthread_t threads[MAX_THREADS];
static int counter;
#ifndef PR_FUTEX_HASH
#define PR_FUTEX_HASH 78
# define PR_FUTEX_HASH_SET_SLOTS 1
# define FH_FLAG_IMMUTABLE (1ULL << 0)
# define PR_FUTEX_HASH_GET_SLOTS 2
# define PR_FUTEX_HASH_GET_IMMUTABLE 3
#endif
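/*
 * Interface as exercised by this test: PR_FUTEX_HASH_SET_SLOTS resizes the
 * process-private futex hash (a slot count of 0 requests the global hash),
 * FH_FLAG_IMMUTABLE makes the chosen setting permanent, and the GET
 * operations report the current slot count / immutability.
 */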
static int futex_hash_slots_set(unsigned int slots, int flags)
{
return prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_SET_SLOTS, slots, flags);
}
static int futex_hash_slots_get(void)
{
return prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_GET_SLOTS);
}
static int futex_hash_immutable_get(void)
{
return prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_GET_IMMUTABLE);
}
static void futex_hash_slots_set_verify(int slots)
{
int ret;
ret = futex_hash_slots_set(slots, 0);
if (ret != 0) {
ksft_test_result_fail("Failed to set slots to %d: %m\n", slots);
ksft_finished();
}
ret = futex_hash_slots_get();
if (ret != slots) {
ksft_test_result_fail("Set %d slots but PR_FUTEX_HASH_GET_SLOTS returns: %d, %m\n",
slots, ret);
ksft_finished();
}
ksft_test_result_pass("SET and GET slots %d passed\n", slots);
}
static void futex_hash_slots_set_must_fail(int slots, int flags)
{
int ret;
ret = futex_hash_slots_set(slots, flags);
ksft_test_result(ret < 0, "futex_hash_slots_set(%d, %d)\n",
slots, flags);
}
static void *thread_return_fn(void *arg)
{
return NULL;
}
static void *thread_lock_fn(void *arg)
{
pthread_barrier_wait(&barrier_main);
pthread_mutex_lock(&global_lock);
counter++;
usleep(20);
pthread_mutex_unlock(&global_lock);
return NULL;
}
static void create_max_threads(void *(*thread_fn)(void *))
{
int i, ret;
for (i = 0; i < MAX_THREADS; i++) {
ret = pthread_create(&threads[i], NULL, thread_fn, NULL);
if (ret)
ksft_exit_fail_msg("pthread_create failed: %m\n");
}
}
static void join_max_threads(void)
{
int i, ret;
for (i = 0; i < MAX_THREADS; i++) {
ret = pthread_join(threads[i], NULL);
if (ret)
ksft_exit_fail_msg("pthread_join failed for thread %d\n", i);
}
}
static void usage(char *prog)
{
printf("Usage: %s\n", prog);
printf(" -c Use color\n");
printf(" -g Test global hash instead intead local immutable \n");
printf(" -h Display this help message\n");
printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n",
VQUIET, VCRITICAL, VINFO);
}
static const char *test_msg_auto_create = "Automatic hash bucket init on thread creation.\n";
static const char *test_msg_auto_inc = "Automatic increase with more than 16 CPUs\n";
int main(int argc, char *argv[])
{
int futex_slots1, futex_slotsn, online_cpus;
pthread_mutexattr_t mutex_attr_pi;
int use_global_hash = 0;
int ret;
int c;
while ((c = getopt(argc, argv, "cghv:")) != -1) {
switch (c) {
case 'c':
log_color(1);
break;
case 'g':
use_global_hash = 1;
break;
case 'h':
usage(basename(argv[0]));
exit(0);
break;
case 'v':
log_verbosity(atoi(optarg));
break;
default:
usage(basename(argv[0]));
exit(1);
}
}
ksft_print_header();
ksft_set_plan(22);
ret = pthread_mutexattr_init(&mutex_attr_pi);
ret |= pthread_mutexattr_setprotocol(&mutex_attr_pi, PTHREAD_PRIO_INHERIT);
ret |= pthread_mutex_init(&global_lock, &mutex_attr_pi);
if (ret != 0) {
ksft_exit_fail_msg("Failed to initialize pthread mutex.\n");
}
/* First thread, expect to be 0, not yet initialized */
ret = futex_hash_slots_get();
if (ret != 0)
ksft_exit_fail_msg("futex_hash_slots_get() failed: %d, %m\n", ret);
ret = futex_hash_immutable_get();
if (ret != 0)
ksft_exit_fail_msg("futex_hash_immutable_get() failed: %d, %m\n", ret);
ksft_test_result_pass("Basic get slots and immutable status.\n");
ret = pthread_create(&threads[0], NULL, thread_return_fn, NULL);
if (ret != 0)
ksft_exit_fail_msg("pthread_create() failed: %d, %m\n", ret);
ret = pthread_join(threads[0], NULL);
if (ret != 0)
ksft_exit_fail_msg("pthread_join() failed: %d, %m\n", ret);
/* First thread, has to initialize the private hash */
futex_slots1 = futex_hash_slots_get();
if (futex_slots1 <= 0) {
ksft_print_msg("Current hash buckets: %d\n", futex_slots1);
ksft_exit_fail_msg(test_msg_auto_create);
}
ksft_test_result_pass(test_msg_auto_create);
online_cpus = sysconf(_SC_NPROCESSORS_ONLN);
ret = pthread_barrier_init(&barrier_main, NULL, MAX_THREADS + 1);
if (ret != 0)
ksft_exit_fail_msg("pthread_barrier_init failed: %m.\n");
ret = pthread_mutex_lock(&global_lock);
if (ret != 0)
ksft_exit_fail_msg("pthread_mutex_lock failed: %m.\n");
counter = 0;
create_max_threads(thread_lock_fn);
pthread_barrier_wait(&barrier_main);
/*
* The current default size of hash buckets is 16. The auto increase
* works only if more than 16 CPUs are available.
*/
ksft_print_msg("Online CPUs: %d\n", online_cpus);
if (online_cpus > 16) {
futex_slotsn = futex_hash_slots_get();
if (futex_slotsn < 0 || futex_slots1 == futex_slotsn) {
ksft_print_msg("Expected increase of hash buckets but got: %d -> %d\n",
futex_slots1, futex_slotsn);
ksft_exit_fail_msg(test_msg_auto_inc);
}
ksft_test_result_pass(test_msg_auto_inc);
} else {
ksft_test_result_skip(test_msg_auto_inc);
}
ret = pthread_mutex_unlock(&global_lock);
/* Once the user changes it, it has to be what is set */
futex_hash_slots_set_verify(2);
futex_hash_slots_set_verify(4);
futex_hash_slots_set_verify(8);
futex_hash_slots_set_verify(32);
futex_hash_slots_set_verify(16);
ret = futex_hash_slots_set(15, 0);
ksft_test_result(ret < 0, "Use 15 slots\n");
futex_hash_slots_set_verify(2);
join_max_threads();
ksft_test_result(counter == MAX_THREADS, "Created and waited for %d of %d threads\n",
counter, MAX_THREADS);
counter = 0;
/* Once the user has set something, auto-resize must be disabled */
ret = pthread_barrier_init(&barrier_main, NULL, MAX_THREADS);
create_max_threads(thread_lock_fn);
join_max_threads();
ret = futex_hash_slots_get();
ksft_test_result(ret == 2, "No more auto-resize after manual setting, got %d\n",
ret);
futex_hash_slots_set_must_fail(1 << 29, 0);
/*
* Once the private hash has been made immutable or the global hash has been
* requested, the request cannot be undone.
*/
if (use_global_hash) {
ret = futex_hash_slots_set(0, 0);
ksft_test_result(ret == 0, "Global hash request\n");
} else {
ret = futex_hash_slots_set(4, FH_FLAG_IMMUTABLE);
ksft_test_result(ret == 0, "Immutable resize to 4\n");
}
if (ret != 0)
goto out;
futex_hash_slots_set_must_fail(4, 0);
futex_hash_slots_set_must_fail(4, FH_FLAG_IMMUTABLE);
futex_hash_slots_set_must_fail(8, 0);
futex_hash_slots_set_must_fail(8, FH_FLAG_IMMUTABLE);
futex_hash_slots_set_must_fail(0, FH_FLAG_IMMUTABLE);
futex_hash_slots_set_must_fail(6, FH_FLAG_IMMUTABLE);
ret = pthread_barrier_init(&barrier_main, NULL, MAX_THREADS);
if (ret != 0) {
ksft_exit_fail_msg("pthread_barrier_init failed: %m\n");
return 1;
}
create_max_threads(thread_lock_fn);
join_max_threads();
ret = futex_hash_slots_get();
if (use_global_hash) {
ksft_test_result(ret == 0, "Continue to use global hash\n");
} else {
ksft_test_result(ret == 4, "Continue to use the 4 hash buckets\n");
}
ret = futex_hash_immutable_get();
ksft_test_result(ret == 1, "Hash reports to be immutable\n");
out:
ksft_finished();
return 0;
}

@@ -82,3 +82,10 @@ echo
echo
./futex_waitv $COLOR
echo
./futex_priv_hash $COLOR
./futex_priv_hash -g $COLOR
echo
./futex_numa_mpol $COLOR

@@ -8,6 +8,53 @@
#define u64_to_ptr(x) ((void *)(uintptr_t)(x))
#ifndef __NR_futex_waitv
#define __NR_futex_waitv 449
struct futex_waitv {
__u64 val;
__u64 uaddr;
__u32 flags;
__u32 __reserved;
};
#endif
#ifndef __NR_futex_wake
#define __NR_futex_wake 454
#endif
#ifndef __NR_futex_wait
#define __NR_futex_wait 455
#endif
#ifndef FUTEX2_SIZE_U32
#define FUTEX2_SIZE_U32 0x02
#endif
#ifndef FUTEX2_NUMA
#define FUTEX2_NUMA 0x04
#endif
#ifndef FUTEX2_MPOL
#define FUTEX2_MPOL 0x08
#endif
#ifndef FUTEX2_PRIVATE
#define FUTEX2_PRIVATE FUTEX_PRIVATE_FLAG
#endif
#ifndef FUTEX_NO_NODE
#define FUTEX_NO_NODE (-1)
#endif
#ifndef FUTEX_32
#define FUTEX_32 FUTEX2_SIZE_U32
#endif
struct futex32_numa {
futex_t futex;
futex_t numa;
};
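/*
 * Descriptive note: this is the layout used with FUTEX2_NUMA, where the
 * futex value is immediately followed by a u32 node field; FUTEX_NO_NODE in
 * that field lets the kernel choose and report the node, as exercised by
 * futex_numa_mpol.c.
 */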
/**
* futex_waitv - Wait at multiple futexes, wake on any
* @waiters: Array of waiters
@@ -20,3 +67,26 @@ static inline int futex_waitv(volatile struct futex_waitv *waiters, unsigned lon
{
return syscall(__NR_futex_waitv, waiters, nr_waiters, flags, timo, clockid);
}
/*
* futex_wait() - block on uaddr with optional timeout
* @val: Expected value
* @flags: FUTEX2 flags
* @timeout: Relative timeout
* @clockid: Clock id for the timeout
*/
static inline int futex2_wait(void *uaddr, long val, unsigned int flags,
struct timespec *timeout, clockid_t clockid)
{
return syscall(__NR_futex_wait, uaddr, val, ~0U, flags, timeout, clockid);
}
/*
* futex2_wake() - Wake a number of futexes
* @nr: Number of threads to wake at most
* @flags: FUTEX2 flags
*/
static inline int futex2_wake(void *uaddr, int nr, unsigned int flags)
{
return syscall(__NR_futex_wake, uaddr, ~0U, nr, flags);
}
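/*
 * Usage sketch (illustrative only, not part of the original header), using
 * the wrappers above with a NUMA futex pair:
 *
 *	struct futex32_numa f = { .futex = 0, .numa = FUTEX_NO_NODE };
 *	unsigned int flags = FUTEX2_SIZE_U32 | FUTEX2_PRIVATE | FUTEX2_NUMA;
 *
 *	// Waiter: blocks as long as f.futex still reads 0.
 *	futex2_wait(&f, 0, flags, NULL, 0);
 *
 *	// Waker: publish the new value, then wake one waiter.
 *	__atomic_store_n(&f.futex, 1, __ATOMIC_RELEASE);
 *	futex2_wake(&f, 1, flags);
 */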